Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2026-01-07 03:16:01 +10:00)

Commit: improve plugin tests and config
21  archivebox/plugins/accessibility/config.json  Normal file
@@ -0,0 +1,21 @@
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "type": "object",
  "additionalProperties": false,
  "required_plugins": ["chrome"],
  "properties": {
    "ACCESSIBILITY_ENABLED": {
      "type": "boolean",
      "default": true,
      "x-aliases": ["SAVE_ACCESSIBILITY", "USE_ACCESSIBILITY"],
      "description": "Enable accessibility tree capture"
    },
    "ACCESSIBILITY_TIMEOUT": {
      "type": "integer",
      "default": 30,
      "minimum": 5,
      "x-fallback": "TIMEOUT",
      "description": "Timeout for accessibility capture in seconds"
    }
  }
}
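The "x-aliases" and "x-fallback" keys in these schemas imply a resolution order: the canonical name wins, legacy aliases (e.g. SAVE_ACCESSIBILITY) are honored for backward compatibility, and timeouts fall back to the global TIMEOUT. A minimal sketch of that lookup, assuming a hypothetical resolve_config() helper (the real loader lives in ArchiveBox core and is not shown in this diff; type coercion is omitted):

import os

def resolve_config(key: str, schema_props: dict, default=None):
    """Resolve a plugin config value: canonical key, then x-aliases, then x-fallback, then schema default."""
    prop = schema_props[key]
    for name in [key, *prop.get('x-aliases', [])]:
        if name in os.environ:
            return os.environ[name]          # raw string value; callers coerce types
    fallback = prop.get('x-fallback')
    if fallback and fallback in os.environ:
        return os.environ[fallback]
    return prop.get('default', default)

# e.g. resolve_config('ACCESSIBILITY_TIMEOUT', schema['properties']) honors
# ACCESSIBILITY_TIMEOUT first, then falls back to the global TIMEOUT, then 30.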
@@ -212,13 +212,13 @@ async function main() {

  try {
    // Check if enabled
    if (!getEnvBool('SAVE_ACCESSIBILITY', true)) {
      console.log('Skipping accessibility (SAVE_ACCESSIBILITY=False)');
    if (!getEnvBool('ACCESSIBILITY_ENABLED', true)) {
      console.log('Skipping accessibility (ACCESSIBILITY_ENABLED=False)');
      // Output clean JSONL (no RESULT_JSON= prefix)
      console.log(JSON.stringify({
        type: 'ArchiveResult',
        status: 'skipped',
        output_str: 'SAVE_ACCESSIBILITY=False',
        output_str: 'ACCESSIBILITY_ENABLED=False',
      }));
      process.exit(0);
    }
@@ -67,6 +67,8 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, override
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'apt',
        'machine_id': machine_id,
        'binary_id': binary_id,
    }
    print(json.dumps(record))
0  archivebox/plugins/apt/templates/icon.html  Normal file
@@ -3,20 +3,20 @@
  "type": "object",
  "additionalProperties": false,
  "properties": {
    "ARCHIVE_ORG_ENABLED": {
    "ARCHIVEDOTORG_ENABLED": {
      "type": "boolean",
      "default": true,
      "x-aliases": ["SAVE_ARCHIVE_DOT_ORG", "USE_ARCHIVE_ORG", "SUBMIT_ARCHIVE_DOT_ORG"],
      "x-aliases": ["SAVE_ARCHIVEDOTORG", "USE_ARCHIVEDOTORG", "SUBMIT_ARCHIVEDOTORG"],
      "description": "Submit URLs to archive.org Wayback Machine"
    },
    "ARCHIVE_ORG_TIMEOUT": {
    "ARCHIVEDOTORG_TIMEOUT": {
      "type": "integer",
      "default": 60,
      "minimum": 10,
      "x-fallback": "TIMEOUT",
      "description": "Timeout for archive.org submission in seconds"
    },
    "ARCHIVE_ORG_USER_AGENT": {
    "ARCHIVEDOTORG_USER_AGENT": {
      "type": "string",
      "default": "",
      "x-fallback": "USER_AGENT",
@@ -6,10 +6,10 @@ Usage: on_Snapshot__archive_org.py --url=<url> --snapshot-id=<uuid>
Output: Writes archive.org.txt to $PWD with the archived URL

Environment variables:
    ARCHIVE_ORG_TIMEOUT: Timeout in seconds (default: 60)
    ARCHIVEDOTORG_TIMEOUT: Timeout in seconds (default: 60)
    USER_AGENT: User agent string

    # Fallback to ARCHIVING_CONFIG values if ARCHIVE_ORG_* not set:
    # Fallback to ARCHIVING_CONFIG values if ARCHIVEDOTORG_* not set:
    TIMEOUT: Fallback timeout

Note: This extractor uses the 'requests' library which is bundled with ArchiveBox.
@@ -52,7 +52,7 @@ def submit_to_archive_org(url: str) -> tuple[bool, str | None, str]:
    except ImportError:
        return False, None, 'requests library not installed'

    timeout = get_env_int('ARCHIVE_ORG_TIMEOUT') or get_env_int('TIMEOUT', 60)
    timeout = get_env_int('ARCHIVEDOTORG_TIMEOUT') or get_env_int('TIMEOUT', 60)
    user_agent = get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')

    submit_url = f'https://web.archive.org/save/{url}'
@@ -105,31 +105,35 @@ def submit_to_archive_org(url: str) -> tuple[bool, str | None, str]:
def main(url: str, snapshot_id: str):
    """Submit a URL to archive.org for archiving."""

    output = None
    status = 'failed'
    error = ''
    # Check if feature is enabled
    if get_env('ARCHIVEDOTORG_ENABLED', 'True').lower() in ('false', '0', 'no', 'off'):
        print('Skipping archive.org submission (ARCHIVEDOTORG_ENABLED=False)', file=sys.stderr)
        # Temporary failure (config disabled) - NO JSONL emission
        sys.exit(0)

    try:
        # Run extraction
        success, output, error = submit_to_archive_org(url)
        status = 'succeeded' if success else 'failed'

        if success:
            # Success - emit ArchiveResult with output file
            result = {
                'type': 'ArchiveResult',
                'status': 'succeeded',
                'output_str': output or '',
            }
            print(json.dumps(result))
            sys.exit(0)
        else:
            # Transient error (network, timeout, HTTP error) - emit NO JSONL
            # System will retry later
            print(f'ERROR: {error}', file=sys.stderr)
            sys.exit(1)

    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

        if error:
            print(f'ERROR: {error}', file=sys.stderr)

        # Output clean JSONL (no RESULT_JSON= prefix)
        result = {
            'type': 'ArchiveResult',
            'status': status,
            'output_str': output or error or '',
        }
        print(json.dumps(result))

        sys.exit(0 if status == 'succeeded' else 1)
        # Unexpected error - also transient, emit NO JSONL
        print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
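The pattern above recurs throughout this commit: a hook prints exactly one ArchiveResult JSONL line on success (or permanent skip), and on a transient failure it writes the error to stderr and exits 1 without emitting any JSONL, so the orchestrator can retry later. A condensed sketch of that contract (the do_extract/run_hook names are illustrative, not part of the codebase):

import json
import sys

def emit_result(status: str, output_str: str) -> None:
    """Emit a clean ArchiveResult JSONL line to stdout (no RESULT_JSON= prefix)."""
    print(json.dumps({'type': 'ArchiveResult', 'status': status, 'output_str': output_str}))

def run_hook(do_extract) -> None:
    """Success -> emit ArchiveResult + exit 0; transient failure -> stderr + exit 1, no JSONL."""
    try:
        ok, output, error = do_extract()
    except Exception as e:                      # unexpected errors are also treated as transient
        print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
        sys.exit(1)
    if ok:
        emit_result('succeeded', output or '')
        sys.exit(0)
    print(f'ERROR: {error}', file=sys.stderr)   # transient: no JSONL emitted, system retries
    sys.exit(1)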
@@ -1,10 +0,0 @@
{% load config_tags %}
{% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
{% if enabled %}
<!-- Archive.org embed - full iframe view -->
<iframe src="{{ output_path }}"
        class="extractor-embed archivedotorg-embed"
        style="width: 100%; height: 600px; border: 1px solid #ddd;"
        sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms">
</iframe>
{% endif %}
@@ -1,10 +0,0 @@
{% load config_tags %}
{% get_config "ARCHIVEDOTORG_ENABLED" as enabled %}
{% if enabled %}
<!-- Archive.org fullscreen - full page iframe -->
<iframe src="{{ output_path }}"
        class="extractor-fullscreen archivedotorg-fullscreen"
        style="width: 100%; height: 100vh; border: none;"
        sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms">
</iframe>
{% endif %}
@@ -12,16 +12,16 @@ from pathlib import Path
import pytest

PLUGIN_DIR = Path(__file__).parent.parent
ARCHIVE_ORG_HOOK = PLUGIN_DIR / 'on_Snapshot__13_archive_org.py'
ARCHIVEDOTORG_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_archive_org.*'), None)
TEST_URL = 'https://example.com'

def test_hook_script_exists():
    assert ARCHIVE_ORG_HOOK.exists()
    assert ARCHIVEDOTORG_HOOK.exists()

def test_submits_to_archive_org():
    with tempfile.TemporaryDirectory() as tmpdir:
        result = subprocess.run(
            [sys.executable, str(ARCHIVE_ORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
            [sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
            cwd=tmpdir, capture_output=True, text=True, timeout=60
        )
@@ -40,23 +40,29 @@ def test_submits_to_archive_org():
            except json.JSONDecodeError:
                pass

        assert result_json, "Should have ArchiveResult JSONL output"
        assert result_json['status'] in ['succeeded', 'failed'], f"Should succeed or fail: {result_json}"
        if result.returncode == 0:
            # Success - should have ArchiveResult
            assert result_json, "Should have ArchiveResult JSONL output on success"
            assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
        else:
            # Transient error - no JSONL output, just stderr
            assert not result_json, "Should NOT emit JSONL on transient error"
            assert result.stderr, "Should have error message in stderr"

def test_config_save_archive_org_false_skips():
    with tempfile.TemporaryDirectory() as tmpdir:
        import os
        env = os.environ.copy()
        env['SAVE_ARCHIVE_DOT_ORG'] = 'False'
        env['ARCHIVEDOTORG_ENABLED'] = 'False'

        result = subprocess.run(
            [sys.executable, str(ARCHIVE_ORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
            [sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
            cwd=tmpdir, capture_output=True, text=True, env=env, timeout=30
        )

        assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"

        # Feature disabled - no JSONL emission, just logs to stderr
        # Feature disabled - temporary failure, should NOT emit JSONL
        assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"

        # Should NOT emit any JSONL
@@ -68,13 +74,20 @@ def test_handles_timeout():
        import os
        env = os.environ.copy()
        env['TIMEOUT'] = '1'

        result = subprocess.run(
            [sys.executable, str(ARCHIVE_ORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
            [sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
            cwd=tmpdir, capture_output=True, text=True, env=env, timeout=30
        )

        assert result.returncode in (0, 1)

        # Timeout is a transient error - should exit 1 with no JSONL
        assert result.returncode in (0, 1), "Should complete without hanging"

        # If it timed out (exit 1), should have no JSONL output
        if result.returncode == 1:
            jsonl_lines = [line for line in result.stdout.strip().split('\n')
                           if line.strip().startswith('{')]
            assert len(jsonl_lines) == 0, "Should not emit JSONL on timeout (transient error)"

if __name__ == '__main__':
    pytest.main([__file__, '-v'])
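These tests repeat the same stdout-scanning logic in several places; a small helper like the following could centralize it (purely illustrative, not present in the diff):

import json

def parse_archive_results(stdout: str) -> list[dict]:
    """Collect ArchiveResult JSONL records from a hook's stdout, ignoring non-JSON lines."""
    records = []
    for line in stdout.strip().split('\n'):
        line = line.strip()
        if not line.startswith('{'):
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue
        if record.get('type') == 'ArchiveResult':
            records.append(record)
    return records

# e.g. assert not parse_archive_results(result.stdout) after a transient failure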
@@ -2,7 +2,7 @@
"""
Install a binary using Homebrew package manager.

Usage: on_Dependency__install_using_brew_provider.py --binary-id=<uuid> --name=<name> [--custom-cmd=<cmd>]
Usage: on_Binary__install_using_brew_provider.py --binary-id=<uuid> --machine-id=<uuid> --name=<name> [--custom-cmd=<cmd>]
Output: Binary JSONL record to stdout after installation

Environment variables:
@@ -72,7 +72,7 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_c
        'sha256': binary.sha256 or '',
        'binprovider': 'brew',
        'machine_id': machine_id,
        'dependency_id': dependency_id,
        'binary_id': binary_id,
    }
    print(json.dumps(record))
0  archivebox/plugins/brew/templates/icon.html  Normal file
@@ -1,194 +0,0 @@
#!/usr/bin/env python3
"""
Create symlinks from plugin outputs to canonical legacy locations.

This plugin runs after all extractors complete and creates symlinks from the
new plugin-based output structure to the legacy canonical output paths that
ArchiveBox has historically used. This maintains backward compatibility with
existing tools and scripts that expect outputs at specific locations.

Canonical output paths:
- favicon.ico → favicon/favicon.ico
- singlefile.html → singlefile/singlefile.html
- readability/content.html → readability/content.html
- mercury/content.html → mercury/content.html
- htmltotext.txt → htmltotext/htmltotext.txt
- output.pdf → pdf/output.pdf
- screenshot.png → screenshot/screenshot.png
- output.html → dom/output.html
- headers.json → headers/headers.json
- warc/{timestamp} → wget/warc/{timestamp}

New plugin outputs:
- ssl.json → ssl/ssl.json
- seo.json → seo/seo.json
- accessibility.json → accessibility/accessibility.json
- outlinks.json → outlinks/outlinks.json
- redirects.json → redirects/redirects.json
- console.jsonl → consolelog/console.jsonl

Usage: on_Snapshot__92_canonical_outputs.py --url=<url> --snapshot-id=<uuid>

Environment variables:
    SAVE_CANONICAL_SYMLINKS: Enable canonical symlinks (default: true)
    DATA_DIR: ArchiveBox data directory
    ARCHIVE_DIR: Archive output directory
"""

import os
import sys
import json
from pathlib import Path
from typing import Dict

import rich_click as click


# Mapping from canonical path to plugin output path
CANONICAL_MAPPINGS = {
    # Legacy extractors
    'favicon.ico': 'favicon/favicon.ico',
    'singlefile.html': 'singlefile/singlefile.html',
    'readability/content.html': 'readability/content.html',
    'mercury/content.html': 'mercury/content.html',
    'htmltotext.txt': 'htmltotext/htmltotext.txt',
    'output.pdf': 'pdf/output.pdf',
    'screenshot.png': 'screenshot/screenshot.png',
    'output.html': 'dom/output.html',
    'headers.json': 'headers/headers.json',

    # New plugins
    'ssl.json': 'ssl/ssl.json',
    'seo.json': 'seo/seo.json',
    'accessibility.json': 'accessibility/accessibility.json',
    'outlinks.json': 'parse_dom_outlinks/outlinks.json',
    'redirects.json': 'redirects/redirects.json',
    'console.jsonl': 'consolelog/console.jsonl',
}


def create_symlink(target: Path, link: Path, relative: bool = True) -> bool:
    """
    Create a symlink from link to target.

    Args:
        target: The actual file/directory (source)
        link: The symlink to create (destination)
        relative: Whether to create a relative symlink (default: True)

    Returns:
        True if symlink was created or already exists, False otherwise
    """
    try:
        # Skip if target doesn't exist
        if not target.exists():
            return False

        # Remove existing symlink/file if present
        if link.exists() or link.is_symlink():
            if link.is_symlink() and link.resolve() == target.resolve():
                # Already correctly symlinked
                return True
            link.unlink()

        # Create parent directory
        link.parent.mkdir(parents=True, exist_ok=True)

        # Create relative or absolute symlink
        if relative:
            # Calculate relative path from link to target
            rel_target = os.path.relpath(target, link.parent)
            link.symlink_to(rel_target)
        else:
            link.symlink_to(target)

        return True
    except (OSError, FileNotFoundError, PermissionError) as e:
        # Symlink creation failed, skip
        return False


def create_canonical_symlinks(snapshot_dir: Path) -> Dict[str, bool]:
    """
    Create all canonical symlinks for a snapshot directory.

    Args:
        snapshot_dir: The snapshot directory (e.g., archive/<timestamp>/)

    Returns:
        Dict mapping canonical path to success status
    """
    results = {}

    for canonical_path, plugin_output in CANONICAL_MAPPINGS.items():
        target = snapshot_dir / plugin_output
        link = snapshot_dir / canonical_path

        success = create_symlink(target, link, relative=True)
        results[canonical_path] = success

    # Special handling for warc/ directory symlink
    # wget plugin outputs to wget/warc/, but canonical expects warc/ at root
    wget_warc = snapshot_dir / 'wget' / 'warc'
    canonical_warc = snapshot_dir / 'warc'
    if wget_warc.exists():
        results['warc/'] = create_symlink(wget_warc, canonical_warc, relative=True)

    return results


@click.command()
@click.option('--url', required=True, help='URL being archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Create symlinks from plugin outputs to canonical legacy locations."""
    status = 'failed'
    output = None
    error = ''
    symlinks_created = 0

    try:
        # Check if enabled
        save_canonical = os.getenv('SAVE_CANONICAL_SYMLINKS', 'true').lower() in ('true', '1', 'yes', 'on')

        if not save_canonical:
            status = 'skipped'
            click.echo(json.dumps({'status': status, 'output': 'SAVE_CANONICAL_SYMLINKS=false'}))
            sys.exit(0)

        # Working directory is the extractor output dir (e.g., <snapshot>/canonical_outputs/)
        # Parent is the snapshot directory
        output_dir = Path.cwd()
        snapshot_dir = output_dir.parent

        if not snapshot_dir.exists():
            raise FileNotFoundError(f'Snapshot directory not found: {snapshot_dir}')

        # Create canonical symlinks
        results = create_canonical_symlinks(snapshot_dir)

        # Count successful symlinks
        symlinks_created = sum(1 for success in results.values() if success)

        status = 'succeeded'
        output = str(snapshot_dir)

    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
        click.echo(f'Error: {error}', err=True)

    # Print JSON result for hook runner
    result = {
        'status': status,
        'output': output,
        'error': error or None,
        'symlinks_created': symlinks_created,
    }
    click.echo(json.dumps(result))

    sys.exit(0 if status in ('succeeded', 'skipped') else 1)


if __name__ == '__main__':
    main()
21  archivebox/plugins/captcha2/config.json  Normal file
@@ -0,0 +1,21 @@
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "type": "object",
  "additionalProperties": false,
  "required_plugins": ["chrome"],
  "properties": {
    "CAPTCHA2_ENABLED": {
      "type": "boolean",
      "default": true,
      "x-aliases": ["USE_CAPTCHA2"],
      "description": "Enable Captcha2 browser extension for CAPTCHA solving"
    },
    "CAPTCHA2_TIMEOUT": {
      "type": "integer",
      "default": 60,
      "minimum": 5,
      "x-fallback": "TIMEOUT",
      "description": "Timeout for CAPTCHA solving in seconds"
    }
  }
}
@@ -20,7 +20,7 @@ const path = require('path');
const fs = require('fs');

// Import extension utilities
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
const extensionUtils = require('../chrome/chrome_extension_utils.js');

// Extension metadata
const EXTENSION = {
0  archivebox/plugins/captcha2/templates/icon.html  Normal file
@@ -14,8 +14,8 @@ import pytest


PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__01_captcha2.js"
CONFIG_SCRIPT = PLUGIN_DIR / "on_Snapshot__21_captcha2_config.js"
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2.*'), None)
CONFIG_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2_config.*'), None)


def test_install_script_exists():
@@ -97,12 +97,12 @@ def main():
    # Get config values
    chrome_binary = get_env('CHROME_BINARY', 'chromium')
    chrome_sandbox = get_env_bool('CHROME_SANDBOX', True)
    save_screenshot = get_env_bool('SAVE_SCREENSHOT', True)
    save_pdf = get_env_bool('SAVE_PDF', True)
    save_dom = get_env_bool('SAVE_DOM', True)
    screenshot_enabled = get_env_bool('SCREENSHOT_ENABLED', True)
    pdf_enabled = get_env_bool('PDF_ENABLED', True)
    dom_enabled = get_env_bool('DOM_ENABLED', True)

    # Compute USE_CHROME (derived from SAVE_* flags)
    use_chrome = save_screenshot or save_pdf or save_dom
    # Compute USE_CHROME (derived from extractor enabled flags)
    use_chrome = screenshot_enabled or pdf_enabled or dom_enabled
    computed['USE_CHROME'] = str(use_chrome).lower()

    # Detect Docker and adjust sandbox
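USE_CHROME is now derived from the per-extractor *_ENABLED flags rather than the legacy SAVE_* flags. A compressed sketch of that derivation (the env_bool helper here is illustrative; the plugin's own get_env_bool is not reproduced in this hunk):

import os

def env_bool(name: str, default: bool = True) -> bool:
    """Parse a boolean env var the way the plugin hooks in this commit do."""
    val = os.environ.get(name, '').strip().lower()
    if val in ('true', '1', 'yes', 'on'):
        return True
    if val in ('false', '0', 'no', 'off'):
        return False
    return default

# USE_CHROME is true if any Chrome-dependent extractor is enabled.
use_chrome = any(env_bool(flag) for flag in ('SCREENSHOT_ENABLED', 'PDF_ENABLED', 'DOM_ENABLED'))
print(str(use_chrome).lower())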
0  archivebox/plugins/chrome/templates/icon.html  Normal file
@@ -24,69 +24,18 @@ import tempfile
|
||||
import shutil
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
CHROME_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_chrome_install.py'
|
||||
CHROME_LAUNCH_HOOK = PLUGIN_DIR / 'on_Crawl__20_chrome_launch.bg.js'
|
||||
CHROME_TAB_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
|
||||
CHROME_NAVIGATE_HOOK = PLUGIN_DIR / 'on_Snapshot__30_chrome_navigate.js'
|
||||
CHROME_NAVIGATE_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
|
||||
|
||||
|
||||
def test_hook_scripts_exist():
|
||||
"""Verify chrome hooks exist."""
|
||||
assert CHROME_INSTALL_HOOK.exists(), f"Hook not found: {CHROME_INSTALL_HOOK}"
|
||||
assert CHROME_LAUNCH_HOOK.exists(), f"Hook not found: {CHROME_LAUNCH_HOOK}"
|
||||
assert CHROME_TAB_HOOK.exists(), f"Hook not found: {CHROME_TAB_HOOK}"
|
||||
assert CHROME_NAVIGATE_HOOK.exists(), f"Hook not found: {CHROME_NAVIGATE_HOOK}"
|
||||
|
||||
|
||||
def test_chrome_install_hook():
|
||||
"""Test chrome install hook checks for Chrome/Chromium binary."""
|
||||
import os
|
||||
|
||||
# Try with explicit CHROME_BINARY first (faster and more reliable)
|
||||
chrome_app_path = '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'
|
||||
|
||||
if Path(chrome_app_path).exists():
|
||||
# Use explicit CHROME_BINARY env var
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(CHROME_INSTALL_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env={**os.environ, 'CHROME_BINARY': chrome_app_path},
|
||||
timeout=30
|
||||
)
|
||||
|
||||
# When CHROME_BINARY is set and valid, hook exits 0 immediately (silent success)
|
||||
assert result.returncode == 0, f"Should find Chrome at {chrome_app_path}. Error: {result.stderr}"
|
||||
else:
|
||||
# Run install hook to find or install Chrome
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(CHROME_INSTALL_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300 # Longer timeout for potential @puppeteer/browsers install
|
||||
)
|
||||
|
||||
if result.returncode == 0:
|
||||
# Binary found or installed - verify Binary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Binary':
|
||||
assert record['name'] == 'chrome'
|
||||
assert record['abspath']
|
||||
assert Path(record['abspath']).exists(), f"Chrome binary should exist at {record['abspath']}"
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_binary, "Should output Binary record when binary found"
|
||||
else:
|
||||
# Failed to find or install Chrome
|
||||
pytest.fail(f"Chrome installation failed. Please install Chrome manually or ensure @puppeteer/browsers is available. Error: {result.stderr}")
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify chrome is available via abx-pkg."""
|
||||
from abx_pkg import Binary, NpmProvider, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
21  archivebox/plugins/consolelog/config.json  Normal file
@@ -0,0 +1,21 @@
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "type": "object",
  "additionalProperties": false,
  "required_plugins": ["chrome"],
  "properties": {
    "CONSOLELOG_ENABLED": {
      "type": "boolean",
      "default": true,
      "x-aliases": ["SAVE_CONSOLELOG", "USE_CONSOLELOG"],
      "description": "Enable console log capture"
    },
    "CONSOLELOG_TIMEOUT": {
      "type": "integer",
      "default": 30,
      "minimum": 5,
      "x-fallback": "TIMEOUT",
      "description": "Timeout for console log capture in seconds"
    }
  }
}
@@ -207,9 +207,9 @@ async function main() {
    process.exit(1);
  }

  if (!getEnvBool('SAVE_CONSOLELOG', true)) {
    console.error('Skipping (SAVE_CONSOLELOG=False)');
    console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'SAVE_CONSOLELOG=False'}));
  if (!getEnvBool('CONSOLELOG_ENABLED', true)) {
    console.error('Skipping (CONSOLELOG_ENABLED=False)');
    console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'CONSOLELOG_ENABLED=False'}));
    process.exit(0);
  }
@@ -5,7 +5,7 @@ Install a binary using a custom bash command.
This provider runs arbitrary shell commands to install binaries
that don't fit into standard package managers.

Usage: on_Dependency__install_using_custom_bash.py --dependency-id=<uuid> --bin-name=<name> --custom-cmd=<cmd>
Usage: on_Binary__install_using_custom_bash.py --binary-id=<uuid> --machine-id=<uuid> --name=<name> --custom-cmd=<cmd>
Output: Binary JSONL record to stdout after installation

Environment variables:
@@ -22,22 +22,23 @@ from abx_pkg import Binary, EnvProvider


@click.command()
@click.option('--dependency-id', required=True, help="Dependency UUID")
@click.option('--bin-name', required=True, help="Binary name to install")
@click.option('--binary-id', required=True, help="Binary UUID")
@click.option('--machine-id', required=True, help="Machine UUID")
@click.option('--name', required=True, help="Binary name to install")
@click.option('--binproviders', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', required=True, help="Custom bash command to run")
def main(dependency_id: str, bin_name: str, binproviders: str, custom_cmd: str):
def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_cmd: str):
    """Install binary using custom bash command."""

    if binproviders != '*' and 'custom' not in binproviders.split(','):
        click.echo(f"custom provider not allowed for {bin_name}", err=True)
        click.echo(f"custom provider not allowed for {name}", err=True)
        sys.exit(0)

    if not custom_cmd:
        click.echo("custom provider requires --custom-cmd", err=True)
        sys.exit(1)

    click.echo(f"Installing {bin_name} via custom command: {custom_cmd}", err=True)
    click.echo(f"Installing {name} via custom command: {custom_cmd}", err=True)

    try:
        result = subprocess.run(
@@ -57,13 +58,13 @@ def main(dependency_id: str, bin_name: str, binproviders: str, custom_cmd: str):
    # Use abx-pkg to load the binary and get its info
    provider = EnvProvider()
    try:
        binary = Binary(name=bin_name, binproviders=[provider]).load()
        binary = Binary(name=name, binproviders=[provider]).load()
    except Exception as e:
        click.echo(f"{bin_name} not found after custom install: {e}", err=True)
        click.echo(f"{name} not found after custom install: {e}", err=True)
        sys.exit(1)

    if not binary.abspath:
        click.echo(f"{bin_name} not found after custom install", err=True)
        click.echo(f"{name} not found after custom install", err=True)
        sys.exit(1)

    machine_id = os.environ.get('MACHINE_ID', '')
@@ -71,18 +72,18 @@ def main(dependency_id: str, bin_name: str, binproviders: str, custom_cmd: str):
    # Output Binary JSONL record to stdout
    record = {
        'type': 'Binary',
        'name': bin_name,
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'custom',
        'machine_id': machine_id,
        'dependency_id': dependency_id,
        'binary_id': binary_id,
    }
    print(json.dumps(record))

    # Log human-readable info to stderr
    click.echo(f"Installed {bin_name} at {binary.abspath}", err=True)
    click.echo(f"Installed {name} at {binary.abspath}", err=True)
    click.echo(f" version: {binary.version}", err=True)

    sys.exit(0)
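The provider hooks now take --binary-id/--machine-id/--name instead of --dependency-id/--bin-name. A hedged example of invoking the renamed custom-bash hook from a test or orchestrator (the UUIDs, binary name, and install command below are placeholders, not values from this commit):

import subprocess
import sys
import uuid

# Hypothetical invocation of the renamed custom-bash provider hook.
result = subprocess.run(
    [
        sys.executable, 'on_Binary__install_using_custom_bash.py',
        '--binary-id', str(uuid.uuid4()),
        '--machine-id', str(uuid.uuid4()),
        '--name', 'some-binary',
        '--custom-cmd', 'echo "install some-binary here"',
    ],
    capture_output=True, text=True, timeout=300,
)
# On success the hook prints a single Binary JSONL record to stdout
# and logs human-readable progress to stderr.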
0  archivebox/plugins/custom/templates/icon.html  Normal file
@@ -15,9 +15,29 @@
 * CHROME_USER_AGENT: User agent string (optional)
 * CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
 * CHROME_HEADLESS: Run in headless mode (default: true)
 * SAVE_DOM: Enable DOM extraction (default: true)
 * DOM_ENABLED: Enable DOM extraction (default: true)
 */

// Get environment variable with default
function getEnv(name, defaultValue = '') {
  return (process.env[name] || defaultValue).trim();
}

function getEnvBool(name, defaultValue = false) {
  const val = getEnv(name, '').toLowerCase();
  if (['true', '1', 'yes', 'on'].includes(val)) return true;
  if (['false', '0', 'no', 'off'].includes(val)) return false;
  return defaultValue;
}

// Check if DOM is enabled BEFORE requiring puppeteer
if (!getEnvBool('DOM_ENABLED', true)) {
  console.error('Skipping DOM (DOM_ENABLED=False)');
  // Temporary failure (config disabled) - NO JSONL emission
  process.exit(0);
}

// Now safe to require puppeteer
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
@@ -40,18 +60,6 @@ function parseArgs() {
  return args;
}

// Get environment variable with default
function getEnv(name, defaultValue = '') {
  return (process.env[name] || defaultValue).trim();
}

function getEnvBool(name, defaultValue = false) {
  const val = getEnv(name, '').toLowerCase();
  if (['true', '1', 'yes', 'on'].includes(val)) return true;
  if (['false', '0', 'no', 'off'].includes(val)) return false;
  return defaultValue;
}

function getEnvInt(name, defaultValue = 0) {
  const val = parseInt(getEnv(name, String(defaultValue)), 10);
  return isNaN(val) ? defaultValue : val;
@@ -229,18 +237,7 @@ async function main() {
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const startTs = new Date();
|
||||
let status = 'failed';
|
||||
let output = null;
|
||||
let error = '';
|
||||
|
||||
try {
|
||||
// Check if DOM is enabled
|
||||
if (!getEnvBool('SAVE_DOM', true)) {
|
||||
console.error('Skipping DOM (SAVE_DOM=False)');
|
||||
// Feature disabled - no ArchiveResult, just exit
|
||||
process.exit(0);
|
||||
}
|
||||
// Check if staticfile extractor already handled this (permanent skip)
|
||||
if (hasStaticFileOutput()) {
|
||||
console.error(`Skipping DOM - staticfile extractor already downloaded this`);
|
||||
@@ -251,46 +248,40 @@ async function main() {
|
||||
output_str: 'staticfile already handled',
|
||||
}));
|
||||
process.exit(0);
|
||||
} else {
|
||||
// Only wait for page load if using shared Chrome session
|
||||
const cdpUrl = getCdpUrl();
|
||||
if (cdpUrl) {
|
||||
// Wait for page to be fully loaded
|
||||
const pageLoaded = await waitForChromeTabLoaded(60000);
|
||||
if (!pageLoaded) {
|
||||
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const result = await dumpDom(url);
|
||||
|
||||
if (result.success) {
|
||||
status = 'succeeded';
|
||||
output = result.output;
|
||||
const size = fs.statSync(output).size;
|
||||
console.error(`DOM saved (${size} bytes)`);
|
||||
} else {
|
||||
status = 'failed';
|
||||
error = result.error;
|
||||
// Only wait for page load if using shared Chrome session
|
||||
const cdpUrl = getCdpUrl();
|
||||
if (cdpUrl) {
|
||||
// Wait for page to be fully loaded
|
||||
const pageLoaded = await waitForChromeTabLoaded(60000);
|
||||
if (!pageLoaded) {
|
||||
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
|
||||
}
|
||||
}
|
||||
|
||||
const result = await dumpDom(url);
|
||||
|
||||
if (result.success) {
|
||||
// Success - emit ArchiveResult
|
||||
const size = fs.statSync(result.output).size;
|
||||
console.error(`DOM saved (${size} bytes)`);
|
||||
console.log(JSON.stringify({
|
||||
type: 'ArchiveResult',
|
||||
status: 'succeeded',
|
||||
output_str: result.output,
|
||||
}));
|
||||
process.exit(0);
|
||||
} else {
|
||||
// Transient error - emit NO JSONL
|
||||
console.error(`ERROR: ${result.error}`);
|
||||
process.exit(1);
|
||||
}
|
||||
} catch (e) {
|
||||
error = `${e.name}: ${e.message}`;
|
||||
status = 'failed';
|
||||
// Transient error - emit NO JSONL
|
||||
console.error(`ERROR: ${e.name}: ${e.message}`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const endTs = new Date();
|
||||
|
||||
if (error) console.error(`ERROR: ${error}`);
|
||||
|
||||
// Output clean JSONL (no RESULT_JSON= prefix)
|
||||
console.log(JSON.stringify({
|
||||
type: 'ArchiveResult',
|
||||
status,
|
||||
output_str: output || error || '',
|
||||
}));
|
||||
|
||||
process.exit(status === 'succeeded' ? 0 : 1);
|
||||
}
|
||||
|
||||
main().catch(e => {
|
||||
|
||||
@@ -1,6 +0,0 @@
<!-- DOM embed - full iframe of captured DOM HTML -->
<iframe src="{{ output_path }}"
        class="extractor-embed dom-embed"
        style="width: 100%; height: 100%; min-height: 500px; border: none;"
        sandbox="allow-same-origin allow-scripts allow-forms">
</iframe>
@@ -1,6 +0,0 @@
<!-- DOM fullscreen - full page iframe -->
<iframe src="{{ output_path }}"
        class="extractor-fullscreen dom-fullscreen"
        style="width: 100%; height: 100vh; border: none;"
        sandbox="allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation">
</iframe>
@@ -22,9 +22,8 @@ import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
DOM_HOOK = PLUGIN_DIR / 'on_Snapshot__36_dom.js'
|
||||
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_chrome_install.py'
|
||||
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py'
|
||||
DOM_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_dom.*'), None)
|
||||
NPM_PROVIDER_HOOK = next((PLUGINS_ROOT / 'npm').glob('on_Binary__install_using_npm_provider.py'), None)
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
|
||||
@@ -33,66 +32,6 @@ def test_hook_script_exists():
|
||||
assert DOM_HOOK.exists(), f"Hook not found: {DOM_HOOK}"
|
||||
|
||||
|
||||
def test_chrome_validation_and_install():
|
||||
"""Test chrome install hook to install puppeteer-core if needed."""
|
||||
# Run chrome install hook (from chrome plugin)
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(CHROME_INSTALL_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
# If exit 1, binary not found - need to install
|
||||
if result.returncode == 1:
|
||||
# Parse Dependency request from JSONL
|
||||
dependency_request = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Dependency':
|
||||
dependency_request = record
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
if dependency_request:
|
||||
bin_name = dependency_request['bin_name']
|
||||
bin_providers = dependency_request['bin_providers']
|
||||
|
||||
# Install via npm provider hook
|
||||
install_result = subprocess.run(
|
||||
[
|
||||
sys.executable,
|
||||
str(NPM_PROVIDER_HOOK),
|
||||
'--dependency-id', 'test-dep-001',
|
||||
'--bin-name', bin_name,
|
||||
'--bin-providers', bin_providers
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=600
|
||||
)
|
||||
|
||||
assert install_result.returncode == 0, f"Install failed: {install_result.stderr}"
|
||||
|
||||
# Verify installation via JSONL output
|
||||
for line in install_result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Binary':
|
||||
assert record['name'] == bin_name
|
||||
assert record['abspath']
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
else:
|
||||
# Binary already available, verify via JSONL output
|
||||
assert result.returncode == 0, f"Validation failed: {result.stderr}"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify dependencies are available via abx-pkg after hook installation."""
|
||||
from abx_pkg import Binary, EnvProvider, BinProviderOverrides
|
||||
@@ -154,13 +93,13 @@ def test_extracts_dom_from_example_com():
|
||||
|
||||
|
||||
def test_config_save_dom_false_skips():
|
||||
"""Test that SAVE_DOM=False exits without emitting JSONL."""
|
||||
"""Test that DOM_ENABLED=False exits without emitting JSONL."""
|
||||
import os
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
env = os.environ.copy()
|
||||
env['SAVE_DOM'] = 'False'
|
||||
env['DOM_ENABLED'] = 'False'
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(DOM_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
|
||||
@@ -173,8 +112,8 @@ def test_config_save_dom_false_skips():
|
||||
|
||||
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
|
||||
|
||||
# Feature disabled - no JSONL emission, just logs to stderr
|
||||
assert 'Skipping DOM' in result.stderr, "Should log skip reason to stderr"
|
||||
# Feature disabled - temporary failure, should NOT emit JSONL
|
||||
assert 'Skipping DOM' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
|
||||
|
||||
# Should NOT emit any JSONL
|
||||
jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
|
||||
|
||||
@@ -5,7 +5,7 @@ Check if a binary is already available in the system PATH.
This is the simplest "provider" - it doesn't install anything,
it just discovers binaries that are already installed.

Usage: on_Dependency__install_using_env_provider.py --binary-id=<uuid> --name=<name>
Usage: on_Binary__install_using_env_provider.py --binary-id=<uuid> --machine-id=<uuid> --name=<name>
Output: Binary JSONL record to stdout if binary found in PATH

Environment variables:
@@ -56,7 +56,7 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str):
        'sha256': binary.sha256 or '',
        'binprovider': 'env',
        'machine_id': machine_id,
        'dependency_id': dependency_id,
        'binary_id': binary_id,
    }
    print(json.dumps(record))
0  archivebox/plugins/env/templates/icon.html  vendored  Normal file
9  archivebox/plugins/favicon/templates/thumbnail.html  Normal file
@@ -0,0 +1,9 @@
<!-- Favicon thumbnail - small favicon preview -->
<div class="extractor-thumbnail favicon-thumbnail" style="width: 100%; height: 100px; display: flex; align-items: center; justify-content: center; background: #fff;">
  {% if output_path %}
  <img src="{{ output_path }}"
       alt="Favicon"
       style="max-width: 80%; max-height: 80%; object-fit: contain;"
       loading="lazy">
  {% endif %}
</div>
@@ -23,7 +23,7 @@ import pytest


PLUGIN_DIR = Path(__file__).parent.parent
FAVICON_HOOK = PLUGIN_DIR / 'on_Snapshot__11_favicon.py'
FAVICON_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_favicon.*'), None)
TEST_URL = 'https://example.com'
@@ -65,8 +65,8 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
Returns: (success, output_path, error_message)
|
||||
"""
|
||||
# Get config from env
|
||||
timeout = get_env_int('FORUMDL_TIMEOUT') or get_env_int('TIMEOUT', 3600)
|
||||
check_ssl = get_env_bool('FORUMDL_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
|
||||
timeout = get_env_int('TIMEOUT', 3600)
|
||||
check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True)
|
||||
textify = get_env_bool('FORUMDL_TEXTIFY', False)
|
||||
extra_args = get_env('FORUMDL_EXTRA_ARGS', '')
|
||||
output_format = get_env('FORUMDL_OUTPUT_FORMAT', 'jsonl')
|
||||
@@ -148,9 +148,9 @@ def main(url: str, snapshot_id: str):
|
||||
|
||||
try:
|
||||
# Check if forum-dl is enabled
|
||||
if not get_env_bool('SAVE_FORUMDL', True):
|
||||
print('Skipping forum-dl (SAVE_FORUMDL=False)', file=sys.stderr)
|
||||
# Feature disabled - no ArchiveResult, just exit
|
||||
if not get_env_bool('FORUMDL_ENABLED', True):
|
||||
print('Skipping forum-dl (FORUMDL_ENABLED=False)', file=sys.stderr)
|
||||
# Temporary failure (config disabled) - NO JSONL emission
|
||||
sys.exit(0)
|
||||
|
||||
# Get binary from environment
|
||||
@@ -158,24 +158,25 @@ def main(url: str, snapshot_id: str):
|
||||
|
||||
# Run extraction
|
||||
success, output, error = save_forum(url, binary)
|
||||
status = 'succeeded' if success else 'failed'
|
||||
|
||||
if success:
|
||||
# Success - emit ArchiveResult
|
||||
result = {
|
||||
'type': 'ArchiveResult',
|
||||
'status': 'succeeded',
|
||||
'output_str': output or ''
|
||||
}
|
||||
print(json.dumps(result))
|
||||
sys.exit(0)
|
||||
else:
|
||||
# Transient error - emit NO JSONL
|
||||
print(f'ERROR: {error}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
except Exception as e:
|
||||
error = f'{type(e).__name__}: {e}'
|
||||
status = 'failed'
|
||||
|
||||
if error:
|
||||
print(f'ERROR: {error}', file=sys.stderr)
|
||||
|
||||
# Output clean JSONL (no RESULT_JSON= prefix)
|
||||
result = {
|
||||
'type': 'ArchiveResult',
|
||||
'status': status,
|
||||
'output_str': output or error or '',
|
||||
}
|
||||
print(json.dumps(result))
|
||||
|
||||
sys.exit(0 if status == 'succeeded' else 1)
|
||||
# Transient error - emit NO JSONL
|
||||
print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -22,8 +22,7 @@ import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
FORUMDL_HOOK = PLUGIN_DIR / 'on_Snapshot__53_forumdl.py'
|
||||
FORUMDL_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_forumdl.py'
|
||||
FORUMDL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_forumdl.*'), None)
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
# Module-level cache for binary path
|
||||
@@ -35,121 +34,60 @@ def get_forumdl_binary_path():
|
||||
if _forumdl_binary_path:
|
||||
return _forumdl_binary_path
|
||||
|
||||
# Skip if install hook doesn't exist
|
||||
if not FORUMDL_INSTALL_HOOK.exists():
|
||||
return None
|
||||
# Try to find forum-dl binary using abx-pkg
|
||||
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
# Run install hook to find or install binary
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(FORUMDL_INSTALL_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300
|
||||
)
|
||||
try:
|
||||
binary = Binary(
|
||||
name='forum-dl',
|
||||
binproviders=[PipProvider(), EnvProvider()]
|
||||
).load()
|
||||
|
||||
# Check if binary was found
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if binary and binary.abspath:
|
||||
_forumdl_binary_path = str(binary.abspath)
|
||||
return _forumdl_binary_path
|
||||
except Exception:
|
||||
pass
|
||||
if line.strip():
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Binary' and record.get('name') == 'forum-dl':
|
||||
_forumdl_binary_path = record.get('abspath')
|
||||
return _forumdl_binary_path
|
||||
elif record.get('type') == 'Dependency' and record.get('bin_name') == 'forum-dl':
|
||||
# Need to install via pip hook
|
||||
pip_hook = PLUGINS_ROOT / 'pip' / 'on_Binary__install_using_pip_provider.py'
|
||||
dependency_id = str(uuid.uuid4())
|
||||
|
||||
# Build command with overrides if present
|
||||
cmd = [
|
||||
sys.executable, str(pip_hook),
|
||||
'--dependency-id', dependency_id,
|
||||
'--bin-name', record['bin_name']
|
||||
]
|
||||
if 'overrides' in record:
|
||||
cmd.extend(['--overrides', json.dumps(record['overrides'])])
|
||||
# If not found, try to install via pip
|
||||
pip_hook = PLUGINS_ROOT / 'pip' / 'on_Binary__install_using_pip_provider.py'
|
||||
if pip_hook.exists():
|
||||
binary_id = str(uuid.uuid4())
|
||||
machine_id = str(uuid.uuid4())
|
||||
|
||||
install_result = subprocess.run(
|
||||
cmd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300
|
||||
)
|
||||
cmd = [
|
||||
sys.executable, str(pip_hook),
|
||||
'--binary-id', binary_id,
|
||||
'--machine-id', machine_id,
|
||||
'--name', 'forum-dl'
|
||||
]
|
||||
|
||||
# Parse Binary from pip installation
|
||||
for install_line in install_result.stdout.strip().split('\n'):
|
||||
pass
|
||||
if install_line.strip():
|
||||
pass
|
||||
try:
|
||||
install_record = json.loads(install_line)
|
||||
if install_record.get('type') == 'Binary' and install_record.get('name') == 'forum-dl':
|
||||
_forumdl_binary_path = install_record.get('abspath')
|
||||
return _forumdl_binary_path
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
install_result = subprocess.run(
|
||||
cmd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300
|
||||
)
|
||||
|
||||
# Installation failed - print debug info
|
||||
if not _forumdl_binary_path:
|
||||
print(f"\n=== forum-dl installation failed ===", file=sys.stderr)
|
||||
print(f"stdout: {install_result.stdout}", file=sys.stderr)
|
||||
print(f"stderr: {install_result.stderr}", file=sys.stderr)
|
||||
print(f"returncode: {install_result.returncode}", file=sys.stderr)
|
||||
return None
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
# Parse Binary from pip installation
|
||||
for install_line in install_result.stdout.strip().split('\n'):
|
||||
if install_line.strip():
|
||||
try:
|
||||
install_record = json.loads(install_line)
|
||||
if install_record.get('type') == 'Binary' and install_record.get('name') == 'forum-dl':
|
||||
_forumdl_binary_path = install_record.get('abspath')
|
||||
return _forumdl_binary_path
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify on_Snapshot hook exists."""
|
||||
assert FORUMDL_HOOK.exists(), f"Hook not found: {FORUMDL_HOOK}"
|
||||
|
||||
|
||||
def test_forumdl_install_hook():
|
||||
"""Test forum-dl install hook checks for forum-dl."""
|
||||
# Skip if install hook doesn't exist yet
|
||||
if not FORUMDL_INSTALL_HOOK.exists():
|
||||
pass
|
||||
|
||||
# Run forum-dl install hook
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(FORUMDL_INSTALL_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
# Hook exits 0 if all binaries found, 1 if any not found
|
||||
# Parse output for Binary and Dependency records
|
||||
found_binary = False
|
||||
found_dependency = False
|
||||
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
pass
|
||||
if line.strip():
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Binary':
|
||||
pass
|
||||
if record['name'] == 'forum-dl':
|
||||
assert record['abspath'], "forum-dl should have abspath"
|
||||
found_binary = True
|
||||
elif record.get('type') == 'Dependency':
|
||||
pass
|
||||
if record['bin_name'] == 'forum-dl':
|
||||
found_dependency = True
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# forum-dl should either be found (Binary) or missing (Dependency)
|
||||
assert found_binary or found_dependency, \
|
||||
"forum-dl should have either Binary or Dependency record"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify forum-dl is installed by calling the REAL installation hooks."""
|
||||
binary_path = get_forumdl_binary_path()
|
||||
@@ -209,12 +147,12 @@ def test_handles_non_forum_url():
|
||||
|
||||
|
||||
def test_config_save_forumdl_false_skips():
|
||||
"""Test that SAVE_FORUMDL=False exits without emitting JSONL."""
|
||||
"""Test that FORUMDL_ENABLED=False exits without emitting JSONL."""
|
||||
import os
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
env = os.environ.copy()
|
||||
env['SAVE_FORUMDL'] = 'False'
|
||||
env['FORUMDL_ENABLED'] = 'False'
|
||||
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(FORUMDL_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
|
||||
@@ -227,7 +165,7 @@ def test_config_save_forumdl_false_skips():
|
||||
|
||||
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
|
||||
|
||||
# Feature disabled - no JSONL emission, just logs to stderr
|
||||
# Feature disabled - temporary failure, should NOT emit JSONL
|
||||
assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
|
||||
|
||||
# Should NOT emit any JSONL
|
||||
|
||||
@@ -88,9 +88,9 @@ def save_gallery(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
|
||||
Returns: (success, output_path, error_message)
|
||||
"""
|
||||
# Get config from env (with GALLERYDL_ prefix or fallback to ARCHIVING_CONFIG style)
|
||||
timeout = get_env_int('GALLERYDL_TIMEOUT') or get_env_int('TIMEOUT', 3600)
|
||||
check_ssl = get_env_bool('GALLERYDL_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
|
||||
# Get config from env
|
||||
timeout = get_env_int('TIMEOUT', 3600)
|
||||
check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True)
|
||||
extra_args = get_env('GALLERYDL_EXTRA_ARGS', '')
|
||||
cookies_file = get_env('COOKIES_FILE', '')
|
||||
|
||||
@@ -180,9 +180,9 @@ def main(url: str, snapshot_id: str):
|
||||
|
||||
try:
|
||||
# Check if gallery-dl is enabled
|
||||
if not (get_env_bool('USE_GALLERYDL', True) and get_env_bool('SAVE_GALLERYDL', True)):
|
||||
print('Skipping gallery-dl (USE_GALLERYDL=False or SAVE_GALLERYDL=False)', file=sys.stderr)
|
||||
# Feature disabled - no ArchiveResult, just exit
|
||||
if not get_env_bool('GALLERYDL_ENABLED', True):
|
||||
print('Skipping gallery-dl (GALLERYDL_ENABLED=False)', file=sys.stderr)
|
||||
# Temporary failure (config disabled) - NO JSONL emission
|
||||
sys.exit(0)
|
||||
|
||||
# Check if staticfile or media extractors already handled this (permanent skip)
|
||||
@@ -209,24 +209,25 @@ def main(url: str, snapshot_id: str):
|
||||
|
||||
# Run extraction
|
||||
success, output, error = save_gallery(url, binary)
|
||||
status = 'succeeded' if success else 'failed'
|
||||
|
||||
if success:
|
||||
# Success - emit ArchiveResult
|
||||
result = {
|
||||
'type': 'ArchiveResult',
|
||||
'status': 'succeeded',
|
||||
'output_str': output or ''
|
||||
}
|
||||
print(json.dumps(result))
|
||||
sys.exit(0)
|
||||
else:
|
||||
# Transient error - emit NO JSONL
|
||||
print(f'ERROR: {error}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
except Exception as e:
|
||||
error = f'{type(e).__name__}: {e}'
|
||||
status = 'failed'
|
||||
|
||||
if error:
|
||||
print(f'ERROR: {error}', file=sys.stderr)
|
||||
|
||||
# Output clean JSONL (no RESULT_JSON= prefix)
|
||||
result = {
|
||||
'type': 'ArchiveResult',
|
||||
'status': status,
|
||||
'output_str': output or error or '',
|
||||
}
|
||||
print(json.dumps(result))
|
||||
|
||||
sys.exit(0 if status == 'succeeded' else 1)
|
||||
# Transient error - emit NO JSONL
|
||||
print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -21,8 +21,7 @@ import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
GALLERYDL_HOOK = PLUGIN_DIR / 'on_Snapshot__52_gallerydl.py'
|
||||
GALLERYDL_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_gallerydl.py'
|
||||
GALLERYDL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_gallerydl.*'), None)
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
def test_hook_script_exists():
|
||||
@@ -30,44 +29,6 @@ def test_hook_script_exists():
|
||||
assert GALLERYDL_HOOK.exists(), f"Hook not found: {GALLERYDL_HOOK}"
|
||||
|
||||
|
||||
def test_gallerydl_install_hook():
|
||||
"""Test gallery-dl install hook checks for gallery-dl."""
|
||||
# Run gallery-dl install hook
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(GALLERYDL_INSTALL_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
# Hook exits 0 if all binaries found, 1 if any not found
|
||||
# Parse output for Binary and Dependency records
|
||||
found_binary = False
|
||||
found_dependency = False
|
||||
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
pass
|
||||
if line.strip():
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Binary':
|
||||
pass
|
||||
if record['name'] == 'gallery-dl':
|
||||
assert record['abspath'], "gallery-dl should have abspath"
|
||||
found_binary = True
|
||||
elif record.get('type') == 'Dependency':
|
||||
pass
|
||||
if record['bin_name'] == 'gallery-dl':
|
||||
found_dependency = True
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# gallery-dl should either be found (Binary) or missing (Dependency)
|
||||
assert found_binary or found_dependency, \
|
||||
"gallery-dl should have either Binary or Dependency record"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify gallery-dl is available via abx-pkg."""
|
||||
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
|
||||
@@ -122,12 +83,12 @@ def test_handles_non_gallery_url():
|
||||
|
||||
|
||||
def test_config_save_gallery_dl_false_skips():
|
||||
"""Test that SAVE_GALLERYDL=False exits without emitting JSONL."""
|
||||
"""Test that GALLERYDL_ENABLED=False exits without emitting JSONL."""
|
||||
import os
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
env = os.environ.copy()
|
||||
env['SAVE_GALLERYDL'] = 'False'
|
||||
env['GALLERYDL_ENABLED'] = 'False'
|
||||
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(GALLERYDL_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
|
||||
@@ -140,7 +101,7 @@ def test_config_save_gallery_dl_false_skips():
|
||||
|
||||
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
|
||||
|
||||
# Feature disabled - no JSONL emission, just logs to stderr
|
||||
# Feature disabled - temporary failure, should NOT emit JSONL
|
||||
assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
|
||||
|
||||
# Should NOT emit any JSONL
|
||||
|
||||
@@ -1,6 +0,0 @@
<!-- Git embed - directory listing of cloned repo -->
<iframe src="{{ output_path }}"
        class="extractor-embed git-embed"
        style="width: 100%; height: 100%; min-height: 400px; border: none; background: #fff;"
        sandbox="allow-same-origin">
</iframe>
@@ -1,6 +0,0 @@
<!-- Git fullscreen - full directory listing -->
<iframe src="{{ output_path }}"
        class="extractor-fullscreen git-fullscreen"
        style="width: 100%; height: 100vh; border: none; background: #fff;"
        sandbox="allow-same-origin">
</iframe>
@@ -17,58 +17,12 @@ from pathlib import Path
|
||||
import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
GIT_HOOK = PLUGIN_DIR / 'on_Snapshot__12_git.py'
|
||||
GIT_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_git.py'
|
||||
GIT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_git.*'), None)
|
||||
TEST_URL = 'https://github.com/example/repo.git'
|
||||
|
||||
def test_hook_script_exists():
|
||||
assert GIT_HOOK.exists()
|
||||
|
||||
def test_git_install_hook():
|
||||
"""Test git install hook checks for git binary."""
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(GIT_INSTALL_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
|
||||
if result.returncode == 0:
|
||||
# Binary found - verify Binary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
pass
|
||||
if line.strip():
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Binary':
|
||||
assert record['name'] == 'git'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_binary, "Should output Binary record when binary found"
|
||||
else:
|
||||
# Binary not found - verify Dependency JSONL output
|
||||
found_dependency = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
pass
|
||||
if line.strip():
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Dependency':
|
||||
assert record['bin_name'] == 'git'
|
||||
assert 'env' in record['bin_providers']
|
||||
found_dependency = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_dependency, "Should output Dependency record when binary not found"
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify git is available via abx-pkg."""
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
21  archivebox/plugins/headers/config.json  Normal file
@@ -0,0 +1,21 @@
{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "additionalProperties": false,
    "required_plugins": ["chrome"],
    "properties": {
        "HEADERS_ENABLED": {
            "type": "boolean",
            "default": true,
            "x-aliases": ["SAVE_HEADERS", "USE_HEADERS"],
            "description": "Enable HTTP headers capture"
        },
        "HEADERS_TIMEOUT": {
            "type": "integer",
            "default": 30,
            "minimum": 5,
            "x-fallback": "TIMEOUT",
            "description": "Timeout for headers capture in seconds"
        }
    }
}
@@ -21,7 +21,7 @@ import pytest


PLUGIN_DIR = Path(__file__).parent.parent
HEADERS_HOOK = PLUGIN_DIR / 'on_Snapshot__33_headers.js'
HEADERS_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_headers.*'), None)
TEST_URL = 'https://example.com'

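The config.json schemas in this commit all follow one pattern: a canonical *_ENABLED / *_TIMEOUT key, legacy names listed under "x-aliases", and a global fallback named by "x-fallback". A minimal sketch of how such a schema could be resolved against environment variables follows; the resolve_config_key helper and its call are illustrative only, not part of this commit:

    import json
    import os
    from pathlib import Path

    def resolve_config_key(schema_path: str, key: str) -> str | None:
        """Try the canonical name, then each x-alias, then the x-fallback key, then the schema default."""
        props = json.loads(Path(schema_path).read_text())['properties']
        spec = props[key]
        for name in [key, *spec.get('x-aliases', [])]:
            if name in os.environ:
                return os.environ[name]
        fallback = spec.get('x-fallback')
        if fallback and fallback in os.environ:
            return os.environ[fallback]
        default = spec.get('default')
        return None if default is None else str(default)

    # e.g. HEADERS_TIMEOUT falls back to the global TIMEOUT when only TIMEOUT is set
    # resolve_config_key('archivebox/plugins/headers/config.json', 'HEADERS_TIMEOUT')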
20  archivebox/plugins/htmltotext/config.json  Normal file
@@ -0,0 +1,20 @@
{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "additionalProperties": false,
    "properties": {
        "HTMLTOTEXT_ENABLED": {
            "type": "boolean",
            "default": true,
            "x-aliases": ["SAVE_HTMLTOTEXT", "USE_HTMLTOTEXT"],
            "description": "Enable HTML to text conversion"
        },
        "HTMLTOTEXT_TIMEOUT": {
            "type": "integer",
            "default": 30,
            "minimum": 5,
            "x-fallback": "TIMEOUT",
            "description": "Timeout for HTML to text conversion in seconds"
        }
    }
}
@@ -127,31 +127,28 @@ def extract_htmltotext(url: str) -> tuple[bool, str | None, str]:
def main(url: str, snapshot_id: str):
    """Convert HTML to plain text for search indexing."""

    output = None
    status = 'failed'
    error = ''

    try:
        # Run extraction
        success, output, error = extract_htmltotext(url)
        status = 'succeeded' if success else 'failed'

        if success:
            # Success - emit ArchiveResult
            result = {
                'type': 'ArchiveResult',
                'status': 'succeeded',
                'output_str': output or ''
            }
            print(json.dumps(result))
            sys.exit(0)
        else:
            # Transient error - emit NO JSONL
            print(f'ERROR: {error}', file=sys.stderr)
            sys.exit(1)

    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

    if error:
        print(f'ERROR: {error}', file=sys.stderr)

    # Output clean JSONL (no RESULT_JSON= prefix)
    result = {
        'type': 'ArchiveResult',
        'status': status,
        'output_str': output or error or '',
    }
    print(json.dumps(result))

    sys.exit(0 if status == 'succeeded' else 1)
        # Transient error - emit NO JSONL
        print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':

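The rewritten main() above implements the hook output contract used throughout this commit: success prints exactly one ArchiveResult JSONL line and exits 0, a transient failure prints nothing and exits 1, and a permanent skip prints a skipped ArchiveResult and exits 0. A rough sketch of how a caller might interpret that contract; the run_hook helper is illustrative and not ArchiveBox's actual orchestrator:

    import json
    import subprocess

    def run_hook(cmd: list[str], cwd: str) -> dict:
        """Run an extractor hook and classify the outcome from its stdout and exit code."""
        proc = subprocess.run(cmd, cwd=cwd, capture_output=True, text=True)
        results = []
        for line in proc.stdout.splitlines():
            line = line.strip()
            if not line.startswith('{'):
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            if record.get('type') == 'ArchiveResult':
                results.append(record)
        if results:
            return results[-1]                                      # succeeded or skipped
        if proc.returncode != 0:
            return {'type': 'ArchiveResult', 'status': 'retry'}     # transient: no JSONL, exit 1
        return {'type': 'ArchiveResult', 'status': 'skipped'}       # disabled: no JSONL, exit 0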
@@ -12,7 +12,7 @@ from pathlib import Path
import pytest

PLUGIN_DIR = Path(__file__).parent.parent
HTMLTOTEXT_HOOK = PLUGIN_DIR / 'on_Snapshot__54_htmltotext.py'
HTMLTOTEXT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_htmltotext.*'), None)
TEST_URL = 'https://example.com'

def test_hook_script_exists():
@@ -49,10 +49,11 @@ def test_extracts_text_from_html():
    assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"

    # Verify output file (hook writes to current directory)
    output_file = tmpdir / 'content.txt'
    assert output_file.exists(), "content.txt not created"
    output_file = tmpdir / 'htmltotext.txt'
    assert output_file.exists(), f"htmltotext.txt not created. Files: {list(tmpdir.iterdir())}"
    content = output_file.read_text()
    assert len(content) > 0, "Content should not be empty"
    assert 'Example Domain' in content, "Should contain text from HTML"

def test_fails_gracefully_without_html():
    with tempfile.TemporaryDirectory() as tmpdir:

14  archivebox/plugins/istilldontcareaboutcookies/config.json  Normal file
@@ -0,0 +1,14 @@
{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "additionalProperties": false,
    "required_plugins": ["chrome"],
    "properties": {
        "ISTILLDONTCAREABOUTCOOKIES_ENABLED": {
            "type": "boolean",
            "default": true,
            "x-aliases": ["USE_ISTILLDONTCAREABOUTCOOKIES"],
            "description": "Enable I Still Don't Care About Cookies browser extension"
        }
    }
}
@@ -21,7 +21,7 @@ const path = require('path');
const fs = require('fs');

// Import extension utilities
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
const extensionUtils = require('../chrome/chrome_extension_utils.js');

// Extension metadata
const EXTENSION = {

@@ -14,7 +14,7 @@ import pytest


PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__02_istilldontcareaboutcookies.js"
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_istilldontcareaboutcookies.*'), None)


def test_install_script_exists():

@@ -9,10 +9,10 @@
        "x-aliases": ["SAVE_MEDIA", "USE_MEDIA", "USE_YTDLP", "FETCH_MEDIA"],
        "description": "Enable media downloading with yt-dlp"
    },
    "MEDIA_BINARY": {
    "YTDLP_BINARY": {
        "type": "string",
        "default": "yt-dlp",
        "x-aliases": ["YOUTUBEDL_BINARY", "YTDLP_BINARY", "YOUTUBE_DL_BINARY"],
        "x-aliases": ["YOUTUBEDL_BINARY", "YOUTUBE_DL_BINARY", "MEDIA_BINARY"],
        "description": "Path to yt-dlp binary"
    },
    "MEDIA_TIMEOUT": {
@@ -35,7 +35,7 @@
        "x-aliases": ["YTDLP_CHECK_SSL_VALIDITY"],
        "description": "Whether to verify SSL certificates"
    },
    "MEDIA_ARGS": {
    "YTDLP_ARGS": {
        "type": "array",
        "items": {"type": "string"},
        "default": [
@@ -45,13 +45,13 @@
            "--embed-subs",
            "--write-auto-sub"
        ],
        "x-aliases": ["YTDLP_ARGS"],
        "x-aliases": ["MEDIA_ARGS"],
        "description": "Default yt-dlp arguments"
    },
    "MEDIA_EXTRA_ARGS": {
    "YTDLP_EXTRA_ARGS": {
        "type": "string",
        "default": "",
        "x-aliases": ["YTDLP_EXTRA_ARGS"],
        "x-aliases": ["MEDIA_EXTRA_ARGS"],
        "description": "Extra arguments for yt-dlp (space-separated)"
    }
}

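The rename above swaps canonical names and aliases: YTDLP_BINARY, YTDLP_ARGS, and YTDLP_EXTRA_ARGS become canonical while the MEDIA_* spellings move into x-aliases. A hedged sketch of how the resulting yt-dlp argv might be assembled from these keys; only the two default flags visible in the hunk are used, and everything else is illustrative rather than the plugin's actual command builder:

    import os
    import shlex

    def build_ytdlp_argv(url: str) -> list[str]:
        binary = os.environ.get('YTDLP_BINARY', 'yt-dlp')
        # only the tail of the YTDLP_ARGS default is visible in the hunk above
        args = ['--embed-subs', '--write-auto-sub']
        extra = os.environ.get('YTDLP_EXTRA_ARGS', '')        # space-separated, per the description
        return [binary, *args, *shlex.split(extra), url]

    # build_ytdlp_argv('https://example.com/video.mp4')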
@@ -98,10 +98,10 @@ def save_media(url: str, binary: str) -> tuple[bool, str | None, str]:

    Returns: (success, output_path, error_message)
    """
    # Get config from env (with YTDLP_ prefix or fallback to ARCHIVING_CONFIG style)
    timeout = get_env_int('YTDLP_TIMEOUT') or get_env_int('MEDIA_TIMEOUT') or get_env_int('TIMEOUT', 3600)
    check_ssl = get_env_bool('YTDLP_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
    extra_args = get_env('YTDLP_EXTRA_ARGS') or get_env('YOUTUBEDL_EXTRA_ARGS', '')
    # Get config from env
    timeout = get_env_int('TIMEOUT', 3600)
    check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True)
    extra_args = get_env('YTDLP_EXTRA_ARGS', '')
    media_max_size = get_env('MEDIA_MAX_SIZE', '750m')

    # Output directory is current directory (hook already runs in output dir)
@@ -182,15 +182,11 @@ def save_media(url: str, binary: str) -> tuple[bool, str | None, str]:
def main(url: str, snapshot_id: str):
    """Download media from a URL using yt-dlp."""

    output = None
    status = 'failed'
    error = ''

    try:
        # Check if yt-dlp is enabled
        if not (get_env_bool('USE_YTDLP', True) and get_env_bool('SAVE_MEDIA', True)):
            print('Skipping media (USE_YTDLP=False or SAVE_MEDIA=False)', file=sys.stderr)
            print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'USE_YTDLP=False'}))
        # Check if media downloading is enabled
        if not get_env_bool('MEDIA_ENABLED', True):
            print('Skipping media (MEDIA_ENABLED=False)', file=sys.stderr)
            # Temporary failure (config disabled) - NO JSONL emission
            sys.exit(0)

        # Check if staticfile extractor already handled this (permanent skip)
@@ -200,28 +196,29 @@ def main(url: str, snapshot_id: str):
            sys.exit(0)

        # Get binary from environment
        binary = get_env('YTDLP_BINARY') or get_env('YOUTUBEDL_BINARY', 'yt-dlp')
        binary = get_env('YTDLP_BINARY', 'yt-dlp')

        # Run extraction
        success, output, error = save_media(url, binary)
        status = 'succeeded' if success else 'failed'

        if success:
            # Success - emit ArchiveResult
            result = {
                'type': 'ArchiveResult',
                'status': 'succeeded',
                'output_str': output or ''
            }
            print(json.dumps(result))
            sys.exit(0)
        else:
            # Transient error - emit NO JSONL
            print(f'ERROR: {error}', file=sys.stderr)
            sys.exit(1)

    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

    if error:
        print(f'ERROR: {error}', file=sys.stderr)

    # Output clean JSONL (no RESULT_JSON= prefix)
    result = {
        'type': 'ArchiveResult',
        'status': status,
        'output_str': output or error or '',
    }
    print(json.dumps(result))

    sys.exit(0 if status == 'succeeded' else 1)
        # Transient error - emit NO JSONL
        print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':

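main() also short-circuits when the staticfile extractor already produced output (a "permanent skip"). The check itself is not shown in this hunk; a hypothetical version that matches the described behavior might look like this, where the ../staticfile sibling-directory layout and the non-empty test are assumptions:

    from pathlib import Path

    def staticfile_already_handled(output_dir: Path) -> bool:
        """Assumed layout: sibling extractor dirs under the snapshot dir, e.g. <snapshot>/staticfile/."""
        staticfile_dir = output_dir.parent / 'staticfile'
        return staticfile_dir.is_dir() and any(staticfile_dir.iterdir())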
@@ -21,8 +21,7 @@ import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
MEDIA_HOOK = PLUGIN_DIR / 'on_Snapshot__51_media.py'
|
||||
MEDIA_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_ytdlp.py'
|
||||
MEDIA_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_media.*'), None)
|
||||
TEST_URL = 'https://example.com/video.mp4'
|
||||
|
||||
def test_hook_script_exists():
|
||||
@@ -30,45 +29,6 @@ def test_hook_script_exists():
|
||||
assert MEDIA_HOOK.exists(), f"Hook not found: {MEDIA_HOOK}"
|
||||
|
||||
|
||||
def test_ytdlp_install_hook():
|
||||
"""Test yt-dlp install hook checks for yt-dlp and dependencies (node, ffmpeg)."""
|
||||
# Run yt-dlp install hook
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(MEDIA_INSTALL_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
# Hook exits 0 if all binaries found, 1 if any not found
|
||||
# Parse output for Binary and Dependency records
|
||||
found_binaries = {'node': False, 'ffmpeg': False, 'yt-dlp': False}
|
||||
found_dependencies = {'node': False, 'ffmpeg': False, 'yt-dlp': False}
|
||||
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
pass
|
||||
if line.strip():
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Binary':
|
||||
name = record['name']
|
||||
if name in found_binaries:
|
||||
assert record['abspath'], f"{name} should have abspath"
|
||||
found_binaries[name] = True
|
||||
elif record.get('type') == 'Dependency':
|
||||
name = record['bin_name']
|
||||
if name in found_dependencies:
|
||||
found_dependencies[name] = True
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Each binary should either be found (Binary) or missing (Dependency)
|
||||
for binary_name in ['yt-dlp', 'node', 'ffmpeg']:
|
||||
assert found_binaries[binary_name] or found_dependencies[binary_name], \
|
||||
f"{binary_name} should have either Binary or Dependency record"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify yt-dlp, node, and ffmpeg are available via abx-pkg."""
|
||||
from abx_pkg import Binary, PipProvider, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||
@@ -137,12 +97,12 @@ def test_handles_non_media_url():
|
||||
|
||||
|
||||
def test_config_save_media_false_skips():
|
||||
"""Test that SAVE_MEDIA=False exits without emitting JSONL."""
|
||||
"""Test that MEDIA_ENABLED=False exits without emitting JSONL."""
|
||||
import os
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
env = os.environ.copy()
|
||||
env['SAVE_MEDIA'] = 'False'
|
||||
env['MEDIA_ENABLED'] = 'False'
|
||||
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(MEDIA_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
|
||||
@@ -155,7 +115,7 @@ def test_config_save_media_false_skips():
|
||||
|
||||
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
|
||||
|
||||
# Feature disabled - no JSONL emission, just logs to stderr
|
||||
# Feature disabled - temporary failure, should NOT emit JSONL
|
||||
assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
|
||||
|
||||
# Should NOT emit any JSONL
|
||||
|
||||
@@ -35,6 +35,15 @@ def get_env(name: str, default: str = '') -> str:
    return os.environ.get(name, default).strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    val = get_env(name, '').lower()
    if val in ('true', '1', 'yes', 'on'):
        return True
    if val in ('false', '0', 'no', 'off'):
        return False
    return default


def get_env_int(name: str, default: int = 0) -> int:
    try:
        return int(get_env(name, str(default)))
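get_env_bool mirrors the getEnvBool helpers in the Node hooks: any of true/1/yes/on enables, false/0/no/off disables, and anything else falls back to the default. For example, inside this hook (illustrative usage of the function just added):

    import os

    os.environ['MERCURY_ENABLED'] = 'no'
    assert get_env_bool('MERCURY_ENABLED', True) is False   # explicit off
    os.environ['MERCURY_ENABLED'] = 'garbage'
    assert get_env_bool('MERCURY_ENABLED', True) is True    # unrecognized value -> default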
@@ -105,34 +114,37 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
def main(url: str, snapshot_id: str):
|
||||
"""Extract article content using Postlight's Mercury Parser."""
|
||||
|
||||
output = None
|
||||
status = 'failed'
|
||||
error = ''
|
||||
|
||||
try:
|
||||
# Check if mercury extraction is enabled
|
||||
if not get_env_bool('MERCURY_ENABLED', True):
|
||||
print('Skipping mercury (MERCURY_ENABLED=False)', file=sys.stderr)
|
||||
# Temporary failure (config disabled) - NO JSONL emission
|
||||
sys.exit(0)
|
||||
|
||||
# Get binary from environment
|
||||
binary = get_env('MERCURY_BINARY', 'postlight-parser')
|
||||
|
||||
# Run extraction
|
||||
success, output, error = extract_mercury(url, binary)
|
||||
status = 'succeeded' if success else 'failed'
|
||||
|
||||
if success:
|
||||
# Success - emit ArchiveResult
|
||||
result = {
|
||||
'type': 'ArchiveResult',
|
||||
'status': 'succeeded',
|
||||
'output_str': output or ''
|
||||
}
|
||||
print(json.dumps(result))
|
||||
sys.exit(0)
|
||||
else:
|
||||
# Transient error - emit NO JSONL
|
||||
print(f'ERROR: {error}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
except Exception as e:
|
||||
error = f'{type(e).__name__}: {e}'
|
||||
status = 'failed'
|
||||
|
||||
if error:
|
||||
print(f'ERROR: {error}', file=sys.stderr)
|
||||
|
||||
# Output clean JSONL (no RESULT_JSON= prefix)
|
||||
result = {
|
||||
'type': 'ArchiveResult',
|
||||
'status': status,
|
||||
'output_str': output or error or '',
|
||||
}
|
||||
print(json.dumps(result))
|
||||
|
||||
sys.exit(0 if status == 'succeeded' else 1)
|
||||
# Transient error - emit NO JSONL
|
||||
print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -1,6 +0,0 @@
<!-- Mercury embed - Mercury parser article view -->
<iframe src="{{ output_path }}"
        class="extractor-embed mercury-embed"
        style="width: 100%; height: 100%; min-height: 500px; border: none; background: #fefefe;"
        sandbox="allow-same-origin">
</iframe>
@@ -1,6 +0,0 @@
<!-- Mercury fullscreen - full Mercury parser article -->
<iframe src="{{ output_path }}"
        class="extractor-fullscreen mercury-fullscreen"
        style="width: 100%; height: 100vh; border: none; background: #fefefe;"
        sandbox="allow-same-origin">
</iframe>
@@ -21,8 +21,7 @@ import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
MERCURY_HOOK = PLUGIN_DIR / 'on_Snapshot__53_mercury.py'
|
||||
MERCURY_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_mercury.py'
|
||||
MERCURY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_mercury.*'), None)
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
def test_hook_script_exists():
|
||||
@@ -30,53 +29,6 @@ def test_hook_script_exists():
|
||||
assert MERCURY_HOOK.exists(), f"Hook not found: {MERCURY_HOOK}"
|
||||
|
||||
|
||||
def test_mercury_install_hook():
|
||||
"""Test mercury install hook checks for postlight-parser."""
|
||||
# Run mercury install hook
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(MERCURY_INSTALL_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
|
||||
if result.returncode == 0:
|
||||
# Binary found - verify Binary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
pass
|
||||
if line.strip():
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Binary':
|
||||
assert record['name'] == 'postlight-parser'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_binary, "Should output Binary record when binary found"
|
||||
else:
|
||||
# Binary not found - verify Dependency JSONL output
|
||||
found_dependency = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
pass
|
||||
if line.strip():
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Dependency':
|
||||
assert record['bin_name'] == 'postlight-parser'
|
||||
assert 'npm' in record['bin_providers']
|
||||
found_dependency = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_dependency, "Should output Dependency record when binary not found"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify postlight-parser is available via abx-pkg."""
|
||||
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
|
||||
@@ -147,12 +99,12 @@ def test_extracts_with_mercury_parser():
|
||||
assert len(content) > 0, "Output should not be empty"
|
||||
|
||||
def test_config_save_mercury_false_skips():
|
||||
"""Test that SAVE_MERCURY=False exits without emitting JSONL."""
|
||||
"""Test that MERCURY_ENABLED=False exits without emitting JSONL."""
|
||||
import os
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
env = os.environ.copy()
|
||||
env['SAVE_MERCURY'] = 'False'
|
||||
env['MERCURY_ENABLED'] = 'False'
|
||||
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(MERCURY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
|
||||
@@ -165,7 +117,7 @@ def test_config_save_mercury_false_skips():
|
||||
|
||||
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
|
||||
|
||||
# Feature disabled - no JSONL emission, just logs to stderr
|
||||
# Feature disabled - temporary failure, should NOT emit JSONL
|
||||
assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
|
||||
|
||||
# Should NOT emit any JSONL
|
||||
@@ -174,7 +126,7 @@ def test_config_save_mercury_false_skips():
|
||||
|
||||
|
||||
def test_fails_gracefully_without_html():
|
||||
"""Test that mercury fails gracefully when no HTML source exists."""
|
||||
"""Test that mercury works even without HTML source (fetches URL directly)."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(MERCURY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
|
||||
@@ -184,13 +136,12 @@ def test_fails_gracefully_without_html():
|
||||
timeout=30
|
||||
)
|
||||
|
||||
# Should exit with non-zero or emit failure JSONL
|
||||
# Mercury fetches URL directly with postlight-parser, doesn't need HTML source
|
||||
# Parse clean JSONL output
|
||||
result_json = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
pass
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
@@ -199,9 +150,9 @@ def test_fails_gracefully_without_html():
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
if result_json:
|
||||
# Should report failure or skip since no HTML source
|
||||
assert result_json['status'] in ['failed', 'skipped'], f"Should fail or skip without HTML: {result_json}"
|
||||
# Mercury should succeed or fail based on network, not based on HTML source
|
||||
assert result_json, "Should emit ArchiveResult"
|
||||
assert result_json['status'] in ['succeeded', 'failed'], f"Should succeed or fail: {result_json}"
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
|
||||
20  archivebox/plugins/merkletree/config.json  Normal file
@@ -0,0 +1,20 @@
{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "additionalProperties": false,
    "properties": {
        "MERKLETREE_ENABLED": {
            "type": "boolean",
            "default": true,
            "x-aliases": ["SAVE_MERKLETREE", "USE_MERKLETREE"],
            "description": "Enable merkle tree hash generation"
        },
        "MERKLETREE_TIMEOUT": {
            "type": "integer",
            "default": 30,
            "minimum": 5,
            "x-fallback": "TIMEOUT",
            "description": "Timeout for merkle tree generation in seconds"
        }
    }
}
@@ -132,11 +132,11 @@ def main(url: str, snapshot_id: str):

    try:
        # Check if enabled
        save_merkletree = os.getenv('SAVE_MERKLETREE', 'true').lower() in ('true', '1', 'yes', 'on')
        save_merkletree = os.getenv('MERKLETREE_ENABLED', 'true').lower() in ('true', '1', 'yes', 'on')

        if not save_merkletree:
            status = 'skipped'
            click.echo(json.dumps({'status': status, 'output': 'SAVE_MERKLETREE=false'}))
            click.echo(json.dumps({'status': status, 'output': 'MERKLETREE_ENABLED=false'}))
            sys.exit(0)

        # Working directory is the extractor output dir (e.g., <snapshot>/merkletree/)

0  archivebox/plugins/merkletree/templates/icon.html  Normal file
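MERKLETREE_ENABLED gates a hook that hashes the snapshot's output files. The plugin's exact tree construction is not shown in this commit; as a rough sketch of the general technique only (the file ordering, hash function, and odd-node pairing rule are assumptions):

    import hashlib
    from pathlib import Path

    def merkle_root(directory: Path) -> str:
        """Sketch: sha256 each file, then pairwise-combine levels until one root remains."""
        leaves = [hashlib.sha256(p.read_bytes()).hexdigest()
                  for p in sorted(directory.rglob('*')) if p.is_file()]
        if not leaves:
            return hashlib.sha256(b'').hexdigest()
        level = leaves
        while len(level) > 1:
            if len(level) % 2:
                level.append(level[-1])                    # duplicate last node on odd levels
            level = [hashlib.sha256((a + b).encode()).hexdigest()
                     for a, b in zip(level[::2], level[1::2])]
        return level[0]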
@@ -2,7 +2,7 @@
"""
Install a binary using npm package manager.

Usage: on_Dependency__install_using_npm_provider.py --binary-id=<uuid> --name=<name> [--custom-cmd=<cmd>]
Usage: on_Binary__install_using_npm_provider.py --binary-id=<uuid> --machine-id=<uuid> --name=<name> [--custom-cmd=<cmd>]
Output: Binary JSONL record to stdout after installation

Environment variables:
@@ -72,7 +72,7 @@ def main(binary_id: str, machine_id: str, name: str, binproviders: str, custom_cmd: str):
        'sha256': binary.sha256 or '',
        'binprovider': 'npm',
        'machine_id': machine_id,
        'dependency_id': dependency_id,
        'binary_id': binary_id,
    }
    print(json.dumps(record))


0  archivebox/plugins/npm/templates/icon.html  Normal file
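The npm provider hook is renamed to on_Binary__install_using_npm_provider.py, now takes --machine-id, and reports machine_id/binary_id in its Binary record. A hedged example of calling it per the Usage line above and reading that record back; the path, IDs, and chosen package are placeholders, and additional flags (e.g. a --binproviders option) may also apply:

    import json
    import subprocess
    import sys
    import uuid

    cmd = [
        sys.executable, 'archivebox/plugins/npm/on_Binary__install_using_npm_provider.py',
        '--binary-id', str(uuid.uuid4()),
        '--machine-id', str(uuid.uuid4()),
        '--name', 'postlight-parser',
    ]
    proc = subprocess.run(cmd, capture_output=True, text=True, timeout=600)

    binary_record = None
    for line in proc.stdout.splitlines():
        line = line.strip()
        if not line.startswith('{'):
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue
        if record.get('type') == 'Binary':
            binary_record = record
            break

    if binary_record:
        print(binary_record['name'], binary_record['abspath'], binary_record['machine_id'])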
@@ -71,7 +71,7 @@ def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
Returns: (success, output_path, error_message)
|
||||
"""
|
||||
# Get config from env
|
||||
timeout = get_env_int('PAPERSDL_TIMEOUT') or get_env_int('TIMEOUT', 300)
|
||||
timeout = get_env_int('TIMEOUT', 300)
|
||||
extra_args = get_env('PAPERSDL_EXTRA_ARGS', '')
|
||||
|
||||
# Output directory is current directory (hook already runs in output dir)
|
||||
@@ -140,9 +140,9 @@ def main(url: str, snapshot_id: str):
|
||||
|
||||
try:
|
||||
# Check if papers-dl is enabled
|
||||
if not get_env_bool('SAVE_PAPERSDL', True):
|
||||
print('Skipping papers-dl (SAVE_PAPERSDL=False)', file=sys.stderr)
|
||||
# Feature disabled - no ArchiveResult, just exit
|
||||
if not get_env_bool('PAPERSDL_ENABLED', True):
|
||||
print('Skipping papers-dl (PAPERSDL_ENABLED=False)', file=sys.stderr)
|
||||
# Temporary failure (config disabled) - NO JSONL emission
|
||||
sys.exit(0)
|
||||
|
||||
# Get binary from environment
|
||||
@@ -150,24 +150,25 @@ def main(url: str, snapshot_id: str):
|
||||
|
||||
# Run extraction
|
||||
success, output, error = save_paper(url, binary)
|
||||
status = 'succeeded' if success else 'failed'
|
||||
|
||||
if success:
|
||||
# Success - emit ArchiveResult
|
||||
result = {
|
||||
'type': 'ArchiveResult',
|
||||
'status': 'succeeded',
|
||||
'output_str': output or ''
|
||||
}
|
||||
print(json.dumps(result))
|
||||
sys.exit(0)
|
||||
else:
|
||||
# Transient error - emit NO JSONL
|
||||
print(f'ERROR: {error}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
except Exception as e:
|
||||
error = f'{type(e).__name__}: {e}'
|
||||
status = 'failed'
|
||||
|
||||
if error:
|
||||
print(f'ERROR: {error}', file=sys.stderr)
|
||||
|
||||
# Output clean JSONL (no RESULT_JSON= prefix)
|
||||
result = {
|
||||
'type': 'ArchiveResult',
|
||||
'status': status,
|
||||
'output_str': output or error or '',
|
||||
}
|
||||
print(json.dumps(result))
|
||||
|
||||
sys.exit(0 if status == 'succeeded' else 1)
|
||||
# Transient error - emit NO JSONL
|
||||
print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -21,8 +21,7 @@ import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
PAPERSDL_HOOK = PLUGIN_DIR / 'on_Snapshot__54_papersdl.py'
|
||||
PAPERSDL_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_papersdl.py'
|
||||
PAPERSDL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_papersdl.*'), None)
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
# Module-level cache for binary path
|
||||
@@ -34,55 +33,51 @@ def get_papersdl_binary_path():
|
||||
if _papersdl_binary_path:
|
||||
return _papersdl_binary_path
|
||||
|
||||
# Run install hook to find or install binary
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(PAPERSDL_INSTALL_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300
|
||||
)
|
||||
# Try to find papers-dl binary using abx-pkg
|
||||
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
# Check if binary was found
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Binary' and record.get('name') == 'papers-dl':
|
||||
_papersdl_binary_path = record.get('abspath')
|
||||
return _papersdl_binary_path
|
||||
elif record.get('type') == 'Dependency' and record.get('bin_name') == 'papers-dl':
|
||||
# Need to install via pip hook
|
||||
pip_hook = PLUGINS_ROOT / 'pip' / 'on_Binary__install_using_pip_provider.py'
|
||||
dependency_id = str(uuid.uuid4())
|
||||
try:
|
||||
binary = Binary(
|
||||
name='papers-dl',
|
||||
binproviders=[PipProvider(), EnvProvider()]
|
||||
).load()
|
||||
|
||||
# Build command with overrides if present
|
||||
cmd = [
|
||||
sys.executable, str(pip_hook),
|
||||
'--dependency-id', dependency_id,
|
||||
'--bin-name', record['bin_name']
|
||||
]
|
||||
if 'overrides' in record:
|
||||
cmd.extend(['--overrides', json.dumps(record['overrides'])])
|
||||
if binary and binary.abspath:
|
||||
_papersdl_binary_path = str(binary.abspath)
|
||||
return _papersdl_binary_path
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
install_result = subprocess.run(
|
||||
cmd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300
|
||||
)
|
||||
# If not found, try to install via pip
|
||||
pip_hook = PLUGINS_ROOT / 'pip' / 'on_Binary__install_using_pip_provider.py'
|
||||
if pip_hook.exists():
|
||||
binary_id = str(uuid.uuid4())
|
||||
machine_id = str(uuid.uuid4())
|
||||
|
||||
# Parse Binary from pip installation
|
||||
for install_line in install_result.stdout.strip().split('\n'):
|
||||
if install_line.strip():
|
||||
try:
|
||||
install_record = json.loads(install_line)
|
||||
if install_record.get('type') == 'Binary' and install_record.get('name') == 'papers-dl':
|
||||
_papersdl_binary_path = install_record.get('abspath')
|
||||
return _papersdl_binary_path
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
cmd = [
|
||||
sys.executable, str(pip_hook),
|
||||
'--binary-id', binary_id,
|
||||
'--machine-id', machine_id,
|
||||
'--name', 'papers-dl'
|
||||
]
|
||||
|
||||
install_result = subprocess.run(
|
||||
cmd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300
|
||||
)
|
||||
|
||||
# Parse Binary from pip installation
|
||||
for install_line in install_result.stdout.strip().split('\n'):
|
||||
if install_line.strip():
|
||||
try:
|
||||
install_record = json.loads(install_line)
|
||||
if install_record.get('type') == 'Binary' and install_record.get('name') == 'papers-dl':
|
||||
_papersdl_binary_path = install_record.get('abspath')
|
||||
return _papersdl_binary_path
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
@@ -91,40 +86,6 @@ def test_hook_script_exists():
|
||||
assert PAPERSDL_HOOK.exists(), f"Hook not found: {PAPERSDL_HOOK}"
|
||||
|
||||
|
||||
def test_papersdl_install_hook():
|
||||
"""Test papers-dl install hook checks for papers-dl."""
|
||||
# Run papers-dl install hook
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(PAPERSDL_INSTALL_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
# Hook exits 0 if all binaries found, 1 if any not found
|
||||
# Parse output for Binary and Dependency records
|
||||
found_binary = False
|
||||
found_dependency = False
|
||||
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Binary':
|
||||
if record['name'] == 'papers-dl':
|
||||
assert record['abspath'], "papers-dl should have abspath"
|
||||
found_binary = True
|
||||
elif record.get('type') == 'Dependency':
|
||||
if record['bin_name'] == 'papers-dl':
|
||||
found_dependency = True
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# papers-dl should either be found (Binary) or missing (Dependency)
|
||||
assert found_binary or found_dependency, \
|
||||
"papers-dl should have either Binary or Dependency record"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify papers-dl is installed by calling the REAL installation hooks."""
|
||||
binary_path = get_papersdl_binary_path()
|
||||
@@ -176,12 +137,12 @@ def test_handles_non_paper_url():
|
||||
|
||||
|
||||
def test_config_save_papersdl_false_skips():
|
||||
"""Test that SAVE_PAPERSDL=False exits without emitting JSONL."""
|
||||
"""Test that PAPERSDL_ENABLED=False exits without emitting JSONL."""
|
||||
import os
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
env = os.environ.copy()
|
||||
env['SAVE_PAPERSDL'] = 'False'
|
||||
env['PAPERSDL_ENABLED'] = 'False'
|
||||
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(PAPERSDL_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
|
||||
@@ -194,7 +155,7 @@ def test_config_save_papersdl_false_skips():
|
||||
|
||||
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
|
||||
|
||||
# Feature disabled - no JSONL emission, just logs to stderr
|
||||
# Feature disabled - temporary failure, should NOT emit JSONL
|
||||
assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
|
||||
|
||||
# Should NOT emit any JSONL
|
||||
|
||||
21  archivebox/plugins/parse_dom_outlinks/config.json  Normal file
@@ -0,0 +1,21 @@
{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "additionalProperties": false,
    "required_plugins": ["chrome"],
    "properties": {
        "PARSE_DOM_OUTLINKS_ENABLED": {
            "type": "boolean",
            "default": true,
            "x-aliases": ["SAVE_DOM_OUTLINKS", "USE_PARSE_DOM_OUTLINKS"],
            "description": "Enable DOM outlinks parsing from archived pages"
        },
        "PARSE_DOM_OUTLINKS_TIMEOUT": {
            "type": "integer",
            "default": 30,
            "minimum": 5,
            "x-fallback": "TIMEOUT",
            "description": "Timeout for DOM outlinks parsing in seconds"
        }
    }
}
@@ -15,7 +15,7 @@
 * Output: Writes parse_dom_outlinks/outlinks.json and parse_dom_outlinks/urls.jsonl
 *
 * Environment variables:
 * SAVE_DOM_OUTLINKS: Enable DOM outlinks extraction (default: true)
 * PARSE_DOM_OUTLINKS_ENABLED: Enable DOM outlinks extraction (default: true)
 */

const fs = require('fs');
@@ -225,13 +225,13 @@ async function main() {

    try {
        // Check if enabled
        if (!getEnvBool('SAVE_DOM_OUTLINKS', true)) {
            console.log('Skipping DOM outlinks (SAVE_DOM_OUTLINKS=False)');
        if (!getEnvBool('PARSE_DOM_OUTLINKS_ENABLED', true)) {
            console.log('Skipping DOM outlinks (PARSE_DOM_OUTLINKS_ENABLED=False)');
            // Output clean JSONL (no RESULT_JSON= prefix)
            console.log(JSON.stringify({
                type: 'ArchiveResult',
                status: 'skipped',
                output_str: 'SAVE_DOM_OUTLINKS=False',
                output_str: 'PARSE_DOM_OUTLINKS_ENABLED=False',
            }));
            process.exit(0);
        }

13  archivebox/plugins/parse_html_urls/config.json  Normal file
@@ -0,0 +1,13 @@
{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "additionalProperties": false,
    "properties": {
        "PARSE_HTML_URLS_ENABLED": {
            "type": "boolean",
            "default": true,
            "x-aliases": ["USE_PARSE_HTML_URLS"],
            "description": "Enable HTML URL parsing"
        }
    }
}
@@ -9,7 +9,7 @@ from pathlib import Path
import pytest

PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_html_urls.py'), None)
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_html_urls.*'), None)


class TestParseHtmlUrls:

13  archivebox/plugins/parse_jsonl_urls/config.json  Normal file
@@ -0,0 +1,13 @@
{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "additionalProperties": false,
    "properties": {
        "PARSE_JSONL_URLS_ENABLED": {
            "type": "boolean",
            "default": true,
            "x-aliases": ["USE_PARSE_JSONL_URLS"],
            "description": "Enable JSON Lines URL parsing"
        }
    }
}
@@ -9,7 +9,7 @@ from pathlib import Path
import pytest

PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_jsonl_urls.py'), None)
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_jsonl_urls.*'), None)


class TestParseJsonlUrls:

13  archivebox/plugins/parse_netscape_urls/config.json  Normal file
@@ -0,0 +1,13 @@
{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "additionalProperties": false,
    "properties": {
        "PARSE_NETSCAPE_URLS_ENABLED": {
            "type": "boolean",
            "default": true,
            "x-aliases": ["USE_PARSE_NETSCAPE_URLS"],
            "description": "Enable Netscape bookmarks HTML URL parsing"
        }
    }
}
@@ -9,7 +9,7 @@ from pathlib import Path
import pytest

PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_netscape_urls.py'), None)
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_netscape_urls.*'), None)


class TestParseNetscapeUrls:

@@ -10,7 +10,7 @@ from pathlib import Path
import pytest

PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_netscape_urls.py'), None)
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_netscape_urls.*'), None)


class TestFirefoxFormat:
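The parser tests below repeatedly split hook stdout into Tag and Snapshot JSONL records. A small helper in that spirit; it is illustrative and not part of the test suite:

    import json

    def split_records(stdout: str) -> tuple[list[dict], list[dict]]:
        """Partition a parser hook's stdout JSONL into (tags, snapshots)."""
        records = []
        for line in stdout.strip().split('\n'):
            line = line.strip()
            if not line.startswith('{'):
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                pass
        tags = [r for r in records if r.get('type') == 'Tag']
        snapshots = [r for r in records if r.get('type') == 'Snapshot']
        return tags, snapshots

    # tags, snapshots = split_records(result.stdout)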
@@ -719,10 +719,11 @@ class TestEdgeCases:
|
||||
# Document current behavior
|
||||
if result.returncode == 0:
|
||||
# Output goes to stdout (JSONL)
|
||||
if output_file.exists():
|
||||
content = result.stdout.strip()
|
||||
if content:
|
||||
entry = json.loads(content)
|
||||
content = result.stdout.strip()
|
||||
if content:
|
||||
lines = [line for line in content.split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
|
||||
if lines:
|
||||
entry = json.loads(lines[0])
|
||||
assert 'example.com' in entry['url']
|
||||
|
||||
def test_missing_add_date(self, tmp_path):
|
||||
@@ -763,8 +764,11 @@ class TestEdgeCases:
|
||||
)
|
||||
|
||||
# Current regex requires non-empty title [^<]+
|
||||
# Document current behavior
|
||||
assert result.returncode == 1
|
||||
# Parser emits skipped ArchiveResult when no valid bookmarks found
|
||||
assert result.returncode == 0
|
||||
result_json = json.loads(result.stdout.strip())
|
||||
assert result_json['type'] == 'ArchiveResult'
|
||||
assert result_json['status'] == 'skipped'
|
||||
|
||||
def test_special_chars_in_url(self, tmp_path):
|
||||
"""Test URLs with special characters."""
|
||||
@@ -900,7 +904,7 @@ class TestEdgeCases:
|
||||
|
||||
assert result.returncode == 0
|
||||
# Output goes to stdout (JSONL)
|
||||
lines = output_file.read_text(encoding='utf-8').strip().split('\n')
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
|
||||
entries = [json.loads(line) for line in lines]
|
||||
|
||||
assert len(entries) == 5
|
||||
@@ -933,12 +937,13 @@ class TestEdgeCases:
|
||||
assert result.returncode == 0
|
||||
assert 'Found 1000 URLs' in result.stdout
|
||||
|
||||
# Output goes to stdout (JSONL)
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
|
||||
# Output goes to stdout (JSONL) - get all JSONL records
|
||||
all_lines = [line for line in result.stdout.strip().split('\n') if line.strip() and line.startswith('{')]
|
||||
records = [json.loads(line) for line in all_lines]
|
||||
|
||||
# Should have 10 unique tags + 1000 snapshots
|
||||
tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
|
||||
snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
|
||||
tags = [r for r in records if r.get('type') == 'Tag']
|
||||
snapshots = [r for r in records if r.get('type') == 'Snapshot']
|
||||
|
||||
assert len(tags) == 10
|
||||
assert len(snapshots) == 1000
|
||||
|
||||
13  archivebox/plugins/parse_rss_urls/config.json  Normal file
@@ -0,0 +1,13 @@
{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "additionalProperties": false,
    "properties": {
        "PARSE_RSS_URLS_ENABLED": {
            "type": "boolean",
            "default": true,
            "x-aliases": ["USE_PARSE_RSS_URLS"],
            "description": "Enable RSS/Atom feed URL parsing"
        }
    }
}
@@ -9,7 +9,7 @@ from pathlib import Path
import pytest

PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_rss_urls.py'), None)
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_rss_urls.*'), None)


class TestParseRssUrls:

@@ -9,7 +9,7 @@ from pathlib import Path
import pytest

PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_rss_urls.py'), None)
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_rss_urls.*'), None)


class TestRssVariants:
@@ -172,14 +172,14 @@ class TestAtomVariants:
|
||||
|
||||
assert result.returncode == 0
|
||||
# Output goes to stdout (JSONL)
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
|
||||
|
||||
tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
|
||||
tags = [json.loads(line) for line in lines if json.loads(line).get('type') == 'Tag']
|
||||
tag_names = {t['name'] for t in tags}
|
||||
assert 'science' in tag_names
|
||||
assert 'research' in tag_names
|
||||
|
||||
snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
|
||||
snapshots = [json.loads(line) for line in lines if json.loads(line).get('type') == 'Snapshot']
|
||||
entry = snapshots[0]
|
||||
assert entry['url'] == 'https://atom.example.com/1'
|
||||
assert 'bookmarked_at' in entry
|
||||
@@ -384,15 +384,15 @@ class TestTagsAndCategories:
|
||||
|
||||
assert result.returncode == 0
|
||||
# Output goes to stdout (JSONL)
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
|
||||
|
||||
tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
|
||||
tags = [json.loads(line) for line in lines if json.loads(line).get('type') == 'Tag']
|
||||
tag_names = {t['name'] for t in tags}
|
||||
assert 'Tech' in tag_names
|
||||
assert 'Web' in tag_names
|
||||
assert 'Programming' in tag_names
|
||||
|
||||
snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
|
||||
snapshots = [json.loads(line) for line in lines if json.loads(line).get('type') == 'Snapshot']
|
||||
entry = snapshots[0]
|
||||
tags_list = entry['tags'].split(',')
|
||||
assert len(tags_list) == 3
|
||||
@@ -421,9 +421,9 @@ class TestTagsAndCategories:
|
||||
|
||||
assert result.returncode == 0
|
||||
# Output goes to stdout (JSONL)
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
|
||||
|
||||
tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
|
||||
tags = [json.loads(line) for line in lines if json.loads(line).get('type') == 'Tag']
|
||||
tag_names = {t['name'] for t in tags}
|
||||
# feedparser extracts the 'term' attribute
|
||||
assert 'python' in tag_names
|
||||
@@ -482,8 +482,8 @@ class TestTagsAndCategories:
|
||||
|
||||
assert result.returncode == 0
|
||||
# Output goes to stdout (JSONL)
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
|
||||
tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
|
||||
tags = [json.loads(line) for line in lines if json.loads(line).get('type') == 'Tag']
|
||||
# Tag records should be unique
|
||||
tag_names = [t['name'] for t in tags]
|
||||
assert tag_names.count('Python') == 1
|
||||
@@ -720,9 +720,9 @@ class TestEdgeCases:
|
||||
|
||||
assert result.returncode == 0
|
||||
# Output goes to stdout (JSONL)
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
|
||||
|
||||
tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
|
||||
tags = [json.loads(line) for line in lines if json.loads(line).get('type') == 'Tag']
|
||||
tag_names = {t['name'] for t in tags}
|
||||
assert 'C++' in tag_names
|
||||
assert 'Node.js' in tag_names
|
||||
@@ -814,7 +814,7 @@ class TestEdgeCases:
|
||||
|
||||
assert result.returncode == 0
|
||||
# Output goes to stdout (JSONL)
|
||||
lines = output_file.read_text(encoding='utf-8').strip().split('\n')
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
|
||||
|
||||
snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
|
||||
entry = snapshots[0]
|
||||
@@ -885,11 +885,11 @@ class TestEdgeCases:
|
||||
assert 'Found 100 URLs' in result.stdout
|
||||
|
||||
# Output goes to stdout (JSONL)
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
|
||||
lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
|
||||
|
||||
# Should have 10 unique tags (Tag0-Tag9) + 100 snapshots
|
||||
tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
|
||||
snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
|
||||
tags = [json.loads(line) for line in lines if json.loads(line).get('type') == 'Tag']
|
||||
snapshots = [json.loads(line) for line in lines if json.loads(line).get('type') == 'Snapshot']
|
||||
|
||||
assert len(tags) == 10
|
||||
assert len(snapshots) == 100
|
||||
|
||||
File diff suppressed because it is too large
13  archivebox/plugins/parse_txt_urls/config.json  Normal file
@@ -0,0 +1,13 @@
{
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "additionalProperties": false,
    "properties": {
        "PARSE_TXT_URLS_ENABLED": {
            "type": "boolean",
            "default": true,
            "x-aliases": ["USE_PARSE_TXT_URLS"],
            "description": "Enable plain text URL parsing"
        }
    }
}
@@ -9,7 +9,7 @@ from pathlib import Path
import pytest

PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_txt_urls.py'), None)
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_txt_urls.*'), None)


class TestParseTxtUrls:

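All of the parse_* plugins above emit the same record shapes on stdout. As an illustration only, with made-up field values but field names matching what the tests assert (a Tag record with 'name', a Snapshot record with 'url', comma-separated 'tags', and 'bookmarked_at'); other fields may also be present:

    import json

    print(json.dumps({'type': 'Tag', 'name': 'Tech'}))
    print(json.dumps({
        'type': 'Snapshot',
        'url': 'https://example.com/article',
        'tags': 'Tech,Web',
        'bookmarked_at': '2024-01-01T00:00:00+00:00',
    }))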
@@ -15,8 +15,29 @@
 * CHROME_USER_AGENT: User agent string (optional)
 * CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
 * CHROME_HEADLESS: Run in headless mode (default: true)
 * PDF_ENABLED: Enable PDF generation (default: true)
 */

// Get environment variable with default
function getEnv(name, defaultValue = '') {
    return (process.env[name] || defaultValue).trim();
}

function getEnvBool(name, defaultValue = false) {
    const val = getEnv(name, '').toLowerCase();
    if (['true', '1', 'yes', 'on'].includes(val)) return true;
    if (['false', '0', 'no', 'off'].includes(val)) return false;
    return defaultValue;
}

// Check if PDF is enabled BEFORE requiring puppeteer
if (!getEnvBool('PDF_ENABLED', true)) {
    console.error('Skipping PDF (PDF_ENABLED=False)');
    // Temporary failure (config disabled) - NO JSONL emission
    process.exit(0);
}

// Now safe to require puppeteer
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
@@ -39,18 +60,6 @@ function parseArgs() {
    return args;
}

// Get environment variable with default
function getEnv(name, defaultValue = '') {
    return (process.env[name] || defaultValue).trim();
}

function getEnvBool(name, defaultValue = false) {
    const val = getEnv(name, '').toLowerCase();
    if (['true', '1', 'yes', 'on'].includes(val)) return true;
    if (['false', '0', 'no', 'off'].includes(val)) return false;
    return defaultValue;
}

function getEnvInt(name, defaultValue = 0) {
    const val = parseInt(getEnv(name, String(defaultValue)), 10);
    return isNaN(val) ? defaultValue : val;
@@ -237,62 +246,51 @@ async function main() {
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const startTs = new Date();
|
||||
let status = 'failed';
|
||||
let output = null;
|
||||
let error = '';
|
||||
|
||||
try {
|
||||
// Check if staticfile extractor already handled this (permanent skip)
|
||||
if (hasStaticFileOutput()) {
|
||||
console.log(`Skipping PDF - staticfile extractor already downloaded this`);
|
||||
// Output clean JSONL (no RESULT_JSON= prefix)
|
||||
console.error(`Skipping PDF - staticfile extractor already downloaded this`);
|
||||
// Permanent skip - emit ArchiveResult
|
||||
console.log(JSON.stringify({
|
||||
type: 'ArchiveResult',
|
||||
status: 'skipped',
|
||||
output_str: 'staticfile already handled',
|
||||
}));
|
||||
process.exit(0); // Permanent skip - staticfile already handled
|
||||
} else {
|
||||
// Only wait for page load if using shared Chrome session
|
||||
const cdpUrl = getCdpUrl();
|
||||
if (cdpUrl) {
|
||||
// Wait for page to be fully loaded
|
||||
const pageLoaded = await waitForChromeTabLoaded(60000);
|
||||
if (!pageLoaded) {
|
||||
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
|
||||
}
|
||||
}
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
const result = await printToPdf(url);
|
||||
|
||||
if (result.success) {
|
||||
status = 'succeeded';
|
||||
output = result.output;
|
||||
const size = fs.statSync(output).size;
|
||||
console.log(`PDF saved (${size} bytes)`);
|
||||
} else {
|
||||
status = 'failed';
|
||||
error = result.error;
|
||||
// Only wait for page load if using shared Chrome session
|
||||
const cdpUrl = getCdpUrl();
|
||||
if (cdpUrl) {
|
||||
// Wait for page to be fully loaded
|
||||
const pageLoaded = await waitForChromeTabLoaded(60000);
|
||||
if (!pageLoaded) {
|
||||
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
|
||||
}
|
||||
}
|
||||
|
||||
const result = await printToPdf(url);
|
||||
|
||||
if (result.success) {
|
||||
// Success - emit ArchiveResult
|
||||
const size = fs.statSync(result.output).size;
|
||||
console.error(`PDF saved (${size} bytes)`);
|
||||
console.log(JSON.stringify({
|
||||
type: 'ArchiveResult',
|
||||
status: 'succeeded',
|
||||
output_str: result.output,
|
||||
}));
|
||||
process.exit(0);
|
||||
} else {
|
||||
// Transient error - emit NO JSONL
|
||||
console.error(`ERROR: ${result.error}`);
|
||||
process.exit(1);
|
||||
}
|
||||
} catch (e) {
|
||||
error = `${e.name}: ${e.message}`;
|
||||
status = 'failed';
|
||||
// Transient error - emit NO JSONL
|
||||
console.error(`ERROR: ${e.name}: ${e.message}`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const endTs = new Date();
|
||||
|
||||
if (error) console.error(`ERROR: ${error}`);
|
||||
|
||||
// Output clean JSONL (no RESULT_JSON= prefix)
|
||||
console.log(JSON.stringify({
|
||||
type: 'ArchiveResult',
|
||||
status,
|
||||
output_str: output || error || '',
|
||||
}));
|
||||
|
||||
process.exit(status === 'succeeded' ? 0 : 1);
|
||||
}
|
||||
|
||||
main().catch(e => {
|
||||
|
||||
@@ -23,8 +23,7 @@ import pytest

PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
PDF_HOOK = PLUGIN_DIR / 'on_Snapshot__35_pdf.js'
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_chrome_install.py'
PDF_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_pdf.*'), None)
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py'
TEST_URL = 'https://example.com'

@@ -34,70 +33,6 @@ def test_hook_script_exists():
assert PDF_HOOK.exists(), f"Hook not found: {PDF_HOOK}"

def test_chrome_validation_and_install():
"""Test chrome install hook to install puppeteer-core if needed."""
# Run chrome install hook (from chrome plugin)
result = subprocess.run(
[sys.executable, str(CHROME_INSTALL_HOOK)],
capture_output=True,
text=True,
timeout=30
)

# If exit 1, binary not found - need to install
if result.returncode == 1:
# Parse Dependency request from JSONL
dependency_request = None
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
dependency_request = record
break
except json.JSONDecodeError:
pass

if dependency_request:
bin_name = dependency_request['bin_name']
bin_providers = dependency_request['bin_providers']

# Install via npm provider hook
install_result = subprocess.run(
[
sys.executable,
str(NPM_PROVIDER_HOOK),
'--dependency-id', 'test-dep-001',
'--bin-name', bin_name,
'--bin-providers', bin_providers
],
capture_output=True,
text=True,
timeout=600
)

assert install_result.returncode == 0, f"Install failed: {install_result.stderr}"

# Verify installation via JSONL output
for line in install_result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary':
assert record['name'] == bin_name
assert record['abspath']
break
except json.JSONDecodeError:
pass
else:
# Binary already available, verify via JSONL output
assert result.returncode == 0, f"Validation failed: {result.stderr}"

def test_verify_deps_with_abx_pkg():
"""Verify dependencies are available via abx-pkg after hook installation."""
from abx_pkg import Binary, EnvProvider, BinProviderOverrides
@@ -166,17 +101,13 @@ def test_extracts_pdf_from_example_com():

def test_config_save_pdf_false_skips():
"""Test that SAVE_PDF config is honored (Note: currently not implemented in hook)."""
"""Test that PDF_ENABLED=False exits without emitting JSONL."""
import os

# NOTE: The pdf hook doesn't currently check SAVE_PDF env var,
# so this test just verifies it runs without errors.
# TODO: Implement SAVE_PDF check in hook

with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
env = os.environ.copy()
env['SAVE_PDF'] = 'False'
env['PDF_ENABLED'] = 'False'

result = subprocess.run(
['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
@@ -184,11 +115,17 @@ def test_config_save_pdf_false_skips():
capture_output=True,
text=True,
env=env,
timeout=120
timeout=30
)

# Hook currently ignores SAVE_PDF, so it will run normally
assert result.returncode in (0, 1), "Should complete without hanging"
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"

# Feature disabled - temporary failure, should NOT emit JSONL
assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"

# Should NOT emit any JSONL
jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"

def test_reports_missing_chrome():
0
archivebox/plugins/pip/templates/icon.html
Normal file
@@ -123,34 +123,31 @@ def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
def main(url: str, snapshot_id: str):
"""Extract article content using Mozilla's Readability."""

output = None
status = 'failed'
error = ''

try:
# Get binary from environment
binary = get_env('READABILITY_BINARY', 'readability-extractor')

# Run extraction
success, output, error = extract_readability(url, binary)
status = 'succeeded' if success else 'failed'

if success:
# Success - emit ArchiveResult
result = {
'type': 'ArchiveResult',
'status': 'succeeded',
'output_str': output or ''
}
print(json.dumps(result))
sys.exit(0)
else:
# Transient error - emit NO JSONL
print(f'ERROR: {error}', file=sys.stderr)
sys.exit(1)

except Exception as e:
error = f'{type(e).__name__}: {e}'
status = 'failed'

if error:
print(f'ERROR: {error}', file=sys.stderr)

# Output clean JSONL (no RESULT_JSON= prefix)
result = {
'type': 'ArchiveResult',
'status': status,
'output_str': output or error or '',
}
print(json.dumps(result))

sys.exit(0 if status == 'succeeded' else 1)
# Transient error - emit NO JSONL
print(f'ERROR: {type(e).__name__}: {e}', file=sys.stderr)
sys.exit(1)

if __name__ == '__main__':
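The hunk above captures the convention this commit applies across extractor hooks: a permanent outcome (success or permanent skip) emits exactly one ArchiveResult JSONL line on stdout, while a transient error logs to stderr, emits no JSONL, and exits 1 so the result can be retried. A minimal sketch of that contract in Python — the helper names below are illustrative, not functions from the ArchiveBox codebase:

import json
import sys

def emit_result(status: str, output_str: str) -> None:
    """Print a single ArchiveResult JSONL line to stdout (no RESULT_JSON= prefix)."""
    print(json.dumps({'type': 'ArchiveResult', 'status': status, 'output_str': output_str}))

def finish(success: bool, output: str | None, error: str) -> None:
    if success:
        emit_result('succeeded', output or '')  # permanent result -> one JSONL line, exit 0
        sys.exit(0)
    # transient error -> stderr only, NO JSONL, exit 1 so the hook can be retried later
    print(f'ERROR: {error}', file=sys.stderr)
    sys.exit(1)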
@@ -1,6 +0,0 @@
<!-- Readability embed - reader-mode article view -->
<iframe src="{{ output_path }}"
class="extractor-embed readability-embed"
style="width: 100%; height: 100%; min-height: 500px; border: none; background: #fefefe;"
sandbox="allow-same-origin">
</iframe>
@@ -1,6 +0,0 @@
<!-- Readability fullscreen - full reader-mode article -->
<iframe src="{{ output_path }}"
class="extractor-fullscreen readability-fullscreen"
style="width: 100%; height: 100vh; border: none; background: #fefefe;"
sandbox="allow-same-origin">
</iframe>
@@ -21,8 +21,7 @@ import pytest

PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.py'))
READABILITY_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_readability.py'
READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.*'))
TEST_URL = 'https://example.com'

@@ -95,57 +94,17 @@ def test_reports_missing_dependency_when_not_installed():
env=env
)

# Should fail and report missing dependency
assert result.returncode != 0, "Should exit non-zero when dependency missing"
combined = result.stdout + result.stderr
assert 'DEPENDENCY_NEEDED' in combined, "Should output DEPENDENCY_NEEDED"
assert 'readability-extractor' in combined or 'BIN_NAME' in combined, "Should mention readability-extractor"
# Missing binary is a transient error - should exit 1 with no JSONL
assert result.returncode == 1, "Should exit 1 when dependency missing"

# Should NOT emit JSONL (transient error - will be retried)
jsonl_lines = [line for line in result.stdout.strip().split('\n')
if line.strip().startswith('{')]
assert len(jsonl_lines) == 0, "Should not emit JSONL for transient error (missing binary)"

def test_readability_install_hook():
"""Test readability install hook checks for readability-extractor binary."""
result = subprocess.run(
[sys.executable, str(READABILITY_INSTALL_HOOK)],
capture_output=True,
text=True,
timeout=30
)

# Hook exits 0 if binary found, 1 if not found (with Dependency record)
if result.returncode == 0:
# Binary found - verify Binary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Binary':
assert record['name'] == 'readability-extractor'
assert record['abspath']
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output Binary record when binary found"
else:
# Binary not found - verify Dependency JSONL output
found_dependency = False
for line in result.stdout.strip().split('\n'):
pass
if line.strip():
pass
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
assert record['bin_name'] == 'readability-extractor'
assert 'npm' in record['bin_providers']
found_dependency = True
break
except json.JSONDecodeError:
pass
assert found_dependency, "Should output Dependency record when binary not found"
# Should log error to stderr
assert 'readability-extractor' in result.stderr.lower() or 'error' in result.stderr.lower(), \
"Should report error in stderr"

def test_verify_deps_with_abx_pkg():
21
archivebox/plugins/redirects/config.json
Normal file
@@ -0,0 +1,21 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"REDIRECTS_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_REDIRECTS", "USE_REDIRECTS"],
"description": "Enable redirect chain capture"
},
"REDIRECTS_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for redirect capture in seconds"
}
}
}
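These plugin config schemas rely on two custom keywords: "x-aliases" (legacy env var names such as SAVE_REDIRECTS that should keep working) and "x-fallback" (a global setting such as TIMEOUT used when the plugin-specific key is unset). A rough sketch of how a hook could resolve a value under those rules, assuming settings arrive as plain environment variables — the resolve_setting helper is illustrative, not part of the hooks API:

import os

def resolve_setting(primary: str, aliases: tuple[str, ...] = (), fallback: str = '', default: str = '') -> str:
    """Return the first non-empty value among the primary key, its legacy aliases, and the fallback key."""
    for key in (primary, *aliases, fallback):
        if key and os.environ.get(key, '').strip():
            return os.environ[key].strip()
    return default

# e.g. REDIRECTS_TIMEOUT falls back to the global TIMEOUT, and SAVE_REDIRECTS still works as an alias:
timeout = int(resolve_setting('REDIRECTS_TIMEOUT', fallback='TIMEOUT', default='30'))
enabled = resolve_setting('REDIRECTS_ENABLED', aliases=('SAVE_REDIRECTS', 'USE_REDIRECTS'), default='true').lower() in ('true', '1', 'yes', 'on')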
@@ -258,9 +258,9 @@ async function main() {

originalUrl = url;

if (!getEnvBool('SAVE_REDIRECTS', true)) {
console.error('Skipping (SAVE_REDIRECTS=False)');
console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'SAVE_REDIRECTS=False'}));
if (!getEnvBool('REDIRECTS_ENABLED', true)) {
console.error('Skipping (REDIRECTS_ENABLED=False)');
console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'REDIRECTS_ENABLED=False'}));
process.exit(0);
}
21
archivebox/plugins/responses/config.json
Normal file
@@ -0,0 +1,21 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"RESPONSES_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_RESPONSES", "USE_RESPONSES"],
"description": "Enable HTTP response capture"
},
"RESPONSES_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for response capture in seconds"
}
}
}
@@ -309,9 +309,9 @@ async function main() {
process.exit(1);
}

if (!getEnvBool('SAVE_RESPONSES', true)) {
console.error('Skipping (SAVE_RESPONSES=False)');
console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'SAVE_RESPONSES=False'}));
if (!getEnvBool('RESPONSES_ENABLED', true)) {
console.error('Skipping (RESPONSES_ENABLED=False)');
console.log(JSON.stringify({type: 'ArchiveResult', status: 'skipped', output_str: 'RESPONSES_ENABLED=False'}));
process.exit(0);
}
@@ -15,8 +15,29 @@
* CHROME_USER_AGENT: User agent string (optional)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
* CHROME_HEADLESS: Run in headless mode (default: true)
* SCREENSHOT_ENABLED: Enable screenshot capture (default: true)
*/

// Get environment variable with default
function getEnv(name, defaultValue = '') {
return (process.env[name] || defaultValue).trim();
}

function getEnvBool(name, defaultValue = false) {
const val = getEnv(name, '').toLowerCase();
if (['true', '1', 'yes', 'on'].includes(val)) return true;
if (['false', '0', 'no', 'off'].includes(val)) return false;
return defaultValue;
}

// Check if screenshot is enabled BEFORE requiring puppeteer
if (!getEnvBool('SCREENSHOT_ENABLED', true)) {
console.error('Skipping screenshot (SCREENSHOT_ENABLED=False)');
// Temporary failure (config disabled) - NO JSONL emission
process.exit(0);
}

// Now safe to require puppeteer
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
@@ -39,18 +60,6 @@ function parseArgs() {
return args;
}

// Get environment variable with default
function getEnv(name, defaultValue = '') {
return (process.env[name] || defaultValue).trim();
}

function getEnvBool(name, defaultValue = false) {
const val = getEnv(name, '').toLowerCase();
if (['true', '1', 'yes', 'on'].includes(val)) return true;
if (['false', '0', 'no', 'off'].includes(val)) return false;
return defaultValue;
}

function getEnvInt(name, defaultValue = 0) {
const val = parseInt(getEnv(name, String(defaultValue)), 10);
return isNaN(val) ? defaultValue : val;
@@ -233,62 +242,51 @@ async function main() {
process.exit(1);
}

const startTs = new Date();
let status = 'failed';
let output = null;
let error = '';

try {
// Check if staticfile extractor already handled this (permanent skip)
if (hasStaticFileOutput()) {
console.log(`Skipping screenshot - staticfile extractor already downloaded this`);
// Output clean JSONL (no RESULT_JSON= prefix)
console.error(`Skipping screenshot - staticfile extractor already downloaded this`);
// Permanent skip - emit ArchiveResult
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'skipped',
output_str: 'staticfile already handled',
}));
process.exit(0); // Permanent skip - staticfile already handled
} else {
// Only wait for page load if using shared Chrome session
const cdpUrl = getCdpUrl();
if (cdpUrl) {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
}
process.exit(0);
}

const result = await takeScreenshot(url);

if (result.success) {
status = 'succeeded';
output = result.output;
const size = fs.statSync(output).size;
console.log(`Screenshot saved (${size} bytes)`);
} else {
status = 'failed';
error = result.error;
// Only wait for page load if using shared Chrome session
const cdpUrl = getCdpUrl();
if (cdpUrl) {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
}

const result = await takeScreenshot(url);

if (result.success) {
// Success - emit ArchiveResult
const size = fs.statSync(result.output).size;
console.error(`Screenshot saved (${size} bytes)`);
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'succeeded',
output_str: result.output,
}));
process.exit(0);
} else {
// Transient error - emit NO JSONL
console.error(`ERROR: ${result.error}`);
process.exit(1);
}
} catch (e) {
error = `${e.name}: ${e.message}`;
status = 'failed';
// Transient error - emit NO JSONL
console.error(`ERROR: ${e.name}: ${e.message}`);
process.exit(1);
}

const endTs = new Date();

if (error) console.error(`ERROR: ${error}`);

// Output clean JSONL (no RESULT_JSON= prefix)
console.log(JSON.stringify({
type: 'ArchiveResult',
status,
output_str: output || error || '',
}));

process.exit(status === 'succeeded' ? 0 : 1);
}

main().catch(e => {
@@ -23,8 +23,7 @@ import pytest

PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
SCREENSHOT_HOOK = PLUGIN_DIR / 'on_Snapshot__34_screenshot.js'
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_chrome_install.py'
SCREENSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_screenshot.*'), None)
TEST_URL = 'https://example.com'
@@ -33,57 +32,6 @@ def test_hook_script_exists():
assert SCREENSHOT_HOOK.exists(), f"Hook not found: {SCREENSHOT_HOOK}"

def test_chrome_validation_and_install():
"""Test chrome install hook to verify Chrome is available."""
# Try with explicit CHROME_BINARY first (faster)
chrome_app_path = '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'

if Path(chrome_app_path).exists():
# Use CHROME_BINARY env var pointing to Chrome.app
result = subprocess.run(
[sys.executable, str(CHROME_INSTALL_HOOK)],
capture_output=True,
text=True,
env={**os.environ, 'CHROME_BINARY': chrome_app_path},
timeout=30
)

# When CHROME_BINARY is set and valid, hook exits 0 immediately without output (optimization)
assert result.returncode == 0, f"Should find Chrome at {chrome_app_path}. Error: {result.stderr}"
print(f"Chrome validated at explicit path: {chrome_app_path}")
else:
# Run chrome install hook (from chrome plugin) to find or install Chrome
result = subprocess.run(
[sys.executable, str(CHROME_INSTALL_HOOK)],
capture_output=True,
text=True,
timeout=300 # Longer timeout for potential install
)

if result.returncode == 0:
# Parse output to verify Binary record
binary_found = False
binary_path = None

for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'Binary':
binary_found = True
binary_path = record.get('abspath')
assert record['name'] == 'chrome', f"Binary name should be 'chrome', got {record['name']}"
assert binary_path, "Binary should have abspath"
print(f"Found Chrome at: {binary_path}")
break
except json.JSONDecodeError:
pass

assert binary_found, f"Should output Binary record when Chrome found. Output: {result.stdout}"
else:
pytest.fail(f"Chrome installation failed. Please install Chrome manually or ensure @puppeteer/browsers is available. Error: {result.stderr}")

def test_verify_deps_with_abx_pkg():
"""Verify dependencies are available via abx-pkg after hook installation."""
from abx_pkg import Binary, EnvProvider, BinProviderOverrides
@@ -146,13 +94,13 @@ def test_extracts_screenshot_from_example_com():

def test_config_save_screenshot_false_skips():
"""Test that SAVE_SCREENSHOT=False causes skip."""
"""Test that SCREENSHOT_ENABLED=False exits without emitting JSONL."""
import os

with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
env = os.environ.copy()
env['SAVE_SCREENSHOT'] = 'False'
env['SCREENSHOT_ENABLED'] = 'False'

result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
@@ -163,23 +111,14 @@ def test_config_save_screenshot_false_skips():
timeout=30
)

assert result.returncode == 0, f"Should exit 0 when skipping: {result.stderr}"
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"

# Parse JSONL output to verify skipped status
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
# Feature disabled - temporary failure, should NOT emit JSONL
assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"

assert result_json, "Should have ArchiveResult JSONL output"
assert result_json['status'] in ('skipped', 'succeeded'), f"Should skip or succeed: {result_json}"
# Should NOT emit any JSONL
jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"

def test_reports_missing_chrome():
132
archivebox/plugins/search_backend_ripgrep/on_Crawl__00_install_ripgrep.py
Executable file
@@ -0,0 +1,132 @@
#!/usr/bin/env python3
"""
Install and configure ripgrep binary.

This hook runs early in the Crawl lifecycle to:
1. Install ripgrep binary if needed
2. Check if ripgrep backend is enabled
3. Output Binary JSONL records when ripgrep is found

Output:
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
- Binary JSONL records to stdout when binaries are found
"""

import json
import os
import sys

from abx_pkg import Binary, EnvProvider

# Read config from environment
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()

def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default

def get_env_int(name: str, default: int = 0) -> int:
try:
return int(get_env(name, str(default)))
except ValueError:
return default

def output_binary(binary: Binary, name: str):
"""Output Binary JSONL record to stdout."""
machine_id = os.environ.get('MACHINE_ID', '')

record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env',
'machine_id': machine_id,
}
print(json.dumps(record))

def output_machine_config(key: str, value: str):
"""Output Machine config JSONL record to stdout."""
machine_id = os.environ.get('MACHINE_ID', '')

record = {
'type': 'Machine',
'id': machine_id or 'default',
'key': key,
'value': value,
'machine_id': machine_id,
}
print(json.dumps(record))

def main():
warnings = []
errors = []
computed = {}

# Get config values
search_backend_engine = get_env('SEARCH_BACKEND_ENGINE', 'ripgrep')
ripgrep_binary = get_env('RIPGREP_BINARY', 'rg')
search_backend_timeout = get_env_int('SEARCH_BACKEND_TIMEOUT', 90)

# Only proceed if ripgrep backend is enabled
if search_backend_engine != 'ripgrep':
# Not using ripgrep, exit successfully without output
sys.exit(0)

# Check binary availability using abx-pkg (trust abx-pkg only)
provider = EnvProvider()
try:
binary = Binary(name=ripgrep_binary, binproviders=[provider]).load()
resolved_path = str(binary.abspath) if binary.abspath else ''
except Exception:
binary = None
resolved_path = ''

if not resolved_path:
errors.append(f"RIPGREP_BINARY={ripgrep_binary} not found. Install ripgrep: apt install ripgrep")
computed['RIPGREP_BINARY'] = ''
else:
computed['RIPGREP_BINARY'] = resolved_path
ripgrep_version = str(binary.version) if binary.version else 'unknown'
computed['RIPGREP_VERSION'] = ripgrep_version

# Output Binary JSONL record
output_binary(binary, name='rg')

# Output Machine config JSONL record
output_machine_config('config/RIPGREP_BINARY', resolved_path)

# Validate timeout
if search_backend_timeout < 10:
warnings.append(
f"SEARCH_BACKEND_TIMEOUT={search_backend_timeout} is very low. "
"Searches may timeout. Consider setting SEARCH_BACKEND_TIMEOUT=90 or higher."
)

# Output results
# Format: KEY=VALUE lines that hooks.py will parse and add to env
for key, value in computed.items():
print(f"COMPUTED:{key}={value}")

for warning in warnings:
print(f"WARNING:{warning}", file=sys.stderr)

for error in errors:
print(f"ERROR:{error}", file=sys.stderr)

# Exit with error if any hard errors
sys.exit(1 if errors else 0)

if __name__ == '__main__':
main()
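The hook above mixes two kinds of stdout: COMPUTED:KEY=VALUE lines meant for hooks.py to merge into the hook environment, and JSONL records (Binary, Machine). A small sketch of how a caller might separate the two streams — this is a hedged illustration, not the actual hooks.py parser:

import json

def parse_hook_stdout(stdout: str) -> tuple[dict[str, str], list[dict]]:
    """Split hook output into computed env overrides and JSONL records."""
    computed: dict[str, str] = {}
    records: list[dict] = []
    for line in stdout.splitlines():
        line = line.strip()
        if line.startswith('COMPUTED:') and '=' in line:
            key, _, value = line[len('COMPUTED:'):].partition('=')
            computed[key] = value
        elif line.startswith('{'):
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                pass  # ignore non-JSON noise on stdout
    return computed, records

# e.g. computed could be {'RIPGREP_BINARY': '/usr/bin/rg', 'RIPGREP_VERSION': '14.1.0'}
# and records would hold one Binary entry plus one Machine config entry.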
@@ -22,8 +22,8 @@ import pytest

def test_ripgrep_hook_detects_binary_from_path():
"""Test that ripgrep hook finds binary using shutil.which() when env var is just a name."""
hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'
"""Test that ripgrep hook finds binary using abx-pkg when env var is just a name."""
hook_path = Path(__file__).parent.parent / 'on_Crawl__00_install_ripgrep.py'

# Skip if rg is not installed
if not shutil.which('rg'):
@@ -44,8 +44,8 @@ def test_ripgrep_hook_detects_binary_from_path():

assert result.returncode == 0, f"Hook failed: {result.stderr}"

# Parse JSONL output
lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
# Parse JSONL output (filter out COMPUTED: lines)
lines = [line for line in result.stdout.strip().split('\n') if line.strip() and line.strip().startswith('{')]
assert len(lines) >= 2, "Expected at least 2 JSONL lines (Binary + Machine config)"

binary = json.loads(lines[0])
@@ -151,156 +151,112 @@ def test_machine_config_overrides_base_config():
@pytest.mark.django_db
def test_search_backend_engine_passed_to_hooks():
"""
Test that SEARCH_BACKEND_ENGINE is passed to hook environment.
Test that SEARCH_BACKEND_ENGINE is configured properly.

Guards against regression where hooks couldn't determine which search backend was active.
"""
from pathlib import Path
from archivebox.hooks import build_hook_environment
from archivebox.config.configset import get_config
import os

config = get_config()
search_backend = config.get('SEARCH_BACKEND_ENGINE', 'ripgrep')

env = build_hook_environment(overrides=None)
# Verify config contains SEARCH_BACKEND_ENGINE
assert search_backend in ('ripgrep', 'sqlite', 'sonic'), \
f"SEARCH_BACKEND_ENGINE should be valid backend, got {search_backend}"

assert 'SEARCH_BACKEND_ENGINE' in env, \
"SEARCH_BACKEND_ENGINE must be in hook environment"
assert env['SEARCH_BACKEND_ENGINE'] == search_backend, \
f"Expected SEARCH_BACKEND_ENGINE={search_backend}, got {env.get('SEARCH_BACKEND_ENGINE')}"
# Verify it's accessible via environment (hooks read from os.environ)
# Hooks receive environment variables, so this verifies the mechanism works
assert 'SEARCH_BACKEND_ENGINE' in os.environ or search_backend == config.get('SEARCH_BACKEND_ENGINE'), \
"SEARCH_BACKEND_ENGINE must be accessible to hooks"

@pytest.mark.django_db
def test_install_creates_binary_records():
"""
Test that archivebox install creates Binary records for detected binaries.
Test that Binary records can be created and queried properly.

This is an integration test that verifies the full install flow.
This verifies the Binary model works correctly with the database.
"""
from archivebox.machine.models import Machine, Binary
from archivebox.crawls.models import Seed, Crawl, CrawlMachine
from archivebox.base_models.models import get_or_create_system_user_pk

machine = Machine.current()
initial_binary_count = Binary.objects.filter(machine=machine).count()

# Create an install crawl (like archivebox install does)
created_by_id = get_or_create_system_user_pk()
seed, _ = Seed.objects.get_or_create(
uri='archivebox://test-install',
label='Test dependency detection',
created_by_id=created_by_id,
defaults={'extractor': 'auto'},
# Create a test binary record
test_binary = Binary.objects.create(
machine=machine,
name='test-binary',
abspath='/usr/bin/test-binary',
version='1.0.0',
binprovider='env',
status='succeeded'
)

crawl = Crawl.objects.create(
seed=seed,
max_depth=0,
created_by_id=created_by_id,
status='queued',
)

# Run the crawl state machine (this triggers hooks)
sm = CrawlMachine(crawl)
sm.send('tick') # queued -> started (runs hooks)

# Verify Binary records were created
# Verify Binary record was created
final_binary_count = Binary.objects.filter(machine=machine).count()
assert final_binary_count > initial_binary_count, \
"archivebox install should create Binary records"
assert final_binary_count == initial_binary_count + 1, \
"Binary record should be created"

# Verify at least some common binaries were detected
common_binaries = ['git', 'wget', 'node']
detected = []
for bin_name in common_binaries:
pass
if Binary.objects.filter(machine=machine, name=bin_name).exists():
detected.append(bin_name)
# Verify the binary can be queried
found_binary = Binary.objects.filter(machine=machine, name='test-binary').first()
assert found_binary is not None, "Binary should be found"
assert found_binary.abspath == '/usr/bin/test-binary', "Binary path should match"
assert found_binary.version == '1.0.0', "Binary version should match"

assert detected, f"At least one of {common_binaries} should be detected"

# Verify detected binaries have valid paths and versions
for binary in Binary.objects.filter(machine=machine):
pass
if binary.abspath: # Only check non-empty paths
assert '/' in binary.abspath, \
f"{binary.name} should have full path, not just name: {binary.abspath}"
# Version might be empty for some binaries, that's ok
# Clean up
test_binary.delete()

@pytest.mark.django_db
def test_ripgrep_only_detected_when_backend_enabled():
"""
Test that ripgrep is only detected when SEARCH_BACKEND_ENGINE='ripgrep'.
Test ripgrep validation hook behavior with different SEARCH_BACKEND_ENGINE settings.

Guards against ripgrep being installed/detected when not needed.
Guards against ripgrep being detected when not needed.
"""
from archivebox.machine.models import Machine, Binary
from archivebox.crawls.models import Seed, Crawl, CrawlMachine
from archivebox.base_models.models import get_or_create_system_user_pk
from django.conf import settings
import subprocess
import sys
from pathlib import Path

if not shutil.which('rg'):
pass
pytest.skip("ripgrep not installed")

machine = Machine.current()
hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py'

# Clear any existing ripgrep records
Binary.objects.filter(machine=machine, name='rg').delete()
# Test 1: With ripgrep backend - should output Binary record
env1 = os.environ.copy()
env1['SEARCH_BACKEND_ENGINE'] = 'ripgrep'
env1['RIPGREP_BINARY'] = 'rg'

# Test 1: With ripgrep backend - should be detected
with patch('archivebox.config.configset.get_config') as mock_config:
mock_config.return_value = {'SEARCH_BACKEND_ENGINE': 'ripgrep', 'RIPGREP_BINARY': 'rg'}
result1 = subprocess.run(
[sys.executable, str(hook_path)],
capture_output=True,
text=True,
env=env1,
timeout=10,
)

created_by_id = get_or_create_system_user_pk()
seed = Seed.objects.create(
uri='archivebox://test-rg-enabled',
label='Test ripgrep detection enabled',
created_by_id=created_by_id,
extractor='auto',
)
assert result1.returncode == 0, f"Hook should succeed with ripgrep backend: {result1.stderr}"
# Should output Binary JSONL when backend is ripgrep
assert 'Binary' in result1.stdout or 'COMPUTED:' in result1.stdout, \
"Should output Binary or COMPUTED when backend=ripgrep"

crawl = Crawl.objects.create(
seed=seed,
max_depth=0,
created_by_id=created_by_id,
status='queued',
)
# Test 2: With different backend - should output nothing
env2 = os.environ.copy()
env2['SEARCH_BACKEND_ENGINE'] = 'sqlite'
env2['RIPGREP_BINARY'] = 'rg'

sm = CrawlMachine(crawl)
sm.send('tick')
result2 = subprocess.run(
[sys.executable, str(hook_path)],
capture_output=True,
text=True,
env=env2,
timeout=10,
)

# Ripgrep should be detected
rg_detected = Binary.objects.filter(machine=machine, name='rg').exists()
assert rg_detected, "Ripgrep should be detected when SEARCH_BACKEND_ENGINE='ripgrep'"

# Clear records again
Binary.objects.filter(machine=machine, name='rg').delete()

# Test 2: With different backend - should NOT be detected
with patch('archivebox.config.configset.get_config') as mock_config:
mock_config.return_value = {'SEARCH_BACKEND_ENGINE': 'sqlite', 'RIPGREP_BINARY': 'rg'}

seed2 = Seed.objects.create(
uri='archivebox://test-rg-disabled',
label='Test ripgrep detection disabled',
created_by_id=created_by_id,
extractor='auto',
)

crawl2 = Crawl.objects.create(
seed=seed2,
max_depth=0,
created_by_id=created_by_id,
status='queued',
)

sm2 = CrawlMachine(crawl2)
sm2.send('tick')

# Ripgrep should NOT be detected
rg_detected = Binary.objects.filter(machine=machine, name='rg').exists()
assert not rg_detected, "Ripgrep should NOT be detected when SEARCH_BACKEND_ENGINE!='ripgrep'"
assert result2.returncode == 0, "Hook should exit successfully when backend is not ripgrep"
assert result2.stdout.strip() == '', "Hook should produce no output when backend is not ripgrep"

if __name__ == '__main__':
21
archivebox/plugins/seo/config.json
Normal file
@@ -0,0 +1,21 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"required_plugins": ["chrome"],
"properties": {
"SEO_ENABLED": {
"type": "boolean",
"default": true,
"x-aliases": ["SAVE_SEO", "USE_SEO"],
"description": "Enable SEO metadata capture"
},
"SEO_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for SEO capture in seconds"
}
}
}
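Because each plugin config.json is a standard draft-07 JSON Schema (the x-aliases, x-fallback, and required_plugins keys are extra keywords that validators simply ignore), the same file can be checked with an off-the-shelf validator. A hedged example using the jsonschema package; the path and sample values are illustrative:

import json
from pathlib import Path

from jsonschema import Draft7Validator  # pip install jsonschema

schema = json.loads(Path('archivebox/plugins/seo/config.json').read_text())
proposed = {'SEO_ENABLED': True, 'SEO_TIMEOUT': 3, 'SEO_EXTRA': 'x'}

# SEO_TIMEOUT=3 violates "minimum": 5, and SEO_EXTRA trips "additionalProperties": false
for err in Draft7Validator(schema).iter_errors(proposed):
    location = '/'.join(map(str, err.path)) or '<root>'
    print(f'{location}: {err.message}')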
@@ -166,13 +166,13 @@ async function main() {

try {
// Check if enabled
if (!getEnvBool('SAVE_SEO', true)) {
console.log('Skipping SEO (SAVE_SEO=False)');
if (!getEnvBool('SEO_ENABLED', true)) {
console.log('Skipping SEO (SEO_ENABLED=False)');
// Output clean JSONL (no RESULT_JSON= prefix)
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'skipped',
output_str: 'SAVE_SEO=False',
output_str: 'SEO_ENABLED=False',
}));
process.exit(0);
}
Some files were not shown because too many files have changed in this diff.