mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 07:17:52 +10:00
remove huey
This commit is contained in:
1
archivebox/plugins/archive_org/templates/icon.html
Normal file
1
archivebox/plugins/archive_org/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
🏛️
|
||||
@@ -7,7 +7,7 @@ new plugin-based output structure to the legacy canonical output paths that
|
||||
ArchiveBox has historically used. This maintains backward compatibility with
|
||||
existing tools and scripts that expect outputs at specific locations.
|
||||
|
||||
Canonical output paths (from Snapshot.canonical_outputs()):
|
||||
Canonical output paths:
|
||||
- favicon.ico → favicon/favicon.ico
|
||||
- singlefile.html → singlefile/singlefile.html
|
||||
- readability/content.html → readability/content.html
|
||||
@@ -27,27 +27,20 @@ New plugin outputs:
|
||||
- redirects.json → redirects/redirects.json
|
||||
- console.jsonl → consolelog/console.jsonl
|
||||
|
||||
Usage: on_Snapshot__91_canonical_outputs.py --url=<url> --snapshot-id=<uuid>
|
||||
Usage: on_Snapshot__92_canonical_outputs.py --url=<url> --snapshot-id=<uuid>
|
||||
|
||||
Environment variables:
|
||||
SAVE_CANONICAL_SYMLINKS: Enable canonical symlinks (default: true)
|
||||
DATA_DIR: ArchiveBox data directory
|
||||
ARCHIVE_DIR: Archive output directory
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.plugins.canonical_outputs'
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional
|
||||
|
||||
# Configure Django if running standalone
|
||||
if __name__ == '__main__':
|
||||
parent_dir = str(Path(__file__).resolve().parent.parent.parent)
|
||||
if parent_dir not in sys.path:
|
||||
sys.path.insert(0, parent_dir)
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
|
||||
import django
|
||||
django.setup()
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict
|
||||
|
||||
import rich_click as click
|
||||
|
||||
@@ -150,10 +143,7 @@ def create_canonical_symlinks(snapshot_dir: Path) -> Dict[str, bool]:
|
||||
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
|
||||
def main(url: str, snapshot_id: str):
|
||||
"""Create symlinks from plugin outputs to canonical legacy locations."""
|
||||
from datetime import datetime
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
start_ts = datetime.now()
|
||||
start_ts = datetime.now(timezone.utc)
|
||||
status = 'failed'
|
||||
output = None
|
||||
error = ''
|
||||
@@ -161,31 +151,20 @@ def main(url: str, snapshot_id: str):
|
||||
|
||||
try:
|
||||
# Check if enabled
|
||||
from archivebox.config import CONSTANTS
|
||||
save_canonical = os.getenv('SAVE_CANONICAL_SYMLINKS', 'true').lower() in ('true', '1', 'yes', 'on')
|
||||
|
||||
if not save_canonical:
|
||||
click.echo('Skipping canonical symlinks (SAVE_CANONICAL_SYMLINKS=False)')
|
||||
status = 'skipped'
|
||||
end_ts = datetime.now()
|
||||
click.echo(f'START_TS={start_ts.isoformat()}')
|
||||
click.echo(f'END_TS={end_ts.isoformat()}')
|
||||
click.echo(f'STATUS={status}')
|
||||
click.echo(f'RESULT_JSON={{"extractor": "canonical_outputs", "status": "{status}", "url": "{url}", "snapshot_id": "{snapshot_id}"}}')
|
||||
click.echo(json.dumps({'status': status, 'output': 'SAVE_CANONICAL_SYMLINKS=false'}))
|
||||
sys.exit(0)
|
||||
|
||||
# Get snapshot
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
except Snapshot.DoesNotExist:
|
||||
error = f'Snapshot {snapshot_id} not found'
|
||||
raise ValueError(error)
|
||||
# Working directory is the extractor output dir (e.g., <snapshot>/canonical_outputs/)
|
||||
# Parent is the snapshot directory
|
||||
output_dir = Path.cwd()
|
||||
snapshot_dir = output_dir.parent
|
||||
|
||||
# Get snapshot directory
|
||||
snapshot_dir = Path(snapshot.output_dir)
|
||||
if not snapshot_dir.exists():
|
||||
error = f'Snapshot directory not found: {snapshot_dir}'
|
||||
raise FileNotFoundError(error)
|
||||
raise FileNotFoundError(f'Snapshot directory not found: {snapshot_dir}')
|
||||
|
||||
# Create canonical symlinks
|
||||
results = create_canonical_symlinks(snapshot_dir)
|
||||
@@ -203,37 +182,18 @@ def main(url: str, snapshot_id: str):
|
||||
status = 'failed'
|
||||
click.echo(f'Error: {error}', err=True)
|
||||
|
||||
end_ts = datetime.now()
|
||||
duration = (end_ts - start_ts).total_seconds()
|
||||
end_ts = datetime.now(timezone.utc)
|
||||
|
||||
# Print results
|
||||
click.echo(f'START_TS={start_ts.isoformat()}')
|
||||
click.echo(f'END_TS={end_ts.isoformat()}')
|
||||
click.echo(f'DURATION={duration:.2f}')
|
||||
if output:
|
||||
click.echo(f'OUTPUT={output}')
|
||||
click.echo(f'STATUS={status}')
|
||||
|
||||
if error:
|
||||
click.echo(f'ERROR={error}', err=True)
|
||||
|
||||
# Print JSON result
|
||||
import json
|
||||
result_json = {
|
||||
'extractor': 'canonical_outputs',
|
||||
'url': url,
|
||||
'snapshot_id': snapshot_id,
|
||||
# Print JSON result for hook runner
|
||||
result = {
|
||||
'status': status,
|
||||
'start_ts': start_ts.isoformat(),
|
||||
'end_ts': end_ts.isoformat(),
|
||||
'duration': round(duration, 2),
|
||||
'output': output,
|
||||
'symlinks_created': symlinks_created,
|
||||
'error': error or None,
|
||||
'symlinks_created': symlinks_created,
|
||||
}
|
||||
click.echo(f'RESULT_JSON={json.dumps(result_json)}')
|
||||
click.echo(json.dumps(result))
|
||||
|
||||
sys.exit(0 if status == 'succeeded' else 1)
|
||||
sys.exit(0 if status in ('succeeded', 'skipped') else 1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -1,149 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install Chrome/Chromium if not already available.
|
||||
|
||||
Runs at crawl start to ensure Chrome is installed.
|
||||
Uses playwright to install chromium if no system Chrome found.
|
||||
Outputs JSONL for InstalledBinary.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def find_chrome():
|
||||
"""Try to find system Chrome/Chromium."""
|
||||
# Comprehensive list of Chrome/Chromium binary names and paths
|
||||
chromium_names_linux = [
|
||||
'chromium',
|
||||
'chromium-browser',
|
||||
'chromium-browser-beta',
|
||||
'chromium-browser-unstable',
|
||||
'chromium-browser-canary',
|
||||
'chromium-browser-dev',
|
||||
]
|
||||
|
||||
chrome_names_linux = [
|
||||
'google-chrome',
|
||||
'google-chrome-stable',
|
||||
'google-chrome-beta',
|
||||
'google-chrome-canary',
|
||||
'google-chrome-unstable',
|
||||
'google-chrome-dev',
|
||||
'chrome',
|
||||
]
|
||||
|
||||
chrome_paths_macos = [
|
||||
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
|
||||
'/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
|
||||
'/Applications/Chromium.app/Contents/MacOS/Chromium',
|
||||
]
|
||||
|
||||
chrome_paths_linux = [
|
||||
'/usr/bin/google-chrome',
|
||||
'/usr/bin/google-chrome-stable',
|
||||
'/usr/bin/chromium',
|
||||
'/usr/bin/chromium-browser',
|
||||
'/snap/bin/chromium',
|
||||
'/opt/google/chrome/chrome',
|
||||
]
|
||||
|
||||
all_chrome_names = chrome_names_linux + chromium_names_linux
|
||||
all_chrome_paths = chrome_paths_macos + chrome_paths_linux
|
||||
|
||||
# Check env var first
|
||||
env_path = os.environ.get('CHROME_BINARY', '')
|
||||
if env_path and Path(env_path).is_file():
|
||||
return env_path
|
||||
|
||||
# Try shutil.which for various names
|
||||
for name in all_chrome_names:
|
||||
abspath = shutil.which(name)
|
||||
if abspath:
|
||||
return abspath
|
||||
|
||||
# Check common paths
|
||||
for path in all_chrome_paths:
|
||||
if Path(path).is_file():
|
||||
return path
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def main():
|
||||
try:
|
||||
# First try to find system Chrome
|
||||
system_chrome = find_chrome()
|
||||
if system_chrome:
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': 'chrome',
|
||||
'abspath': str(system_chrome),
|
||||
'version': None,
|
||||
'sha256': None,
|
||||
'binprovider': 'env',
|
||||
}))
|
||||
sys.exit(0)
|
||||
|
||||
# If not found in system, try to install chromium via apt/brew
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
AptProvider.model_rebuild()
|
||||
BrewProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
# Try chromium-browser or chromium via system package managers
|
||||
for binary_name in ['chromium', 'chromium-browser', 'google-chrome']:
|
||||
try:
|
||||
chrome_binary = Binary(
|
||||
name=binary_name,
|
||||
binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
|
||||
)
|
||||
|
||||
# Try to load, install if not found
|
||||
try:
|
||||
loaded = chrome_binary.load()
|
||||
if not loaded or not loaded.abspath:
|
||||
raise Exception("Not loaded")
|
||||
except Exception:
|
||||
# Install via system package manager
|
||||
loaded = chrome_binary.install()
|
||||
|
||||
if loaded and loaded.abspath:
|
||||
# Output InstalledBinary JSONL
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': 'chrome',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256,
|
||||
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
|
||||
}))
|
||||
sys.exit(0)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# If all attempts failed
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'chrome',
|
||||
'bin_providers': 'apt,brew,env',
|
||||
}))
|
||||
print("Failed to install Chrome/Chromium", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
except Exception as e:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'chrome',
|
||||
'bin_providers': 'apt,brew,env',
|
||||
}))
|
||||
print(f"Error installing Chrome: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -2,7 +2,7 @@
|
||||
Integration tests for chrome_session plugin
|
||||
|
||||
Tests verify:
|
||||
1. Install hook finds system Chrome or installs chromium
|
||||
1. Validate hook checks for Chrome/Chromium binary
|
||||
2. Verify deps with abx-pkg
|
||||
3. Chrome session script exists
|
||||
"""
|
||||
@@ -14,7 +14,7 @@ from pathlib import Path
|
||||
import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
CHROME_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_chrome.py'
|
||||
CHROME_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_chrome.py'
|
||||
CHROME_SESSION_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_session.js'
|
||||
|
||||
|
||||
@@ -23,37 +23,50 @@ def test_hook_script_exists():
|
||||
assert CHROME_SESSION_HOOK.exists(), f"Hook not found: {CHROME_SESSION_HOOK}"
|
||||
|
||||
|
||||
def test_chrome_install_hook():
|
||||
"""Test chrome install hook to find or install Chrome/Chromium."""
|
||||
def test_chrome_validate_hook():
|
||||
"""Test chrome validate hook checks for Chrome/Chromium binary."""
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(CHROME_INSTALL_HOOK)],
|
||||
[sys.executable, str(CHROME_VALIDATE_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=600
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
||||
|
||||
# Verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'chrome'
|
||||
assert record['abspath']
|
||||
assert Path(record['abspath']).exists(), f"Chrome binary should exist at {record['abspath']}"
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert found_binary, "Should output InstalledBinary record"
|
||||
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
|
||||
if result.returncode == 0:
|
||||
# Binary found - verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'chrome'
|
||||
assert record['abspath']
|
||||
assert Path(record['abspath']).exists(), f"Chrome binary should exist at {record['abspath']}"
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_binary, "Should output InstalledBinary record when binary found"
|
||||
else:
|
||||
# Binary not found - verify Dependency JSONL output
|
||||
found_dependency = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Dependency':
|
||||
assert record['bin_name'] == 'chrome'
|
||||
found_dependency = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_dependency, "Should output Dependency record when binary not found"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify chrome is available via abx-pkg after hook installation."""
|
||||
"""Verify chrome is available via abx-pkg."""
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
AptProvider.model_rebuild()
|
||||
@@ -75,10 +88,10 @@ def test_verify_deps_with_abx_pkg():
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
# If we get here, chrome should still be available from system
|
||||
# If we get here, chrome not available
|
||||
import shutil
|
||||
assert shutil.which('chromium') or shutil.which('chrome') or shutil.which('google-chrome'), \
|
||||
"Chrome should be available after install hook"
|
||||
if not (shutil.which('chromium') or shutil.which('chrome') or shutil.which('google-chrome')):
|
||||
pytest.skip("Chrome/Chromium not available - Dependency record should have been emitted")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
6
archivebox/plugins/dom/templates/embed.html
Normal file
6
archivebox/plugins/dom/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- DOM embed - full iframe of captured DOM HTML -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-embed dom-embed"
|
||||
style="width: 100%; height: 100%; min-height: 500px; border: none;"
|
||||
sandbox="allow-same-origin allow-scripts allow-forms">
|
||||
</iframe>
|
||||
6
archivebox/plugins/dom/templates/fullscreen.html
Normal file
6
archivebox/plugins/dom/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- DOM fullscreen - full page iframe -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-fullscreen dom-fullscreen"
|
||||
style="width: 100%; height: 100vh; border: none;"
|
||||
sandbox="allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation">
|
||||
</iframe>
|
||||
1
archivebox/plugins/dom/templates/icon.html
Normal file
1
archivebox/plugins/dom/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
🌐
|
||||
8
archivebox/plugins/dom/templates/thumbnail.html
Normal file
8
archivebox/plugins/dom/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
||||
<!-- DOM thumbnail - scaled down iframe preview of captured DOM HTML -->
|
||||
<div class="extractor-thumbnail dom-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fff;">
|
||||
<iframe src="{{ output_path }}"
|
||||
style="width: 400%; height: 400px; transform: scale(0.25); transform-origin: top left; pointer-events: none; border: none;"
|
||||
loading="lazy"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
</div>
|
||||
1
archivebox/plugins/favicon/templates/icon.html
Normal file
1
archivebox/plugins/favicon/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
⭐
|
||||
@@ -1,68 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install git if not already available.
|
||||
|
||||
Runs at crawl start to ensure git is installed.
|
||||
Outputs JSONL for InstalledBinary.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main():
|
||||
try:
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
AptProvider.model_rebuild()
|
||||
BrewProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
# git binary and package have same name
|
||||
git_binary = Binary(
|
||||
name='git',
|
||||
binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
|
||||
)
|
||||
|
||||
# Try to load, install if not found
|
||||
try:
|
||||
loaded = git_binary.load()
|
||||
if not loaded or not loaded.abspath:
|
||||
raise Exception("Not loaded")
|
||||
except Exception:
|
||||
# Install via system package manager
|
||||
loaded = git_binary.install()
|
||||
|
||||
if loaded and loaded.abspath:
|
||||
# Output InstalledBinary JSONL
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': 'git',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256,
|
||||
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
|
||||
}))
|
||||
sys.exit(0)
|
||||
else:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'git',
|
||||
'bin_providers': 'apt,brew,env',
|
||||
}))
|
||||
print("Failed to install git", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
except Exception as e:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'git',
|
||||
'bin_providers': 'apt,brew,env',
|
||||
}))
|
||||
print(f"Error installing git: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
6
archivebox/plugins/git/templates/embed.html
Normal file
6
archivebox/plugins/git/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- Git embed - directory listing of cloned repo -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-embed git-embed"
|
||||
style="width: 100%; height: 100%; min-height: 400px; border: none; background: #fff;"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
6
archivebox/plugins/git/templates/fullscreen.html
Normal file
6
archivebox/plugins/git/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- Git fullscreen - full directory listing -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-fullscreen git-fullscreen"
|
||||
style="width: 100%; height: 100vh; border: none; background: #fff;"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
1
archivebox/plugins/git/templates/icon.html
Normal file
1
archivebox/plugins/git/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📂
|
||||
5
archivebox/plugins/git/templates/thumbnail.html
Normal file
5
archivebox/plugins/git/templates/thumbnail.html
Normal file
@@ -0,0 +1,5 @@
|
||||
<!-- Git thumbnail - shows git repository icon and info -->
|
||||
<div class="extractor-thumbnail git-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #f6f8fa; display: flex; flex-direction: column; align-items: center; justify-content: center; padding: 10px;">
|
||||
<span style="font-size: 32px;">📂</span>
|
||||
<span style="font-size: 11px; color: #586069; margin-top: 4px;">Git Repository</span>
|
||||
</div>
|
||||
@@ -2,7 +2,7 @@
|
||||
Integration tests for git plugin
|
||||
|
||||
Tests verify:
|
||||
1. Install hook installs git via abx-pkg
|
||||
1. Validate hook checks for git binary
|
||||
2. Verify deps with abx-pkg
|
||||
3. Standalone git extractor execution
|
||||
"""
|
||||
@@ -17,50 +17,64 @@ import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
GIT_HOOK = PLUGIN_DIR / 'on_Snapshot__12_git.py'
|
||||
GIT_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_git.py'
|
||||
GIT_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_git.py'
|
||||
TEST_URL = 'https://github.com/example/repo.git'
|
||||
|
||||
def test_hook_script_exists():
|
||||
assert GIT_HOOK.exists()
|
||||
|
||||
def test_git_install_hook():
|
||||
"""Test git install hook to install git if needed."""
|
||||
def test_git_validate_hook():
|
||||
"""Test git validate hook checks for git binary."""
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(GIT_INSTALL_HOOK)],
|
||||
[sys.executable, str(GIT_VALIDATE_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=600
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
||||
|
||||
# Verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'git'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert found_binary, "Should output InstalledBinary record"
|
||||
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
|
||||
if result.returncode == 0:
|
||||
# Binary found - verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'git'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_binary, "Should output InstalledBinary record when binary found"
|
||||
else:
|
||||
# Binary not found - verify Dependency JSONL output
|
||||
found_dependency = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Dependency':
|
||||
assert record['bin_name'] == 'git'
|
||||
assert 'env' in record['bin_providers']
|
||||
found_dependency = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_dependency, "Should output Dependency record when binary not found"
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify git is available via abx-pkg after hook installation."""
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
||||
|
||||
AptProvider.model_rebuild()
|
||||
BrewProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
"""Verify git is available via abx-pkg."""
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
git_binary = Binary(name='git', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
||||
git_loaded = git_binary.load()
|
||||
assert git_loaded and git_loaded.abspath, "git should be available after install hook"
|
||||
|
||||
if git_loaded and git_loaded.abspath:
|
||||
assert True, "git is available"
|
||||
else:
|
||||
pytest.skip("git not available - Dependency record should have been emitted")
|
||||
|
||||
def test_reports_missing_git():
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
|
||||
1
archivebox/plugins/headers/templates/icon.html
Normal file
1
archivebox/plugins/headers/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📋
|
||||
1
archivebox/plugins/htmltotext/templates/icon.html
Normal file
1
archivebox/plugins/htmltotext/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📃
|
||||
@@ -1,67 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install yt-dlp if not already available.
|
||||
|
||||
Runs at crawl start to ensure yt-dlp is installed.
|
||||
Outputs JSONL for InstalledBinary.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main():
|
||||
try:
|
||||
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
PipProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
# yt-dlp binary and package have same name
|
||||
ytdlp_binary = Binary(
|
||||
name='yt-dlp',
|
||||
binproviders=[PipProvider(), EnvProvider()]
|
||||
)
|
||||
|
||||
# Try to load, install if not found
|
||||
try:
|
||||
loaded = ytdlp_binary.load()
|
||||
if not loaded or not loaded.abspath:
|
||||
raise Exception("Not loaded")
|
||||
except Exception:
|
||||
# Install via pip
|
||||
loaded = ytdlp_binary.install()
|
||||
|
||||
if loaded and loaded.abspath:
|
||||
# Output InstalledBinary JSONL
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': 'yt-dlp',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256,
|
||||
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
|
||||
}))
|
||||
sys.exit(0)
|
||||
else:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'yt-dlp',
|
||||
'bin_providers': 'pip,brew,env',
|
||||
}))
|
||||
print("Failed to install yt-dlp", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
except Exception as e:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'yt-dlp',
|
||||
'bin_providers': 'pip,brew,env',
|
||||
}))
|
||||
print(f"Error installing yt-dlp: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
278
archivebox/plugins/media/on_Crawl__00_validate_ytdlp.py
Executable file
278
archivebox/plugins/media/on_Crawl__00_validate_ytdlp.py
Executable file
@@ -0,0 +1,278 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validation hook for yt-dlp and its dependencies (node, ffmpeg).
|
||||
|
||||
Runs at crawl start to verify yt-dlp and required binaries are available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str, version_flag: str = '--version') -> str | None:
|
||||
"""Get version string from binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, version_flag],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
return first_line[:64]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_ytdlp() -> dict | None:
|
||||
"""Find yt-dlp binary."""
|
||||
try:
|
||||
from abx_pkg import Binary, PipProvider, EnvProvider
|
||||
|
||||
class YtdlpBinary(Binary):
|
||||
name: str = 'yt-dlp'
|
||||
binproviders_supported = [PipProvider(), EnvProvider()]
|
||||
|
||||
binary = YtdlpBinary()
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
'name': 'yt-dlp',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to shutil.which
|
||||
abspath = shutil.which('yt-dlp') or os.environ.get('YTDLP_BINARY', '')
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'yt-dlp',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def find_node() -> dict | None:
|
||||
"""Find node binary."""
|
||||
try:
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
||||
|
||||
class NodeBinary(Binary):
|
||||
name: str = 'node'
|
||||
binproviders_supported = [AptProvider(), BrewProvider(), EnvProvider()]
|
||||
overrides: dict = {'apt': {'packages': ['nodejs']}}
|
||||
|
||||
binary = NodeBinary()
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
'name': 'node',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to shutil.which
|
||||
abspath = shutil.which('node') or os.environ.get('NODE_BINARY', '')
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'node',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def find_ffmpeg() -> dict | None:
|
||||
"""Find ffmpeg binary."""
|
||||
try:
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
||||
|
||||
class FfmpegBinary(Binary):
|
||||
name: str = 'ffmpeg'
|
||||
binproviders_supported = [AptProvider(), BrewProvider(), EnvProvider()]
|
||||
|
||||
binary = FfmpegBinary()
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
'name': 'ffmpeg',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to shutil.which
|
||||
abspath = shutil.which('ffmpeg') or os.environ.get('FFMPEG_BINARY', '')
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'ffmpeg',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def main():
|
||||
# Check for yt-dlp (required)
|
||||
ytdlp_result = find_ytdlp()
|
||||
|
||||
# Check for node (required for JS extraction)
|
||||
node_result = find_node()
|
||||
|
||||
# Check for ffmpeg (required for video conversion)
|
||||
ffmpeg_result = find_ffmpeg()
|
||||
|
||||
missing_deps = []
|
||||
|
||||
# Emit results for yt-dlp
|
||||
if ytdlp_result and ytdlp_result.get('abspath'):
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': ytdlp_result['name'],
|
||||
'abspath': ytdlp_result['abspath'],
|
||||
'version': ytdlp_result['version'],
|
||||
'sha256': ytdlp_result['sha256'],
|
||||
'binprovider': ytdlp_result['binprovider'],
|
||||
}))
|
||||
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/YTDLP_BINARY',
|
||||
'value': ytdlp_result['abspath'],
|
||||
}))
|
||||
|
||||
if ytdlp_result['version']:
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/YTDLP_VERSION',
|
||||
'value': ytdlp_result['version'],
|
||||
}))
|
||||
else:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'yt-dlp',
|
||||
'bin_providers': 'pip,env',
|
||||
}))
|
||||
missing_deps.append('yt-dlp')
|
||||
|
||||
# Emit results for node
|
||||
if node_result and node_result.get('abspath'):
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': node_result['name'],
|
||||
'abspath': node_result['abspath'],
|
||||
'version': node_result['version'],
|
||||
'sha256': node_result['sha256'],
|
||||
'binprovider': node_result['binprovider'],
|
||||
}))
|
||||
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/NODE_BINARY',
|
||||
'value': node_result['abspath'],
|
||||
}))
|
||||
|
||||
if node_result['version']:
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/NODE_VERSION',
|
||||
'value': node_result['version'],
|
||||
}))
|
||||
else:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'node',
|
||||
'bin_providers': 'apt,brew,env',
|
||||
}))
|
||||
missing_deps.append('node')
|
||||
|
||||
# Emit results for ffmpeg
|
||||
if ffmpeg_result and ffmpeg_result.get('abspath'):
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': ffmpeg_result['name'],
|
||||
'abspath': ffmpeg_result['abspath'],
|
||||
'version': ffmpeg_result['version'],
|
||||
'sha256': ffmpeg_result['sha256'],
|
||||
'binprovider': ffmpeg_result['binprovider'],
|
||||
}))
|
||||
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/FFMPEG_BINARY',
|
||||
'value': ffmpeg_result['abspath'],
|
||||
}))
|
||||
|
||||
if ffmpeg_result['version']:
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/FFMPEG_VERSION',
|
||||
'value': ffmpeg_result['version'],
|
||||
}))
|
||||
else:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'ffmpeg',
|
||||
'bin_providers': 'apt,brew,env',
|
||||
}))
|
||||
missing_deps.append('ffmpeg')
|
||||
|
||||
if missing_deps:
|
||||
print(f"Missing dependencies: {', '.join(missing_deps)}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
else:
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
9
archivebox/plugins/media/templates/embed.html
Normal file
9
archivebox/plugins/media/templates/embed.html
Normal file
@@ -0,0 +1,9 @@
|
||||
<!-- Media embed - video/audio player -->
|
||||
<div class="extractor-embed media-embed" style="width: 100%; height: 100%; min-height: 400px; background: #1a1a1a; display: flex; align-items: center; justify-content: center;">
|
||||
<video src="{{ output_path }}"
|
||||
style="max-width: 100%; max-height: 100%;"
|
||||
controls
|
||||
preload="metadata">
|
||||
Your browser does not support the video tag.
|
||||
</video>
|
||||
</div>
|
||||
10
archivebox/plugins/media/templates/fullscreen.html
Normal file
10
archivebox/plugins/media/templates/fullscreen.html
Normal file
@@ -0,0 +1,10 @@
|
||||
<!-- Media fullscreen - full video/audio player -->
|
||||
<div class="extractor-fullscreen media-fullscreen" style="width: 100%; height: 100vh; background: #000; display: flex; align-items: center; justify-content: center;">
|
||||
<video src="{{ output_path }}"
|
||||
style="max-width: 100%; max-height: 100%;"
|
||||
controls
|
||||
autoplay
|
||||
preload="auto">
|
||||
Your browser does not support the video tag.
|
||||
</video>
|
||||
</div>
|
||||
1
archivebox/plugins/media/templates/icon.html
Normal file
1
archivebox/plugins/media/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
🎬
|
||||
14
archivebox/plugins/media/templates/thumbnail.html
Normal file
14
archivebox/plugins/media/templates/thumbnail.html
Normal file
@@ -0,0 +1,14 @@
|
||||
<!-- Media thumbnail - shows video/audio player or placeholder -->
|
||||
<div class="extractor-thumbnail media-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #1a1a1a; display: flex; align-items: center; justify-content: center;">
|
||||
<video src="{{ output_path }}"
|
||||
style="width: 100%; height: 100px; object-fit: contain;"
|
||||
poster=""
|
||||
preload="metadata"
|
||||
muted
|
||||
onerror="this.style.display='none'; this.nextElementSibling.style.display='flex';">
|
||||
</video>
|
||||
<div style="display: none; flex-direction: column; align-items: center; color: #888; font-size: 12px;">
|
||||
<span style="font-size: 32px;">🎬</span>
|
||||
<span>Media</span>
|
||||
</div>
|
||||
</div>
|
||||
@@ -21,7 +21,7 @@ import pytest
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
MEDIA_HOOK = PLUGIN_DIR / 'on_Snapshot__51_media.py'
|
||||
MEDIA_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_ytdlp.py'
|
||||
MEDIA_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_ytdlp.py'
|
||||
TEST_URL = 'https://example.com/video.mp4'
|
||||
|
||||
def test_hook_script_exists():
|
||||
@@ -29,46 +29,72 @@ def test_hook_script_exists():
|
||||
assert MEDIA_HOOK.exists(), f"Hook not found: {MEDIA_HOOK}"
|
||||
|
||||
|
||||
def test_ytdlp_install_hook():
|
||||
"""Test yt-dlp install hook to install yt-dlp if needed."""
|
||||
# Run yt-dlp install hook
|
||||
def test_ytdlp_validate_hook():
|
||||
"""Test yt-dlp validate hook checks for yt-dlp and dependencies (node, ffmpeg)."""
|
||||
# Run yt-dlp validate hook
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(MEDIA_INSTALL_HOOK)],
|
||||
[sys.executable, str(MEDIA_VALIDATE_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=600
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
||||
# Hook exits 0 if all binaries found, 1 if any not found
|
||||
# Parse output for InstalledBinary and Dependency records
|
||||
found_binaries = {'node': False, 'ffmpeg': False, 'yt-dlp': False}
|
||||
found_dependencies = {'node': False, 'ffmpeg': False, 'yt-dlp': False}
|
||||
|
||||
# Verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'yt-dlp'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
name = record['name']
|
||||
if name in found_binaries:
|
||||
assert record['abspath'], f"{name} should have abspath"
|
||||
found_binaries[name] = True
|
||||
elif record.get('type') == 'Dependency':
|
||||
name = record['bin_name']
|
||||
if name in found_dependencies:
|
||||
found_dependencies[name] = True
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert found_binary, "Should output InstalledBinary record"
|
||||
# Each binary should either be found (InstalledBinary) or missing (Dependency)
|
||||
for binary_name in ['yt-dlp', 'node', 'ffmpeg']:
|
||||
assert found_binaries[binary_name] or found_dependencies[binary_name], \
|
||||
f"{binary_name} should have either InstalledBinary or Dependency record"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify yt-dlp is available via abx-pkg after hook installation."""
|
||||
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
|
||||
"""Verify yt-dlp, node, and ffmpeg are available via abx-pkg."""
|
||||
from abx_pkg import Binary, PipProvider, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
PipProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
missing_binaries = []
|
||||
|
||||
# Verify yt-dlp is available
|
||||
ytdlp_binary = Binary(name='yt-dlp', binproviders=[PipProvider(), EnvProvider()])
|
||||
ytdlp_loaded = ytdlp_binary.load()
|
||||
assert ytdlp_loaded and ytdlp_loaded.abspath, "yt-dlp should be available after install hook"
|
||||
if not (ytdlp_loaded and ytdlp_loaded.abspath):
|
||||
missing_binaries.append('yt-dlp')
|
||||
|
||||
# Verify node is available (yt-dlp needs it for JS extraction)
|
||||
node_binary = Binary(
|
||||
name='node',
|
||||
binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
|
||||
)
|
||||
node_loaded = node_binary.load()
|
||||
if not (node_loaded and node_loaded.abspath):
|
||||
missing_binaries.append('node')
|
||||
|
||||
# Verify ffmpeg is available (yt-dlp needs it for video conversion)
|
||||
ffmpeg_binary = Binary(name='ffmpeg', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
||||
ffmpeg_loaded = ffmpeg_binary.load()
|
||||
if not (ffmpeg_loaded and ffmpeg_loaded.abspath):
|
||||
missing_binaries.append('ffmpeg')
|
||||
|
||||
if missing_binaries:
|
||||
pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
|
||||
|
||||
def test_handles_non_media_url():
|
||||
"""Test that media extractor handles non-media URLs gracefully via hook."""
|
||||
|
||||
@@ -1,68 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install mercury-parser if not already available.
|
||||
|
||||
Runs at crawl start to ensure mercury-parser is installed.
|
||||
Outputs JSONL for InstalledBinary.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main():
    """Ensure mercury-parser is installed (installing via npm if needed).

    On success, prints an InstalledBinary JSONL record and exits 0.
    On failure, prints a Dependency JSONL record plus an error message on
    stderr and exits 1.
    """
    try:
        from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides

        NpmProvider.model_rebuild()
        EnvProvider.model_rebuild()

        # Note: npm package is @postlight/mercury-parser, binary is mercury-parser
        mercury = Binary(
            name='mercury-parser',
            binproviders=[NpmProvider(), EnvProvider()],
            overrides={'npm': {'packages': ['@postlight/mercury-parser']}}
        )

        # Load if already present; fall back to installing via npm.
        try:
            loaded = mercury.load()
            if not loaded or not loaded.abspath:
                raise Exception("Not loaded")
        except Exception:
            loaded = mercury.install()

        if loaded and loaded.abspath:
            # Report the resolved binary as an InstalledBinary record.
            print(json.dumps({
                'type': 'InstalledBinary',
                'name': 'mercury-parser',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256,
                'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
            }))
            sys.exit(0)

        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'mercury-parser',
            'bin_providers': 'npm,env',
        }))
        print("Failed to install mercury-parser", file=sys.stderr)
        sys.exit(1)

    except Exception as e:
        # SystemExit is a BaseException, so the exits above are not caught here.
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'mercury-parser',
            'bin_providers': 'npm,env',
        }))
        print(f"Error installing mercury-parser: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()
|
||||
123
archivebox/plugins/mercury/on_Crawl__00_validate_mercury.py
Executable file
123
archivebox/plugins/mercury/on_Crawl__00_validate_mercury.py
Executable file
@@ -0,0 +1,123 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validation hook for postlight-parser binary.
|
||||
|
||||
Runs at crawl start to verify postlight-parser is available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str) -> str | None:
|
||||
"""Get version string from binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
return first_line[:64]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_mercury() -> dict | None:
    """Locate the postlight-parser binary via abx_pkg, falling back to PATH/env.

    Returns a dict with 'name', 'abspath', 'version', 'sha256', and
    'binprovider' keys, or None when the binary cannot be found.
    """
    # Preferred path: abx_pkg provider resolution (npm package @postlight/parser).
    try:
        from abx_pkg import Binary, NpmProvider, EnvProvider

        class MercuryBinary(Binary):
            name: str = 'postlight-parser'
            binproviders_supported = [NpmProvider(), EnvProvider()]
            overrides: dict = {'npm': {'packages': ['@postlight/parser']}}

        loaded = MercuryBinary().load()
        if loaded and loaded.abspath:
            provider = loaded.binprovider.name if loaded.binprovider else 'env'
            return {
                'name': 'postlight-parser',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
                'binprovider': provider,
            }
    except ImportError:
        pass
    except Exception:
        pass

    # Fallback: plain PATH search, then the MERCURY_BINARY env var.
    candidate = shutil.which('postlight-parser') or os.environ.get('MERCURY_BINARY', '')
    if candidate and Path(candidate).is_file():
        return {
            'name': 'postlight-parser',
            'abspath': candidate,
            'version': get_binary_version(candidate),
            'sha256': get_binary_hash(candidate),
            'binprovider': 'env',
        }

    return None
|
||||
|
||||
|
||||
def main():
    """Report whether postlight-parser is installed, as JSONL records.

    When found: prints an InstalledBinary record plus Machine config updates
    (MERCURY_BINARY and, if known, MERCURY_VERSION) and exits 0.
    When missing: prints a Dependency record and an error on stderr, exits 1.
    """
    info = find_mercury()

    if not (info and info.get('abspath')):
        # Binary missing: describe how it could be installed, then fail.
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'postlight-parser',
            'bin_providers': 'npm,env',
        }))
        print(f"postlight-parser binary not found", file=sys.stderr)
        sys.exit(1)

    print(json.dumps({
        'type': 'InstalledBinary',
        'name': info['name'],
        'abspath': info['abspath'],
        'version': info['version'],
        'sha256': info['sha256'],
        'binprovider': info['binprovider'],
    }))

    print(json.dumps({
        'type': 'Machine',
        '_method': 'update',
        'key': 'config/MERCURY_BINARY',
        'value': info['abspath'],
    }))

    if info['version']:
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/MERCURY_VERSION',
            'value': info['version'],
        }))

    sys.exit(0)


if __name__ == '__main__':
    main()
|
||||
@@ -6,10 +6,10 @@ Usage: on_Snapshot__mercury.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Creates mercury/ directory with content.html, content.txt, article.json
|
||||
|
||||
Environment variables:
|
||||
MERCURY_BINARY: Path to mercury-parser binary
|
||||
MERCURY_BINARY: Path to postlight-parser binary
|
||||
TIMEOUT: Timeout in seconds (default: 60)
|
||||
|
||||
Note: Requires mercury-parser: npm install -g @postlight/mercury-parser
|
||||
Note: Requires postlight-parser: npm install -g @postlight/parser
|
||||
"""
|
||||
|
||||
import json
|
||||
@@ -25,7 +25,7 @@ import rich_click as click
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'mercury'
|
||||
BIN_NAME = 'mercury-parser'
|
||||
BIN_NAME = 'postlight-parser'
|
||||
BIN_PROVIDERS = 'npm,env'
|
||||
OUTPUT_DIR = 'mercury'
|
||||
|
||||
@@ -42,12 +42,12 @@ def get_env_int(name: str, default: int = 0) -> int:
|
||||
|
||||
|
||||
def find_mercury() -> str | None:
|
||||
"""Find mercury-parser binary."""
|
||||
"""Find postlight-parser binary."""
|
||||
mercury = get_env('MERCURY_BINARY')
|
||||
if mercury and os.path.isfile(mercury):
|
||||
return mercury
|
||||
|
||||
for name in ['mercury-parser', 'mercury']:
|
||||
for name in ['postlight-parser']:
|
||||
binary = shutil.which(name)
|
||||
if binary:
|
||||
return binary
|
||||
@@ -56,7 +56,7 @@ def find_mercury() -> str | None:
|
||||
|
||||
|
||||
def get_version(binary: str) -> str:
|
||||
"""Get mercury-parser version."""
|
||||
"""Get postlight-parser version."""
|
||||
try:
|
||||
result = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
|
||||
return result.stdout.strip()[:64]
|
||||
@@ -83,12 +83,12 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
|
||||
if result_text.returncode != 0:
|
||||
stderr = result_text.stderr.decode('utf-8', errors='replace')
|
||||
return False, None, f'mercury-parser failed: {stderr[:200]}'
|
||||
return False, None, f'postlight-parser failed: {stderr[:200]}'
|
||||
|
||||
try:
|
||||
text_json = json.loads(result_text.stdout)
|
||||
except json.JSONDecodeError:
|
||||
return False, None, 'mercury-parser returned invalid JSON'
|
||||
return False, None, 'postlight-parser returned invalid JSON'
|
||||
|
||||
if text_json.get('failed'):
|
||||
return False, None, 'Mercury was not able to extract article'
|
||||
@@ -139,7 +139,7 @@ def main(url: str, snapshot_id: str):
|
||||
# Find binary
|
||||
binary = find_mercury()
|
||||
if not binary:
|
||||
print(f'ERROR: mercury-parser binary not found', file=sys.stderr)
|
||||
print(f'ERROR: postlight-parser binary not found', file=sys.stderr)
|
||||
print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
|
||||
print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
6
archivebox/plugins/mercury/templates/embed.html
Normal file
6
archivebox/plugins/mercury/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- Mercury embed - Mercury parser article view -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-embed mercury-embed"
|
||||
style="width: 100%; height: 100%; min-height: 500px; border: none; background: #fefefe;"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
6
archivebox/plugins/mercury/templates/fullscreen.html
Normal file
6
archivebox/plugins/mercury/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- Mercury fullscreen - full Mercury parser article -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-fullscreen mercury-fullscreen"
|
||||
style="width: 100%; height: 100vh; border: none; background: #fefefe;"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
1
archivebox/plugins/mercury/templates/icon.html
Normal file
1
archivebox/plugins/mercury/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
☿️
|
||||
8
archivebox/plugins/mercury/templates/thumbnail.html
Normal file
8
archivebox/plugins/mercury/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
||||
<!-- Mercury thumbnail - shows Mercury parser extracted article content -->
|
||||
<div class="extractor-thumbnail mercury-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fefefe; padding: 8px; font-family: Georgia, serif; font-size: 11px; line-height: 1.4; color: #333;">
|
||||
<iframe src="{{ output_path }}"
|
||||
style="width: 100%; height: 300px; border: none; pointer-events: none;"
|
||||
loading="lazy"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
</div>
|
||||
@@ -21,7 +21,7 @@ import pytest
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
MERCURY_HOOK = PLUGIN_DIR / 'on_Snapshot__53_mercury.py'
|
||||
MERCURY_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_mercury.py'
|
||||
MERCURY_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_mercury.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
def test_hook_script_exists():
|
||||
@@ -29,53 +29,70 @@ def test_hook_script_exists():
|
||||
assert MERCURY_HOOK.exists(), f"Hook not found: {MERCURY_HOOK}"
|
||||
|
||||
|
||||
def test_mercury_install_hook():
|
||||
"""Test mercury install hook to install mercury-parser if needed."""
|
||||
# Run mercury install hook
|
||||
def test_mercury_validate_hook():
|
||||
"""Test mercury validate hook checks for postlight-parser."""
|
||||
# Run mercury validate hook
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(MERCURY_INSTALL_HOOK)],
|
||||
[sys.executable, str(MERCURY_VALIDATE_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=600
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
||||
|
||||
# Verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'mercury-parser'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert found_binary, "Should output InstalledBinary record"
|
||||
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
|
||||
if result.returncode == 0:
|
||||
# Binary found - verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'postlight-parser'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_binary, "Should output InstalledBinary record when binary found"
|
||||
else:
|
||||
# Binary not found - verify Dependency JSONL output
|
||||
found_dependency = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Dependency':
|
||||
assert record['bin_name'] == 'postlight-parser'
|
||||
assert 'npm' in record['bin_providers']
|
||||
found_dependency = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_dependency, "Should output Dependency record when binary not found"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify mercury-parser is available via abx-pkg after hook installation."""
|
||||
"""Verify postlight-parser is available via abx-pkg."""
|
||||
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
NpmProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
# Verify mercury-parser is available
|
||||
# Verify postlight-parser is available
|
||||
mercury_binary = Binary(
|
||||
name='mercury-parser',
|
||||
name='postlight-parser',
|
||||
binproviders=[NpmProvider(), EnvProvider()],
|
||||
overrides={'npm': {'packages': ['@postlight/mercury-parser']}}
|
||||
overrides={'npm': {'packages': ['@postlight/parser']}}
|
||||
)
|
||||
mercury_loaded = mercury_binary.load()
|
||||
assert mercury_loaded and mercury_loaded.abspath, "mercury-parser should be available after install hook"
|
||||
|
||||
# If validate hook found it (exit 0), this should succeed
|
||||
# If validate hook didn't find it (exit 1), this may fail unless binprovider installed it
|
||||
if mercury_loaded and mercury_loaded.abspath:
|
||||
assert True, "postlight-parser is available"
|
||||
else:
|
||||
pytest.skip("postlight-parser not available - Dependency record should have been emitted")
|
||||
|
||||
def test_extracts_with_mercury_parser():
|
||||
"""Test full workflow: extract with mercury-parser from real HTML via hook."""
|
||||
"""Test full workflow: extract with postlight-parser from real HTML via hook."""
|
||||
# Prerequisites checked by earlier test
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
|
||||
@@ -2,46 +2,28 @@
|
||||
"""
|
||||
Create a Merkle tree of all archived outputs.
|
||||
|
||||
This plugin runs after all extractors and post-processing complete (priority 92)
|
||||
and generates a cryptographic Merkle tree of all files in the snapshot directory.
|
||||
This provides:
|
||||
- Tamper detection: verify archive integrity
|
||||
- Efficient updates: only re-hash changed files
|
||||
- Compact proofs: prove file inclusion without sending all files
|
||||
- Deduplication: identify identical content across snapshots
|
||||
This plugin runs after all extractors complete (priority 93) and generates
|
||||
a cryptographic Merkle tree of all files in the snapshot directory.
|
||||
|
||||
Output: merkletree/merkletree.json containing:
|
||||
- root_hash: SHA256 hash of the Merkle root
|
||||
- tree: Full tree structure with internal nodes
|
||||
- files: List of all files with their hashes
|
||||
- metadata: Timestamp, file count, total size
|
||||
Output: merkletree.json containing root_hash, tree structure, file list, metadata
|
||||
|
||||
Usage: on_Snapshot__92_merkletree.py --url=<url> --snapshot-id=<uuid>
|
||||
Usage: on_Snapshot__93_merkletree.py --url=<url> --snapshot-id=<uuid>
|
||||
|
||||
Environment variables:
|
||||
SAVE_MERKLETREE: Enable merkle tree generation (default: true)
|
||||
DATA_DIR: ArchiveBox data directory
|
||||
ARCHIVE_DIR: Archive output directory
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.plugins.merkletree'
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict, List, Optional, Tuple, Any
|
||||
|
||||
# Configure Django if running standalone
|
||||
if __name__ == '__main__':
|
||||
parent_dir = str(Path(__file__).resolve().parent.parent.parent)
|
||||
if parent_dir not in sys.path:
|
||||
sys.path.insert(0, parent_dir)
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
|
||||
import django
|
||||
django.setup()
|
||||
|
||||
import rich_click as click
|
||||
import click
|
||||
|
||||
|
||||
def sha256_file(filepath: Path) -> str:
|
||||
@@ -49,12 +31,10 @@ def sha256_file(filepath: Path) -> str:
|
||||
h = hashlib.sha256()
|
||||
try:
|
||||
with open(filepath, 'rb') as f:
|
||||
# Read in 64kb chunks
|
||||
while chunk := f.read(65536):
|
||||
h.update(chunk)
|
||||
return h.hexdigest()
|
||||
except (OSError, PermissionError):
|
||||
# If we can't read the file, return a null hash
|
||||
return '0' * 64
|
||||
|
||||
|
||||
@@ -64,74 +44,45 @@ def sha256_data(data: bytes) -> str:
|
||||
|
||||
|
||||
def collect_files(snapshot_dir: Path, exclude_dirs: Optional[List[str]] = None) -> List[Tuple[Path, str, int]]:
|
||||
"""
|
||||
Recursively collect all files in snapshot directory.
|
||||
|
||||
Args:
|
||||
snapshot_dir: Root directory to scan
|
||||
exclude_dirs: Directory names to exclude (e.g., ['merkletree', '.git'])
|
||||
|
||||
Returns:
|
||||
List of (relative_path, sha256_hash, file_size) tuples
|
||||
"""
|
||||
"""Recursively collect all files in snapshot directory."""
|
||||
exclude_dirs = exclude_dirs or ['merkletree', '.git', '__pycache__']
|
||||
files = []
|
||||
|
||||
for root, dirs, filenames in os.walk(snapshot_dir):
|
||||
# Filter out excluded directories
|
||||
dirs[:] = [d for d in dirs if d not in exclude_dirs]
|
||||
|
||||
for filename in filenames:
|
||||
filepath = Path(root) / filename
|
||||
rel_path = filepath.relative_to(snapshot_dir)
|
||||
|
||||
# Skip symlinks (we hash the target, not the link)
|
||||
if filepath.is_symlink():
|
||||
continue
|
||||
|
||||
# Compute hash and size
|
||||
file_hash = sha256_file(filepath)
|
||||
file_size = filepath.stat().st_size if filepath.exists() else 0
|
||||
|
||||
files.append((rel_path, file_hash, file_size))
|
||||
|
||||
# Sort by path for deterministic tree
|
||||
files.sort(key=lambda x: str(x[0]))
|
||||
return files
|
||||
|
||||
|
||||
def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]:
|
||||
"""
|
||||
Build a Merkle tree from a list of leaf hashes.
|
||||
|
||||
Args:
|
||||
file_hashes: List of SHA256 hashes (leaves)
|
||||
|
||||
Returns:
|
||||
(root_hash, tree_levels) where tree_levels is a list of hash lists per level
|
||||
"""
|
||||
"""Build a Merkle tree from a list of leaf hashes."""
|
||||
if not file_hashes:
|
||||
# Empty tree
|
||||
return sha256_data(b''), [[]]
|
||||
|
||||
# Initialize with leaf level
|
||||
tree_levels = [file_hashes.copy()]
|
||||
|
||||
# Build tree bottom-up
|
||||
while len(tree_levels[-1]) > 1:
|
||||
current_level = tree_levels[-1]
|
||||
next_level = []
|
||||
|
||||
# Process pairs
|
||||
for i in range(0, len(current_level), 2):
|
||||
left = current_level[i]
|
||||
|
||||
if i + 1 < len(current_level):
|
||||
# Combine left + right
|
||||
right = current_level[i + 1]
|
||||
combined = left + right
|
||||
else:
|
||||
# Odd number of nodes: duplicate the last one
|
||||
combined = left + left
|
||||
|
||||
parent_hash = sha256_data(combined.encode('utf-8'))
|
||||
@@ -139,67 +90,41 @@ def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]:
|
||||
|
||||
tree_levels.append(next_level)
|
||||
|
||||
# Root is the single hash at the top level
|
||||
root_hash = tree_levels[-1][0]
|
||||
return root_hash, tree_levels
|
||||
|
||||
|
||||
def create_merkle_tree(snapshot_dir: Path) -> Dict[str, Any]:
|
||||
"""
|
||||
Create a complete Merkle tree of all files in snapshot directory.
|
||||
|
||||
Args:
|
||||
snapshot_dir: The snapshot directory to scan
|
||||
|
||||
Returns:
|
||||
Dict containing root_hash, tree structure, file list, and metadata
|
||||
"""
|
||||
# Collect all files
|
||||
"""Create a complete Merkle tree of all files in snapshot directory."""
|
||||
files = collect_files(snapshot_dir)
|
||||
|
||||
# Extract just the hashes for tree building
|
||||
file_hashes = [file_hash for _, file_hash, _ in files]
|
||||
|
||||
# Build Merkle tree
|
||||
root_hash, tree_levels = build_merkle_tree(file_hashes)
|
||||
|
||||
# Calculate total size
|
||||
total_size = sum(size for _, _, size in files)
|
||||
|
||||
# Prepare file list with metadata
|
||||
file_list = [
|
||||
{
|
||||
'path': str(path),
|
||||
'hash': file_hash,
|
||||
'size': size,
|
||||
}
|
||||
{'path': str(path), 'hash': file_hash, 'size': size}
|
||||
for path, file_hash, size in files
|
||||
]
|
||||
|
||||
# Prepare result
|
||||
result = {
|
||||
return {
|
||||
'root_hash': root_hash,
|
||||
'tree_levels': tree_levels,
|
||||
'files': file_list,
|
||||
'metadata': {
|
||||
'timestamp': datetime.now().isoformat(),
|
||||
'timestamp': datetime.now(timezone.utc).isoformat(),
|
||||
'file_count': len(files),
|
||||
'total_size': total_size,
|
||||
'tree_depth': len(tree_levels),
|
||||
},
|
||||
}
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--url', required=True, help='URL being archived')
|
||||
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
|
||||
def main(url: str, snapshot_id: str):
|
||||
"""Generate Merkle tree of all archived outputs."""
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
start_ts = datetime.now()
|
||||
start_ts = datetime.now(timezone.utc)
|
||||
status = 'failed'
|
||||
output = None
|
||||
error = ''
|
||||
@@ -211,30 +136,19 @@ def main(url: str, snapshot_id: str):
|
||||
save_merkletree = os.getenv('SAVE_MERKLETREE', 'true').lower() in ('true', '1', 'yes', 'on')
|
||||
|
||||
if not save_merkletree:
|
||||
click.echo('Skipping merkle tree (SAVE_MERKLETREE=False)')
|
||||
status = 'skipped'
|
||||
end_ts = datetime.now()
|
||||
click.echo(f'START_TS={start_ts.isoformat()}')
|
||||
click.echo(f'END_TS={end_ts.isoformat()}')
|
||||
click.echo(f'STATUS={status}')
|
||||
click.echo(f'RESULT_JSON={{"extractor": "merkletree", "status": "{status}", "url": "{url}", "snapshot_id": "{snapshot_id}"}}')
|
||||
click.echo(json.dumps({'status': status, 'output': 'SAVE_MERKLETREE=false'}))
|
||||
sys.exit(0)
|
||||
|
||||
# Get snapshot
|
||||
try:
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
except Snapshot.DoesNotExist:
|
||||
error = f'Snapshot {snapshot_id} not found'
|
||||
raise ValueError(error)
|
||||
# Working directory is the extractor output dir (e.g., <snapshot>/merkletree/)
|
||||
# Parent is the snapshot directory
|
||||
output_dir = Path.cwd()
|
||||
snapshot_dir = output_dir.parent
|
||||
|
||||
# Get snapshot directory
|
||||
snapshot_dir = Path(snapshot.output_dir)
|
||||
if not snapshot_dir.exists():
|
||||
error = f'Snapshot directory not found: {snapshot_dir}'
|
||||
raise FileNotFoundError(error)
|
||||
raise FileNotFoundError(f'Snapshot directory not found: {snapshot_dir}')
|
||||
|
||||
# Create output directory
|
||||
output_dir = snapshot_dir / 'merkletree'
|
||||
# Ensure output directory exists
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
output_path = output_dir / 'merkletree.json'
|
||||
|
||||
@@ -246,49 +160,31 @@ def main(url: str, snapshot_id: str):
|
||||
json.dump(merkle_data, f, indent=2)
|
||||
|
||||
status = 'succeeded'
|
||||
output = str(output_path)
|
||||
output = 'merkletree.json'
|
||||
root_hash = merkle_data['root_hash']
|
||||
file_count = merkle_data['metadata']['file_count']
|
||||
total_size = merkle_data['metadata']['total_size']
|
||||
|
||||
click.echo(f'Merkle tree created: {file_count} files, root={root_hash[:16]}..., size={total_size:,} bytes')
|
||||
click.echo(f'Merkle tree: {file_count} files, root={root_hash[:16]}..., size={total_size:,} bytes')
|
||||
|
||||
except Exception as e:
|
||||
error = f'{type(e).__name__}: {e}'
|
||||
status = 'failed'
|
||||
click.echo(f'Error: {error}', err=True)
|
||||
|
||||
end_ts = datetime.now()
|
||||
duration = (end_ts - start_ts).total_seconds()
|
||||
end_ts = datetime.now(timezone.utc)
|
||||
|
||||
# Print results
|
||||
click.echo(f'START_TS={start_ts.isoformat()}')
|
||||
click.echo(f'END_TS={end_ts.isoformat()}')
|
||||
click.echo(f'DURATION={duration:.2f}')
|
||||
if output:
|
||||
click.echo(f'OUTPUT={output}')
|
||||
click.echo(f'STATUS={status}')
|
||||
|
||||
if error:
|
||||
click.echo(f'ERROR={error}', err=True)
|
||||
|
||||
# Print JSON result
|
||||
result_json = {
|
||||
'extractor': 'merkletree',
|
||||
'url': url,
|
||||
'snapshot_id': snapshot_id,
|
||||
# Print JSON result for hook runner
|
||||
result = {
|
||||
'status': status,
|
||||
'start_ts': start_ts.isoformat(),
|
||||
'end_ts': end_ts.isoformat(),
|
||||
'duration': round(duration, 2),
|
||||
'output': output,
|
||||
'error': error or None,
|
||||
'root_hash': root_hash,
|
||||
'file_count': file_count,
|
||||
'error': error or None,
|
||||
}
|
||||
click.echo(f'RESULT_JSON={json.dumps(result_json)}')
|
||||
click.echo(json.dumps(result))
|
||||
|
||||
sys.exit(0 if status == 'succeeded' else 1)
|
||||
sys.exit(0 if status in ('succeeded', 'skipped') else 1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
🔗
|
||||
@@ -133,7 +133,8 @@ def fetch_content(url: str) -> str:
|
||||
|
||||
@click.command()
|
||||
@click.option('--url', required=True, help='HTML URL to parse')
|
||||
def main(url: str):
|
||||
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
|
||||
def main(url: str, snapshot_id: str = None):
|
||||
"""Parse HTML and extract href URLs."""
|
||||
|
||||
# Skip only if parse_dom_outlinks already ran AND found URLs (it uses Chrome for better coverage)
|
||||
|
||||
1
archivebox/plugins/parse_html_urls/templates/icon.html
Normal file
1
archivebox/plugins/parse_html_urls/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
🔗
|
||||
@@ -127,7 +127,8 @@ def fetch_content(url: str) -> str:
|
||||
|
||||
@click.command()
|
||||
@click.option('--url', required=True, help='JSONL file URL to parse')
|
||||
def main(url: str):
|
||||
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
|
||||
def main(url: str, snapshot_id: str = None):
|
||||
"""Parse JSONL bookmark file and extract URLs."""
|
||||
|
||||
try:
|
||||
|
||||
1
archivebox/plugins/parse_jsonl_urls/templates/icon.html
Normal file
1
archivebox/plugins/parse_jsonl_urls/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📋
|
||||
@@ -52,7 +52,8 @@ def fetch_content(url: str) -> str:
|
||||
|
||||
@click.command()
|
||||
@click.option('--url', required=True, help='Netscape bookmark file URL to parse')
|
||||
def main(url: str):
|
||||
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
|
||||
def main(url: str, snapshot_id: str = None):
|
||||
"""Parse Netscape bookmark HTML and extract URLs."""
|
||||
|
||||
try:
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
🔖
|
||||
@@ -51,7 +51,8 @@ def fetch_content(url: str) -> str:
|
||||
|
||||
@click.command()
|
||||
@click.option('--url', required=True, help='RSS/Atom feed URL to parse')
|
||||
def main(url: str):
|
||||
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
|
||||
def main(url: str, snapshot_id: str = None):
|
||||
"""Parse RSS/Atom feed and extract article URLs."""
|
||||
|
||||
if feedparser is None:
|
||||
|
||||
1
archivebox/plugins/parse_rss_urls/templates/icon.html
Normal file
1
archivebox/plugins/parse_rss_urls/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📡
|
||||
@@ -100,7 +100,8 @@ def fetch_content(url: str) -> str:
|
||||
|
||||
@click.command()
|
||||
@click.option('--url', required=True, help='URL to parse (file:// or https://)')
|
||||
def main(url: str):
|
||||
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
|
||||
def main(url: str, snapshot_id: str = None):
|
||||
"""Parse plain text and extract URLs."""
|
||||
|
||||
try:
|
||||
|
||||
1
archivebox/plugins/parse_txt_urls/templates/icon.html
Normal file
1
archivebox/plugins/parse_txt_urls/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📃
|
||||
5
archivebox/plugins/pdf/templates/embed.html
Normal file
5
archivebox/plugins/pdf/templates/embed.html
Normal file
@@ -0,0 +1,5 @@
|
||||
<!-- PDF embed - full PDF viewer -->
|
||||
<embed src="{{ output_path }}#toolbar=1&navpanes=1"
|
||||
type="application/pdf"
|
||||
class="extractor-embed pdf-embed"
|
||||
style="width: 100%; height: 100%; min-height: 500px;">
|
||||
5
archivebox/plugins/pdf/templates/fullscreen.html
Normal file
5
archivebox/plugins/pdf/templates/fullscreen.html
Normal file
@@ -0,0 +1,5 @@
|
||||
<!-- PDF fullscreen - full PDF viewer -->
|
||||
<embed src="{{ output_path }}#toolbar=1&navpanes=1&view=FitH"
|
||||
type="application/pdf"
|
||||
class="extractor-fullscreen pdf-fullscreen"
|
||||
style="width: 100%; height: 100vh;">
|
||||
1
archivebox/plugins/pdf/templates/icon.html
Normal file
1
archivebox/plugins/pdf/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📄
|
||||
6
archivebox/plugins/pdf/templates/thumbnail.html
Normal file
6
archivebox/plugins/pdf/templates/thumbnail.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- PDF thumbnail - shows first page preview -->
|
||||
<div class="extractor-thumbnail pdf-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #f5f5f5;">
|
||||
<embed src="{{ output_path }}#toolbar=0&navpanes=0&scrollbar=0&page=1&view=FitH"
|
||||
type="application/pdf"
|
||||
style="width: 100%; height: 200px; margin-top: -20px; pointer-events: none;">
|
||||
</div>
|
||||
@@ -1,68 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install readability-extractor if not already available.
|
||||
|
||||
Runs at crawl start to ensure readability-extractor is installed.
|
||||
Outputs JSONL for InstalledBinary.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main():
|
||||
try:
|
||||
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
NpmProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
# Note: npm package is from github:ArchiveBox/readability-extractor
|
||||
readability_binary = Binary(
|
||||
name='readability-extractor',
|
||||
binproviders=[NpmProvider(), EnvProvider()],
|
||||
overrides={'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
|
||||
)
|
||||
|
||||
# Try to load, install if not found
|
||||
try:
|
||||
loaded = readability_binary.load()
|
||||
if not loaded or not loaded.abspath:
|
||||
raise Exception("Not loaded")
|
||||
except Exception:
|
||||
# Install via npm from GitHub repo
|
||||
loaded = readability_binary.install()
|
||||
|
||||
if loaded and loaded.abspath:
|
||||
# Output InstalledBinary JSONL
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': 'readability-extractor',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256,
|
||||
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
|
||||
}))
|
||||
sys.exit(0)
|
||||
else:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'readability-extractor',
|
||||
'bin_providers': 'npm,env',
|
||||
}))
|
||||
print("Failed to install readability-extractor", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
except Exception as e:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'readability-extractor',
|
||||
'bin_providers': 'npm,env',
|
||||
}))
|
||||
print(f"Error installing readability-extractor: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
123
archivebox/plugins/readability/on_Crawl__00_validate_readability.py
Executable file
123
archivebox/plugins/readability/on_Crawl__00_validate_readability.py
Executable file
@@ -0,0 +1,123 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validation hook for readability-extractor binary.
|
||||
|
||||
Runs at crawl start to verify readability-extractor is available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str) -> str | None:
|
||||
"""Get version string from binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
return first_line[:64]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_readability() -> dict | None:
|
||||
"""Find readability-extractor binary."""
|
||||
try:
|
||||
from abx_pkg import Binary, NpmProvider, EnvProvider
|
||||
|
||||
class ReadabilityBinary(Binary):
|
||||
name: str = 'readability-extractor'
|
||||
binproviders_supported = [NpmProvider(), EnvProvider()]
|
||||
overrides: dict = {'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
|
||||
|
||||
binary = ReadabilityBinary()
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
'name': 'readability-extractor',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to shutil.which
|
||||
abspath = shutil.which('readability-extractor') or os.environ.get('READABILITY_BINARY', '')
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'readability-extractor',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def main():
|
||||
result = find_readability()
|
||||
|
||||
if result and result.get('abspath'):
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': result['name'],
|
||||
'abspath': result['abspath'],
|
||||
'version': result['version'],
|
||||
'sha256': result['sha256'],
|
||||
'binprovider': result['binprovider'],
|
||||
}))
|
||||
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/READABILITY_BINARY',
|
||||
'value': result['abspath'],
|
||||
}))
|
||||
|
||||
if result['version']:
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'_method': 'update',
|
||||
'key': 'config/READABILITY_VERSION',
|
||||
'value': result['version'],
|
||||
}))
|
||||
|
||||
sys.exit(0)
|
||||
else:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'readability-extractor',
|
||||
'bin_providers': 'npm,env',
|
||||
}))
|
||||
print(f"readability-extractor binary not found", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
6
archivebox/plugins/readability/templates/embed.html
Normal file
6
archivebox/plugins/readability/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- Readability embed - reader-mode article view -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-embed readability-embed"
|
||||
style="width: 100%; height: 100%; min-height: 500px; border: none; background: #fefefe;"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
6
archivebox/plugins/readability/templates/fullscreen.html
Normal file
6
archivebox/plugins/readability/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- Readability fullscreen - full reader-mode article -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-fullscreen readability-fullscreen"
|
||||
style="width: 100%; height: 100vh; border: none; background: #fefefe;"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
1
archivebox/plugins/readability/templates/icon.html
Normal file
1
archivebox/plugins/readability/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📖
|
||||
8
archivebox/plugins/readability/templates/thumbnail.html
Normal file
8
archivebox/plugins/readability/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
||||
<!-- Readability thumbnail - shows reader-mode extracted article content -->
|
||||
<div class="extractor-thumbnail readability-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fefefe; padding: 8px; font-family: Georgia, serif; font-size: 11px; line-height: 1.4; color: #333;">
|
||||
<iframe src="{{ output_path }}"
|
||||
style="width: 100%; height: 300px; border: none; pointer-events: none;"
|
||||
loading="lazy"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
</div>
|
||||
@@ -2,7 +2,7 @@
|
||||
Integration tests for readability plugin
|
||||
|
||||
Tests verify:
|
||||
1. Install hook installs readability-extractor via abx-pkg
|
||||
1. Validate hook checks for readability-extractor binary
|
||||
2. Verify deps with abx-pkg
|
||||
3. Plugin reports missing dependency correctly
|
||||
4. Extraction works against real example.com content
|
||||
@@ -21,7 +21,7 @@ import pytest
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.py'))
|
||||
READABILITY_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_readability.py'
|
||||
READABILITY_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_readability.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
|
||||
@@ -101,48 +101,63 @@ def test_reports_missing_dependency_when_not_installed():
|
||||
assert 'readability-extractor' in combined or 'BIN_NAME' in combined, "Should mention readability-extractor"
|
||||
|
||||
|
||||
def test_readability_install_hook():
|
||||
"""Test readability install hook to install readability-extractor if needed."""
|
||||
def test_readability_validate_hook():
|
||||
"""Test readability validate hook checks for readability-extractor binary."""
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(READABILITY_INSTALL_HOOK)],
|
||||
[sys.executable, str(READABILITY_VALIDATE_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=600
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
||||
|
||||
# Verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'readability-extractor'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert found_binary, "Should output InstalledBinary record"
|
||||
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
|
||||
if result.returncode == 0:
|
||||
# Binary found - verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'readability-extractor'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_binary, "Should output InstalledBinary record when binary found"
|
||||
else:
|
||||
# Binary not found - verify Dependency JSONL output
|
||||
found_dependency = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Dependency':
|
||||
assert record['bin_name'] == 'readability-extractor'
|
||||
assert 'npm' in record['bin_providers']
|
||||
found_dependency = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_dependency, "Should output Dependency record when binary not found"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify readability-extractor is available via abx-pkg after hook installation."""
|
||||
"""Verify readability-extractor is available via abx-pkg."""
|
||||
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
NpmProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
readability_binary = Binary(
|
||||
name='readability-extractor',
|
||||
binproviders=[NpmProvider(), EnvProvider()],
|
||||
overrides={'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
|
||||
)
|
||||
readability_loaded = readability_binary.load()
|
||||
assert readability_loaded and readability_loaded.abspath, "readability-extractor should be available after install hook"
|
||||
|
||||
if readability_loaded and readability_loaded.abspath:
|
||||
assert True, "readability-extractor is available"
|
||||
else:
|
||||
pytest.skip("readability-extractor not available - Dependency record should have been emitted")
|
||||
|
||||
|
||||
def test_extracts_article_after_installation():
|
||||
|
||||
5
archivebox/plugins/screenshot/templates/embed.html
Normal file
5
archivebox/plugins/screenshot/templates/embed.html
Normal file
@@ -0,0 +1,5 @@
|
||||
<!-- Screenshot embed - full image view -->
|
||||
<img src="{{ output_path }}"
|
||||
alt="Screenshot of page"
|
||||
class="extractor-embed screenshot-embed"
|
||||
style="max-width: 100%; height: auto;">
|
||||
8
archivebox/plugins/screenshot/templates/fullscreen.html
Normal file
8
archivebox/plugins/screenshot/templates/fullscreen.html
Normal file
@@ -0,0 +1,8 @@
|
||||
<!-- Screenshot fullscreen - zoomable image -->
|
||||
<div style="width: 100%; height: 100vh; overflow: auto; background: #222; display: flex; align-items: start; justify-content: center;">
|
||||
<img src="{{ output_path }}"
|
||||
alt="Screenshot of page"
|
||||
class="extractor-fullscreen screenshot-fullscreen"
|
||||
style="max-width: 100%; cursor: zoom-in;"
|
||||
onclick="this.style.maxWidth = this.style.maxWidth === 'none' ? '100%' : 'none'; this.style.cursor = this.style.maxWidth === 'none' ? 'zoom-out' : 'zoom-in';">
|
||||
</div>
|
||||
1
archivebox/plugins/screenshot/templates/icon.html
Normal file
1
archivebox/plugins/screenshot/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📷
|
||||
8
archivebox/plugins/screenshot/templates/thumbnail.html
Normal file
8
archivebox/plugins/screenshot/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
||||
<!-- Screenshot thumbnail - shows the captured screenshot image -->
|
||||
<img src="{{ output_path }}"
|
||||
alt="Screenshot of page"
|
||||
class="extractor-thumbnail screenshot-thumbnail"
|
||||
style="width: 100%; height: 100px; object-fit: cover; object-position: top center; background: #333;"
|
||||
loading="lazy"
|
||||
onerror="this.style.display='none'; this.nextElementSibling.style.display='block';">
|
||||
<div style="display: none; text-align: center; padding: 20px; color: #999;">📷 Screenshot</div>
|
||||
6
archivebox/plugins/singlefile/templates/embed.html
Normal file
6
archivebox/plugins/singlefile/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- Singlefile embed - full iframe of archived HTML -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-embed singlefile-embed"
|
||||
style="width: 100%; height: 100%; min-height: 500px; border: none;"
|
||||
sandbox="allow-same-origin allow-scripts allow-forms">
|
||||
</iframe>
|
||||
6
archivebox/plugins/singlefile/templates/fullscreen.html
Normal file
6
archivebox/plugins/singlefile/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- Singlefile fullscreen - full page iframe -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-fullscreen singlefile-fullscreen"
|
||||
style="width: 100%; height: 100vh; border: none;"
|
||||
sandbox="allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation">
|
||||
</iframe>
|
||||
1
archivebox/plugins/singlefile/templates/icon.html
Normal file
1
archivebox/plugins/singlefile/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📦
|
||||
8
archivebox/plugins/singlefile/templates/thumbnail.html
Normal file
8
archivebox/plugins/singlefile/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
||||
<!-- Singlefile thumbnail - scaled down iframe preview of archived HTML -->
|
||||
<div class="extractor-thumbnail singlefile-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fff;">
|
||||
<iframe src="{{ output_path }}"
|
||||
style="width: 400%; height: 400px; transform: scale(0.25); transform-origin: top left; pointer-events: none; border: none;"
|
||||
loading="lazy"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
</div>
|
||||
1
archivebox/plugins/staticfile/templates/icon.html
Normal file
1
archivebox/plugins/staticfile/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📁
|
||||
1
archivebox/plugins/title/templates/icon.html
Normal file
1
archivebox/plugins/title/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📝
|
||||
@@ -1,68 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install wget if not already available.
|
||||
|
||||
Runs at crawl start to ensure wget is installed.
|
||||
Outputs JSONL for InstalledBinary.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def main():
|
||||
try:
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
AptProvider.model_rebuild()
|
||||
BrewProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
|
||||
# wget binary and package have same name
|
||||
wget_binary = Binary(
|
||||
name='wget',
|
||||
binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
|
||||
)
|
||||
|
||||
# Try to load, install if not found
|
||||
try:
|
||||
loaded = wget_binary.load()
|
||||
if not loaded or not loaded.abspath:
|
||||
raise Exception("Not loaded")
|
||||
except Exception:
|
||||
# Install via system package manager
|
||||
loaded = wget_binary.install()
|
||||
|
||||
if loaded and loaded.abspath:
|
||||
# Output InstalledBinary JSONL
|
||||
print(json.dumps({
|
||||
'type': 'InstalledBinary',
|
||||
'name': 'wget',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256,
|
||||
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
|
||||
}))
|
||||
sys.exit(0)
|
||||
else:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'wget',
|
||||
'bin_providers': 'apt,brew,env',
|
||||
}))
|
||||
print("Failed to install wget", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
except Exception as e:
|
||||
print(json.dumps({
|
||||
'type': 'Dependency',
|
||||
'bin_name': 'wget',
|
||||
'bin_providers': 'apt,brew,env',
|
||||
}))
|
||||
print(f"Error installing wget: {e}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
6
archivebox/plugins/wget/templates/embed.html
Normal file
6
archivebox/plugins/wget/templates/embed.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- Wget embed - full iframe of mirrored site -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-embed wget-embed"
|
||||
style="width: 100%; height: 100%; min-height: 500px; border: none;"
|
||||
sandbox="allow-same-origin allow-scripts allow-forms">
|
||||
</iframe>
|
||||
6
archivebox/plugins/wget/templates/fullscreen.html
Normal file
6
archivebox/plugins/wget/templates/fullscreen.html
Normal file
@@ -0,0 +1,6 @@
|
||||
<!-- Wget fullscreen - full page iframe of mirrored site -->
|
||||
<iframe src="{{ output_path }}"
|
||||
class="extractor-fullscreen wget-fullscreen"
|
||||
style="width: 100%; height: 100vh; border: none;"
|
||||
sandbox="allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation">
|
||||
</iframe>
|
||||
1
archivebox/plugins/wget/templates/icon.html
Normal file
1
archivebox/plugins/wget/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
📥
|
||||
8
archivebox/plugins/wget/templates/thumbnail.html
Normal file
8
archivebox/plugins/wget/templates/thumbnail.html
Normal file
@@ -0,0 +1,8 @@
|
||||
<!-- Wget thumbnail - scaled down iframe preview of mirrored site -->
|
||||
<div class="extractor-thumbnail wget-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fff;">
|
||||
<iframe src="{{ output_path }}"
|
||||
style="width: 400%; height: 400px; transform: scale(0.25); transform-origin: top left; pointer-events: none; border: none;"
|
||||
loading="lazy"
|
||||
sandbox="allow-same-origin">
|
||||
</iframe>
|
||||
</div>
|
||||
@@ -2,8 +2,8 @@
|
||||
Integration tests for wget plugin
|
||||
|
||||
Tests verify:
|
||||
1. Plugin reports missing dependency correctly
|
||||
2. wget can be installed via brew/apt provider hooks
|
||||
1. Validate hook checks for wget binary
|
||||
2. Verify deps with abx-pkg
|
||||
3. Config options work (SAVE_WGET, SAVE_WARC, etc.)
|
||||
4. Extraction works against real example.com
|
||||
5. Output files contain actual page content
|
||||
@@ -26,7 +26,7 @@ import pytest
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
WGET_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_wget.py'))
|
||||
WGET_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_wget.py'
|
||||
WGET_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_wget.py'
|
||||
BREW_HOOK = PLUGINS_ROOT / 'brew' / 'on_Dependency__install_using_brew_provider.py'
|
||||
APT_HOOK = PLUGINS_ROOT / 'apt' / 'on_Dependency__install_using_apt_provider.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
@@ -37,45 +37,59 @@ def test_hook_script_exists():
|
||||
assert WGET_HOOK.exists(), f"Hook script not found: {WGET_HOOK}"
|
||||
|
||||
|
||||
def test_wget_install_hook():
|
||||
"""Test wget install hook to install wget if needed."""
|
||||
def test_wget_validate_hook():
|
||||
"""Test wget validate hook checks for wget binary."""
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(WGET_INSTALL_HOOK)],
|
||||
[sys.executable, str(WGET_VALIDATE_HOOK)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=600
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
|
||||
|
||||
# Verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'wget'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert found_binary, "Should output InstalledBinary record"
|
||||
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
|
||||
if result.returncode == 0:
|
||||
# Binary found - verify InstalledBinary JSONL output
|
||||
found_binary = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'InstalledBinary':
|
||||
assert record['name'] == 'wget'
|
||||
assert record['abspath']
|
||||
found_binary = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_binary, "Should output InstalledBinary record when binary found"
|
||||
else:
|
||||
# Binary not found - verify Dependency JSONL output
|
||||
found_dependency = False
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if line.strip():
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Dependency':
|
||||
assert record['bin_name'] == 'wget'
|
||||
assert 'env' in record['bin_providers']
|
||||
found_dependency = True
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
assert found_dependency, "Should output Dependency record when binary not found"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify wget is available via abx-pkg after hook installation."""
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
|
||||
|
||||
AptProvider.model_rebuild()
|
||||
BrewProvider.model_rebuild()
|
||||
EnvProvider.model_rebuild()
|
||||
"""Verify wget is available via abx-pkg."""
|
||||
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
|
||||
|
||||
wget_binary = Binary(name='wget', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
|
||||
wget_loaded = wget_binary.load()
|
||||
assert wget_loaded and wget_loaded.abspath, "wget should be available after install hook"
|
||||
|
||||
if wget_loaded and wget_loaded.abspath:
|
||||
assert True, "wget is available"
|
||||
else:
|
||||
pytest.skip("wget not available - Dependency record should have been emitted")
|
||||
|
||||
|
||||
def test_reports_missing_dependency_when_not_installed():
|
||||
|
||||
Reference in New Issue
Block a user