remove huey

This commit is contained in:
Nick Sweeting
2025-12-24 23:40:18 -08:00
parent 6c769d831c
commit d95f0dc186
105 changed files with 3635 additions and 1402 deletions

View File

@@ -0,0 +1 @@
🏛️

View File

@@ -7,7 +7,7 @@ new plugin-based output structure to the legacy canonical output paths that
ArchiveBox has historically used. This maintains backward compatibility with
existing tools and scripts that expect outputs at specific locations.
Canonical output paths (from Snapshot.canonical_outputs()):
Canonical output paths:
- favicon.ico → favicon/favicon.ico
- singlefile.html → singlefile/singlefile.html
- readability/content.html → readability/content.html
@@ -27,27 +27,20 @@ New plugin outputs:
- redirects.json → redirects/redirects.json
- console.jsonl → consolelog/console.jsonl
Usage: on_Snapshot__91_canonical_outputs.py --url=<url> --snapshot-id=<uuid>
Usage: on_Snapshot__92_canonical_outputs.py --url=<url> --snapshot-id=<uuid>
Environment variables:
SAVE_CANONICAL_SYMLINKS: Enable canonical symlinks (default: true)
DATA_DIR: ArchiveBox data directory
ARCHIVE_DIR: Archive output directory
"""
__package__ = 'archivebox.plugins.canonical_outputs'
import os
import sys
import json
from pathlib import Path
from typing import Dict, Optional
# Configure Django if running standalone
if __name__ == '__main__':
parent_dir = str(Path(__file__).resolve().parent.parent.parent)
if parent_dir not in sys.path:
sys.path.insert(0, parent_dir)
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
import django
django.setup()
from datetime import datetime, timezone
from typing import Dict
import rich_click as click
@@ -150,10 +143,7 @@ def create_canonical_symlinks(snapshot_dir: Path) -> Dict[str, bool]:
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
"""Create symlinks from plugin outputs to canonical legacy locations."""
from datetime import datetime
from archivebox.core.models import Snapshot
start_ts = datetime.now()
start_ts = datetime.now(timezone.utc)
status = 'failed'
output = None
error = ''
@@ -161,31 +151,20 @@ def main(url: str, snapshot_id: str):
try:
# Check if enabled
from archivebox.config import CONSTANTS
save_canonical = os.getenv('SAVE_CANONICAL_SYMLINKS', 'true').lower() in ('true', '1', 'yes', 'on')
if not save_canonical:
click.echo('Skipping canonical symlinks (SAVE_CANONICAL_SYMLINKS=False)')
status = 'skipped'
end_ts = datetime.now()
click.echo(f'START_TS={start_ts.isoformat()}')
click.echo(f'END_TS={end_ts.isoformat()}')
click.echo(f'STATUS={status}')
click.echo(f'RESULT_JSON={{"extractor": "canonical_outputs", "status": "{status}", "url": "{url}", "snapshot_id": "{snapshot_id}"}}')
click.echo(json.dumps({'status': status, 'output': 'SAVE_CANONICAL_SYMLINKS=false'}))
sys.exit(0)
# Get snapshot
try:
snapshot = Snapshot.objects.get(id=snapshot_id)
except Snapshot.DoesNotExist:
error = f'Snapshot {snapshot_id} not found'
raise ValueError(error)
# Working directory is the extractor output dir (e.g., <snapshot>/canonical_outputs/)
# Parent is the snapshot directory
output_dir = Path.cwd()
snapshot_dir = output_dir.parent
# Get snapshot directory
snapshot_dir = Path(snapshot.output_dir)
if not snapshot_dir.exists():
error = f'Snapshot directory not found: {snapshot_dir}'
raise FileNotFoundError(error)
raise FileNotFoundError(f'Snapshot directory not found: {snapshot_dir}')
# Create canonical symlinks
results = create_canonical_symlinks(snapshot_dir)
@@ -203,37 +182,18 @@ def main(url: str, snapshot_id: str):
status = 'failed'
click.echo(f'Error: {error}', err=True)
end_ts = datetime.now()
duration = (end_ts - start_ts).total_seconds()
end_ts = datetime.now(timezone.utc)
# Print results
click.echo(f'START_TS={start_ts.isoformat()}')
click.echo(f'END_TS={end_ts.isoformat()}')
click.echo(f'DURATION={duration:.2f}')
if output:
click.echo(f'OUTPUT={output}')
click.echo(f'STATUS={status}')
if error:
click.echo(f'ERROR={error}', err=True)
# Print JSON result
import json
result_json = {
'extractor': 'canonical_outputs',
'url': url,
'snapshot_id': snapshot_id,
# Print JSON result for hook runner
result = {
'status': status,
'start_ts': start_ts.isoformat(),
'end_ts': end_ts.isoformat(),
'duration': round(duration, 2),
'output': output,
'symlinks_created': symlinks_created,
'error': error or None,
'symlinks_created': symlinks_created,
}
click.echo(f'RESULT_JSON={json.dumps(result_json)}')
click.echo(json.dumps(result))
sys.exit(0 if status == 'succeeded' else 1)
sys.exit(0 if status in ('succeeded', 'skipped') else 1)
if __name__ == '__main__':

View File

@@ -1,149 +0,0 @@
#!/usr/bin/env python3
"""
Install Chrome/Chromium if not already available.
Runs at crawl start to ensure Chrome is installed.
Uses playwright to install chromium if no system Chrome found.
Outputs JSONL for InstalledBinary.
"""
import json
import sys
import os
import shutil
from pathlib import Path
def find_chrome():
    """Locate a system Chrome/Chromium binary.

    Resolution order:
      1. The CHROME_BINARY environment variable, if it points at a file.
      2. PATH lookup over known Chrome names, then Chromium names.
      3. Well-known install locations (macOS app bundles, then Linux paths).

    Returns the absolute path as a string, or None if no browser is found.
    """
    # Explicit override always wins.
    override = os.environ.get('CHROME_BINARY', '')
    if override and Path(override).is_file():
        return override

    # PATH lookup: Chrome variants first, then Chromium variants (order matters).
    candidate_names = (
        'google-chrome',
        'google-chrome-stable',
        'google-chrome-beta',
        'google-chrome-canary',
        'google-chrome-unstable',
        'google-chrome-dev',
        'chrome',
        'chromium',
        'chromium-browser',
        'chromium-browser-beta',
        'chromium-browser-unstable',
        'chromium-browser-canary',
        'chromium-browser-dev',
    )
    for name in candidate_names:
        resolved = shutil.which(name)
        if resolved:
            return resolved

    # Well-known absolute install locations (macOS bundles, then Linux).
    candidate_paths = (
        '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
        '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
        '/Applications/Chromium.app/Contents/MacOS/Chromium',
        '/usr/bin/google-chrome',
        '/usr/bin/google-chrome-stable',
        '/usr/bin/chromium',
        '/usr/bin/chromium-browser',
        '/snap/bin/chromium',
        '/opt/google/chrome/chrome',
    )
    for path in candidate_paths:
        if Path(path).is_file():
            return path

    return None
def main():
    """Emit an InstalledBinary JSONL record for Chrome/Chromium.

    Prefers a browser already present on the machine; otherwise attempts to
    install chromium via a system package manager using abx-pkg. On failure,
    prints a Dependency record and a message to stderr, then exits 1.
    """
    try:
        # Fast path: a browser is already installed on this machine.
        system_chrome = find_chrome()
        if system_chrome:
            record = {
                'type': 'InstalledBinary',
                'name': 'chrome',
                'abspath': str(system_chrome),
                'version': None,
                'sha256': None,
                'binprovider': 'env',
            }
            print(json.dumps(record))
            sys.exit(0)

        # Slow path: try installing chromium via apt/brew through abx-pkg.
        from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
        AptProvider.model_rebuild()
        BrewProvider.model_rebuild()
        EnvProvider.model_rebuild()

        # Try each known package/binary name until one loads or installs.
        for candidate_name in ('chromium', 'chromium-browser', 'google-chrome'):
            try:
                candidate = Binary(
                    name=candidate_name,
                    binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
                )
                # Load if present, otherwise fall back to installing it.
                try:
                    loaded = candidate.load()
                    if not loaded or not loaded.abspath:
                        raise Exception("Not loaded")
                except Exception:
                    loaded = candidate.install()

                if loaded and loaded.abspath:
                    # Report the resolved binary as JSONL for the hook runner.
                    provider_name = loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown'
                    print(json.dumps({
                        'type': 'InstalledBinary',
                        'name': 'chrome',
                        'abspath': str(loaded.abspath),
                        'version': str(loaded.version) if loaded.version else None,
                        'sha256': loaded.sha256,
                        'binprovider': provider_name,
                    }))
                    sys.exit(0)
            except Exception:
                continue

        # Every candidate failed to load or install.
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'chrome',
            'bin_providers': 'apt,brew,env',
        }))
        print("Failed to install Chrome/Chromium", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'chrome',
            'bin_providers': 'apt,brew,env',
        }))
        print(f"Error installing Chrome: {e}", file=sys.stderr)
        sys.exit(1)
if __name__ == '__main__':
main()

View File

@@ -2,7 +2,7 @@
Integration tests for chrome_session plugin
Tests verify:
1. Install hook finds system Chrome or installs chromium
1. Validate hook checks for Chrome/Chromium binary
2. Verify deps with abx-pkg
3. Chrome session script exists
"""
@@ -14,7 +14,7 @@ from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
CHROME_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_chrome.py'
CHROME_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_chrome.py'
CHROME_SESSION_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_session.js'
@@ -23,37 +23,50 @@ def test_hook_script_exists():
assert CHROME_SESSION_HOOK.exists(), f"Hook not found: {CHROME_SESSION_HOOK}"
def test_chrome_install_hook():
"""Test chrome install hook to find or install Chrome/Chromium."""
def test_chrome_validate_hook():
"""Test chrome validate hook checks for Chrome/Chromium binary."""
result = subprocess.run(
[sys.executable, str(CHROME_INSTALL_HOOK)],
[sys.executable, str(CHROME_VALIDATE_HOOK)],
capture_output=True,
text=True,
timeout=600
timeout=30
)
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
# Verify InstalledBinary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary':
assert record['name'] == 'chrome'
assert record['abspath']
assert Path(record['abspath']).exists(), f"Chrome binary should exist at {record['abspath']}"
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output InstalledBinary record"
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
if result.returncode == 0:
# Binary found - verify InstalledBinary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary':
assert record['name'] == 'chrome'
assert record['abspath']
assert Path(record['abspath']).exists(), f"Chrome binary should exist at {record['abspath']}"
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output InstalledBinary record when binary found"
else:
# Binary not found - verify Dependency JSONL output
found_dependency = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
assert record['bin_name'] == 'chrome'
found_dependency = True
break
except json.JSONDecodeError:
pass
assert found_dependency, "Should output Dependency record when binary not found"
def test_verify_deps_with_abx_pkg():
"""Verify chrome is available via abx-pkg after hook installation."""
"""Verify chrome is available via abx-pkg."""
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
AptProvider.model_rebuild()
@@ -75,10 +88,10 @@ def test_verify_deps_with_abx_pkg():
except Exception:
continue
# If we get here, chrome should still be available from system
# If we get here, chrome not available
import shutil
assert shutil.which('chromium') or shutil.which('chrome') or shutil.which('google-chrome'), \
"Chrome should be available after install hook"
if not (shutil.which('chromium') or shutil.which('chrome') or shutil.which('google-chrome')):
pytest.skip("Chrome/Chromium not available - Dependency record should have been emitted")
if __name__ == '__main__':

View File

@@ -0,0 +1,6 @@
<!-- DOM embed - full iframe of captured DOM HTML -->
<iframe src="{{ output_path }}"
class="extractor-embed dom-embed"
style="width: 100%; height: 100%; min-height: 500px; border: none;"
sandbox="allow-same-origin allow-scripts allow-forms">
</iframe>

View File

@@ -0,0 +1,6 @@
<!-- DOM fullscreen - full page iframe -->
<iframe src="{{ output_path }}"
class="extractor-fullscreen dom-fullscreen"
style="width: 100%; height: 100vh; border: none;"
sandbox="allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation">
</iframe>

View File

@@ -0,0 +1 @@
🌐

View File

@@ -0,0 +1,8 @@
<!-- DOM thumbnail - scaled down iframe preview of captured DOM HTML -->
<div class="extractor-thumbnail dom-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fff;">
<iframe src="{{ output_path }}"
style="width: 400%; height: 400px; transform: scale(0.25); transform-origin: top left; pointer-events: none; border: none;"
loading="lazy"
sandbox="allow-same-origin">
</iframe>
</div>

View File

@@ -0,0 +1 @@

View File

@@ -1,68 +0,0 @@
#!/usr/bin/env python3
"""
Install git if not already available.
Runs at crawl start to ensure git is installed.
Outputs JSONL for InstalledBinary.
"""
import json
import sys
from pathlib import Path
def main():
    """Ensure git is installed, emitting JSONL records for the hook runner.

    Loads git via abx-pkg (apt/brew/env providers), installing it through the
    system package manager if missing. Prints an InstalledBinary record and
    exits 0 on success; prints a Dependency record and exits 1 otherwise.
    """
    try:
        from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
        AptProvider.model_rebuild()
        BrewProvider.model_rebuild()
        EnvProvider.model_rebuild()

        # The git binary and its package share the same name on all providers.
        binary = Binary(
            name='git',
            binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
        )

        # Load if already present; otherwise install via a package manager.
        try:
            loaded = binary.load()
            if not loaded or not loaded.abspath:
                raise Exception("Not loaded")
        except Exception:
            loaded = binary.install()

        if loaded and loaded.abspath:
            # Report the resolved binary as JSONL.
            provider_name = loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown'
            print(json.dumps({
                'type': 'InstalledBinary',
                'name': 'git',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256,
                'binprovider': provider_name,
            }))
            sys.exit(0)

        # Load and install both failed: report the missing dependency.
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'git',
            'bin_providers': 'apt,brew,env',
        }))
        print("Failed to install git", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'git',
            'bin_providers': 'apt,brew,env',
        }))
        print(f"Error installing git: {e}", file=sys.stderr)
        sys.exit(1)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,6 @@
<!-- Git embed - directory listing of cloned repo -->
<iframe src="{{ output_path }}"
class="extractor-embed git-embed"
style="width: 100%; height: 100%; min-height: 400px; border: none; background: #fff;"
sandbox="allow-same-origin">
</iframe>

View File

@@ -0,0 +1,6 @@
<!-- Git fullscreen - full directory listing -->
<iframe src="{{ output_path }}"
class="extractor-fullscreen git-fullscreen"
style="width: 100%; height: 100vh; border: none; background: #fff;"
sandbox="allow-same-origin">
</iframe>

View File

@@ -0,0 +1 @@
📂

View File

@@ -0,0 +1,5 @@
<!-- Git thumbnail - shows git repository icon and info -->
<div class="extractor-thumbnail git-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #f6f8fa; display: flex; flex-direction: column; align-items: center; justify-content: center; padding: 10px;">
<span style="font-size: 32px;">📂</span>
<span style="font-size: 11px; color: #586069; margin-top: 4px;">Git Repository</span>
</div>

View File

@@ -2,7 +2,7 @@
Integration tests for git plugin
Tests verify:
1. Install hook installs git via abx-pkg
1. Validate hook checks for git binary
2. Verify deps with abx-pkg
3. Standalone git extractor execution
"""
@@ -17,50 +17,64 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
GIT_HOOK = PLUGIN_DIR / 'on_Snapshot__12_git.py'
GIT_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_git.py'
GIT_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_git.py'
TEST_URL = 'https://github.com/example/repo.git'
def test_hook_script_exists():
assert GIT_HOOK.exists()
def test_git_install_hook():
"""Test git install hook to install git if needed."""
def test_git_validate_hook():
"""Test git validate hook checks for git binary."""
result = subprocess.run(
[sys.executable, str(GIT_INSTALL_HOOK)],
[sys.executable, str(GIT_VALIDATE_HOOK)],
capture_output=True,
text=True,
timeout=600
timeout=30
)
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
# Verify InstalledBinary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary':
assert record['name'] == 'git'
assert record['abspath']
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output InstalledBinary record"
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
if result.returncode == 0:
# Binary found - verify InstalledBinary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary':
assert record['name'] == 'git'
assert record['abspath']
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output InstalledBinary record when binary found"
else:
# Binary not found - verify Dependency JSONL output
found_dependency = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
assert record['bin_name'] == 'git'
assert 'env' in record['bin_providers']
found_dependency = True
break
except json.JSONDecodeError:
pass
assert found_dependency, "Should output Dependency record when binary not found"
def test_verify_deps_with_abx_pkg():
"""Verify git is available via abx-pkg after hook installation."""
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
AptProvider.model_rebuild()
BrewProvider.model_rebuild()
EnvProvider.model_rebuild()
"""Verify git is available via abx-pkg."""
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
git_binary = Binary(name='git', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
git_loaded = git_binary.load()
assert git_loaded and git_loaded.abspath, "git should be available after install hook"
if git_loaded and git_loaded.abspath:
assert True, "git is available"
else:
pytest.skip("git not available - Dependency record should have been emitted")
def test_reports_missing_git():
with tempfile.TemporaryDirectory() as tmpdir:

View File

@@ -0,0 +1 @@
📋

View File

@@ -0,0 +1 @@
📃

View File

@@ -1,67 +0,0 @@
#!/usr/bin/env python3
"""
Install yt-dlp if not already available.
Runs at crawl start to ensure yt-dlp is installed.
Outputs JSONL for InstalledBinary.
"""
import json
import sys
from pathlib import Path
def main():
    """Ensure yt-dlp is installed, emitting JSONL records for the hook runner.

    Loads yt-dlp via abx-pkg (pip/env providers), installing it with pip if
    missing. Prints an InstalledBinary record and exits 0 on success; prints
    a Dependency record and exits 1 otherwise.
    """
    try:
        from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
        PipProvider.model_rebuild()
        EnvProvider.model_rebuild()

        # yt-dlp binary and pip package have the same name
        ytdlp_binary = Binary(
            name='yt-dlp',
            binproviders=[PipProvider(), EnvProvider()]
        )

        # Try to load, install if not found
        try:
            loaded = ytdlp_binary.load()
            if not loaded or not loaded.abspath:
                raise Exception("Not loaded")
        except Exception:
            # Install via pip
            loaded = ytdlp_binary.install()

        if loaded and loaded.abspath:
            # Output InstalledBinary JSONL
            print(json.dumps({
                'type': 'InstalledBinary',
                'name': 'yt-dlp',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256,
                'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
            }))
            sys.exit(0)
        else:
            # BUGFIX: only Pip and Env providers are configured above, so the
            # Dependency record must say 'pip,env' (previously 'pip,brew,env',
            # which disagreed with the validate hook's record).
            print(json.dumps({
                'type': 'Dependency',
                'bin_name': 'yt-dlp',
                'bin_providers': 'pip,env',
            }))
            print("Failed to install yt-dlp", file=sys.stderr)
            sys.exit(1)
    except Exception as e:
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'yt-dlp',
            'bin_providers': 'pip,env',
        }))
        print(f"Error installing yt-dlp: {e}", file=sys.stderr)
        sys.exit(1)
main()

View File

@@ -0,0 +1,278 @@
#!/usr/bin/env python3
"""
Validation hook for yt-dlp and its dependencies (node, ffmpeg).
Runs at crawl start to verify yt-dlp and required binaries are available.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str, version_flag: str = '--version') -> str | None:
"""Get version string from binary."""
try:
result = subprocess.run(
[abspath, version_flag],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
first_line = result.stdout.strip().split('\n')[0]
return first_line[:64]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_ytdlp() -> dict | None:
    """Find the yt-dlp binary.

    Tries abx-pkg first (pip/env providers), then falls back to a PATH
    lookup / the YTDLP_BINARY env var. Returns a dict with keys
    name/abspath/version/sha256/binprovider, or None if not found.
    """
    try:
        from abx_pkg import Binary, PipProvider, EnvProvider

        class YtdlpBinary(Binary):
            name: str = 'yt-dlp'
            binproviders_supported = [PipProvider(), EnvProvider()]

        loaded = YtdlpBinary().load()
        if loaded and loaded.abspath:
            # BUGFIX: the loaded provider lives on `loaded_binprovider` (as
            # used by the other install/validate hooks); the old
            # `loaded.binprovider` raised AttributeError, which the blanket
            # except below swallowed, silently forcing the PATH fallback.
            provider = getattr(loaded, 'loaded_binprovider', None)
            return {
                'name': 'yt-dlp',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': getattr(loaded, 'sha256', None),
                'binprovider': provider.name if provider else 'env',
            }
    except ImportError:
        pass
    except Exception:
        pass

    # Fallback: PATH lookup, or an explicit YTDLP_BINARY override.
    abspath = shutil.which('yt-dlp') or os.environ.get('YTDLP_BINARY', '')
    if abspath and Path(abspath).is_file():
        return {
            'name': 'yt-dlp',
            'abspath': abspath,
            'version': get_binary_version(abspath),
            'sha256': get_binary_hash(abspath),
            'binprovider': 'env',
        }
    return None
def find_node() -> dict | None:
    """Find the node binary.

    Tries abx-pkg first (apt/brew/env providers; the apt package is
    'nodejs'), then falls back to a PATH lookup / the NODE_BINARY env var.
    Returns a dict with keys name/abspath/version/sha256/binprovider, or
    None if not found.
    """
    try:
        from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider

        class NodeBinary(Binary):
            name: str = 'node'
            binproviders_supported = [AptProvider(), BrewProvider(), EnvProvider()]
            overrides: dict = {'apt': {'packages': ['nodejs']}}

        loaded = NodeBinary().load()
        if loaded and loaded.abspath:
            # BUGFIX: use `loaded_binprovider` (the attribute name used by the
            # other hooks); `loaded.binprovider` raised AttributeError and was
            # silently swallowed, always forcing the PATH fallback.
            provider = getattr(loaded, 'loaded_binprovider', None)
            return {
                'name': 'node',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': getattr(loaded, 'sha256', None),
                'binprovider': provider.name if provider else 'env',
            }
    except ImportError:
        pass
    except Exception:
        pass

    # Fallback: PATH lookup, or an explicit NODE_BINARY override.
    abspath = shutil.which('node') or os.environ.get('NODE_BINARY', '')
    if abspath and Path(abspath).is_file():
        return {
            'name': 'node',
            'abspath': abspath,
            'version': get_binary_version(abspath),
            'sha256': get_binary_hash(abspath),
            'binprovider': 'env',
        }
    return None
def find_ffmpeg() -> dict | None:
    """Find the ffmpeg binary.

    Tries abx-pkg first (apt/brew/env providers), then falls back to a PATH
    lookup / the FFMPEG_BINARY env var. Returns a dict with keys
    name/abspath/version/sha256/binprovider, or None if not found.
    """
    try:
        from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider

        class FfmpegBinary(Binary):
            name: str = 'ffmpeg'
            binproviders_supported = [AptProvider(), BrewProvider(), EnvProvider()]

        loaded = FfmpegBinary().load()
        if loaded and loaded.abspath:
            # BUGFIX: use `loaded_binprovider` (the attribute name used by the
            # other hooks); `loaded.binprovider` raised AttributeError and was
            # silently swallowed, always forcing the PATH fallback.
            provider = getattr(loaded, 'loaded_binprovider', None)
            return {
                'name': 'ffmpeg',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': getattr(loaded, 'sha256', None),
                'binprovider': provider.name if provider else 'env',
            }
    except ImportError:
        pass
    except Exception:
        pass

    # Fallback: PATH lookup, or an explicit FFMPEG_BINARY override.
    abspath = shutil.which('ffmpeg') or os.environ.get('FFMPEG_BINARY', '')
    if abspath and Path(abspath).is_file():
        return {
            'name': 'ffmpeg',
            'abspath': abspath,
            'version': get_binary_version(abspath),
            'sha256': get_binary_hash(abspath),
            'binprovider': 'env',
        }
    return None
def main():
    """Validate yt-dlp and its runtime dependencies (node, ffmpeg).

    For each binary: emits an InstalledBinary record plus Machine config
    updates (config/<PREFIX>_BINARY and, when known, config/<PREFIX>_VERSION)
    when found, or a Dependency record when missing. Exits 0 when all three
    binaries are present, 1 otherwise (listing the missing ones on stderr).
    """

    def _emit(result: dict | None, bin_name: str, bin_providers: str, config_prefix: str) -> bool:
        """Print JSONL records for one binary; return True if it was found."""
        if result and result.get('abspath'):
            print(json.dumps({
                'type': 'InstalledBinary',
                'name': result['name'],
                'abspath': result['abspath'],
                'version': result['version'],
                'sha256': result['sha256'],
                'binprovider': result['binprovider'],
            }))
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': f'config/{config_prefix}_BINARY',
                'value': result['abspath'],
            }))
            if result['version']:
                print(json.dumps({
                    'type': 'Machine',
                    '_method': 'update',
                    'key': f'config/{config_prefix}_VERSION',
                    'value': result['version'],
                }))
            return True
        # Binary missing: tell the hook runner which providers could supply it.
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': bin_name,
            'bin_providers': bin_providers,
        }))
        return False

    # (finder result, bin name, providers for the Dependency record, config key prefix)
    checks = [
        (find_ytdlp(), 'yt-dlp', 'pip,env', 'YTDLP'),      # required extractor
        (find_node(), 'node', 'apt,brew,env', 'NODE'),     # required for JS extraction
        (find_ffmpeg(), 'ffmpeg', 'apt,brew,env', 'FFMPEG'),  # required for video conversion
    ]

    missing_deps = [
        bin_name
        for result, bin_name, providers, prefix in checks
        if not _emit(result, bin_name, providers, prefix)
    ]

    if missing_deps:
        print(f"Missing dependencies: {', '.join(missing_deps)}", file=sys.stderr)
        sys.exit(1)
    else:
        sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,9 @@
<!-- Media embed - video/audio player -->
<div class="extractor-embed media-embed" style="width: 100%; height: 100%; min-height: 400px; background: #1a1a1a; display: flex; align-items: center; justify-content: center;">
<video src="{{ output_path }}"
style="max-width: 100%; max-height: 100%;"
controls
preload="metadata">
Your browser does not support the video tag.
</video>
</div>

View File

@@ -0,0 +1,10 @@
<!-- Media fullscreen - full video/audio player -->
<div class="extractor-fullscreen media-fullscreen" style="width: 100%; height: 100vh; background: #000; display: flex; align-items: center; justify-content: center;">
<video src="{{ output_path }}"
style="max-width: 100%; max-height: 100%;"
controls
autoplay
preload="auto">
Your browser does not support the video tag.
</video>
</div>

View File

@@ -0,0 +1 @@
🎬

View File

@@ -0,0 +1,14 @@
<!-- Media thumbnail - shows video/audio player or placeholder -->
<div class="extractor-thumbnail media-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #1a1a1a; display: flex; align-items: center; justify-content: center;">
<video src="{{ output_path }}"
style="width: 100%; height: 100px; object-fit: contain;"
poster=""
preload="metadata"
muted
onerror="this.style.display='none'; this.nextElementSibling.style.display='flex';">
</video>
<div style="display: none; flex-direction: column; align-items: center; color: #888; font-size: 12px;">
<span style="font-size: 32px;">🎬</span>
<span>Media</span>
</div>
</div>

View File

@@ -21,7 +21,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
MEDIA_HOOK = PLUGIN_DIR / 'on_Snapshot__51_media.py'
MEDIA_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_ytdlp.py'
MEDIA_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_ytdlp.py'
TEST_URL = 'https://example.com/video.mp4'
def test_hook_script_exists():
@@ -29,46 +29,72 @@ def test_hook_script_exists():
assert MEDIA_HOOK.exists(), f"Hook not found: {MEDIA_HOOK}"
def test_ytdlp_install_hook():
"""Test yt-dlp install hook to install yt-dlp if needed."""
# Run yt-dlp install hook
def test_ytdlp_validate_hook():
"""Test yt-dlp validate hook checks for yt-dlp and dependencies (node, ffmpeg)."""
# Run yt-dlp validate hook
result = subprocess.run(
[sys.executable, str(MEDIA_INSTALL_HOOK)],
[sys.executable, str(MEDIA_VALIDATE_HOOK)],
capture_output=True,
text=True,
timeout=600
timeout=30
)
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
# Hook exits 0 if all binaries found, 1 if any not found
# Parse output for InstalledBinary and Dependency records
found_binaries = {'node': False, 'ffmpeg': False, 'yt-dlp': False}
found_dependencies = {'node': False, 'ffmpeg': False, 'yt-dlp': False}
# Verify InstalledBinary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary':
assert record['name'] == 'yt-dlp'
assert record['abspath']
found_binary = True
break
name = record['name']
if name in found_binaries:
assert record['abspath'], f"{name} should have abspath"
found_binaries[name] = True
elif record.get('type') == 'Dependency':
name = record['bin_name']
if name in found_dependencies:
found_dependencies[name] = True
except json.JSONDecodeError:
pass
assert found_binary, "Should output InstalledBinary record"
# Each binary should either be found (InstalledBinary) or missing (Dependency)
for binary_name in ['yt-dlp', 'node', 'ffmpeg']:
assert found_binaries[binary_name] or found_dependencies[binary_name], \
f"{binary_name} should have either InstalledBinary or Dependency record"
def test_verify_deps_with_abx_pkg():
"""Verify yt-dlp is available via abx-pkg after hook installation."""
from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides
"""Verify yt-dlp, node, and ffmpeg are available via abx-pkg."""
from abx_pkg import Binary, PipProvider, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
PipProvider.model_rebuild()
EnvProvider.model_rebuild()
missing_binaries = []
# Verify yt-dlp is available
ytdlp_binary = Binary(name='yt-dlp', binproviders=[PipProvider(), EnvProvider()])
ytdlp_loaded = ytdlp_binary.load()
assert ytdlp_loaded and ytdlp_loaded.abspath, "yt-dlp should be available after install hook"
if not (ytdlp_loaded and ytdlp_loaded.abspath):
missing_binaries.append('yt-dlp')
# Verify node is available (yt-dlp needs it for JS extraction)
node_binary = Binary(
name='node',
binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
)
node_loaded = node_binary.load()
if not (node_loaded and node_loaded.abspath):
missing_binaries.append('node')
# Verify ffmpeg is available (yt-dlp needs it for video conversion)
ffmpeg_binary = Binary(name='ffmpeg', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
ffmpeg_loaded = ffmpeg_binary.load()
if not (ffmpeg_loaded and ffmpeg_loaded.abspath):
missing_binaries.append('ffmpeg')
if missing_binaries:
pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
def test_handles_non_media_url():
"""Test that media extractor handles non-media URLs gracefully via hook."""

View File

@@ -1,68 +0,0 @@
#!/usr/bin/env python3
"""
Install mercury-parser if not already available.
Runs at crawl start to ensure mercury-parser is installed.
Outputs JSONL for InstalledBinary.
"""
import json
import sys
from pathlib import Path
def main():
    """Ensure mercury-parser is installed, emitting JSONL records.

    Loads mercury-parser via abx-pkg (npm/env providers), installing the
    @postlight/mercury-parser npm package if missing. Prints an
    InstalledBinary record and exits 0 on success; prints a Dependency
    record and exits 1 otherwise.
    """
    try:
        from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
        NpmProvider.model_rebuild()
        EnvProvider.model_rebuild()

        # The executable is `mercury-parser`, but the npm package that ships
        # it is `@postlight/mercury-parser`, hence the override.
        binary = Binary(
            name='mercury-parser',
            binproviders=[NpmProvider(), EnvProvider()],
            overrides={'npm': {'packages': ['@postlight/mercury-parser']}}
        )

        # Load if already present; otherwise install via npm.
        try:
            loaded = binary.load()
            if not loaded or not loaded.abspath:
                raise Exception("Not loaded")
        except Exception:
            loaded = binary.install()

        if loaded and loaded.abspath:
            # Report the resolved binary as JSONL.
            provider_name = loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown'
            print(json.dumps({
                'type': 'InstalledBinary',
                'name': 'mercury-parser',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256,
                'binprovider': provider_name,
            }))
            sys.exit(0)

        # Load and install both failed: report the missing dependency.
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'mercury-parser',
            'bin_providers': 'npm,env',
        }))
        print("Failed to install mercury-parser", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'mercury-parser',
            'bin_providers': 'npm,env',
        }))
        print(f"Error installing mercury-parser: {e}", file=sys.stderr)
        sys.exit(1)
main()

View File

@@ -0,0 +1,123 @@
#!/usr/bin/env python3
"""
Validation hook for postlight-parser binary.
Runs at crawl start to verify postlight-parser is available.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str) -> str | None:
"""Get version string from binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
first_line = result.stdout.strip().split('\n')[0]
return first_line[:64]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_mercury() -> dict | None:
"""Find postlight-parser binary."""
try:
from abx_pkg import Binary, NpmProvider, EnvProvider
class MercuryBinary(Binary):
name: str = 'postlight-parser'
binproviders_supported = [NpmProvider(), EnvProvider()]
overrides: dict = {'npm': {'packages': ['@postlight/parser']}}
binary = MercuryBinary()
loaded = binary.load()
if loaded and loaded.abspath:
return {
'name': 'postlight-parser',
'abspath': str(loaded.abspath),
'version': str(loaded.version) if loaded.version else None,
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
}
except ImportError:
pass
except Exception:
pass
# Fallback to shutil.which
abspath = shutil.which('postlight-parser') or os.environ.get('MERCURY_BINARY', '')
if abspath and Path(abspath).is_file():
return {
'name': 'postlight-parser',
'abspath': abspath,
'version': get_binary_version(abspath),
'sha256': get_binary_hash(abspath),
'binprovider': 'env',
}
return None
def main():
    """Emit JSONL records describing postlight-parser availability.

    Prints an InstalledBinary record plus Machine config updates and exits 0
    when the binary is found; prints a Dependency record and exits 1 when it
    is missing.
    """
    result = find_mercury()
    if not (result and result.get('abspath')):
        # Binary missing: emit a Dependency record so the orchestrator knows
        # what still needs to be installed, then signal failure.
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'postlight-parser',
            'bin_providers': 'npm,env',
        }))
        print("postlight-parser binary not found", file=sys.stderr)
        sys.exit(1)

    print(json.dumps({
        'type': 'InstalledBinary',
        'name': result['name'],
        'abspath': result['abspath'],
        'version': result['version'],
        'sha256': result['sha256'],
        'binprovider': result['binprovider'],
    }))
    # Persist the discovered path (and version, when known) into Machine config.
    print(json.dumps({
        'type': 'Machine',
        '_method': 'update',
        'key': 'config/MERCURY_BINARY',
        'value': result['abspath'],
    }))
    if result['version']:
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/MERCURY_VERSION',
            'value': result['version'],
        }))
    sys.exit(0)


if __name__ == '__main__':
    main()

View File

@@ -6,10 +6,10 @@ Usage: on_Snapshot__mercury.py --url=<url> --snapshot-id=<uuid>
Output: Creates mercury/ directory with content.html, content.txt, article.json
Environment variables:
MERCURY_BINARY: Path to mercury-parser binary
MERCURY_BINARY: Path to postlight-parser binary
TIMEOUT: Timeout in seconds (default: 60)
Note: Requires mercury-parser: npm install -g @postlight/mercury-parser
Note: Requires postlight-parser: npm install -g @postlight/parser
"""
import json
@@ -25,7 +25,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'mercury'
BIN_NAME = 'mercury-parser'
BIN_NAME = 'postlight-parser'
BIN_PROVIDERS = 'npm,env'
OUTPUT_DIR = 'mercury'
@@ -42,12 +42,12 @@ def get_env_int(name: str, default: int = 0) -> int:
def find_mercury() -> str | None:
"""Find mercury-parser binary."""
"""Find postlight-parser binary."""
mercury = get_env('MERCURY_BINARY')
if mercury and os.path.isfile(mercury):
return mercury
for name in ['mercury-parser', 'mercury']:
for name in ['postlight-parser']:
binary = shutil.which(name)
if binary:
return binary
@@ -56,7 +56,7 @@ def find_mercury() -> str | None:
def get_version(binary: str) -> str:
"""Get mercury-parser version."""
"""Get postlight-parser version."""
try:
result = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
return result.stdout.strip()[:64]
@@ -83,12 +83,12 @@ def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
if result_text.returncode != 0:
stderr = result_text.stderr.decode('utf-8', errors='replace')
return False, None, f'mercury-parser failed: {stderr[:200]}'
return False, None, f'postlight-parser failed: {stderr[:200]}'
try:
text_json = json.loads(result_text.stdout)
except json.JSONDecodeError:
return False, None, 'mercury-parser returned invalid JSON'
return False, None, 'postlight-parser returned invalid JSON'
if text_json.get('failed'):
return False, None, 'Mercury was not able to extract article'
@@ -139,7 +139,7 @@ def main(url: str, snapshot_id: str):
# Find binary
binary = find_mercury()
if not binary:
print(f'ERROR: mercury-parser binary not found', file=sys.stderr)
print(f'ERROR: postlight-parser binary not found', file=sys.stderr)
print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
sys.exit(1)

View File

@@ -0,0 +1,6 @@
<!-- Mercury embed - Mercury parser article view -->
<iframe src="{{ output_path }}"
class="extractor-embed mercury-embed"
style="width: 100%; height: 100%; min-height: 500px; border: none; background: #fefefe;"
sandbox="allow-same-origin">
</iframe>

View File

@@ -0,0 +1,6 @@
<!-- Mercury fullscreen - full Mercury parser article -->
<iframe src="{{ output_path }}"
class="extractor-fullscreen mercury-fullscreen"
style="width: 100%; height: 100vh; border: none; background: #fefefe;"
sandbox="allow-same-origin">
</iframe>

View File

@@ -0,0 +1 @@
☿️

View File

@@ -0,0 +1,8 @@
<!-- Mercury thumbnail - shows Mercury parser extracted article content -->
<div class="extractor-thumbnail mercury-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fefefe; padding: 8px; font-family: Georgia, serif; font-size: 11px; line-height: 1.4; color: #333;">
<iframe src="{{ output_path }}"
style="width: 100%; height: 300px; border: none; pointer-events: none;"
loading="lazy"
sandbox="allow-same-origin">
</iframe>
</div>

View File

@@ -21,7 +21,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
MERCURY_HOOK = PLUGIN_DIR / 'on_Snapshot__53_mercury.py'
MERCURY_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_mercury.py'
MERCURY_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_mercury.py'
TEST_URL = 'https://example.com'
def test_hook_script_exists():
@@ -29,53 +29,70 @@ def test_hook_script_exists():
assert MERCURY_HOOK.exists(), f"Hook not found: {MERCURY_HOOK}"
def test_mercury_install_hook():
"""Test mercury install hook to install mercury-parser if needed."""
# Run mercury install hook
def test_mercury_validate_hook():
"""Test mercury validate hook checks for postlight-parser."""
# Run mercury validate hook
result = subprocess.run(
[sys.executable, str(MERCURY_INSTALL_HOOK)],
[sys.executable, str(MERCURY_VALIDATE_HOOK)],
capture_output=True,
text=True,
timeout=600
timeout=30
)
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
# Verify InstalledBinary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary':
assert record['name'] == 'mercury-parser'
assert record['abspath']
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output InstalledBinary record"
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
if result.returncode == 0:
# Binary found - verify InstalledBinary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary':
assert record['name'] == 'postlight-parser'
assert record['abspath']
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output InstalledBinary record when binary found"
else:
# Binary not found - verify Dependency JSONL output
found_dependency = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
assert record['bin_name'] == 'postlight-parser'
assert 'npm' in record['bin_providers']
found_dependency = True
break
except json.JSONDecodeError:
pass
assert found_dependency, "Should output Dependency record when binary not found"
def test_verify_deps_with_abx_pkg():
"""Verify mercury-parser is available via abx-pkg after hook installation."""
"""Verify postlight-parser is available via abx-pkg."""
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
NpmProvider.model_rebuild()
EnvProvider.model_rebuild()
# Verify mercury-parser is available
# Verify postlight-parser is available
mercury_binary = Binary(
name='mercury-parser',
name='postlight-parser',
binproviders=[NpmProvider(), EnvProvider()],
overrides={'npm': {'packages': ['@postlight/mercury-parser']}}
overrides={'npm': {'packages': ['@postlight/parser']}}
)
mercury_loaded = mercury_binary.load()
assert mercury_loaded and mercury_loaded.abspath, "mercury-parser should be available after install hook"
# If validate hook found it (exit 0), this should succeed
# If validate hook didn't find it (exit 1), this may fail unless binprovider installed it
if mercury_loaded and mercury_loaded.abspath:
assert True, "postlight-parser is available"
else:
pytest.skip("postlight-parser not available - Dependency record should have been emitted")
def test_extracts_with_mercury_parser():
"""Test full workflow: extract with mercury-parser from real HTML via hook."""
"""Test full workflow: extract with postlight-parser from real HTML via hook."""
# Prerequisites checked by earlier test
with tempfile.TemporaryDirectory() as tmpdir:

View File

@@ -2,46 +2,28 @@
"""
Create a Merkle tree of all archived outputs.
This plugin runs after all extractors and post-processing complete (priority 92)
and generates a cryptographic Merkle tree of all files in the snapshot directory.
This provides:
- Tamper detection: verify archive integrity
- Efficient updates: only re-hash changed files
- Compact proofs: prove file inclusion without sending all files
- Deduplication: identify identical content across snapshots
This plugin runs after all extractors complete (priority 93) and generates
a cryptographic Merkle tree of all files in the snapshot directory.
Output: merkletree/merkletree.json containing:
- root_hash: SHA256 hash of the Merkle root
- tree: Full tree structure with internal nodes
- files: List of all files with their hashes
- metadata: Timestamp, file count, total size
Output: merkletree.json containing root_hash, tree structure, file list, metadata
Usage: on_Snapshot__92_merkletree.py --url=<url> --snapshot-id=<uuid>
Usage: on_Snapshot__93_merkletree.py --url=<url> --snapshot-id=<uuid>
Environment variables:
SAVE_MERKLETREE: Enable merkle tree generation (default: true)
DATA_DIR: ArchiveBox data directory
ARCHIVE_DIR: Archive output directory
"""
__package__ = 'archivebox.plugins.merkletree'
import os
import sys
import json
import hashlib
from pathlib import Path
from datetime import datetime
from datetime import datetime, timezone
from typing import Dict, List, Optional, Tuple, Any
# Configure Django if running standalone
if __name__ == '__main__':
parent_dir = str(Path(__file__).resolve().parent.parent.parent)
if parent_dir not in sys.path:
sys.path.insert(0, parent_dir)
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
import django
django.setup()
import rich_click as click
import click
def sha256_file(filepath: Path) -> str:
@@ -49,12 +31,10 @@ def sha256_file(filepath: Path) -> str:
h = hashlib.sha256()
try:
with open(filepath, 'rb') as f:
# Read in 64kb chunks
while chunk := f.read(65536):
h.update(chunk)
return h.hexdigest()
except (OSError, PermissionError):
# If we can't read the file, return a null hash
return '0' * 64
@@ -64,74 +44,45 @@ def sha256_data(data: bytes) -> str:
def collect_files(snapshot_dir: Path, exclude_dirs: Optional[List[str]] = None) -> List[Tuple[Path, str, int]]:
"""
Recursively collect all files in snapshot directory.
Args:
snapshot_dir: Root directory to scan
exclude_dirs: Directory names to exclude (e.g., ['merkletree', '.git'])
Returns:
List of (relative_path, sha256_hash, file_size) tuples
"""
"""Recursively collect all files in snapshot directory."""
exclude_dirs = exclude_dirs or ['merkletree', '.git', '__pycache__']
files = []
for root, dirs, filenames in os.walk(snapshot_dir):
# Filter out excluded directories
dirs[:] = [d for d in dirs if d not in exclude_dirs]
for filename in filenames:
filepath = Path(root) / filename
rel_path = filepath.relative_to(snapshot_dir)
# Skip symlinks (we hash the target, not the link)
if filepath.is_symlink():
continue
# Compute hash and size
file_hash = sha256_file(filepath)
file_size = filepath.stat().st_size if filepath.exists() else 0
files.append((rel_path, file_hash, file_size))
# Sort by path for deterministic tree
files.sort(key=lambda x: str(x[0]))
return files
def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]:
"""
Build a Merkle tree from a list of leaf hashes.
Args:
file_hashes: List of SHA256 hashes (leaves)
Returns:
(root_hash, tree_levels) where tree_levels is a list of hash lists per level
"""
"""Build a Merkle tree from a list of leaf hashes."""
if not file_hashes:
# Empty tree
return sha256_data(b''), [[]]
# Initialize with leaf level
tree_levels = [file_hashes.copy()]
# Build tree bottom-up
while len(tree_levels[-1]) > 1:
current_level = tree_levels[-1]
next_level = []
# Process pairs
for i in range(0, len(current_level), 2):
left = current_level[i]
if i + 1 < len(current_level):
# Combine left + right
right = current_level[i + 1]
combined = left + right
else:
# Odd number of nodes: duplicate the last one
combined = left + left
parent_hash = sha256_data(combined.encode('utf-8'))
@@ -139,67 +90,41 @@ def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]:
tree_levels.append(next_level)
# Root is the single hash at the top level
root_hash = tree_levels[-1][0]
return root_hash, tree_levels
def create_merkle_tree(snapshot_dir: Path) -> Dict[str, Any]:
"""
Create a complete Merkle tree of all files in snapshot directory.
Args:
snapshot_dir: The snapshot directory to scan
Returns:
Dict containing root_hash, tree structure, file list, and metadata
"""
# Collect all files
"""Create a complete Merkle tree of all files in snapshot directory."""
files = collect_files(snapshot_dir)
# Extract just the hashes for tree building
file_hashes = [file_hash for _, file_hash, _ in files]
# Build Merkle tree
root_hash, tree_levels = build_merkle_tree(file_hashes)
# Calculate total size
total_size = sum(size for _, _, size in files)
# Prepare file list with metadata
file_list = [
{
'path': str(path),
'hash': file_hash,
'size': size,
}
{'path': str(path), 'hash': file_hash, 'size': size}
for path, file_hash, size in files
]
# Prepare result
result = {
return {
'root_hash': root_hash,
'tree_levels': tree_levels,
'files': file_list,
'metadata': {
'timestamp': datetime.now().isoformat(),
'timestamp': datetime.now(timezone.utc).isoformat(),
'file_count': len(files),
'total_size': total_size,
'tree_depth': len(tree_levels),
},
}
return result
@click.command()
@click.option('--url', required=True, help='URL being archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
"""Generate Merkle tree of all archived outputs."""
from archivebox.core.models import Snapshot
start_ts = datetime.now()
start_ts = datetime.now(timezone.utc)
status = 'failed'
output = None
error = ''
@@ -211,30 +136,19 @@ def main(url: str, snapshot_id: str):
save_merkletree = os.getenv('SAVE_MERKLETREE', 'true').lower() in ('true', '1', 'yes', 'on')
if not save_merkletree:
click.echo('Skipping merkle tree (SAVE_MERKLETREE=False)')
status = 'skipped'
end_ts = datetime.now()
click.echo(f'START_TS={start_ts.isoformat()}')
click.echo(f'END_TS={end_ts.isoformat()}')
click.echo(f'STATUS={status}')
click.echo(f'RESULT_JSON={{"extractor": "merkletree", "status": "{status}", "url": "{url}", "snapshot_id": "{snapshot_id}"}}')
click.echo(json.dumps({'status': status, 'output': 'SAVE_MERKLETREE=false'}))
sys.exit(0)
# Get snapshot
try:
snapshot = Snapshot.objects.get(id=snapshot_id)
except Snapshot.DoesNotExist:
error = f'Snapshot {snapshot_id} not found'
raise ValueError(error)
# Working directory is the extractor output dir (e.g., <snapshot>/merkletree/)
# Parent is the snapshot directory
output_dir = Path.cwd()
snapshot_dir = output_dir.parent
# Get snapshot directory
snapshot_dir = Path(snapshot.output_dir)
if not snapshot_dir.exists():
error = f'Snapshot directory not found: {snapshot_dir}'
raise FileNotFoundError(error)
raise FileNotFoundError(f'Snapshot directory not found: {snapshot_dir}')
# Create output directory
output_dir = snapshot_dir / 'merkletree'
# Ensure output directory exists
output_dir.mkdir(exist_ok=True)
output_path = output_dir / 'merkletree.json'
@@ -246,49 +160,31 @@ def main(url: str, snapshot_id: str):
json.dump(merkle_data, f, indent=2)
status = 'succeeded'
output = str(output_path)
output = 'merkletree.json'
root_hash = merkle_data['root_hash']
file_count = merkle_data['metadata']['file_count']
total_size = merkle_data['metadata']['total_size']
click.echo(f'Merkle tree created: {file_count} files, root={root_hash[:16]}..., size={total_size:,} bytes')
click.echo(f'Merkle tree: {file_count} files, root={root_hash[:16]}..., size={total_size:,} bytes')
except Exception as e:
error = f'{type(e).__name__}: {e}'
status = 'failed'
click.echo(f'Error: {error}', err=True)
end_ts = datetime.now()
duration = (end_ts - start_ts).total_seconds()
end_ts = datetime.now(timezone.utc)
# Print results
click.echo(f'START_TS={start_ts.isoformat()}')
click.echo(f'END_TS={end_ts.isoformat()}')
click.echo(f'DURATION={duration:.2f}')
if output:
click.echo(f'OUTPUT={output}')
click.echo(f'STATUS={status}')
if error:
click.echo(f'ERROR={error}', err=True)
# Print JSON result
result_json = {
'extractor': 'merkletree',
'url': url,
'snapshot_id': snapshot_id,
# Print JSON result for hook runner
result = {
'status': status,
'start_ts': start_ts.isoformat(),
'end_ts': end_ts.isoformat(),
'duration': round(duration, 2),
'output': output,
'error': error or None,
'root_hash': root_hash,
'file_count': file_count,
'error': error or None,
}
click.echo(f'RESULT_JSON={json.dumps(result_json)}')
click.echo(json.dumps(result))
sys.exit(0 if status == 'succeeded' else 1)
sys.exit(0 if status in ('succeeded', 'skipped') else 1)
if __name__ == '__main__':

View File

@@ -0,0 +1 @@
🔗

View File

@@ -133,7 +133,8 @@ def fetch_content(url: str) -> str:
@click.command()
@click.option('--url', required=True, help='HTML URL to parse')
def main(url: str):
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
def main(url: str, snapshot_id: str = None):
"""Parse HTML and extract href URLs."""
# Skip only if parse_dom_outlinks already ran AND found URLs (it uses Chrome for better coverage)

View File

@@ -0,0 +1 @@
🔗

View File

@@ -127,7 +127,8 @@ def fetch_content(url: str) -> str:
@click.command()
@click.option('--url', required=True, help='JSONL file URL to parse')
def main(url: str):
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
def main(url: str, snapshot_id: str = None):
"""Parse JSONL bookmark file and extract URLs."""
try:

View File

@@ -0,0 +1 @@
📋

View File

@@ -52,7 +52,8 @@ def fetch_content(url: str) -> str:
@click.command()
@click.option('--url', required=True, help='Netscape bookmark file URL to parse')
def main(url: str):
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
def main(url: str, snapshot_id: str = None):
"""Parse Netscape bookmark HTML and extract URLs."""
try:

View File

@@ -0,0 +1 @@
🔖

View File

@@ -51,7 +51,8 @@ def fetch_content(url: str) -> str:
@click.command()
@click.option('--url', required=True, help='RSS/Atom feed URL to parse')
def main(url: str):
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
def main(url: str, snapshot_id: str = None):
"""Parse RSS/Atom feed and extract article URLs."""
if feedparser is None:

View File

@@ -0,0 +1 @@
📡

View File

@@ -100,7 +100,8 @@ def fetch_content(url: str) -> str:
@click.command()
@click.option('--url', required=True, help='URL to parse (file:// or https://)')
def main(url: str):
@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
def main(url: str, snapshot_id: str = None):
"""Parse plain text and extract URLs."""
try:

View File

@@ -0,0 +1 @@
📃

View File

@@ -0,0 +1,5 @@
<!-- PDF embed - full PDF viewer -->
<embed src="{{ output_path }}#toolbar=1&navpanes=1"
type="application/pdf"
class="extractor-embed pdf-embed"
style="width: 100%; height: 100%; min-height: 500px;">

View File

@@ -0,0 +1,5 @@
<!-- PDF fullscreen - full PDF viewer -->
<embed src="{{ output_path }}#toolbar=1&navpanes=1&view=FitH"
type="application/pdf"
class="extractor-fullscreen pdf-fullscreen"
style="width: 100%; height: 100vh;">

View File

@@ -0,0 +1 @@
📄

View File

@@ -0,0 +1,6 @@
<!-- PDF thumbnail - shows first page preview -->
<div class="extractor-thumbnail pdf-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #f5f5f5;">
<embed src="{{ output_path }}#toolbar=0&navpanes=0&scrollbar=0&page=1&view=FitH"
type="application/pdf"
style="width: 100%; height: 200px; margin-top: -20px; pointer-events: none;">
</div>

View File

@@ -1,68 +0,0 @@
#!/usr/bin/env python3
"""
Install readability-extractor if not already available.
Runs at crawl start to ensure readability-extractor is installed.
Outputs JSONL for InstalledBinary.
"""
import json
import sys
from pathlib import Path
def main():
try:
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
NpmProvider.model_rebuild()
EnvProvider.model_rebuild()
# Note: npm package is from github:ArchiveBox/readability-extractor
readability_binary = Binary(
name='readability-extractor',
binproviders=[NpmProvider(), EnvProvider()],
overrides={'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
)
# Try to load, install if not found
try:
loaded = readability_binary.load()
if not loaded or not loaded.abspath:
raise Exception("Not loaded")
except Exception:
# Install via npm from GitHub repo
loaded = readability_binary.install()
if loaded and loaded.abspath:
# Output InstalledBinary JSONL
print(json.dumps({
'type': 'InstalledBinary',
'name': 'readability-extractor',
'abspath': str(loaded.abspath),
'version': str(loaded.version) if loaded.version else None,
'sha256': loaded.sha256,
'binprovider': loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown',
}))
sys.exit(0)
else:
print(json.dumps({
'type': 'Dependency',
'bin_name': 'readability-extractor',
'bin_providers': 'npm,env',
}))
print("Failed to install readability-extractor", file=sys.stderr)
sys.exit(1)
except Exception as e:
print(json.dumps({
'type': 'Dependency',
'bin_name': 'readability-extractor',
'bin_providers': 'npm,env',
}))
print(f"Error installing readability-extractor: {e}", file=sys.stderr)
sys.exit(1)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,123 @@
#!/usr/bin/env python3
"""
Validation hook for readability-extractor binary.
Runs at crawl start to verify readability-extractor is available.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str) -> str | None:
"""Get version string from binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
first_line = result.stdout.strip().split('\n')[0]
return first_line[:64]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_readability() -> dict | None:
"""Find readability-extractor binary."""
try:
from abx_pkg import Binary, NpmProvider, EnvProvider
class ReadabilityBinary(Binary):
name: str = 'readability-extractor'
binproviders_supported = [NpmProvider(), EnvProvider()]
overrides: dict = {'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
binary = ReadabilityBinary()
loaded = binary.load()
if loaded and loaded.abspath:
return {
'name': 'readability-extractor',
'abspath': str(loaded.abspath),
'version': str(loaded.version) if loaded.version else None,
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
}
except ImportError:
pass
except Exception:
pass
# Fallback to shutil.which
abspath = shutil.which('readability-extractor') or os.environ.get('READABILITY_BINARY', '')
if abspath and Path(abspath).is_file():
return {
'name': 'readability-extractor',
'abspath': abspath,
'version': get_binary_version(abspath),
'sha256': get_binary_hash(abspath),
'binprovider': 'env',
}
return None
def main():
    """Emit JSONL records describing readability-extractor availability.

    Prints an InstalledBinary record plus Machine config updates and exits 0
    when the binary is found; prints a Dependency record and exits 1 when it
    is missing.
    """
    result = find_readability()
    if not (result and result.get('abspath')):
        # Binary missing: emit a Dependency record so the orchestrator knows
        # what still needs to be installed, then signal failure.
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'readability-extractor',
            'bin_providers': 'npm,env',
        }))
        print("readability-extractor binary not found", file=sys.stderr)
        sys.exit(1)

    print(json.dumps({
        'type': 'InstalledBinary',
        'name': result['name'],
        'abspath': result['abspath'],
        'version': result['version'],
        'sha256': result['sha256'],
        'binprovider': result['binprovider'],
    }))
    # Persist the discovered path (and version, when known) into Machine config.
    print(json.dumps({
        'type': 'Machine',
        '_method': 'update',
        'key': 'config/READABILITY_BINARY',
        'value': result['abspath'],
    }))
    if result['version']:
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/READABILITY_VERSION',
            'value': result['version'],
        }))
    sys.exit(0)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,6 @@
<!-- Readability embed - reader-mode article view -->
<iframe src="{{ output_path }}"
class="extractor-embed readability-embed"
style="width: 100%; height: 100%; min-height: 500px; border: none; background: #fefefe;"
sandbox="allow-same-origin">
</iframe>

View File

@@ -0,0 +1,6 @@
<!-- Readability fullscreen - full reader-mode article -->
<iframe src="{{ output_path }}"
class="extractor-fullscreen readability-fullscreen"
style="width: 100%; height: 100vh; border: none; background: #fefefe;"
sandbox="allow-same-origin">
</iframe>

View File

@@ -0,0 +1 @@
📖

View File

@@ -0,0 +1,8 @@
<!-- Readability thumbnail - shows reader-mode extracted article content -->
<div class="extractor-thumbnail readability-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fefefe; padding: 8px; font-family: Georgia, serif; font-size: 11px; line-height: 1.4; color: #333;">
<iframe src="{{ output_path }}"
style="width: 100%; height: 300px; border: none; pointer-events: none;"
loading="lazy"
sandbox="allow-same-origin">
</iframe>
</div>

View File

@@ -2,7 +2,7 @@
Integration tests for readability plugin
Tests verify:
1. Install hook installs readability-extractor via abx-pkg
1. Validate hook checks for readability-extractor binary
2. Verify deps with abx-pkg
3. Plugin reports missing dependency correctly
4. Extraction works against real example.com content
@@ -21,7 +21,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.py'))
READABILITY_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_readability.py'
READABILITY_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_readability.py'
TEST_URL = 'https://example.com'
@@ -101,48 +101,63 @@ def test_reports_missing_dependency_when_not_installed():
assert 'readability-extractor' in combined or 'BIN_NAME' in combined, "Should mention readability-extractor"
def test_readability_install_hook():
"""Test readability install hook to install readability-extractor if needed."""
def test_readability_validate_hook():
"""Test readability validate hook checks for readability-extractor binary."""
result = subprocess.run(
[sys.executable, str(READABILITY_INSTALL_HOOK)],
[sys.executable, str(READABILITY_VALIDATE_HOOK)],
capture_output=True,
text=True,
timeout=600
timeout=30
)
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
# Verify InstalledBinary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary':
assert record['name'] == 'readability-extractor'
assert record['abspath']
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output InstalledBinary record"
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
if result.returncode == 0:
# Binary found - verify InstalledBinary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary':
assert record['name'] == 'readability-extractor'
assert record['abspath']
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output InstalledBinary record when binary found"
else:
# Binary not found - verify Dependency JSONL output
found_dependency = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
assert record['bin_name'] == 'readability-extractor'
assert 'npm' in record['bin_providers']
found_dependency = True
break
except json.JSONDecodeError:
pass
assert found_dependency, "Should output Dependency record when binary not found"
def test_verify_deps_with_abx_pkg():
"""Verify readability-extractor is available via abx-pkg after hook installation."""
"""Verify readability-extractor is available via abx-pkg."""
from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides
NpmProvider.model_rebuild()
EnvProvider.model_rebuild()
readability_binary = Binary(
name='readability-extractor',
binproviders=[NpmProvider(), EnvProvider()],
overrides={'npm': {'packages': ['github:ArchiveBox/readability-extractor']}}
)
readability_loaded = readability_binary.load()
assert readability_loaded and readability_loaded.abspath, "readability-extractor should be available after install hook"
if readability_loaded and readability_loaded.abspath:
assert True, "readability-extractor is available"
else:
pytest.skip("readability-extractor not available - Dependency record should have been emitted")
def test_extracts_article_after_installation():

View File

@@ -0,0 +1,5 @@
<!-- Screenshot embed - full image view -->
<img src="{{ output_path }}"
alt="Screenshot of page"
class="extractor-embed screenshot-embed"
style="max-width: 100%; height: auto;">

View File

@@ -0,0 +1,8 @@
<!-- Screenshot fullscreen - zoomable image -->
<!-- Clicking the image toggles max-width between 100% (fit viewport, zoom-in
     cursor) and none (natural size, zoom-out cursor) via the inline onclick. -->
<div style="width: 100%; height: 100vh; overflow: auto; background: #222; display: flex; align-items: start; justify-content: center;">
    <img src="{{ output_path }}"
         alt="Screenshot of page"
         class="extractor-fullscreen screenshot-fullscreen"
         style="max-width: 100%; cursor: zoom-in;"
         onclick="this.style.maxWidth = this.style.maxWidth === 'none' ? '100%' : 'none'; this.style.cursor = this.style.maxWidth === 'none' ? 'zoom-out' : 'zoom-in';">
</div>

View File

@@ -0,0 +1 @@
📷

View File

@@ -0,0 +1,8 @@
<!-- Screenshot thumbnail - shows the captured screenshot image -->
<!-- If the image fails to load, the onerror handler hides the <img> and
     reveals the placeholder <div> that immediately follows it. -->
<img src="{{ output_path }}"
     alt="Screenshot of page"
     class="extractor-thumbnail screenshot-thumbnail"
     style="width: 100%; height: 100px; object-fit: cover; object-position: top center; background: #333;"
     loading="lazy"
     onerror="this.style.display='none'; this.nextElementSibling.style.display='block';">
<div style="display: none; text-align: center; padding: 20px; color: #999;">📷 Screenshot</div>

View File

@@ -0,0 +1,6 @@
<!-- Singlefile embed - full iframe of archived HTML -->
<iframe src="{{ output_path }}"
class="extractor-embed singlefile-embed"
style="width: 100%; height: 100%; min-height: 500px; border: none;"
sandbox="allow-same-origin allow-scripts allow-forms">
</iframe>

View File

@@ -0,0 +1,6 @@
<!-- Singlefile fullscreen - full page iframe -->
<iframe src="{{ output_path }}"
class="extractor-fullscreen singlefile-fullscreen"
style="width: 100%; height: 100vh; border: none;"
sandbox="allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation">
</iframe>

View File

@@ -0,0 +1 @@
📦

View File

@@ -0,0 +1,8 @@
<!-- Singlefile thumbnail - scaled down iframe preview of archived HTML -->
<div class="extractor-thumbnail singlefile-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fff;">
<iframe src="{{ output_path }}"
style="width: 400%; height: 400px; transform: scale(0.25); transform-origin: top left; pointer-events: none; border: none;"
loading="lazy"
sandbox="allow-same-origin">
</iframe>
</div>

View File

@@ -0,0 +1 @@
📁

View File

@@ -0,0 +1 @@
📝

View File

@@ -1,68 +0,0 @@
#!/usr/bin/env python3
"""
Install wget if not already available.
Runs at crawl start to ensure wget is installed.
Outputs JSONL for InstalledBinary.
"""
import json
import sys
from pathlib import Path


def _emit_dependency_record():
    """Print a Dependency JSONL record so a provider hook can install wget."""
    print(json.dumps({
        'type': 'Dependency',
        'bin_name': 'wget',
        'bin_providers': 'apt,brew,env',
    }))


def main():
    """Resolve wget (installing it if missing) and emit a JSONL result record.

    Exits 0 with an InstalledBinary record on success; exits 1 with a
    Dependency record (and a message on stderr) on any failure.
    """
    try:
        from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides

        # Pydantic forward refs must be resolved before providers are used.
        AptProvider.model_rebuild()
        BrewProvider.model_rebuild()
        EnvProvider.model_rebuild()

        # wget binary and package have same name
        wget_binary = Binary(
            name='wget',
            binproviders=[AptProvider(), BrewProvider(), EnvProvider()]
        )

        # Prefer an already-installed binary; fall back to installing one.
        try:
            loaded = wget_binary.load()
            if not loaded or not loaded.abspath:
                raise Exception("Not loaded")
        except Exception:
            # Install via system package manager
            loaded = wget_binary.install()

        if loaded and loaded.abspath:
            # Success: report the resolved binary as an InstalledBinary record.
            provider_name = loaded.loaded_binprovider.name if loaded.loaded_binprovider else 'unknown'
            print(json.dumps({
                'type': 'InstalledBinary',
                'name': 'wget',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256,
                'binprovider': provider_name,
            }))
            sys.exit(0)

        # install() returned nothing usable — hand off to a Dependency record.
        _emit_dependency_record()
        print("Failed to install wget", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Best-effort hook: never crash silently, always emit a Dependency
        # record so the missing binary is tracked. (SystemExit from the
        # branches above is not an Exception and passes through untouched.)
        _emit_dependency_record()
        print(f"Error installing wget: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,6 @@
<!-- Wget embed - full iframe of mirrored site -->
<iframe src="{{ output_path }}"
class="extractor-embed wget-embed"
style="width: 100%; height: 100%; min-height: 500px; border: none;"
sandbox="allow-same-origin allow-scripts allow-forms">
</iframe>

View File

@@ -0,0 +1,6 @@
<!-- Wget fullscreen - full page iframe of mirrored site -->
<iframe src="{{ output_path }}"
class="extractor-fullscreen wget-fullscreen"
style="width: 100%; height: 100vh; border: none;"
sandbox="allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation">
</iframe>

View File

@@ -0,0 +1 @@
📥

View File

@@ -0,0 +1,8 @@
<!-- Wget thumbnail - scaled down iframe preview of mirrored site -->
<div class="extractor-thumbnail wget-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #fff;">
<iframe src="{{ output_path }}"
style="width: 400%; height: 400px; transform: scale(0.25); transform-origin: top left; pointer-events: none; border: none;"
loading="lazy"
sandbox="allow-same-origin">
</iframe>
</div>

View File

@@ -2,8 +2,8 @@
Integration tests for wget plugin
Tests verify:
1. Plugin reports missing dependency correctly
2. wget can be installed via brew/apt provider hooks
1. Validate hook checks for wget binary
2. Verify deps with abx-pkg
3. Config options work (SAVE_WGET, SAVE_WARC, etc.)
4. Extraction works against real example.com
5. Output files contain actual page content
@@ -26,7 +26,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
WGET_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_wget.py'))
WGET_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_wget.py'
WGET_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_wget.py'
BREW_HOOK = PLUGINS_ROOT / 'brew' / 'on_Dependency__install_using_brew_provider.py'
APT_HOOK = PLUGINS_ROOT / 'apt' / 'on_Dependency__install_using_apt_provider.py'
TEST_URL = 'https://example.com'
@@ -37,45 +37,59 @@ def test_hook_script_exists():
assert WGET_HOOK.exists(), f"Hook script not found: {WGET_HOOK}"
def test_wget_install_hook():
"""Test wget install hook to install wget if needed."""
def test_wget_validate_hook():
"""Test wget validate hook checks for wget binary."""
result = subprocess.run(
[sys.executable, str(WGET_INSTALL_HOOK)],
[sys.executable, str(WGET_VALIDATE_HOOK)],
capture_output=True,
text=True,
timeout=600
timeout=30
)
assert result.returncode == 0, f"Install hook failed: {result.stderr}"
# Verify InstalledBinary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary':
assert record['name'] == 'wget'
assert record['abspath']
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output InstalledBinary record"
# Hook exits 0 if binary found, 1 if not found (with Dependency record)
if result.returncode == 0:
# Binary found - verify InstalledBinary JSONL output
found_binary = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'InstalledBinary':
assert record['name'] == 'wget'
assert record['abspath']
found_binary = True
break
except json.JSONDecodeError:
pass
assert found_binary, "Should output InstalledBinary record when binary found"
else:
# Binary not found - verify Dependency JSONL output
found_dependency = False
for line in result.stdout.strip().split('\n'):
if line.strip():
try:
record = json.loads(line)
if record.get('type') == 'Dependency':
assert record['bin_name'] == 'wget'
assert 'env' in record['bin_providers']
found_dependency = True
break
except json.JSONDecodeError:
pass
assert found_dependency, "Should output Dependency record when binary not found"
def test_verify_deps_with_abx_pkg():
"""Verify wget is available via abx-pkg after hook installation."""
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider
AptProvider.model_rebuild()
BrewProvider.model_rebuild()
EnvProvider.model_rebuild()
"""Verify wget is available via abx-pkg."""
from abx_pkg import Binary, AptProvider, BrewProvider, EnvProvider, BinProviderOverrides
wget_binary = Binary(name='wget', binproviders=[AptProvider(), BrewProvider(), EnvProvider()])
wget_loaded = wget_binary.load()
assert wget_loaded and wget_loaded.abspath, "wget should be available after install hook"
if wget_loaded and wget_loaded.abspath:
assert True, "wget is available"
else:
pytest.skip("wget not available - Dependency record should have been emitted")
def test_reports_missing_dependency_when_not_installed():