more tests and migrations fixes

This commit is contained in:
Nick Sweeting
2025-12-26 18:22:48 -08:00
parent 0fbcbd2616
commit e2cbcd17f6
26 changed files with 3608 additions and 1792 deletions

View File

@@ -0,0 +1,46 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_FORUMDL": {
"type": "boolean",
"default": true,
"description": "Enable forum downloading with forum-dl"
},
"FORUMDL_BINARY": {
"type": "string",
"default": "forum-dl",
"description": "Path to forum-dl binary"
},
"FORUMDL_TIMEOUT": {
"type": "integer",
"default": 3600,
"minimum": 30,
"x-fallback": "TIMEOUT",
"description": "Timeout for forum downloads in seconds"
},
"FORUMDL_OUTPUT_FORMAT": {
"type": "string",
"default": "jsonl",
"enum": ["jsonl", "warc", "mbox", "maildir", "mh", "mmdf", "babyl"],
"description": "Output format for forum downloads"
},
"FORUMDL_TEXTIFY": {
"type": "boolean",
"default": false,
"description": "Convert HTML content to plaintext (keep false to preserve HTML)"
},
"FORUMDL_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates"
},
"FORUMDL_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for forum-dl (space-separated)"
}
}
}

View File

@@ -0,0 +1,129 @@
#!/usr/bin/env python3
"""
Validation hook for forum-dl.
Runs at crawl start to verify forum-dl binary is available.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str, version_flag: str = '--version') -> str | None:
"""Get version string from binary."""
try:
result = subprocess.run(
[abspath, version_flag],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
first_line = result.stdout.strip().split('\n')[0]
return first_line[:64]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_forumdl() -> dict | None:
    """Find forum-dl binary.

    Lookup order:
      1. abx_pkg (pip and env providers), when it is importable and can load
         the binary.
      2. The FORUMDL_BINARY environment variable, either as a path to an
         existing file or as a command name resolved on PATH.
      3. ``forum-dl`` resolved on PATH.

    Returns a dict with the keys ``name``, ``abspath``, ``version``,
    ``sha256`` and ``binprovider``, or None when nothing was found.
    """
    try:
        from abx_pkg import Binary, PipProvider, EnvProvider

        class ForumdlBinary(Binary):
            name: str = 'forum-dl'
            binproviders_supported = [PipProvider(), EnvProvider()]

        loaded = ForumdlBinary().load()
        if loaded and loaded.abspath:
            return {
                'name': 'forum-dl',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': getattr(loaded, 'sha256', None),
                'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
            }
    except Exception:
        # abx_pkg is optional (ImportError) and any failure inside it must not
        # abort validation; fall through to the plain lookup below.
        pass

    # Fallback lookup. An explicit FORUMDL_BINARY override is honoured before
    # the generic PATH search, matching the snapshot hook's precedence.
    configured = os.environ.get('FORUMDL_BINARY', '').strip()
    candidates = []
    if configured:
        candidates.append(configured)
        # The override may be a bare command name rather than a path.
        candidates.append(shutil.which(configured))
    candidates.append(shutil.which('forum-dl'))

    for abspath in candidates:
        if abspath and Path(abspath).is_file():
            return {
                'name': 'forum-dl',
                'abspath': abspath,
                'version': get_binary_version(abspath),
                'sha256': get_binary_hash(abspath),
                'binprovider': 'env',
            }
    return None
def main():
    """Report forum-dl availability as JSONL on stdout and exit.

    When the binary is found, prints an InstalledBinary record followed by
    Machine config updates (binary path, and version if known) and exits 0.
    Otherwise prints a Dependency record, lists the missing dependency on
    stderr and exits 1.
    """
    def emit(record: dict) -> None:
        # One JSON document per line on stdout.
        print(json.dumps(record))

    missing = []
    found = find_forumdl()

    if not (found and found.get('abspath')):
        # forum-dl is required: request it and remember that it is missing.
        emit({
            'type': 'Dependency',
            'bin_name': 'forum-dl',
            'bin_providers': 'pip,env',
        })
        missing.append('forum-dl')
    else:
        emit({
            'type': 'InstalledBinary',
            'name': found['name'],
            'abspath': found['abspath'],
            'version': found['version'],
            'sha256': found['sha256'],
            'binprovider': found['binprovider'],
        })
        emit({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/FORUMDL_BINARY',
            'value': found['abspath'],
        })
        # The version is optional; only record it when it was detected.
        if found['version']:
            emit({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/FORUMDL_VERSION',
                'value': found['version'],
            })

    if not missing:
        sys.exit(0)

    print(f"Missing dependencies: {', '.join(missing)}", file=sys.stderr)
    sys.exit(1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,237 @@
#!/usr/bin/env python3
"""
Download forum content from a URL using forum-dl.
Usage: on_Snapshot__forumdl.py --url=<url> --snapshot-id=<uuid>
Output: Downloads forum content to $PWD/
Environment variables:
FORUMDL_BINARY: Path to forum-dl binary
FORUMDL_TIMEOUT: Timeout in seconds (default: 3600 for large forums)
FORUMDL_OUTPUT_FORMAT: Output format (default: jsonl)
FORUMDL_TEXTIFY: Convert HTML to plaintext (default: False - keeps HTML)
FORUMDL_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
FORUMDL_EXTRA_ARGS: Extra arguments for forum-dl (space-separated)
# Forum-dl feature toggles
SAVE_FORUMDL: Enable forum-dl forum extraction (default: True)
# Fallback to ARCHIVING_CONFIG values if FORUMDL_* not set:
TIMEOUT: Fallback timeout
CHECK_SSL_VALIDITY: Fallback SSL check
"""
import json
import os
import shutil
import subprocess
import sys
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'forumdl'
BIN_NAME = 'forum-dl'
BIN_PROVIDERS = 'pip,env'
OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
    """Return environment variable ``name`` (or ``default``) with surrounding whitespace removed."""
    raw_value = os.environ.get(name, default)
    return raw_value.strip()
def get_env_bool(name: str, default: bool = False) -> bool:
    """Interpret environment variable ``name`` as a boolean.

    ``true/1/yes/on`` map to True and ``false/0/no/off`` map to False
    (case-insensitive); anything else, including an unset variable, yields
    ``default``.
    """
    truthy = ('true', '1', 'yes', 'on')
    falsy = ('false', '0', 'no', 'off')

    normalized = get_env(name, '').lower()
    if normalized in truthy:
        return True
    return False if normalized in falsy else default
def get_env_int(name: str, default: int = 0) -> int:
    """Interpret environment variable ``name`` as an integer.

    Returns ``default`` when the variable is unset or its value is not a valid
    integer literal.
    """
    try:
        raw_value = get_env(name, str(default))
        parsed = int(raw_value)
    except ValueError:
        parsed = default
    return parsed
def find_forumdl() -> str | None:
    """Find forum-dl binary.

    FORUMDL_BINARY takes precedence and may be either a path to an existing
    file or a command name that is resolved on PATH. If it is unset or cannot
    be resolved, ``forum-dl`` is looked up on PATH.

    Returns the path to use, or None when no binary was found.
    """
    configured = get_env('FORUMDL_BINARY')
    if configured:
        if os.path.isfile(configured):
            return configured
        # The configured value may be a bare command name (e.g. "forum-dl")
        # rather than a filesystem path.
        resolved = shutil.which(configured)
        if resolved:
            return resolved
    return shutil.which('forum-dl')
def get_version(binary: str) -> str:
    """Get forum-dl version.

    Runs ``binary --version`` with a 10 second timeout and returns the first
    line of its stdout, stripped and truncated to 64 characters. Only the first
    line is kept so the value stays safe to print as a single ``VERSION=`` line.

    Returns '' when the binary cannot be run, times out, or prints nothing.
    """
    try:
        result = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
    except (OSError, subprocess.SubprocessError, ValueError):
        # Version detection is best-effort; an unknown version is reported as ''.
        return ''

    lines = result.stdout.strip().splitlines()
    return lines[0].strip()[:64] if lines else ''
def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Download forum using forum-dl.

    Configuration is read from the environment: FORUMDL_TIMEOUT (falling back
    to TIMEOUT, then 3600), FORUMDL_CHECK_SSL_VALIDITY (falling back to
    CHECK_SSL_VALIDITY), FORUMDL_TEXTIFY, FORUMDL_EXTRA_ARGS and
    FORUMDL_OUTPUT_FORMAT. Output is written relative to OUTPUT_DIR.

    Returns: (success, output_path, error_message)
        (True, path, '')     a non-empty output file/directory exists afterwards
        (True, None, '')     forum-dl ran but there was nothing to download
        (False, None, msg)   forum-dl failed, timed out, or could not be run
    """
    # Get config from env
    timeout = get_env_int('FORUMDL_TIMEOUT') or get_env_int('TIMEOUT', 3600)
    if timeout <= 0:
        # A zero/negative timeout would make subprocess.run expire immediately.
        timeout = 3600
    check_ssl = get_env_bool('FORUMDL_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
    textify = get_env_bool('FORUMDL_TEXTIFY', False)
    extra_args = get_env('FORUMDL_EXTRA_ARGS', '')
    # An explicitly empty value falls back to the default format.
    output_format = get_env('FORUMDL_OUTPUT_FORMAT', 'jsonl') or 'jsonl'

    # Output directory is current directory (hook already runs in output dir)
    output_dir = Path(OUTPUT_DIR)

    # Build output filename based on format
    if output_format == 'warc':
        output_file = output_dir / 'forum.warc.gz'
    elif output_format == 'maildir':
        output_file = output_dir / 'forum'  # maildir is a directory
    else:
        output_file = output_dir / f'forum.{output_format}'

    # Build command
    cmd = [binary, '-f', output_format, '-o', str(output_file)]
    if textify:
        cmd.append('--textify')
    if not check_ssl:
        cmd.append('--no-check-certificate')
    if extra_args:
        cmd.extend(extra_args.split())
    cmd.append(url)

    try:
        result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)

        # Any non-empty output counts as success, regardless of exit status.
        if output_file.exists() and output_file.stat().st_size > 0:
            return True, str(output_file), ''

        stderr = result.stderr or ''
        stderr_lower = stderr.lower()

        # These are NOT errors - page simply has no downloadable forum content
        if 'unsupported url' in stderr_lower:
            return True, None, ''  # Not a forum site - success, no output
        if 'no content' in stderr_lower:
            return True, None, ''  # No forum found - success, no output
        if result.returncode == 0:
            return True, None, ''  # forum-dl exited cleanly, just no forum - success

        # These ARE errors - something went wrong
        if '404' in stderr:
            return False, None, '404 Not Found'
        if '403' in stderr:
            return False, None, '403 Forbidden'
        if 'unable to extract' in stderr_lower:
            return False, None, 'Unable to extract forum info'
        return False, None, f'forum-dl error: {stderr[:200]}'
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        # Boundary handler: report any other failure as an error tuple.
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='URL to download forum from')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Download forum content from a URL using forum-dl."""
    # NOTE: the docstring above doubles as the click help text, so the
    # maintainer notes live in comments instead.
    #
    # Flow: honour SAVE_FORUMDL, locate the binary, run the extraction, then
    # print KEY=VALUE lines plus a RESULT_JSON line. Exits 0 when the status is
    # 'succeeded' (or the extractor is skipped), 1 otherwise.

    # Defaults describe a failed run; they are overwritten as the run progresses.
    status, error = 'failed', ''
    version, cmd_str = '', ''
    output = None

    try:
        # Check if forum-dl is enabled
        forumdl_enabled = get_env_bool('SAVE_FORUMDL', True)
        if not forumdl_enabled:
            status = 'skipped'
            skipped_result = {
                "extractor": EXTRACTOR_NAME,
                "status": status,
                "url": url,
                "snapshot_id": snapshot_id,
            }
            print('Skipping forum-dl (SAVE_FORUMDL=False)')
            print(f'STATUS={status}')
            print(f'RESULT_JSON={json.dumps(skipped_result)}')
            sys.exit(0)

        # Find binary
        binary = find_forumdl()
        if not binary:
            missing_binary_lines = (
                f'ERROR: {BIN_NAME} binary not found',
                f'DEPENDENCY_NEEDED={BIN_NAME}',
                f'BIN_PROVIDERS={BIN_PROVIDERS}',
                'INSTALL_HINT=pip install forum-dl',
            )
            for stderr_line in missing_binary_lines:
                print(stderr_line, file=sys.stderr)
            sys.exit(1)

        version = get_version(binary)
        cmd_str = f'{binary} {url}'

        # Run extraction
        ok, output, error = save_forum(url, binary)
        if ok:
            status = 'succeeded'
            if output:
                produced = Path(output)
                size_in_bytes = produced.stat().st_size
                print(f'forum-dl completed: {produced.name} ({size_in_bytes} bytes)')
            else:
                print('forum-dl completed: no forum content found on page (this is normal)')
        else:
            status = 'failed'
    except Exception as exc:
        # sys.exit() raises SystemExit, which is not caught here.
        error = f'{type(exc).__name__}: {exc}'
        status = 'failed'

    # Print results (empty values are omitted)
    for label, value in (('CMD', cmd_str), ('VERSION', version), ('OUTPUT', output)):
        if value:
            print(f'{label}={value}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)

    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')

    exit_code = 0 if status == 'succeeded' else 1
    sys.exit(exit_code)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,40 @@
<!-- Embedded forum view - renders JSONL forum posts -->
<div class="extractor-embed forumdl-embed" style="width: 100%; max-width: 900px; margin: 0 auto; background: #1a1a1a; padding: 20px; border-radius: 8px;">
<div style="text-align: center; padding: 15px 0; border-bottom: 1px solid #333; margin-bottom: 20px;">
<span style="font-size: 32px;">💬</span>
<h3 style="margin: 10px 0; color: #fff; font-size: 18px;">Forum Thread</h3>
</div>
<div id="forum-posts" style="max-height: 500px; overflow-y: auto; color: #ddd;"></div>
<script>
// Loads the JSONL forum export and renders one card per post into #forum-posts.
(async function() {
    // Escape untrusted text before it is interpolated into element content.
    // Defined locally so the embed does not depend on (or clash with) globals
    // of the page it is included in.
    const escapeHtml = (value) => {
        const div = document.createElement('div');
        div.textContent = String(value);
        return div.innerHTML;
    };

    const container = document.getElementById('forum-posts');
    try {
        const response = await fetch('{{ output_path }}');
        const text = await response.text();
        // Skip blank lines so an empty file or trailing newline does not make
        // JSON.parse throw.
        const posts = text.split('\n').filter(line => line.trim()).map(line => JSON.parse(line));

        posts.forEach(post => {
            const postDiv = document.createElement('div');
            postDiv.style.cssText = 'background: #2a2a2a; padding: 15px; margin-bottom: 15px; border-radius: 5px; border-left: 3px solid #4a9eff;';

            const author = post.author || 'Anonymous';
            const date = post.date ? new Date(post.date).toLocaleString() : '';
            const title = post.title || '';
            // NOTE(review): content is rendered as raw HTML on purpose (the
            // archive keeps post HTML), but it comes from the archived site and
            // is therefore untrusted; consider sanitizing or sandboxing it.
            const content = post.content || post.body || '';

            postDiv.innerHTML = `
                <div style="display: flex; justify-content: space-between; margin-bottom: 10px; padding-bottom: 8px; border-bottom: 1px solid #444;">
                    <strong style="color: #4a9eff;">${escapeHtml(author)}</strong>
                    <span style="color: #888; font-size: 12px;">${escapeHtml(date)}</span>
                </div>
                ${title ? `<h4 style="margin: 0 0 10px 0; color: #fff;">${escapeHtml(title)}</h4>` : ''}
                <div style="color: #ccc; line-height: 1.5;">${content}</div>
            `;
            container.appendChild(postDiv);
        });
    } catch(e) {
        document.getElementById('forum-posts').innerHTML = '<p style="color: #888;">Error loading forum posts</p>';
    }
})();
</script>
</div>

View File

@@ -0,0 +1,147 @@
<!-- Fullscreen forum view - renders JSONL forum posts -->
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Forum Thread</title>
<style>
body {
margin: 0;
padding: 20px;
background: #0d1117;
color: #c9d1d9;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif;
line-height: 1.6;
}
.header {
max-width: 1000px;
margin: 0 auto 30px;
text-align: center;
padding: 20px;
border-bottom: 1px solid #30363d;
}
.icon {
font-size: 48px;
margin-bottom: 10px;
}
h1 {
margin: 0;
font-size: 28px;
color: #f0f6fc;
}
.container {
max-width: 1000px;
margin: 0 auto;
}
.post {
background: #161b22;
border: 1px solid #30363d;
border-radius: 6px;
margin-bottom: 16px;
padding: 16px;
transition: border-color 0.2s;
}
.post:hover {
border-color: #58a6ff;
}
.post-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 12px;
padding-bottom: 12px;
border-bottom: 1px solid #21262d;
}
.post-author {
font-weight: 600;
color: #58a6ff;
font-size: 14px;
}
.post-date {
color: #8b949e;
font-size: 12px;
}
.post-title {
margin: 0 0 12px 0;
font-size: 18px;
font-weight: 600;
color: #f0f6fc;
}
.post-content {
color: #c9d1d9;
word-wrap: break-word;
}
.post-content img {
max-width: 100%;
height: auto;
border-radius: 4px;
}
.post-content a {
color: #58a6ff;
text-decoration: none;
}
.post-content a:hover {
text-decoration: underline;
}
.loading {
text-align: center;
padding: 40px;
color: #8b949e;
}
</style>
</head>
<body>
<div class="header">
<div class="icon">💬</div>
<h1>Forum Thread</h1>
</div>
<div class="container">
<div id="forum-posts" class="loading">Loading posts...</div>
</div>
<script>
// Loads the JSONL forum export and renders one .post card per line into
// #forum-posts, replacing the "Loading posts..." placeholder.
(async function() {
    try {
        const response = await fetch('{{ output_path }}');
        const text = await response.text();
        // Drop blank and whitespace-only lines before parsing.
        const posts = text.split('\n').filter(line => line.trim()).map(line => JSON.parse(line));

        const container = document.getElementById('forum-posts');
        container.innerHTML = '';
        container.className = '';

        posts.forEach(post => {
            const postDiv = document.createElement('div');
            postDiv.className = 'post';

            const author = post.author || 'Anonymous';
            const date = post.date ? new Date(post.date).toLocaleString() : '';
            const title = post.title || '';
            // NOTE(review): content is rendered as raw HTML on purpose (the
            // archive keeps post HTML), but it comes from the archived site and
            // is therefore untrusted; consider sanitizing or sandboxing it.
            const content = post.content || post.body || '';

            postDiv.innerHTML = `
                <div class="post-header">
                    <span class="post-author">${escapeHtml(author)}</span>
                    <span class="post-date">${escapeHtml(date)}</span>
                </div>
                ${title ? `<h2 class="post-title">${escapeHtml(title)}</h2>` : ''}
                <div class="post-content">${content}</div>
            `;
            container.appendChild(postDiv);
        });

        if (posts.length === 0) {
            container.innerHTML = '<div class="loading">No posts found</div>';
        }
    } catch(e) {
        // Parse errors can quote part of the fetched file, so the message is
        // escaped before being placed into the page.
        document.getElementById('forum-posts').innerHTML = '<div class="loading">Error loading posts: ' + escapeHtml(e.message) + '</div>';
    }
})();

// Escape text for safe use as element content.
function escapeHtml(text) {
    const div = document.createElement('div');
    div.textContent = String(text);
    return div.innerHTML;
}
</script>
</body>
</html>

View File

@@ -0,0 +1 @@
💬

View File

@@ -0,0 +1,7 @@
<!-- Forum thumbnail - shows icon placeholder -->
<div class="extractor-thumbnail forumdl-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #1a1a1a; display: flex; align-items: center; justify-content: center;">
<div style="display: flex; flex-direction: column; align-items: center; color: #888; font-size: 12px;">
<span style="font-size: 32px;">💬</span>
<span>Forum</span>
</div>
</div>

View File

@@ -0,0 +1,157 @@
"""
Integration tests for forumdl plugin
Tests verify:
1. Hook script exists
2. Dependencies installed via validation hooks
3. Verify deps with abx-pkg
4. Forum extraction works on forum URLs
5. JSONL output is correct
6. Config options work
7. Handles non-forum URLs gracefully
"""
import json
import subprocess
import sys
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
FORUMDL_HOOK = PLUGIN_DIR / 'on_Snapshot__53_forumdl.py'
FORUMDL_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_forumdl.py'
TEST_URL = 'https://example.com'
def test_hook_script_exists():
    """The on_Snapshot hook script must be present at FORUMDL_HOOK."""
    hook_present = FORUMDL_HOOK.exists()
    assert hook_present, f"Hook not found: {FORUMDL_HOOK}"
def test_forumdl_validate_hook():
    """Test forum-dl validate hook checks for forum-dl.

    The hook must emit either an InstalledBinary record (forum-dl found) or a
    Dependency record (forum-dl missing), and its exit code must agree with
    which one it emitted: 0 when found, 1 when missing.
    """
    # Run forum-dl validate hook
    result = subprocess.run(
        [sys.executable, str(FORUMDL_VALIDATE_HOOK)],
        capture_output=True,
        text=True,
        timeout=30
    )

    # Parse output for InstalledBinary and Dependency records
    found_binary = False
    found_dependency = False
    for line in result.stdout.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue  # not a JSONL record
        if not isinstance(record, dict):
            continue

        record_type = record.get('type')
        if record_type == 'InstalledBinary' and record.get('name') == 'forum-dl':
            assert record.get('abspath'), "forum-dl should have abspath"
            found_binary = True
        elif record_type == 'Dependency' and record.get('bin_name') == 'forum-dl':
            found_dependency = True

    # forum-dl should either be found (InstalledBinary) or missing (Dependency)
    assert found_binary or found_dependency, (
        "forum-dl should have either InstalledBinary or Dependency record\n"
        f"stdout: {result.stdout}\nstderr: {result.stderr}"
    )

    # Hook exits 0 if all binaries found, 1 if any not found
    expected_code = 0 if found_binary else 1
    assert result.returncode == expected_code, (
        f"Expected exit code {expected_code}, got {result.returncode}: {result.stderr}"
    )
def test_verify_deps_with_abx_pkg():
    """Verify forum-dl is available via abx-pkg.

    Skips (rather than fails) when forum-dl cannot be loaded, since a missing
    binary is reported through Dependency records by the validate hook.
    """
    from abx_pkg import Binary, PipProvider, EnvProvider

    missing_binaries = []

    # Verify forum-dl is available
    forumdl_binary = Binary(name='forum-dl', binproviders=[PipProvider(), EnvProvider()])
    try:
        forumdl_loaded = forumdl_binary.load()
    except Exception:
        # NOTE(review): load() appears to raise when no provider can supply
        # the binary - treated the same as "not found"; confirm against abx-pkg.
        forumdl_loaded = None

    if not (forumdl_loaded and forumdl_loaded.abspath):
        missing_binaries.append('forum-dl')

    if missing_binaries:
        pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
def test_handles_non_forum_url():
    """Test that forum-dl extractor handles non-forum URLs gracefully via hook.

    Skipped when no forum-dl binary is available, because the hook exits 1 in
    that case before it ever looks at the URL.
    """
    import os
    import shutil

    # Mirror the hook's own lookup: FORUMDL_BINARY as a file, else PATH.
    configured = os.environ.get('FORUMDL_BINARY', '').strip()
    if not ((configured and os.path.isfile(configured)) or shutil.which('forum-dl')):
        pytest.skip('forum-dl binary not available')

    with tempfile.TemporaryDirectory() as tmpdir:
        # Run forum-dl extraction hook on non-forum URL
        result = subprocess.run(
            [sys.executable, str(FORUMDL_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
            cwd=Path(tmpdir),
            capture_output=True,
            text=True,
            timeout=60
        )

        # Should exit 0 even for non-forum URL
        assert result.returncode == 0, f"Should handle non-forum URL gracefully: {result.stderr}"

        # Verify KEY=VALUE output lines
        assert 'STATUS=' in result.stdout, "Should report status"
        assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON"

        # Parse the RESULT_JSON line
        result_json = None
        for line in result.stdout.split('\n'):
            if line.startswith('RESULT_JSON='):
                result_json = json.loads(line.split('=', 1)[1])
                break

        assert result_json, "Should have RESULT_JSON"
        assert result_json['extractor'] == 'forumdl'
def test_config_save_forumdl_false_skips():
    """Test that SAVE_FORUMDL=False causes skip.

    Checks the reported status itself, not just that a status line exists, so
    the test cannot pass when the extractor actually ran.
    """
    import os

    with tempfile.TemporaryDirectory() as tmpdir:
        env = os.environ.copy()
        env['SAVE_FORUMDL'] = 'False'

        result = subprocess.run(
            [sys.executable, str(FORUMDL_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )

        assert result.returncode == 0, f"Should exit 0 when skipping: {result.stderr}"

        stdout_lines = result.stdout.splitlines()
        assert 'STATUS=skipped' in stdout_lines, f"Should report skipped status: {result.stdout}"

        # The skip path also emits a RESULT_JSON line carrying the same status.
        result_lines = [line for line in stdout_lines if line.startswith('RESULT_JSON=')]
        assert result_lines, "Should output RESULT_JSON"
        result_json = json.loads(result_lines[0].split('=', 1)[1])
        assert result_json['status'] == 'skipped'
        assert result_json['extractor'] == 'forumdl'
def test_config_timeout():
    """Test that FORUMDL_TIMEOUT config is respected.

    Runs the hook with a 5 second FORUMDL_TIMEOUT and expects it to finish
    successfully well inside the 30 second limit enforced by this test.
    Skipped when no forum-dl binary is available, because the hook exits 1 in
    that case.
    """
    import os
    import shutil

    # Mirror the hook's own lookup: FORUMDL_BINARY as a file, else PATH.
    configured = os.environ.get('FORUMDL_BINARY', '').strip()
    if not ((configured and os.path.isfile(configured)) or shutil.which('forum-dl')):
        pytest.skip('forum-dl binary not available')

    with tempfile.TemporaryDirectory() as tmpdir:
        env = os.environ.copy()
        env['FORUMDL_TIMEOUT'] = '5'

        result = subprocess.run(
            [sys.executable, str(FORUMDL_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )

        assert result.returncode == 0, f"Should complete without hanging: {result.stderr}"
if __name__ == '__main__':
pytest.main([__file__, '-v'])