mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 15:27:53 +10:00
more tests and migrations fixes
This commit is contained in:
46
archivebox/plugins/forumdl/config.json
Normal file
46
archivebox/plugins/forumdl/config.json
Normal file
@@ -0,0 +1,46 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SAVE_FORUMDL": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Enable forum downloading with forum-dl"
|
||||
},
|
||||
"FORUMDL_BINARY": {
|
||||
"type": "string",
|
||||
"default": "forum-dl",
|
||||
"description": "Path to forum-dl binary"
|
||||
},
|
||||
"FORUMDL_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 3600,
|
||||
"minimum": 30,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for forum downloads in seconds"
|
||||
},
|
||||
"FORUMDL_OUTPUT_FORMAT": {
|
||||
"type": "string",
|
||||
"default": "jsonl",
|
||||
"enum": ["jsonl", "warc", "mbox", "maildir", "mh", "mmdf", "babyl"],
|
||||
"description": "Output format for forum downloads"
|
||||
},
|
||||
"FORUMDL_TEXTIFY": {
|
||||
"type": "boolean",
|
||||
"default": false,
|
||||
"description": "Convert HTML content to plaintext (keep false to preserve HTML)"
|
||||
},
|
||||
"FORUMDL_CHECK_SSL_VALIDITY": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-fallback": "CHECK_SSL_VALIDITY",
|
||||
"description": "Whether to verify SSL certificates"
|
||||
},
|
||||
"FORUMDL_EXTRA_ARGS": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"description": "Extra arguments for forum-dl (space-separated)"
|
||||
}
|
||||
}
|
||||
}
|
||||
129
archivebox/plugins/forumdl/on_Crawl__00_validate_forumdl.py
Executable file
129
archivebox/plugins/forumdl/on_Crawl__00_validate_forumdl.py
Executable file
@@ -0,0 +1,129 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validation hook for forum-dl.
|
||||
|
||||
Runs at crawl start to verify forum-dl binary is available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str, version_flag: str = '--version') -> str | None:
|
||||
"""Get version string from binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, version_flag],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
return first_line[:64]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_forumdl() -> dict | None:
|
||||
"""Find forum-dl binary."""
|
||||
try:
|
||||
from abx_pkg import Binary, PipProvider, EnvProvider
|
||||
|
||||
class ForumdlBinary(Binary):
|
||||
name: str = 'forum-dl'
|
||||
binproviders_supported = [PipProvider(), EnvProvider()]
|
||||
|
||||
binary = ForumdlBinary()
|
||||
loaded = binary.load()
|
||||
if loaded and loaded.abspath:
|
||||
return {
|
||||
'name': 'forum-dl',
|
||||
'abspath': str(loaded.abspath),
|
||||
'version': str(loaded.version) if loaded.version else None,
|
||||
'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
|
||||
'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
|
||||
}
|
||||
except ImportError:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Fallback to shutil.which
|
||||
abspath = shutil.which('forum-dl') or os.environ.get('FORUMDL_BINARY', '')
|
||||
if abspath and Path(abspath).is_file():
|
||||
return {
|
||||
'name': 'forum-dl',
|
||||
'abspath': abspath,
|
||||
'version': get_binary_version(abspath),
|
||||
'sha256': get_binary_hash(abspath),
|
||||
'binprovider': 'env',
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def main():
    """Report forum-dl availability as JSONL records on stdout.

    When the binary is found, an InstalledBinary record and Machine config
    updates (binary path, and version if known) are printed and the process
    exits 0. Otherwise a Dependency record is printed, the missing dependency
    is named on stderr, and the process exits 1.
    """
    def emit(record: dict) -> None:
        # One JSON object per line on stdout.
        print(json.dumps(record))

    # Check for forum-dl (required)
    found = find_forumdl()
    missing_deps = []

    if not (found and found.get('abspath')):
        emit({
            'type': 'Dependency',
            'bin_name': 'forum-dl',
            'bin_providers': 'pip,env',
        })
        missing_deps.append('forum-dl')
    else:
        emit({
            'type': 'InstalledBinary',
            'name': found['name'],
            'abspath': found['abspath'],
            'version': found['version'],
            'sha256': found['sha256'],
            'binprovider': found['binprovider'],
        })
        emit({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/FORUMDL_BINARY',
            'value': found['abspath'],
        })
        # The version record is only emitted when a version was detected.
        if found['version']:
            emit({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/FORUMDL_VERSION',
                'value': found['version'],
            })

    if not missing_deps:
        sys.exit(0)

    print(f"Missing dependencies: {', '.join(missing_deps)}", file=sys.stderr)
    sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
237
archivebox/plugins/forumdl/on_Snapshot__53_forumdl.py
Executable file
237
archivebox/plugins/forumdl/on_Snapshot__53_forumdl.py
Executable file
@@ -0,0 +1,237 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Download forum content from a URL using forum-dl.
|
||||
|
||||
Usage: on_Snapshot__53_forumdl.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Downloads forum content to $PWD/
|
||||
|
||||
Environment variables:
|
||||
FORUMDL_BINARY: Path to forum-dl binary
|
||||
FORUMDL_TIMEOUT: Timeout in seconds (default: 3600 for large forums)
|
||||
FORUMDL_OUTPUT_FORMAT: Output format (default: jsonl)
|
||||
FORUMDL_TEXTIFY: Convert HTML to plaintext (default: False - keeps HTML)
|
||||
FORUMDL_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
|
||||
FORUMDL_EXTRA_ARGS: Extra arguments for forum-dl (space-separated)
|
||||
|
||||
# Forum-dl feature toggles
|
||||
SAVE_FORUMDL: Enable forum-dl forum extraction (default: True)
|
||||
|
||||
# Fallback to ARCHIVING_CONFIG values if FORUMDL_* not set:
|
||||
TIMEOUT: Fallback timeout
|
||||
CHECK_SSL_VALIDITY: Fallback SSL check
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
EXTRACTOR_NAME = 'forumdl'  # reported as the "extractor" field of RESULT_JSON
BIN_NAME = 'forum-dl'  # binary name used in the missing-dependency messages
BIN_PROVIDERS = 'pip,env'  # printed as BIN_PROVIDERS= when the binary is missing
OUTPUT_DIR = '.'  # save_forum() writes its output relative to the working directory
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
    """Return environment variable *name* (or *default*) with surrounding whitespace removed."""
    raw = os.environ.get(name, default)
    return raw.strip()
|
||||
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
    """Interpret environment variable *name* as a boolean.

    'true', '1', 'yes' and 'on' (case-insensitive) mean True; 'false', '0',
    'no' and 'off' mean False. Any other value, including an unset variable,
    yields *default*.
    """
    truthy = ('true', '1', 'yes', 'on')
    falsy = ('false', '0', 'no', 'off')

    normalized = get_env(name, '').lower()
    if normalized in truthy:
        return True
    return False if normalized in falsy else default
|
||||
|
||||
|
||||
def get_env_int(name: str, default: int = 0) -> int:
    """Return environment variable *name* parsed as an int, or *default* if unset or unparsable."""
    raw = get_env(name, str(default))
    try:
        parsed = int(raw)
    except ValueError:
        return default
    return parsed
|
||||
|
||||
|
||||
def find_forumdl() -> str | None:
|
||||
"""Find forum-dl binary."""
|
||||
forumdl = get_env('FORUMDL_BINARY')
|
||||
if forumdl and os.path.isfile(forumdl):
|
||||
return forumdl
|
||||
|
||||
binary = shutil.which('forum-dl')
|
||||
if binary:
|
||||
return binary
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def get_version(binary: str) -> str:
    """Return the stripped ``--version`` output of *binary*, capped at 64 characters.

    An empty string is returned when the binary cannot be executed or the
    call times out.
    """
    try:
        completed = subprocess.run(
            [binary, '--version'],
            capture_output=True,
            text=True,
            timeout=10,
        )
    except Exception:
        return ''
    return completed.stdout.strip()[:64]
|
||||
|
||||
|
||||
def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
    """Download forum content from *url* by running the forum-dl *binary*.

    Settings are read from the FORUMDL_* environment variables, with TIMEOUT
    and CHECK_SSL_VALIDITY as fallbacks for the timeout and SSL check.

    Returns:
        A ``(success, output_path, error_message)`` tuple. ``output_path`` is
        None when nothing was written. A run that produced no output but whose
        stderr indicates "no forum here" (or that exited 0) still counts as a
        success.
    """
    # Get config from env
    timeout = get_env_int('FORUMDL_TIMEOUT') or get_env_int('TIMEOUT', 3600)
    check_ssl = get_env_bool('FORUMDL_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
    textify = get_env_bool('FORUMDL_TEXTIFY', False)
    extra_args = get_env('FORUMDL_EXTRA_ARGS', '')
    output_format = get_env('FORUMDL_OUTPUT_FORMAT', 'jsonl')

    # Only two formats deviate from the generic "forum.<format>" name:
    # warc output is gzipped and maildir output is a directory.
    special_names = {'warc': 'forum.warc.gz', 'maildir': 'forum'}
    output_file = Path(OUTPUT_DIR) / special_names.get(output_format, f'forum.{output_format}')

    # Build command
    cmd = [binary, '-f', output_format, '-o', str(output_file)]
    if textify:
        cmd.append('--textify')
    if not check_ssl:
        cmd.append('--no-check-certificate')
    cmd.extend(extra_args.split())  # an empty string contributes no arguments
    cmd.append(url)

    try:
        result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)

        # Non-empty output on disk is the success signal.
        if output_file.exists() and output_file.stat().st_size > 0:
            return True, str(output_file), ''

        stderr = result.stderr
        stderr_lower = stderr.lower()

        # These are NOT errors - page simply has no downloadable forum content
        no_forum_here = (
            'unsupported url' in stderr_lower
            or 'no content' in stderr_lower
            or result.returncode == 0
        )
        if no_forum_here:
            return True, None, ''

        # These ARE errors - something went wrong
        for marker, message in (('404', '404 Not Found'), ('403', '403 Forbidden')):
            if marker in stderr:
                return False, None, message
        if 'unable to extract' in stderr_lower:
            return False, None, 'Unable to extract forum info'

        return False, None, f'forum-dl error: {stderr[:200]}'

    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL to download forum from')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Download forum content from a URL using forum-dl."""

    # Defaults describe a failed run; they are overwritten as the run progresses
    # and are all reported in the summary printed at the end.
    version = ''
    output = None
    status = 'failed'
    error = ''
    binary = None
    cmd_str = ''

    try:
        # Check if forum-dl is enabled
        if not get_env_bool('SAVE_FORUMDL', True):
            print('Skipping forum-dl (SAVE_FORUMDL=False)')
            status = 'skipped'
            print(f'STATUS={status}')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}')
            # SystemExit is not an Exception subclass, so it passes through the
            # handler below and ends the process here with exit code 0.
            sys.exit(0)

        # Find binary
        binary = find_forumdl()
        if not binary:
            # Missing dependency: details go to stderr and the process exits 1
            # without printing STATUS= or RESULT_JSON=.
            print(f'ERROR: {BIN_NAME} binary not found', file=sys.stderr)
            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
            print(f'INSTALL_HINT=pip install forum-dl', file=sys.stderr)
            sys.exit(1)

        version = get_version(binary)
        # Abbreviated command for reporting only; the full argument list is
        # assembled inside save_forum().
        cmd_str = f'{binary} {url}'

        # Run extraction
        success, output, error = save_forum(url, binary)
        status = 'succeeded' if success else 'failed'

        if success:
            if output:
                output_path = Path(output)
                file_size = output_path.stat().st_size
                print(f'forum-dl completed: {output_path.name} ({file_size} bytes)')
            else:
                # Success without output: the page had no forum content.
                print(f'forum-dl completed: no forum content found on page (this is normal)')

    except Exception as e:
        # Any unexpected error is reported as a failed run rather than a crash.
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

    # Print results
    if cmd_str:
        print(f'CMD={cmd_str}')
    if version:
        print(f'VERSION={version}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')

    if error:
        print(f'ERROR={error}', file=sys.stderr)

    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')

    # Exit code mirrors the status: 0 only for a succeeded run.
    sys.exit(0 if status == 'succeeded' else 1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
40
archivebox/plugins/forumdl/templates/embed.html
Normal file
40
archivebox/plugins/forumdl/templates/embed.html
Normal file
@@ -0,0 +1,40 @@
|
||||
<!-- Embedded forum view - renders JSONL forum posts -->
<div class="extractor-embed forumdl-embed" style="width: 100%; max-width: 900px; margin: 0 auto; background: #1a1a1a; padding: 20px; border-radius: 8px;">
    <div style="text-align: center; padding: 15px 0; border-bottom: 1px solid #333; margin-bottom: 20px;">
        <span style="font-size: 32px;">💬</span>
        <h3 style="margin: 10px 0; color: #fff; font-size: 18px;">Forum Thread</h3>
    </div>
    <div id="forum-posts" style="max-height: 500px; overflow-y: auto; color: #ddd;"></div>
    <script>
        (async function() {
            // Author, date and title come from the archived forum and are
            // untrusted: escape them before they are interpolated into innerHTML.
            function escapeHtml(text) {
                const div = document.createElement('div');
                div.textContent = text;
                return div.innerHTML;
            }

            try {
                const response = await fetch('{{ output_path }}');
                const text = await response.text();
                // Skip blank lines so an empty file or a stray newline does not
                // make JSON.parse throw and hide every post.
                const posts = text.trim().split('\n').filter(line => line).map(line => JSON.parse(line));
                const container = document.getElementById('forum-posts');

                posts.forEach(post => {
                    const postDiv = document.createElement('div');
                    postDiv.style.cssText = 'background: #2a2a2a; padding: 15px; margin-bottom: 15px; border-radius: 5px; border-left: 3px solid #4a9eff;';

                    const author = post.author || 'Anonymous';
                    const date = post.date ? new Date(post.date).toLocaleString() : '';
                    const title = post.title || '';
                    const content = post.content || post.body || '';

                    // NOTE(review): content is inserted as raw HTML, presumably
                    // because post bodies are HTML unless FORUMDL_TEXTIFY is set;
                    // consider sanitizing it before display.
                    postDiv.innerHTML = `
                        <div style="display: flex; justify-content: space-between; margin-bottom: 10px; padding-bottom: 8px; border-bottom: 1px solid #444;">
                            <strong style="color: #4a9eff;">${escapeHtml(author)}</strong>
                            <span style="color: #888; font-size: 12px;">${escapeHtml(date)}</span>
                        </div>
                        ${title ? `<h4 style="margin: 0 0 10px 0; color: #fff;">${escapeHtml(title)}</h4>` : ''}
                        <div style="color: #ccc; line-height: 1.5;">${content}</div>
                    `;
                    container.appendChild(postDiv);
                });
            } catch(e) {
                document.getElementById('forum-posts').innerHTML = '<p style="color: #888;">Error loading forum posts</p>';
            }
        })();
    </script>
</div>
|
||||
147
archivebox/plugins/forumdl/templates/fullscreen.html
Normal file
147
archivebox/plugins/forumdl/templates/fullscreen.html
Normal file
@@ -0,0 +1,147 @@
|
||||
<!-- Fullscreen forum view - renders JSONL forum posts -->
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Forum Thread</title>
|
||||
<style>
|
||||
body {
|
||||
margin: 0;
|
||||
padding: 20px;
|
||||
background: #0d1117;
|
||||
color: #c9d1d9;
|
||||
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif;
|
||||
line-height: 1.6;
|
||||
}
|
||||
.header {
|
||||
max-width: 1000px;
|
||||
margin: 0 auto 30px;
|
||||
text-align: center;
|
||||
padding: 20px;
|
||||
border-bottom: 1px solid #30363d;
|
||||
}
|
||||
.icon {
|
||||
font-size: 48px;
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
h1 {
|
||||
margin: 0;
|
||||
font-size: 28px;
|
||||
color: #f0f6fc;
|
||||
}
|
||||
.container {
|
||||
max-width: 1000px;
|
||||
margin: 0 auto;
|
||||
}
|
||||
.post {
|
||||
background: #161b22;
|
||||
border: 1px solid #30363d;
|
||||
border-radius: 6px;
|
||||
margin-bottom: 16px;
|
||||
padding: 16px;
|
||||
transition: border-color 0.2s;
|
||||
}
|
||||
.post:hover {
|
||||
border-color: #58a6ff;
|
||||
}
|
||||
.post-header {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
margin-bottom: 12px;
|
||||
padding-bottom: 12px;
|
||||
border-bottom: 1px solid #21262d;
|
||||
}
|
||||
.post-author {
|
||||
font-weight: 600;
|
||||
color: #58a6ff;
|
||||
font-size: 14px;
|
||||
}
|
||||
.post-date {
|
||||
color: #8b949e;
|
||||
font-size: 12px;
|
||||
}
|
||||
.post-title {
|
||||
margin: 0 0 12px 0;
|
||||
font-size: 18px;
|
||||
font-weight: 600;
|
||||
color: #f0f6fc;
|
||||
}
|
||||
.post-content {
|
||||
color: #c9d1d9;
|
||||
word-wrap: break-word;
|
||||
}
|
||||
.post-content img {
|
||||
max-width: 100%;
|
||||
height: auto;
|
||||
border-radius: 4px;
|
||||
}
|
||||
.post-content a {
|
||||
color: #58a6ff;
|
||||
text-decoration: none;
|
||||
}
|
||||
.post-content a:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
.loading {
|
||||
text-align: center;
|
||||
padding: 40px;
|
||||
color: #8b949e;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="header">
|
||||
<div class="icon">💬</div>
|
||||
<h1>Forum Thread</h1>
|
||||
</div>
|
||||
<div class="container">
|
||||
<div id="forum-posts" class="loading">Loading posts...</div>
|
||||
</div>
|
||||
<script>
|
||||
// Fetch the JSONL output and render one card per post.
(async function() {
    try {
        const response = await fetch('{{ output_path }}');
        const text = await response.text();
        // One JSON object per non-empty line.
        const posts = text.trim().split('\n').filter(line => line).map(line => JSON.parse(line));
        const container = document.getElementById('forum-posts');
        // Drop the "Loading posts..." placeholder and its styling.
        container.innerHTML = '';
        container.className = '';

        posts.forEach(post => {
            const postDiv = document.createElement('div');
            postDiv.className = 'post';

            const author = post.author || 'Anonymous';
            const date = post.date ? new Date(post.date).toLocaleString() : '';
            const title = post.title || '';
            const content = post.content || post.body || '';

            // NOTE(review): content is inserted as raw HTML, presumably because
            // post bodies are HTML unless FORUMDL_TEXTIFY is set; consider
            // sanitizing it before display.
            postDiv.innerHTML = `
                <div class="post-header">
                    <span class="post-author">${escapeHtml(author)}</span>
                    <span class="post-date">${escapeHtml(date)}</span>
                </div>
                ${title ? `<h2 class="post-title">${escapeHtml(title)}</h2>` : ''}
                <div class="post-content">${content}</div>
            `;
            container.appendChild(postDiv);
        });

        if (posts.length === 0) {
            container.innerHTML = '<div class="loading">No posts found</div>';
        }
    } catch(e) {
        // A JSON parse error message can quote the fetched text, so it must be
        // escaped before being written into innerHTML.
        document.getElementById('forum-posts').innerHTML = '<div class="loading">Error loading posts: ' + escapeHtml(e.message) + '</div>';
    }
})();

// Escape text for safe interpolation into HTML by round-tripping it through
// a detached element's textContent.
function escapeHtml(text) {
    const div = document.createElement('div');
    div.textContent = text;
    return div.innerHTML;
}
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
1
archivebox/plugins/forumdl/templates/icon.html
Normal file
1
archivebox/plugins/forumdl/templates/icon.html
Normal file
@@ -0,0 +1 @@
|
||||
💬
|
||||
7
archivebox/plugins/forumdl/templates/thumbnail.html
Normal file
7
archivebox/plugins/forumdl/templates/thumbnail.html
Normal file
@@ -0,0 +1,7 @@
|
||||
<!-- Forum thumbnail - shows icon placeholder -->
|
||||
<div class="extractor-thumbnail forumdl-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #1a1a1a; display: flex; align-items: center; justify-content: center;">
|
||||
<div style="display: flex; flex-direction: column; align-items: center; color: #888; font-size: 12px;">
|
||||
<span style="font-size: 32px;">💬</span>
|
||||
<span>Forum</span>
|
||||
</div>
|
||||
</div>
|
||||
157
archivebox/plugins/forumdl/tests/test_forumdl.py
Normal file
157
archivebox/plugins/forumdl/tests/test_forumdl.py
Normal file
@@ -0,0 +1,157 @@
|
||||
"""
|
||||
Integration tests for forumdl plugin
|
||||
|
||||
Tests verify:
|
||||
1. Hook script exists
|
||||
2. Dependencies installed via validation hooks
|
||||
3. Verify deps with abx-pkg
|
||||
4. Forum extraction works on forum URLs
|
||||
5. JSONL output is correct
|
||||
6. Config options work
|
||||
7. Handles non-forum URLs gracefully
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
# All paths are derived from this test file's location.
PLUGIN_DIR = Path(__file__).parent.parent  # two levels above this test file
PLUGINS_ROOT = PLUGIN_DIR.parent  # directory containing PLUGIN_DIR
FORUMDL_HOOK = PLUGIN_DIR / 'on_Snapshot__53_forumdl.py'  # extraction hook under test
FORUMDL_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_forumdl.py'  # dependency validation hook
TEST_URL = 'https://example.com'  # URL passed to the hook by the tests below
|
||||
|
||||
def test_hook_script_exists():
    """Verify on_Snapshot hook exists."""
    # FORUMDL_HOOK is built from this file's location, so a renamed or moved
    # hook script is caught here before the subprocess-based tests run.
    assert FORUMDL_HOOK.exists(), f"Hook not found: {FORUMDL_HOOK}"
|
||||
|
||||
|
||||
def test_forumdl_validate_hook():
    """Test forum-dl validate hook checks for forum-dl.

    The hook is run as a subprocess and its stdout is scanned line by line for
    JSON records. The test passes when either an InstalledBinary record (with
    a non-empty abspath) or a Dependency record for forum-dl is present; the
    hook's exit code is not checked.
    """
    # Run forum-dl validate hook
    result = subprocess.run(
        [sys.executable, str(FORUMDL_VALIDATE_HOOK)],
        capture_output=True,
        text=True,
        timeout=30
    )

    # Hook exits 0 if all binaries found, 1 if any not found
    # Parse output for InstalledBinary and Dependency records
    found_binary = False
    found_dependency = False

    for line in result.stdout.strip().split('\n'):
        if line.strip():
            try:
                record = json.loads(line)
                if record.get('type') == 'InstalledBinary':
                    if record['name'] == 'forum-dl':
                        assert record['abspath'], "forum-dl should have abspath"
                        found_binary = True
                elif record.get('type') == 'Dependency':
                    if record['bin_name'] == 'forum-dl':
                        found_dependency = True
            except json.JSONDecodeError:
                # Lines that are not JSON are ignored.
                pass

    # forum-dl should either be found (InstalledBinary) or missing (Dependency)
    assert found_binary or found_dependency, \
        "forum-dl should have either InstalledBinary or Dependency record"
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
    """Verify forum-dl is available via abx-pkg.

    The test is skipped, not failed, when abx-pkg cannot locate the binary
    through its pip or env providers.
    """
    # Only the names actually used are imported.
    from abx_pkg import Binary, PipProvider, EnvProvider

    missing_binaries = []

    # Verify forum-dl is available
    forumdl_binary = Binary(name='forum-dl', binproviders=[PipProvider(), EnvProvider()])
    forumdl_loaded = forumdl_binary.load()
    if not (forumdl_loaded and forumdl_loaded.abspath):
        missing_binaries.append('forum-dl')

    if missing_binaries:
        pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
|
||||
|
||||
|
||||
def test_handles_non_forum_url():
    """Run the extraction hook on a non-forum URL and expect a clean, well-formed result.

    NOTE(review): nothing in this test checks that forum-dl is installed; the
    hook exits 1 when the binary is missing, so this presumably relies on the
    environment providing it - confirm.
    """
    hook_cmd = [sys.executable, str(FORUMDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789']

    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)

        # Run forum-dl extraction hook on non-forum URL
        proc = subprocess.run(
            hook_cmd,
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=60
        )

        # Should exit 0 even for non-forum URL
        assert proc.returncode == 0, f"Should handle non-forum URL gracefully: {proc.stderr}"

        # Verify the key=value summary lines are present
        assert 'STATUS=' in proc.stdout, "Should report status"
        assert 'RESULT_JSON=' in proc.stdout, "Should output RESULT_JSON"

        # Decode the first RESULT_JSON= line, if any
        prefix = 'RESULT_JSON='
        result_json = next(
            (json.loads(line[len(prefix):]) for line in proc.stdout.split('\n') if line.startswith(prefix)),
            None,
        )

        assert result_json, "Should have RESULT_JSON"
        assert result_json['extractor'] == 'forumdl'
|
||||
|
||||
|
||||
def test_config_save_forumdl_false_skips():
    """With SAVE_FORUMDL=False the hook must exit 0 and still print a STATUS line."""
    import os

    # Inherit the current environment, overriding only the feature toggle.
    hook_env = {**os.environ, 'SAVE_FORUMDL': 'False'}
    hook_cmd = [sys.executable, str(FORUMDL_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999']

    with tempfile.TemporaryDirectory() as workdir:
        proc = subprocess.run(
            hook_cmd,
            cwd=workdir,
            capture_output=True,
            text=True,
            env=hook_env,
            timeout=30
        )

        assert proc.returncode == 0, f"Should exit 0 when skipping: {proc.stderr}"
        assert 'STATUS=' in proc.stdout
|
||||
|
||||
|
||||
def test_config_timeout():
    """Run the hook with FORUMDL_TIMEOUT=5 and expect it to finish with exit code 0.

    NOTE(review): the hook exits 1 when forum-dl is missing or when the
    download itself times out, so a zero exit code presumably relies on
    forum-dl being installed and answering within five seconds - confirm.
    """
    import os

    # Inherit the current environment, overriding only the timeout.
    hook_env = {**os.environ, 'FORUMDL_TIMEOUT': '5'}
    hook_cmd = [sys.executable, str(FORUMDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'testtimeout']

    with tempfile.TemporaryDirectory() as workdir:
        proc = subprocess.run(
            hook_cmd,
            cwd=workdir,
            capture_output=True,
            text=True,
            env=hook_env,
            timeout=30
        )

        assert proc.returncode == 0, "Should complete without hanging"
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
Reference in New Issue
Block a user