add papersdl plugin

This commit is contained in:
Nick Sweeting
2025-12-26 18:25:52 -08:00
parent e2cbcd17f6
commit 6fdc52cc57
7 changed files with 612 additions and 0 deletions

View File

@@ -0,0 +1,129 @@
#!/usr/bin/env python3
"""
Validation hook for papers-dl.
Runs at crawl start to verify papers-dl binary is available.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str, version_flag: str = '--version') -> str | None:
"""Get version string from binary."""
try:
result = subprocess.run(
[abspath, version_flag],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
first_line = result.stdout.strip().split('\n')[0]
return first_line[:64]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_papersdl() -> dict | None:
    """Locate the papers-dl binary and describe it.

    Lookup order:
      1. abx_pkg (pip and env providers), when the package is importable
         and the lookup succeeds.
      2. The PAPERSDL_BINARY environment variable, if it names an existing file.
      3. ``papers-dl`` on PATH via shutil.which.

    Returns:
        A dict with the keys ``name``, ``abspath``, ``version``, ``sha256``
        and ``binprovider``, or None when no binary is found.
    """
    try:
        from abx_pkg import Binary, PipProvider, EnvProvider

        class PapersdlBinary(Binary):
            name: str = 'papers-dl'
            binproviders_supported = [PipProvider(), EnvProvider()]

        binary = PapersdlBinary()
        loaded = binary.load()
        if loaded and loaded.abspath:
            return {
                'name': 'papers-dl',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': getattr(loaded, 'sha256', None),
                'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
            }
    except Exception:
        # Deliberately best-effort: abx_pkg may be missing (ImportError) or
        # fail while loading; either way fall through to the manual lookup.
        pass

    # Manual fallback. An explicitly configured PAPERSDL_BINARY takes
    # precedence over whatever happens to be on PATH.
    configured = os.environ.get('PAPERSDL_BINARY', '').strip()
    on_path = shutil.which('papers-dl') or ''
    for abspath in (configured, on_path):
        if abspath and Path(abspath).is_file():
            return {
                'name': 'papers-dl',
                'abspath': abspath,
                'version': get_binary_version(abspath),
                'sha256': get_binary_hash(abspath),
                'binprovider': 'env',
            }
    return None
def main():
    """Report papers-dl availability as JSONL on stdout and exit.

    When the binary is found, prints an InstalledBinary record followed by
    Machine config updates for PAPERSDL_BINARY (and PAPERSDL_VERSION when a
    version is known), then exits 0. Otherwise prints a Dependency record,
    lists the missing dependency on stderr and exits 1.
    """
    found = find_papersdl()
    missing = []
    records = []

    if found and found.get('abspath'):
        records.append({
            'type': 'InstalledBinary',
            'name': found['name'],
            'abspath': found['abspath'],
            'version': found['version'],
            'sha256': found['sha256'],
            'binprovider': found['binprovider'],
        })
        records.append({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/PAPERSDL_BINARY',
            'value': found['abspath'],
        })
        # The version update is only emitted when a version was detected.
        if found['version']:
            records.append({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/PAPERSDL_VERSION',
                'value': found['version'],
            })
    else:
        records.append({
            'type': 'Dependency',
            'bin_name': 'papers-dl',
            'bin_providers': 'pip,env',
        })
        missing.append('papers-dl')

    # One JSON document per line (JSONL).
    for record in records:
        print(json.dumps(record))

    if not missing:
        sys.exit(0)

    print(f"Missing dependencies: {', '.join(missing)}", file=sys.stderr)
    sys.exit(1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,232 @@
#!/usr/bin/env python3
"""
Download scientific papers from a URL using papers-dl.
Usage: on_Snapshot__papersdl.py --url=<url> --snapshot-id=<uuid>
Output: Downloads paper PDFs to $PWD/
Environment variables:
PAPERSDL_BINARY: Path to papers-dl binary
PAPERSDL_TIMEOUT: Timeout in seconds (default: 300 for paper downloads)
PAPERSDL_EXTRA_ARGS: Extra arguments for papers-dl (space-separated)
# papers-dl feature toggles
SAVE_PAPERSDL: Enable papers-dl paper extraction (default: True)
# Fallback to ARCHIVING_CONFIG values if PAPERSDL_* not set:
TIMEOUT: Fallback timeout
"""
import json
import os
import re
import shutil
import subprocess
import sys
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'papersdl'
BIN_NAME = 'papers-dl'
BIN_PROVIDERS = 'pip,env'
OUTPUT_DIR = '.'
def get_env(name: str, default: str = '') -> str:
    """Return environment variable ``name`` (or ``default``) with surrounding whitespace removed."""
    raw_value = os.getenv(name, default)
    return raw_value.strip()
def get_env_bool(name: str, default: bool = False) -> bool:
    """Interpret environment variable ``name`` as a boolean.

    Recognises true/1/yes/on and false/0/no/off, case-insensitively.
    Any other value, including unset, yields ``default``.
    """
    truthy = ('true', '1', 'yes', 'on')
    falsy = ('false', '0', 'no', 'off')

    normalized = get_env(name, '').lower()
    if normalized in truthy:
        return True
    return False if normalized in falsy else default
def get_env_int(name: str, default: int = 0) -> int:
    """Parse environment variable ``name`` as an int.

    Returns ``default`` when the variable is unset or is not a valid integer.
    """
    try:
        parsed = int(get_env(name, str(default)))
    except ValueError:
        parsed = default
    return parsed
def find_papersdl() -> str | None:
    """Return the path of the papers-dl binary, or None if it cannot be found.

    PAPERSDL_BINARY wins when it points at an existing file; otherwise PATH
    is searched for ``papers-dl``.
    """
    configured = get_env('PAPERSDL_BINARY')
    if configured and os.path.isfile(configured):
        return configured

    return shutil.which('papers-dl') or None
def get_version(binary: str) -> str:
    """Return the first line of ``binary --version``, or '' if unavailable.

    Only the first line of stdout is kept (truncated to 64 characters) so
    the value always fits on a single ``VERSION=...`` output line, even if
    the tool prints a multi-line banner.

    Returns:
        The version line, or an empty string when the binary cannot be run,
        times out (10 seconds), or prints nothing on stdout.
    """
    try:
        result = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
    except (OSError, ValueError, subprocess.SubprocessError):
        # Version detection is best-effort; an unknown version is not fatal.
        return ''

    lines = result.stdout.strip().splitlines()
    return lines[0].strip()[:64] if lines else ''
def extract_doi_from_url(url: str) -> str | None:
"""Extract DOI from common paper URLs."""
# Match DOI pattern in URL
doi_pattern = r'10\.\d{4,}/[^\s]+'
match = re.search(doi_pattern, url)
if match:
return match.group(0)
return None
def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Download a paper using papers-dl into the current output directory.

    The identifier passed to ``papers-dl fetch`` is the DOI extracted from
    ``url`` when one is found, otherwise the URL itself.

    Environment config:
        PAPERSDL_TIMEOUT: timeout in seconds (falls back to TIMEOUT, then 300).
        PAPERSDL_EXTRA_ARGS: extra arguments appended to the command; parsed
            with shell-style quoting so quoted values stay intact.

    Returns: (success, output_path, error_message)
        (True, <pdf path>, '') when a PDF is present after the run,
        (True, None, '') when papers-dl found no paper or exited cleanly,
        (False, None, <message>) on errors and timeouts.
    """
    import shlex

    # Get config from env
    timeout = get_env_int('PAPERSDL_TIMEOUT') or get_env_int('TIMEOUT', 300)
    extra_args = get_env('PAPERSDL_EXTRA_ARGS', '')

    # Output directory is current directory (hook already runs in output dir)
    output_dir = Path(OUTPUT_DIR)

    # Prefer a DOI; if none is found, papers-dl might handle the URL directly
    identifier = extract_doi_from_url(url) or url

    # Build command - papers-dl fetch <identifier> -o <output_dir>
    cmd = [binary, 'fetch', identifier, '-o', str(output_dir)]
    if extra_args:
        try:
            cmd.extend(shlex.split(extra_args))
        except ValueError:
            # Unbalanced quotes: fall back to plain whitespace splitting
            # rather than refusing to run.
            cmd.extend(extra_args.split())

    try:
        result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)

        # Check if any PDF files are present; sorted so the file reported
        # is deterministic when there are several.
        # NOTE(review): a PDF that was already in the directory before this
        # run also counts as success - confirm that is intended.
        pdf_files = sorted(output_dir.glob('*.pdf'))
        if pdf_files:
            return True, str(pdf_files[0]), ''

        stderr = result.stderr
        stdout = result.stdout
        stderr_lower = stderr.lower()
        stdout_lower = stdout.lower()

        # These are NOT errors - page simply has no downloadable paper.
        # They are checked before the 404/403 tests below, so output
        # containing "not found" (any case) is treated as success.
        if 'not found' in stderr_lower or 'not found' in stdout_lower:
            return True, None, ''  # Paper not available - success, no output
        if 'no results' in stderr_lower or 'no results' in stdout_lower:
            return True, None, ''  # No paper found - success, no output
        if result.returncode == 0:
            return True, None, ''  # papers-dl exited cleanly, just no paper - success

        # These ARE errors - something went wrong
        if '404' in stderr or '404' in stdout:
            return False, None, '404 Not Found'
        if '403' in stderr or '403' in stdout:
            return False, None, '403 Forbidden'
        return False, None, f'papers-dl error: {stderr[:200] or stdout[:200]}'
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='URL to download paper from')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Download scientific paper from a URL using papers-dl."""
    # Defaults describe a failed run; they are overwritten as the hook progresses.
    status, error = 'failed', ''
    version, cmd_str = '', ''
    output = None
    binary = None

    try:
        # Feature toggle: report "skipped" and exit successfully when disabled.
        if not get_env_bool('SAVE_PAPERSDL', True):
            print('Skipping papers-dl (SAVE_PAPERSDL=False)')
            status = 'skipped'
            print(f'STATUS={status}')
            skipped_payload = {"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id}
            print(f'RESULT_JSON={json.dumps(skipped_payload)}')
            sys.exit(0)

        # Without the binary nothing can run: emit install hints on stderr and exit 1.
        binary = find_papersdl()
        if not binary:
            hint_lines = (
                f'ERROR: {BIN_NAME} binary not found',
                f'DEPENDENCY_NEEDED={BIN_NAME}',
                f'BIN_PROVIDERS={BIN_PROVIDERS}',
                'INSTALL_HINT=pip install papers-dl',
            )
            for hint in hint_lines:
                print(hint, file=sys.stderr)
            sys.exit(1)

        version = get_version(binary)
        cmd_str = f'{binary} fetch {url}'

        # Run extraction
        success, output, error = save_paper(url, binary)
        status = 'succeeded' if success else 'failed'

        if success and output:
            saved = Path(output)
            size_bytes = saved.stat().st_size
            print(f'papers-dl completed: {saved.name} ({size_bytes} bytes)')
        elif success:
            print('papers-dl completed: no paper found for this URL (this is normal)')
    except Exception as exc:
        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so the early exits above are not intercepted here.
        error = f'{type(exc).__name__}: {exc}'
        status = 'failed'

    # Key=value summary lines; empty values are omitted.
    for label, value in (('CMD', cmd_str), ('VERSION', version), ('OUTPUT', output)):
        if value:
            print(f'{label}={value}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)

    # Machine-readable result record
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')

    exit_code = 0 if status == 'succeeded' else 1
    sys.exit(exit_code)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,15 @@
<!-- Embedded paper view - shows PDF viewer -->
<div class="extractor-embed papersdl-embed" style="width: 100%; max-width: 900px; margin: 0 auto; background: #1a1a1a; padding: 20px; border-radius: 8px;">
<div style="text-align: center; padding: 15px 0; border-bottom: 1px solid #333; margin-bottom: 20px;">
<span style="font-size: 32px;">📄</span>
<h3 style="margin: 10px 0; color: #fff; font-size: 18px;">Scientific Paper</h3>
</div>
<div style="width: 100%; height: 500px; background: #2a2a2a; border-radius: 5px; overflow: hidden;">
<embed src="{{ output_path }}" type="application/pdf" width="100%" height="100%" />
</div>
<div style="margin-top: 15px; text-align: center;">
<a href="{{ output_path }}" download style="color: #4a9eff; text-decoration: none; padding: 10px 20px; background: #2a2a2a; border-radius: 5px; display: inline-block;">
Download PDF
</a>
</div>
</div>

View File

@@ -0,0 +1,71 @@
<!-- Fullscreen paper view - shows PDF in full screen -->
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Scientific Paper</title>
<style>
body {
margin: 0;
padding: 0;
background: #1a1a1a;
color: #ddd;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif;
display: flex;
flex-direction: column;
height: 100vh;
}
.header {
text-align: center;
padding: 15px;
background: #0d1117;
border-bottom: 1px solid #30363d;
}
.icon {
font-size: 32px;
margin-bottom: 5px;
}
h1 {
margin: 0;
font-size: 20px;
color: #f0f6fc;
}
.pdf-container {
flex: 1;
width: 100%;
overflow: hidden;
}
embed {
width: 100%;
height: 100%;
}
.download-link {
position: fixed;
bottom: 20px;
right: 20px;
background: #58a6ff;
color: #fff;
padding: 12px 24px;
border-radius: 6px;
text-decoration: none;
font-weight: 600;
box-shadow: 0 4px 6px rgba(0,0,0,0.3);
transition: background 0.2s;
}
.download-link:hover {
background: #1f6feb;
}
</style>
</head>
<body>
<div class="header">
<div class="icon">📄</div>
<h1>Scientific Paper</h1>
</div>
<div class="pdf-container">
<embed src="{{ output_path }}" type="application/pdf" />
</div>
<a href="{{ output_path }}" download class="download-link">Download PDF</a>
</body>
</html>

View File

@@ -0,0 +1 @@
📄

View File

@@ -0,0 +1,7 @@
<!-- Paper thumbnail - shows PDF icon placeholder -->
<div class="extractor-thumbnail papersdl-thumbnail" style="width: 100%; height: 100px; overflow: hidden; background: #1a1a1a; display: flex; align-items: center; justify-content: center;">
<div style="display: flex; flex-direction: column; align-items: center; color: #888; font-size: 12px;">
<span style="font-size: 32px;">📄</span>
<span>Paper</span>
</div>
</div>

View File

@@ -0,0 +1,157 @@
"""
Integration tests for papersdl plugin
Tests verify:
1. Hook script exists
2. Dependencies installed via validation hooks
3. Verify deps with abx-pkg
4. Paper extraction works on paper URLs
5. JSONL output is correct
6. Config options work
7. Handles non-paper URLs gracefully
"""
import json
import subprocess
import sys
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
PAPERSDL_HOOK = PLUGIN_DIR / 'on_Snapshot__54_papersdl.py'
PAPERSDL_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_papersdl.py'
TEST_URL = 'https://example.com'
def test_hook_script_exists():
    """The on_Snapshot hook script must be present in the plugin directory."""
    hook_present = PAPERSDL_HOOK.exists()
    assert hook_present, f"Hook not found: {PAPERSDL_HOOK}"
def test_papersdl_validate_hook():
    """The validate hook must emit a papers-dl record whether or not the binary exists."""
    proc = subprocess.run(
        [sys.executable, str(PAPERSDL_VALIDATE_HOOK)],
        capture_output=True,
        text=True,
        timeout=30
    )

    # The hook exits 0 when the binary is found and 1 otherwise, so the exit
    # code is not asserted; only the JSONL records on stdout are inspected.
    saw_installed = False
    saw_missing = False
    for raw_line in proc.stdout.strip().split('\n'):
        if not raw_line.strip():
            continue
        try:
            record = json.loads(raw_line)
        except json.JSONDecodeError:
            # Non-JSON lines are ignored.
            continue

        record_type = record.get('type')
        if record_type == 'InstalledBinary' and record['name'] == 'papers-dl':
            assert record['abspath'], "papers-dl should have abspath"
            saw_installed = True
        elif record_type == 'Dependency' and record['bin_name'] == 'papers-dl':
            saw_missing = True

    # papers-dl should either be found (InstalledBinary) or missing (Dependency)
    assert saw_installed or saw_missing, \
        "papers-dl should have either InstalledBinary or Dependency record"
def test_verify_deps_with_abx_pkg():
    """Check papers-dl availability via abx-pkg; skip when it is not installed."""
    # Only the names actually used are imported, so the test cannot fail on
    # an unrelated symbol being absent from the installed abx_pkg.
    from abx_pkg import Binary, PipProvider, EnvProvider

    missing_binaries = []

    # Verify papers-dl is available
    papersdl_binary = Binary(name='papers-dl', binproviders=[PipProvider(), EnvProvider()])
    papersdl_loaded = papersdl_binary.load()
    if not (papersdl_loaded and papersdl_loaded.abspath):
        missing_binaries.append('papers-dl')

    if missing_binaries:
        pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted")
def test_handles_non_paper_url():
    """The extraction hook should succeed (exit 0) on a URL that has no paper."""
    # Prerequisites checked by earlier test
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)

        # Run the hook inside the scratch directory.
        hook_cmd = [sys.executable, str(PAPERSDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789']
        proc = subprocess.run(
            hook_cmd,
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=60
        )

        # Should exit 0 even for non-paper URL
        assert proc.returncode == 0, f"Should handle non-paper URL gracefully: {proc.stderr}"

        # Both summary markers must be present on stdout.
        assert 'STATUS=' in proc.stdout, "Should report status"
        assert 'RESULT_JSON=' in proc.stdout, "Should output RESULT_JSON"

        # Decode the first RESULT_JSON line, if there is one.
        payloads = (
            line.split('=', 1)[1]
            for line in proc.stdout.split('\n')
            if line.startswith('RESULT_JSON=')
        )
        first_payload = next(payloads, None)
        result_json = json.loads(first_payload) if first_payload is not None else None

        assert result_json, "Should have RESULT_JSON"
        assert result_json['extractor'] == 'papersdl'
def test_config_save_papersdl_false_skips():
    """Test that SAVE_PAPERSDL=False makes the hook report a skip and exit 0."""
    import os

    with tempfile.TemporaryDirectory() as tmpdir:
        env = os.environ.copy()
        env['SAVE_PAPERSDL'] = 'False'

        result = subprocess.run(
            [sys.executable, str(PAPERSDL_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )

        assert result.returncode == 0, f"Should exit 0 when skipping: {result.stderr}"
        # Check the status value itself: any run prints "STATUS=", but only
        # the disabled path prints "STATUS=skipped".
        assert 'STATUS=skipped' in result.stdout, f"Should report skipped status: {result.stdout}"
def test_config_timeout():
    """With PAPERSDL_TIMEOUT=5 the hook should finish and exit 0."""
    import os

    with tempfile.TemporaryDirectory() as tmpdir:
        # Inherit the current environment and override only the timeout.
        hook_env = dict(os.environ, PAPERSDL_TIMEOUT='5')
        hook_cmd = [sys.executable, str(PAPERSDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'testtimeout']

        proc = subprocess.run(
            hook_cmd,
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=hook_env,
            timeout=30
        )

        assert proc.returncode == 0, "Should complete without hanging"


if __name__ == '__main__':
    pytest.main([__file__, '-v'])