From 6fdc52cc578e947b87cbcb2239068d05d468334e Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 26 Dec 2025 18:25:52 -0800 Subject: [PATCH] add papersdl plugin --- .../on_Crawl__00_validate_papersdl.py | 129 ++++++++++ .../papersdl/on_Snapshot__54_papersdl.py | 232 ++++++++++++++++++ .../plugins/papersdl/templates/embed.html | 15 ++ .../papersdl/templates/fullscreen.html | 71 ++++++ .../plugins/papersdl/templates/icon.html | 1 + .../plugins/papersdl/templates/thumbnail.html | 7 + .../plugins/papersdl/tests/test_papersdl.py | 157 ++++++++++++ 7 files changed, 612 insertions(+) create mode 100755 archivebox/plugins/papersdl/on_Crawl__00_validate_papersdl.py create mode 100755 archivebox/plugins/papersdl/on_Snapshot__54_papersdl.py create mode 100644 archivebox/plugins/papersdl/templates/embed.html create mode 100644 archivebox/plugins/papersdl/templates/fullscreen.html create mode 100644 archivebox/plugins/papersdl/templates/icon.html create mode 100644 archivebox/plugins/papersdl/templates/thumbnail.html create mode 100644 archivebox/plugins/papersdl/tests/test_papersdl.py diff --git a/archivebox/plugins/papersdl/on_Crawl__00_validate_papersdl.py b/archivebox/plugins/papersdl/on_Crawl__00_validate_papersdl.py new file mode 100755 index 00000000..5dda5650 --- /dev/null +++ b/archivebox/plugins/papersdl/on_Crawl__00_validate_papersdl.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +""" +Validation hook for papers-dl. + +Runs at crawl start to verify papers-dl binary is available. +Outputs JSONL for InstalledBinary and Machine config updates. +""" + +import os +import sys +import json +import shutil +import hashlib +import subprocess +from pathlib import Path + + +def get_binary_version(abspath: str, version_flag: str = '--version') -> str | None: + """Get version string from binary.""" + try: + result = subprocess.run( + [abspath, version_flag], + capture_output=True, + text=True, + timeout=5, + ) + if result.returncode == 0 and result.stdout: + first_line = result.stdout.strip().split('\n')[0] + return first_line[:64] + except Exception: + pass + return None + + +def get_binary_hash(abspath: str) -> str | None: + """Get SHA256 hash of binary.""" + try: + with open(abspath, 'rb') as f: + return hashlib.sha256(f.read()).hexdigest() + except Exception: + return None + + +def find_papersdl() -> dict | None: + """Find papers-dl binary.""" + try: + from abx_pkg import Binary, PipProvider, EnvProvider + + class PapersdlBinary(Binary): + name: str = 'papers-dl' + binproviders_supported = [PipProvider(), EnvProvider()] + + binary = PapersdlBinary() + loaded = binary.load() + if loaded and loaded.abspath: + return { + 'name': 'papers-dl', + 'abspath': str(loaded.abspath), + 'version': str(loaded.version) if loaded.version else None, + 'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None, + 'binprovider': loaded.binprovider.name if loaded.binprovider else 'env', + } + except ImportError: + pass + except Exception: + pass + + # Fallback to shutil.which + abspath = shutil.which('papers-dl') or os.environ.get('PAPERSDL_BINARY', '') + if abspath and Path(abspath).is_file(): + return { + 'name': 'papers-dl', + 'abspath': abspath, + 'version': get_binary_version(abspath), + 'sha256': get_binary_hash(abspath), + 'binprovider': 'env', + } + + return None + + +def main(): + # Check for papers-dl (required) + papersdl_result = find_papersdl() + + missing_deps = [] + + # Emit results for papers-dl + if papersdl_result and papersdl_result.get('abspath'): + print(json.dumps({ + 'type': 
'InstalledBinary', + 'name': papersdl_result['name'], + 'abspath': papersdl_result['abspath'], + 'version': papersdl_result['version'], + 'sha256': papersdl_result['sha256'], + 'binprovider': papersdl_result['binprovider'], + })) + + print(json.dumps({ + 'type': 'Machine', + '_method': 'update', + 'key': 'config/PAPERSDL_BINARY', + 'value': papersdl_result['abspath'], + })) + + if papersdl_result['version']: + print(json.dumps({ + 'type': 'Machine', + '_method': 'update', + 'key': 'config/PAPERSDL_VERSION', + 'value': papersdl_result['version'], + })) + else: + print(json.dumps({ + 'type': 'Dependency', + 'bin_name': 'papers-dl', + 'bin_providers': 'pip,env', + })) + missing_deps.append('papers-dl') + + if missing_deps: + print(f"Missing dependencies: {', '.join(missing_deps)}", file=sys.stderr) + sys.exit(1) + else: + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/papersdl/on_Snapshot__54_papersdl.py b/archivebox/plugins/papersdl/on_Snapshot__54_papersdl.py new file mode 100755 index 00000000..b133194b --- /dev/null +++ b/archivebox/plugins/papersdl/on_Snapshot__54_papersdl.py @@ -0,0 +1,232 @@ +#!/usr/bin/env python3 +""" +Download scientific papers from a URL using papers-dl. + +Usage: on_Snapshot__papersdl.py --url= --snapshot-id= +Output: Downloads paper PDFs to $PWD/ + +Environment variables: + PAPERSDL_BINARY: Path to papers-dl binary + PAPERSDL_TIMEOUT: Timeout in seconds (default: 300 for paper downloads) + PAPERSDL_EXTRA_ARGS: Extra arguments for papers-dl (space-separated) + + # papers-dl feature toggles + SAVE_PAPERSDL: Enable papers-dl paper extraction (default: True) + + # Fallback to ARCHIVING_CONFIG values if PAPERSDL_* not set: + TIMEOUT: Fallback timeout +""" + +import json +import os +import re +import shutil +import subprocess +import sys +from pathlib import Path + +import rich_click as click + + +# Extractor metadata +EXTRACTOR_NAME = 'papersdl' +BIN_NAME = 'papers-dl' +BIN_PROVIDERS = 'pip,env' +OUTPUT_DIR = '.' + + +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, '').lower() + if val in ('true', '1', 'yes', 'on'): + return True + if val in ('false', '0', 'no', 'off'): + return False + return default + + +def get_env_int(name: str, default: int = 0) -> int: + try: + return int(get_env(name, str(default))) + except ValueError: + return default + + +def find_papersdl() -> str | None: + """Find papers-dl binary.""" + papersdl = get_env('PAPERSDL_BINARY') + if papersdl and os.path.isfile(papersdl): + return papersdl + + binary = shutil.which('papers-dl') + if binary: + return binary + + return None + + +def get_version(binary: str) -> str: + """Get papers-dl version.""" + try: + result = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10) + return result.stdout.strip()[:64] + except Exception: + return '' + + +def extract_doi_from_url(url: str) -> str | None: + """Extract DOI from common paper URLs.""" + # Match DOI pattern in URL + doi_pattern = r'10\.\d{4,}/[^\s]+' + match = re.search(doi_pattern, url) + if match: + return match.group(0) + return None + + +def save_paper(url: str, binary: str) -> tuple[bool, str | None, str]: + """ + Download paper using papers-dl. 
+ + Returns: (success, output_path, error_message) + """ + # Get config from env + timeout = get_env_int('PAPERSDL_TIMEOUT') or get_env_int('TIMEOUT', 300) + extra_args = get_env('PAPERSDL_EXTRA_ARGS', '') + + # Output directory is current directory (hook already runs in output dir) + output_dir = Path(OUTPUT_DIR) + + # Try to extract DOI from URL + doi = extract_doi_from_url(url) + if not doi: + # If no DOI found, papers-dl might handle the URL directly + identifier = url + else: + identifier = doi + + # Build command - papers-dl fetch -o + cmd = [binary, 'fetch', identifier, '-o', str(output_dir)] + + if extra_args: + cmd.extend(extra_args.split()) + + try: + result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True) + + # Check if any PDF files were downloaded + pdf_files = list(output_dir.glob('*.pdf')) + + if pdf_files: + # Return first PDF file + return True, str(pdf_files[0]), '' + else: + stderr = result.stderr + stdout = result.stdout + + # These are NOT errors - page simply has no downloadable paper + stderr_lower = stderr.lower() + stdout_lower = stdout.lower() + if 'not found' in stderr_lower or 'not found' in stdout_lower: + return True, None, '' # Paper not available - success, no output + if 'no results' in stderr_lower or 'no results' in stdout_lower: + return True, None, '' # No paper found - success, no output + if result.returncode == 0: + return True, None, '' # papers-dl exited cleanly, just no paper - success + + # These ARE errors - something went wrong + if '404' in stderr or '404' in stdout: + return False, None, '404 Not Found' + if '403' in stderr or '403' in stdout: + return False, None, '403 Forbidden' + + return False, None, f'papers-dl error: {stderr[:200] or stdout[:200]}' + + except subprocess.TimeoutExpired: + return False, None, f'Timed out after {timeout} seconds' + except Exception as e: + return False, None, f'{type(e).__name__}: {e}' + + +@click.command() +@click.option('--url', required=True, help='URL to download paper from') +@click.option('--snapshot-id', required=True, help='Snapshot UUID') +def main(url: str, snapshot_id: str): + """Download scientific paper from a URL using papers-dl.""" + + version = '' + output = None + status = 'failed' + error = '' + binary = None + cmd_str = '' + + try: + # Check if papers-dl is enabled + if not get_env_bool('SAVE_PAPERSDL', True): + print('Skipping papers-dl (SAVE_PAPERSDL=False)') + status = 'skipped' + print(f'STATUS={status}') + print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}') + sys.exit(0) + + # Find binary + binary = find_papersdl() + if not binary: + print(f'ERROR: {BIN_NAME} binary not found', file=sys.stderr) + print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr) + print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr) + print(f'INSTALL_HINT=pip install papers-dl', file=sys.stderr) + sys.exit(1) + + version = get_version(binary) + cmd_str = f'{binary} fetch {url}' + + # Run extraction + success, output, error = save_paper(url, binary) + status = 'succeeded' if success else 'failed' + + if success: + if output: + output_path = Path(output) + file_size = output_path.stat().st_size + print(f'papers-dl completed: {output_path.name} ({file_size} bytes)') + else: + print(f'papers-dl completed: no paper found for this URL (this is normal)') + + except Exception as e: + error = f'{type(e).__name__}: {e}' + status = 'failed' + + # Print results + if cmd_str: + print(f'CMD={cmd_str}') + if version: + 
print(f'VERSION={version}') + if output: + print(f'OUTPUT={output}') + print(f'STATUS={status}') + + if error: + print(f'ERROR={error}', file=sys.stderr) + + # Print JSON result + result_json = { + 'extractor': EXTRACTOR_NAME, + 'url': url, + 'snapshot_id': snapshot_id, + 'status': status, + 'cmd_version': version, + 'output': output, + 'error': error or None, + } + print(f'RESULT_JSON={json.dumps(result_json)}') + + sys.exit(0 if status == 'succeeded' else 1) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/papersdl/templates/embed.html b/archivebox/plugins/papersdl/templates/embed.html new file mode 100644 index 00000000..45ef7d71 --- /dev/null +++ b/archivebox/plugins/papersdl/templates/embed.html @@ -0,0 +1,15 @@ + +
[embed.html body (markup garbled in transit): a small card with a 📄 icon and the label "Scientific Paper"]
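For context on the Snapshot hook above: it reports results to its caller purely over stdout, as KEY=VALUE lines (CMD=, VERSION=, OUTPUT=, STATUS=) followed by a single RESULT_JSON= payload. A minimal parsing sketch, assuming the hook is launched as a subprocess by some runner (the runner itself is not part of this patch; function and variable names here are illustrative):

# Hypothetical consumer of the on_Snapshot__54_papersdl.py stdout contract.
import json
import subprocess
import sys

def run_papersdl_hook(hook_path: str, url: str, snapshot_id: str, cwd: str) -> dict:
    """Run the hook in an output directory and parse its KEY=VALUE / RESULT_JSON lines."""
    proc = subprocess.run(
        [sys.executable, hook_path, '--url', url, '--snapshot-id', snapshot_id],
        cwd=cwd, capture_output=True, text=True, timeout=600,
    )
    parsed: dict = {'returncode': proc.returncode}
    for line in proc.stdout.splitlines():
        if line.startswith('RESULT_JSON='):
            parsed['result'] = json.loads(line.split('=', 1)[1])
        elif '=' in line and line.split('=', 1)[0] in ('CMD', 'VERSION', 'OUTPUT', 'STATUS'):
            key, value = line.split('=', 1)
            parsed[key.lower()] = value
    return parsed

The same STATUS= plus RESULT_JSON= convention is what the tests at the end of this patch assert on.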
diff --git a/archivebox/plugins/papersdl/templates/fullscreen.html b/archivebox/plugins/papersdl/templates/fullscreen.html
new file mode 100644
index 00000000..f2cee0c8
--- /dev/null
+++ b/archivebox/plugins/papersdl/templates/fullscreen.html
@@ -0,0 +1,71 @@
[fullscreen.html body (markup garbled in transit): a page titled "Scientific Paper" with a 📄 icon, a "Scientific Paper" heading, and a "Download PDF" link]
diff --git a/archivebox/plugins/papersdl/templates/icon.html b/archivebox/plugins/papersdl/templates/icon.html
new file mode 100644
index 00000000..063530f3
--- /dev/null
+++ b/archivebox/plugins/papersdl/templates/icon.html
@@ -0,0 +1 @@
+📄
\ No newline at end of file
diff --git a/archivebox/plugins/papersdl/templates/thumbnail.html b/archivebox/plugins/papersdl/templates/thumbnail.html
new file mode 100644
index 00000000..abe6f09a
--- /dev/null
+++ b/archivebox/plugins/papersdl/templates/thumbnail.html
@@ -0,0 +1,7 @@
[thumbnail.html body (markup garbled in transit): a 📄 icon with the caption "Paper"]
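Before the tests: the Crawl validation hook at the top of this patch speaks newline-delimited JSON on stdout (InstalledBinary and Machine records when papers-dl is found, a Dependency record when it is not), and test_papersdl_validate_hook below parses it line by line. A small sketch of collecting those records outside the test; the grouping logic is an assumption for illustration, not ArchiveBox's own runner code:

# Hypothetical reader for on_Crawl__00_validate_papersdl.py output;
# non-JSON stdout lines are simply ignored.
import json
import subprocess
import sys

def collect_validation_records(hook_path: str) -> dict[str, list[dict]]:
    proc = subprocess.run([sys.executable, hook_path], capture_output=True, text=True, timeout=60)
    records: dict[str, list[dict]] = {}
    for line in proc.stdout.splitlines():
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue
        records.setdefault(record.get('type', 'unknown'), []).append(record)
    return records

# Per the hook's own logic: a successful run emits one InstalledBinary record plus Machine
# updates for PAPERSDL_BINARY (and PAPERSDL_VERSION when known); a missing binary emits a
# single Dependency record and the hook exits with code 1.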
diff --git a/archivebox/plugins/papersdl/tests/test_papersdl.py b/archivebox/plugins/papersdl/tests/test_papersdl.py new file mode 100644 index 00000000..bb2f3ec3 --- /dev/null +++ b/archivebox/plugins/papersdl/tests/test_papersdl.py @@ -0,0 +1,157 @@ +""" +Integration tests for papersdl plugin + +Tests verify: +1. Hook script exists +2. Dependencies installed via validation hooks +3. Verify deps with abx-pkg +4. Paper extraction works on paper URLs +5. JSONL output is correct +6. Config options work +7. Handles non-paper URLs gracefully +""" + +import json +import subprocess +import sys +import tempfile +from pathlib import Path +import pytest + +PLUGIN_DIR = Path(__file__).parent.parent +PLUGINS_ROOT = PLUGIN_DIR.parent +PAPERSDL_HOOK = PLUGIN_DIR / 'on_Snapshot__54_papersdl.py' +PAPERSDL_VALIDATE_HOOK = PLUGIN_DIR / 'on_Crawl__00_validate_papersdl.py' +TEST_URL = 'https://example.com' + +def test_hook_script_exists(): + """Verify on_Snapshot hook exists.""" + assert PAPERSDL_HOOK.exists(), f"Hook not found: {PAPERSDL_HOOK}" + + +def test_papersdl_validate_hook(): + """Test papers-dl validate hook checks for papers-dl.""" + # Run papers-dl validate hook + result = subprocess.run( + [sys.executable, str(PAPERSDL_VALIDATE_HOOK)], + capture_output=True, + text=True, + timeout=30 + ) + + # Hook exits 0 if all binaries found, 1 if any not found + # Parse output for InstalledBinary and Dependency records + found_binary = False + found_dependency = False + + for line in result.stdout.strip().split('\n'): + if line.strip(): + try: + record = json.loads(line) + if record.get('type') == 'InstalledBinary': + if record['name'] == 'papers-dl': + assert record['abspath'], "papers-dl should have abspath" + found_binary = True + elif record.get('type') == 'Dependency': + if record['bin_name'] == 'papers-dl': + found_dependency = True + except json.JSONDecodeError: + pass + + # papers-dl should either be found (InstalledBinary) or missing (Dependency) + assert found_binary or found_dependency, \ + "papers-dl should have either InstalledBinary or Dependency record" + + +def test_verify_deps_with_abx_pkg(): + """Verify papers-dl is available via abx-pkg.""" + from abx_pkg import Binary, PipProvider, EnvProvider, BinProviderOverrides + + missing_binaries = [] + + # Verify papers-dl is available + papersdl_binary = Binary(name='papers-dl', binproviders=[PipProvider(), EnvProvider()]) + papersdl_loaded = papersdl_binary.load() + if not (papersdl_loaded and papersdl_loaded.abspath): + missing_binaries.append('papers-dl') + + if missing_binaries: + pytest.skip(f"Binaries not available: {', '.join(missing_binaries)} - Dependency records should have been emitted") + + +def test_handles_non_paper_url(): + """Test that papers-dl extractor handles non-paper URLs gracefully via hook.""" + # Prerequisites checked by earlier test + + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + + # Run papers-dl extraction hook on non-paper URL + result = subprocess.run( + [sys.executable, str(PAPERSDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'test789'], + cwd=tmpdir, + capture_output=True, + text=True, + timeout=60 + ) + + # Should exit 0 even for non-paper URL + assert result.returncode == 0, f"Should handle non-paper URL gracefully: {result.stderr}" + + # Verify JSONL output + assert 'STATUS=' in result.stdout, "Should report status" + assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON" + + # Parse JSONL result + result_json = None + for line in result.stdout.split('\n'): + 
if line.startswith('RESULT_JSON='): + result_json = json.loads(line.split('=', 1)[1]) + break + + assert result_json, "Should have RESULT_JSON" + assert result_json['extractor'] == 'papersdl' + + +def test_config_save_papersdl_false_skips(): + """Test that SAVE_PAPERSDL=False causes skip.""" + import os + + with tempfile.TemporaryDirectory() as tmpdir: + env = os.environ.copy() + env['SAVE_PAPERSDL'] = 'False' + + result = subprocess.run( + [sys.executable, str(PAPERSDL_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'], + cwd=tmpdir, + capture_output=True, + text=True, + env=env, + timeout=30 + ) + + assert result.returncode == 0, f"Should exit 0 when skipping: {result.stderr}" + assert 'STATUS=' in result.stdout + + +def test_config_timeout(): + """Test that PAPERSDL_TIMEOUT config is respected.""" + import os + + with tempfile.TemporaryDirectory() as tmpdir: + env = os.environ.copy() + env['PAPERSDL_TIMEOUT'] = '5' + + result = subprocess.run( + [sys.executable, str(PAPERSDL_HOOK), '--url', 'https://example.com', '--snapshot-id', 'testtimeout'], + cwd=tmpdir, + capture_output=True, + text=True, + env=env, + timeout=30 + ) + + assert result.returncode == 0, "Should complete without hanging" + +if __name__ == '__main__': + pytest.main([__file__, '-v'])
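A final note on the DOI handling in save_paper(): extract_doi_from_url() relies only on the standard 10.<registrant>/<suffix> DOI shape, and the hook falls back to passing the raw URL to papers-dl when nothing matches. A quick illustrative check (the DOIs and URLs below are made-up examples, not fixtures from this patch):

# Standalone copy of the regex used by extract_doi_from_url(), for illustration only.
import re

def extract_doi_from_url(url: str) -> str | None:
    match = re.search(r'10\.\d{4,}/[^\s]+', url)
    return match.group(0) if match else None

assert extract_doi_from_url('https://doi.org/10.1234/example.5678') == '10.1234/example.5678'
assert extract_doi_from_url('https://example.org/article?doi=10.5555/some.id') == '10.5555/some.id'
assert extract_doi_from_url('https://example.com/blog/post') is None  # hook passes the URL itself to `papers-dl fetch`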