""" Integration tests for mercury plugin Tests verify: 1. Hook script exists 2. Dependencies installed via validation hooks 3. Verify deps with abx-pkg 4. Mercury extraction works on https://example.com 5. JSONL output is correct 6. Filesystem output contains extracted content 7. Config options work """ import json import subprocess import sys import tempfile from pathlib import Path import pytest PLUGIN_DIR = Path(__file__).parent.parent PLUGINS_ROOT = PLUGIN_DIR.parent MERCURY_HOOK = PLUGIN_DIR / 'on_Snapshot__53_mercury.py' MERCURY_INSTALL_HOOK = PLUGIN_DIR / 'on_Crawl__00_install_mercury.py' TEST_URL = 'https://example.com' def test_hook_script_exists(): """Verify on_Snapshot hook exists.""" assert MERCURY_HOOK.exists(), f"Hook not found: {MERCURY_HOOK}" def test_mercury_install_hook(): """Test mercury install hook checks for postlight-parser.""" # Run mercury install hook result = subprocess.run( [sys.executable, str(MERCURY_INSTALL_HOOK)], capture_output=True, text=True, timeout=30 ) # Hook exits 0 if binary found, 1 if not found (with Dependency record) if result.returncode == 0: # Binary found - verify Binary JSONL output found_binary = False for line in result.stdout.strip().split('\n'): if line.strip(): try: record = json.loads(line) if record.get('type') == 'Binary': assert record['name'] == 'postlight-parser' assert record['abspath'] found_binary = True break except json.JSONDecodeError: pass assert found_binary, "Should output Binary record when binary found" else: # Binary not found - verify Dependency JSONL output found_dependency = False for line in result.stdout.strip().split('\n'): if line.strip(): try: record = json.loads(line) if record.get('type') == 'Dependency': assert record['bin_name'] == 'postlight-parser' assert 'npm' in record['bin_providers'] found_dependency = True break except json.JSONDecodeError: pass assert found_dependency, "Should output Dependency record when binary not found" def test_verify_deps_with_abx_pkg(): """Verify postlight-parser is available via abx-pkg.""" from abx_pkg import Binary, NpmProvider, EnvProvider, BinProviderOverrides # Verify postlight-parser is available mercury_binary = Binary( name='postlight-parser', binproviders=[NpmProvider(), EnvProvider()], overrides={'npm': {'packages': ['@postlight/parser']}} ) mercury_loaded = mercury_binary.load() # If validate hook found it (exit 0), this should succeed # If validate hook didn't find it (exit 1), this may fail unless binprovider installed it if mercury_loaded and mercury_loaded.abspath: assert True, "postlight-parser is available" else: pytest.skip("postlight-parser not available - Dependency record should have been emitted") def test_extracts_with_mercury_parser(): """Test full workflow: extract with postlight-parser from real HTML via hook.""" # Prerequisites checked by earlier test with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) # Create HTML source that mercury can parse (tmpdir / 'singlefile').mkdir() (tmpdir / 'singlefile' / 'singlefile.html').write_text( 'Test Article' '

Example Article

This is test content for mercury parser.

' '' ) # Run mercury extraction hook result = subprocess.run( [sys.executable, str(MERCURY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'], cwd=tmpdir, capture_output=True, text=True, timeout=60 ) assert result.returncode == 0, f"Extraction failed: {result.stderr}" # Parse clean JSONL output result_json = None for line in result.stdout.strip().split('\n'): line = line.strip() if line.startswith('{'): try: record = json.loads(line) if record.get('type') == 'ArchiveResult': result_json = record break except json.JSONDecodeError: pass assert result_json, "Should have ArchiveResult JSONL output" assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" # Verify filesystem output (hook writes to current directory) output_file = tmpdir / 'content.html' assert output_file.exists(), "content.html not created" content = output_file.read_text() assert len(content) > 0, "Output should not be empty" def test_config_save_mercury_false_skips(): """Test that SAVE_MERCURY=False exits without emitting JSONL.""" import os with tempfile.TemporaryDirectory() as tmpdir: env = os.environ.copy() env['SAVE_MERCURY'] = 'False' result = subprocess.run( [sys.executable, str(MERCURY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'], cwd=tmpdir, capture_output=True, text=True, env=env, timeout=30 ) assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}" # Feature disabled - no JSONL emission, just logs to stderr assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr" # Should NOT emit any JSONL jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')] assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}" def test_fails_gracefully_without_html(): """Test that mercury fails gracefully when no HTML source exists.""" with tempfile.TemporaryDirectory() as tmpdir: result = subprocess.run( [sys.executable, str(MERCURY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'], cwd=tmpdir, capture_output=True, text=True, timeout=30 ) # Should exit with non-zero or emit failure JSONL # Parse clean JSONL output result_json = None for line in result.stdout.strip().split('\n'): line = line.strip() if line.startswith('{'): try: record = json.loads(line) if record.get('type') == 'ArchiveResult': result_json = record break except json.JSONDecodeError: pass if result_json: # Should report failure or skip since no HTML source assert result_json['status'] in ['failed', 'skipped'], f"Should fail or skip without HTML: {result_json}" if __name__ == '__main__': pytest.main([__file__, '-v'])