improve plugin tests and config

2026-04-05 15:27:53 +10:00 · 2025-12-29 00:45:23 -08:00
parent f0aa19fa7d
commit 1e4d3ffd11
126 changed files with 2286 additions and 1717 deletions
--- a/archivebox/plugins/pdf/on_Snapshot__52_pdf.js
+++ b/archivebox/plugins/pdf/on_Snapshot__52_pdf.js
@@ -15,8 +15,29 @@
 *     CHROME_USER_AGENT: User agent string (optional)
 *     CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
 *     CHROME_HEADLESS: Run in headless mode (default: true)
+ *     PDF_ENABLED: Enable PDF generation (default: true)
 */

+// Get environment variable with default
+function getEnv(name, defaultValue = '') {
+    return (process.env[name] || defaultValue).trim();
+}
+
+function getEnvBool(name, defaultValue = false) {
+    const val = getEnv(name, '').toLowerCase();
+    if (['true', '1', 'yes', 'on'].includes(val)) return true;
+    if (['false', '0', 'no', 'off'].includes(val)) return false;
+    return defaultValue;
+}
+
+// Check if PDF is enabled BEFORE requiring puppeteer
+if (!getEnvBool('PDF_ENABLED', true)) {
+    console.error('Skipping PDF (PDF_ENABLED=False)');
+    // Temporary skip (disabled by config, not a failure) - exit 0 with NO JSONL emission
+    process.exit(0);
+}
+
+// Now safe to require puppeteer
 const fs = require('fs');
 const path = require('path');
 const puppeteer = require('puppeteer-core');
@@ -39,18 +60,6 @@ function parseArgs() {
    return args;
 }

-// Get environment variable with default
-function getEnv(name, defaultValue = '') {
-    return (process.env[name] || defaultValue).trim();
-}
-
-function getEnvBool(name, defaultValue = false) {
-    const val = getEnv(name, '').toLowerCase();
-    if (['true', '1', 'yes', 'on'].includes(val)) return true;
-    if (['false', '0', 'no', 'off'].includes(val)) return false;
-    return defaultValue;
-}
-
 function getEnvInt(name, defaultValue = 0) {
    const val = parseInt(getEnv(name, String(defaultValue)), 10);
    return isNaN(val) ? defaultValue : val;
@@ -237,62 +246,51 @@ async function main() {
        process.exit(1);
    }

-    const startTs = new Date();
-    let status = 'failed';
-    let output = null;
-    let error = '';
-
    try {
        // Check if staticfile extractor already handled this (permanent skip)
        if (hasStaticFileOutput()) {
-            console.log(`Skipping PDF - staticfile extractor already downloaded this`);
-            // Output clean JSONL (no RESULT_JSON= prefix)
+            console.error(`Skipping PDF - staticfile extractor already downloaded this`);
+            // Permanent skip - emit ArchiveResult
            console.log(JSON.stringify({
                type: 'ArchiveResult',
                status: 'skipped',
                output_str: 'staticfile already handled',
            }));
-            process.exit(0);  // Permanent skip - staticfile already handled
-        } else {
-            // Only wait for page load if using shared Chrome session
-            const cdpUrl = getCdpUrl();
-            if (cdpUrl) {
-                // Wait for page to be fully loaded
-                const pageLoaded = await waitForChromeTabLoaded(60000);
-                if (!pageLoaded) {
-                    throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
-                }
-            }
+            process.exit(0);
+        }

-            const result = await printToPdf(url);
-
-            if (result.success) {
-                status = 'succeeded';
-                output = result.output;
-                const size = fs.statSync(output).size;
-                console.log(`PDF saved (${size} bytes)`);
-            } else {
-                status = 'failed';
-                error = result.error;
+        // Only wait for page load if using shared Chrome session
+        const cdpUrl = getCdpUrl();
+        if (cdpUrl) {
+            // Wait for page to be fully loaded
+            const pageLoaded = await waitForChromeTabLoaded(60000);
+            if (!pageLoaded) {
+                throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
            }
        }
+
+        const result = await printToPdf(url);
+
+        if (result.success) {
+            // Success - emit ArchiveResult
+            const size = fs.statSync(result.output).size;
+            console.error(`PDF saved (${size} bytes)`);
+            console.log(JSON.stringify({
+                type: 'ArchiveResult',
+                status: 'succeeded',
+                output_str: result.output,
+            }));
+            process.exit(0);
+        } else {
+            // Transient error - emit NO JSONL
+            console.error(`ERROR: ${result.error}`);
+            process.exit(1);
+        }
    } catch (e) {
-        error = `${e.name}: ${e.message}`;
-        status = 'failed';
+        // Transient error - emit NO JSONL
+        console.error(`ERROR: ${e.name}: ${e.message}`);
+        process.exit(1);
    }
-
-    const endTs = new Date();
-
-    if (error) console.error(`ERROR: ${error}`);
-
-    // Output clean JSONL (no RESULT_JSON= prefix)
-    console.log(JSON.stringify({
-        type: 'ArchiveResult',
-        status,
-        output_str: output || error || '',
-    }));
-
-    process.exit(status === 'succeeded' ? 0 : 1);
 }

 main().catch(e => {
--- a/archivebox/plugins/pdf/tests/test_pdf.py
+++ b/archivebox/plugins/pdf/tests/test_pdf.py
@@ -23,8 +23,7 @@ import pytest

 PLUGIN_DIR = Path(__file__).parent.parent
 PLUGINS_ROOT = PLUGIN_DIR.parent
-PDF_HOOK = PLUGIN_DIR / 'on_Snapshot__35_pdf.js'
-CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_chrome_install.py'
+PDF_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_pdf.*'), None)
 NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py'
 TEST_URL = 'https://example.com'

@@ -34,70 +33,6 @@ def test_hook_script_exists():
    assert PDF_HOOK.exists(), f"Hook not found: {PDF_HOOK}"


-def test_chrome_validation_and_install():
-    """Test chrome install hook to install puppeteer-core if needed."""
-    # Run chrome install hook (from chrome plugin)
-    result = subprocess.run(
-        [sys.executable, str(CHROME_INSTALL_HOOK)],
-        capture_output=True,
-        text=True,
-        timeout=30
-    )
-
-    # If exit 1, binary not found - need to install
-    if result.returncode == 1:
-        # Parse Dependency request from JSONL
-        dependency_request = None
-        for line in result.stdout.strip().split('\n'):
-            pass
-            if line.strip():
-                pass
-                try:
-                    record = json.loads(line)
-                    if record.get('type') == 'Dependency':
-                        dependency_request = record
-                        break
-                except json.JSONDecodeError:
-                    pass
-
-        if dependency_request:
-            bin_name = dependency_request['bin_name']
-            bin_providers = dependency_request['bin_providers']
-
-            # Install via npm provider hook
-            install_result = subprocess.run(
-                [
-                    sys.executable,
-                    str(NPM_PROVIDER_HOOK),
-                    '--dependency-id', 'test-dep-001',
-                    '--bin-name', bin_name,
-                    '--bin-providers', bin_providers
-                ],
-                capture_output=True,
-                text=True,
-                timeout=600
-            )
-
-            assert install_result.returncode == 0, f"Install failed: {install_result.stderr}"
-
-            # Verify installation via JSONL output
-            for line in install_result.stdout.strip().split('\n'):
-                pass
-                if line.strip():
-                    pass
-                    try:
-                        record = json.loads(line)
-                        if record.get('type') == 'Binary':
-                            assert record['name'] == bin_name
-                            assert record['abspath']
-                            break
-                    except json.JSONDecodeError:
-                        pass
-    else:
-        # Binary already available, verify via JSONL output
-        assert result.returncode == 0, f"Validation failed: {result.stderr}"
-
-
 def test_verify_deps_with_abx_pkg():
    """Verify dependencies are available via abx-pkg after hook installation."""
    from abx_pkg import Binary, EnvProvider, BinProviderOverrides
@@ -166,17 +101,13 @@ def test_extracts_pdf_from_example_com():


 def test_config_save_pdf_false_skips():
-    """Test that SAVE_PDF config is honored (Note: currently not implemented in hook)."""
+    """Test that PDF_ENABLED=False exits without emitting JSONL."""
    import os

-    # NOTE: The pdf hook doesn't currently check SAVE_PDF env var,
-    # so this test just verifies it runs without errors.
-    # TODO: Implement SAVE_PDF check in hook
-
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        env = os.environ.copy()
-        env['SAVE_PDF'] = 'False'
+        env['PDF_ENABLED'] = 'False'

        result = subprocess.run(
            ['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
@@ -184,11 +115,17 @@ def test_config_save_pdf_false_skips():
            capture_output=True,
            text=True,
            env=env,
-            timeout=120
+            timeout=30
        )

-        # Hook currently ignores SAVE_PDF, so it will run normally
-        assert result.returncode in (0, 1), "Should complete without hanging"
+        assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
+
+        # Feature disabled - the skip reason should be logged to stderr
+        assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
+
+        # Should NOT emit any JSONL
+        jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
+        assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"


 def test_reports_missing_chrome():