improve plugin tests and config

This commit is contained in:
Nick Sweeting
2025-12-29 00:45:23 -08:00
parent f0aa19fa7d
commit 1e4d3ffd11
126 changed files with 2286 additions and 1717 deletions

View File

@@ -15,8 +15,29 @@
* CHROME_USER_AGENT: User agent string (optional)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
* CHROME_HEADLESS: Run in headless mode (default: true)
* PDF_ENABLED: Enable PDF generation (default: true)
*/
/**
 * Read an environment variable as a trimmed string.
 *
 * Falls back to `defaultValue` when the variable is unset, empty, or contains
 * only whitespace. The fallback is stringified before trimming, so a
 * non-string default (for example a number) cannot crash the `.trim()` call,
 * and a null default yields an empty string.
 *
 * @param {string} name - Name of the environment variable to read.
 * @param {*} [defaultValue=''] - Fallback used when no usable value is set.
 * @returns {string} The trimmed value, or the trimmed, stringified default.
 */
function getEnv(name, defaultValue = '') {
    const raw = process.env[name];
    const value = typeof raw === 'string' ? raw.trim() : '';
    if (value !== '') {
        return value;
    }
    // `== null` matches both null and undefined; everything else is stringified.
    const fallback = defaultValue == null ? '' : defaultValue;
    return String(fallback).trim();
}
/**
 * Interpret an environment variable as a boolean flag.
 *
 * The value is read through getEnv() and compared case-insensitively.
 * 'true', '1', 'yes' and 'on' yield true; 'false', '0', 'no' and 'off'
 * yield false; anything else (including an unset variable) yields
 * `defaultValue` unchanged.
 *
 * @param {string} name - Name of the environment variable to read.
 * @param {*} [defaultValue=false] - Returned when the value is not recognised.
 * @returns {*} true, false, or `defaultValue`.
 */
function getEnvBool(name, defaultValue = false) {
    switch (getEnv(name, '').toLowerCase()) {
        case 'true':
        case '1':
        case 'yes':
        case 'on':
            return true;
        case 'false':
        case '0':
        case 'no':
        case 'off':
            return false;
        default:
            return defaultValue;
    }
}
// Honor PDF_ENABLED before puppeteer is required, so a disabled extractor
// exits without ever loading puppeteer.
if (getEnvBool('PDF_ENABLED', true) === false) {
    // Disabled by config: explain on stderr, print nothing to stdout
    // (no JSONL record), and exit with status 0.
    console.error('Skipping PDF (PDF_ENABLED=False)');
    process.exit(0);
}
// Now safe to require puppeteer
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
@@ -39,18 +60,6 @@ function parseArgs() {
return args;
}
/**
 * Read an environment variable as a trimmed string.
 *
 * Falls back to `defaultValue` when the variable is unset, empty, or contains
 * only whitespace. The fallback is stringified before trimming, so a
 * non-string default (for example a number) cannot crash the `.trim()` call,
 * and a null default yields an empty string.
 *
 * @param {string} name - Name of the environment variable to read.
 * @param {*} [defaultValue=''] - Fallback used when no usable value is set.
 * @returns {string} The trimmed value, or the trimmed, stringified default.
 */
function getEnv(name, defaultValue = '') {
    const raw = process.env[name];
    const value = typeof raw === 'string' ? raw.trim() : '';
    if (value !== '') {
        return value;
    }
    // `== null` matches both null and undefined; everything else is stringified.
    const fallback = defaultValue == null ? '' : defaultValue;
    return String(fallback).trim();
}
/**
 * Interpret an environment variable as a boolean flag.
 *
 * The value is read through getEnv() and compared case-insensitively.
 * 'true', '1', 'yes' and 'on' yield true; 'false', '0', 'no' and 'off'
 * yield false; anything else (including an unset variable) yields
 * `defaultValue` unchanged.
 *
 * @param {string} name - Name of the environment variable to read.
 * @param {*} [defaultValue=false] - Returned when the value is not recognised.
 * @returns {*} true, false, or `defaultValue`.
 */
function getEnvBool(name, defaultValue = false) {
    switch (getEnv(name, '').toLowerCase()) {
        case 'true':
        case '1':
        case 'yes':
        case 'on':
            return true;
        case 'false':
        case '0':
        case 'no':
        case 'off':
            return false;
        default:
            return defaultValue;
    }
}
function getEnvInt(name, defaultValue = 0) {
const val = parseInt(getEnv(name, String(defaultValue)), 10);
return isNaN(val) ? defaultValue : val;
@@ -237,62 +246,51 @@ async function main() {
process.exit(1);
}
const startTs = new Date();
let status = 'failed';
let output = null;
let error = '';
try {
// Check if staticfile extractor already handled this (permanent skip)
if (hasStaticFileOutput()) {
console.log(`Skipping PDF - staticfile extractor already downloaded this`);
// Output clean JSONL (no RESULT_JSON= prefix)
console.error(`Skipping PDF - staticfile extractor already downloaded this`);
// Permanent skip - emit ArchiveResult
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'skipped',
output_str: 'staticfile already handled',
}));
process.exit(0); // Permanent skip - staticfile already handled
} else {
// Only wait for page load if using shared Chrome session
const cdpUrl = getCdpUrl();
if (cdpUrl) {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
}
process.exit(0);
}
const result = await printToPdf(url);
if (result.success) {
status = 'succeeded';
output = result.output;
const size = fs.statSync(output).size;
console.log(`PDF saved (${size} bytes)`);
} else {
status = 'failed';
error = result.error;
// Only wait for page load if using shared Chrome session
const cdpUrl = getCdpUrl();
if (cdpUrl) {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
}
const result = await printToPdf(url);
if (result.success) {
// Success - emit ArchiveResult
const size = fs.statSync(result.output).size;
console.error(`PDF saved (${size} bytes)`);
console.log(JSON.stringify({
type: 'ArchiveResult',
status: 'succeeded',
output_str: result.output,
}));
process.exit(0);
} else {
// Transient error - emit NO JSONL
console.error(`ERROR: ${result.error}`);
process.exit(1);
}
} catch (e) {
error = `${e.name}: ${e.message}`;
status = 'failed';
// Transient error - emit NO JSONL
console.error(`ERROR: ${e.name}: ${e.message}`);
process.exit(1);
}
const endTs = new Date();
if (error) console.error(`ERROR: ${error}`);
// Output clean JSONL (no RESULT_JSON= prefix)
console.log(JSON.stringify({
type: 'ArchiveResult',
status,
output_str: output || error || '',
}));
process.exit(status === 'succeeded' ? 0 : 1);
}
main().catch(e => {

View File

@@ -23,8 +23,7 @@ import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
PDF_HOOK = PLUGIN_DIR / 'on_Snapshot__35_pdf.js'
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_chrome_install.py'
PDF_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_pdf.*'), None)
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py'
TEST_URL = 'https://example.com'
@@ -34,70 +33,6 @@ def test_hook_script_exists():
assert PDF_HOOK.exists(), f"Hook not found: {PDF_HOOK}"
def _find_jsonl_record(stdout, record_type):
    """Return the first JSON object in ``stdout`` whose 'type' equals ``record_type``, else None."""
    for raw_line in stdout.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            # Output may contain non-JSON lines; skip them.
            continue
        # A bare JSON scalar/list has no .get(); only objects can match.
        if isinstance(record, dict) and record.get('type') == record_type:
            return record
    return None


def test_chrome_validation_and_install():
    """Test chrome install hook to install puppeteer-core if needed.

    Runs the chrome install hook. On exit code 1 the hook's stdout must
    contain a 'Dependency' JSONL record, which is then installed through the
    npm provider hook and verified via the 'Binary' record it prints. Any
    other exit code must be 0.
    """
    # Run chrome install hook (from chrome plugin)
    result = subprocess.run(
        [sys.executable, str(CHROME_INSTALL_HOOK)],
        capture_output=True,
        text=True,
        timeout=30
    )

    if result.returncode != 1:
        # Binary already available
        assert result.returncode == 0, f"Validation failed: {result.stderr}"
        return

    # Exit 1: binary not found - the hook should have requested a Dependency
    dependency_request = _find_jsonl_record(result.stdout, 'Dependency')
    # Fail loudly instead of passing vacuously when nothing was requested.
    assert dependency_request is not None, (
        f"Hook exited 1 without a Dependency record: {result.stderr}"
    )

    bin_name = dependency_request['bin_name']
    bin_providers = dependency_request['bin_providers']

    # Install via npm provider hook
    install_result = subprocess.run(
        [
            sys.executable,
            str(NPM_PROVIDER_HOOK),
            '--dependency-id', 'test-dep-001',
            '--bin-name', bin_name,
            '--bin-providers', bin_providers
        ],
        capture_output=True,
        text=True,
        timeout=600
    )
    assert install_result.returncode == 0, f"Install failed: {install_result.stderr}"

    # Verify installation via JSONL output
    binary_record = _find_jsonl_record(install_result.stdout, 'Binary')
    assert binary_record is not None, (
        f"Install emitted no Binary record: {install_result.stdout}"
    )
    assert binary_record['name'] == bin_name
    assert binary_record['abspath']
def test_verify_deps_with_abx_pkg():
"""Verify dependencies are available via abx-pkg after hook installation."""
from abx_pkg import Binary, EnvProvider, BinProviderOverrides
@@ -166,17 +101,13 @@ def test_extracts_pdf_from_example_com():
def test_config_save_pdf_false_skips():
"""Test that SAVE_PDF config is honored (Note: currently not implemented in hook)."""
"""Test that PDF_ENABLED=False exits without emitting JSONL."""
import os
# NOTE: The pdf hook doesn't currently check SAVE_PDF env var,
# so this test just verifies it runs without errors.
# TODO: Implement SAVE_PDF check in hook
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
env = os.environ.copy()
env['SAVE_PDF'] = 'False'
env['PDF_ENABLED'] = 'False'
result = subprocess.run(
['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
@@ -184,11 +115,17 @@ def test_config_save_pdf_false_skips():
capture_output=True,
text=True,
env=env,
timeout=120
timeout=30
)
# Hook currently ignores SAVE_PDF, so it will run normally
assert result.returncode in (0, 1), "Should complete without hanging"
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
# Feature disabled - temporary failure, should NOT emit JSONL
assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
# Should NOT emit any JSONL
jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"
def test_reports_missing_chrome():