mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 15:27:53 +10:00
improve plugin tests and config
This commit is contained in:
@@ -15,8 +15,29 @@
|
||||
* CHROME_USER_AGENT: User agent string (optional)
|
||||
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
|
||||
* CHROME_HEADLESS: Run in headless mode (default: true)
|
||||
* PDF_ENABLED: Enable PDF generation (default: true)
|
||||
*/
|
||||
|
||||
/**
 * Read an environment variable, falling back to a default.
 *
 * An unset or empty variable yields the default. The result is always a
 * whitespace-trimmed string: non-string defaults (e.g. numbers) are
 * stringified instead of crashing on `.trim()`, and a null/undefined
 * default is treated as the empty string.
 *
 * @param {string} name - Environment variable name.
 * @param {*} [defaultValue=''] - Value used when the variable is unset or empty.
 * @returns {string} The trimmed value.
 */
function getEnv(name, defaultValue = '') {
  const fallback = defaultValue == null ? '' : defaultValue;
  // `||` is deliberate: an empty variable counts as unset and takes the default.
  return String(process.env[name] || fallback).trim();
}
|
||||
|
||||
/**
 * Interpret an environment variable as a boolean flag.
 *
 * The value is read through getEnv() and compared case-insensitively:
 * 'true', '1', 'yes', 'on' give true; 'false', '0', 'no', 'off' give false.
 * Anything else (including unset/empty) gives `defaultValue`.
 *
 * @param {string} name - Environment variable name.
 * @param {boolean} [defaultValue=false] - Result for unrecognized or missing values.
 * @returns {boolean} The parsed flag, or `defaultValue`.
 */
function getEnvBool(name, defaultValue = false) {
  switch (getEnv(name, '').toLowerCase()) {
    case 'true':
    case '1':
    case 'yes':
    case 'on':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
|
||||
|
||||
// Check if PDF is enabled BEFORE requiring puppeteer
|
||||
// Honor the PDF_ENABLED switch (default: enabled). This runs before the
// require() calls below, so a disabled run never loads puppeteer-core.
if (getEnvBool('PDF_ENABLED', true) === false) {
  // Disabled by config: log the reason to stderr, print nothing to stdout
  // (no JSONL record), and exit with status 0.
  console.error('Skipping PDF (PDF_ENABLED=False)');
  process.exit(0);
}
|
||||
|
||||
// Now safe to require puppeteer
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
@@ -39,18 +60,6 @@ function parseArgs() {
|
||||
return args;
|
||||
}
|
||||
|
||||
/**
 * Read an environment variable, falling back to a default.
 *
 * An unset or empty variable yields the default. The result is always a
 * whitespace-trimmed string: non-string defaults (e.g. numbers) are
 * stringified instead of crashing on `.trim()`, and a null/undefined
 * default is treated as the empty string.
 *
 * @param {string} name - Environment variable name.
 * @param {*} [defaultValue=''] - Value used when the variable is unset or empty.
 * @returns {string} The trimmed value.
 */
function getEnv(name, defaultValue = '') {
  const fallback = defaultValue == null ? '' : defaultValue;
  // `||` is deliberate: an empty variable counts as unset and takes the default.
  return String(process.env[name] || fallback).trim();
}
|
||||
|
||||
/**
 * Interpret an environment variable as a boolean flag.
 *
 * The value is read through getEnv() and compared case-insensitively:
 * 'true', '1', 'yes', 'on' give true; 'false', '0', 'no', 'off' give false.
 * Anything else (including unset/empty) gives `defaultValue`.
 *
 * @param {string} name - Environment variable name.
 * @param {boolean} [defaultValue=false] - Result for unrecognized or missing values.
 * @returns {boolean} The parsed flag, or `defaultValue`.
 */
function getEnvBool(name, defaultValue = false) {
  switch (getEnv(name, '').toLowerCase()) {
    case 'true':
    case '1':
    case 'yes':
    case 'on':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
|
||||
|
||||
function getEnvInt(name, defaultValue = 0) {
|
||||
const val = parseInt(getEnv(name, String(defaultValue)), 10);
|
||||
return isNaN(val) ? defaultValue : val;
|
||||
@@ -237,62 +246,51 @@ async function main() {
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const startTs = new Date();
|
||||
let status = 'failed';
|
||||
let output = null;
|
||||
let error = '';
|
||||
|
||||
try {
|
||||
// Check if staticfile extractor already handled this (permanent skip)
|
||||
if (hasStaticFileOutput()) {
|
||||
console.log(`Skipping PDF - staticfile extractor already downloaded this`);
|
||||
// Output clean JSONL (no RESULT_JSON= prefix)
|
||||
console.error(`Skipping PDF - staticfile extractor already downloaded this`);
|
||||
// Permanent skip - emit ArchiveResult
|
||||
console.log(JSON.stringify({
|
||||
type: 'ArchiveResult',
|
||||
status: 'skipped',
|
||||
output_str: 'staticfile already handled',
|
||||
}));
|
||||
process.exit(0); // Permanent skip - staticfile already handled
|
||||
} else {
|
||||
// Only wait for page load if using shared Chrome session
|
||||
const cdpUrl = getCdpUrl();
|
||||
if (cdpUrl) {
|
||||
// Wait for page to be fully loaded
|
||||
const pageLoaded = await waitForChromeTabLoaded(60000);
|
||||
if (!pageLoaded) {
|
||||
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
|
||||
}
|
||||
}
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
const result = await printToPdf(url);
|
||||
|
||||
if (result.success) {
|
||||
status = 'succeeded';
|
||||
output = result.output;
|
||||
const size = fs.statSync(output).size;
|
||||
console.log(`PDF saved (${size} bytes)`);
|
||||
} else {
|
||||
status = 'failed';
|
||||
error = result.error;
|
||||
// Only wait for page load if using shared Chrome session
|
||||
const cdpUrl = getCdpUrl();
|
||||
if (cdpUrl) {
|
||||
// Wait for page to be fully loaded
|
||||
const pageLoaded = await waitForChromeTabLoaded(60000);
|
||||
if (!pageLoaded) {
|
||||
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
|
||||
}
|
||||
}
|
||||
|
||||
const result = await printToPdf(url);
|
||||
|
||||
if (result.success) {
|
||||
// Success - emit ArchiveResult
|
||||
const size = fs.statSync(result.output).size;
|
||||
console.error(`PDF saved (${size} bytes)`);
|
||||
console.log(JSON.stringify({
|
||||
type: 'ArchiveResult',
|
||||
status: 'succeeded',
|
||||
output_str: result.output,
|
||||
}));
|
||||
process.exit(0);
|
||||
} else {
|
||||
// Transient error - emit NO JSONL
|
||||
console.error(`ERROR: ${result.error}`);
|
||||
process.exit(1);
|
||||
}
|
||||
} catch (e) {
|
||||
error = `${e.name}: ${e.message}`;
|
||||
status = 'failed';
|
||||
// Transient error - emit NO JSONL
|
||||
console.error(`ERROR: ${e.name}: ${e.message}`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const endTs = new Date();
|
||||
|
||||
if (error) console.error(`ERROR: ${error}`);
|
||||
|
||||
// Output clean JSONL (no RESULT_JSON= prefix)
|
||||
console.log(JSON.stringify({
|
||||
type: 'ArchiveResult',
|
||||
status,
|
||||
output_str: output || error || '',
|
||||
}));
|
||||
|
||||
process.exit(status === 'succeeded' ? 0 : 1);
|
||||
}
|
||||
|
||||
main().catch(e => {
|
||||
|
||||
@@ -23,8 +23,7 @@ import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
PDF_HOOK = PLUGIN_DIR / 'on_Snapshot__35_pdf.js'
|
||||
CHROME_INSTALL_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__00_chrome_install.py'
|
||||
PDF_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_pdf.*'), None)
|
||||
NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
@@ -34,70 +33,6 @@ def test_hook_script_exists():
|
||||
assert PDF_HOOK.exists(), f"Hook not found: {PDF_HOOK}"
|
||||
|
||||
|
||||
def test_chrome_validation_and_install():
    """Run the chrome install hook and install the requested binary if it reports one missing.

    An exit code of 1 from the hook is treated as "binary not found": the first
    ``Dependency`` record in its JSONL stdout is passed to the npm provider
    hook, and the first ``Binary`` record that hook prints is checked for the
    expected name and a non-empty abspath. Any other exit code must be 0.
    """
    def iter_jsonl(stdout):
        # Yield every stdout line that parses as JSON; blank and non-JSON lines are skipped.
        for raw_line in stdout.strip().split('\n'):
            if not raw_line.strip():
                continue
            try:
                yield json.loads(raw_line)
            except json.JSONDecodeError:
                continue

    # Run chrome install hook (from chrome plugin)
    hook_run = subprocess.run(
        [sys.executable, str(CHROME_INSTALL_HOOK)],
        capture_output=True,
        text=True,
        timeout=30
    )

    if hook_run.returncode != 1:
        # Binary already available, the hook must have exited cleanly
        assert hook_run.returncode == 0, f"Validation failed: {hook_run.stderr}"
        return

    # Exit 1: binary not found - look for the Dependency request in the JSONL output
    dependency_request = next(
        (rec for rec in iter_jsonl(hook_run.stdout) if rec.get('type') == 'Dependency'),
        None,
    )
    if not dependency_request:
        return

    bin_name = dependency_request['bin_name']
    bin_providers = dependency_request['bin_providers']

    # Install via npm provider hook
    npm_cmd = [
        sys.executable,
        str(NPM_PROVIDER_HOOK),
        '--dependency-id', 'test-dep-001',
        '--bin-name', bin_name,
        '--bin-providers', bin_providers
    ]
    install_run = subprocess.run(npm_cmd, capture_output=True, text=True, timeout=600)

    assert install_run.returncode == 0, f"Install failed: {install_run.stderr}"

    # Verify installation via JSONL output (only the first Binary record is checked)
    for rec in iter_jsonl(install_run.stdout):
        if rec.get('type') == 'Binary':
            assert rec['name'] == bin_name
            assert rec['abspath']
            break
|
||||
|
||||
|
||||
def test_verify_deps_with_abx_pkg():
|
||||
"""Verify dependencies are available via abx-pkg after hook installation."""
|
||||
from abx_pkg import Binary, EnvProvider, BinProviderOverrides
|
||||
@@ -166,17 +101,13 @@ def test_extracts_pdf_from_example_com():
|
||||
|
||||
|
||||
def test_config_save_pdf_false_skips():
|
||||
"""Test that SAVE_PDF config is honored (Note: currently not implemented in hook)."""
|
||||
"""Test that PDF_ENABLED=False exits without emitting JSONL."""
|
||||
import os
|
||||
|
||||
# NOTE: The pdf hook doesn't currently check SAVE_PDF env var,
|
||||
# so this test just verifies it runs without errors.
|
||||
# TODO: Implement SAVE_PDF check in hook
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
env = os.environ.copy()
|
||||
env['SAVE_PDF'] = 'False'
|
||||
env['PDF_ENABLED'] = 'False'
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(PDF_HOOK), f'--url={TEST_URL}', '--snapshot-id=test999'],
|
||||
@@ -184,11 +115,17 @@ def test_config_save_pdf_false_skips():
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=120
|
||||
timeout=30
|
||||
)
|
||||
|
||||
# Hook currently ignores SAVE_PDF, so it will run normally
|
||||
assert result.returncode in (0, 1), "Should complete without hanging"
|
||||
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
|
||||
|
||||
# Feature disabled - temporary failure, should NOT emit JSONL
|
||||
assert 'Skipping' in result.stderr or 'False' in result.stderr, "Should log skip reason to stderr"
|
||||
|
||||
# Should NOT emit any JSONL
|
||||
jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
|
||||
assert len(jsonl_lines) == 0, f"Should not emit JSONL when feature disabled, but got: {jsonl_lines}"
|
||||
|
||||
|
||||
def test_reports_missing_chrome():
|
||||
|
||||
Reference in New Issue
Block a user