wip major changes

This commit is contained in:
Nick Sweeting
2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions

View File

@@ -0,0 +1,53 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_SINGLEFILE": {
"type": "boolean",
"default": true,
"description": "Enable SingleFile archiving"
},
"SINGLEFILE_BINARY": {
"type": "string",
"default": "single-file",
"x-aliases": ["SINGLE_FILE_BINARY"],
"description": "Path to single-file binary"
},
"NODE_BINARY": {
"type": "string",
"default": "node",
"description": "Path to Node.js binary"
},
"SINGLEFILE_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 10,
"x-fallback": "TIMEOUT",
"description": "Timeout for SingleFile in seconds"
},
"SINGLEFILE_USER_AGENT": {
"type": "string",
"default": "",
"x-fallback": "USER_AGENT",
"description": "User agent string"
},
"SINGLEFILE_COOKIES_FILE": {
"type": "string",
"default": "",
"x-fallback": "COOKIES_FILE",
"description": "Path to cookies file"
},
"SINGLEFILE_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [],
"description": "Default single-file arguments"
},
"SINGLEFILE_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for single-file"
}
}
}

View File

@@ -0,0 +1,129 @@
#!/usr/bin/env python3
"""
Validation hook for single-file binary.
Runs at crawl start to verify single-file (npm package) is available.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str) -> str | None:
"""Get version string from single-file binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=10,
)
if result.returncode == 0 and result.stdout:
return result.stdout.strip().split('\n')[0][:32]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
# For scripts, hash the script content
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_singlefile() -> dict | None:
    """Locate the single-file binary and describe it for the orchestrator.

    Search order: SINGLEFILE_BINARY env var, then $PATH, then well-known
    npm install locations. Returns a dict with keys
    name/abspath/version/sha256/binprovider, or None when nothing is found.
    """
    def describe(abspath: str, provider: str) -> dict:
        # Shared result shape for every discovery path.
        return {
            'name': 'single-file',
            'abspath': abspath,
            'version': get_binary_version(abspath),
            'sha256': get_binary_hash(abspath),
            'binprovider': provider,
        }

    # Explicit env override wins.
    env_path = os.environ.get('SINGLEFILE_BINARY', '')
    if env_path and Path(env_path).is_file():
        return describe(env_path, 'env')

    # Anything on $PATH (both common spellings).
    for candidate in ('single-file', 'singlefile'):
        found = shutil.which(candidate)
        if found:
            return describe(found, 'npm')

    # Well-known npm install locations not necessarily on $PATH.
    for npm_path in (
        Path.home() / '.npm-global/bin/single-file',
        Path.home() / 'node_modules/.bin/single-file',
        Path('/usr/local/bin/single-file'),
        Path('/usr/local/lib/node_modules/.bin/single-file'),
    ):
        if npm_path.is_file():
            return describe(str(npm_path), 'npm')

    return None
def main():
    """Emit JSONL records describing the single-file binary, or a Dependency request.

    On success prints an InstalledBinary record plus Machine config updates
    (resolved path, and version when known) and exits 0. When the binary is
    missing, prints a Dependency request and exits 1.
    """
    result = find_singlefile()
    if result and result.get('abspath'):
        # Record the discovered binary for the InstalledBinary table.
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))
        # Persist the resolved path into machine config so later hooks reuse it.
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/SINGLEFILE_BINARY',
            'value': result['abspath'],
        }))
        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/SINGLEFILE_VERSION',
                'value': result['version'],
            }))
        sys.exit(0)
    else:
        # Ask the orchestrator to install the dependency (npm, or env override).
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'single-file',
            'bin_providers': 'npm,env',
        }))
        # Fixed: was an f-string with no placeholders (ruff F541).
        print("single-file binary not found", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,270 @@
#!/usr/bin/env node
/**
* SingleFile Extension Plugin
*
* Installs and uses the SingleFile Chrome extension for archiving complete web pages.
* Falls back to single-file-cli if the extension is not available.
*
* Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle
*
* Priority: 04 (early) - Must install before Chrome session starts
* Hook: on_Snapshot
*
* This extension automatically:
* - Saves complete web pages as single HTML files
* - Inlines all resources (CSS, JS, images, fonts)
* - Preserves page fidelity better than wget/curl
* - Works with SPAs and dynamically loaded content
*/
const path = require('path');
const fs = require('fs');
const { promisify } = require('util');
const { exec } = require('child_process');
const execAsync = promisify(exec);
// Import extension utilities
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
// Extension metadata
const EXTENSION = {
    webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle', // Chrome Web Store ID for SingleFile
    name: 'singlefile',
};
// Get extensions directory from environment or use default
// (per-persona layout under $DATA_DIR/personas/<persona>/)
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
    path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
    path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
// Snapshot-relative output location for the archived page.
const OUTPUT_DIR = 'singlefile';
const OUTPUT_FILE = 'singlefile.html';
/**
 * Install the SingleFile extension via the shared chrome_extensions utilities.
 *
 * @returns {Promise<Object|null>} extension metadata, or null when installation fails
 */
async function installSinglefileExtension() {
    console.log('[*] Installing SingleFile extension...');
    const ext = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
    if (ext) {
        console.log('[+] SingleFile extension installed');
        console.log('[+] Web pages will be saved as single HTML files');
        return ext;
    }
    console.error('[❌] Failed to install SingleFile extension');
    return null;
}
/**
 * Resolve after `ms` milliseconds (promisified setTimeout).
 */
function wait(ms) {
    return new Promise((resolve) => {
        setTimeout(() => resolve(), ms);
    });
}
/**
 * Save a page using the SingleFile extension.
 *
 * Triggers the extension's toolbar action on the foreground tab, then polls
 * the Chrome downloads directory for a new .html file whose SingleFile
 * header comment contains the page URL, and moves it into OUTPUT_DIR.
 *
 * @param {Object} page - Puppeteer page object
 * @param {Object} extension - Extension metadata with dispatchAction method
 * @param {Object} options - Additional options (currently unused)
 * @returns {Promise<string|null>} - Path to saved file or null on failure
 */
async function saveSinglefileWithExtension(page, extension, options = {}) {
    if (!extension || !extension.version) {
        throw new Error('SingleFile extension not found or not loaded');
    }
    const url = await page.url();
    // Check for unsupported URL schemes (the extension can only save real pages)
    const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
    const scheme = url.split(':')[0];
    if (URL_SCHEMES_IGNORED.includes(scheme)) {
        console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`);
        return null;
    }
    // Ensure downloads directory exists
    await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true });
    // Get list of existing files to ignore (only new downloads are candidates)
    const files_before = new Set(
        (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
            .filter(fn => fn.endsWith('.html'))
    );
    // Ensure output directory exists
    await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
    const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
    console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
    // Bring page to front (extension action button acts on foreground tab)
    await page.bringToFront();
    // Trigger the extension's action (toolbar button click)
    await extension.dispatchAction();
    // Wait for file to appear in downloads directory (poll up to 10 x 3s = 30s)
    const check_delay = 3000; // 3 seconds
    const max_tries = 10;
    let files_new = [];
    for (let attempt = 0; attempt < max_tries; attempt++) {
        await wait(check_delay);
        const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
            .filter(fn => fn.endsWith('.html'));
        files_new = files_after.filter(file => !files_before.has(file));
        if (files_new.length === 0) {
            continue;
        }
        // Find the matching file by checking if it contains the URL in the HTML
        // header (SingleFile writes a comment block before the meta charset tag)
        for (const file of files_new) {
            const dl_path = path.join(CHROME_DOWNLOADS_DIR, file);
            const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
            const dl_header = dl_text.split('meta charset')[0];
            if (dl_header.includes(`url: ${url}`)) {
                console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`);
                // NOTE(review): rename() fails across filesystems (EXDEV) if the
                // downloads dir and cwd are on different mounts — confirm deployment.
                await fs.promises.rename(dl_path, out_path);
                return out_path;
            }
        }
    }
    console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`);
    console.warn(`[⚠️] New files found: ${files_new.join(', ')}`);
    return null;
}
/**
 * Save a page using single-file-cli (fallback method).
 *
 * Fixed: the original joined argv into a single shell string and ran it via
 * exec(), which broke on paths/URLs containing spaces and allowed shell
 * injection through the URL. Now uses execFile() with an argument array
 * (no shell involved).
 *
 * @param {string} url - URL to archive
 * @param {Object} options - { userAgent, cookiesFile, ignoreSSL, timeout }
 * @returns {Promise<string|null>} - Path to saved file or null on failure
 */
async function saveSinglefileWithCLI(url, options = {}) {
    console.log('[*] Falling back to single-file-cli...');
    // Find single-file binary
    let binary = null;
    try {
        const { stdout } = await execAsync('which single-file');
        binary = stdout.trim();
    } catch (err) {
        console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli');
        return null;
    }
    // Ensure output directory exists
    await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
    const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
    // Build argument list (binary itself is passed separately to execFile)
    const args = ['--browser-headless'];
    if (options.userAgent) {
        args.push('--browser-user-agent', options.userAgent);
    }
    if (options.cookiesFile && fs.existsSync(options.cookiesFile)) {
        args.push('--browser-cookies-file', options.cookiesFile);
    }
    if (options.ignoreSSL) {
        args.push('--browser-ignore-insecure-certs');
    }
    args.push(url, out_path);
    // Execute without a shell so special characters in url/paths are safe
    const { execFile } = require('child_process');
    const execFileAsync = promisify(execFile);
    try {
        const timeout = options.timeout || 120000;
        await execFileAsync(binary, args, { timeout });
        if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) {
            console.log(`[+] SingleFile saved via CLI: ${out_path}`);
            return out_path;
        }
        console.error('[❌] SingleFile CLI completed but no output file found');
        return null;
    } catch (err) {
        console.error(`[❌] SingleFile CLI error: ${err.message}`);
        return null;
    }
}
/**
 * Main entry point - install extension before archiving.
 *
 * Reuses the cached extension metadata when its unpacked manifest still
 * exists; otherwise (re)installs and writes fresh metadata to the cache
 * file for chrome_session to pick up.
 *
 * @returns {Promise<Object|null>} extension metadata, or null on failure
 */
async function main() {
    // Check if extension is already cached
    const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json');
    if (fs.existsSync(cacheFile)) {
        try {
            const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
            // Validate the cache by checking the unpacked extension still exists on disk.
            const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
            if (fs.existsSync(manifestPath)) {
                console.log('[*] SingleFile extension already installed (using cache)');
                return cached;
            }
        } catch (e) {
            // Cache file corrupted, re-install
            console.warn('[⚠️] Extension cache corrupted, re-installing...');
        }
    }
    // Install extension
    const extension = await installSinglefileExtension();
    // Export extension metadata for chrome_session to load
    if (extension) {
        // Write extension info to a cache file that chrome_session can read
        await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
        await fs.promises.writeFile(
            cacheFile,
            JSON.stringify(extension, null, 2)
        );
        console.log(`[+] Extension metadata written to ${cacheFile}`);
    }
    return extension;
}
// Export functions for use by other plugins
module.exports = {
    EXTENSION,
    installSinglefileExtension,
    saveSinglefileWithExtension,
    saveSinglefileWithCLI,
};
// Run if executed directly (hook invocation); exit code signals success/failure
// to the orchestrator.
if (require.main === module) {
    main().then(() => {
        console.log('[✓] SingleFile extension setup complete');
        process.exit(0);
    }).catch(err => {
        console.error('[❌] SingleFile extension setup failed:', err);
        process.exit(1);
    });
}

View File

@@ -0,0 +1,328 @@
#!/usr/bin/env python3
"""
Archive a URL using SingleFile.
Usage: on_Snapshot__singlefile.py --url=<url> --snapshot-id=<uuid>
Output: Writes singlefile.html to $PWD
Environment variables:
SINGLEFILE_BINARY: Path to SingleFile binary
SINGLEFILE_TIMEOUT: Timeout in seconds (default: 120)
SINGLEFILE_USER_AGENT: User agent string (optional)
SINGLEFILE_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
SINGLEFILE_COOKIES_FILE: Path to cookies file (optional)
SINGLEFILE_EXTRA_ARGS: Extra arguments for SingleFile (space-separated)
# Feature toggle
SAVE_SINGLEFILE: Enable SingleFile archiving (default: True)
# Chrome binary (SingleFile needs Chrome)
CHROME_BINARY: Path to Chrome/Chromium binary
# Fallback to ARCHIVING_CONFIG values if SINGLEFILE_* not set:
TIMEOUT: Fallback timeout
USER_AGENT: Fallback user agent
CHECK_SSL_VALIDITY: Fallback SSL check
COOKIES_FILE: Fallback cookies file
"""
import json
import os
import shutil
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'singlefile'
BIN_NAME = 'single-file'        # npm package's CLI binary name
BIN_PROVIDERS = 'npm,env'       # providers that can satisfy the dependency
OUTPUT_DIR = 'singlefile'       # output subdir, relative to $PWD at hook time
OUTPUT_FILE = 'singlefile.html'
def get_env(name: str, default: str = '') -> str:
    """Return the stripped value of env var *name*, or *default* when unset."""
    return os.environ.get(name, default).strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse env var *name* as a boolean; unrecognized values yield *default*."""
    token = get_env(name, '').lower()
    if token in {'true', '1', 'yes', 'on'}:
        return True
    if token in {'false', '0', 'no', 'off'}:
        return False
    return default


def get_env_int(name: str, default: int = 0) -> int:
    """Parse env var *name* as an int, falling back to *default* on bad input."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
STATICFILE_DIR = 'staticfile'


def has_staticfile_output() -> bool:
    """Return True when the staticfile extractor already produced output here.

    "Output" means the staticfile/ dir exists in the cwd and is non-empty.
    """
    candidate = Path(STATICFILE_DIR)
    if not candidate.exists():
        return False
    return next(candidate.iterdir(), None) is not None
# Chrome binary search paths.
# Bare names are resolved via $PATH by find_chrome(); absolute paths
# (macOS app bundles) are checked directly.
CHROMIUM_BINARY_NAMES_LINUX = [
    'chromium', 'chromium-browser', 'chromium-browser-beta',
    'chromium-browser-unstable', 'chromium-browser-canary', 'chromium-browser-dev',
]
CHROME_BINARY_NAMES_LINUX = [
    'google-chrome', 'google-chrome-stable', 'google-chrome-beta',
    'google-chrome-canary', 'google-chrome-unstable', 'google-chrome-dev', 'chrome',
]
CHROME_BINARY_NAMES_MACOS = [
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
]
CHROMIUM_BINARY_NAMES_MACOS = ['/Applications/Chromium.app/Contents/MacOS/Chromium']
# Search order: Chrome before Chromium, Linux names before macOS bundle paths.
ALL_CHROME_BINARIES = (
    CHROME_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_LINUX +
    CHROME_BINARY_NAMES_MACOS + CHROMIUM_BINARY_NAMES_MACOS
)
def find_singlefile() -> str | None:
    """Resolve the SingleFile binary path: env override first, then $PATH."""
    configured = get_env('SINGLEFILE_BINARY')
    if configured and os.path.isfile(configured):
        return configured
    for candidate in ('single-file', 'singlefile'):
        resolved = shutil.which(candidate)
        if resolved:
            return resolved
    return None
def find_chrome() -> str | None:
    """Resolve a Chrome/Chromium binary: env override, then known names/paths."""
    configured = get_env('CHROME_BINARY')
    if configured and os.path.isfile(configured):
        return configured
    for candidate in ALL_CHROME_BINARIES:
        if '/' in candidate:
            # Absolute path (macOS app bundle): check directly.
            if os.path.isfile(candidate):
                return candidate
        else:
            # Bare command name: search $PATH.
            resolved = shutil.which(candidate)
            if resolved:
                return resolved
    return None
def get_version(binary: str) -> str:
    """Return `binary --version` output (max 64 chars), or '' on any failure."""
    try:
        proc = subprocess.run(
            [binary, '--version'],
            capture_output=True, text=True, timeout=10,
        )
    except Exception:
        return ''
    return proc.stdout.strip()[:64]
CHROME_SESSION_DIR = 'chrome_session'
def get_cdp_url() -> str | None:
"""Get CDP URL from chrome_session if available."""
cdp_file = Path(CHROME_SESSION_DIR) / 'cdp_url.txt'
if cdp_file.exists():
return cdp_file.read_text().strip()
return None
def get_port_from_cdp_url(cdp_url: str) -> str | None:
"""Extract port from CDP WebSocket URL (ws://127.0.0.1:PORT/...)."""
import re
match = re.search(r':(\d+)/', cdp_url)
if match:
return match.group(1)
return None
def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Archive URL using SingleFile.
    If a Chrome session exists (from chrome_session extractor), connects to it via CDP.
    Otherwise launches a new Chrome instance.
    Returns: (success, output_path, error_message)
    """
    # Get config from env (with SINGLEFILE_ prefix or fallback to ARCHIVING_CONFIG style).
    # NOTE(review): fallback default here is 120s while the JSON schema declares 60 —
    # confirm which is intended.
    timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120)
    user_agent = get_env('SINGLEFILE_USER_AGENT') or get_env('USER_AGENT', '')
    check_ssl = get_env_bool('SINGLEFILE_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
    cookies_file = get_env('SINGLEFILE_COOKIES_FILE') or get_env('COOKIES_FILE', '')
    extra_args = get_env('SINGLEFILE_EXTRA_ARGS', '')
    chrome = find_chrome()
    cmd = [binary]
    # Try to use existing Chrome session via CDP
    cdp_url = get_cdp_url()
    if cdp_url:
        # SingleFile can connect to existing browser via WebSocket
        # Extract port from CDP URL (ws://127.0.0.1:PORT/...)
        # NOTE(review): when cdp_url exists but no port can be parsed, neither
        # --browser-server nor --browser-executable-path is passed — verify
        # single-file's default browser discovery is acceptable in that case.
        port = get_port_from_cdp_url(cdp_url)
        if port:
            cmd.extend(['--browser-server', f'http://127.0.0.1:{port}'])
    elif chrome:
        cmd.extend(['--browser-executable-path', chrome])
    # Common options
    cmd.extend([
        '--browser-headless',
    ])
    # SSL handling
    if not check_ssl:
        cmd.append('--browser-ignore-insecure-certs')
    if user_agent:
        cmd.extend(['--browser-user-agent', user_agent])
    if cookies_file and Path(cookies_file).is_file():
        cmd.extend(['--browser-cookies-file', cookies_file])
    if extra_args:
        # Naive whitespace split: extra args containing quoted spaces won't survive.
        cmd.extend(extra_args.split())
    # Create output directory (relative to cwd, which is the snapshot dir)
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(exist_ok=True)
    output_path = output_dir / OUTPUT_FILE
    cmd.extend([url, str(output_path)])
    try:
        result = subprocess.run(cmd, capture_output=True, timeout=timeout)
        # Success is judged by the output file, not the exit code.
        if output_path.exists() and output_path.stat().st_size > 0:
            return True, str(output_path), ''
        else:
            # Map well-known Chrome network errors to short messages.
            stderr = result.stderr.decode('utf-8', errors='replace')
            if 'ERR_NAME_NOT_RESOLVED' in stderr:
                return False, None, 'DNS resolution failed'
            if 'ERR_CONNECTION_REFUSED' in stderr:
                return False, None, 'Connection refused'
            return False, None, f'SingleFile failed: {stderr[:200]}'
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='URL to archive')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Archive a URL using SingleFile.

    Emits structured key=value lines (START_TS/END_TS/STATUS/...) plus a
    final RESULT_JSON line for the orchestrator, and exits 0 on success or
    skip, 1 on failure.
    """
    start_ts = datetime.now(timezone.utc)
    version = ''
    output = None
    status = 'failed'
    error = ''
    binary = None
    cmd_str = ''
    try:
        # Check if SingleFile is enabled
        if not get_env_bool('SAVE_SINGLEFILE', True):
            print('Skipping SingleFile (SAVE_SINGLEFILE=False)')
            status = 'skipped'
            end_ts = datetime.now(timezone.utc)
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={end_ts.isoformat()}')
            print(f'STATUS={status}')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}')
            # sys.exit raises SystemExit, which does NOT inherit Exception,
            # so it passes through the broad handler below.
            sys.exit(0)
        # Check if staticfile extractor already handled this (permanent skip)
        if has_staticfile_output():
            print(f'Skipping SingleFile - staticfile extractor already downloaded this')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)  # Permanent skip - staticfile already handled
        # Find binary; on failure, emit machine-readable dependency hints on stderr
        binary = find_singlefile()
        if not binary:
            print(f'ERROR: SingleFile binary not found', file=sys.stderr)
            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
            print(f'INSTALL_HINT=npm install -g single-file-cli', file=sys.stderr)
            sys.exit(1)
        version = get_version(binary)
        # Abbreviated command line for logging (not the full argv actually run)
        cmd_str = f'{binary} {url} {OUTPUT_DIR}/{OUTPUT_FILE}'
        # Run extraction
        success, output, error = save_singlefile(url, binary)
        status = 'succeeded' if success else 'failed'
        if success and output:
            size = Path(output).stat().st_size
            print(f'SingleFile saved ({size} bytes)')
    except Exception as e:
        # Catch-all so the structured result lines below are always printed.
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if cmd_str:
        print(f'CMD={cmd_str}')
    if version:
        print(f'VERSION={version}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,110 @@
"""
Integration tests - archive example.com with SingleFile and verify output
"""
import json
import os
import subprocess
import tempfile
from pathlib import Path
import pytest
# Paths to the plugin under test, resolved relative to this test file.
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__04_singlefile.js"

TEST_URL = "https://example.com"

# Check if single-file CLI is available (gates the network-dependent tests below)
try:
    result = subprocess.run(
        ["which", "single-file"],
        capture_output=True,
        timeout=5
    )
    SINGLEFILE_CLI_AVAILABLE = result.returncode == 0
# Fixed: was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt.
except Exception:
    SINGLEFILE_CLI_AVAILABLE = False
@pytest.mark.skipif(
    not SINGLEFILE_CLI_AVAILABLE,
    reason="single-file CLI not installed (npm install -g single-file-cli)"
)
def test_archives_example_com():
    """Archive example.com and verify output contains expected content.

    Network-dependent integration test: runs the single-file CLI (which
    drives headless Chrome) and inspects the self-contained HTML it writes.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        output_dir = Path(tmpdir) / "singlefile"
        output_dir.mkdir()
        output_file = output_dir / "singlefile.html"
        # Run single-file CLI
        result = subprocess.run(
            [
                "single-file",
                "--browser-headless",
                TEST_URL,
                str(output_file)
            ],
            capture_output=True,
            text=True,
            timeout=120
        )
        assert result.returncode == 0, f"Archive failed: {result.stderr}"
        # Verify output exists
        assert output_file.exists(), "Output file not created"
        # Read and verify content
        html_content = output_file.read_text()
        file_size = output_file.stat().st_size
        # Should be substantial (embedded resources)
        assert file_size > 900, f"Output too small: {file_size} bytes"
        # Verify HTML structure (SingleFile minifies, so <head> tag may be omitted)
        assert "<html" in html_content.lower()
        assert "<body" in html_content.lower()
        assert "<title>" in html_content.lower() or "title>" in html_content.lower()
        # Verify example.com content is actually present
        assert "example domain" in html_content.lower(), "Missing 'Example Domain' title"
        assert "this domain is" in html_content.lower(), "Missing example.com description text"
        assert "iana.org" in html_content.lower(), "Missing IANA link"
        # Fixed: removed a second `file_size > 900` assertion that exactly
        # duplicated the size check above.
@pytest.mark.skipif(not SINGLEFILE_CLI_AVAILABLE, reason="single-file CLI not installed")
def test_different_urls_produce_different_outputs():
    """Verify different URLs produce different archived content"""
    target_urls = ["https://example.com", "https://example.org"]
    with tempfile.TemporaryDirectory() as tmpdir:
        outputs = {}
        for target in target_urls:
            # Derive a per-URL filename (e.g. example_com.html).
            slug = target.replace('https://', '').replace('.', '_')
            archive_path = Path(tmpdir) / f"{slug}.html"
            proc = subprocess.run(
                ["single-file", "--browser-headless", target, str(archive_path)],
                capture_output=True,
                timeout=120,
            )
            if proc.returncode == 0 and archive_path.exists():
                outputs[target] = archive_path.read_text()
        assert len(outputs) == 2, "Should archive both URLs"
        # The two archives must not be byte-identical...
        first, second = list(outputs.keys())
        assert outputs[first] != outputs[second], "Different URLs should produce different outputs"
        # ...and each must mention its own domain.
        assert "example.com" in outputs[first]
        assert "example.org" in outputs[second]

View File

@@ -0,0 +1,385 @@
/**
* Unit tests for singlefile plugin
*
* Run with: node --test tests/test_singlefile.js
*/
const assert = require('assert');
const fs = require('fs');
const path = require('path');
const { describe, it, before, after, beforeEach, afterEach } = require('node:test');
// Test fixtures
const TEST_DIR = path.join(__dirname, '.test_fixtures');
const TEST_EXTENSIONS_DIR = path.join(TEST_DIR, 'chrome_extensions');
const TEST_DOWNLOADS_DIR = path.join(TEST_DIR, 'chrome_downloads');
describe('singlefile plugin', () => {
before(() => {
    // Create the shared fixture root once for the whole suite.
    if (!fs.existsSync(TEST_DIR)) {
        fs.mkdirSync(TEST_DIR, { recursive: true });
    }
});
after(() => {
    // Remove all fixtures, even ones individual tests left behind.
    if (fs.existsSync(TEST_DIR)) {
        fs.rmSync(TEST_DIR, { recursive: true, force: true });
    }
});
// Loads the real plugin module and pins its extension identity constants.
describe('EXTENSION metadata', () => {
    it('should have correct webstore_id', () => {
        const { EXTENSION } = require('../on_Snapshot__04_singlefile.js');
        assert.strictEqual(EXTENSION.webstore_id, 'mpiodijhokgodhhofbcjdecpffjipkle');
    });
    it('should have correct name', () => {
        const { EXTENSION } = require('../on_Snapshot__04_singlefile.js');
        assert.strictEqual(EXTENSION.name, 'singlefile');
    });
});
describe('installSinglefileExtension', () => {
    beforeEach(() => {
        // Point the plugin at an isolated fixture dir.
        process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
        if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
            fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
        }
    });
    afterEach(() => {
        if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
            fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
        }
        delete process.env.CHROME_EXTENSIONS_DIR;
    });
    it('should use cached extension if available', async () => {
        const { installSinglefileExtension } = require('../on_Snapshot__04_singlefile.js');
        // Create fake cache
        // NOTE(review): the plugin reads CHROME_EXTENSIONS_DIR at module load,
        // so the env override set in beforeEach may not take effect if the
        // module was already required — presumably relies on require-order;
        // verify.
        const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'singlefile.extension.json');
        const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_singlefile');
        fs.mkdirSync(fakeExtensionDir, { recursive: true });
        fs.writeFileSync(
            path.join(fakeExtensionDir, 'manifest.json'),
            JSON.stringify({ version: '1.22.90' })
        );
        const fakeCache = {
            webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
            name: 'singlefile',
            unpacked_path: fakeExtensionDir,
            version: '1.22.90'
        };
        fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));
        const result = await installSinglefileExtension();
        assert.notStrictEqual(result, null);
        assert.strictEqual(result.webstore_id, 'mpiodijhokgodhhofbcjdecpffjipkle');
    });
});
describe('saveSinglefileWithExtension', () => {
beforeEach(() => {
process.env.CHROME_DOWNLOADS_DIR = TEST_DOWNLOADS_DIR;
if (!fs.existsSync(TEST_DOWNLOADS_DIR)) {
fs.mkdirSync(TEST_DOWNLOADS_DIR, { recursive: true });
}
});
afterEach(() => {
if (fs.existsSync(TEST_DOWNLOADS_DIR)) {
fs.rmSync(TEST_DOWNLOADS_DIR, { recursive: true });
}
delete process.env.CHROME_DOWNLOADS_DIR;
});
it('should require extension and version to be present', () => {
const mockExtension = {
name: 'singlefile',
version: '1.22.96',
id: 'test_id'
};
assert.ok(mockExtension.version);
assert.ok(mockExtension.id);
});
it('should filter unsupported URL schemes', () => {
const unsupportedSchemes = [
'about:',
'chrome:',
'chrome-extension:',
'data:',
'javascript:',
'blob:'
];
unsupportedSchemes.forEach(scheme => {
const testUrl = scheme + 'something';
const urlScheme = testUrl.split(':')[0];
assert.ok(unsupportedSchemes.some(s => s.startsWith(urlScheme)));
});
});
it('should wait for file to appear in downloads directory', async () => {
const checkDelay = 3000; // 3 seconds
const maxTries = 10;
// Total max wait time
const maxWaitTime = checkDelay * maxTries;
assert.strictEqual(maxWaitTime, 30000); // 30 seconds
});
it('should find downloaded file by checking URL in HTML header', () => {
const testUrl = 'https://example.com';
const mockHtml = `<!-- url: ${testUrl} --><html><head><meta charset="utf-8"></head></html>`;
// Should be able to extract URL from header
const headerPart = mockHtml.split('meta charset')[0];
assert.ok(headerPart.includes(`url: ${testUrl}`));
});
it('should move file from downloads to output directory', () => {
const downloadPath = path.join(TEST_DOWNLOADS_DIR, 'temp_file.html');
const outputDir = 'singlefile';
const outputFile = 'singlefile.html';
const outputPath = path.join(outputDir, outputFile);
// Verify paths are different
assert.notStrictEqual(downloadPath, outputPath);
});
});
// NOTE(review): these assert on local literals only; the CLI fallback path
// itself is never invoked here.
describe('saveSinglefileWithCLI', () => {
    it('should use single-file-cli as fallback', () => {
        const cliCommand = 'single-file';
        // Should check for CLI availability
        assert.strictEqual(typeof cliCommand, 'string');
        assert.ok(cliCommand.length > 0);
    });
    it('should pass correct arguments to CLI', () => {
        const args = [
            '--browser-headless',
            'https://example.com',
            'singlefile/singlefile.html'
        ];
        assert.ok(args.includes('--browser-headless'));
        assert.ok(args.some(arg => arg.startsWith('http')));
    });
    it('should handle optional CLI arguments', () => {
        const options = {
            userAgent: 'Mozilla/5.0...',
            cookiesFile: '/path/to/cookies.txt',
            ignoreSSL: true
        };
        // Optional args should be conditionally added
        if (options.userAgent) {
            assert.ok(options.userAgent.length > 0);
        }
        if (options.ignoreSSL) {
            assert.strictEqual(options.ignoreSSL, true);
        }
    });
});
// Documents the hook-file naming convention: on_Snapshot__NN_name.js where
// NN orders execution.
describe('priority and execution order', () => {
    it('should have priority 04 (early)', () => {
        const filename = 'on_Snapshot__04_singlefile.js';
        const match = filename.match(/on_Snapshot__(\d+)_/);
        assert.ok(match);
        const priority = parseInt(match[1]);
        assert.strictEqual(priority, 4);
    });
    it('should run before chrome_session (priority 20)', () => {
        const extensionPriority = 4;
        const chromeSessionPriority = 20;
        assert.ok(extensionPriority < chromeSessionPriority);
    });
    it('should install extensions in correct order', () => {
        const priorities = {
            captcha2: 1,
            istilldontcareaboutcookies: 2,
            ublock: 3,
            singlefile: 4
        };
        // Should be in ascending order
        assert.ok(priorities.captcha2 < priorities.istilldontcareaboutcookies);
        assert.ok(priorities.istilldontcareaboutcookies < priorities.ublock);
        assert.ok(priorities.ublock < priorities.singlefile);
    });
});
describe('output structure', () => {
    it('should define output directory and file', () => {
        // NOTE(review): tautological — compares local literals to themselves
        // rather than importing OUTPUT_DIR/OUTPUT_FILE from the plugin.
        const OUTPUT_DIR = 'singlefile';
        const OUTPUT_FILE = 'singlefile.html';
        assert.strictEqual(OUTPUT_DIR, 'singlefile');
        assert.strictEqual(OUTPUT_FILE, 'singlefile.html');
    });
    it('should create output directory if not exists', () => {
        const outputDir = path.join(TEST_DIR, 'singlefile');
        // Should create directory
        if (!fs.existsSync(outputDir)) {
            fs.mkdirSync(outputDir, { recursive: true });
        }
        assert.ok(fs.existsSync(outputDir));
        // Cleanup
        fs.rmSync(outputDir, { recursive: true });
    });
});
// Documents the intended preference order (extension first, CLI fallback).
describe('extension vs CLI fallback', () => {
    it('should prefer extension over CLI', () => {
        const preferenceOrder = [
            'extension',
            'cli'
        ];
        assert.strictEqual(preferenceOrder[0], 'extension');
        assert.strictEqual(preferenceOrder[1], 'cli');
    });
    it('should fallback to CLI if extension unavailable', () => {
        const extensionAvailable = false;
        const cliAvailable = true;
        let method;
        if (extensionAvailable) {
            method = 'extension';
        } else if (cliAvailable) {
            method = 'cli';
        }
        assert.strictEqual(method, 'cli');
    });
    it('should use extension if available', () => {
        const extensionAvailable = true;
        let method;
        if (extensionAvailable) {
            method = 'extension';
        } else {
            method = 'cli';
        }
        assert.strictEqual(method, 'extension');
    });
});
describe('file matching and validation', () => {
    beforeEach(() => {
        if (!fs.existsSync(TEST_DOWNLOADS_DIR)) {
            fs.mkdirSync(TEST_DOWNLOADS_DIR, { recursive: true });
        }
    });
    afterEach(() => {
        if (fs.existsSync(TEST_DOWNLOADS_DIR)) {
            fs.rmSync(TEST_DOWNLOADS_DIR, { recursive: true });
        }
    });
    it('should filter HTML files from downloads', () => {
        // Create mock download files
        const files = [
            'example.html',
            'test.pdf',
            'image.png',
            'page.html'
        ];
        const htmlFiles = files.filter(f => f.endsWith('.html'));
        assert.strictEqual(htmlFiles.length, 2);
        assert.ok(htmlFiles.includes('example.html'));
        assert.ok(htmlFiles.includes('page.html'));
    });
    it('should match URL in HTML header comment', () => {
        // Mirrors the plugin's "split on 'meta charset'" header heuristic.
        const testUrl = 'https://example.com/page';
        const htmlContent = `<!--
 Page saved with SingleFile
 url: ${testUrl}
 saved date: 2024-01-01
-->
<html>...</html>`;
        const headerSection = htmlContent.split('meta charset')[0] || htmlContent.split('<html>')[0];
        assert.ok(headerSection.includes(`url: ${testUrl}`));
    });
    it('should handle multiple new files in downloads', () => {
        // Mirrors the before/after set-difference logic in the plugin.
        const filesBefore = new Set(['old1.html', 'old2.html']);
        const filesAfter = ['old1.html', 'old2.html', 'new1.html', 'new2.html'];
        const filesNew = filesAfter.filter(f => !filesBefore.has(f));
        assert.strictEqual(filesNew.length, 2);
        assert.ok(filesNew.includes('new1.html'));
        assert.ok(filesNew.includes('new2.html'));
    });
});
describe('error handling', () => {
    it('should timeout after max wait time', () => {
        // Poll every 3s, at most 10 times => a 30-second total budget.
        const checkDelay = 3000; // ms
        const maxTries = 10;
        const timeoutMs = checkDelay * maxTries;
        assert.strictEqual(timeoutMs, 30000); // 30 seconds
    });
    it('should handle missing extension gracefully', () => {
        // A null extension (or one without a version) must be treated as missing.
        // Previously the assertion was wrapped in an `if`, so the test passed
        // vacuously whenever the guard was false; assert the predicate directly.
        const extension = null;
        assert.ok(!extension || !extension.version);
    });
    it('should handle file not found after waiting', () => {
        const filesNew = [];
        const maxWaitReached = true;
        // Assert the "no new files after the wait budget" condition itself,
        // then that the extractor result is null in that situation (the old
        // version only asserted inside an `if`, which could never fail).
        assert.ok(filesNew.length === 0 && maxWaitReached);
        const result = (filesNew.length === 0 && maxWaitReached) ? null : filesNew;
        assert.strictEqual(result, null);
    });
});
});

View File

@@ -0,0 +1,141 @@
"""
Unit tests for singlefile plugin
Tests invoke the plugin hook as an external process and verify outputs/side effects.
"""
import json
import os
import subprocess
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__04_singlefile.js"
def test_install_script_exists():
    """The singlefile snapshot hook script must be present on disk."""
    script_present = INSTALL_SCRIPT.exists()
    assert script_present, f"Install script not found: {INSTALL_SCRIPT}"
def test_extension_metadata():
    """Loading the hook under node must expose the SingleFile webstore metadata."""
    with tempfile.TemporaryDirectory() as tmpdir:
        env = {**os.environ, "CHROME_EXTENSIONS_DIR": str(Path(tmpdir) / "chrome_extensions")}
        # NOTE(review): interpolating the path into a JS string literal assumes it
        # contains no quotes or backslashes (true for POSIX tmpdirs) — confirm if
        # Windows support is needed.
        js_snippet = f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"
        result = subprocess.run(
            ["node", "-e", js_snippet],
            capture_output=True,
            text=True,
            env=env,
        )
        assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"
        metadata = json.loads(result.stdout)
        assert metadata["webstore_id"] == "mpiodijhokgodhhofbcjdecpffjipkle"
        assert metadata["name"] == "singlefile"
def test_install_creates_cache():
    """Running the hook should write a singlefile.extension.json cache file."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        env = {**os.environ, "CHROME_EXTENSIONS_DIR": str(ext_dir)}
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60,
        )
        # The hook's output should at least mention the extension by name.
        assert "SingleFile" in result.stdout or "singlefile" in result.stdout
        # The install must leave a cache file behind with the expected identity.
        cache_file = ext_dir / "singlefile.extension.json"
        assert cache_file.exists(), "Cache file should be created"
        cache_data = json.loads(cache_file.read_text())
        assert cache_data["webstore_id"] == "mpiodijhokgodhhofbcjdecpffjipkle"
        assert cache_data["name"] == "singlefile"
def test_install_uses_existing_cache():
    """A pre-populated extension directory should let the hook succeed (via cache)."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        # Seed a fake already-installed extension with a minimal manifest.
        fake_extension_dir = ext_dir / "mpiodijhokgodhhofbcjdecpffjipkle__singlefile"
        fake_extension_dir.mkdir(parents=True)
        (fake_extension_dir / "manifest.json").write_text(
            json.dumps({"version": "1.22.96", "name": "SingleFile"})
        )
        env = {**os.environ, "CHROME_EXTENSIONS_DIR": str(ext_dir)}
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=30,
        )
        # Either path (cache hit or fresh install) must exit cleanly.
        assert result.returncode == 0
def test_no_configuration_required():
    """The hook must succeed with no API keys or extra configuration set."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        # Only the extensions dir is provided — deliberately no credentials.
        env = {**os.environ, "CHROME_EXTENSIONS_DIR": str(ext_dir)}
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60,
        )
        assert result.returncode == 0
def test_priority_order():
    """The hook filename must encode snapshot priority 04."""
    script_name = INSTALL_SCRIPT.name
    # Both checks kept: the substring check and the stricter prefix convention.
    assert "04" in script_name, "SingleFile should have priority 04"
    assert script_name.startswith("on_Snapshot__04_"), "Should follow priority naming convention"
def test_output_directory_structure():
    """The hook source must reference the singlefile output dir and HTML output."""
    source_text = INSTALL_SCRIPT.read_text()
    lowered = source_text.lower()
    # Output directory name appears somewhere in the script.
    assert "singlefile" in lowered
    # HTML output is referenced (".html" implies "html", so the disjunction
    # reduces to the lowercase substring check — kept equivalent to the original).
    assert ".html" in source_text or "html" in lowered