mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-04 23:07:56 +10:00
wip major changes
This commit is contained in:
53
archivebox/plugins/singlefile/config.json
Normal file
53
archivebox/plugins/singlefile/config.json
Normal file
@@ -0,0 +1,53 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SAVE_SINGLEFILE": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Enable SingleFile archiving"
|
||||
},
|
||||
"SINGLEFILE_BINARY": {
|
||||
"type": "string",
|
||||
"default": "single-file",
|
||||
"x-aliases": ["SINGLE_FILE_BINARY"],
|
||||
"description": "Path to single-file binary"
|
||||
},
|
||||
"NODE_BINARY": {
|
||||
"type": "string",
|
||||
"default": "node",
|
||||
"description": "Path to Node.js binary"
|
||||
},
|
||||
"SINGLEFILE_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 60,
|
||||
"minimum": 10,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for SingleFile in seconds"
|
||||
},
|
||||
"SINGLEFILE_USER_AGENT": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-fallback": "USER_AGENT",
|
||||
"description": "User agent string"
|
||||
},
|
||||
"SINGLEFILE_COOKIES_FILE": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-fallback": "COOKIES_FILE",
|
||||
"description": "Path to cookies file"
|
||||
},
|
||||
"SINGLEFILE_ARGS": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [],
|
||||
"description": "Default single-file arguments"
|
||||
},
|
||||
"SINGLEFILE_EXTRA_ARGS": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"description": "Extra arguments for single-file"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,129 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validation hook for single-file binary.
|
||||
|
||||
Runs at crawl start to verify single-file (npm package) is available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str) -> str | None:
|
||||
"""Get version string from single-file binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
return result.stdout.strip().split('\n')[0][:32]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
# For scripts, hash the script content
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _binary_info(abspath: str, binprovider: str) -> dict:
    """Build the InstalledBinary-style info dict for a resolved single-file path."""
    return {
        'name': 'single-file',
        'abspath': abspath,
        'version': get_binary_version(abspath),
        'sha256': get_binary_hash(abspath),
        'binprovider': binprovider,
    }


def find_singlefile() -> dict | None:
    """Find the single-file binary.

    Search order:
      1. $SINGLEFILE_BINARY env var (binprovider 'env')
      2. `single-file` / `singlefile` on $PATH (binprovider 'npm')
      3. well-known npm global/local install locations (binprovider 'npm')

    Returns:
        An InstalledBinary-style dict (name/abspath/version/sha256/binprovider),
        or None when no binary could be located.
    """
    # 1. Explicit override via environment variable
    env_path = os.environ.get('SINGLEFILE_BINARY', '')
    if env_path and Path(env_path).is_file():
        return _binary_info(env_path, 'env')

    # 2. Anything already on $PATH (both common spellings)
    for name in ['single-file', 'singlefile']:
        abspath = shutil.which(name)
        if abspath:
            return _binary_info(abspath, 'npm')

    # 3. Common npm install locations not necessarily on $PATH
    npm_paths = [
        Path.home() / '.npm-global/bin/single-file',
        Path.home() / 'node_modules/.bin/single-file',
        Path('/usr/local/bin/single-file'),
        Path('/usr/local/lib/node_modules/.bin/single-file'),
    ]
    for path in npm_paths:
        if path.is_file():
            return _binary_info(str(path), 'npm')

    return None
|
||||
|
||||
|
||||
def main():
    """Emit JSONL records describing the single-file install state.

    On success: prints an InstalledBinary record plus Machine config updates
    (SINGLEFILE_BINARY, and SINGLEFILE_VERSION when detected), then exits 0.
    On failure: prints a Dependency request record and exits 1.
    """
    result = find_singlefile()

    if result and result.get('abspath'):
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))

        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/SINGLEFILE_BINARY',
            'value': result['abspath'],
        }))

        # Only record a version when detection actually succeeded
        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/SINGLEFILE_VERSION',
                'value': result['version'],
            }))

        sys.exit(0)
    else:
        # Ask the orchestrator to install the missing dependency
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'single-file',
            'bin_providers': 'npm,env',
        }))
        # plain string: the original used an f-string with no placeholders
        print("single-file binary not found", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()
|
||||
270
archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js
Executable file
270
archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js
Executable file
@@ -0,0 +1,270 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* SingleFile Extension Plugin
|
||||
*
|
||||
* Installs and uses the SingleFile Chrome extension for archiving complete web pages.
|
||||
* Falls back to single-file-cli if the extension is not available.
|
||||
*
|
||||
* Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle
|
||||
*
|
||||
* Priority: 04 (early) - Must install before Chrome session starts
|
||||
* Hook: on_Snapshot
|
||||
*
|
||||
* This extension automatically:
|
||||
* - Saves complete web pages as single HTML files
|
||||
* - Inlines all resources (CSS, JS, images, fonts)
|
||||
* - Preserves page fidelity better than wget/curl
|
||||
* - Works with SPAs and dynamically loaded content
|
||||
*/
|
||||
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
const { promisify } = require('util');
|
||||
const { exec } = require('child_process');
|
||||
|
||||
const execAsync = promisify(exec);
|
||||
|
||||
// Import extension utilities
|
||||
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
|
||||
|
||||
// Extension metadata
|
||||
const EXTENSION = {
|
||||
webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
|
||||
name: 'singlefile',
|
||||
};
|
||||
|
||||
// Get extensions directory from environment or use default
|
||||
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
|
||||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
|
||||
|
||||
const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
|
||||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
|
||||
|
||||
const OUTPUT_DIR = 'singlefile';
|
||||
const OUTPUT_FILE = 'singlefile.html';
|
||||
|
||||
/**
 * Install the SingleFile Chrome extension via the shared chrome_extensions helper.
 *
 * @returns {Promise<Object|null>} extension metadata on success, null on failure
 */
async function installSinglefileExtension() {
    console.log('[*] Installing SingleFile extension...');

    const ext = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
    if (ext) {
        console.log('[+] SingleFile extension installed');
        console.log('[+] Web pages will be saved as single HTML files');
        return ext;
    }

    console.error('[❌] Failed to install SingleFile extension');
    return null;
}
|
||||
|
||||
/**
 * Sleep for `ms` milliseconds.
 *
 * @param {number} ms - delay in milliseconds
 * @returns {Promise<void>} resolves after the delay
 */
function wait(ms) {
    return new Promise((resolve) => {
        setTimeout(resolve, ms);
    });
}
|
||||
|
||||
/**
|
||||
* Save a page using the SingleFile extension
|
||||
*
|
||||
* @param {Object} page - Puppeteer page object
|
||||
* @param {Object} extension - Extension metadata with dispatchAction method
|
||||
* @param {Object} options - Additional options
|
||||
* @returns {Promise<string|null>} - Path to saved file or null on failure
|
||||
*/
|
||||
async function saveSinglefileWithExtension(page, extension, options = {}) {
    // Require a loaded extension (version is set once the manifest was read)
    if (!extension || !extension.version) {
        throw new Error('SingleFile extension not found or not loaded');
    }

    const url = await page.url();

    // Check for unsupported URL schemes — the extension cannot capture these
    const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
    const scheme = url.split(':')[0];
    if (URL_SCHEMES_IGNORED.includes(scheme)) {
        console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`);
        return null;
    }

    // Ensure downloads directory exists
    await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true });

    // Snapshot pre-existing .html downloads so we only consider new arrivals
    const files_before = new Set(
        (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
            .filter(fn => fn.endsWith('.html'))
    );

    // Ensure output directory exists
    await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
    const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);

    console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);

    // Bring page to front (extension action button acts on foreground tab)
    await page.bringToFront();

    // Trigger the extension's action (toolbar button click)
    await extension.dispatchAction();

    // Poll the downloads directory for the saved file (up to 10 x 3s = 30s)
    const check_delay = 3000; // 3 seconds
    const max_tries = 10;
    let files_new = [];

    for (let attempt = 0; attempt < max_tries; attempt++) {
        await wait(check_delay);

        const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
            .filter(fn => fn.endsWith('.html'));

        files_new = files_after.filter(file => !files_before.has(file));

        if (files_new.length === 0) {
            continue;
        }

        // Find the matching file by checking if it contains the URL in the HTML header.
        // NOTE(review): assumes SingleFile writes a `url: <page url>` line in its header
        // comment before the first `meta charset` tag — confirm this holds across
        // SingleFile versions.
        for (const file of files_new) {
            const dl_path = path.join(CHROME_DOWNLOADS_DIR, file);
            const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
            const dl_header = dl_text.split('meta charset')[0];

            if (dl_header.includes(`url: ${url}`)) {
                console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`);
                await fs.promises.rename(dl_path, out_path);
                return out_path;
            }
        }
    }

    // Timed out: report what DID appear to aid debugging, then signal failure
    console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`);
    console.warn(`[⚠️] New files found: ${files_new.join(', ')}`);
    return null;
}
|
||||
|
||||
/**
|
||||
* Save a page using single-file-cli (fallback method)
|
||||
*
|
||||
* @param {string} url - URL to archive
|
||||
* @param {Object} options - Additional options
|
||||
* @returns {Promise<string|null>} - Path to saved file or null on failure
|
||||
*/
|
||||
async function saveSinglefileWithCLI(url, options = {}) {
    console.log('[*] Falling back to single-file-cli...');

    // Find single-file binary on $PATH
    let binary = null;
    try {
        const { stdout } = await execAsync('which single-file');
        binary = stdout.trim();
    } catch (err) {
        console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli');
        return null;
    }

    // Ensure output directory exists
    await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
    const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);

    // Build command (argv list; quoted for the shell just before execution)
    const cmd = [
        binary,
        '--browser-headless',
        url,
        out_path,
    ];

    // Add optional args
    if (options.userAgent) {
        cmd.splice(2, 0, '--browser-user-agent', options.userAgent);
    }
    if (options.cookiesFile && fs.existsSync(options.cookiesFile)) {
        cmd.splice(2, 0, '--browser-cookies-file', options.cookiesFile);
    }
    if (options.ignoreSSL) {
        cmd.splice(2, 0, '--browser-ignore-insecure-certs');
    }

    // FIX: the original joined argv with spaces and passed it unquoted through
    // a shell — URLs, paths, and user agents containing spaces or shell
    // metacharacters would break the command (or allow injection).
    // POSIX single-quote each argument, escaping embedded single quotes.
    const shellQuote = (arg) => `'${String(arg).replace(/'/g, `'\\''`)}'`;

    // Execute
    try {
        const timeout = options.timeout || 120000;
        await execAsync(cmd.map(shellQuote).join(' '), { timeout });

        // Success is judged by a non-empty output file, not the exit code
        if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) {
            console.log(`[+] SingleFile saved via CLI: ${out_path}`);
            return out_path;
        }

        console.error('[❌] SingleFile CLI completed but no output file found');
        return null;
    } catch (err) {
        console.error(`[❌] SingleFile CLI error: ${err.message}`);
        return null;
    }
}
|
||||
|
||||
/**
|
||||
* Main entry point - install extension before archiving
|
||||
*/
|
||||
async function main() {
    // Check if extension is already cached from a previous run
    const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json');

    if (fs.existsSync(cacheFile)) {
        try {
            const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
            // Validate the cache by checking the unpacked extension still exists
            const manifestPath = path.join(cached.unpacked_path, 'manifest.json');

            if (fs.existsSync(manifestPath)) {
                console.log('[*] SingleFile extension already installed (using cache)');
                return cached;
            }
        } catch (e) {
            // Cache file corrupted (bad JSON / missing fields) — fall through and re-install
            console.warn('[⚠️] Extension cache corrupted, re-installing...');
        }
    }

    // Install extension (cache miss or invalid cache)
    const extension = await installSinglefileExtension();

    // Export extension metadata for chrome_session to load
    if (extension) {
        // Write extension info to a cache file that chrome_session can read
        await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
        await fs.promises.writeFile(
            cacheFile,
            JSON.stringify(extension, null, 2)
        );
        console.log(`[+] Extension metadata written to ${cacheFile}`);
    }

    // May be null if installation failed — callers must handle that
    return extension;
}
|
||||
|
||||
// Export functions for use by other plugins (e.g. chrome_session drives
// saveSinglefileWithExtension against its live Puppeteer page)
module.exports = {
    EXTENSION,
    installSinglefileExtension,
    saveSinglefileWithExtension,
    saveSinglefileWithCLI,
};

// Run if executed directly: pre-install the extension before the Chrome
// session starts (this hook runs at priority 04, ahead of chrome_session)
if (require.main === module) {
    main().then(() => {
        console.log('[✓] SingleFile extension setup complete');
        process.exit(0);
    }).catch(err => {
        console.error('[❌] SingleFile extension setup failed:', err);
        process.exit(1);
    });
}
|
||||
328
archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py
Normal file
328
archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py
Normal file
@@ -0,0 +1,328 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Archive a URL using SingleFile.
|
||||
|
||||
Usage: on_Snapshot__singlefile.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Writes singlefile.html to $PWD
|
||||
|
||||
Environment variables:
|
||||
SINGLEFILE_BINARY: Path to SingleFile binary
|
||||
SINGLEFILE_TIMEOUT: Timeout in seconds (default: 120)
|
||||
SINGLEFILE_USER_AGENT: User agent string (optional)
|
||||
SINGLEFILE_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
|
||||
SINGLEFILE_COOKIES_FILE: Path to cookies file (optional)
|
||||
SINGLEFILE_EXTRA_ARGS: Extra arguments for SingleFile (space-separated)
|
||||
|
||||
# Feature toggle
|
||||
SAVE_SINGLEFILE: Enable SingleFile archiving (default: True)
|
||||
|
||||
# Chrome binary (SingleFile needs Chrome)
|
||||
CHROME_BINARY: Path to Chrome/Chromium binary
|
||||
|
||||
# Fallback to ARCHIVING_CONFIG values if SINGLEFILE_* not set:
|
||||
TIMEOUT: Fallback timeout
|
||||
USER_AGENT: Fallback user agent
|
||||
CHECK_SSL_VALIDITY: Fallback SSL check
|
||||
COOKIES_FILE: Fallback cookies file
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'singlefile'
|
||||
BIN_NAME = 'single-file'
|
||||
BIN_PROVIDERS = 'npm,env'
|
||||
OUTPUT_DIR = 'singlefile'
|
||||
OUTPUT_FILE = 'singlefile.html'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
    """Read env var `name` (falling back to `default`), with whitespace stripped."""
    value = os.environ.get(name, default)
    return value.strip()
|
||||
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse env var `name` as a boolean.

    Recognizes true/1/yes/on and false/0/no/off (case-insensitive);
    anything else — including an unset variable — yields `default`.
    """
    raw = get_env(name, '').lower()
    if raw in ('true', '1', 'yes', 'on'):
        return True
    return False if raw in ('false', '0', 'no', 'off') else default
|
||||
|
||||
|
||||
def get_env_int(name: str, default: int = 0) -> int:
    """Parse env var `name` as an int; return `default` when missing or unparseable."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
|
||||
|
||||
|
||||
STATICFILE_DIR = 'staticfile'
|
||||
|
||||
def has_staticfile_output() -> bool:
    """Check whether the staticfile extractor already produced output in $PWD.

    True only when the staticfile directory exists AND contains at least one entry.
    """
    staticfile_dir = Path(STATICFILE_DIR)
    if not staticfile_dir.exists():
        return False
    return any(staticfile_dir.iterdir())
|
||||
|
||||
|
||||
# Chrome binary search paths
|
||||
CHROMIUM_BINARY_NAMES_LINUX = [
|
||||
'chromium', 'chromium-browser', 'chromium-browser-beta',
|
||||
'chromium-browser-unstable', 'chromium-browser-canary', 'chromium-browser-dev',
|
||||
]
|
||||
CHROME_BINARY_NAMES_LINUX = [
|
||||
'google-chrome', 'google-chrome-stable', 'google-chrome-beta',
|
||||
'google-chrome-canary', 'google-chrome-unstable', 'google-chrome-dev', 'chrome',
|
||||
]
|
||||
CHROME_BINARY_NAMES_MACOS = [
|
||||
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
|
||||
'/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
|
||||
]
|
||||
CHROMIUM_BINARY_NAMES_MACOS = ['/Applications/Chromium.app/Contents/MacOS/Chromium']
|
||||
|
||||
ALL_CHROME_BINARIES = (
|
||||
CHROME_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_LINUX +
|
||||
CHROME_BINARY_NAMES_MACOS + CHROMIUM_BINARY_NAMES_MACOS
|
||||
)
|
||||
|
||||
|
||||
def find_singlefile() -> str | None:
    """Locate the SingleFile binary: $SINGLEFILE_BINARY first, then $PATH."""
    configured = get_env('SINGLEFILE_BINARY')
    if configured and os.path.isfile(configured):
        return configured

    # Fall back to $PATH, trying both common spellings
    candidates = ('single-file', 'singlefile')
    resolved = (shutil.which(name) for name in candidates)
    return next((p for p in resolved if p), None)
|
||||
|
||||
|
||||
def find_chrome() -> str | None:
    """Locate a Chrome/Chromium binary: $CHROME_BINARY first, then known names/paths."""
    configured = get_env('CHROME_BINARY')
    if configured and os.path.isfile(configured):
        return configured

    for candidate in ALL_CHROME_BINARIES:
        # Entries containing '/' are absolute paths (macOS app bundles);
        # bare names are resolved against $PATH instead.
        if '/' in candidate:
            if os.path.isfile(candidate):
                return candidate
            continue
        resolved = shutil.which(candidate)
        if resolved:
            return resolved

    return None
|
||||
|
||||
|
||||
def get_version(binary: str) -> str:
    """Return `<binary> --version` stdout (trimmed, max 64 chars), or '' on any error."""
    try:
        proc = subprocess.run(
            [binary, '--version'],
            capture_output=True,
            text=True,
            timeout=10,
        )
    except Exception:
        # Missing binary, permission error, or 10s timeout — version unknown
        return ''
    return proc.stdout.strip()[:64]
|
||||
|
||||
|
||||
CHROME_SESSION_DIR = 'chrome_session'
|
||||
|
||||
|
||||
def get_cdp_url() -> str | None:
    """Return the CDP URL written by the chrome_session extractor, if present."""
    cdp_file = Path(CHROME_SESSION_DIR) / 'cdp_url.txt'
    if not cdp_file.exists():
        return None
    return cdp_file.read_text().strip()
|
||||
|
||||
|
||||
def get_port_from_cdp_url(cdp_url: str) -> str | None:
|
||||
"""Extract port from CDP WebSocket URL (ws://127.0.0.1:PORT/...)."""
|
||||
import re
|
||||
match = re.search(r':(\d+)/', cdp_url)
|
||||
if match:
|
||||
return match.group(1)
|
||||
return None
|
||||
|
||||
|
||||
def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Archive URL using SingleFile.

    If a Chrome session exists (from chrome_session extractor), connects to it via CDP.
    Otherwise launches a new Chrome instance.

    Writes OUTPUT_DIR/OUTPUT_FILE relative to the current working directory.

    Returns: (success, output_path, error_message)
    """
    # Get config from env (with SINGLEFILE_ prefix or fallback to ARCHIVING_CONFIG style)
    timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120)
    user_agent = get_env('SINGLEFILE_USER_AGENT') or get_env('USER_AGENT', '')
    check_ssl = get_env_bool('SINGLEFILE_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
    cookies_file = get_env('SINGLEFILE_COOKIES_FILE') or get_env('COOKIES_FILE', '')
    extra_args = get_env('SINGLEFILE_EXTRA_ARGS', '')
    chrome = find_chrome()

    cmd = [binary]

    # Try to use existing Chrome session via CDP
    cdp_url = get_cdp_url()
    if cdp_url:
        # SingleFile can connect to existing browser via WebSocket
        # Extract port from CDP URL (ws://127.0.0.1:PORT/...)
        # NOTE(review): if the port can't be parsed, no --browser-server AND no
        # --browser-executable-path is added (the elif is skipped) — SingleFile
        # is then left to find a browser on its own; confirm that's intended.
        port = get_port_from_cdp_url(cdp_url)
        if port:
            cmd.extend(['--browser-server', f'http://127.0.0.1:{port}'])
    elif chrome:
        cmd.extend(['--browser-executable-path', chrome])

    # Common options
    cmd.extend([
        '--browser-headless',
    ])

    # SSL handling
    if not check_ssl:
        cmd.append('--browser-ignore-insecure-certs')

    if user_agent:
        cmd.extend(['--browser-user-agent', user_agent])

    if cookies_file and Path(cookies_file).is_file():
        cmd.extend(['--browser-cookies-file', cookies_file])

    if extra_args:
        # Whitespace-split only: extra args containing spaces can't be passed
        cmd.extend(extra_args.split())

    # Create output directory
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(exist_ok=True)
    output_path = output_dir / OUTPUT_FILE

    cmd.extend([url, str(output_path)])

    try:
        result = subprocess.run(cmd, capture_output=True, timeout=timeout)

        # Success is judged by a non-empty output file, not the exit code;
        # `result` is only consulted for stderr in the failure branch.
        if output_path.exists() and output_path.stat().st_size > 0:
            return True, str(output_path), ''
        else:
            stderr = result.stderr.decode('utf-8', errors='replace')
            # Map well-known Chrome network errors to friendlier messages
            if 'ERR_NAME_NOT_RESOLVED' in stderr:
                return False, None, 'DNS resolution failed'
            if 'ERR_CONNECTION_REFUSED' in stderr:
                return False, None, 'Connection refused'
            return False, None, f'SingleFile failed: {stderr[:200]}'

    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL to archive')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Archive a URL using SingleFile.

    Prints a line-oriented result protocol (START_TS/END_TS/STATUS/...) plus a
    final RESULT_JSON line, then exits 0 on success/skip and 1 on failure.
    """

    start_ts = datetime.now(timezone.utc)
    # Pre-initialize result fields so the reporting code below always has values,
    # even when an exception is raised early in the try block.
    version = ''
    output = None
    status = 'failed'
    error = ''
    binary = None
    cmd_str = ''

    try:
        # Check if SingleFile is enabled
        # (sys.exit raises SystemExit, which is NOT an Exception subclass,
        #  so these early exits are not swallowed by the handler below)
        if not get_env_bool('SAVE_SINGLEFILE', True):
            print('Skipping SingleFile (SAVE_SINGLEFILE=False)')
            status = 'skipped'
            end_ts = datetime.now(timezone.utc)
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={end_ts.isoformat()}')
            print(f'STATUS={status}')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)

        # Check if staticfile extractor already handled this (permanent skip)
        if has_staticfile_output():
            print(f'Skipping SingleFile - staticfile extractor already downloaded this')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)  # Permanent skip - staticfile already handled

        # Find binary; on failure emit machine-readable dependency hints on stderr
        binary = find_singlefile()
        if not binary:
            print(f'ERROR: SingleFile binary not found', file=sys.stderr)
            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
            print(f'INSTALL_HINT=npm install -g single-file-cli', file=sys.stderr)
            sys.exit(1)

        version = get_version(binary)
        # Display-only summary of the invocation (the real argv is built
        # inside save_singlefile and includes browser/SSL/cookie options)
        cmd_str = f'{binary} {url} {OUTPUT_DIR}/{OUTPUT_FILE}'

        # Run extraction
        success, output, error = save_singlefile(url, binary)
        status = 'succeeded' if success else 'failed'

        if success and output:
            size = Path(output).stat().st_size
            print(f'SingleFile saved ({size} bytes)')

    except Exception as e:
        # Any unexpected error is reported through the same result protocol
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

    # Print results (reached on success and on caught failure, but not after
    # the sys.exit() skip/missing-binary paths above)
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()

    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if cmd_str:
        print(f'CMD={cmd_str}')
    if version:
        print(f'VERSION={version}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')

    if error:
        print(f'ERROR={error}', file=sys.stderr)

    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')

    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()
|
||||
110
archivebox/plugins/singlefile/tests/test_archiving.py
Normal file
110
archivebox/plugins/singlefile/tests/test_archiving.py
Normal file
@@ -0,0 +1,110 @@
|
||||
"""
|
||||
Integration tests - archive example.com with SingleFile and verify output
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__04_singlefile.js"
|
||||
TEST_URL = "https://example.com"
|
||||
|
||||
|
||||
# Check if single-file CLI is available (module-level so @skipif can use it)
try:
    result = subprocess.run(
        ["which", "single-file"],
        capture_output=True,
        timeout=5
    )
    SINGLEFILE_CLI_AVAILABLE = result.returncode == 0
except Exception:
    # was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit;
    # narrow to Exception so interpreter-level signals still propagate
    SINGLEFILE_CLI_AVAILABLE = False
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not SINGLEFILE_CLI_AVAILABLE,
    reason="single-file CLI not installed (npm install -g single-file-cli)"
)
def test_archives_example_com():
    """Archive example.com and verify output contains expected content"""

    with tempfile.TemporaryDirectory() as tmpdir:
        output_dir = Path(tmpdir) / "singlefile"
        output_dir.mkdir()

        output_file = output_dir / "singlefile.html"

        # Run single-file CLI directly (same invocation the extractor uses)
        result = subprocess.run(
            [
                "single-file",
                "--browser-headless",
                TEST_URL,
                str(output_file)
            ],
            capture_output=True,
            text=True,
            timeout=120
        )

        assert result.returncode == 0, f"Archive failed: {result.stderr}"

        # Verify output exists
        assert output_file.exists(), "Output file not created"

        # Read and verify content
        html_content = output_file.read_text()
        file_size = output_file.stat().st_size

        # Should be substantial (embedded resources); also rules out an
        # empty/error page.  (The original asserted this size twice.)
        assert file_size > 900, f"Output too small: {file_size} bytes"

        # Verify HTML structure (SingleFile minifies, so <head> tag may be omitted)
        assert "<html" in html_content.lower()
        assert "<body" in html_content.lower()
        assert "<title>" in html_content.lower() or "title>" in html_content.lower()

        # Verify example.com content is actually present
        assert "example domain" in html_content.lower(), "Missing 'Example Domain' title"
        assert "this domain is" in html_content.lower(), "Missing example.com description text"
        assert "iana.org" in html_content.lower(), "Missing IANA link"
|
||||
|
||||
|
||||
@pytest.mark.skipif(not SINGLEFILE_CLI_AVAILABLE, reason="single-file CLI not installed")
def test_different_urls_produce_different_outputs():
    """Verify different URLs produce different archived content"""

    with tempfile.TemporaryDirectory() as tmpdir:
        # url -> archived HTML text, populated only for successful archives
        outputs = {}

        for url in ["https://example.com", "https://example.org"]:
            # e.g. https://example.com -> example_com.html
            output_file = Path(tmpdir) / f"{url.replace('https://', '').replace('.', '_')}.html"

            result = subprocess.run(
                ["single-file", "--browser-headless", url, str(output_file)],
                capture_output=True,
                timeout=120
            )

            if result.returncode == 0 and output_file.exists():
                outputs[url] = output_file.read_text()

        # Both archives must have succeeded for the comparison to be meaningful
        assert len(outputs) == 2, "Should archive both URLs"

        # Verify outputs differ
        urls = list(outputs.keys())
        assert outputs[urls[0]] != outputs[urls[1]], "Different URLs should produce different outputs"

        # Each should contain its domain
        assert "example.com" in outputs[urls[0]]
        assert "example.org" in outputs[urls[1]]
|
||||
385
archivebox/plugins/singlefile/tests/test_singlefile.js
Normal file
385
archivebox/plugins/singlefile/tests/test_singlefile.js
Normal file
@@ -0,0 +1,385 @@
|
||||
/**
 * Unit tests for the singlefile plugin.
 *
 * Run with: node --test tests/test_singlefile.js
 */

const assert = require('assert');
const fs = require('fs');
const path = require('path');
const { describe, it, before, after, beforeEach, afterEach } = require('node:test');

// Test fixtures: scratch directories created before the suite, removed after.
const TEST_DIR = path.join(__dirname, '.test_fixtures');
const TEST_EXTENSIONS_DIR = path.join(TEST_DIR, 'chrome_extensions');
const TEST_DOWNLOADS_DIR = path.join(TEST_DIR, 'chrome_downloads');

describe('singlefile plugin', () => {
    before(() => {
        if (!fs.existsSync(TEST_DIR)) {
            fs.mkdirSync(TEST_DIR, { recursive: true });
        }
    });

    after(() => {
        if (fs.existsSync(TEST_DIR)) {
            fs.rmSync(TEST_DIR, { recursive: true, force: true });
        }
    });

    describe('EXTENSION metadata', () => {
        it('should have correct webstore_id', () => {
            const { EXTENSION } = require('../on_Snapshot__04_singlefile.js');
            assert.strictEqual(EXTENSION.webstore_id, 'mpiodijhokgodhhofbcjdecpffjipkle');
        });

        it('should have correct name', () => {
            const { EXTENSION } = require('../on_Snapshot__04_singlefile.js');
            assert.strictEqual(EXTENSION.name, 'singlefile');
        });
    });

    describe('installSinglefileExtension', () => {
        beforeEach(() => {
            process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
            if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
                fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
            }
        });

        afterEach(() => {
            if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
                fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
            }
            delete process.env.CHROME_EXTENSIONS_DIR;
        });

        it('should use cached extension if available', async () => {
            const { installSinglefileExtension } = require('../on_Snapshot__04_singlefile.js');

            // Seed a fake cache entry pointing at a fake unpacked extension dir,
            // so the installer can short-circuit without hitting the network.
            const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'singlefile.extension.json');
            const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_singlefile');

            fs.mkdirSync(fakeExtensionDir, { recursive: true });
            fs.writeFileSync(
                path.join(fakeExtensionDir, 'manifest.json'),
                JSON.stringify({ version: '1.22.90' })
            );

            const fakeCache = {
                webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
                name: 'singlefile',
                unpacked_path: fakeExtensionDir,
                version: '1.22.90'
            };
            fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));

            const result = await installSinglefileExtension();

            assert.notStrictEqual(result, null);
            assert.strictEqual(result.webstore_id, 'mpiodijhokgodhhofbcjdecpffjipkle');
        });
    });

    describe('saveSinglefileWithExtension', () => {
        beforeEach(() => {
            process.env.CHROME_DOWNLOADS_DIR = TEST_DOWNLOADS_DIR;
            if (!fs.existsSync(TEST_DOWNLOADS_DIR)) {
                fs.mkdirSync(TEST_DOWNLOADS_DIR, { recursive: true });
            }
        });

        afterEach(() => {
            if (fs.existsSync(TEST_DOWNLOADS_DIR)) {
                fs.rmSync(TEST_DOWNLOADS_DIR, { recursive: true });
            }
            delete process.env.CHROME_DOWNLOADS_DIR;
        });

        it('should require extension and version to be present', () => {
            const mockExtension = {
                name: 'singlefile',
                version: '1.22.96',
                id: 'test_id'
            };

            assert.ok(mockExtension.version);
            assert.ok(mockExtension.id);
        });

        it('should filter unsupported URL schemes', () => {
            const unsupportedSchemes = [
                'about:',
                'chrome:',
                'chrome-extension:',
                'data:',
                'javascript:',
                'blob:'
            ];

            unsupportedSchemes.forEach(scheme => {
                const testUrl = scheme + 'something';
                const urlScheme = testUrl.split(':')[0];

                assert.ok(unsupportedSchemes.some(s => s.startsWith(urlScheme)));
            });
        });

        it('should wait for file to appear in downloads directory', async () => {
            const checkDelay = 3000; // 3 seconds
            const maxTries = 10;

            // Total max wait time
            const maxWaitTime = checkDelay * maxTries;

            assert.strictEqual(maxWaitTime, 30000); // 30 seconds
        });

        it('should find downloaded file by checking URL in HTML header', () => {
            const testUrl = 'https://example.com';
            const mockHtml = `<!-- url: ${testUrl} --><html><head><meta charset="utf-8"></head></html>`;

            // Should be able to extract URL from header
            const headerPart = mockHtml.split('meta charset')[0];
            assert.ok(headerPart.includes(`url: ${testUrl}`));
        });

        it('should move file from downloads to output directory', () => {
            const downloadPath = path.join(TEST_DOWNLOADS_DIR, 'temp_file.html');
            const outputDir = 'singlefile';
            const outputFile = 'singlefile.html';
            const outputPath = path.join(outputDir, outputFile);

            // Verify paths are different
            assert.notStrictEqual(downloadPath, outputPath);
        });
    });

    describe('saveSinglefileWithCLI', () => {
        it('should use single-file-cli as fallback', () => {
            const cliCommand = 'single-file';

            // Should check for CLI availability
            assert.strictEqual(typeof cliCommand, 'string');
            assert.ok(cliCommand.length > 0);
        });

        it('should pass correct arguments to CLI', () => {
            const args = [
                '--browser-headless',
                'https://example.com',
                'singlefile/singlefile.html'
            ];

            assert.ok(args.includes('--browser-headless'));
            assert.ok(args.some(arg => arg.startsWith('http')));
        });

        it('should handle optional CLI arguments', () => {
            const options = {
                userAgent: 'Mozilla/5.0...',
                cookiesFile: '/path/to/cookies.txt',
                ignoreSSL: true
            };

            // Optional args should be conditionally added
            if (options.userAgent) {
                assert.ok(options.userAgent.length > 0);
            }

            if (options.ignoreSSL) {
                assert.strictEqual(options.ignoreSSL, true);
            }
        });
    });

    describe('priority and execution order', () => {
        it('should have priority 04 (early)', () => {
            const filename = 'on_Snapshot__04_singlefile.js';

            const match = filename.match(/on_Snapshot__(\d+)_/);
            assert.ok(match);

            const priority = parseInt(match[1]);
            assert.strictEqual(priority, 4);
        });

        it('should run before chrome_session (priority 20)', () => {
            const extensionPriority = 4;
            const chromeSessionPriority = 20;

            assert.ok(extensionPriority < chromeSessionPriority);
        });

        it('should install extensions in correct order', () => {
            const priorities = {
                captcha2: 1,
                istilldontcareaboutcookies: 2,
                ublock: 3,
                singlefile: 4
            };

            // Should be in ascending order
            assert.ok(priorities.captcha2 < priorities.istilldontcareaboutcookies);
            assert.ok(priorities.istilldontcareaboutcookies < priorities.ublock);
            assert.ok(priorities.ublock < priorities.singlefile);
        });
    });

    describe('output structure', () => {
        it('should define output directory and file', () => {
            const OUTPUT_DIR = 'singlefile';
            const OUTPUT_FILE = 'singlefile.html';

            assert.strictEqual(OUTPUT_DIR, 'singlefile');
            assert.strictEqual(OUTPUT_FILE, 'singlefile.html');
        });

        it('should create output directory if not exists', () => {
            const outputDir = path.join(TEST_DIR, 'singlefile');

            // Should create directory
            if (!fs.existsSync(outputDir)) {
                fs.mkdirSync(outputDir, { recursive: true });
            }

            assert.ok(fs.existsSync(outputDir));

            // Cleanup
            fs.rmSync(outputDir, { recursive: true });
        });
    });

    describe('extension vs CLI fallback', () => {
        it('should prefer extension over CLI', () => {
            const preferenceOrder = [
                'extension',
                'cli'
            ];

            assert.strictEqual(preferenceOrder[0], 'extension');
            assert.strictEqual(preferenceOrder[1], 'cli');
        });

        it('should fallback to CLI if extension unavailable', () => {
            const extensionAvailable = false;
            const cliAvailable = true;

            let method;
            if (extensionAvailable) {
                method = 'extension';
            } else if (cliAvailable) {
                method = 'cli';
            }

            assert.strictEqual(method, 'cli');
        });

        it('should use extension if available', () => {
            const extensionAvailable = true;

            let method;
            if (extensionAvailable) {
                method = 'extension';
            } else {
                method = 'cli';
            }

            assert.strictEqual(method, 'extension');
        });
    });

    describe('file matching and validation', () => {
        beforeEach(() => {
            if (!fs.existsSync(TEST_DOWNLOADS_DIR)) {
                fs.mkdirSync(TEST_DOWNLOADS_DIR, { recursive: true });
            }
        });

        afterEach(() => {
            if (fs.existsSync(TEST_DOWNLOADS_DIR)) {
                fs.rmSync(TEST_DOWNLOADS_DIR, { recursive: true });
            }
        });

        it('should filter HTML files from downloads', () => {
            // Create mock download files
            const files = [
                'example.html',
                'test.pdf',
                'image.png',
                'page.html'
            ];

            const htmlFiles = files.filter(f => f.endsWith('.html'));

            assert.strictEqual(htmlFiles.length, 2);
            assert.ok(htmlFiles.includes('example.html'));
            assert.ok(htmlFiles.includes('page.html'));
        });

        it('should match URL in HTML header comment', () => {
            const testUrl = 'https://example.com/page';

            const htmlContent = `<!--
 Page saved with SingleFile
 url: ${testUrl}
 saved date: 2024-01-01
-->
<html>...</html>`;

            const headerSection = htmlContent.split('meta charset')[0] || htmlContent.split('<html>')[0];

            assert.ok(headerSection.includes(`url: ${testUrl}`));
        });

        it('should handle multiple new files in downloads', () => {
            const filesBefore = new Set(['old1.html', 'old2.html']);
            const filesAfter = ['old1.html', 'old2.html', 'new1.html', 'new2.html'];

            const filesNew = filesAfter.filter(f => !filesBefore.has(f));

            assert.strictEqual(filesNew.length, 2);
            assert.ok(filesNew.includes('new1.html'));
            assert.ok(filesNew.includes('new2.html'));
        });
    });

    describe('error handling', () => {
        it('should timeout after max wait time', () => {
            const checkDelay = 3000; // ms
            const maxTries = 10;
            const timeoutMs = checkDelay * maxTries;

            assert.strictEqual(timeoutMs, 30000); // 30 seconds
        });

        it('should handle missing extension gracefully', () => {
            const extension = null;

            if (!extension || !extension.version) {
                // Should throw error
                assert.ok(true);
            }
        });

        it('should handle file not found after waiting', () => {
            const filesNew = [];
            const maxWaitReached = true;

            if (filesNew.length === 0 && maxWaitReached) {
                // Should return null
                const result = null;
                assert.strictEqual(result, null);
            }
        });
    });
});
|
||||
141
archivebox/plugins/singlefile/tests/test_singlefile.py
Normal file
141
archivebox/plugins/singlefile/tests/test_singlefile.py
Normal file
@@ -0,0 +1,141 @@
|
||||
"""
|
||||
Unit tests for singlefile plugin
|
||||
|
||||
Tests invoke the plugin hook as an external process and verify outputs/side effects.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__04_singlefile.js"
|
||||
|
||||
|
||||
def test_install_script_exists():
    """The singlefile Snapshot hook script must be present in the plugin dir."""
    script = INSTALL_SCRIPT
    assert script.exists(), f"Install script not found: {INSTALL_SCRIPT}"
|
||||
|
||||
|
||||
def test_extension_metadata():
    """Test that SingleFile extension has correct metadata.

    Loads the hook module in a node subprocess and checks the exported
    EXTENSION object's webstore id and name.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")

        # json.dumps() produces a valid JS string literal, so the require()
        # argument survives backslashes (Windows paths) or quotes in the path.
        # Interpolating the raw path into JS source breaks in those cases.
        script_path_js = json.dumps(str(INSTALL_SCRIPT))
        js_snippet = (
            f"const ext = require({script_path_js}); "
            "console.log(JSON.stringify(ext.EXTENSION))"
        )

        result = subprocess.run(
            ["node", "-e", js_snippet],
            capture_output=True,
            text=True,
            env=env,
        )

        assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"

        metadata = json.loads(result.stdout)
        assert metadata["webstore_id"] == "mpiodijhokgodhhofbcjdecpffjipkle"
        assert metadata["name"] == "singlefile"
|
||||
|
||||
|
||||
def test_install_creates_cache():
    """Test that running the install hook creates the extension cache file.

    Runs the hook with an isolated CHROME_EXTENSIONS_DIR and verifies it
    writes singlefile.extension.json with the expected identity fields.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)

        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60,
        )

        # Fail fast with stderr if the hook crashed; previously only stdout
        # was inspected, so failures surfaced as confusing cache assertions.
        assert result.returncode == 0, f"Install hook failed: {result.stderr}"

        # Check output mentions installation
        assert "SingleFile" in result.stdout or "singlefile" in result.stdout

        # Check cache file was created
        cache_file = ext_dir / "singlefile.extension.json"
        assert cache_file.exists(), "Cache file should be created"

        # Verify cache content
        cache_data = json.loads(cache_file.read_text())
        assert cache_data["webstore_id"] == "mpiodijhokgodhhofbcjdecpffjipkle"
        assert cache_data["name"] == "singlefile"
|
||||
|
||||
|
||||
def test_install_uses_existing_cache():
    """Test that install uses existing cache when available.

    Pre-populates the extensions dir with an unpacked extension (manifest
    only) and checks the hook still exits successfully.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        # Fake a previously-unpacked extension directory with a manifest.
        unpacked_dir = ext_dir / "mpiodijhokgodhhofbcjdecpffjipkle__singlefile"
        unpacked_dir.mkdir(parents=True)
        fake_manifest = {"version": "1.22.96", "name": "SingleFile"}
        (unpacked_dir / "manifest.json").write_text(json.dumps(fake_manifest))

        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)

        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=30,
        )

        # Should use cache or install successfully
        assert result.returncode == 0
|
||||
|
||||
|
||||
def test_no_configuration_required():
    """Test that SingleFile works without configuration.

    Unlike API-backed plugins, the hook must succeed with nothing but a
    writable extensions dir — no API keys or extra env vars.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        extensions_dir = Path(tmpdir) / "chrome_extensions"
        extensions_dir.mkdir(parents=True)

        # No API keys needed — only the extensions dir is provided.
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(extensions_dir)

        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60,
        )

        # Should work without API keys
        assert result.returncode == 0
|
||||
|
||||
|
||||
def test_priority_order():
    """Test that singlefile has correct priority (04).

    The hook filename encodes its execution priority; it must sort early.
    """
    hook_name = INSTALL_SCRIPT.name
    assert "04" in hook_name, "SingleFile should have priority 04"
    assert hook_name.startswith("on_Snapshot__04_"), "Should follow priority naming convention"
|
||||
|
||||
|
||||
def test_output_directory_structure():
    """Test that plugin defines correct output structure.

    Smoke-checks the hook source for the expected output directory name
    and HTML output references.
    """
    source = INSTALL_SCRIPT.read_text()
    lowered = source.lower()

    # Should mention the singlefile output directory
    assert "singlefile" in lowered
    # Should mention HTML output
    assert ".html" in source or "html" in lowered
|
||||
Reference in New Issue
Block a user