Files
ArchiveBox/archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js
2025-12-24 20:10:38 -08:00

271 lines
8.7 KiB
JavaScript
Executable File

#!/usr/bin/env node
/**
* SingleFile Extension Plugin
*
* Installs and uses the SingleFile Chrome extension for archiving complete web pages.
* Falls back to single-file-cli if the extension is not available.
*
* Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle
*
* Priority: 04 (early) - Must install before Chrome session starts
* Hook: on_Snapshot
*
* This extension automatically:
* - Saves complete web pages as single HTML files
* - Inlines all resources (CSS, JS, images, fonts)
* - Preserves page fidelity better than wget/curl
* - Works with SPAs and dynamically loaded content
*/
const path = require('path');
const fs = require('fs');
const { promisify } = require('util');
const { exec, execFile } = require('child_process');
const execAsync = promisify(exec);
// Import extension utilities
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
// Extension metadata.
// This object is what installSinglefileExtension() hands to
// extensionUtils.loadOrInstallExtension() and what module.exports re-exports.
const EXTENSION = {
webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle', // same ID as in the Chrome Web Store URL in the file header
name: 'singlefile', // NOTE(review): presumably also the basename of the 'singlefile.extension.json' cache file used by main() - confirm against chrome_extension_utils.js
};
// Per-persona Chrome directories.
// An explicit CHROME_EXTENSIONS_DIR / CHROME_DOWNLOADS_DIR environment variable
// wins; otherwise the location is derived from DATA_DIR (default './data') and
// ACTIVE_PERSONA (default 'Default').
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR
  ? process.env.CHROME_EXTENSIONS_DIR
  : path.join(
      process.env.DATA_DIR ? process.env.DATA_DIR : './data',
      'personas',
      process.env.ACTIVE_PERSONA ? process.env.ACTIVE_PERSONA : 'Default',
      'chrome_extensions'
    );
const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR
  ? process.env.CHROME_DOWNLOADS_DIR
  : path.join(
      process.env.DATA_DIR ? process.env.DATA_DIR : './data',
      'personas',
      process.env.ACTIVE_PERSONA ? process.env.ACTIVE_PERSONA : 'Default',
      'chrome_downloads'
    );

// Output location of the archived page (a relative path).
const OUTPUT_DIR = 'singlefile';
const OUTPUT_FILE = 'singlefile.html';
/**
 * Load the SingleFile extension, installing it into EXTENSIONS_DIR if needed.
 *
 * Delegates to extensionUtils.loadOrInstallExtension() and reports the outcome
 * on the console.
 *
 * @returns {Promise<Object|null>} - The value resolved by loadOrInstallExtension,
 *   or null when that value is falsy
 */
async function installSinglefileExtension() {
  console.log('[*] Installing SingleFile extension...');

  const installed = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);

  if (installed) {
    console.log('[+] SingleFile extension installed');
    console.log('[+] Web pages will be saved as single HTML files');
    return installed;
  }

  console.error('[❌] Failed to install SingleFile extension');
  return null;
}
/**
 * Sleep helper.
 *
 * @param {number} ms - Delay in milliseconds
 * @returns {Promise<void>} - Resolves (with no value) once the timer fires
 */
function wait(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Move a finished download to its final location.
 *
 * rename() cannot cross filesystem boundaries (it fails with EXDEV), which
 * happens when the Chrome downloads directory and the output directory live
 * on different mounts. In that case fall back to copy + delete.
 *
 * @param {string} src - Path of the downloaded file
 * @param {string} dest - Destination path
 * @returns {Promise<void>}
 */
async function moveSinglefileDownload(src, dest) {
  try {
    await fs.promises.rename(src, dest);
  } catch (err) {
    if (!err || err.code !== 'EXDEV') {
      throw err;
    }
    await fs.promises.copyFile(src, dest);
    await fs.promises.unlink(src);
  }
}

/**
 * Save a page using the SingleFile extension
 *
 * Triggers the extension's action on the given page, then polls
 * CHROME_DOWNLOADS_DIR for a new .html file whose header (everything before
 * the first 'meta charset') contains `url: <page url>`, and moves that file to
 * OUTPUT_DIR/OUTPUT_FILE.
 *
 * @param {Object} page - Puppeteer page object
 * @param {Object} extension - Extension metadata with dispatchAction method
 * @param {Object} options - Additional options
 * @param {number} [options.checkDelay=3000] - Milliseconds to wait before each poll
 * @param {number} [options.maxTries=10] - Number of polls before giving up
 * @returns {Promise<string|null>} - Path to saved file or null on failure
 * @throws {Error} If the extension is missing or has no version
 */
async function saveSinglefileWithExtension(page, extension, options = {}) {
  if (!extension || !extension.version) {
    throw new Error('SingleFile extension not found or not loaded');
  }

  const url = await page.url();

  // Check for unsupported URL schemes
  const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
  const scheme = url.split(':')[0];
  if (URL_SCHEMES_IGNORED.includes(scheme)) {
    console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`);
    return null;
  }

  // Polling parameters: callers may override them, invalid values fall back to the defaults
  const opts = options || {};
  const check_delay = Number.isFinite(opts.checkDelay) && opts.checkDelay >= 0 ? opts.checkDelay : 3000;
  const max_tries = Number.isInteger(opts.maxTries) && opts.maxTries > 0 ? opts.maxTries : 10;

  // Ensure downloads directory exists
  await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true });

  // Remember the .html files that are already there so they are never mistaken for this save
  const files_before = new Set(
    (await fs.promises.readdir(CHROME_DOWNLOADS_DIR)).filter((fn) => fn.endsWith('.html'))
  );

  // Ensure output directory exists
  await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
  const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);

  console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);

  // Bring page to front (extension action button acts on foreground tab)
  await page.bringToFront();

  // Trigger the extension's action (toolbar button click)
  await extension.dispatchAction();

  // Wait for file to appear in downloads directory
  let files_new = [];
  for (let attempt = 0; attempt < max_tries; attempt++) {
    await wait(check_delay);

    const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR)).filter((fn) => fn.endsWith('.html'));
    files_new = files_after.filter((file) => !files_before.has(file));

    // Find the matching file by checking if it contains the URL in the HTML header
    for (const file of files_new) {
      const dl_path = path.join(CHROME_DOWNLOADS_DIR, file);

      let dl_text;
      try {
        dl_text = await fs.promises.readFile(dl_path, 'utf-8');
      } catch (err) {
        // The downloads directory is per-persona rather than per-page, so a
        // candidate can disappear between readdir() and readFile(); skip it
        // instead of aborting the whole poll.
        if (err && err.code === 'ENOENT') {
          continue;
        }
        throw err;
      }

      const dl_header = dl_text.split('meta charset')[0];
      if (dl_header.includes(`url: ${url}`)) {
        console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`);
        await moveSinglefileDownload(dl_path, out_path);
        return out_path;
      }
    }
  }

  console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`);
  console.warn(`[⚠️] New files found: ${files_new.join(', ')}`);
  return null;
}
/**
 * Save a page using single-file-cli (fallback method)
 *
 * The binary is located with `which single-file` and run via execFile(), i.e.
 * without a shell: the URL and option values are handed over as literal
 * arguments, so characters such as `&`, `;`, spaces or parentheses in a URL or
 * user agent can neither break the command line nor be interpreted by a shell.
 *
 * @param {string} url - URL to archive
 * @param {Object} options - Additional options
 * @param {string} [options.userAgent] - Passed as --browser-user-agent
 * @param {string} [options.cookiesFile] - Passed as --browser-cookies-file when the file exists
 * @param {boolean} [options.ignoreSSL] - Adds --browser-ignore-insecure-certs
 * @param {number} [options.timeout=120000] - Timeout in milliseconds for the CLI run
 * @returns {Promise<string|null>} - Path to saved file or null on failure
 */
async function saveSinglefileWithCLI(url, options = {}) {
  console.log('[*] Falling back to single-file-cli...');

  const opts = options || {};
  const execFileAsync = promisify(execFile);

  // Find single-file binary
  let binary = null;
  try {
    const { stdout } = await execFileAsync('which', ['single-file']);
    binary = stdout.trim();
  } catch (err) {
    binary = null;
  }
  if (!binary) {
    console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli');
    return null;
  }

  // Ensure output directory exists
  await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
  const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);

  // Build the argument list: flags first, then the URL and the output path
  const args = ['--browser-headless'];
  if (opts.ignoreSSL) {
    args.push('--browser-ignore-insecure-certs');
  }
  if (opts.cookiesFile && fs.existsSync(opts.cookiesFile)) {
    args.push('--browser-cookies-file', String(opts.cookiesFile));
  }
  if (opts.userAgent) {
    args.push('--browser-user-agent', String(opts.userAgent));
  }
  args.push(String(url), out_path);

  // Execute
  try {
    const timeout = opts.timeout || 120000;
    await execFileAsync(binary, args, { timeout });

    if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) {
      console.log(`[+] SingleFile saved via CLI: ${out_path}`);
      return out_path;
    }

    console.error('[❌] SingleFile CLI completed but no output file found');
    return null;
  } catch (err) {
    console.error(`[❌] SingleFile CLI error: ${err.message}`);
    return null;
  }
}
/**
 * Main entry point - make sure the extension is installed before archiving.
 *
 * Reuses the metadata cached in EXTENSIONS_DIR/singlefile.extension.json when
 * the unpacked extension it points to still has a manifest.json; otherwise
 * installs the extension and (on success) rewrites the cache file.
 *
 * @returns {Promise<Object|null>} - Cached or freshly installed extension
 *   metadata, or the falsy result of a failed install
 */
async function main() {
  const cachePath = path.join(EXTENSIONS_DIR, 'singlefile.extension.json');

  // Fast path: a usable cache entry already exists
  if (fs.existsSync(cachePath)) {
    try {
      const rawCache = fs.readFileSync(cachePath, 'utf-8');
      const cachedExtension = JSON.parse(rawCache);
      const manifestExists = fs.existsSync(path.join(cachedExtension.unpacked_path, 'manifest.json'));
      if (manifestExists) {
        console.log('[*] SingleFile extension already installed (using cache)');
        return cachedExtension;
      }
    } catch (e) {
      // Unreadable or malformed cache entry: fall through and re-install
      console.warn('[⚠️] Extension cache corrupted, re-installing...');
    }
  }

  // Slow path: install the extension
  const installed = await installSinglefileExtension();
  if (!installed) {
    return installed;
  }

  // Persist the metadata to a cache file that chrome_session can read
  await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
  const serialized = JSON.stringify(installed, null, 2);
  await fs.promises.writeFile(cachePath, serialized);
  console.log(`[+] Extension metadata written to ${cachePath}`);

  return installed;
}
// Export functions for use by other plugins.
// Exposes the extension metadata plus the install helper and both save
// strategies (extension-based and CLI-based); main() and wait() are not exported.
module.exports = {
EXTENSION,
installSinglefileExtension,
saveSinglefileWithExtension,
saveSinglefileWithCLI,
};
// Run the setup only when this file is executed directly (not when required
// by another plugin). The exit code is 0 on success and 1 on any failure.
if (require.main === module) {
  (async () => {
    try {
      await main();
      console.log('[✓] SingleFile extension setup complete');
      process.exit(0);
    } catch (err) {
      console.error('[❌] SingleFile extension setup failed:', err);
      process.exit(1);
    }
  })();
}