mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-04 09:55:33 +10:00
271 lines
8.7 KiB
JavaScript
Executable File
271 lines
8.7 KiB
JavaScript
Executable File
#!/usr/bin/env node
|
|
/**
|
|
* SingleFile Extension Plugin
|
|
*
|
|
* Installs and uses the SingleFile Chrome extension for archiving complete web pages.
|
|
* Falls back to single-file-cli if the extension is not available.
|
|
*
|
|
* Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle
|
|
*
|
|
* Priority: 04 (early) - Must install before Chrome session starts
|
|
* Hook: on_Snapshot
|
|
*
|
|
* This extension automatically:
|
|
* - Saves complete web pages as single HTML files
|
|
* - Inlines all resources (CSS, JS, images, fonts)
|
|
* - Preserves page fidelity better than wget/curl
|
|
* - Works with SPAs and dynamically loaded content
|
|
*/
|
|
|
|
const path = require('path');
|
|
const fs = require('fs');
|
|
const { promisify } = require('util');
|
|
const { exec } = require('child_process');
|
|
|
|
const execAsync = promisify(exec);
|
|
|
|
// Import extension utilities
|
|
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
|
|
|
|
// Identity of the SingleFile extension: its Chrome Web Store id and the short
// name this plugin refers to it by.
const EXTENSION = {
  webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
  name: 'singlefile',
};

// Build a path inside the active persona's directory:
//   <DATA_DIR or ./data>/personas/<ACTIVE_PERSONA or Default>/<leaf>
const personaSubdir = (leaf) => path.join(
  process.env.DATA_DIR || './data',
  'personas',
  process.env.ACTIVE_PERSONA || 'Default',
  leaf
);

// Where unpacked extensions and their metadata live; an explicit
// CHROME_EXTENSIONS_DIR env var takes precedence over the persona default.
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR || personaSubdir('chrome_extensions');

// Directory scanned for files downloaded by the browser; an explicit
// CHROME_DOWNLOADS_DIR env var takes precedence over the persona default.
const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR || personaSubdir('chrome_downloads');

// Output location (relative to the current working directory) of the archive.
const OUTPUT_DIR = 'singlefile';
const OUTPUT_FILE = 'singlefile.html';
|
|
|
|
/**
 * Load the SingleFile extension from EXTENSIONS_DIR, installing it first if
 * necessary (delegated to extensionUtils.loadOrInstallExtension).
 *
 * @returns {Promise<Object|null>} The value resolved by
 *   loadOrInstallExtension, or null when that value is falsy.
 */
async function installSinglefileExtension() {
  console.log('[*] Installing SingleFile extension...');

  const installed = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);

  if (installed) {
    console.log('[+] SingleFile extension installed');
    console.log('[+] Web pages will be saved as single HTML files');
    return installed;
  }

  // Nothing usable came back: report it and let the caller decide what to do.
  console.error('[❌] Failed to install SingleFile extension');
  return null;
}
|
|
|
|
/**
 * Sleep helper for async code.
 *
 * @param {number} ms - Delay in milliseconds
 * @returns {Promise<void>} Promise that resolves once the delay has elapsed
 */
function wait(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
|
|
|
|
/**
 * Move a file to a new location.
 *
 * A plain rename fails with EXDEV when source and destination are on
 * different filesystems (for example separate volumes), so in that case the
 * file is copied and the source is removed afterwards.
 *
 * @param {string} src - Path of the existing file
 * @param {string} dest - Destination path (overwritten if it already exists)
 * @returns {Promise<void>}
 */
async function moveDownloadedFile(src, dest) {
  try {
    await fs.promises.rename(src, dest);
  } catch (err) {
    if (err.code !== 'EXDEV') {
      throw err;
    }
    await fs.promises.copyFile(src, dest);
    await fs.promises.unlink(src);
  }
}

/**
 * Save a page using the SingleFile extension.
 *
 * Triggers the extension's action on the given page, then polls
 * CHROME_DOWNLOADS_DIR for a new .html file whose header mentions the page
 * URL and moves it to OUTPUT_DIR/OUTPUT_FILE.
 *
 * @param {Object} page - Puppeteer page object
 * @param {Object} extension - Extension metadata with dispatchAction method
 * @param {Object} options - Additional options (currently unused)
 * @returns {Promise<string|null>} - Path to saved file, or null when the URL
 *   scheme is skipped or no matching download appears in time
 * @throws {Error} When the extension is missing or has no version
 */
async function saveSinglefileWithExtension(page, extension, options = {}) {
  if (!extension || !extension.version) {
    throw new Error('SingleFile extension not found or not loaded');
  }

  const url = await page.url();

  // Check for unsupported URL schemes
  const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
  const scheme = url.split(':')[0];
  if (URL_SCHEMES_IGNORED.includes(scheme)) {
    console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`);
    return null;
  }

  // Ensure downloads directory exists
  await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true });

  // Snapshot the .html files that are already there so they can be ignored
  const files_before = new Set(
    (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
      .filter(fn => fn.endsWith('.html'))
  );

  // Ensure output directory exists
  await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
  const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);

  console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);

  // Bring page to front (extension action button acts on foreground tab)
  await page.bringToFront();

  // Trigger the extension's action (toolbar button click)
  await extension.dispatchAction();

  // Poll the downloads directory: 10 checks, 3 seconds apart
  const check_delay = 3000; // 3 seconds
  const max_tries = 10;
  let files_new = [];

  for (let attempt = 0; attempt < max_tries; attempt++) {
    await wait(check_delay);

    const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
      .filter(fn => fn.endsWith('.html'));

    files_new = files_after.filter(file => !files_before.has(file));

    if (files_new.length === 0) {
      continue;
    }

    // Find the matching file by checking if it contains the URL in the HTML
    // header (everything before the first "meta charset")
    for (const file of files_new) {
      const dl_path = path.join(CHROME_DOWNLOADS_DIR, file);
      const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
      const dl_header = dl_text.split('meta charset')[0];

      if (dl_header.includes(`url: ${url}`)) {
        console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`);
        // The downloads and output directories may be on different
        // filesystems, so a bare rename is not sufficient here.
        await moveDownloadedFile(dl_path, out_path);
        return out_path;
      }
    }
  }

  console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`);
  console.warn(`[⚠️] New files found: ${files_new.join(', ')}`);
  return null;
}
|
|
|
|
/**
 * Save a page using single-file-cli (fallback method).
 *
 * The binary is located with `which single-file` and then run directly with
 * an argument array (no shell), so URLs and option values containing shell
 * metacharacters or spaces (`&`, `;`, `?`, parentheses in user agents, ...)
 * are passed through verbatim instead of being split or interpreted.
 *
 * @param {string} url - URL to archive
 * @param {Object} options - Additional options
 * @param {string} [options.userAgent] - Passed as --browser-user-agent
 * @param {string} [options.cookiesFile] - Passed as --browser-cookies-file
 *   when the file exists
 * @param {boolean} [options.ignoreSSL] - Adds --browser-ignore-insecure-certs
 * @param {number} [options.timeout] - Timeout in ms (defaults to 120000)
 * @returns {Promise<string|null>} - Path to saved file or null on failure
 */
async function saveSinglefileWithCLI(url, options = {}) {
  // Required locally so this function brings its own dependency into scope.
  const { execFile } = require('child_process');
  const execFileAsync = promisify(execFile);

  console.log('[*] Falling back to single-file-cli...');

  // Find single-file binary
  let binary = null;
  try {
    const { stdout } = await execAsync('which single-file');
    binary = stdout.trim();
  } catch (err) {
    console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli');
    return null;
  }

  // Ensure output directory exists
  await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
  const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);

  // Build the argument list: flags first, then the URL and the output path
  const args = ['--browser-headless'];
  if (options.ignoreSSL) {
    args.push('--browser-ignore-insecure-certs');
  }
  if (options.cookiesFile && fs.existsSync(options.cookiesFile)) {
    args.push('--browser-cookies-file', options.cookiesFile);
  }
  if (options.userAgent) {
    args.push('--browser-user-agent', options.userAgent);
  }
  args.push(url, out_path);

  // Execute
  try {
    const timeout = options.timeout || 120000;
    await execFileAsync(binary, args, { timeout });

    // A zero exit code is not enough: require a non-empty output file
    if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) {
      console.log(`[+] SingleFile saved via CLI: ${out_path}`);
      return out_path;
    }

    console.error('[❌] SingleFile CLI completed but no output file found');
    return null;
  } catch (err) {
    console.error(`[❌] SingleFile CLI error: ${err.message}`);
    return null;
  }
}
|
|
|
|
/**
 * Read previously cached extension metadata, if it is still usable.
 *
 * The cache counts as usable when the file parses as JSON and the
 * manifest.json under its `unpacked_path` exists on disk.
 *
 * @param {string} cacheFile - Path to the cached metadata JSON file
 * @returns {Object|null} Parsed metadata, or null when there is no usable cache
 */
function loadCachedExtension(cacheFile) {
  if (!fs.existsSync(cacheFile)) {
    return null;
  }

  try {
    const metadata = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
    const hasManifest = fs.existsSync(path.join(metadata.unpacked_path, 'manifest.json'));

    if (hasManifest) {
      console.log('[*] SingleFile extension already installed (using cache)');
      return metadata;
    }
  } catch (e) {
    // Unreadable or malformed cache: fall through so the caller re-installs
    console.warn('[⚠️] Extension cache corrupted, re-installing...');
  }

  return null;
}

/**
 * Main entry point - make sure the extension is installed before archiving.
 *
 * Returns the cached metadata when it is still usable; otherwise installs the
 * extension and, on success, writes its metadata to
 * EXTENSIONS_DIR/singlefile.extension.json.
 *
 * @returns {Promise<Object|null>} Extension metadata, or the falsy result of a
 *   failed install
 */
async function main() {
  const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json');

  const cached = loadCachedExtension(cacheFile);
  if (cached) {
    return cached;
  }

  const extension = await installSinglefileExtension();
  if (!extension) {
    return extension;
  }

  // Persist the metadata to a cache file that chrome_session can read
  await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
  const serialized = JSON.stringify(extension, null, 2);
  await fs.promises.writeFile(cacheFile, serialized);
  console.log(`[+] Extension metadata written to ${cacheFile}`);

  return extension;
}
|
|
|
|
// Export functions for use by other plugins
|
|
module.exports = {
|
|
EXTENSION,
|
|
installSinglefileExtension,
|
|
saveSinglefileWithExtension,
|
|
saveSinglefileWithCLI,
|
|
};
|
|
|
|
// Run if executed directly
|
|
if (require.main === module) {
|
|
main().then(() => {
|
|
console.log('[✓] SingleFile extension setup complete');
|
|
process.exit(0);
|
|
}).catch(err => {
|
|
console.error('[❌] SingleFile extension setup failed:', err);
|
|
process.exit(1);
|
|
});
|
|
}
|