From 891409a1cc3bf7d590f737d1c088f70fd1e181e4 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 3 Nov 2025 21:03:18 +0000 Subject: [PATCH] Add Chrome extension support with 2captcha extractor and update singlefile Implements a Chrome extension management system that allows extractors to use browser extensions: New 2captcha extractor (runs BEFORE puppeteer): - Downloads Chrome extensions from Web Store (.crx files) - Unpacks extensions to ./extensions/ directory - Writes CHROME_EXTENSIONS_PATHS and CHROME_EXTENSIONS_IDS to .env - Supports 2captcha (CAPTCHA solving), singlefile, uBlock, cookie consent blocker - Configurable via API_KEY_2CAPTCHA and EXTENSIONS_ENABLED env vars Updated puppeteer extractor: - Reads CHROME_EXTENSIONS_PATHS and CHROME_EXTENSIONS_IDS from .env - Loads extensions when launching Chrome - Runs in headed mode when extensions are present (extensions require visible browser) - Passes extension paths to Chrome via --load-extension and extension IDs via --allowlisted-extension-id Updated singlefile extractor (now uses extension instead of CLI): - Connects to existing Chrome browser via CDP - Triggers SingleFile extension via a page message when the extension target is found, otherwise via the Ctrl+Shift+Y keyboard shortcut - Waits for downloaded file to appear in Chrome downloads directory - More reliable than single-file-cli and better quality output - Fully integrates with Chrome's extension ecosystem Benefits: - Automatic CAPTCHA solving via 2captcha extension - Better ad/cookie blocking via uBlock and cookie consent extensions - Higher quality single-file archives using official SingleFile extension - Extensions share browser state (cookies, local storage, etc.) - Foundation for adding more browser extensions in the future Dependencies: - Added unzip-crx-3 for unpacking .crx extension files - Updated extractors to use puppeteer-core for CDP connections Execution order: 1. 2captcha downloads/configures extensions 2. puppeteer launches Chrome with extensions loaded 3. 
All other extractors reuse the same Chrome instance with extensions active --- archivebox-ts/README.md | 66 +++++--- archivebox-ts/extractors/2captcha | 155 +++++++++++++++++ archivebox-ts/extractors/puppeteer | 26 ++- archivebox-ts/extractors/singlefile | 254 ++++++++++++++++++++++------ archivebox-ts/package-lock.json | 119 ++++++++++++- archivebox-ts/package.json | 3 +- archivebox-ts/src/extractors.ts | 8 +- archivebox-ts/src/models.ts | 1 + 8 files changed, 547 insertions(+), 85 deletions(-) create mode 100755 archivebox-ts/extractors/2captcha diff --git a/archivebox-ts/README.md b/archivebox-ts/README.md index 1e9b6c3b..69962523 100644 --- a/archivebox-ts/README.md +++ b/archivebox-ts/README.md @@ -139,23 +139,24 @@ node dist/cli.js extractors Extractors run serially in this predefined order (defined in `src/extractors.ts`): -1. **puppeteer** - Launches Chrome, writes CDP URL to `.env` -2. **downloads** - Catches file downloads (reloads page with listeners) -3. **images** - Catches all images (reloads page with listeners) -4. **infiniscroll** - Scrolls page to load lazy content -5. **favicon** - Downloads favicon (can work independently) -6. **title** - Extracts title using existing Chrome tab -7. **headers** - Extracts headers using existing Chrome tab -8. **screenshot** - Takes screenshot using existing Chrome tab -9. **pdf** - Generates PDF using existing Chrome tab -10. **dom** - Extracts DOM HTML using existing Chrome tab -11. **htmltotext** - Extracts plain text using existing Chrome tab -12. **readability** - Extracts article content using existing Chrome tab -13. **singlefile** - Creates single-file archive (may use existing Chrome) -14. **wget** - Downloads with wget (independent) -15. **git** - Clones git repository (independent) -16. **media** - Downloads media with yt-dlp (independent) -17. **archive_org** - Submits to Internet Archive (independent) +1. **2captcha** - Downloads and configures Chrome extensions (2captcha, singlefile, uBlock, etc.) 
+2. **puppeteer** - Launches Chrome with extensions, writes CDP URL to `.env` +3. **downloads** - Catches file downloads (reloads page with listeners) +4. **images** - Catches all images (reloads page with listeners) +5. **infiniscroll** - Scrolls page to load lazy content +6. **favicon** - Downloads favicon (can work independently) +7. **title** - Extracts title using existing Chrome tab +8. **headers** - Extracts headers using existing Chrome tab +9. **screenshot** - Takes screenshot using existing Chrome tab +10. **pdf** - Generates PDF using existing Chrome tab +11. **dom** - Extracts DOM HTML using existing Chrome tab +12. **htmltotext** - Extracts plain text using existing Chrome tab +13. **readability** - Extracts article content using existing Chrome tab +14. **singlefile** - Creates single-file archive using browser extension +15. **wget** - Downloads with wget (independent) +16. **git** - Clones git repository (independent) +17. **media** - Downloads media with yt-dlp (independent) +18. **archive_org** - Submits to Internet Archive (independent) Only extractors that are both: - Requested (via `--extractors` or default: all) @@ -234,6 +235,21 @@ Represents the result of running one extractor on one snapshot. 
## Available Extractors +### 2captcha +- **Language**: Node.js +- **Dependencies**: unzip-crx-3 +- **Output**: `./extensions/` directory with unpacked Chrome extensions +- **Config**: + - `API_KEY_2CAPTCHA` - 2Captcha API key for CAPTCHA solving (optional) + - `EXTENSIONS_ENABLED` - Comma-separated list of extensions to enable (default: all) +- **Purpose**: Downloads and configures Chrome extensions before browser launch +- **Extensions included**: + - `2captcha` - Automatic CAPTCHA solving + - `singlefile` - Single-file HTML archiving + - `ublock` - Ad blocking + - `istilldontcareaboutcookies` - Cookie consent blocker +- **Note**: Must run BEFORE puppeteer extractor + ### puppeteer - **Language**: Node.js + Puppeteer - **Dependencies**: puppeteer (includes Chrome) @@ -241,7 +257,10 @@ Represents the result of running one extractor on one snapshot. - **Config**: - `PUPPETEER_TIMEOUT` - Timeout in milliseconds (default: 30000) - `CHROME_USER_DATA_DIR` - Chrome user data directory (default: ~/.chrome-archivebox) -- **Purpose**: Launches Chrome and makes CDP URL available to other extractors + - `CHROME_EXTENSIONS_PATHS` - From .env (set by 2captcha extractor) + - `CHROME_EXTENSIONS_IDS` - From .env (set by 2captcha extractor) +- **Purpose**: Launches Chrome with extensions and makes CDP URL available to other extractors +- **Note**: Runs in headed mode if extensions are loaded (extensions require visible browser) ### downloads - **Language**: Node.js + Puppeteer @@ -360,12 +379,17 @@ Represents the result of running one extractor on one snapshot. 
- `READABILITY_TIMEOUT` - Timeout in milliseconds (default: 10000) ### singlefile -- **Language**: Bash -- **Dependencies**: single-file-cli (auto-installed via npm) +- **Language**: Node.js + Puppeteer +- **Dependencies**: puppeteer-core, SingleFile browser extension (from 2captcha extractor) - **Output**: `singlefile.html` +- **Requires**: 2captcha and puppeteer extractors must run first - **Config**: + - `CHROME_CDP_URL` - From .env (set by puppeteer extractor) + - `CHROME_PAGE_TARGET_ID` - From .env (set by puppeteer extractor) + - `CHROME_USER_DATA_DIR` - For finding downloads directory - `SINGLEFILE_TIMEOUT` - Timeout in seconds (default: 60) - - `CHROME_CDP_URL` - Optional: uses existing Chrome if available +- **Purpose**: Creates single-file HTML archive using the SingleFile browser extension +- **Note**: Triggers extension via Ctrl+Shift+Y keyboard shortcut ### wget - **Language**: Bash diff --git a/archivebox-ts/extractors/2captcha b/archivebox-ts/extractors/2captcha new file mode 100755 index 00000000..e6c2e16a --- /dev/null +++ b/archivebox-ts/extractors/2captcha @@ -0,0 +1,155 @@ +#!/usr/bin/env node + +/** + * 2Captcha Extension Extractor + * + * Downloads, unpacks, and configures Chrome extensions needed for archiving. + * Must run BEFORE the puppeteer extractor so extensions are available at Chrome launch. 
+ * + * Input: + * $1 - URL to archive (not used, but required by extractor contract) + * + * Output: + * - ./extensions/<webstore_id>__<name>/ - Unpacked extension directories + * - Writes CHROME_EXTENSIONS_PATHS and CHROME_EXTENSIONS_IDS to .env for puppeteer to load + * + * Environment variables: + * API_KEY_2CAPTCHA - 2Captcha API key for CAPTCHA solving (optional) + * EXTENSIONS_ENABLED - Comma-separated list of extensions to enable (default: all) + */ + +const fs = require('fs'); +const path = require('path'); +const crypto = require('crypto'); +const { Readable } = require('stream'); +const { finished } = require('stream/promises'); + +// Extension definitions (webstore_id + name) +const CHROME_EXTENSIONS = [ + // CAPTCHA solving + {webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo', name: '2captcha'}, + + // Archiving tools + {webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle', name: 'singlefile'}, + + // Content access / unblocking + {webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm', name: 'istilldontcareaboutcookies'}, + {webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm', name: 'ublock'}, +]; + +async function main() { + const url = process.argv[2]; + if (!url) { + console.error('Usage: 2captcha '); + process.exit(1); + } + + const API_KEY_2CAPTCHA = process.env.API_KEY_2CAPTCHA || null; + const EXTENSIONS_ENABLED = (process.env.EXTENSIONS_ENABLED || 'all').split(','); + + // Create extensions directory in snapshot output + const extensionsDir = path.join(process.cwd(), 'extensions'); + fs.mkdirSync(extensionsDir, {recursive: true}); + + console.log(`[🧩] Downloading and configuring ${CHROME_EXTENSIONS.length} Chrome extensions...`); + + const loadedExtensions = []; + + for (const ext of CHROME_EXTENSIONS) { + // Skip if not in enabled list (unless 'all' is specified) + if (!EXTENSIONS_ENABLED.includes('all') && !EXTENSIONS_ENABLED.includes(ext.name)) { + console.log(`[⊗] Skipping ${ext.name} (not enabled)`); + continue; + } + + const crxUrl = 
`https://clients2.google.com/service/update2/crx?response=redirect&prodversion=1230&acceptformat=crx3&x=id%3D${ext.webstore_id}%26uc`; + const crxPath = path.join(extensionsDir, `${ext.webstore_id}__${ext.name}.crx`); + const unpackedPath = path.join(extensionsDir, `${ext.webstore_id}__${ext.name}`); + + try { + // Download CRX if not already present + if (!fs.existsSync(crxPath)) { + console.log(`[⬇️] Downloading ${ext.name} extension...`); + const response = await fetch(crxUrl); + if (!response.ok) throw new Error(`HTTP ${response.status}`); + + const crxFile = fs.createWriteStream(crxPath); + await finished(Readable.fromWeb(response.body).pipe(crxFile)); + } + + // Unpack CRX if not already unpacked + if (!fs.existsSync(path.join(unpackedPath, 'manifest.json'))) { + console.log(`[📦] Unpacking ${ext.name} extension...`); + fs.mkdirSync(unpackedPath, {recursive: true}); + + // Try using unzip command + const {execSync} = require('child_process'); + try { + execSync(`unzip -q -o "${crxPath}" -d "${unpackedPath}"`, {stdio: 'ignore'}); + } catch(err) { + // Fallback to unzip-crx-3 if unzip command fails + try { + const unzip = require('unzip-crx-3'); + await unzip(crxPath, unpackedPath); + } catch(err2) { + console.error(`[❌] Failed to unpack ${ext.name}:`, err2.message); + continue; + } + } + } + + // Verify manifest exists + const manifestPath = path.join(unpackedPath, 'manifest.json'); + if (!fs.existsSync(manifestPath)) { + console.error(`[❌] No manifest.json found for ${ext.name}`); + continue; + } + + const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8')); + console.log(`[✓] Loaded ${ext.name} v${manifest.version}`); + + loadedExtensions.push({ + name: ext.name, + webstore_id: ext.webstore_id, + path: unpackedPath, + version: manifest.version, + }); + + } catch(err) { + console.error(`[❌] Failed to load ${ext.name}:`, err.message); + } + } + + if (!loadedExtensions.length) { + console.error('[❌] No extensions were successfully loaded'); + 
process.exit(1); + } + + // Write extension paths to .env for puppeteer to load + const extensionPaths = loadedExtensions.map(e => e.path).join(','); + const extensionIds = loadedExtensions.map(e => e.webstore_id).join(','); + + const envPath = path.join(process.cwd(), '.env'); + const envContent = ` +# Chrome extensions (written by 2captcha extractor) +CHROME_EXTENSIONS_PATHS="${extensionPaths}" +CHROME_EXTENSIONS_IDS="${extensionIds}" +${API_KEY_2CAPTCHA ? `API_KEY_2CAPTCHA="${API_KEY_2CAPTCHA}"` : '# API_KEY_2CAPTCHA not set'} +`; + + fs.appendFileSync(envPath, envContent); + console.log(`[✓] Configured ${loadedExtensions.length} extensions for Chrome`); + + // Write extensions metadata to JSON for debugging + fs.writeFileSync( + path.join(extensionsDir, 'extensions.json'), + JSON.stringify(loadedExtensions, null, 2) + ); + + process.exit(0); +} + +main().catch(err => { + console.error('[❌] Extension setup failed:', err); + process.exit(1); +}); diff --git a/archivebox-ts/extractors/puppeteer b/archivebox-ts/extractors/puppeteer index c0433fca..be62dc4e 100755 --- a/archivebox-ts/extractors/puppeteer +++ b/archivebox-ts/extractors/puppeteer @@ -45,9 +45,14 @@ async function main() { const timeout = parseInt(process.env.PUPPETEER_TIMEOUT || '30000', 10); const userDataDir = process.env.CHROME_USER_DATA_DIR || path.join(os.homedir(), '.chrome-archivebox'); + const extensionPaths = process.env.CHROME_EXTENSIONS_PATHS || ''; + const extensionIds = process.env.CHROME_EXTENSIONS_IDS || ''; console.error(`Launching Chrome for: ${url}`); console.error(`User data dir: ${userDataDir}`); + if (extensionPaths) { + console.error(`Loading extensions: ${extensionIds}`); + } // Check puppeteer is installed if (!checkPuppeteer()) { @@ -59,15 +64,24 @@ async function main() { let browser = null; try { + // Build Chrome launch args + const chromeArgs = [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + ]; + + // Add extensions if available + if 
(extensionPaths) { + chromeArgs.push(`--load-extension=${extensionPaths}`); + chromeArgs.push(`--allowlisted-extension-id=${extensionIds}`); + } + // Launch Chrome with a user data directory browser = await puppeteer.launch({ - headless: 'new', + headless: extensionPaths ? false : 'new', // extensions require headed mode userDataDir: userDataDir, - args: [ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - ] + args: chromeArgs, }); // Get the WebSocket endpoint diff --git a/archivebox-ts/extractors/singlefile b/archivebox-ts/extractors/singlefile index 48251aa1..d4311b00 100755 --- a/archivebox-ts/extractors/singlefile +++ b/archivebox-ts/extractors/singlefile @@ -1,62 +1,210 @@ -#!/bin/bash -# -# SingleFile Extractor -# Creates a single HTML file archive using single-file-cli -# -# Usage: singlefile -# Output: singlefile.html in current directory -# Config: All configuration via environment variables -# SINGLEFILE_TIMEOUT - Timeout in seconds (default: 60) -# CHROME_CDP_URL - Optional: Chrome CDP URL to use existing browser -# +#!/usr/bin/env node -set -e +/** + * SingleFile Extractor + * Creates a single HTML file archive using the SingleFile browser extension + * + * Usage: singlefile + * Output: singlefile.html in current directory + * Config: All configuration via environment variables + * SINGLEFILE_TIMEOUT - Timeout in seconds (default: 60) + * CHROME_CDP_URL - Chrome CDP URL to use existing browser (required) + * CHROME_PAGE_TARGET_ID - Target ID of the page to archive (required) + * CHROME_USER_DATA_DIR - Chrome user data directory for downloads + */ -URL="$1" +const fs = require('fs'); +const path = require('path'); -if [ -z "$URL" ]; then - echo "Error: URL argument required" >&2 - exit 1 -fi +async function waitForFile(filePath, timeout = 30000) { + const startTime = Date.now(); + while (Date.now() - startTime < timeout) { + if (fs.existsSync(filePath) && fs.statSync(filePath).size > 0) { + return true; + } + await new 
Promise(resolve => setTimeout(resolve, 500)); + } + return false; +} -# Auto-install single-file-cli if not available -if ! command -v single-file &> /dev/null; then - echo "Installing single-file-cli..." >&2 - npm install -g single-file-cli 2>&1 | grep -v "^npm WARN" || true -fi +async function main() { + const url = process.argv[2]; + if (!url) { + console.error('Error: URL argument required'); + process.exit(1); + } -# Configuration from environment -TIMEOUT="${SINGLEFILE_TIMEOUT:-60}" + const cdpUrl = process.env.CHROME_CDP_URL; + const targetId = process.env.CHROME_PAGE_TARGET_ID; + const userDataDir = process.env.CHROME_USER_DATA_DIR; -echo "Creating single-file archive of: $URL" >&2 + if (!cdpUrl) { + console.error('Error: CHROME_CDP_URL environment variable is required'); + console.error('SingleFile extractor must run after puppeteer extractor'); + process.exit(1); + } -# Check if we have a Chrome CDP URL to use -if [ -n "$CHROME_CDP_URL" ]; then - echo "Using existing Chrome instance via CDP" >&2 - # Extract the debugging port from CDP URL - # ws://127.0.0.1:12345/devtools/browser/... -> 12345 - PORT=$(echo "$CHROME_CDP_URL" | sed -E 's|.*:([0-9]+)/.*|\1|') + if (!targetId) { + console.error('Error: CHROME_PAGE_TARGET_ID environment variable is required'); + process.exit(1); + } - single-file \ - --browser-args="--remote-debugging-port=$PORT" \ - --browser-server="ws://localhost:$PORT" \ - --dump-content \ - "$URL" \ - singlefile.html 2>&1 | grep -v "^$" || true -else - echo "Launching new browser instance" >&2 - single-file \ - --dump-content \ - "$URL" \ - singlefile.html 2>&1 | grep -v "^$" || true -fi + const timeout = parseInt(process.env.SINGLEFILE_TIMEOUT || '60', 10) * 1000; + const downloadsDir = userDataDir + ? 
path.join(userDataDir, '../chrome_downloads') + : path.join(process.env.HOME || process.env.USERPROFILE, 'Downloads'); -if [ -f "singlefile.html" ] && [ -s "singlefile.html" ]; then - SIZE=$(stat -f%z "singlefile.html" 2>/dev/null || stat -c%s "singlefile.html" 2>/dev/null || echo "unknown") - echo "✓ Created single-file archive ($SIZE bytes)" >&2 - echo "singlefile.html" - exit 0 -else - echo "Error: Failed to create single-file archive" >&2 - exit 1 -fi + console.error(`Creating single-file archive of: ${url}`); + console.error(`Connecting to Chrome at ${cdpUrl}...`); + + // Check if puppeteer-core is available + let puppeteer; + try { + puppeteer = require('puppeteer-core'); + } catch (e) { + console.error('Error: puppeteer-core is not installed'); + console.error('Please install it with: npm install puppeteer-core'); + process.exit(1); + } + + let browser; + try { + // Connect to existing browser + browser = await puppeteer.connect({ + browserWSEndpoint: cdpUrl, + defaultViewport: null + }); + + // Get the page we're archiving + const targets = await browser.targets(); + const target = targets.find(t => t._targetId === targetId); + + if (!target) { + console.error(`Error: Could not find target with ID ${targetId}`); + process.exit(1); + } + + const page = await target.page(); + console.error('Connected to existing page'); + + // Get list of existing HTML files in downloads before triggering save + const filesBeforeSave = new Set( + fs.existsSync(downloadsDir) + ? 
fs.readdirSync(downloadsDir).filter(f => f.endsWith('.html')) + : [] + ); + + // Trigger the SingleFile extension by sending the keyboard shortcut + // SingleFile responds to Ctrl+Shift+Y by default + console.error('Triggering SingleFile extension...'); + + // Bring page to front (extension actions work on active tab) + await page.bringToFront(); + + // Wait a moment for page to be ready + await new Promise(resolve => setTimeout(resolve, 1000)); + + // Try clicking via CDP if we can find the extension + let triggered = false; + try { + // Look for SingleFile extension background page + const extensionTargets = await browser.targets(); + const singlefileTarget = extensionTargets.find(t => + t.url().includes('chrome-extension://') && + t.url().includes('mpiodijhokgodhhofbcjdecpffjipkle') + ); + + if (singlefileTarget) { + const extensionPage = await singlefileTarget.page(); + if (extensionPage) { + // Trigger save via extension's internal API + await page.evaluate(() => { + // Dispatch message to extension to trigger save + window.postMessage({ type: 'singlefile-save' }, '*'); + }); + triggered = true; + console.error('Triggered via extension message'); + } + } + } catch (err) { + console.error('Could not trigger via extension API, trying keyboard shortcut'); + } + + // Fallback: trigger via keyboard shortcut + if (!triggered) { + await page.keyboard.down('Control'); + await page.keyboard.down('Shift'); + await page.keyboard.press('Y'); + await page.keyboard.up('Shift'); + await page.keyboard.up('Control'); + console.error('Triggered via Ctrl+Shift+Y keyboard shortcut'); + } + + // Wait for new HTML file to appear in downloads + console.error(`Waiting for file in ${downloadsDir}...`); + + let savedFile = null; + const checkDelay = 500; + const maxTries = timeout / checkDelay; + + for (let i = 0; i < maxTries; i++) { + await new Promise(resolve => setTimeout(resolve, checkDelay)); + + if (!fs.existsSync(downloadsDir)) continue; + + const filesAfter = 
fs.readdirSync(downloadsDir) + .filter(f => f.endsWith('.html')); + const newFiles = filesAfter.filter(f => !filesBeforeSave.has(f)); + + if (newFiles.length > 0) { + // Find the file that matches our URL + for (const file of newFiles) { + const filePath = path.join(downloadsDir, file); + const stats = fs.statSync(filePath); + + // Check if file is large enough and contains the URL + if (stats.size > 1000) { + const content = fs.readFileSync(filePath, 'utf-8').substring(0, 5000); + if (content.includes(url) || content.includes('Saved by SingleFile')) { + savedFile = filePath; + break; + } + } + } + + if (savedFile) break; + } + } + + if (!savedFile) { + console.error('Error: SingleFile did not create a download file'); + console.error(`Expected file in: ${downloadsDir}`); + console.error('Make sure the SingleFile extension is installed and enabled'); + process.exit(1); + } + + // Move the file to output location + const outputPath = path.join(process.cwd(), 'singlefile.html'); + fs.renameSync(savedFile, outputPath); + + const size = fs.statSync(outputPath).size; + console.error(`✓ Created single-file archive (${size} bytes)`); + console.log('singlefile.html'); + + await browser.disconnect(); + process.exit(0); + + } catch (err) { + console.error(`Error: ${err.message}`); + if (browser) { + try { + await browser.disconnect(); + } catch (e) { + // Ignore disconnect errors + } + } + process.exit(1); + } +} + +main(); diff --git a/archivebox-ts/package-lock.json b/archivebox-ts/package-lock.json index 0582b738..466d1a3f 100644 --- a/archivebox-ts/package-lock.json +++ b/archivebox-ts/package-lock.json @@ -15,7 +15,8 @@ "jsdom": "^27.1.0", "nanoid": "^3.3.7", "puppeteer": "^24.28.0", - "puppeteer-core": "^24.28.0" + "puppeteer-core": "^24.28.0", + "unzip-crx-3": "^0.2.0" }, "bin": { "archivebox-ts": "dist/cli.js" @@ -652,6 +653,12 @@ "node": ">=18" } }, + "node_modules/core-util-is": { + "version": "1.0.3", + "resolved": 
"https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz", + "integrity": "sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==", + "license": "MIT" + }, "node_modules/cosmiconfig": { "version": "9.0.0", "resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-9.0.0.tgz", @@ -1088,6 +1095,12 @@ ], "license": "BSD-3-Clause" }, + "node_modules/immediate": { + "version": "3.0.6", + "resolved": "https://registry.npmjs.org/immediate/-/immediate-3.0.6.tgz", + "integrity": "sha512-XXOFtyqDjNDAQxVfYxuF7g9Il/IbWmmlQg2MYKOH8ExIT1qg6xc4zyS3HaEEATgs1btfzxq15ciUiY7gjSXRGQ==", + "license": "MIT" + }, "node_modules/import-fresh": { "version": "3.3.1", "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.1.tgz", @@ -1146,6 +1159,12 @@ "integrity": "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==", "license": "MIT" }, + "node_modules/isarray": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", + "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==", + "license": "MIT" + }, "node_modules/js-tokens": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", @@ -1209,6 +1228,57 @@ "integrity": "sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w==", "license": "MIT" }, + "node_modules/jszip": { + "version": "3.10.1", + "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.10.1.tgz", + "integrity": "sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==", + "license": "(MIT OR GPL-3.0-or-later)", + "dependencies": { + "lie": "~3.3.0", + "pako": "~1.0.2", + "readable-stream": "~2.3.6", + "setimmediate": "^1.0.5" + } + }, + "node_modules/jszip/node_modules/readable-stream": { + "version": "2.3.8", + "resolved": 
"https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.8.tgz", + "integrity": "sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==", + "license": "MIT", + "dependencies": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.3", + "isarray": "~1.0.0", + "process-nextick-args": "~2.0.0", + "safe-buffer": "~5.1.1", + "string_decoder": "~1.1.1", + "util-deprecate": "~1.0.1" + } + }, + "node_modules/jszip/node_modules/safe-buffer": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", + "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==", + "license": "MIT" + }, + "node_modules/jszip/node_modules/string_decoder": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", + "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", + "license": "MIT", + "dependencies": { + "safe-buffer": "~5.1.0" + } + }, + "node_modules/lie": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/lie/-/lie-3.3.0.tgz", + "integrity": "sha512-UaiMJzeWRlEujzAuw5LokY1L5ecNQYZKfmyZ9L7wDHb/p5etKaxXhohBcrw0EYby+G/NA52vRSN4N39dxHAIwQ==", + "license": "MIT", + "dependencies": { + "immediate": "~3.0.5" + } + }, "node_modules/lines-and-columns": { "version": "1.2.4", "resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz", @@ -1257,6 +1327,18 @@ "integrity": "sha512-vKivATfr97l2/QBCYAkXYDbrIWPM2IIKEl7YPhjCvKlG3kE2gm+uBo6nEXK3M5/Ffh/FLpKExzOQ3JJoJGFKBw==", "license": "MIT" }, + "node_modules/mkdirp": { + "version": "0.5.6", + "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.6.tgz", + "integrity": "sha512-FP+p8RB8OWpF3YZBCrP5gtADmtXApB5AMLn+vdyA+PyxCjrCs00mjyUozssO33cwDeT3wNGdLxJ5M//YqtHAJw==", + "license": "MIT", + "dependencies": { + "minimist": "^1.2.6" + }, + "bin": { + "mkdirp": 
"bin/cmd.js" + } + }, "node_modules/mkdirp-classic": { "version": "0.5.3", "resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz", @@ -1355,6 +1437,12 @@ "node": ">= 14" } }, + "node_modules/pako": { + "version": "1.0.11", + "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz", + "integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==", + "license": "(MIT AND Zlib)" + }, "node_modules/parent-module": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz", @@ -1435,6 +1523,12 @@ "node": ">=10" } }, + "node_modules/process-nextick-args": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz", + "integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==", + "license": "MIT" + }, "node_modules/progress": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz", @@ -1633,6 +1727,12 @@ "node": ">=10" } }, + "node_modules/setimmediate": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/setimmediate/-/setimmediate-1.0.5.tgz", + "integrity": "sha512-MATJdZp8sLqDl/68LfQmbP8zKPLQNV6BIZoIgrscFDQ+RsvK/BxeDQOgyxKKoh0y/8h3BqVFnCqQ/gd+reiIXA==", + "license": "MIT" + }, "node_modules/simple-concat": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/simple-concat/-/simple-concat-1.0.1.tgz", @@ -1920,6 +2020,17 @@ "devOptional": true, "license": "MIT" }, + "node_modules/unzip-crx-3": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/unzip-crx-3/-/unzip-crx-3-0.2.0.tgz", + "integrity": "sha512-0+JiUq/z7faJ6oifVB5nSwt589v1KCduqIJupNVDoWSXZtWDmjDGO3RAEOvwJ07w90aoXoP4enKsR7ecMrJtWQ==", + "license": "MIT", + "dependencies": { + "jszip": "^3.1.0", + "mkdirp": "^0.5.1", + "yaku": "^0.16.6" + } + }, "node_modules/util-deprecate": { "version": "1.0.2", 
"resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", @@ -2055,6 +2166,12 @@ "node": ">=10" } }, + "node_modules/yaku": { + "version": "0.16.7", + "resolved": "https://registry.npmjs.org/yaku/-/yaku-0.16.7.tgz", + "integrity": "sha512-Syu3IB3rZvKvYk7yTiyl1bo/jiEFaaStrgv1V2TIJTqYPStSMQVO8EQjg/z+DRzLq/4LIIharNT3iH1hylEIRw==", + "license": "MIT" + }, "node_modules/yargs": { "version": "17.7.2", "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.7.2.tgz", diff --git a/archivebox-ts/package.json b/archivebox-ts/package.json index 3985af0b..baed0aa7 100644 --- a/archivebox-ts/package.json +++ b/archivebox-ts/package.json @@ -26,7 +26,8 @@ "jsdom": "^27.1.0", "nanoid": "^3.3.7", "puppeteer": "^24.28.0", - "puppeteer-core": "^24.28.0" + "puppeteer-core": "^24.28.0", + "unzip-crx-3": "^0.2.0" }, "devDependencies": { "@types/better-sqlite3": "^7.6.9", diff --git a/archivebox-ts/src/extractors.ts b/archivebox-ts/src/extractors.ts index 82b869f5..5d6fa4f3 100644 --- a/archivebox-ts/src/extractors.ts +++ b/archivebox-ts/src/extractors.ts @@ -9,10 +9,12 @@ import { spawn } from 'child_process'; import type { ExtractorName } from './models'; // Predefined order for running extractors -// puppeteer must be first as it launches the browser +// 2captcha must run FIRST to download/configure extensions before Chrome launches +// puppeteer runs second to launch Chrome with the extensions // downloads, images, and infiniscroll run early to capture dynamic content export const EXTRACTOR_ORDER: string[] = [ - 'puppeteer', // Launches Chrome and writes CDP URL to .env + '2captcha', // Downloads and configures Chrome extensions (2captcha, singlefile, etc.) 
+ 'puppeteer', // Launches Chrome with extensions and writes CDP URL to .env 'downloads', // Catches file downloads (reloads page with listeners) 'images', // Catches all images (reloads page with listeners) 'infiniscroll', // Scrolls page to load lazy content @@ -24,7 +26,7 @@ export const EXTRACTOR_ORDER: string[] = [ 'dom', // Extracts DOM HTML using existing Chrome tab 'htmltotext', // Extracts plain text using existing Chrome tab 'readability', // Extracts article content using existing Chrome tab - 'singlefile', // Creates single-file archive (may use existing Chrome) + 'singlefile', // Creates single-file archive using browser extension 'wget', // Downloads with wget (independent) 'git', // Clones git repository (independent) 'media', // Downloads media with yt-dlp (independent) diff --git a/archivebox-ts/src/models.ts b/archivebox-ts/src/models.ts index fdaec489..ec14f9b7 100644 --- a/archivebox-ts/src/models.ts +++ b/archivebox-ts/src/models.ts @@ -6,6 +6,7 @@ export type SnapshotStatus = 'queued' | 'started' | 'sealed'; export type ArchiveResultStatus = 'queued' | 'started' | 'backoff' | 'succeeded' | 'failed' | 'skipped'; export type ExtractorName = + | '2captcha' | 'puppeteer' | 'downloads' | 'images'