mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-05 18:35:50 +10:00
Add remaining extractors (dom, pdf, htmltotext, readability, singlefile, git, media, archive_org)
Implements all remaining extractors from original ArchiveBox following the serial execution pattern: Browser-based extractors (using Puppeteer + CDP): - dom: Extract full DOM HTML - pdf: Generate PDF of page - htmltotext: Extract plain text content - readability: Extract article content using Mozilla Readability algorithm Binary-based extractors (using native tools): - singlefile: Create single-file archive using single-file-cli - git: Clone git repositories - media: Download media using yt-dlp - archive_org: Submit to Internet Archive Wayback Machine All extractors: - Auto-install dependencies if needed - Accept URL as $1 argument - Output to current working directory - Configure via environment variables only - Read from .env file for shared state - Follow exit code contract (0 = success) Updates: - Added all extractor types to ExtractorName union in models.ts - Updated EXTRACTOR_ORDER with complete 14-extractor sequence - Installed jsdom and @mozilla/readability dependencies - Made all extractors executable - Updated README with complete documentation for all extractors
This commit is contained in:
@@ -137,17 +137,19 @@ node dist/cli.js extractors
|
||||
Extractors run serially in this predefined order (defined in `src/extractors.ts`):
|
||||
|
||||
1. **puppeteer** - Launches Chrome, writes CDP URL to `.env`
|
||||
2. **favicon** - Downloads favicon
|
||||
2. **favicon** - Downloads favicon (can work independently)
|
||||
3. **title** - Extracts title using existing Chrome tab
|
||||
4. **headers** - Extracts headers using existing Chrome tab
|
||||
5. **screenshot** - Takes screenshot using existing Chrome tab
|
||||
6. **dom** - Extracts DOM using existing Chrome tab
|
||||
7. **wget** - Downloads with wget
|
||||
8. **singlefile** - Single file archive
|
||||
9. **readability** - Readable content extraction
|
||||
10. **media** - Media downloads
|
||||
11. **git** - Git clone
|
||||
12. **archive_org** - Submit to archive.org
|
||||
6. **pdf** - Generates PDF using existing Chrome tab
|
||||
7. **dom** - Extracts DOM HTML using existing Chrome tab
|
||||
8. **htmltotext** - Extracts plain text using existing Chrome tab
|
||||
9. **readability** - Extracts article content using existing Chrome tab
|
||||
10. **singlefile** - Creates single-file archive (may use existing Chrome)
|
||||
11. **wget** - Downloads with wget (independent)
|
||||
12. **git** - Clones git repository (independent)
|
||||
13. **media** - Downloads media with yt-dlp (independent)
|
||||
14. **archive_org** - Submits to Internet Archive (independent)
|
||||
|
||||
Only extractors that are both:
|
||||
- Requested (via `--extractors` or default: all)
|
||||
@@ -275,6 +277,55 @@ Represents the result of running one extractor on one snapshot.
|
||||
- `SCREENSHOT_HEIGHT` - Viewport height (default: 1080)
|
||||
- `SCREENSHOT_WAIT` - Wait time before screenshot in ms (default: 1000)
|
||||
|
||||
### pdf
|
||||
- **Language**: Node.js + Puppeteer
|
||||
- **Dependencies**: puppeteer-core, Chrome (from puppeteer extractor)
|
||||
- **Output**: `output.pdf`
|
||||
- **Requires**: puppeteer extractor must run first
|
||||
- **Config**:
|
||||
- `CHROME_CDP_URL` - From .env (set by puppeteer extractor)
|
||||
- `CHROME_PAGE_TARGET_ID` - From .env (set by puppeteer extractor)
|
||||
- `PDF_TIMEOUT` - Timeout in milliseconds (default: 30000)
|
||||
- `PDF_FORMAT` - Page format: Letter, A4, etc. (default: A4)
|
||||
|
||||
### dom
|
||||
- **Language**: Node.js + Puppeteer
|
||||
- **Dependencies**: puppeteer-core, Chrome (from puppeteer extractor)
|
||||
- **Output**: `output.html`
|
||||
- **Requires**: puppeteer extractor must run first
|
||||
- **Config**:
|
||||
- `CHROME_CDP_URL` - From .env (set by puppeteer extractor)
|
||||
- `CHROME_PAGE_TARGET_ID` - From .env (set by puppeteer extractor)
|
||||
- `DOM_TIMEOUT` - Timeout in milliseconds (default: 10000)
|
||||
|
||||
### htmltotext
|
||||
- **Language**: Node.js + Puppeteer
|
||||
- **Dependencies**: puppeteer-core, Chrome (from puppeteer extractor)
|
||||
- **Output**: `output.txt`
|
||||
- **Requires**: puppeteer extractor must run first
|
||||
- **Config**:
|
||||
- `CHROME_CDP_URL` - From .env (set by puppeteer extractor)
|
||||
- `CHROME_PAGE_TARGET_ID` - From .env (set by puppeteer extractor)
|
||||
- `HTMLTOTEXT_TIMEOUT` - Timeout in milliseconds (default: 10000)
|
||||
|
||||
### readability
|
||||
- **Language**: Node.js with Mozilla Readability
|
||||
- **Dependencies**: puppeteer-core, jsdom, @mozilla/readability, Chrome (from puppeteer extractor)
|
||||
- **Output**: `readability.html` and `readability.json`
|
||||
- **Requires**: puppeteer extractor must run first
|
||||
- **Config**:
|
||||
- `CHROME_CDP_URL` - From .env (set by puppeteer extractor)
|
||||
- `CHROME_PAGE_TARGET_ID` - From .env (set by puppeteer extractor)
|
||||
- `READABILITY_TIMEOUT` - Timeout in milliseconds (default: 10000)
|
||||
|
||||
### singlefile
|
||||
- **Language**: Bash
|
||||
- **Dependencies**: single-file-cli (auto-installed via npm)
|
||||
- **Output**: `singlefile.html`
|
||||
- **Config**:
|
||||
- `SINGLEFILE_TIMEOUT` - Timeout in seconds (default: 60)
|
||||
- `CHROME_CDP_URL` - Optional: uses existing Chrome if available
|
||||
|
||||
### wget
|
||||
- **Language**: Bash
|
||||
- **Dependencies**: wget (auto-installed)
|
||||
@@ -284,6 +335,31 @@ Represents the result of running one extractor on one snapshot.
|
||||
- `WGET_USER_AGENT` - User agent string
|
||||
- `WGET_ARGS` - Additional wget arguments
|
||||
|
||||
### git
|
||||
- **Language**: Bash
|
||||
- **Dependencies**: git (auto-installed)
|
||||
- **Output**: `git/` directory with cloned repository
|
||||
- **Config**:
|
||||
- `GIT_TIMEOUT` - Timeout in seconds (default: 300)
|
||||
- `GIT_DEPTH` - Clone depth (default: full clone)
|
||||
- **Note**: Only runs if URL appears to be a git repository
|
||||
|
||||
### media
|
||||
- **Language**: Bash
|
||||
- **Dependencies**: yt-dlp (auto-installed)
|
||||
- **Output**: `media/` directory with downloaded media files
|
||||
- **Config**:
|
||||
- `MEDIA_TIMEOUT` - Timeout in seconds (default: 3600)
|
||||
- `MEDIA_MAX_SIZE` - Max file size (default: 750m)
|
||||
- `MEDIA_FORMAT` - Format selection (default: best)
|
||||
|
||||
### archive_org
|
||||
- **Language**: Bash
|
||||
- **Dependencies**: curl (auto-installed)
|
||||
- **Output**: `archive_org.txt` with archived URL
|
||||
- **Config**:
|
||||
- `ARCHIVE_ORG_TIMEOUT` - Timeout in seconds (default: 60)
|
||||
|
||||
## Creating Custom Extractors
|
||||
|
||||
Extractors are standalone executable files in the `extractors/` directory.
|
||||
|
||||
90
archivebox-ts/extractors/archive_org
Executable file
90
archivebox-ts/extractors/archive_org
Executable file
@@ -0,0 +1,90 @@
|
||||
#!/bin/bash
#
# Archive.org Extractor
# Submits URL to Internet Archive Wayback Machine
#
# Usage: archive_org <url>
# Output: archive_org.txt with the archived URL
# Config: All configuration via environment variables
#   ARCHIVE_ORG_TIMEOUT - Timeout in seconds (default: 60)
#
# Exit codes: 0 when an archived URL was written to archive_org.txt,
#             1 when the submission failed and no existing snapshot was found.
#

set -e

URL="$1"

if [ -z "$URL" ]; then
    echo "Error: URL argument required" >&2
    exit 1
fi

# Auto-install curl if not available
if ! command -v curl &> /dev/null; then
    echo "Installing curl..." >&2
    if command -v apt-get &> /dev/null; then
        sudo apt-get update && sudo apt-get install -y curl
    elif command -v yum &> /dev/null; then
        sudo yum install -y curl
    elif command -v brew &> /dev/null; then
        brew install curl
    else
        echo "Error: Cannot install curl. Please install manually." >&2
        exit 1
    fi
fi

# Configuration from environment
TIMEOUT="${ARCHIVE_ORG_TIMEOUT:-60}"

echo "Submitting to Internet Archive: $URL" >&2

# Submit to Wayback Machine Save API.
# The -w format appends the HTTP status code as the final line of the output.
# curl's own exit status is captured separately so that a transport failure
# (timeout, DNS, ...) is reported as "000" instead of being glued onto
# whatever curl already printed.
CURL_STATUS=0
RESPONSE=$(curl -s -m "$TIMEOUT" -X POST \
    "https://web.archive.org/save/$URL" \
    -H "Accept: application/json" \
    -w "\n%{http_code}" \
    2>/dev/null) || CURL_STATUS=$?

if [ "$CURL_STATUS" -eq 0 ]; then
    # Extract HTTP status code (last line)
    HTTP_CODE=$(printf '%s\n' "$RESPONSE" | tail -n 1)
    # Extract response body (all but last line). `sed '$d'` is used instead of
    # `head -n -1` because negative counts are a GNU extension and fail on
    # BSD/macOS head.
    BODY=$(printf '%s\n' "$RESPONSE" | sed '$d')
else
    HTTP_CODE="000"
    BODY=""
fi

echo "HTTP Status: $HTTP_CODE" >&2

if [ "$HTTP_CODE" = "200" ] || [ "$HTTP_CODE" = "302" ]; then
    # Try to extract the archived URL from response
    ARCHIVED_URL=$(printf '%s\n' "$BODY" | grep -Eo 'https?://web\.archive\.org/web/[^"]*' | head -n 1 || echo "")

    if [ -z "$ARCHIVED_URL" ]; then
        # Fallback: construct URL from current timestamp.
        # NOTE(review): this is a best guess based on the local clock, not a
        # snapshot URL confirmed by the server.
        TIMESTAMP=$(date +%Y%m%d%H%M%S)
        ARCHIVED_URL="https://web.archive.org/web/${TIMESTAMP}/${URL}"
    fi

    echo "$ARCHIVED_URL" > archive_org.txt
    echo "✓ Submitted to Archive.org" >&2
    echo "  Archived URL: $ARCHIVED_URL" >&2
    echo "archive_org.txt"
    exit 0
else
    # Check if already archived
    echo "Checking if already archived..." >&2
    # -G + --data-urlencode sends the URL as a properly encoded query
    # parameter; passing it raw breaks on URLs containing '&', '#' or spaces.
    EXISTING=$(curl -s -m 10 -G --data-urlencode "url=$URL" \
        "https://archive.org/wayback/available" 2>/dev/null || echo "")

    # Tolerate optional whitespace after the colon and http/https snapshot
    # URLs, since the exact JSON formatting is not under this script's control.
    if printf '%s\n' "$EXISTING" | grep -Eq '"available": *true'; then
        ARCHIVED_URL=$(printf '%s\n' "$EXISTING" | grep -Eo 'https?://web\.archive\.org/web/[^"]*' | head -n 1 || echo "")

        if [ -n "$ARCHIVED_URL" ]; then
            echo "$ARCHIVED_URL" > archive_org.txt
            echo "✓ Already archived at Archive.org" >&2
            echo "  Archived URL: $ARCHIVED_URL" >&2
            echo "archive_org.txt"
            exit 0
        fi
    fi

    echo "Warning: Failed to submit to Archive.org (HTTP $HTTP_CODE)" >&2
    exit 1
fi
|
||||
113
archivebox-ts/extractors/dom
Executable file
113
archivebox-ts/extractors/dom
Executable file
@@ -0,0 +1,113 @@
|
||||
#!/usr/bin/env node
|
||||
//
|
||||
// DOM Extractor
|
||||
// Extracts the full DOM HTML from a given URL using Puppeteer
|
||||
//
|
||||
// This extractor reuses an existing Chrome tab if CHROME_CDP_URL and
|
||||
// CHROME_PAGE_TARGET_ID are set in the environment (from puppeteer extractor).
|
||||
//
|
||||
// Usage: dom <url>
|
||||
// Output: output.html in current directory
|
||||
// Config: All configuration via environment variables
|
||||
// CHROME_CDP_URL - Chrome DevTools Protocol WebSocket URL (from puppeteer extractor)
|
||||
// CHROME_PAGE_TARGET_ID - Target ID of the page to reuse (from puppeteer extractor)
|
||||
// DOM_TIMEOUT - Timeout in milliseconds (default: 10000)
|
||||
//
|
||||
|
||||
const fs = require('fs');
|
||||
|
||||
// Check if puppeteer is available
|
||||
// Returns true when the 'puppeteer-core' module can be resolved from this
// script; otherwise prints install instructions to stderr and returns false.
function checkPuppeteer() {
  let available = true;

  try {
    require.resolve('puppeteer-core');
  } catch (_err) {
    available = false;
    console.error('Error: puppeteer-core is not installed.');
    console.error('Please install it with: npm install puppeteer-core');
  }

  return available;
}
|
||||
|
||||
// Entry point of the DOM extractor.
//
// Reads the target URL from process.argv[2] and its configuration from the
// environment (CHROME_CDP_URL, CHROME_PAGE_TARGET_ID, DOM_TIMEOUT), serializes
// the page's DOM and writes it to output.html in the current directory.
// Prints 'output.html' on stdout and exits 0 on success; exits 1 on failure.
async function main() {
  const url = process.argv[2];

  if (!url) {
    console.error('Error: URL argument required');
    process.exit(1);
  }

  // Configuration from environment
  const cdpUrl = process.env.CHROME_CDP_URL;
  const pageTargetId = process.env.CHROME_PAGE_TARGET_ID;
  // An unparsable DOM_TIMEOUT would yield NaN; fall back to the documented
  // default instead of handing NaN to page.goto().
  const parsedTimeout = parseInt(process.env.DOM_TIMEOUT || '10000', 10);
  const timeout = Number.isNaN(parsedTimeout) || parsedTimeout < 0 ? 10000 : parsedTimeout;

  console.error(`Extracting DOM from: ${url}`);

  // Check puppeteer is installed
  if (!checkPuppeteer()) {
    process.exit(1);
  }

  if (!cdpUrl) {
    console.error('Error: CHROME_CDP_URL environment variable not set.');
    console.error('The puppeteer extractor should have set this in .env');
    console.error('Make sure puppeteer extractor runs before this one.');
    process.exit(1);
  }

  const puppeteer = require('puppeteer-core');

  let browser = null;
  let page = null;
  let exitCode = 1;

  try {
    // Connect to existing browser
    console.error(`Connecting to browser via CDP: ${cdpUrl}`);
    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl
    });

    // Try to reuse existing page if target ID is available
    if (pageTargetId) {
      console.error(`Attempting to reuse existing page: ${pageTargetId}`);
      const pages = await browser.pages();
      // NOTE(review): _targetId is an underscore-prefixed (non-public) field
      // of the target object; confirm it exists in the installed
      // puppeteer-core version.
      page = pages.find(p => p.target()._targetId === pageTargetId);

      if (page) {
        console.error(`✓ Reusing existing page`);
      } else {
        console.error(`⚠ Could not find existing page, creating new one`);
      }
    } else {
      console.error(`⚠ No page target ID, creating new page`);
    }

    // No reusable tab: open a fresh one and navigate to the URL.
    if (!page) {
      page = await browser.newPage();
      await page.goto(url, { timeout, waitUntil: 'domcontentloaded' });
    }

    // Get the full HTML
    const html = await page.content();

    if (html && html.trim()) {
      // Write to file
      fs.writeFileSync('output.html', html, 'utf8');
      // Report the encoded size; html.length counts UTF-16 code units, not bytes.
      console.error(`✓ Extracted DOM (${Buffer.byteLength(html, 'utf8')} bytes)`);
      console.log('output.html');

      // Leave page open for next extractor
      console.error(`⚠ Leaving page open for other extractors`);
      exitCode = 0;
    } else {
      console.error('Warning: Could not extract DOM');
    }
  } catch (err) {
    console.error(`Error: ${err.message}`);
  } finally {
    // Drop only our CDP connection; the browser and its tabs keep running.
    if (browser) {
      try {
        await browser.disconnect();
      } catch (_err) {
        // Best effort: the process is about to exit anyway.
      }
    }
  }

  process.exit(exitCode);
}

main();
|
||||
83
archivebox-ts/extractors/git
Executable file
83
archivebox-ts/extractors/git
Executable file
@@ -0,0 +1,83 @@
|
||||
#!/bin/bash
#
# Git Extractor
# Clones a git repository
#
# Usage: git <url>
# Output: git/ directory in current directory
# Config: All configuration via environment variables
#   GIT_TIMEOUT - Timeout in seconds (default: 300)
#   GIT_DEPTH - Clone depth (default: full clone)
#
# Exit codes: 0 when the repository was cloned or the URL was skipped as
#             "not a git repository"; 1 on any failure.
#

set -e

URL="$1"

if [ -z "$URL" ]; then
    echo "Error: URL argument required" >&2
    exit 1
fi

# Check if URL is a git repository
if [[ ! "$URL" =~ \.git$ ]] && [[ ! "$URL" =~ ^git@ ]] && [[ ! "$URL" =~ ^https?://.*github\.com ]] && [[ ! "$URL" =~ ^https?://.*gitlab\.com ]]; then
    echo "Skipping: URL does not appear to be a git repository" >&2
    exit 0
fi

# Auto-install git if not available
if ! command -v git &> /dev/null; then
    echo "Installing git..." >&2
    if command -v apt-get &> /dev/null; then
        sudo apt-get update && sudo apt-get install -y git
    elif command -v yum &> /dev/null; then
        sudo yum install -y git
    elif command -v brew &> /dev/null; then
        brew install git
    else
        echo "Error: Cannot install git. Please install manually." >&2
        exit 1
    fi
fi

# Configuration from environment
TIMEOUT="${GIT_TIMEOUT:-300}"
DEPTH="${GIT_DEPTH:-}"

echo "Cloning git repository: $URL" >&2

# Create output directory
mkdir -p git

# Build git clone command as an array so each argument stays one word
GIT_CMD=(git clone)

if [ -n "$DEPTH" ]; then
    GIT_CMD+=("--depth=$DEPTH")
fi

# Set timeout and clone.
# The exit status of a pipeline is that of its LAST command, so testing the
# pipeline with `||` would only ever see the output filter's status and a
# failed or timed-out clone would go unnoticed. PIPESTATUS[0] holds the status
# of `timeout` itself and must be read immediately after the pipeline.
# `sed -n '1,20p'` prints the first 20 lines like `head -20` but keeps reading
# to EOF, so the clone is never killed by SIGPIPE once 20 lines were printed.
# NOTE(review): as in the original, the clone log is emitted on stdout; confirm
# whether the caller expects stdout to contain only the output path.
timeout "$TIMEOUT" "${GIT_CMD[@]}" "$URL" git/ 2>&1 | sed -n '1,20p'
EXIT_CODE=${PIPESTATUS[0]}

if [ "$EXIT_CODE" -ne 0 ]; then
    if [ "$EXIT_CODE" -eq 124 ]; then
        echo "Error: Git clone timed out after ${TIMEOUT}s" >&2
    else
        echo "Error: Git clone failed with exit code $EXIT_CODE" >&2
    fi
    exit 1
fi

if [ -d "git/.git" ]; then
    # Get some stats
    COMMIT_COUNT=$(cd git && git rev-list --count HEAD 2>/dev/null || echo "unknown")
    REPO_SIZE=$(du -sh git 2>/dev/null | cut -f1 || echo "unknown")

    echo "✓ Cloned repository" >&2
    echo "  Commits: $COMMIT_COUNT" >&2
    echo "  Size: $REPO_SIZE" >&2
    echo "git"
    exit 0
else
    echo "Error: Failed to clone repository" >&2
    exit 1
fi
|
||||
120
archivebox-ts/extractors/htmltotext
Executable file
120
archivebox-ts/extractors/htmltotext
Executable file
@@ -0,0 +1,120 @@
|
||||
#!/usr/bin/env node
|
||||
//
|
||||
// HTML to Text Extractor
|
||||
// Extracts readable text from a page using Puppeteer
|
||||
//
|
||||
// This extractor reuses an existing Chrome tab if CHROME_CDP_URL and
|
||||
// CHROME_PAGE_TARGET_ID are set in the environment (from puppeteer extractor).
|
||||
//
|
||||
// Usage: htmltotext <url>
|
||||
// Output: output.txt in current directory
|
||||
// Config: All configuration via environment variables
|
||||
// CHROME_CDP_URL - Chrome DevTools Protocol WebSocket URL (from puppeteer extractor)
|
||||
// CHROME_PAGE_TARGET_ID - Target ID of the page to reuse (from puppeteer extractor)
|
||||
// HTMLTOTEXT_TIMEOUT - Timeout in milliseconds (default: 10000)
|
||||
//
|
||||
|
||||
const fs = require('fs');
|
||||
|
||||
// Check if puppeteer is available
|
||||
// Probes for the 'puppeteer-core' module. Returns true if it resolves;
// otherwise logs how to install it on stderr and returns false.
function checkPuppeteer() {
  try {
    require.resolve('puppeteer-core');
  } catch (_resolveError) {
    const messages = [
      'Error: puppeteer-core is not installed.',
      'Please install it with: npm install puppeteer-core'
    ];
    for (const message of messages) {
      console.error(message);
    }
    return false;
  }

  return true;
}
|
||||
|
||||
// Entry point of the HTML-to-text extractor.
//
// Reads the target URL from process.argv[2] and its configuration from the
// environment (CHROME_CDP_URL, CHROME_PAGE_TARGET_ID, HTMLTOTEXT_TIMEOUT),
// extracts the page's text and writes it to output.txt in the current
// directory. Prints 'output.txt' on stdout and exits 0 on success; exits 1 on
// failure.
async function main() {
  const url = process.argv[2];

  if (!url) {
    console.error('Error: URL argument required');
    process.exit(1);
  }

  // Configuration from environment
  const cdpUrl = process.env.CHROME_CDP_URL;
  const pageTargetId = process.env.CHROME_PAGE_TARGET_ID;
  // An unparsable HTMLTOTEXT_TIMEOUT would yield NaN; fall back to the
  // documented default instead of handing NaN to page.goto().
  const parsedTimeout = parseInt(process.env.HTMLTOTEXT_TIMEOUT || '10000', 10);
  const timeout = Number.isNaN(parsedTimeout) || parsedTimeout < 0 ? 10000 : parsedTimeout;

  console.error(`Extracting text from: ${url}`);

  // Check puppeteer is installed
  if (!checkPuppeteer()) {
    process.exit(1);
  }

  if (!cdpUrl) {
    console.error('Error: CHROME_CDP_URL environment variable not set.');
    console.error('The puppeteer extractor should have set this in .env');
    console.error('Make sure puppeteer extractor runs before this one.');
    process.exit(1);
  }

  const puppeteer = require('puppeteer-core');

  let browser = null;
  let page = null;
  let exitCode = 1;

  try {
    // Connect to existing browser
    console.error(`Connecting to browser via CDP: ${cdpUrl}`);
    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl
    });

    // Try to reuse existing page if target ID is available
    if (pageTargetId) {
      console.error(`Attempting to reuse existing page: ${pageTargetId}`);
      const pages = await browser.pages();
      // NOTE(review): _targetId is an underscore-prefixed (non-public) field
      // of the target object; confirm it exists in the installed
      // puppeteer-core version.
      page = pages.find(p => p.target()._targetId === pageTargetId);

      if (page) {
        console.error(`✓ Reusing existing page`);
      } else {
        console.error(`⚠ Could not find existing page, creating new one`);
      }
    } else {
      console.error(`⚠ No page target ID, creating new page`);
    }

    // No reusable tab: open a fresh one and navigate to the URL.
    if (!page) {
      page = await browser.newPage();
      await page.goto(url, { timeout, waitUntil: 'domcontentloaded' });
    }

    // Extract text content from the page WITHOUT modifying it: the tab is
    // shared with the extractors that run afterwards, so removing
    // script/style/noscript nodes from the live document would corrupt their
    // input.
    const text = await page.evaluate(() => {
      const body = document.body;
      if (!body) {
        return '';
      }

      // innerText only returns rendered text, which leaves out the contents
      // of elements that are not displayed (script/style by default).
      if (body.innerText) {
        return body.innerText;
      }

      // Fallback: strip the non-content elements from a detached copy.
      const copy = body.cloneNode(true);
      copy.querySelectorAll('script, style, noscript').forEach(el => el.remove());
      return copy.textContent || '';
    });

    const trimmed = text ? text.trim() : '';

    if (trimmed) {
      // Write to file
      fs.writeFileSync('output.txt', trimmed, 'utf8');
      // Report the size of what was actually written, in bytes.
      console.error(`✓ Extracted text (${Buffer.byteLength(trimmed, 'utf8')} bytes)`);
      console.log('output.txt');

      // Leave page open for next extractor
      console.error(`⚠ Leaving page open for other extractors`);
      exitCode = 0;
    } else {
      console.error('Warning: Could not extract text');
    }
  } catch (err) {
    console.error(`Error: ${err.message}`);
  } finally {
    // Drop only our CDP connection; the browser and its tabs keep running.
    if (browser) {
      try {
        await browser.disconnect();
      } catch (_err) {
        // Best effort: the process is about to exit anyway.
      }
    }
  }

  process.exit(exitCode);
}

main();
|
||||
91
archivebox-ts/extractors/media
Executable file
91
archivebox-ts/extractors/media
Executable file
@@ -0,0 +1,91 @@
|
||||
#!/bin/bash
#
# Media Extractor
# Downloads media (video/audio) using yt-dlp
#
# Usage: media <url>
# Output: media/ directory in current directory
# Config: All configuration via environment variables
#   MEDIA_TIMEOUT - Timeout in seconds (default: 3600)
#   MEDIA_MAX_SIZE - Max file size (default: 750m)
#   MEDIA_FORMAT - Format selection (default: best)
#
# Exit codes: 0 whether or not media was found (a URL without downloadable
#             media is not an error); 1 only for a missing URL argument or
#             when yt-dlp cannot be installed.
#

set -e

URL="$1"

if [ -z "$URL" ]; then
    echo "Error: URL argument required" >&2
    exit 1
fi

# Auto-install yt-dlp if not available
if ! command -v yt-dlp &> /dev/null; then
    echo "Installing yt-dlp..." >&2
    if command -v apt-get &> /dev/null; then
        sudo apt-get update && sudo apt-get install -y yt-dlp || {
            echo "Installing yt-dlp via pip..." >&2
            pip3 install -U yt-dlp || pip install -U yt-dlp
        }
    elif command -v brew &> /dev/null; then
        brew install yt-dlp
    else
        echo "Installing yt-dlp via pip..." >&2
        pip3 install -U yt-dlp 2>/dev/null || pip install -U yt-dlp 2>/dev/null || {
            echo "Error: Cannot install yt-dlp. Please install manually." >&2
            exit 1
        }
    fi
fi

# Configuration from environment
TIMEOUT="${MEDIA_TIMEOUT:-3600}"
MAX_SIZE="${MEDIA_MAX_SIZE:-750m}"
FORMAT="${MEDIA_FORMAT:-best}"

echo "Downloading media from: $URL" >&2

# Create output directory
mkdir -p media

# Run yt-dlp with timeout.
# The exit status of a pipeline is that of its LAST command, so an `|| { ... }`
# handler attached to it never sees yt-dlp's or timeout's status. Read
# PIPESTATUS[0] right after the pipeline instead.
# `sed -n '1,50p'` prints the first 50 lines like `head -50` but keeps reading
# to EOF; `head` would close the pipe after 50 lines and the resulting SIGPIPE
# could terminate yt-dlp in the middle of a download.
timeout "$TIMEOUT" yt-dlp \
    --output "media/%(title)s.%(ext)s" \
    --format "$FORMAT" \
    --max-filesize "$MAX_SIZE" \
    --write-info-json \
    --write-description \
    --write-thumbnail \
    --write-annotations \
    --all-subs \
    --embed-subs \
    --embed-thumbnail \
    --embed-metadata \
    --add-metadata \
    --no-overwrites \
    --continue \
    --ignore-errors \
    --no-warnings \
    "$URL" 2>&1 | grep -v "^$" | sed -n '1,50p'
EXIT_CODE=${PIPESTATUS[0]}

# A failed download is deliberately not fatal; only the timeout is reported.
if [ "$EXIT_CODE" -eq 124 ]; then
    echo "Warning: Download timed out after ${TIMEOUT}s" >&2
fi

# Check if any files were downloaded
if [ -n "$(ls -A media 2>/dev/null)" ]; then
    FILE_COUNT=$(ls -1 media | wc -l)
    TOTAL_SIZE=$(du -sh media 2>/dev/null | cut -f1 || echo "unknown")

    echo "✓ Downloaded media" >&2
    echo "  Files: $FILE_COUNT" >&2
    echo "  Total size: $TOTAL_SIZE" >&2
    echo "media"
    exit 0
else
    echo "Warning: No media files downloaded (URL may not contain video/audio)" >&2
    # Don't fail - URL might just not have downloadable media
    exit 0
fi
|
||||
118
archivebox-ts/extractors/pdf
Executable file
118
archivebox-ts/extractors/pdf
Executable file
@@ -0,0 +1,118 @@
|
||||
#!/usr/bin/env node
|
||||
//
|
||||
// PDF Extractor
|
||||
// Generates a PDF of a given URL using Puppeteer
|
||||
//
|
||||
// This extractor reuses an existing Chrome tab if CHROME_CDP_URL and
|
||||
// CHROME_PAGE_TARGET_ID are set in the environment (from puppeteer extractor).
|
||||
//
|
||||
// Usage: pdf <url>
|
||||
// Output: output.pdf in current directory
|
||||
// Config: All configuration via environment variables
|
||||
// CHROME_CDP_URL - Chrome DevTools Protocol WebSocket URL (from puppeteer extractor)
|
||||
// CHROME_PAGE_TARGET_ID - Target ID of the page to reuse (from puppeteer extractor)
|
||||
// PDF_TIMEOUT - Timeout in milliseconds (default: 30000)
|
||||
// PDF_FORMAT - Page format: Letter, A4, etc. (default: A4)
|
||||
//
|
||||
|
||||
const fs = require('fs');
|
||||
|
||||
// Check if puppeteer is available
|
||||
// Verifies that 'puppeteer-core' is resolvable. On failure, writes an error
// and an install hint to stderr. Returns a boolean indicating availability.
function checkPuppeteer() {
  let resolved;

  try {
    require.resolve('puppeteer-core');
    resolved = true;
  } catch (_ignored) {
    resolved = false;
  }

  if (!resolved) {
    console.error('Error: puppeteer-core is not installed.');
    console.error('Please install it with: npm install puppeteer-core');
  }

  return resolved;
}
|
||||
|
||||
// Entry point of the PDF extractor.
//
// Reads the target URL from process.argv[2] and its configuration from the
// environment (CHROME_CDP_URL, CHROME_PAGE_TARGET_ID, PDF_TIMEOUT, PDF_FORMAT)
// and renders the page to output.pdf in the current directory.
// Prints 'output.pdf' on stdout and exits 0 on success; exits 1 on failure.
async function main() {
  const url = process.argv[2];

  if (!url) {
    console.error('Error: URL argument required');
    process.exit(1);
  }

  // Configuration from environment
  const cdpUrl = process.env.CHROME_CDP_URL;
  const pageTargetId = process.env.CHROME_PAGE_TARGET_ID;
  // An unparsable PDF_TIMEOUT would yield NaN; fall back to the documented
  // default instead of handing NaN to Puppeteer.
  const parsedTimeout = parseInt(process.env.PDF_TIMEOUT || '30000', 10);
  const timeout = Number.isNaN(parsedTimeout) || parsedTimeout < 0 ? 30000 : parsedTimeout;
  const format = process.env.PDF_FORMAT || 'A4';

  console.error(`Generating PDF from: ${url}`);

  // Check puppeteer is installed
  if (!checkPuppeteer()) {
    process.exit(1);
  }

  if (!cdpUrl) {
    console.error('Error: CHROME_CDP_URL environment variable not set.');
    console.error('The puppeteer extractor should have set this in .env');
    console.error('Make sure puppeteer extractor runs before this one.');
    process.exit(1);
  }

  const puppeteer = require('puppeteer-core');

  let browser = null;
  let page = null;
  let exitCode = 1;

  try {
    // Connect to existing browser
    console.error(`Connecting to browser via CDP: ${cdpUrl}`);
    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl
    });

    // Try to reuse existing page if target ID is available
    if (pageTargetId) {
      console.error(`Attempting to reuse existing page: ${pageTargetId}`);
      const pages = await browser.pages();
      // NOTE(review): _targetId is an underscore-prefixed (non-public) field
      // of the target object; confirm it exists in the installed
      // puppeteer-core version.
      page = pages.find(p => p.target()._targetId === pageTargetId);

      if (page) {
        console.error(`✓ Reusing existing page`);
      } else {
        console.error(`⚠ Could not find existing page, creating new one`);
      }
    } else {
      console.error(`⚠ No page target ID, creating new page`);
    }

    // No reusable tab: open a fresh one and wait for the network to settle.
    if (!page) {
      page = await browser.newPage();
      await page.goto(url, { timeout, waitUntil: 'networkidle2' });
    }

    // Generate PDF. PDF_TIMEOUT is also applied to the rendering step so the
    // configured limit covers PDF generation, not only navigation.
    await page.pdf({
      path: 'output.pdf',
      format: format,
      printBackground: true,
      timeout: timeout,
      margin: {
        top: '20px',
        right: '20px',
        bottom: '20px',
        left: '20px'
      }
    });

    // Confirm that a non-empty file was actually produced before reporting success.
    const pdfSize = fs.existsSync('output.pdf') ? fs.statSync('output.pdf').size : 0;

    if (pdfSize > 0) {
      console.error('✓ Generated PDF: output.pdf');
      console.log('output.pdf');

      // Leave page open for next extractor
      console.error(`⚠ Leaving page open for other extractors`);
      exitCode = 0;
    } else {
      console.error('Error: output.pdf was not created or is empty');
    }
  } catch (err) {
    console.error(`Error: ${err.message}`);
  } finally {
    // Drop only our CDP connection; the browser and its tabs keep running.
    if (browser) {
      try {
        await browser.disconnect();
      } catch (_err) {
        // Best effort: the process is about to exit anyway.
      }
    }
  }

  process.exit(exitCode);
}

main();
|
||||
153
archivebox-ts/extractors/readability
Executable file
153
archivebox-ts/extractors/readability
Executable file
@@ -0,0 +1,153 @@
|
||||
#!/usr/bin/env node
|
||||
//
|
||||
// Readability Extractor
|
||||
// Extracts article content using Mozilla's Readability algorithm
|
||||
//
|
||||
// This extractor reuses an existing Chrome tab to get the DOM, then
|
||||
// applies Readability to extract the main content.
|
||||
//
|
||||
// Usage: readability <url>
|
||||
// Output: readability.html and readability.json in current directory
|
||||
// Config: All configuration via environment variables
|
||||
// CHROME_CDP_URL - Chrome DevTools Protocol WebSocket URL (from puppeteer extractor)
|
||||
// CHROME_PAGE_TARGET_ID - Target ID of the page to reuse (from puppeteer extractor)
|
||||
// READABILITY_TIMEOUT - Timeout in milliseconds (default: 10000)
|
||||
//
|
||||
|
||||
const fs = require('fs');
|
||||
const { JSDOM } = require('jsdom');
|
||||
const { Readability } = require('@mozilla/readability');
|
||||
|
||||
// Check if dependencies are available
|
||||
// Returns true when every module this extractor needs can be resolved.
// If any of them is missing, prints an install hint to stderr and returns false.
function checkDependencies() {
  const requiredModules = ['puppeteer-core', 'jsdom', '@mozilla/readability'];

  try {
    // Resolve in order; the first missing module throws.
    for (const moduleName of requiredModules) {
      require.resolve(moduleName);
    }
  } catch (_err) {
    console.error('Error: Missing dependencies.');
    console.error('Please install: npm install puppeteer-core jsdom @mozilla/readability');
    return false;
  }

  return true;
}
|
||||
|
||||
// Escapes the characters that are significant in HTML so plain text can be
// embedded safely in element content.
function escapeHtml(value) {
  return String(value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}

// Entry point of the Readability extractor.
//
// Reads the target URL from process.argv[2] and its configuration from the
// environment (CHROME_CDP_URL, CHROME_PAGE_TARGET_ID, READABILITY_TIMEOUT),
// runs Readability over the page's HTML and writes readability.html and
// readability.json to the current directory.
// Prints 'readability.html' on stdout and exits 0 on success; exits 1 on failure.
async function main() {
  const url = process.argv[2];

  if (!url) {
    console.error('Error: URL argument required');
    process.exit(1);
  }

  // Check dependencies
  if (!checkDependencies()) {
    process.exit(1);
  }

  const puppeteer = require('puppeteer-core');

  // Configuration from environment
  const cdpUrl = process.env.CHROME_CDP_URL;
  const pageTargetId = process.env.CHROME_PAGE_TARGET_ID;
  // An unparsable READABILITY_TIMEOUT would yield NaN; fall back to the
  // documented default instead of handing NaN to page.goto().
  const parsedTimeout = parseInt(process.env.READABILITY_TIMEOUT || '10000', 10);
  const timeout = Number.isNaN(parsedTimeout) || parsedTimeout < 0 ? 10000 : parsedTimeout;

  console.error(`Extracting readable content from: ${url}`);

  if (!cdpUrl) {
    console.error('Error: CHROME_CDP_URL environment variable not set.');
    console.error('The puppeteer extractor should have set this in .env');
    console.error('Make sure puppeteer extractor runs before this one.');
    process.exit(1);
  }

  let browser = null;
  let page = null;
  let exitCode = 1;

  try {
    // Connect to existing browser
    console.error(`Connecting to browser via CDP: ${cdpUrl}`);
    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl
    });

    // Try to reuse existing page if target ID is available
    if (pageTargetId) {
      console.error(`Attempting to reuse existing page: ${pageTargetId}`);
      const pages = await browser.pages();
      // NOTE(review): _targetId is an underscore-prefixed (non-public) field
      // of the target object; confirm it exists in the installed
      // puppeteer-core version.
      page = pages.find(p => p.target()._targetId === pageTargetId);

      if (page) {
        console.error(`✓ Reusing existing page`);
      } else {
        console.error(`⚠ Could not find existing page, creating new one`);
      }
    } else {
      console.error(`⚠ No page target ID, creating new page`);
    }

    // No reusable tab: open a fresh one and navigate to the URL.
    if (!page) {
      page = await browser.newPage();
      await page.goto(url, { timeout, waitUntil: 'domcontentloaded' });
    }

    // Get the HTML
    const html = await page.content();

    // Parse with JSDOM
    const dom = new JSDOM(html, { url });
    const reader = new Readability(dom.window.document);
    const article = reader.parse();

    if (article) {
      // Title and byline are inserted into markup below, so they are escaped
      // first. article.content is inserted as-is because it is the HTML
      // produced by Readability.
      const safeTitle = article.title ? escapeHtml(article.title) : '';
      const bylineHtml = article.byline
        ? `<p class="byline">${escapeHtml(article.byline)}</p>`
        : '';

      // Write HTML content
      const htmlContent = `<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
  <title>${safeTitle || 'Article'}</title>
</head>
<body>
  <article>
    <h1>${safeTitle}</h1>
    ${bylineHtml}
    <div class="content">
      ${article.content}
    </div>
  </article>
</body>
</html>`;

      fs.writeFileSync('readability.html', htmlContent, 'utf8');

      // Keep only a preview of the text; mark it with an ellipsis only when
      // something was actually cut off.
      const fullText = article.textContent || '';
      const previewText = fullText.length > 1000
        ? fullText.substring(0, 1000) + '...'
        : fullText;

      // Write JSON metadata
      const metadata = {
        title: article.title,
        byline: article.byline,
        excerpt: article.excerpt,
        siteName: article.siteName,
        length: article.length,
        textContent: previewText // First 1000 chars
      };

      fs.writeFileSync('readability.json', JSON.stringify(metadata, null, 2), 'utf8');

      console.error(`✓ Extracted article: ${article.title || '(untitled)'}`);
      console.error(`  Length: ${article.length} characters`);
      console.log('readability.html');

      // Leave page open for next extractor
      console.error(`⚠ Leaving page open for other extractors`);
      exitCode = 0;
    } else {
      console.error('Warning: Could not extract readable content');
    }
  } catch (err) {
    console.error(`Error: ${err.message}`);
  } finally {
    // Drop only our CDP connection; the browser and its tabs keep running.
    if (browser) {
      try {
        await browser.disconnect();
      } catch (_err) {
        // Best effort: the process is about to exit anyway.
      }
    }
  }

  process.exit(exitCode);
}

main();
|
||||
62
archivebox-ts/extractors/singlefile
Executable file
62
archivebox-ts/extractors/singlefile
Executable file
@@ -0,0 +1,62 @@
|
||||
#!/bin/bash
#
# SingleFile Extractor
# Creates a single HTML file archive using single-file-cli
#
# Usage: singlefile <url>
# Output: singlefile.html in current directory
#         (stdout carries only the output filename; all logs go to stderr)
# Config: All configuration via environment variables
#   SINGLEFILE_TIMEOUT - Timeout in seconds (default: 60); enforced with
#                        timeout(1) when that command is available
#   CHROME_CDP_URL - Optional: Chrome CDP URL to use existing browser
#
# Exit codes: 0 = singlefile.html exists and is non-empty, 1 = any failure
#

set -e

URL="$1"
OUTPUT_FILE="singlefile.html"

if [ -z "$URL" ]; then
    echo "Error: URL argument required" >&2
    exit 1
fi

# Auto-install single-file-cli if not available
if ! command -v single-file &> /dev/null; then
    echo "Installing single-file-cli..." >&2
    # Installer chatter goes to stderr so stdout only ever carries the result filename.
    npm install -g single-file-cli 2>&1 | grep -v "^npm WARN" >&2 || true

    # The install is best-effort (|| true above), so confirm it actually worked
    # instead of failing later with a generic message.
    if ! command -v single-file &> /dev/null; then
        echo "Error: single-file-cli is not available and could not be installed" >&2
        exit 1
    fi
fi

# Configuration from environment
TIMEOUT="${SINGLEFILE_TIMEOUT:-60}"
case "$TIMEOUT" in
    ''|*[!0-9]*)
        echo "Warning: invalid SINGLEFILE_TIMEOUT '$TIMEOUT', using 60" >&2
        TIMEOUT=60
        ;;
esac

# Run single-file with the given arguments, bounded by $TIMEOUT seconds when
# timeout(1) exists on this system; otherwise run it unbounded.
run_single_file() {
    if command -v timeout &> /dev/null; then
        timeout "$TIMEOUT" single-file "$@"
    else
        single-file "$@"
    fi
}

echo "Creating single-file archive of: $URL" >&2

# Extra arguments used only when attaching to an already-running Chrome
BROWSER_ARGS=()

# Check if we have a Chrome CDP URL to use
if [ -n "$CHROME_CDP_URL" ]; then
    # Extract the debugging port from CDP URL
    # ws://127.0.0.1:12345/devtools/browser/... -> 12345
    # (-n ... p prints nothing when the URL has no port, instead of echoing the URL back)
    PORT=$(echo "$CHROME_CDP_URL" | sed -nE 's|.*:([0-9]+)/.*|\1|p')

    if [ -n "$PORT" ]; then
        echo "Using existing Chrome instance via CDP" >&2
        BROWSER_ARGS=(
            --browser-args="--remote-debugging-port=$PORT"
            --browser-server="ws://localhost:$PORT"
        )
    else
        echo "Warning: could not parse a port from CHROME_CDP_URL" >&2
    fi
fi

if [ "${#BROWSER_ARGS[@]}" -eq 0 ]; then
    echo "Launching new browser instance" >&2
fi

# NOTE(review): --dump-content asks single-file to print the page to the console;
# confirm the installed single-file-cli version still writes $OUTPUT_FILE when an
# output path is also given.
# Failures are tolerated here (|| true): success is decided by the file check below.
run_single_file \
    "${BROWSER_ARGS[@]}" \
    --dump-content \
    "$URL" \
    "$OUTPUT_FILE" 2>&1 | grep -v "^$" >&2 || true

if [ -f "$OUTPUT_FILE" ] && [ -s "$OUTPUT_FILE" ]; then
    # wc -c behaves the same on BSD/macOS and GNU/Linux, unlike stat's format flags
    SIZE=$(wc -c < "$OUTPUT_FILE" | tr -d '[:space:]')
    echo "✓ Created single-file archive ($SIZE bytes)" >&2
    echo "$OUTPUT_FILE"
    exit 0
else
    echo "Error: Failed to create single-file archive" >&2
    exit 1
fi
|
||||
513
archivebox-ts/package-lock.json
generated
513
archivebox-ts/package-lock.json
generated
@@ -9,8 +9,10 @@
|
||||
"version": "0.1.0",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@mozilla/readability": "^0.6.0",
|
||||
"better-sqlite3": "^11.0.0",
|
||||
"commander": "^12.0.0",
|
||||
"jsdom": "^27.1.0",
|
||||
"nanoid": "^3.3.7",
|
||||
"puppeteer": "^24.28.0",
|
||||
"puppeteer-core": "^24.28.0"
|
||||
@@ -24,6 +26,62 @@
|
||||
"typescript": "^5.3.3"
|
||||
}
|
||||
},
|
||||
"node_modules/@acemir/cssom": {
|
||||
"version": "0.9.19",
|
||||
"resolved": "https://registry.npmjs.org/@acemir/cssom/-/cssom-0.9.19.tgz",
|
||||
"integrity": "sha512-Pp2gAQXPZ2o7lt4j0IMwNRXqQ3pagxtDj5wctL5U2Lz4oV0ocDNlkgx4DpxfyKav4S/bePuI+SMqcBSUHLy9kg==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@asamuzakjp/css-color": {
|
||||
"version": "4.0.5",
|
||||
"resolved": "https://registry.npmjs.org/@asamuzakjp/css-color/-/css-color-4.0.5.tgz",
|
||||
"integrity": "sha512-lMrXidNhPGsDjytDy11Vwlb6OIGrT3CmLg3VWNFyWkLWtijKl7xjvForlh8vuj0SHGjgl4qZEQzUmYTeQA2JFQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@csstools/css-calc": "^2.1.4",
|
||||
"@csstools/css-color-parser": "^3.1.0",
|
||||
"@csstools/css-parser-algorithms": "^3.0.5",
|
||||
"@csstools/css-tokenizer": "^3.0.4",
|
||||
"lru-cache": "^11.2.1"
|
||||
}
|
||||
},
|
||||
"node_modules/@asamuzakjp/css-color/node_modules/lru-cache": {
|
||||
"version": "11.2.2",
|
||||
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-11.2.2.tgz",
|
||||
"integrity": "sha512-F9ODfyqML2coTIsQpSkRHnLSZMtkU8Q+mSfcaIyKwy58u+8k5nvAYeiNhsyMARvzNcXJ9QfWVrcPsC9e9rAxtg==",
|
||||
"license": "ISC",
|
||||
"engines": {
|
||||
"node": "20 || >=22"
|
||||
}
|
||||
},
|
||||
"node_modules/@asamuzakjp/dom-selector": {
|
||||
"version": "6.7.4",
|
||||
"resolved": "https://registry.npmjs.org/@asamuzakjp/dom-selector/-/dom-selector-6.7.4.tgz",
|
||||
"integrity": "sha512-buQDjkm+wDPXd6c13534URWZqbz0RP5PAhXZ+LIoa5LgwInT9HVJvGIJivg75vi8I13CxDGdTnz+aY5YUJlIAA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@asamuzakjp/nwsapi": "^2.3.9",
|
||||
"bidi-js": "^1.0.3",
|
||||
"css-tree": "^3.1.0",
|
||||
"is-potential-custom-element-name": "^1.0.1",
|
||||
"lru-cache": "^11.2.2"
|
||||
}
|
||||
},
|
||||
"node_modules/@asamuzakjp/dom-selector/node_modules/lru-cache": {
|
||||
"version": "11.2.2",
|
||||
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-11.2.2.tgz",
|
||||
"integrity": "sha512-F9ODfyqML2coTIsQpSkRHnLSZMtkU8Q+mSfcaIyKwy58u+8k5nvAYeiNhsyMARvzNcXJ9QfWVrcPsC9e9rAxtg==",
|
||||
"license": "ISC",
|
||||
"engines": {
|
||||
"node": "20 || >=22"
|
||||
}
|
||||
},
|
||||
"node_modules/@asamuzakjp/nwsapi": {
|
||||
"version": "2.3.9",
|
||||
"resolved": "https://registry.npmjs.org/@asamuzakjp/nwsapi/-/nwsapi-2.3.9.tgz",
|
||||
"integrity": "sha512-n8GuYSrI9bF7FFZ/SjhwevlHc8xaVlb/7HmHelnc/PZXBD2ZR49NnN9sMMuDdEGPeeRQ5d0hqlSlEpgCX3Wl0Q==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@babel/code-frame": {
|
||||
"version": "7.27.1",
|
||||
"resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.27.1.tgz",
|
||||
@@ -47,6 +105,144 @@
|
||||
"node": ">=6.9.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@csstools/color-helpers": {
|
||||
"version": "5.1.0",
|
||||
"resolved": "https://registry.npmjs.org/@csstools/color-helpers/-/color-helpers-5.1.0.tgz",
|
||||
"integrity": "sha512-S11EXWJyy0Mz5SYvRmY8nJYTFFd1LCNV+7cXyAgQtOOuzb4EsgfqDufL+9esx72/eLhsRdGZwaldu/h+E4t4BA==",
|
||||
"funding": [
|
||||
{
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/csstools"
|
||||
},
|
||||
{
|
||||
"type": "opencollective",
|
||||
"url": "https://opencollective.com/csstools"
|
||||
}
|
||||
],
|
||||
"license": "MIT-0",
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/@csstools/css-calc": {
|
||||
"version": "2.1.4",
|
||||
"resolved": "https://registry.npmjs.org/@csstools/css-calc/-/css-calc-2.1.4.tgz",
|
||||
"integrity": "sha512-3N8oaj+0juUw/1H3YwmDDJXCgTB1gKU6Hc/bB502u9zR0q2vd786XJH9QfrKIEgFlZmhZiq6epXl4rHqhzsIgQ==",
|
||||
"funding": [
|
||||
{
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/csstools"
|
||||
},
|
||||
{
|
||||
"type": "opencollective",
|
||||
"url": "https://opencollective.com/csstools"
|
||||
}
|
||||
],
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@csstools/css-parser-algorithms": "^3.0.5",
|
||||
"@csstools/css-tokenizer": "^3.0.4"
|
||||
}
|
||||
},
|
||||
"node_modules/@csstools/css-color-parser": {
|
||||
"version": "3.1.0",
|
||||
"resolved": "https://registry.npmjs.org/@csstools/css-color-parser/-/css-color-parser-3.1.0.tgz",
|
||||
"integrity": "sha512-nbtKwh3a6xNVIp/VRuXV64yTKnb1IjTAEEh3irzS+HkKjAOYLTGNb9pmVNntZ8iVBHcWDA2Dof0QtPgFI1BaTA==",
|
||||
"funding": [
|
||||
{
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/csstools"
|
||||
},
|
||||
{
|
||||
"type": "opencollective",
|
||||
"url": "https://opencollective.com/csstools"
|
||||
}
|
||||
],
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@csstools/color-helpers": "^5.1.0",
|
||||
"@csstools/css-calc": "^2.1.4"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@csstools/css-parser-algorithms": "^3.0.5",
|
||||
"@csstools/css-tokenizer": "^3.0.4"
|
||||
}
|
||||
},
|
||||
"node_modules/@csstools/css-parser-algorithms": {
|
||||
"version": "3.0.5",
|
||||
"resolved": "https://registry.npmjs.org/@csstools/css-parser-algorithms/-/css-parser-algorithms-3.0.5.tgz",
|
||||
"integrity": "sha512-DaDeUkXZKjdGhgYaHNJTV9pV7Y9B3b644jCLs9Upc3VeNGg6LWARAT6O+Q+/COo+2gg/bM5rhpMAtf70WqfBdQ==",
|
||||
"funding": [
|
||||
{
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/csstools"
|
||||
},
|
||||
{
|
||||
"type": "opencollective",
|
||||
"url": "https://opencollective.com/csstools"
|
||||
}
|
||||
],
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"@csstools/css-tokenizer": "^3.0.4"
|
||||
}
|
||||
},
|
||||
"node_modules/@csstools/css-syntax-patches-for-csstree": {
|
||||
"version": "1.0.15",
|
||||
"resolved": "https://registry.npmjs.org/@csstools/css-syntax-patches-for-csstree/-/css-syntax-patches-for-csstree-1.0.15.tgz",
|
||||
"integrity": "sha512-q0p6zkVq2lJnmzZVPR33doA51G7YOja+FBvRdp5ISIthL0MtFCgYHHhR563z9WFGxcOn0WfjSkPDJ5Qig3H3Sw==",
|
||||
"funding": [
|
||||
{
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/csstools"
|
||||
},
|
||||
{
|
||||
"type": "opencollective",
|
||||
"url": "https://opencollective.com/csstools"
|
||||
}
|
||||
],
|
||||
"license": "MIT-0",
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/@csstools/css-tokenizer": {
|
||||
"version": "3.0.4",
|
||||
"resolved": "https://registry.npmjs.org/@csstools/css-tokenizer/-/css-tokenizer-3.0.4.tgz",
|
||||
"integrity": "sha512-Vd/9EVDiu6PPJt9yAh6roZP6El1xHrdvIVGjyBsHR0RYwNHgL7FJPyIIW4fANJNG6FtyZfvlRPpFI4ZM/lubvw==",
|
||||
"funding": [
|
||||
{
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/csstools"
|
||||
},
|
||||
{
|
||||
"type": "opencollective",
|
||||
"url": "https://opencollective.com/csstools"
|
||||
}
|
||||
],
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/@mozilla/readability": {
|
||||
"version": "0.6.0",
|
||||
"resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.6.0.tgz",
|
||||
"integrity": "sha512-juG5VWh4qAivzTAeMzvY9xs9HY5rAcr2E4I7tiSSCokRFi7XIZCAu92ZkSTsIj1OPceCifL3cpfteP3pDT9/QQ==",
|
||||
"license": "Apache-2.0",
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@puppeteer/browsers": {
|
||||
"version": "2.10.13",
|
||||
"resolved": "https://registry.npmjs.org/@puppeteer/browsers/-/browsers-2.10.13.tgz",
|
||||
@@ -325,6 +521,15 @@
|
||||
"prebuild-install": "^7.1.1"
|
||||
}
|
||||
},
|
||||
"node_modules/bidi-js": {
|
||||
"version": "1.0.3",
|
||||
"resolved": "https://registry.npmjs.org/bidi-js/-/bidi-js-1.0.3.tgz",
|
||||
"integrity": "sha512-RKshQI1R3YQ+n9YJz2QQ147P66ELpa1FQEg20Dk8oW9t2KgLbpDLLp9aGZ7y8WHSshDknG0bknqGw5/tyCs5tw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"require-from-string": "^2.0.2"
|
||||
}
|
||||
},
|
||||
"node_modules/bindings": {
|
||||
"version": "1.5.0",
|
||||
"resolved": "https://registry.npmjs.org/bindings/-/bindings-1.5.0.tgz",
|
||||
@@ -473,6 +678,33 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/css-tree": {
|
||||
"version": "3.1.0",
|
||||
"resolved": "https://registry.npmjs.org/css-tree/-/css-tree-3.1.0.tgz",
|
||||
"integrity": "sha512-0eW44TGN5SQXU1mWSkKwFstI/22X2bG1nYzZTYMAWjylYURhse752YgbE4Cx46AC+bAvI+/dYTPRk1LqSUnu6w==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"mdn-data": "2.12.2",
|
||||
"source-map-js": "^1.0.1"
|
||||
},
|
||||
"engines": {
|
||||
"node": "^10 || ^12.20.0 || ^14.13.0 || >=15.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/cssstyle": {
|
||||
"version": "5.3.2",
|
||||
"resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-5.3.2.tgz",
|
||||
"integrity": "sha512-zDMqXh8Vs1CdRYZQ2M633m/SFgcjlu8RB8b/1h82i+6vpArF507NSYIWJHGlJaTWoS+imcnctmEz43txhbVkOw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@asamuzakjp/css-color": "^4.0.3",
|
||||
"@csstools/css-syntax-patches-for-csstree": "^1.0.14",
|
||||
"css-tree": "^3.1.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=20"
|
||||
}
|
||||
},
|
||||
"node_modules/data-uri-to-buffer": {
|
||||
"version": "6.0.2",
|
||||
"resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz",
|
||||
@@ -482,6 +714,19 @@
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/data-urls": {
|
||||
"version": "6.0.0",
|
||||
"resolved": "https://registry.npmjs.org/data-urls/-/data-urls-6.0.0.tgz",
|
||||
"integrity": "sha512-BnBS08aLUM+DKamupXs3w2tJJoqU+AkaE/+6vQxi/G/DPmIZFJJp9Dkb1kM03AZx8ADehDUZgsNxju3mPXZYIA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"whatwg-mimetype": "^4.0.0",
|
||||
"whatwg-url": "^15.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=20"
|
||||
}
|
||||
},
|
||||
"node_modules/debug": {
|
||||
"version": "4.4.3",
|
||||
"resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz",
|
||||
@@ -499,6 +744,12 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/decimal.js": {
|
||||
"version": "10.6.0",
|
||||
"resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.6.0.tgz",
|
||||
"integrity": "sha512-YpgQiITW3JXGntzdUmyUR1V812Hn8T1YVXhCu+wO3OpS4eU9l4YdD3qjyiKdV6mvV29zapkMeD390UVEf2lkUg==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/decompress-response": {
|
||||
"version": "6.0.0",
|
||||
"resolved": "https://registry.npmjs.org/decompress-response/-/decompress-response-6.0.0.tgz",
|
||||
@@ -567,6 +818,18 @@
|
||||
"once": "^1.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/entities": {
|
||||
"version": "6.0.1",
|
||||
"resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz",
|
||||
"integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==",
|
||||
"license": "BSD-2-Clause",
|
||||
"engines": {
|
||||
"node": ">=0.12"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/fb55/entities?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/env-paths": {
|
||||
"version": "2.2.1",
|
||||
"resolved": "https://registry.npmjs.org/env-paths/-/env-paths-2.2.1.tgz",
|
||||
@@ -755,6 +1018,18 @@
|
||||
"integrity": "sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/html-encoding-sniffer": {
|
||||
"version": "4.0.0",
|
||||
"resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-4.0.0.tgz",
|
||||
"integrity": "sha512-Y22oTqIU4uuPgEemfz7NDJz6OeKf12Lsu+QC+s3BVpda64lTiMYCyGwg5ki4vFxkMwQdeZDl2adZoqUgdFuTgQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"whatwg-encoding": "^3.1.1"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/http-proxy-agent": {
|
||||
"version": "7.0.2",
|
||||
"resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz",
|
||||
@@ -781,6 +1056,18 @@
|
||||
"node": ">= 14"
|
||||
}
|
||||
},
|
||||
"node_modules/iconv-lite": {
|
||||
"version": "0.6.3",
|
||||
"resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz",
|
||||
"integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"safer-buffer": ">= 2.1.2 < 3.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/ieee754": {
|
||||
"version": "1.2.1",
|
||||
"resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz",
|
||||
@@ -853,6 +1140,12 @@
|
||||
"node": ">=8"
|
||||
}
|
||||
},
|
||||
"node_modules/is-potential-custom-element-name": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz",
|
||||
"integrity": "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/js-tokens": {
|
||||
"version": "4.0.0",
|
||||
"resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz",
|
||||
@@ -871,6 +1164,45 @@
|
||||
"js-yaml": "bin/js-yaml.js"
|
||||
}
|
||||
},
|
||||
"node_modules/jsdom": {
|
||||
"version": "27.1.0",
|
||||
"resolved": "https://registry.npmjs.org/jsdom/-/jsdom-27.1.0.tgz",
|
||||
"integrity": "sha512-Pcfm3eZ+eO4JdZCXthW9tCDT3nF4K+9dmeZ+5X39n+Kqz0DDIABRP5CAEOHRFZk8RGuC2efksTJxrjp8EXCunQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@acemir/cssom": "^0.9.19",
|
||||
"@asamuzakjp/dom-selector": "^6.7.3",
|
||||
"cssstyle": "^5.3.2",
|
||||
"data-urls": "^6.0.0",
|
||||
"decimal.js": "^10.6.0",
|
||||
"html-encoding-sniffer": "^4.0.0",
|
||||
"http-proxy-agent": "^7.0.2",
|
||||
"https-proxy-agent": "^7.0.6",
|
||||
"is-potential-custom-element-name": "^1.0.1",
|
||||
"parse5": "^8.0.0",
|
||||
"saxes": "^6.0.0",
|
||||
"symbol-tree": "^3.2.4",
|
||||
"tough-cookie": "^6.0.0",
|
||||
"w3c-xmlserializer": "^5.0.0",
|
||||
"webidl-conversions": "^8.0.0",
|
||||
"whatwg-encoding": "^3.1.1",
|
||||
"whatwg-mimetype": "^4.0.0",
|
||||
"whatwg-url": "^15.1.0",
|
||||
"ws": "^8.18.3",
|
||||
"xml-name-validator": "^5.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": "^20.19.0 || ^22.12.0 || >=24.0.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"canvas": "^3.0.0"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"canvas": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/json-parse-even-better-errors": {
|
||||
"version": "2.3.1",
|
||||
"resolved": "https://registry.npmjs.org/json-parse-even-better-errors/-/json-parse-even-better-errors-2.3.1.tgz",
|
||||
@@ -892,6 +1224,12 @@
|
||||
"node": ">=12"
|
||||
}
|
||||
},
|
||||
"node_modules/mdn-data": {
|
||||
"version": "2.12.2",
|
||||
"resolved": "https://registry.npmjs.org/mdn-data/-/mdn-data-2.12.2.tgz",
|
||||
"integrity": "sha512-IEn+pegP1aManZuckezWCO+XZQDplx1366JoVhTpMpBB1sPey/SbveZQUosKiKiGYjg1wH4pMlNgXbCiYgihQA==",
|
||||
"license": "CC0-1.0"
|
||||
},
|
||||
"node_modules/mimic-response": {
|
||||
"version": "3.1.0",
|
||||
"resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz",
|
||||
@@ -1047,6 +1385,18 @@
|
||||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/parse5": {
|
||||
"version": "8.0.0",
|
||||
"resolved": "https://registry.npmjs.org/parse5/-/parse5-8.0.0.tgz",
|
||||
"integrity": "sha512-9m4m5GSgXjL4AjumKzq1Fgfp3Z8rsvjRNbnkVwfu2ImRqE5D0LnY2QfDen18FSY9C573YU5XxSapdHZTZ2WolA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"entities": "^6.0.0"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/inikulin/parse5?sponsor=1"
|
||||
}
|
||||
},
|
||||
"node_modules/pend": {
|
||||
"version": "1.2.0",
|
||||
"resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz",
|
||||
@@ -1129,6 +1479,15 @@
|
||||
"once": "^1.3.1"
|
||||
}
|
||||
},
|
||||
"node_modules/punycode": {
|
||||
"version": "2.3.1",
|
||||
"resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz",
|
||||
"integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=6"
|
||||
}
|
||||
},
|
||||
"node_modules/puppeteer": {
|
||||
"version": "24.28.0",
|
||||
"resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-24.28.0.tgz",
|
||||
@@ -1206,6 +1565,15 @@
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/require-from-string": {
|
||||
"version": "2.0.2",
|
||||
"resolved": "https://registry.npmjs.org/require-from-string/-/require-from-string-2.0.2.tgz",
|
||||
"integrity": "sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/resolve-from": {
|
||||
"version": "4.0.0",
|
||||
"resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-4.0.0.tgz",
|
||||
@@ -1235,6 +1603,24 @@
|
||||
],
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/safer-buffer": {
|
||||
"version": "2.1.2",
|
||||
"resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
|
||||
"integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/saxes": {
|
||||
"version": "6.0.0",
|
||||
"resolved": "https://registry.npmjs.org/saxes/-/saxes-6.0.0.tgz",
|
||||
"integrity": "sha512-xAg7SOnEhrm5zI3puOOKyy1OMcMlIJZYNJY7xLBwSze0UjhPLnWfj2GF2EpT0jmzaJKIWKHLsaSSajf35bcYnA==",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"xmlchars": "^2.2.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=v12.22.7"
|
||||
}
|
||||
},
|
||||
"node_modules/semver": {
|
||||
"version": "7.7.3",
|
||||
"resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz",
|
||||
@@ -1340,6 +1726,15 @@
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/source-map-js": {
|
||||
"version": "1.2.1",
|
||||
"resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz",
|
||||
"integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==",
|
||||
"license": "BSD-3-Clause",
|
||||
"engines": {
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/streamx": {
|
||||
"version": "2.23.0",
|
||||
"resolved": "https://registry.npmjs.org/streamx/-/streamx-2.23.0.tgz",
|
||||
@@ -1395,6 +1790,12 @@
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
},
|
||||
"node_modules/symbol-tree": {
|
||||
"version": "3.2.4",
|
||||
"resolved": "https://registry.npmjs.org/symbol-tree/-/symbol-tree-3.2.4.tgz",
|
||||
"integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/tar-fs": {
|
||||
"version": "2.1.4",
|
||||
"resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.4.tgz",
|
||||
@@ -1432,6 +1833,48 @@
|
||||
"b4a": "^1.6.4"
|
||||
}
|
||||
},
|
||||
"node_modules/tldts": {
|
||||
"version": "7.0.17",
|
||||
"resolved": "https://registry.npmjs.org/tldts/-/tldts-7.0.17.tgz",
|
||||
"integrity": "sha512-Y1KQBgDd/NUc+LfOtKS6mNsC9CCaH+m2P1RoIZy7RAPo3C3/t8X45+zgut31cRZtZ3xKPjfn3TkGTrctC2TQIQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"tldts-core": "^7.0.17"
|
||||
},
|
||||
"bin": {
|
||||
"tldts": "bin/cli.js"
|
||||
}
|
||||
},
|
||||
"node_modules/tldts-core": {
|
||||
"version": "7.0.17",
|
||||
"resolved": "https://registry.npmjs.org/tldts-core/-/tldts-core-7.0.17.tgz",
|
||||
"integrity": "sha512-DieYoGrP78PWKsrXr8MZwtQ7GLCUeLxihtjC1jZsW1DnvSMdKPitJSe8OSYDM2u5H6g3kWJZpePqkp43TfLh0g==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/tough-cookie": {
|
||||
"version": "6.0.0",
|
||||
"resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-6.0.0.tgz",
|
||||
"integrity": "sha512-kXuRi1mtaKMrsLUxz3sQYvVl37B0Ns6MzfrtV5DvJceE9bPyspOqk9xxv7XbZWcfLWbFmm997vl83qUWVJA64w==",
|
||||
"license": "BSD-3-Clause",
|
||||
"dependencies": {
|
||||
"tldts": "^7.0.5"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=16"
|
||||
}
|
||||
},
|
||||
"node_modules/tr46": {
|
||||
"version": "6.0.0",
|
||||
"resolved": "https://registry.npmjs.org/tr46/-/tr46-6.0.0.tgz",
|
||||
"integrity": "sha512-bLVMLPtstlZ4iMQHpFHTR7GAGj2jxi8Dg0s2h2MafAE4uSWF98FC/3MomU51iQAMf8/qDUbKWf5GxuvvVcXEhw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"punycode": "^2.3.1"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=20"
|
||||
}
|
||||
},
|
||||
"node_modules/tslib": {
|
||||
"version": "2.8.1",
|
||||
"resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz",
|
||||
@@ -1483,12 +1926,67 @@
|
||||
"integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/w3c-xmlserializer": {
|
||||
"version": "5.0.0",
|
||||
"resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-5.0.0.tgz",
|
||||
"integrity": "sha512-o8qghlI8NZHU1lLPrpi2+Uq7abh4GGPpYANlalzWxyWteJOCsr/P+oPBA49TOLu5FTZO4d3F9MnWJfiMo4BkmA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"xml-name-validator": "^5.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/webdriver-bidi-protocol": {
|
||||
"version": "0.3.8",
|
||||
"resolved": "https://registry.npmjs.org/webdriver-bidi-protocol/-/webdriver-bidi-protocol-0.3.8.tgz",
|
||||
"integrity": "sha512-21Yi2GhGntMc671vNBCjiAeEVknXjVRoyu+k+9xOMShu+ZQfpGQwnBqbNz/Sv4GXZ6JmutlPAi2nIJcrymAWuQ==",
|
||||
"license": "Apache-2.0"
|
||||
},
|
||||
"node_modules/webidl-conversions": {
|
||||
"version": "8.0.0",
|
||||
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-8.0.0.tgz",
|
||||
"integrity": "sha512-n4W4YFyz5JzOfQeA8oN7dUYpR+MBP3PIUsn2jLjWXwK5ASUzt0Jc/A5sAUZoCYFJRGF0FBKJ+1JjN43rNdsQzA==",
|
||||
"license": "BSD-2-Clause",
|
||||
"engines": {
|
||||
"node": ">=20"
|
||||
}
|
||||
},
|
||||
"node_modules/whatwg-encoding": {
|
||||
"version": "3.1.1",
|
||||
"resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz",
|
||||
"integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"iconv-lite": "0.6.3"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/whatwg-mimetype": {
|
||||
"version": "4.0.0",
|
||||
"resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz",
|
||||
"integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/whatwg-url": {
|
||||
"version": "15.1.0",
|
||||
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-15.1.0.tgz",
|
||||
"integrity": "sha512-2ytDk0kiEj/yu90JOAp44PVPUkO9+jVhyf+SybKlRHSDlvOOZhdPIrr7xTH64l4WixO2cP+wQIcgujkGBPPz6g==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"tr46": "^6.0.0",
|
||||
"webidl-conversions": "^8.0.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=20"
|
||||
}
|
||||
},
|
||||
"node_modules/wrap-ansi": {
|
||||
"version": "7.0.0",
|
||||
"resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz",
|
||||
@@ -1533,6 +2031,21 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/xml-name-validator": {
|
||||
"version": "5.0.0",
|
||||
"resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-5.0.0.tgz",
|
||||
"integrity": "sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg==",
|
||||
"license": "Apache-2.0",
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/xmlchars": {
|
||||
"version": "2.2.0",
|
||||
"resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz",
|
||||
"integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw==",
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/y18n": {
|
||||
"version": "5.0.8",
|
||||
"resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
|
||||
|
||||
@@ -20,8 +20,10 @@
|
||||
"author": "",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@mozilla/readability": "^0.6.0",
|
||||
"better-sqlite3": "^11.0.0",
|
||||
"commander": "^12.0.0",
|
||||
"jsdom": "^27.1.0",
|
||||
"nanoid": "^3.3.7",
|
||||
"puppeteer": "^24.28.0",
|
||||
"puppeteer-core": "^24.28.0"
|
||||
|
||||
@@ -12,17 +12,19 @@ import type { ExtractorName } from './models';
|
||||
// puppeteer must be first as it launches the browser
|
||||
export const EXTRACTOR_ORDER: string[] = [
|
||||
'puppeteer', // Launches Chrome and writes CDP URL to .env
|
||||
'favicon', // Downloads favicon
|
||||
'favicon', // Downloads favicon (can work independently)
|
||||
'title', // Extracts title using existing Chrome tab
|
||||
'headers', // Extracts headers using existing Chrome tab
|
||||
'screenshot', // Takes screenshot using existing Chrome tab
|
||||
'dom', // Extracts DOM using existing Chrome tab
|
||||
'wget', // Downloads with wget
|
||||
'singlefile', // Single file archive
|
||||
'readability', // Readable content extraction
|
||||
'media', // Media downloads
|
||||
'git', // Git clone
|
||||
'archive_org', // Submit to archive.org
|
||||
'pdf', // Generates PDF using existing Chrome tab
|
||||
'dom', // Extracts DOM HTML using existing Chrome tab
|
||||
'htmltotext', // Extracts plain text using existing Chrome tab
|
||||
'readability', // Extracts article content using existing Chrome tab
|
||||
'singlefile', // Creates single-file archive (may use existing Chrome)
|
||||
'wget', // Downloads with wget (independent)
|
||||
'git', // Clones git repository (independent)
|
||||
'media', // Downloads media with yt-dlp (independent)
|
||||
'archive_org', // Submits to Internet Archive (independent)
|
||||
];
|
||||
|
||||
export interface ExtractorInfo {
|
||||
|
||||
@@ -6,17 +6,17 @@ export type SnapshotStatus = 'queued' | 'started' | 'sealed';
|
||||
export type ArchiveResultStatus = 'queued' | 'started' | 'backoff' | 'succeeded' | 'failed' | 'skipped';
|
||||
|
||||
export type ExtractorName =
|
||||
| 'puppeteer'
|
||||
| 'favicon'
|
||||
| 'title'
|
||||
| 'headers'
|
||||
| 'screenshot'
|
||||
| 'pdf'
|
||||
| 'dom'
|
||||
| 'htmltotext'
|
||||
| 'readability'
|
||||
| 'singlefile'
|
||||
| 'wget'
|
||||
| 'readability'
|
||||
| 'mercury'
|
||||
| 'htmltotext'
|
||||
| 'git'
|
||||
| 'media'
|
||||
| 'archive_org';
|
||||
|
||||
Reference in New Issue
Block a user