diff --git a/archivebox-ts/.gitignore b/archivebox-ts/.gitignore new file mode 100644 index 00000000..1e365813 --- /dev/null +++ b/archivebox-ts/.gitignore @@ -0,0 +1,8 @@ +node_modules/ +dist/ +*.log +.env +data/ +*.db +*.sqlite +*.sqlite3 diff --git a/archivebox-ts/ARCHITECTURE.md b/archivebox-ts/ARCHITECTURE.md new file mode 100644 index 00000000..13a9e407 --- /dev/null +++ b/archivebox-ts/ARCHITECTURE.md @@ -0,0 +1,391 @@ +# ArchiveBox-TS Architecture + +This document explains the architectural decisions and design philosophy of ArchiveBox-TS. + +## Design Philosophy + +### 1. Simplicity Over Flexibility + +Rather than building a complex plugin system, we use the simplest possible approach: standalone executable files. This makes the system easier to understand, debug, and extend. + +### 2. Convention Over Configuration + +- Extractors are discovered by scanning a directory +- URL is always the first argument +- Output always goes to current directory +- Config always via environment variables + +### 3. Language Agnostic + +Extractors can be written in any language (bash, Python, Node.js, Go, Rust, etc.) as long as they follow the simple contract: executable with shebang, URL as $1, files to current dir. + +### 4. Self-Contained + +Each extractor is responsible for its own dependencies. This removes the need for a central dependency management system. + +### 5. Fail Fast, Recover Gracefully + +Individual extractor failures don't stop the whole archiving process. Each extractor runs independently and reports its own status. + +## Core Components + +### 1. Database Layer (`src/db.ts`) + +Uses better-sqlite3 for synchronous SQLite operations. + +**Key Design Decisions:** + +- **Synchronous API**: Simpler to use than async for CLI applications +- **WAL Mode**: Better concurrency support +- **Compatible Schema**: Matches ArchiveBox's schema for potential data migration +- **JSON Storage**: Config and cmd stored as JSON for flexibility +- **UUID Primary Keys**: Globally unique, can be generated client-side + +**Schema Simplifications:** + +- No user system (single-user mode) +- No tags (can be added later) +- No crawls (can be added later) +- Simplified state machine (queued → started → sealed) + +### 2. Extractor Manager (`src/extractors.ts`) + +Discovers and orchestrates extractor execution. + +**Discovery Process:** + +1. Scan extractors directory +2. Check file execute permissions +3. Register available extractors + +**Execution Process:** + +1. Create output directory +2. Spawn process with URL as first argument +3. Set working directory to output directory +4. Pass environment variables +5. Capture stdout (output file) and stderr (logs) +6. Record exit code (0 = success) + +**Parallelization:** + +Extractors run in parallel using Promise.all(). Each extractor is independent and failures are isolated. + +### 3. CLI (`src/cli.ts`) + +Uses Commander.js for CLI argument parsing. + +**Commands:** + +- `init` - Set up data directory and database +- `add` - Archive a URL +- `list` - Show all snapshots +- `status` - Show snapshot details +- `extractors` - List available extractors + +**Flow for `add` command:** + +``` +1. Parse arguments +2. Open database +3. Check if URL exists +4. Create snapshot (or reuse existing) +5. Determine extractors to run +6. Update snapshot status to 'started' +7. Create output directory +8. Create ArchiveResult records for each extractor +9. Run extractors in parallel +10. Update ArchiveResult records with results +11. Update snapshot status to 'sealed' +12. 
Close database
+```
+
+## Data Model
+
+### Snapshot
+
+Represents a URL that has been (or is being) archived.
+
+**States:**
+- `queued` - Created but not started
+- `started` - Currently archiving
+- `sealed` - Archiving complete
+
+**Key Fields:**
+- `id` - UUID
+- `abid` - ArchiveBox ID (for compatibility)
+- `url` - The URL being archived
+- `timestamp` - Unix timestamp string
+- `output_dir` - Where files are stored
+
+### ArchiveResult
+
+Represents one extractor's result for one snapshot.
+
+**States:**
+- `queued` - Waiting to run
+- `started` - Currently running
+- `succeeded` - Completed successfully
+- `failed` - Failed with error
+- `skipped` - Intentionally skipped
+- `backoff` - Waiting to retry
+
+**Key Fields:**
+- `id` - UUID
+- `snapshot_id` - Foreign key to snapshot
+- `extractor` - Name of extractor
+- `cmd` - Command that was executed
+- `output` - Main output file path
+- `start_ts`, `end_ts` - Execution timing
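+
+A minimal sketch of how these two records can be typed, trimmed to the states and key fields above (illustrative, in the spirit of `src/models.ts`, not the exact source):
+
+```typescript
+type SnapshotStatus = 'queued' | 'started' | 'sealed';
+type ArchiveResultStatus =
+  | 'queued' | 'started' | 'succeeded'
+  | 'failed' | 'skipped' | 'backoff';
+
+interface Snapshot {
+  id: string;             // UUID
+  abid: string;           // ArchiveBox ID (snp_...)
+  url: string;
+  timestamp: string;      // Unix timestamp string
+  status: SnapshotStatus;
+  output_dir: string;     // where files are stored
+}
+
+interface ArchiveResult {
+  id: string;              // UUID
+  snapshot_id: string;     // foreign key to Snapshot.id
+  extractor: string;       // name of extractor
+  status: ArchiveResultStatus;
+  cmd: string[];           // command that was executed (stored as JSON)
+  output: string | null;   // main output file path
+  start_ts: string | null; // execution timing
+  end_ts: string | null;
+}
+```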
+
+## Extractor Contract
+
+### Input
+
+1. **URL** - First positional argument (`$1`, `process.argv[2]`, `sys.argv[1]`)
+2. **Environment Variables** - All configuration
+3. **Working Directory** - Where to write output files
+
+### Output
+
+1. **Files** - Written to current working directory
+2. **stdout** - Main output filename (e.g., "screenshot.png")
+3. **stderr** - Logs, progress, errors
+4. **Exit Code** - 0 for success, non-zero for failure
+
+### Lifecycle
+
+```
+1. Extractor spawned by ExtractorManager
+2. Changes to output directory
+3. Reads config from environment
+4. (Optional) Auto-installs dependencies
+5. Processes URL
+6. Writes output files
+7. Prints main file to stdout
+8. Logs to stderr
+9. Exits with status code
+```
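+
+The manager's side of this contract stays equally small. A sketch of spawning a single extractor with Node's built-in `child_process` (the helper name is illustrative, not the exact code in `src/extractors.ts`):
+
+```typescript
+import { spawn } from 'child_process';
+
+// Run one extractor binary against one URL, per the contract above.
+function runExtractor(binPath: string, url: string, outputDir: string) {
+  return new Promise<{ code: number; output: string; logs: string }>((resolve) => {
+    const proc = spawn(binPath, [url], {
+      cwd: outputDir,           // extractor writes its files here
+      env: { ...process.env },  // all config flows through the environment
+    });
+
+    let output = '';
+    let logs = '';
+    proc.stdout.on('data', (d) => (output += d));  // main output filename
+    proc.stderr.on('data', (d) => (logs += d));    // progress and errors
+
+    // Exit code 0 = success; failures resolve instead of throwing,
+    // so one bad extractor never takes down the others.
+    proc.on('close', (code) => resolve({ code: code ?? 1, output: output.trim(), logs }));
+  });
+}
+```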
+
+## File Organization
+
+```
+archivebox-ts/
+├── src/                 # TypeScript source
+│   ├── cli.ts           # CLI entry point
+│   ├── db.ts            # Database operations
+│   ├── models.ts        # TypeScript types
+│   └── extractors.ts    # Extractor orchestration
+├── extractors/          # Extractor executables
+│   ├── favicon          # Bash script
+│   ├── title            # Node.js script
+│   ├── headers          # Bash script
+│   ├── wget             # Bash script
+│   └── screenshot       # Python script
+├── dist/                # Compiled JavaScript (gitignored)
+├── data/                # Runtime data (gitignored)
+│   ├── index.sqlite3    # Database
+│   └── archive/         # Archived content
+│       └── <timestamp>_<domain>/
+│           ├── favicon.ico
+│           ├── title.txt
+│           └── ...
+├── package.json
+├── tsconfig.json
+├── README.md
+├── QUICKSTART.md
+├── EXTRACTOR_GUIDE.md
+└── ARCHITECTURE.md (this file)
+```
+
+## Output Directory Naming
+
+Pattern: `<timestamp>_<domain>`
+
+Example: `1762193664373_example.com`
+
+**Why?**
+- Timestamp ensures uniqueness
+- Domain provides human-readable context
+- Simple flat structure (no deep nesting)
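+
+In code, this naming is a one-liner (a sketch; the helper name is illustrative):
+
+```typescript
+// Produces names like 1762193664373_example.com
+function outputDirName(url: string): string {
+  return `${Date.now()}_${new URL(url).hostname}`;
+}
+```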
+
+## Comparison to Original ArchiveBox
+
+### What We Kept
+
+1. **Database Schema** - Compatible with ArchiveBox for potential migration
+2. **Snapshot/ArchiveResult Model** - Same conceptual model
+3. **Extractor Names** - Same names (favicon, title, headers, etc.)
+4. **Output Structure** - Similar file organization
+
+### What We Simplified
+
+1. **Plugin System** → Executable files
+2. **Configuration Files** → Environment variables
+3. **Django ORM** → Raw SQLite
+4. **Web UI** → CLI only (for now)
+5. **Background Workers** → Direct execution
+6. **Multi-user** → Single-user
+7. **ABX Framework** → Simple directory scan
+
+### What We Improved
+
+1. **Easier to Extend** - Just drop an executable in a directory
+2. **Language Agnostic** - Use any language for extractors
+3. **Simpler Dependencies** - Each extractor manages its own
+4. **Easier to Test** - Extractors can be tested standalone
+5. **Smaller Codebase** - ~500 lines vs thousands
+
+## Performance Characteristics
+
+### Time Complexity
+
+- **Add URL**: O(n) where n = number of extractors
+- **List Snapshots**: O(n) where n = number of snapshots (with pagination)
+- **Get Status**: O(1) for snapshot, O(m) for results where m = extractors used
+- **Discover Extractors**: O(e) where e = files in extractors directory
+
+### Space Complexity
+
+- **Database**: O(n * m) where n = snapshots, m = extractors per snapshot
+- **Archive Files**: Depends on content (potentially large)
+
+### Concurrency
+
+- **Extractors**: Run in parallel (Promise.all)
+- **CLI Commands**: Sequential (SQLite has one writer)
+- **Future**: Could add job queue for background processing
+
+## Scaling Considerations
+
+### Current Limits
+
+- Single machine
+- One CLI command at a time
+- No distributed execution
+- Limited by SQLite write throughput
+
+### Future Enhancements
+
+1. **Job Queue** - Redis or database-based queue
+2. **Worker Processes** - Multiple workers processing queue
+3. **Distributed Execution** - Run extractors on different machines
+4. **Caching** - Cache extractor results
+5. **Incremental Archiving** - Only run changed extractors
+
+## Error Handling
+
+### Extractor Failures
+
+- Captured and stored in ArchiveResult.notes
+- Don't stop other extractors
+- Exit code determines success/failure
+- stderr captured for debugging
+
+### Database Errors
+
+- Propagated to CLI
+- Transaction rollback on failure
+- Clear error messages
+
+### Network Errors
+
+- Handled by individual extractors
+- Timeout via environment variables
+- Retry logic in extractors (optional)
+
+## Testing Strategy
+
+### Unit Tests (Future)
+
+- Database operations
+- Extractor discovery
+- Model validation
+
+### Integration Tests (Future)
+
+- Full CLI commands
+- Database + extractors
+- Error scenarios
+
+### Extractor Tests
+
+- Manual testing (run standalone)
+- Test with various URLs
+- Test error conditions
+- Test configuration options
+
+## Security Considerations
+
+### Current State
+
+- Runs with user permissions
+- No input sanitization (URLs passed directly)
+- Extractors can run arbitrary code
+- No sandbox
+
+### Recommendations for Production
+
+1. **Input Validation** - Validate and sanitize URLs
+2. **Sandboxing** - Run extractors in containers/VMs
+3. **Resource Limits** - CPU, memory, disk quotas
+4. **Authentication** - Add user system for web UI
+5. **HTTPS Only** - Validate SSL certificates
+6. **Rate Limiting** - Prevent abuse
+
+## Future Architecture Enhancements
+
+### 1. Background Processing
+
+```typescript
+// Job queue pattern
+interface Job {
+  id: string;
+  snapshot_id: string;
+  extractor: string;
+  status: 'pending' | 'running' | 'completed';
+}
+
+class JobQueue {
+  enqueue(snapshot_id: string, extractor: string): Job;
+  dequeue(): Job | null;
+  complete(job_id: string, result: ExtractorResult): void;
+}
+```
+
+### 2. Web UI
+
+- Express/Fastify server
+- Browse archived snapshots
+- Trigger new archives
+- View extractor results
+- Search functionality
+
+### 3. API
+
+- RESTful API
+- POST /snapshots - Create snapshot
+- GET /snapshots - List snapshots
+- GET /snapshots/:id - Get snapshot details
+- POST /snapshots/:id/extract - Run extractors
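+
+A sketch of this surface using Express with an in-memory store, purely for illustration (Express is not currently a dependency, and a real version would call into the database layer instead):
+
+```typescript
+import express from 'express';
+import { randomUUID } from 'crypto';
+
+const app = express();
+app.use(express.json());
+
+// Stand-in for the SQLite-backed Snapshot store
+const snapshots = new Map<string, { id: string; url: string; status: string }>();
+
+// POST /snapshots - Create snapshot
+app.post('/snapshots', (req, res) => {
+  const snap = { id: randomUUID(), url: req.body.url, status: 'queued' };
+  snapshots.set(snap.id, snap);
+  res.status(201).json(snap);
+});
+
+// GET /snapshots - List snapshots
+app.get('/snapshots', (_req, res) => res.json([...snapshots.values()]));
+
+// GET /snapshots/:id - Get snapshot details
+app.get('/snapshots/:id', (req, res) => {
+  const snap = snapshots.get(req.params.id);
+  snap ? res.json(snap) : res.status(404).end();
+});
+
+app.listen(3000);
+```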
+
+### 4. Plugins
+
+While keeping the extractor model simple, could add:
+- Pre-processors (URL transformation)
+- Post-processors (Content analysis)
+- Notifications (Email, webhook)
+- Storage backends (S3, B2)
+
+### 5. Distributed Execution
+
+- Separate coordinator and workers
+- gRPC or HTTP API between coordinator/workers
+- Shared database or message queue
+- Worker pools by extractor type
+
+## Conclusion
+
+ArchiveBox-TS demonstrates that complex functionality can be achieved with simple, composable components. By embracing Unix philosophy (do one thing well, text streams, exit codes), we've created a system that's both powerful and easy to understand.
+
+The key insight is that **extractors don't need to be plugins** - they can be simple executables that follow a convention. This drastically simplifies the architecture while maintaining flexibility and extensibility.
diff --git a/archivebox-ts/EXTRACTOR_GUIDE.md b/archivebox-ts/EXTRACTOR_GUIDE.md
new file mode 100644
index 00000000..5ae307f1
--- /dev/null
+++ b/archivebox-ts/EXTRACTOR_GUIDE.md
@@ -0,0 +1,487 @@
+# Extractor Development Guide
+
+This guide explains how to create custom extractors for ArchiveBox-TS.
+
+## What is an Extractor?
+
+An extractor is a standalone executable program that:
+1. Takes a URL as input
+2. Processes/downloads content from that URL
+3. Saves output files to the current directory
+4. Reports success/failure via exit code
+
+## Extractor Contract
+
+Every extractor must follow these rules:
+
+### 1. File Location
+- Place the extractor file in the `extractors/` directory
+- The filename becomes the extractor name (e.g., `extractors/myextractor` → `myextractor`)
+
+### 2. Executable Permissions
+```bash
+chmod +x extractors/myextractor
+```
+
+### 3. Shebang Line
+Start your file with the appropriate shebang:
+- Bash: `#!/bin/bash`
+- Node.js: `#!/usr/bin/env node`
+- Python: `#!/usr/bin/env python3`
+- Ruby: `#!/usr/bin/env ruby`
+- Any other: `#!/usr/bin/env <interpreter>`
+
+### 4. URL Input
+The URL is passed as the first command-line argument:
+- Bash: `$1`
+- Node.js: `process.argv[2]`
+- Python: `sys.argv[1]`
+
+### 5. Output Directory
+The extractor runs in the output directory. Write all files to the current directory (`.`).
+
+### 6. Configuration
+All configuration via environment variables:
+- Read config: `${VAR_NAME:-default}` (bash) or `process.env.VAR_NAME` (Node.js)
+- Name variables after your extractor: `MYEXTRACTOR_TIMEOUT`, `MYEXTRACTOR_WIDTH`, etc.
+- Provide sensible defaults
+
+### 7. Standard Output (stdout)
+Print the main output file path to stdout:
+```bash
+echo "output.html"
+```
+
+### 8. Standard Error (stderr)
+Use stderr for all logging, progress, and error messages:
+```bash
+echo "Downloading..." >&2
+echo "Error: Failed" >&2
+```
+
+### 9. Exit Code
+- `0` = Success
+- Non-zero = Failure
+
+### 10. Auto-Install Dependencies (Optional)
+Your extractor can check for and install its dependencies:
+
+```bash
+if ! command -v mytool &> /dev/null; then
+  echo "Installing mytool..." >&2
+  # Install command here
+fi
+```
+
+## Complete Examples
+
+### Bash Extractor: HTML Downloader
+
+```bash
+#!/bin/bash
+#
+# HTML Extractor
+# Downloads the raw HTML of a page
+#
+# Config:
+#   HTML_TIMEOUT - Timeout in seconds (default: 30)
+#   HTML_USER_AGENT - User agent string
+#
+
+set -e  # Exit on error
+
+URL="$1"
+
+# Validate input
+if [ -z "$URL" ]; then
+  echo "Error: URL argument required" >&2
+  exit 1
+fi
+
+# Auto-install curl if needed
+if ! command -v curl &> /dev/null; then
+  echo "Installing curl..." >&2
+  sudo apt-get update && sudo apt-get install -y curl
+fi
+
+# Read config from environment
+TIMEOUT="${HTML_TIMEOUT:-30}"
+USER_AGENT="${HTML_USER_AGENT:-Mozilla/5.0 (compatible; ArchiveBox-TS/0.1)}"
+
+# Log to stderr
+echo "Downloading HTML from: $URL" >&2
+
+# Download HTML
+if curl -L -s --max-time "$TIMEOUT" --user-agent "$USER_AGENT" -o index.html "$URL"; then
+  echo "✓ Downloaded HTML" >&2
+  echo "index.html"  # Output file to stdout
+  exit 0
+else
+  echo "Error: Failed to download HTML" >&2
+  exit 1
+fi
+```
+
+### Node.js Extractor: JSON Metadata
+
+```javascript
+#!/usr/bin/env node
+//
+// Metadata Extractor
+// Extracts metadata from a page and saves as JSON
+//
+// Config:
+//   METADATA_TIMEOUT - Timeout in milliseconds (default: 10000)
+//
+
+const https = require('https');
+const http = require('http');
+const fs = require('fs');
+const { URL } = require('url');
+
+// Get URL from first argument
+const url = process.argv[2];
+if (!url) {
+  console.error('Error: URL argument required');
+  process.exit(1);
+}
+
+// Configuration
+const TIMEOUT = parseInt(process.env.METADATA_TIMEOUT || '10000', 10);
+
+console.error(`Extracting metadata from: ${url}`);
+
+// Parse URL
+let parsedUrl;
+try {
+  parsedUrl = new URL(url);
+} catch (err) {
+  console.error(`Error: Invalid URL: ${err.message}`);
+  process.exit(1);
+}
+
+// Choose protocol
+const client = parsedUrl.protocol === 'https:' ? https : http;
+
+// Make request
+const options = {
+  timeout: TIMEOUT,
+  headers: {
+    'User-Agent': 'Mozilla/5.0 (compatible; ArchiveBox-TS/0.1)'
+  }
+};
+
+client.get(url, options, (res) => {
+  let html = '';
+
+  res.on('data', (chunk) => {
+    html += chunk;
+  });
+
+  res.on('end', () => {
+    // Extract metadata
+    const metadata = {
+      url: url,
+      status: res.statusCode,
+      headers: res.headers,
+      title: extractTitle(html),
+      description: extractMeta(html, 'description'),
+      keywords: extractMeta(html, 'keywords'),
+      author: extractMeta(html, 'author'),
+      timestamp: new Date().toISOString()
+    };
+
+    // Write to file
+    fs.writeFileSync('metadata.json', JSON.stringify(metadata, null, 2));
+
+    console.error('✓ Extracted metadata');
+    console.log('metadata.json');
+    process.exit(0);
+  });
+}).on('error', (err) => {
+  console.error(`Error: ${err.message}`);
+  process.exit(1);
+});
+
+function extractTitle(html) {
+  const match = html.match(/<title[^>]*>(.*?)<\/title>/is);
+  return match ? match[1].trim() : null;
+}
+
+function extractMeta(html, name) {
+  const regex = new RegExp(`<meta[^>]*name=["']${name}["'][^>]*content=["']([^"']*)["']`, 'i');
+  const match = html.match(regex);
+  return match ? match[1] : null;
+}
+```
+
+### Python Extractor: Link Extractor
+
+```python
+#!/usr/bin/env python3
+#
+# Links Extractor
+# Extracts all links from a page
+#
+# Config:
+#   LINKS_TIMEOUT - Timeout in seconds (default: 30)
+#   LINKS_MAX - Maximum links to extract (default: 1000)
+#
+
+import sys
+import os
+import subprocess
+import re
+from urllib.request import urlopen, Request
+from urllib.parse import urljoin, urlparse
+
+def ensure_deps():
+    """Auto-install dependencies"""
+    # For this simple example, we use stdlib only
+    pass
+
+def main():
+    # Validate input
+    if len(sys.argv) < 2:
+        print("Error: URL argument required", file=sys.stderr)
+        sys.exit(1)
+
+    url = sys.argv[1]
+
+    # Configuration
+    timeout = int(os.environ.get('LINKS_TIMEOUT', '30'))
+    max_links = int(os.environ.get('LINKS_MAX', '1000'))
+
+    print(f"Extracting links from: {url}", file=sys.stderr)
+
+    ensure_deps()
+
+    try:
+        # Fetch HTML
+        req = Request(url, headers={
+            'User-Agent': 'Mozilla/5.0 (compatible; ArchiveBox-TS/0.1)'
+        })
+        with urlopen(req, timeout=timeout) as response:
+            html = response.read().decode('utf-8', errors='ignore')
+
+        # Extract links using regex (simple approach)
+        # In production, use BeautifulSoup or lxml
+        links = set()
+
+        # Find <a href="..."> tags
+        for match in re.finditer(r'<a[^>]+href=["\'](.*?)["\']', html, re.IGNORECASE):
+            href = match.group(1)
+            # Convert relative to absolute
+            absolute_url = urljoin(url, href)
+            links.add(absolute_url)
+
+            if len(links) >= max_links:
+                break
+
+        # Write to file
+        with open('links.txt', 'w') as f:
+            for link in sorted(links):
+                f.write(link + '\n')
+
+        print(f"✓ Extracted {len(links)} links", file=sys.stderr)
+        print("links.txt")
+        sys.exit(0)
+
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+
+if __name__ == '__main__':
+    main()
+```
+
+## Testing Your Extractor
+
+### Manual Testing
+
+1. Create a test directory:
+```bash
+mkdir test-output
+cd test-output
+```
+
+2. Run your extractor:
+```bash
+/path/to/extractors/myextractor "https://example.com"
+```
+
+3. Check output:
+```bash
+ls -la
+cat output-file.txt
+```
+
+### Environment Variable Testing
+
+```bash
+# Set config
+export MYEXTRACTOR_TIMEOUT=60
+export MYEXTRACTOR_DEBUG=true
+
+# Run
+/path/to/extractors/myextractor "https://example.com"
+```
+
+### Error Handling Testing
+
+Test your extractor with:
+- Invalid URLs
+- URLs that timeout
+- URLs that return 404
+- URLs with special characters
+- Very large pages
+- Redirects
+
+## Best Practices
+
+### 1. Error Handling
+- Always validate the URL argument
+- Handle network errors gracefully
+- Provide clear error messages to stderr
+- Exit with non-zero code on failure
+
+### 2. Timeouts
+- Always set timeouts for network requests
+- Make timeouts configurable via environment variables
+- Use reasonable defaults (10-60 seconds)
+
+### 3. Output
+- Create files with clear, descriptive names
+- Use standard formats (JSON, HTML, TXT, PNG, etc.)
+- Print the main output filename to stdout
+- If multiple files, print the primary one or a summary file
+
+### 4. Logging
+- Log progress to stderr
+- Use prefixes: `✓` for success, `✗` for errors, `→` for progress
+- Include URL in log messages
+- Don't be too verbose (user can redirect if needed)
+
+### 5. Performance
+- Stream large files, don't load everything in memory
+- Use parallel downloads when appropriate
+- Respect robots.txt (optional but recommended)
+- Add delays for rate limiting if needed
+
+### 6. 
Configuration +- Use descriptive environment variable names +- Prefix with your extractor name: `MYEXT_VAR_NAME` +- Provide defaults for all settings +- Document all config options in comments + +### 7. Dependencies +- Auto-install common dependencies when possible +- Detect OS and use appropriate package manager +- Provide clear error if dependency can't be installed +- Test with fresh environment + +### 8. Idempotency +- Running twice should produce the same result +- Overwrite existing files +- Don't append to files + +### 9. Security +- Validate and sanitize URLs +- Don't execute arbitrary code from fetched content +- Be careful with file paths (prevent directory traversal) +- Limit resource usage (file size, memory, etc.) + +## Common Patterns + +### Retry Logic + +```bash +MAX_RETRIES=3 +RETRY_DELAY=2 + +for i in $(seq 1 $MAX_RETRIES); do + if download_url "$URL"; then + break + fi + + if [ $i -lt $MAX_RETRIES ]; then + echo "Retry $i/$MAX_RETRIES in ${RETRY_DELAY}s..." >&2 + sleep $RETRY_DELAY + fi +done +``` + +### Progress Reporting + +```javascript +const total = 100; +let done = 0; + +function updateProgress() { + done++; + console.error(`Progress: ${done}/${total} (${Math.round(done/total*100)}%)`); +} +``` + +### Conditional Extraction + +```python +# Only extract if page is HTML +content_type = response.headers.get('content-type', '') +if 'text/html' not in content_type.lower(): + print(f"Skipping non-HTML content: {content_type}", file=sys.stderr) + sys.exit(0) # Success but skipped +``` + +## Debugging + +### Enable Verbose Output + +Add a DEBUG environment variable: + +```bash +if [ "${MYEXT_DEBUG}" = "true" ]; then + set -x # Print commands +fi +``` + +### Test in Isolation + +```bash +# Run in a clean environment +env -i \ + PATH=/usr/bin:/bin \ + MYEXT_TIMEOUT=30 \ + /path/to/extractors/myextractor "https://example.com" +``` + +### Check Exit Codes + +```bash +/path/to/extractors/myextractor "https://example.com" +echo "Exit code: $?" +``` + +## Examples of Extractor Ideas + +- **RSS Feed**: Extract RSS/Atom feed +- **Images**: Download all images from page +- **Video**: Extract video using yt-dlp +- **Archive.org**: Submit to Internet Archive +- **PDF**: Convert page to PDF using wkhtmltopdf +- **Reader Mode**: Extract main content using readability +- **Git**: Clone git repository +- **Twitter Thread**: Unroll and save thread +- **Mastodon Post**: Archive toot with media +- **GitHub Repo**: Archive repository with stars/forks +- **HackerNews Thread**: Save discussion thread +- **Reddit Thread**: Archive post and comments + +## Next Steps + +1. Create your extractor file in `extractors/` +2. Make it executable: `chmod +x extractors/yourextractor` +3. Test it manually +4. Use it with archivebox-ts: `node dist/cli.js add --extractors yourextractor https://example.com` + +Happy extracting! 🎉 diff --git a/archivebox-ts/QUICKSTART.md b/archivebox-ts/QUICKSTART.md new file mode 100644 index 00000000..63c575a4 --- /dev/null +++ b/archivebox-ts/QUICKSTART.md @@ -0,0 +1,250 @@ +# ArchiveBox-TS Quick Start Guide + +Get up and running with ArchiveBox-TS in 5 minutes! 
+
+## Installation
+
+```bash
+cd archivebox-ts
+
+# Install dependencies
+npm install
+
+# Build TypeScript
+npm run build
+
+# Initialize database and data directory
+node dist/cli.js init
+```
+
+## Basic Usage
+
+### Archive a URL
+
+```bash
+# Archive with all available extractors
+node dist/cli.js add "https://example.com"
+
+# Archive with specific extractors
+node dist/cli.js add "https://github.com" --extractors title,headers,favicon
+
+# Add a custom title
+node dist/cli.js add "https://example.com" --title "Example Domain"
+```
+
+### List Archives
+
+```bash
+# List all snapshots
+node dist/cli.js list
+
+# With pagination
+node dist/cli.js list --limit 10 --offset 0
+```
+
+### Check Status
+
+```bash
+# Get the snapshot ID from the list command
+node dist/cli.js status <snapshot-id>
+```
+
+### List Extractors
+
+```bash
+# See what extractors are available
+node dist/cli.js extractors
+```
+
+## Directory Structure After Init
+
+```
+archivebox-ts/
+├── data/
+│   ├── index.sqlite3              # SQLite database
+│   └── archive/                   # Archived content
+│       └── <timestamp>_<domain>/  # Individual snapshot directories
+│           ├── headers.json
+│           ├── title.txt
+│           ├── favicon.ico
+│           └── ...
+```
+
+## Environment Variables
+
+Configure extractors using environment variables:
+
+```bash
+# Set favicon timeout
+export FAVICON_TIMEOUT=30
+
+# Set screenshot dimensions
+export SCREENSHOT_WIDTH=1920
+export SCREENSHOT_HEIGHT=1080
+
+# Run with custom config
+node dist/cli.js add "https://example.com" --extractors screenshot
+```
+
+## Example Session
+
+```bash
+# Initialize
+$ node dist/cli.js init
+Initializing ArchiveBox...
+Data directory: /path/to/data
+Database: /path/to/data/index.sqlite3
+Archive directory: /path/to/data/archive
+✓ Initialization complete!
+
+# Add a URL
+$ node dist/cli.js add "https://example.com"
+Adding URL: https://example.com
+✓ Created snapshot: 488ef9a8-fcd3-40c7-a209-3ab3b0a0eb71
+Running extractors: favicon, title, headers, wget, screenshot
+Output directory: /path/to/data/archive/1762193664373_example.com
+  ✓ favicon: succeeded
+  ✓ title: succeeded
+  ✓ headers: succeeded
+  ✓ wget: succeeded
+  ✓ screenshot: succeeded
+✓ Archiving complete!
+
+# Check status
+$ node dist/cli.js status 488ef9a8-fcd3-40c7-a209-3ab3b0a0eb71
+Snapshot: 488ef9a8-fcd3-40c7-a209-3ab3b0a0eb71
+URL: https://example.com
+Title: Example Domain
+Status: sealed
+Created: 2025-11-03T18:14:04.279Z
+Downloaded: 2025-11-03T18:14:25.273Z
+Output: /path/to/data/archive/1762193664373_example.com
+
+Archive Results (5):
+  ✓ favicon: succeeded
+    Output: favicon.ico
+  ✓ title: succeeded
+    Output: title.txt
+  ✓ headers: succeeded
+    Output: headers.json
+  ✓ wget: succeeded
+    Output: warc/archive.warc.gz
+  ✓ screenshot: succeeded
+    Output: screenshot.png
+
+# List all snapshots
+$ node dist/cli.js list
+Found 1 snapshot(s):
+
+ID: 488ef9a8-fcd3-40c7-a209-3ab3b0a0eb71
+URL: https://example.com
+Title: Example Domain
+Status: sealed
+Created: 2025-11-03T18:14:04.279Z
+Output: /path/to/data/archive/1762193664373_example.com
+---
+```
+
+## Creating Your First Extractor
+
+Create a simple bash extractor:
+
+```bash
+# Create the extractor file
+cat > extractors/myextractor << 'EOF'
+#!/bin/bash
+set -e
+
+URL="$1"
+if [ -z "$URL" ]; then
+  echo "Error: URL required" >&2
+  exit 1
+fi
+
+echo "Processing $URL..." >&2
+echo "Hello from myextractor!" > output.txt
+echo "✓ Done" >&2
+echo "output.txt"
+EOF
+
+# Make it executable
+chmod +x extractors/myextractor
+
+# Test it
+node dist/cli.js add "https://example.com" --extractors myextractor
+```
+
+See [EXTRACTOR_GUIDE.md](EXTRACTOR_GUIDE.md) for detailed information on creating extractors.
+
+## Troubleshooting
+
+### "No extractors available"
+
+Make sure the extractor files are executable:
+```bash
+chmod +x extractors/*
+```
+
+### "Extractor failed"
+
+Check the error message in the status output:
+```bash
+node dist/cli.js status <snapshot-id>
+```
+
+Common issues:
+- Missing dependencies (extractor should auto-install)
+- Network timeout (increase timeout via environment variable)
+- Invalid URL format
+
+### "Database locked"
+
+Only one CLI command can run at a time. Wait for the current command to finish.
+
+## Next Steps
+
+- Read the [README.md](README.md) for architecture details
+- Check out [EXTRACTOR_GUIDE.md](EXTRACTOR_GUIDE.md) to create custom extractors
+- Browse the `extractors/` directory for examples
+- Explore the TypeScript source code in `src/`
+
+## Performance Tips
+
+1. **Parallel Archiving**: Run multiple CLI instances with different data directories
+2. **Selective Extractors**: Use `--extractors` flag to only run needed extractors
+3. **Adjust Timeouts**: Increase timeouts for slow sites via environment variables
+4. **Large Sites**: Use wget extractor for comprehensive archiving
+
+## Data Management
+
+### Backup
+
+```bash
+# Backup database
+cp data/index.sqlite3 data/index.sqlite3.backup
+
+# Backup everything
+tar czf archivebox-backup.tar.gz data/
+```
+
+### Export
+
+```bash
+# Export database to SQL
+sqlite3 data/index.sqlite3 .dump > export.sql
+
+# Query snapshots
+sqlite3 data/index.sqlite3 "SELECT url, title, status FROM snapshots;"
+```
+
+### Clean Up
+
+```bash
+# Remove old archives (manual)
+rm -rf data/archive/<timestamp>_*
+
+# Remove from database
+sqlite3 data/index.sqlite3 "DELETE FROM snapshots WHERE url = 'https://example.com';"
+```
+
+Happy archiving! 🎉
diff --git a/archivebox-ts/README.md b/archivebox-ts/README.md
new file mode 100644
index 00000000..8a98523d
--- /dev/null
+++ b/archivebox-ts/README.md
@@ -0,0 +1,380 @@
+# ArchiveBox TypeScript
+
+A TypeScript-based version of ArchiveBox with a simplified, modular architecture.
+
+## Overview
+
+This is a reimplementation of ArchiveBox using TypeScript with a focus on simplicity and modularity. The key architectural changes are:
+
+1. **Standalone Extractors**: Each extractor is a standalone executable (bash, Node.js, or Python with shebang) that can run independently
+2. **Auto-Installing Dependencies**: Extractors automatically install their own dependencies when first run
+3. **Simple Interface**: Extractors receive URL as `$1` CLI argument and output files to current working directory
+4. **Environment-Based Config**: All configuration passed via environment variables, no CLI flags
+5. **SQLite Database**: Uses SQLite with schema matching the original ArchiveBox
+
+## Directory Structure
+
+```
+archivebox-ts/
+├── src/
+│   ├── cli.ts           # Main CLI entry point
+│   ├── db.ts            # SQLite database operations
+│   ├── models.ts        # TypeScript interfaces
+│   └── extractors.ts    # Extractor orchestration
+├── extractors/          # Standalone extractor executables
+│   ├── favicon          # Bash script to download favicon
+│   ├── title            # Node.js script to extract title
+│   ├── headers          # Bash script to extract HTTP headers
+│   ├── wget             # Bash script for full page download
+│   └── screenshot       # Python script for screenshots
+├── data/                # Created on init
+│   ├── index.sqlite3    # SQLite database
+│   └── archive/         # Archived snapshots
+├── package.json
+├── tsconfig.json
+└── README.md
+```
+
+## Installation
+
+### Prerequisites
+
+- Node.js 18+ and npm
+- For specific extractors:
+  - `wget` extractor: wget
+  - `screenshot` extractor: Python 3 + Playwright
+
+### Setup
+
+```bash
+cd archivebox-ts
+
+# Install dependencies
+npm install
+
+# Build TypeScript
+npm run build
+
+# Initialize ArchiveBox
+node dist/cli.js init
+```
+
+## Usage
+
+### Initialize
+
+Create the data directory and database:
+
+```bash
+node dist/cli.js init
+```
+
+### Add a URL
+
+Archive a URL with all available extractors:
+
+```bash
+node dist/cli.js add https://example.com
+```
+
+Archive with specific extractors:
+
+```bash
+node dist/cli.js add https://example.com --extractors favicon,title,headers
+```
+
+Add with custom title:
+
+```bash
+node dist/cli.js add https://example.com --title "Example Domain"
+```
+
+### List Snapshots
+
+List all archived snapshots:
+
+```bash
+node dist/cli.js list
+```
+
+With pagination:
+
+```bash
+node dist/cli.js list --limit 10 --offset 20
+```
+
+### Check Status
+
+View detailed status of a snapshot:
+
+```bash
+node dist/cli.js status <snapshot-id>
+```
+
+### List Extractors
+
+See all available extractors:
+
+```bash
+node dist/cli.js extractors
+```
+
+## Database Schema
+
+The SQLite database uses a schema compatible with ArchiveBox:
+
+### Snapshots Table
+
+Represents a single URL being archived.
+
+| Column | Type | Description |
+|--------|------|-------------|
+| id | TEXT (UUID) | Primary key |
+| abid | TEXT | ArchiveBox ID (snp_...) |
+| url | TEXT | URL being archived (unique) |
+| timestamp | TEXT | Unix timestamp string |
+| title | TEXT | Page title |
+| created_at | TEXT | ISO datetime |
+| bookmarked_at | TEXT | ISO datetime |
+| downloaded_at | TEXT | ISO datetime when complete |
+| modified_at | TEXT | ISO datetime |
+| status | TEXT | queued, started, sealed |
+| retry_at | TEXT | ISO datetime for retry |
+| config | TEXT (JSON) | Configuration |
+| notes | TEXT | Extra notes |
+| output_dir | TEXT | Path to output directory |
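+
+For reference, a sketch of how this table might be created with better-sqlite3 (illustrative; the exact statement in `src/db.ts` may differ):
+
+```typescript
+import Database from 'better-sqlite3';
+
+const db = new Database('data/index.sqlite3');
+db.pragma('journal_mode = WAL'); // WAL mode for better concurrency
+
+db.exec(`
+  CREATE TABLE IF NOT EXISTS snapshots (
+    id            TEXT PRIMARY KEY,  -- UUID
+    abid          TEXT,              -- ArchiveBox ID (snp_...)
+    url           TEXT UNIQUE NOT NULL,
+    timestamp     TEXT,              -- Unix timestamp string
+    title         TEXT,
+    created_at    TEXT,              -- ISO datetime
+    bookmarked_at TEXT,
+    downloaded_at TEXT,
+    modified_at   TEXT,
+    status        TEXT,              -- queued, started, sealed
+    retry_at      TEXT,
+    config        TEXT,              -- JSON
+    notes         TEXT,
+    output_dir    TEXT
+  );
+`);
+```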
+
+### Archive Results Table
+
+Represents the result of running one extractor on one snapshot.
+
+| Column | Type | Description |
+|--------|------|-------------|
+| id | TEXT (UUID) | Primary key |
+| abid | TEXT | ArchiveBox ID (res_...) 
| +| snapshot_id | TEXT | Foreign key to snapshot | +| extractor | TEXT | Extractor name | +| status | TEXT | queued, started, succeeded, failed, skipped, backoff | +| created_at | TEXT | ISO datetime | +| modified_at | TEXT | ISO datetime | +| start_ts | TEXT | ISO datetime when started | +| end_ts | TEXT | ISO datetime when finished | +| cmd | TEXT (JSON) | Command executed | +| pwd | TEXT | Working directory | +| cmd_version | TEXT | Binary version | +| output | TEXT | Output file path or result | +| retry_at | TEXT | ISO datetime for retry | +| config | TEXT (JSON) | Configuration | +| notes | TEXT | Extra notes | + +## Creating Custom Extractors + +Extractors are standalone executable files in the `extractors/` directory. + +### Extractor Contract + +1. **Executable**: File must have execute permissions (`chmod +x`) +2. **Shebang**: Must start with shebang (e.g., `#!/bin/bash`, `#!/usr/bin/env node`) +3. **First Argument**: Receives URL as `$1` (bash) or `process.argv[2]` (Node.js) or `sys.argv[1]` (Python) +4. **Working Directory**: Run in the output directory, write files there +5. **Environment Config**: Read all config from environment variables +6. **Exit Code**: Return 0 for success, non-zero for failure +7. **Output**: Print the main output file path to stdout +8. **Logging**: Print progress/errors to stderr +9. **Auto-Install**: Optionally auto-install dependencies on first run + +### Example Bash Extractor + +```bash +#!/bin/bash +# +# My Custom Extractor +# Description of what it does +# +# Config via environment variables: +# MY_TIMEOUT - Timeout in seconds (default: 30) +# + +set -e + +URL="$1" + +if [ -z "$URL" ]; then + echo "Error: URL argument required" >&2 + exit 1 +fi + +# Auto-install dependencies (optional) +if ! command -v some-tool &> /dev/null; then + echo "Installing some-tool..." >&2 + sudo apt-get install -y some-tool +fi + +# Get config from environment +TIMEOUT="${MY_TIMEOUT:-30}" + +echo "Processing $URL..." 
>&2 + +# Do the extraction work +some-tool --timeout "$TIMEOUT" "$URL" > output.txt + +echo "✓ Done" >&2 +echo "output.txt" +exit 0 +``` + +### Example Node.js Extractor + +```javascript +#!/usr/bin/env node +// +// My Custom Extractor +// Config via environment variables: +// MY_TIMEOUT - Timeout in ms +// + +const url = process.argv[2]; +if (!url) { + console.error('Error: URL argument required'); + process.exit(1); +} + +const timeout = parseInt(process.env.MY_TIMEOUT || '10000', 10); + +console.error(`Processing ${url}...`); + +// Do extraction work +// Write files to current directory + +console.error('✓ Done'); +console.log('output.txt'); +``` + +### Example Python Extractor + +```python +#!/usr/bin/env python3 +# +# My Custom Extractor +# Config via environment variables: +# MY_TIMEOUT - Timeout in seconds +# + +import sys +import os + +url = sys.argv[1] if len(sys.argv) > 1 else None +if not url: + print("Error: URL argument required", file=sys.stderr) + sys.exit(1) + +timeout = int(os.environ.get('MY_TIMEOUT', '30')) + +print(f"Processing {url}...", file=sys.stderr) + +# Do extraction work +# Write files to current directory + +print("✓ Done", file=sys.stderr) +print("output.txt") +``` + +## Available Extractors + +### favicon +- **Language**: Bash +- **Dependencies**: curl (auto-installed) +- **Output**: `favicon.ico` or `favicon.png` +- **Config**: + - `FAVICON_TIMEOUT` - Timeout in seconds (default: 10) + +### title +- **Language**: Node.js +- **Dependencies**: Built-in Node.js modules +- **Output**: `title.txt` +- **Config**: + - `TITLE_TIMEOUT` - Timeout in milliseconds (default: 10000) + - `TITLE_USER_AGENT` - User agent string + +### headers +- **Language**: Bash +- **Dependencies**: curl (auto-installed) +- **Output**: `headers.json` +- **Config**: + - `HEADERS_TIMEOUT` - Timeout in seconds (default: 10) + - `HEADERS_USER_AGENT` - User agent string + +### wget +- **Language**: Bash +- **Dependencies**: wget (auto-installed) +- **Output**: `warc/archive.warc.gz` and downloaded files +- **Config**: + - `WGET_TIMEOUT` - Timeout in seconds (default: 60) + - `WGET_USER_AGENT` - User agent string + - `WGET_ARGS` - Additional wget arguments + +### screenshot +- **Language**: Python +- **Dependencies**: playwright (auto-installed) +- **Output**: `screenshot.png` +- **Config**: + - `SCREENSHOT_TIMEOUT` - Timeout in milliseconds (default: 30000) + - `SCREENSHOT_WIDTH` - Viewport width (default: 1920) + - `SCREENSHOT_HEIGHT` - Viewport height (default: 1080) + - `SCREENSHOT_WAIT` - Wait time before screenshot in ms (default: 1000) + +## Development + +### Build + +```bash +npm run build +``` + +### Watch Mode + +```bash +npm run dev +``` + +### Project Structure + +- `src/models.ts` - TypeScript interfaces matching the database schema +- `src/db.ts` - Database layer with SQLite operations +- `src/extractors.ts` - Extractor discovery and orchestration +- `src/cli.ts` - CLI commands and application logic + +## Differences from Original ArchiveBox + +### Simplified + +1. **No Plugin System**: Instead of a complex ABX plugin framework, extractors are simple executable files +2. **Simpler Config**: Only environment variables, no configuration file parsing +3. **No Web UI**: Command-line only (for now) +4. **No Background Workers**: Direct execution (could be added) +5. **No User System**: Single-user mode + +### Architecture Improvements + +1. **Extractors are Standalone**: Each extractor can be tested independently +2. 
**Language Agnostic**: Write extractors in any language (bash, Python, Node.js, Go, etc.)
+3. **Easy to Extend**: Just drop an executable file in `extractors/` directory
+4. **Minimal Dependencies**: Core system only needs Node.js and SQLite
+
+## Future Enhancements
+
+- [ ] Background job queue for processing
+- [ ] Web UI for browsing archives
+- [ ] Search functionality
+- [ ] More extractors (pdf, dom, singlefile, readability, etc.)
+- [ ] Import/export functionality
+- [ ] Schedule automatic archiving
+- [ ] Browser extension integration
+
+## License
+
+MIT
+
+## Credits
+
+Based on [ArchiveBox](https://github.com/ArchiveBox/ArchiveBox) by Nick Sweeting and contributors.
diff --git a/archivebox-ts/extractors/favicon b/archivebox-ts/extractors/favicon
new file mode 100755
index 00000000..29a9dafd
--- /dev/null
+++ b/archivebox-ts/extractors/favicon
@@ -0,0 +1,79 @@
+#!/bin/bash
+#
+# Favicon Extractor
+# Downloads the favicon for a given URL
+#
+# Usage: favicon <url>
+# Output: favicon.ico or favicon.png in current directory
+# Config: All configuration via environment variables
+#   FAVICON_TIMEOUT - Timeout in seconds (default: 10)
+#
+
+set -e
+
+URL="$1"
+
+if [ -z "$URL" ]; then
+  echo "Error: URL argument required" >&2
+  exit 1
+fi
+
+# Auto-install dependencies
+if ! command -v curl &> /dev/null; then
+  echo "Installing curl..." >&2
+  if command -v apt-get &> /dev/null; then
+    sudo apt-get update && sudo apt-get install -y curl
+  elif command -v yum &> /dev/null; then
+    sudo yum install -y curl
+  elif command -v brew &> /dev/null; then
+    brew install curl
+  else
+    echo "Error: Cannot install curl. Please install manually." >&2
+    exit 1
+  fi
+fi
+
+# Configuration from environment
+TIMEOUT="${FAVICON_TIMEOUT:-10}"
+
+# Extract domain from URL
+DOMAIN=$(echo "$URL" | sed -e 's|^[^/]*//||' -e 's|/.*$||')
+
+echo "Extracting favicon for $DOMAIN..." >&2
+
+# Try common favicon locations
+FAVICON_URLS=(
+  "https://${DOMAIN}/favicon.ico"
+  "http://${DOMAIN}/favicon.ico"
+  "https://${DOMAIN}/favicon.png"
+  "http://${DOMAIN}/favicon.png"
+)
+
+SUCCESS=0
+
+for FAVICON_URL in "${FAVICON_URLS[@]}"; do
+  echo "Trying: $FAVICON_URL" >&2
+
+  # Determine output filename
+  EXT="${FAVICON_URL##*.}"
+  OUTPUT="favicon.${EXT}"
+
+  if curl -L -f -s --max-time "$TIMEOUT" -o "$OUTPUT" "$FAVICON_URL" 2>/dev/null; then
+    # Check if file is not empty
+    if [ -s "$OUTPUT" ]; then
+      echo "✓ Downloaded favicon: $OUTPUT" >&2
+      echo "$OUTPUT"
+      SUCCESS=1
+      break
+    else
+      rm -f "$OUTPUT"
+    fi
+  fi
+done
+
+if [ "$SUCCESS" -eq 0 ]; then
+  echo "Warning: Could not download favicon" >&2
+  exit 1
+fi
+
+exit 0
diff --git a/archivebox-ts/extractors/headers b/archivebox-ts/extractors/headers
new file mode 100755
index 00000000..a82d4801
--- /dev/null
+++ b/archivebox-ts/extractors/headers
@@ -0,0 +1,85 @@
+#!/bin/bash
+#
+# Headers Extractor
+# Extracts HTTP headers from a given URL
+#
+# Usage: headers <url>
+# Output: headers.json in current directory
+# Config: All configuration via environment variables
+#   HEADERS_TIMEOUT - Timeout in seconds (default: 10)
+#   HEADERS_USER_AGENT - User agent string
+#
+
+set -e
+
+URL="$1"
+
+if [ -z "$URL" ]; then
+  echo "Error: URL argument required" >&2
+  exit 1
+fi
+
+# Auto-install dependencies
+if ! command -v curl &> /dev/null; then
+  echo "Installing curl..." >&2
+  if command -v apt-get &> /dev/null; then
+    sudo apt-get update && sudo apt-get install -y curl
+  elif command -v yum &> /dev/null; then
+    sudo yum install -y curl
+  elif command -v brew &> /dev/null; then
+    brew install curl
+  else
+    echo "Error: Cannot install curl. Please install manually." >&2
+    exit 1
+  fi
+fi
+
+# Configuration from environment
+TIMEOUT="${HEADERS_TIMEOUT:-10}"
+USER_AGENT="${HEADERS_USER_AGENT:-Mozilla/5.0 (compatible; ArchiveBox-TS/0.1)}"
+
+echo "Extracting headers from: $URL" >&2
+
+# Get headers using curl
+HEADERS=$(curl -I -L -s --max-time "$TIMEOUT" --user-agent "$USER_AGENT" "$URL" 2>&1 || echo "")
+
+if [ -z "$HEADERS" ]; then
+  echo "Error: Failed to fetch headers" >&2
+  exit 1
+fi
+
+# Convert headers to JSON format (simple key-value pairs)
+echo "{" > headers.json
+
+# Parse headers line by line
+FIRST=1
+while IFS=: read -r key value; do
+  # Skip empty lines and HTTP status line
+  if [ -z "$key" ] || [[ "$key" =~ ^HTTP ]]; then
+    continue
+  fi
+
+  # Clean up key and value
+  key=$(echo "$key" | tr -d '\r\n' | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
+  value=$(echo "$value" | tr -d '\r\n' | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
+
+  if [ -n "$key" ] && [ -n "$value" ]; then
+    # Escape quotes in value
+    value=$(echo "$value" | sed 's/"/\\"/g')
+
+    # Add comma if not first entry
+    if [ "$FIRST" -eq 0 ]; then
+      echo "," >> headers.json
+    fi
+
+    echo -n "  \"$key\": \"$value\"" >> headers.json
+    FIRST=0
+  fi
+done <<< "$HEADERS"
+
+echo "" >> headers.json
+echo "}" >> headers.json
+
+echo "✓ Extracted headers to headers.json" >&2
+echo "headers.json"
+exit 0
diff --git a/archivebox-ts/extractors/screenshot b/archivebox-ts/extractors/screenshot
new file mode 100755
index 00000000..6afb39b9
--- /dev/null
+++ b/archivebox-ts/extractors/screenshot
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+#
+# Screenshot Extractor
+# Captures a screenshot of a given URL using Playwright
+#
+# Usage: screenshot <url>
+# Output: screenshot.png in current directory
+# Config: All configuration via environment variables
+#   SCREENSHOT_TIMEOUT - Timeout in milliseconds (default: 30000)
+#   SCREENSHOT_WIDTH - Viewport width (default: 1920)
+#   SCREENSHOT_HEIGHT - Viewport height (default: 1080)
+#   SCREENSHOT_WAIT - Time to wait before screenshot in ms (default: 1000)
+#
+
+import sys
+import os
+import subprocess
+
+def ensure_playwright():
+    """Auto-install playwright if not available"""
+    try:
+        from playwright.sync_api import sync_playwright
+        return True
+    except ImportError:
+        print("Installing playwright...", file=sys.stderr)
+        try:
+            subprocess.check_call([sys.executable, "-m", "pip", "install", "playwright"])
+            subprocess.check_call([sys.executable, "-m", "playwright", "install", "chromium"])
+            from playwright.sync_api import sync_playwright
+            return True
+        except Exception as e:
+            print(f"Error installing playwright: {e}", file=sys.stderr)
+            return False
+
+def main():
+    if len(sys.argv) < 2:
+        print("Error: URL argument required", file=sys.stderr)
+        sys.exit(1)
+
+    url = sys.argv[1]
+
+    # Configuration from environment
+    timeout = int(os.environ.get('SCREENSHOT_TIMEOUT', '30000'))
+    width = int(os.environ.get('SCREENSHOT_WIDTH', '1920'))
+    height = int(os.environ.get('SCREENSHOT_HEIGHT', '1080'))
+    wait = int(os.environ.get('SCREENSHOT_WAIT', '1000'))
+
+    print(f"Capturing screenshot of: {url}", file=sys.stderr)
+
+    # Ensure playwright is installed
+    if not ensure_playwright():
+        sys.exit(1)
+
+    from playwright.sync_api import sync_playwright
+
+    try:
+        with sync_playwright() as p:
+            browser = p.chromium.launch()
+            page = browser.new_page(viewport={'width': width, 'height': height})
+            page.goto(url, timeout=timeout, wait_until='networkidle')
+
+            # Wait a bit for any dynamic content
+            page.wait_for_timeout(wait)
+
+            page.screenshot(path='screenshot.png', full_page=True)
+            browser.close()
+
+        print("✓ Captured screenshot: screenshot.png", file=sys.stderr)
+        print("screenshot.png")
+        sys.exit(0)
+
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+
+if __name__ == '__main__':
+    main()
diff --git a/archivebox-ts/extractors/title b/archivebox-ts/extractors/title
new file mode 100755
index 00000000..4321d923
--- /dev/null
+++ b/archivebox-ts/extractors/title
@@ -0,0 +1,89 @@
+#!/usr/bin/env node
+//
+// Title Extractor
+// Extracts the page title from a given URL
+//
+// Usage: title <url>
+// Output: title.txt in current directory
+// Config: All configuration via environment variables
+//   TITLE_TIMEOUT - Timeout in milliseconds (default: 10000)
+//   TITLE_USER_AGENT - User agent string
+//
+
+const https = require('https');
+const http = require('http');
+const fs = require('fs');
+const { URL } = require('url');
+
+const url = process.argv[2];
+
+if (!url) {
+  console.error('Error: URL argument required');
+  process.exit(1);
+}
+
+// Configuration from environment
+const TIMEOUT = parseInt(process.env.TITLE_TIMEOUT || '10000', 10);
+const USER_AGENT = process.env.TITLE_USER_AGENT || 'Mozilla/5.0 (compatible; ArchiveBox-TS/0.1)';
+
+console.error(`Extracting title from: ${url}`);
+
+// Parse URL
+let parsedUrl;
+try {
+  parsedUrl = new URL(url);
+} catch (err) {
+  console.error(`Error: Invalid URL: ${err.message}`);
+  process.exit(1);
+}
+
+// Choose http or https module
+const client = parsedUrl.protocol === 'https:' ? https : http;
+
+// Make request
+const options = {
+  headers: {
+    'User-Agent': USER_AGENT,
+  },
+  timeout: TIMEOUT,
+};
+
+client.get(url, options, (res) => {
+  let html = '';
+  let finished = false;
+
+  const finish = () => {
+    if (finished) return;
+    finished = true;
+
+    // Extract title using regex
+    const titleMatch = html.match(/<title[^>]*>(.*?)<\/title>/is);
+
+    if (titleMatch && titleMatch[1]) {
+      const title = titleMatch[1]
+        .replace(/<[^>]*>/g, '')  // Remove any HTML tags
+        .replace(/\s+/g, ' ')     // Normalize whitespace
+        .trim();
+
+      // Write to file
+      fs.writeFileSync('title.txt', title, 'utf8');
+      console.error(`✓ Extracted title: ${title}`);
+      console.log('title.txt');
+      process.exit(0);
+    } else {
+      console.error('Warning: Could not find title tag');
+      process.exit(1);
+    }
+  };
+
+  res.on('data', (chunk) => {
+    html += chunk;
+
+    // Early exit once the closing title tag has arrived (optimization)
+    if (html.includes('</title>')) {
+      res.destroy();
+      finish(); // 'end' does not fire after destroy()
+    }
+  });
+
+  res.on('end', finish);
+}).on('error', (err) => {
+  console.error(`Error: ${err.message}`);
+  process.exit(1);
+}).on('timeout', () => {
+  console.error('Error: Request timeout');
+  process.exit(1);
+});
diff --git a/archivebox-ts/extractors/wget b/archivebox-ts/extractors/wget
new file mode 100755
index 00000000..2d79d451
--- /dev/null
+++ b/archivebox-ts/extractors/wget
@@ -0,0 +1,75 @@
+#!/bin/bash
+#
+# Wget Extractor
+# Downloads a complete copy of the page using wget
+#
+# Usage: wget <url>
+# Output: Files in current directory
+# Config: All configuration via environment variables
+#   WGET_TIMEOUT - Timeout in seconds (default: 60)
+#   WGET_USER_AGENT - User agent string
+#   WGET_ARGS - Additional wget arguments
+#
+
+set -e
+
+URL="$1"
+
+if [ -z "$URL" ]; then
+  echo "Error: URL argument required" >&2
+  exit 1
+fi
+
+# Auto-install dependencies
+if ! 
command -v wget &> /dev/null; then + echo "Installing wget..." >&2 + if command -v apt-get &> /dev/null; then + sudo apt-get update && sudo apt-get install -y wget + elif command -v yum &> /dev/null; then + sudo yum install -y wget + elif command -v brew &> /dev/null; then + brew install wget + else + echo "Error: Cannot install wget. Please install manually." >&2 + exit 1 + fi +fi + +# Configuration from environment +TIMEOUT="${WGET_TIMEOUT:-60}" +USER_AGENT="${WGET_USER_AGENT:-Mozilla/5.0 (compatible; ArchiveBox-TS/0.1)}" +EXTRA_ARGS="${WGET_ARGS:-}" + +echo "Downloading with wget: $URL" >&2 + +# Create warc directory +mkdir -p warc + +# Run wget with WARC output +wget \ + --timeout="$TIMEOUT" \ + --user-agent="$USER_AGENT" \ + --adjust-extension \ + --convert-links \ + --page-requisites \ + --span-hosts \ + --no-parent \ + --warc-file="warc/archive" \ + --warc-cdx \ + $EXTRA_ARGS \ + "$URL" 2>&1 || true + +if [ -f "warc/archive.warc.gz" ]; then + echo "✓ Created WARC archive" >&2 + echo "warc/archive.warc.gz" + exit 0 +else + echo "Warning: WARC file not created" >&2 + # Still succeed if we downloaded something + if [ -n "$(ls -A 2>/dev/null)" ]; then + echo "✓ Downloaded files" >&2 + echo "." + exit 0 + fi + exit 1 +fi diff --git a/archivebox-ts/package-lock.json b/archivebox-ts/package-lock.json new file mode 100644 index 00000000..67698d0b --- /dev/null +++ b/archivebox-ts/package-lock.json @@ -0,0 +1,542 @@ +{ + "name": "archivebox-ts", + "version": "0.1.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "archivebox-ts", + "version": "0.1.0", + "license": "MIT", + "dependencies": { + "better-sqlite3": "^11.0.0", + "commander": "^12.0.0", + "nanoid": "^3.3.7" + }, + "bin": { + "archivebox-ts": "dist/cli.js" + }, + "devDependencies": { + "@types/better-sqlite3": "^7.6.9", + "@types/node": "^20.11.0", + "typescript": "^5.3.3" + } + }, + "node_modules/@types/better-sqlite3": { + "version": "7.6.13", + "resolved": "https://registry.npmjs.org/@types/better-sqlite3/-/better-sqlite3-7.6.13.tgz", + "integrity": "sha512-NMv9ASNARoKksWtsq/SHakpYAYnhBrQgGD8zkLYk/jaK8jUGn08CfEdTRgYhMypUQAfzSP8W6gNLe0q19/t4VA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/node": "*" + } + }, + "node_modules/@types/node": { + "version": "20.19.24", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.24.tgz", + "integrity": "sha512-FE5u0ezmi6y9OZEzlJfg37mqqf6ZDSF2V/NLjUyGrR9uTZ7Sb9F7bLNZ03S4XVUNRWGA7Ck4c1kK+YnuWjl+DA==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~6.21.0" + } + }, + "node_modules/base64-js": { + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", + "integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, + "node_modules/better-sqlite3": { + "version": "11.10.0", + "resolved": "https://registry.npmjs.org/better-sqlite3/-/better-sqlite3-11.10.0.tgz", + "integrity": "sha512-EwhOpyXiOEL/lKzHz9AW1msWFNzGc/z+LzeB3/jnFJpxu+th2yqvzsSWas1v9jgs9+xiXJcD5A8CJxAG2TaghQ==", + "hasInstallScript": true, + "license": "MIT", + "dependencies": { + "bindings": "^1.5.0", + "prebuild-install": "^7.1.1" + } + }, + "node_modules/bindings": { + "version": "1.5.0", + 
"resolved": "https://registry.npmjs.org/bindings/-/bindings-1.5.0.tgz", + "integrity": "sha512-p2q/t/mhvuOj/UeLlV6566GD/guowlr0hHxClI0W9m7MWYkL1F0hLo+0Aexs9HSPCtR1SXQ0TD3MMKrXZajbiQ==", + "license": "MIT", + "dependencies": { + "file-uri-to-path": "1.0.0" + } + }, + "node_modules/bl": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz", + "integrity": "sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==", + "license": "MIT", + "dependencies": { + "buffer": "^5.5.0", + "inherits": "^2.0.4", + "readable-stream": "^3.4.0" + } + }, + "node_modules/buffer": { + "version": "5.7.1", + "resolved": "https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz", + "integrity": "sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT", + "dependencies": { + "base64-js": "^1.3.1", + "ieee754": "^1.1.13" + } + }, + "node_modules/chownr": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz", + "integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==", + "license": "ISC" + }, + "node_modules/commander": { + "version": "12.1.0", + "resolved": "https://registry.npmjs.org/commander/-/commander-12.1.0.tgz", + "integrity": "sha512-Vw8qHK3bZM9y/P10u3Vib8o/DdkvA2OtPtZvD871QKjy74Wj1WSKFILMPRPSdUSx5RFK1arlJzEtA4PkFgnbuA==", + "license": "MIT", + "engines": { + "node": ">=18" + } + }, + "node_modules/decompress-response": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/decompress-response/-/decompress-response-6.0.0.tgz", + "integrity": "sha512-aW35yZM6Bb/4oJlZncMH2LCoZtJXTRxES17vE3hoRiowU2kWHaJKFkSBDnDR+cm9J+9QhXmREyIfv0pji9ejCQ==", + "license": "MIT", + "dependencies": { + "mimic-response": "^3.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/deep-extend": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/deep-extend/-/deep-extend-0.6.0.tgz", + "integrity": "sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA==", + "license": "MIT", + "engines": { + "node": ">=4.0.0" + } + }, + "node_modules/detect-libc": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.1.2.tgz", + "integrity": "sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ==", + "license": "Apache-2.0", + "engines": { + "node": ">=8" + } + }, + "node_modules/end-of-stream": { + "version": "1.4.5", + "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz", + "integrity": "sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg==", + "license": "MIT", + "dependencies": { + "once": "^1.4.0" + } + }, + "node_modules/expand-template": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/expand-template/-/expand-template-2.0.3.tgz", + "integrity": "sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==", + "license": "(MIT OR WTFPL)", + "engines": { + "node": ">=6" + } + }, + "node_modules/file-uri-to-path": { + "version": "1.0.0", + "resolved": 
"https://registry.npmjs.org/file-uri-to-path/-/file-uri-to-path-1.0.0.tgz", + "integrity": "sha512-0Zt+s3L7Vf1biwWZ29aARiVYLx7iMGnEUl9x33fbB/j3jR81u/O2LbqK+Bm1CDSNDKVtJ/YjwY7TUd5SkeLQLw==", + "license": "MIT" + }, + "node_modules/fs-constants": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz", + "integrity": "sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==", + "license": "MIT" + }, + "node_modules/github-from-package": { + "version": "0.0.0", + "resolved": "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz", + "integrity": "sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==", + "license": "MIT" + }, + "node_modules/ieee754": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz", + "integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "BSD-3-Clause" + }, + "node_modules/inherits": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", + "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==", + "license": "ISC" + }, + "node_modules/ini": { + "version": "1.3.8", + "resolved": "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz", + "integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==", + "license": "ISC" + }, + "node_modules/mimic-response": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz", + "integrity": "sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==", + "license": "MIT", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/minimist": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz", + "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==", + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/mkdirp-classic": { + "version": "0.5.3", + "resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz", + "integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==", + "license": "MIT" + }, + "node_modules/nanoid": { + "version": "3.3.11", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz", + "integrity": "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "bin": { + "nanoid": "bin/nanoid.cjs" + }, + "engines": { + "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" + } + }, + "node_modules/napi-build-utils": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/napi-build-utils/-/napi-build-utils-2.0.0.tgz", + "integrity": "sha512-GEbrYkbfF7MoNaoh2iGG84Mnf/WZfB0GdGEsM8wz7Expx/LlWf5U8t9nvJKXSp3qr5IsEbK04cBGhol/KwOsWA==", + "license": "MIT" + }, 
+ "node_modules/node-abi": { + "version": "3.80.0", + "resolved": "https://registry.npmjs.org/node-abi/-/node-abi-3.80.0.tgz", + "integrity": "sha512-LyPuZJcI9HVwzXK1GPxWNzrr+vr8Hp/3UqlmWxxh8p54U1ZbclOqbSog9lWHaCX+dBaiGi6n/hIX+mKu74GmPA==", + "license": "MIT", + "dependencies": { + "semver": "^7.3.5" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/once": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", + "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==", + "license": "ISC", + "dependencies": { + "wrappy": "1" + } + }, + "node_modules/prebuild-install": { + "version": "7.1.3", + "resolved": "https://registry.npmjs.org/prebuild-install/-/prebuild-install-7.1.3.tgz", + "integrity": "sha512-8Mf2cbV7x1cXPUILADGI3wuhfqWvtiLA1iclTDbFRZkgRQS0NqsPZphna9V+HyTEadheuPmjaJMsbzKQFOzLug==", + "license": "MIT", + "dependencies": { + "detect-libc": "^2.0.0", + "expand-template": "^2.0.3", + "github-from-package": "0.0.0", + "minimist": "^1.2.3", + "mkdirp-classic": "^0.5.3", + "napi-build-utils": "^2.0.0", + "node-abi": "^3.3.0", + "pump": "^3.0.0", + "rc": "^1.2.7", + "simple-get": "^4.0.0", + "tar-fs": "^2.0.0", + "tunnel-agent": "^0.6.0" + }, + "bin": { + "prebuild-install": "bin.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/pump": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.3.tgz", + "integrity": "sha512-todwxLMY7/heScKmntwQG8CXVkWUOdYxIvY2s0VWAAMh/nd8SoYiRaKjlr7+iCs984f2P8zvrfWcDDYVb73NfA==", + "license": "MIT", + "dependencies": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } + }, + "node_modules/rc": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/rc/-/rc-1.2.8.tgz", + "integrity": "sha512-y3bGgqKj3QBdxLbLkomlohkvsA8gdAiUQlSBJnBhfn+BPxg4bc62d8TcBW15wavDfgexCgccckhcZvywyQYPOw==", + "license": "(BSD-2-Clause OR MIT OR Apache-2.0)", + "dependencies": { + "deep-extend": "^0.6.0", + "ini": "~1.3.0", + "minimist": "^1.2.0", + "strip-json-comments": "~2.0.1" + }, + "bin": { + "rc": "cli.js" + } + }, + "node_modules/readable-stream": { + "version": "3.6.2", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz", + "integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==", + "license": "MIT", + "dependencies": { + "inherits": "^2.0.3", + "string_decoder": "^1.1.1", + "util-deprecate": "^1.0.1" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/safe-buffer": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", + "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, + "node_modules/semver": { + "version": "7.7.3", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz", + "integrity": "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==", + "license": "ISC", + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/simple-concat": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/simple-concat/-/simple-concat-1.0.1.tgz", + "integrity": 
"sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, + "node_modules/simple-get": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/simple-get/-/simple-get-4.0.1.tgz", + "integrity": "sha512-brv7p5WgH0jmQJr1ZDDfKDOSeWWg+OVypG99A/5vYGPqJ6pxiaHLy8nxtFjBA7oMa01ebA9gfh1uMCFqOuXxvA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT", + "dependencies": { + "decompress-response": "^6.0.0", + "once": "^1.3.1", + "simple-concat": "^1.0.0" + } + }, + "node_modules/string_decoder": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz", + "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==", + "license": "MIT", + "dependencies": { + "safe-buffer": "~5.2.0" + } + }, + "node_modules/strip-json-comments": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-2.0.1.tgz", + "integrity": "sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ==", + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/tar-fs": { + "version": "2.1.4", + "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.4.tgz", + "integrity": "sha512-mDAjwmZdh7LTT6pNleZ05Yt65HC3E+NiQzl672vQG38jIrehtJk/J3mNwIg+vShQPcLF/LV7CMnDW6vjj6sfYQ==", + "license": "MIT", + "dependencies": { + "chownr": "^1.1.1", + "mkdirp-classic": "^0.5.2", + "pump": "^3.0.0", + "tar-stream": "^2.1.4" + } + }, + "node_modules/tar-stream": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-2.2.0.tgz", + "integrity": "sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==", + "license": "MIT", + "dependencies": { + "bl": "^4.0.3", + "end-of-stream": "^1.4.1", + "fs-constants": "^1.0.0", + "inherits": "^2.0.3", + "readable-stream": "^3.1.1" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/tunnel-agent": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz", + "integrity": "sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==", + "license": "Apache-2.0", + "dependencies": { + "safe-buffer": "^5.0.1" + }, + "engines": { + "node": "*" + } + }, + "node_modules/typescript": { + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", + "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + "dev": true, + "license": "MIT" + }, + 
"node_modules/util-deprecate": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", + "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==", + "license": "MIT" + }, + "node_modules/wrappy": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", + "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==", + "license": "ISC" + } + } +} diff --git a/archivebox-ts/package.json b/archivebox-ts/package.json new file mode 100644 index 00000000..6258b391 --- /dev/null +++ b/archivebox-ts/package.json @@ -0,0 +1,32 @@ +{ + "name": "archivebox-ts", + "version": "0.1.0", + "description": "TypeScript-based version of ArchiveBox with simplified architecture", + "main": "dist/cli.js", + "bin": { + "archivebox-ts": "./dist/cli.js" + }, + "scripts": { + "build": "tsc", + "dev": "tsc --watch", + "start": "node dist/cli.js", + "test": "echo \"Error: no test specified\" && exit 1" + }, + "keywords": [ + "archiving", + "web-archiving", + "snapshot" + ], + "author": "", + "license": "MIT", + "dependencies": { + "better-sqlite3": "^11.0.0", + "commander": "^12.0.0", + "nanoid": "^3.3.7" + }, + "devDependencies": { + "@types/better-sqlite3": "^7.6.9", + "@types/node": "^20.11.0", + "typescript": "^5.3.3" + } +} diff --git a/archivebox-ts/src/cli.ts b/archivebox-ts/src/cli.ts new file mode 100644 index 00000000..81d3c30b --- /dev/null +++ b/archivebox-ts/src/cli.ts @@ -0,0 +1,312 @@ +#!/usr/bin/env node + +/** + * ArchiveBox TypeScript - Main CLI + */ + +import { Command } from 'commander'; +import * as path from 'path'; +import * as fs from 'fs'; +import { ArchiveDatabase } from './db'; +import { ExtractorManager } from './extractors'; +import type { ExtractorName } from './models'; + +const program = new Command(); + +// Default paths +const DEFAULT_DATA_DIR = path.join(process.cwd(), 'data'); +const DEFAULT_DB_PATH = path.join(DEFAULT_DATA_DIR, 'index.sqlite3'); +const DEFAULT_ARCHIVE_DIR = path.join(DEFAULT_DATA_DIR, 'archive'); +const EXTRACTORS_DIR = path.join(__dirname, '..', 'extractors'); + +// Helper to ensure data directory exists +function ensureDataDir(dataDir: string): void { + if (!fs.existsSync(dataDir)) { + fs.mkdirSync(dataDir, { recursive: true }); + } +} + +// Helper to get snapshot output directory +function getSnapshotOutputDir(archiveDir: string, snapshotId: string, url: string): string { + const urlObj = new URL(url); + const domain = urlObj.hostname; + const timestamp = Date.now().toString(); + + // Create directory structure: archive/_ + const dirName = `${timestamp}_${domain}`; + const outputDir = path.join(archiveDir, dirName); + + return outputDir; +} + +program + .name('archivebox-ts') + .description('TypeScript-based version of ArchiveBox with simplified architecture') + .version('0.1.0'); + +// Initialize command +program + .command('init') + .description('Initialize ArchiveBox data directory and database') + .option('-d, --data-dir ', 'Data directory path', DEFAULT_DATA_DIR) + .action((options) => { + const dataDir = options.dataDir; + const dbPath = path.join(dataDir, 'index.sqlite3'); + const archiveDir = path.join(dataDir, 'archive'); + + console.log('Initializing ArchiveBox...'); + console.log(`Data directory: ${dataDir}`); + console.log(`Database: ${dbPath}`); + console.log(`Archive directory: ${archiveDir}`); + + ensureDataDir(dataDir); + ensureDataDir(archiveDir); + + 
const db = new ArchiveDatabase(dbPath); + db.close(); + + console.log('✓ Initialization complete!'); + }); + +// Add command +program + .command('add') + .description('Add a URL to archive') + .argument('<url>', 'URL to archive') + .option('-d, --data-dir <path>', 'Data directory path', DEFAULT_DATA_DIR) + .option('-e, --extractors <extractors>', 'Comma-separated list of extractors to run (default: all)') + .option('--title <title>', 'Page title') + .action(async (url, options) => { + const dataDir = options.dataDir; + const dbPath = path.join(dataDir, 'index.sqlite3'); + const archiveDir = path.join(dataDir, 'archive'); + + ensureDataDir(dataDir); + ensureDataDir(archiveDir); + + const db = new ArchiveDatabase(dbPath); + const extractorManager = new ExtractorManager(EXTRACTORS_DIR); + + try { + console.log(`Adding URL: ${url}`); + + // Check if URL already exists + let snapshot = db.getSnapshotByUrl(url); + if (snapshot) { + console.log(`URL already exists with ID: ${snapshot.id}`); + } else { + // Create new snapshot + snapshot = db.createSnapshot({ + url, + title: options.title, + }); + console.log(`✓ Created snapshot: ${snapshot.id}`); + } + + // Determine which extractors to run + const availableExtractors = extractorManager.getAvailableExtractors(); + let extractorsToRun: ExtractorName[]; + + if (options.extractors) { + extractorsToRun = options.extractors.split(',').map((e: string) => e.trim() as ExtractorName); + // Validate extractors + for (const extractor of extractorsToRun) { + if (!extractorManager.hasExtractor(extractor)) { + console.warn(`Warning: Extractor not found: ${extractor}`); + } + } + } else { + extractorsToRun = availableExtractors; + } + + if (extractorsToRun.length === 0) { + console.log('No extractors available. Place extractor executables in the extractors/ directory.'); + return; + } + + console.log(`Running extractors: ${extractorsToRun.join(', ')}`); + + // Update snapshot status + db.updateSnapshotStatus(snapshot.id, 'started'); + + // Create output directory + const outputDir = getSnapshotOutputDir(archiveDir, snapshot.id, url); + fs.mkdirSync(outputDir, { recursive: true }); + db.setSnapshotOutputDir(snapshot.id, outputDir); + + console.log(`Output directory: ${outputDir}`); + + // Create archive results for each extractor + const archiveResults = new Map<ExtractorName, string>(); + for (const extractor of extractorsToRun) { + if (extractorManager.hasExtractor(extractor)) { + const result = db.createArchiveResult({ + snapshot_id: snapshot.id, + extractor, + }); + archiveResults.set(extractor, result.id); + } + } + + // Run extractors + const results = await extractorManager.runExtractors( + extractorsToRun, + url, + outputDir, + {} // Environment variables can be passed here + ); + + // Update archive results + for (const [extractorName, result] of results.entries()) { + const resultId = archiveResults.get(extractorName); + if (resultId) { + db.updateArchiveResult(resultId, { + status: result.success ? 'succeeded' : 'failed', + start_ts: result.start_ts, + end_ts: result.end_ts, + cmd: result.cmd, + pwd: result.pwd, + output: result.output, + notes: result.error || '', + }); + + const status = result.success ? '✓' : '✗'; + console.log(`  ${status} ${extractorName}: ${result.success ? 
'succeeded' : 'failed'}`); + if (result.error) { + console.log(` Error: ${result.error}`); + } + } + } + + // Update snapshot status + db.updateSnapshotStatus(snapshot.id, 'sealed', new Date().toISOString()); + + console.log(`✓ Archiving complete!`); + console.log(`Snapshot ID: ${snapshot.id}`); + console.log(`Output: ${outputDir}`); + + } catch (err) { + console.error('Error:', err instanceof Error ? err.message : err); + process.exit(1); + } finally { + db.close(); + } + }); + +// List command +program + .command('list') + .description('List all archived snapshots') + .option('-d, --data-dir <path>', 'Data directory path', DEFAULT_DATA_DIR) + .option('-l, --limit <number>', 'Number of snapshots to show', '20') + .option('-o, --offset <number>', 'Offset for pagination', '0') + .action((options) => { + const dataDir = options.dataDir; + const dbPath = path.join(dataDir, 'index.sqlite3'); + + const db = new ArchiveDatabase(dbPath); + + try { + const snapshots = db.getAllSnapshots( + parseInt(options.limit), + parseInt(options.offset) + ); + + if (snapshots.length === 0) { + console.log('No snapshots found.'); + return; + } + + console.log(`\nFound ${snapshots.length} snapshot(s):\n`); + + for (const snapshot of snapshots) { + console.log(`ID: ${snapshot.id}`); + console.log(`URL: ${snapshot.url}`); + console.log(`Title: ${snapshot.title || '(none)'}`); + console.log(`Status: ${snapshot.status}`); + console.log(`Created: ${snapshot.created_at}`); + console.log(`Output: ${snapshot.output_dir || '(none)'}`); + console.log('---'); + } + } catch (err) { + console.error('Error:', err instanceof Error ? err.message : err); + process.exit(1); + } finally { + db.close(); + } + }); + +// Status command +program + .command('status') + .description('Show status of a snapshot') + .argument('<id>', 'Snapshot ID') + .option('-d, --data-dir <path>', 'Data directory path', DEFAULT_DATA_DIR) + .action((id, options) => { + const dataDir = options.dataDir; + const dbPath = path.join(dataDir, 'index.sqlite3'); + + const db = new ArchiveDatabase(dbPath); + + try { + const snapshot = db.getSnapshot(id); + if (!snapshot) { + console.error(`Snapshot not found: ${id}`); + process.exit(1); + } + + console.log(`\nSnapshot: ${snapshot.id}`); + console.log(`URL: ${snapshot.url}`); + console.log(`Title: ${snapshot.title || '(none)'}`); + console.log(`Status: ${snapshot.status}`); + console.log(`Created: ${snapshot.created_at}`); + console.log(`Downloaded: ${snapshot.downloaded_at || '(in progress)'}`); + console.log(`Output: ${snapshot.output_dir || '(none)'}`); + + const results = db.getArchiveResults(snapshot.id); + if (results.length > 0) { + console.log(`\nArchive Results (${results.length}):`); + for (const result of results) { + const statusIcon = result.status === 'succeeded' ? '✓' : + result.status === 'failed' ? '✗' : + result.status === 'started' ? '⋯' : '○'; + console.log(` ${statusIcon} ${result.extractor}: ${result.status}`); + if (result.output) { + console.log(` Output: ${result.output}`); + } + if (result.notes) { + console.log(` Notes: ${result.notes}`); + } + } + } + } catch (err) { + console.error('Error:', err instanceof Error ? 
err.message : err); + process.exit(1); + } finally { + db.close(); + } + }); + +// Extractors command +program + .command('extractors') + .description('List available extractors') + .action(() => { + const extractorManager = new ExtractorManager(EXTRACTORS_DIR); + const extractors = extractorManager.getAvailableExtractors(); + + if (extractors.length === 0) { + console.log('No extractors found.'); + console.log(`Place executable files in: ${EXTRACTORS_DIR}`); + return; + } + + console.log(`\nAvailable extractors (${extractors.length}):\n`); + for (const extractor of extractors) { + console.log(`  - ${extractor}`); + } + console.log(); + }); + +program.parse(); diff --git a/archivebox-ts/src/db.ts b/archivebox-ts/src/db.ts new file mode 100644 index 00000000..2f710d17 --- /dev/null +++ b/archivebox-ts/src/db.ts @@ -0,0 +1,373 @@ +/** + * Database layer using SQLite with schema matching ArchiveBox + */ + +import Database from 'better-sqlite3'; +import { randomUUID } from 'crypto'; +import { nanoid } from 'nanoid'; +import * as path from 'path'; +import * as fs from 'fs'; +import type { + Snapshot, + ArchiveResult, + CreateSnapshotInput, + CreateArchiveResultInput, + SnapshotStatus, + ArchiveResultStatus, +} from './models'; + +export class ArchiveDatabase { + private db: Database.Database; + + constructor(dbPath: string) { + // Ensure the directory exists + const dir = path.dirname(dbPath); + if (!fs.existsSync(dir)) { + fs.mkdirSync(dir, { recursive: true }); + } + + this.db = new Database(dbPath); + this.db.pragma('journal_mode = WAL'); + this.initSchema(); + } + + private initSchema(): void { + // Create snapshots table (simplified from ArchiveBox schema) + this.db.exec(` + CREATE TABLE IF NOT EXISTS snapshots ( + id TEXT PRIMARY KEY, + abid TEXT NOT NULL UNIQUE, + url TEXT NOT NULL UNIQUE, + timestamp TEXT NOT NULL UNIQUE, + title TEXT, + created_at TEXT NOT NULL, + bookmarked_at TEXT NOT NULL, + downloaded_at TEXT, + modified_at TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'queued', + retry_at TEXT NOT NULL, + config TEXT NOT NULL DEFAULT '{}', + notes TEXT NOT NULL DEFAULT '', + output_dir TEXT + ); + + CREATE INDEX IF NOT EXISTS idx_snapshots_url ON snapshots(url); + CREATE INDEX IF NOT EXISTS idx_snapshots_timestamp ON snapshots(timestamp); + CREATE INDEX IF NOT EXISTS idx_snapshots_created_at ON snapshots(created_at); + CREATE INDEX IF NOT EXISTS idx_snapshots_status ON snapshots(status); + `); + + // Create archive_results table (simplified from ArchiveBox schema) + this.db.exec(` + CREATE TABLE IF NOT EXISTS archive_results ( + id TEXT PRIMARY KEY, + abid TEXT NOT NULL UNIQUE, + snapshot_id TEXT NOT NULL, + extractor TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'queued', + created_at TEXT NOT NULL, + modified_at TEXT NOT NULL, + start_ts TEXT, + end_ts TEXT, + cmd TEXT, + pwd TEXT, + cmd_version TEXT, + output TEXT, + retry_at TEXT NOT NULL, + config TEXT NOT NULL DEFAULT '{}', + notes TEXT NOT NULL DEFAULT '', + FOREIGN KEY (snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE + ); + + CREATE INDEX IF NOT EXISTS idx_archive_results_snapshot_id ON archive_results(snapshot_id); + CREATE INDEX IF NOT EXISTS idx_archive_results_extractor ON archive_results(extractor); + CREATE INDEX IF NOT EXISTS idx_archive_results_status ON archive_results(status); + CREATE INDEX IF NOT EXISTS idx_archive_results_created_at ON archive_results(created_at); + `); + } + + /** + * Generate ABID (ArchiveBox ID) similar to ArchiveBox's prefixed IDs + */ + private generateABID(prefix: string, url: 
string): string { + const randomPart = nanoid(8); + return `${prefix}${randomPart}`; + } + + /** + * Create a new snapshot + */ + createSnapshot(input: CreateSnapshotInput): Snapshot { + const now = new Date().toISOString(); + const timestamp = Date.now().toString(); + const id = randomUUID(); + const abid = this.generateABID('snp_', input.url); + + const snapshot: Snapshot = { + id, + abid, + url: input.url, + timestamp, + title: input.title || null, + created_at: now, + bookmarked_at: input.bookmarked_at || now, + downloaded_at: null, + modified_at: now, + status: 'queued', + retry_at: now, + config: JSON.stringify(input.config || {}), + notes: input.notes || '', + output_dir: null, + } as any; + + const stmt = this.db.prepare(` + INSERT INTO snapshots ( + id, abid, url, timestamp, title, created_at, bookmarked_at, + downloaded_at, modified_at, status, retry_at, config, notes, output_dir + ) VALUES ( + @id, @abid, @url, @timestamp, @title, @created_at, @bookmarked_at, + @downloaded_at, @modified_at, @status, @retry_at, @config, @notes, @output_dir + ) + `); + + stmt.run(snapshot); + return this.getSnapshot(id)!; + } + + /** + * Get a snapshot by ID + */ + getSnapshot(id: string): Snapshot | null { + const stmt = this.db.prepare('SELECT * FROM snapshots WHERE id = ?'); + const row = stmt.get(id) as any; + if (!row) return null; + + return { + ...row, + config: JSON.parse(row.config || '{}'), + } as Snapshot; + } + + /** + * Get a snapshot by URL + */ + getSnapshotByUrl(url: string): Snapshot | null { + const stmt = this.db.prepare('SELECT * FROM snapshots WHERE url = ?'); + const row = stmt.get(url) as any; + if (!row) return null; + + return { + ...row, + config: JSON.parse(row.config || '{}'), + } as Snapshot; + } + + /** + * Get all snapshots + */ + getAllSnapshots(limit: number = 100, offset: number = 0): Snapshot[] { + const stmt = this.db.prepare( + 'SELECT * FROM snapshots ORDER BY created_at DESC LIMIT ? OFFSET ?' + ); + const rows = stmt.all(limit, offset) as any[]; + + return rows.map(row => ({ + ...row, + config: JSON.parse(row.config || '{}'), + })) as Snapshot[]; + } + + /** + * Update snapshot status + */ + updateSnapshotStatus(id: string, status: SnapshotStatus, downloaded_at?: string): void { + const modified_at = new Date().toISOString(); + + if (downloaded_at) { + const stmt = this.db.prepare(` + UPDATE snapshots + SET status = ?, modified_at = ?, downloaded_at = ? + WHERE id = ? + `); + stmt.run(status, modified_at, downloaded_at, id); + } else { + const stmt = this.db.prepare(` + UPDATE snapshots + SET status = ?, modified_at = ? + WHERE id = ? + `); + stmt.run(status, modified_at, id); + } + } + + /** + * Set snapshot output directory + */ + setSnapshotOutputDir(id: string, output_dir: string): void { + const stmt = this.db.prepare(` + UPDATE snapshots SET output_dir = ?, modified_at = ? WHERE id = ? 
+ `); + stmt.run(output_dir, new Date().toISOString(), id); + } + + /** + * Create a new archive result + */ + createArchiveResult(input: CreateArchiveResultInput): ArchiveResult { + const now = new Date().toISOString(); + const id = randomUUID(); + const snapshot = this.getSnapshot(input.snapshot_id); + if (!snapshot) { + throw new Error(`Snapshot ${input.snapshot_id} not found`); + } + + const abid = this.generateABID('res_', snapshot.url); + + const result: ArchiveResult = { + id, + abid, + snapshot_id: input.snapshot_id, + extractor: input.extractor, + status: 'queued', + created_at: now, + modified_at: now, + start_ts: null, + end_ts: null, + cmd: null, + pwd: null, + cmd_version: null, + output: null, + retry_at: now, + config: JSON.stringify(input.config || {}), + notes: input.notes || '', + } as any; + + const stmt = this.db.prepare(` + INSERT INTO archive_results ( + id, abid, snapshot_id, extractor, status, created_at, modified_at, + start_ts, end_ts, cmd, pwd, cmd_version, output, retry_at, config, notes + ) VALUES ( + @id, @abid, @snapshot_id, @extractor, @status, @created_at, @modified_at, + @start_ts, @end_ts, @cmd, @pwd, @cmd_version, @output, @retry_at, @config, @notes + ) + `); + + stmt.run(result); + return this.getArchiveResult(id)!; + } + + /** + * Get an archive result by ID + */ + getArchiveResult(id: string): ArchiveResult | null { + const stmt = this.db.prepare('SELECT * FROM archive_results WHERE id = ?'); + const row = stmt.get(id) as any; + if (!row) return null; + + return { + ...row, + cmd: row.cmd ? JSON.parse(row.cmd) : null, + config: JSON.parse(row.config || '{}'), + } as ArchiveResult; + } + + /** + * Get all archive results for a snapshot + */ + getArchiveResults(snapshot_id: string): ArchiveResult[] { + const stmt = this.db.prepare( + 'SELECT * FROM archive_results WHERE snapshot_id = ? ORDER BY created_at ASC' + ); + const rows = stmt.all(snapshot_id) as any[]; + + return rows.map(row => ({ + ...row, + cmd: row.cmd ? JSON.parse(row.cmd) : null, + config: JSON.parse(row.config || '{}'), + })) as ArchiveResult[]; + } + + /** + * Get archive results by status + */ + getArchiveResultsByStatus(status: ArchiveResultStatus): ArchiveResult[] { + const stmt = this.db.prepare( + 'SELECT * FROM archive_results WHERE status = ? ORDER BY created_at ASC' + ); + const rows = stmt.all(status) as any[]; + + return rows.map(row => ({ + ...row, + cmd: row.cmd ? 
JSON.parse(row.cmd) : null, + config: JSON.parse(row.config || '{}'), + })) as ArchiveResult[]; + } + + /** + * Update archive result + */ + updateArchiveResult( + id: string, + updates: { + status?: ArchiveResultStatus; + start_ts?: string; + end_ts?: string; + cmd?: string[]; + pwd?: string; + cmd_version?: string; + output?: string; + notes?: string; + } + ): void { + const fields: string[] = ['modified_at = ?']; + const values: any[] = [new Date().toISOString()]; + + if (updates.status !== undefined) { + fields.push('status = ?'); + values.push(updates.status); + } + if (updates.start_ts !== undefined) { + fields.push('start_ts = ?'); + values.push(updates.start_ts); + } + if (updates.end_ts !== undefined) { + fields.push('end_ts = ?'); + values.push(updates.end_ts); + } + if (updates.cmd !== undefined) { + fields.push('cmd = ?'); + values.push(JSON.stringify(updates.cmd)); + } + if (updates.pwd !== undefined) { + fields.push('pwd = ?'); + values.push(updates.pwd); + } + if (updates.cmd_version !== undefined) { + fields.push('cmd_version = ?'); + values.push(updates.cmd_version); + } + if (updates.output !== undefined) { + fields.push('output = ?'); + values.push(updates.output); + } + if (updates.notes !== undefined) { + fields.push('notes = ?'); + values.push(updates.notes); + } + + values.push(id); + + const stmt = this.db.prepare(` + UPDATE archive_results SET ${fields.join(', ')} WHERE id = ? + `); + + stmt.run(...values); + } + + /** + * Close the database connection + */ + close(): void { + this.db.close(); + } +} diff --git a/archivebox-ts/src/extractors.ts b/archivebox-ts/src/extractors.ts new file mode 100644 index 00000000..17917c58 --- /dev/null +++ b/archivebox-ts/src/extractors.ts @@ -0,0 +1,205 @@ +/** + * Extractor orchestration system + * Discovers and runs standalone extractor executables + */ + +import * as fs from 'fs'; +import * as path from 'path'; +import { spawn } from 'child_process'; +import type { ExtractorName } from './models'; + +export interface ExtractorInfo { + name: ExtractorName; + path: string; + executable: boolean; +} + +export interface ExtractorResult { + success: boolean; + output?: string; + error?: string; + cmd: string[]; + cmd_version?: string; + start_ts: string; + end_ts: string; + pwd: string; +} + +export class ExtractorManager { + private extractorsDir: string; + private availableExtractors: Map<ExtractorName, ExtractorInfo>; + + constructor(extractorsDir: string) { + this.extractorsDir = extractorsDir; + this.availableExtractors = new Map(); + this.discoverExtractors(); + } + + /** + * Discover all available extractors in the extractors directory + */ + private discoverExtractors(): void { + if (!fs.existsSync(this.extractorsDir)) { + console.warn(`Extractors directory not found: ${this.extractorsDir}`); + return; + } + + const files = fs.readdirSync(this.extractorsDir); + + for (const file of files) { + const filePath = path.join(this.extractorsDir, file); + const stats = fs.statSync(filePath); + + // Skip directories and non-executable files + if (stats.isDirectory()) continue; + + // Check if file is executable + try { + fs.accessSync(filePath, fs.constants.X_OK); + const name = file as ExtractorName; + + this.availableExtractors.set(name, { + name, + path: filePath, + executable: true, + }); + + console.log(`Discovered extractor: ${name}`); + } catch (err) { + // File is not executable, skip it + console.warn(`Skipping non-executable file: ${file}`); + } + } + } + + /** + * Get list of available extractors + */ + 
getAvailableExtractors(): ExtractorName[] { + return Array.from(this.availableExtractors.keys()); + } + + /** + * Check if an extractor is available + */ + hasExtractor(name: ExtractorName): boolean { + return this.availableExtractors.has(name); + } + + /** + * Run an extractor on a URL + * @param extractorName Name of the extractor to run + * @param url URL to extract + * @param outputDir Directory where extractor should output files + * @param env Environment variables to pass to the extractor + * @returns Promise with the extraction result + */ + async runExtractor( + extractorName: ExtractorName, + url: string, + outputDir: string, + env: Record<string, string> = {} + ): Promise<ExtractorResult> { + const extractor = this.availableExtractors.get(extractorName); + + if (!extractor) { + throw new Error(`Extractor not found: ${extractorName}`); + } + + // Ensure output directory exists + if (!fs.existsSync(outputDir)) { + fs.mkdirSync(outputDir, { recursive: true }); + } + + const start_ts = new Date().toISOString(); + const cmd = [extractor.path, url]; + + // Merge environment variables + const processEnv = { + ...process.env, + ...env, + ARCHIVEBOX_OUTPUT_DIR: outputDir, + }; + + return new Promise((resolve) => { + let stdout = ''; + let stderr = ''; + + const child = spawn(extractor.path, [url], { + cwd: outputDir, + env: processEnv, + stdio: ['ignore', 'pipe', 'pipe'], + }); + + child.stdout?.on('data', (data) => { + stdout += data.toString(); + }); + + child.stderr?.on('data', (data) => { + stderr += data.toString(); + }); + + child.on('close', (code) => { + const end_ts = new Date().toISOString(); + + const result: ExtractorResult = { + success: code === 0, + output: stdout.trim(), + error: stderr.trim() || undefined, + cmd, + start_ts, + end_ts, + pwd: outputDir, + }; + + resolve(result); + }); + + child.on('error', (err) => { + const end_ts = new Date().toISOString(); + + const result: ExtractorResult = { + success: false, + error: err.message, + cmd, + start_ts, + end_ts, + pwd: outputDir, + }; + + resolve(result); + }); + }); + } + + /** + * Run multiple extractors in parallel + */ + async runExtractors( + extractorNames: ExtractorName[], + url: string, + outputDir: string, + env: Record<string, string> = {} + ): Promise<Map<ExtractorName, ExtractorResult>> { + const results = new Map<ExtractorName, ExtractorResult>(); + + const promises = extractorNames.map(async (name) => { + try { + const result = await this.runExtractor(name, url, outputDir, env); + results.set(name, result); + } catch (err) { + results.set(name, { + success: false, + error: err instanceof Error ? 
err.message : String(err), + cmd: [name, url], + start_ts: new Date().toISOString(), + end_ts: new Date().toISOString(), + pwd: outputDir, + }); + } + }); + + await Promise.all(promises); + return results; + } +} diff --git a/archivebox-ts/src/models.ts b/archivebox-ts/src/models.ts new file mode 100644 index 00000000..b7cf9bd2 --- /dev/null +++ b/archivebox-ts/src/models.ts @@ -0,0 +1,85 @@ +/** + * TypeScript models matching ArchiveBox database schema + */ + +export type SnapshotStatus = 'queued' | 'started' | 'sealed'; +export type ArchiveResultStatus = 'queued' | 'started' | 'backoff' | 'succeeded' | 'failed' | 'skipped'; + +export type ExtractorName = + | 'favicon' + | 'title' + | 'headers' + | 'screenshot' + | 'pdf' + | 'dom' + | 'singlefile' + | 'wget' + | 'readability' + | 'mercury' + | 'htmltotext' + | 'git' + | 'media' + | 'archive_org'; + +/** + * Snapshot represents a single URL being archived + */ +export interface Snapshot { + id: string; // UUID primary key + abid: string; // ABID identifier (snp_...) + url: string; // The URL being archived (unique) + timestamp: string; // Unix timestamp string (unique) + title: string | null; // Page title + created_at: string; // ISO datetime + bookmarked_at: string; // ISO datetime + downloaded_at: string | null; // ISO datetime when archiving completed + modified_at: string; // ISO datetime + status: SnapshotStatus; // Current status + retry_at: string; // ISO datetime for retry logic + config: Record<string, any>; // JSON configuration + notes: string; // Extra notes + output_dir: string | null; // Path to output directory +} + +/** + * ArchiveResult represents the result of running one extractor on one Snapshot + */ +export interface ArchiveResult { + id: string; // UUID primary key + abid: string; // ABID identifier (res_...) 
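+ // (In this port an ABID is simply a "snp_"/"res_" prefix plus 8 random nanoid characters; see ArchiveDatabase.generateABID in src/db.ts.)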
+ snapshot_id: string; // Foreign key to Snapshot + extractor: ExtractorName; // Name of the extractor + status: ArchiveResultStatus; // Current status + created_at: string; // ISO datetime + modified_at: string; // ISO datetime + start_ts: string | null; // ISO datetime when extraction started + end_ts: string | null; // ISO datetime when extraction ended + cmd: string[] | null; // Command that was executed + pwd: string | null; // Working directory + cmd_version: string | null; // Version of the binary used + output: string | null; // Main output file path or result + retry_at: string; // ISO datetime for retry logic + config: Record<string, any>; // JSON configuration + notes: string; // Extra notes +} + +/** + * Simplified snapshot for creation + */ +export interface CreateSnapshotInput { + url: string; + title?: string | null; + bookmarked_at?: string; + config?: Record<string, any>; + notes?: string; +} + +/** + * Simplified archive result for creation + */ +export interface CreateArchiveResultInput { + snapshot_id: string; + extractor: ExtractorName; + config?: Record<string, any>; + notes?: string; +} diff --git a/archivebox-ts/tsconfig.json b/archivebox-ts/tsconfig.json new file mode 100644 index 00000000..a5c9a513 --- /dev/null +++ b/archivebox-ts/tsconfig.json @@ -0,0 +1,20 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "commonjs", + "lib": ["ES2022"], + "outDir": "./dist", + "rootDir": "./src", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "forceConsistentCasingInFileNames": true, + "resolveJsonModule": true, + "declaration": true, + "declarationMap": true, + "sourceMap": true, + "moduleResolution": "node" + }, + "include": ["src/**/*"], + "exclude": ["node_modules", "dist"] +}
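
For reference, here is a minimal sketch of what a drop-in extractor for this system could look like, based on the contract implemented in `src/extractors.ts`: the URL arrives as the first argument, the process runs with the snapshot output directory as its working directory, trimmed stdout is recorded as `ArchiveResult.output`, stderr is captured as logs, and exit code 0 marks success. The extractor name `robots`, the output file `robots.txt`, and the reliance on Node 18+'s global `fetch` are illustrative assumptions, not part of this diff.

```
#!/usr/bin/env node
// Hypothetical extractor: saves the target site's robots.txt.
// Contract (see src/extractors.ts): URL arrives as process.argv[2],
// files are written to the current working directory, the main output
// filename is printed to stdout, logs go to stderr, exit 0 = success.
const fs = require('fs');

async function main() {
  const url = process.argv[2];
  if (!url) {
    console.error('usage: robots <url>');
    process.exit(2);
  }
  const robotsUrl = new URL('/robots.txt', url).toString();
  console.error(`fetching ${robotsUrl}`);           // progress goes to stderr
  const res = await fetch(robotsUrl);               // global fetch (Node 18+)
  if (!res.ok) {
    console.error(`HTTP ${res.status} for ${robotsUrl}`);
    process.exit(1);
  }
  fs.writeFileSync('robots.txt', await res.text()); // cwd is the snapshot dir
  console.log('robots.txt');                        // stdout -> ArchiveResult.output
}

main().catch((err) => {
  console.error(err instanceof Error ? err.message : String(err));
  process.exit(1);
});
```

Dropped into `extractors/robots` and marked executable (`chmod +x`), such a script would be picked up automatically by `ExtractorManager.discoverExtractors()` (which checks `fs.constants.X_OK`) and run alongside the other extractors on the next `archivebox-ts add <url>`.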