mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-05 18:35:50 +10:00
Add TypeScript-based ArchiveBox implementation with simplified architecture
This commit introduces archivebox-ts, a TypeScript reimplementation of ArchiveBox with a simplified, modular architecture.

Key features:
- Standalone executable extractors (bash, Node.js, Python with shebang)
- Auto-installing dependencies per extractor
- Simple interface: URL as $1 CLI arg, output to current directory
- Environment variable-based configuration only
- SQLite database with schema matching original ArchiveBox
- Language-agnostic extractor system

Core components:
- src/cli.ts: Main CLI with Commander.js (init, add, list, status, extractors)
- src/db.ts: SQLite operations using better-sqlite3
- src/models.ts: TypeScript interfaces matching database schema
- src/extractors.ts: Extractor discovery and orchestration

Sample extractors included:
- favicon: Download site favicon (bash + curl)
- title: Extract page title (Node.js)
- headers: Extract HTTP headers (bash + curl)
- wget: Full page download with WARC (bash + wget)
- screenshot: Capture screenshot (Python + Playwright)

Documentation:
- README.md: Architecture overview and usage
- QUICKSTART.md: 5-minute getting started guide
- EXTRACTOR_GUIDE.md: Comprehensive extractor development guide
- ARCHITECTURE.md: Design decisions and implementation details

Tested and working:
- Database initialization
- URL archiving with multiple extractors
- Parallel extractor execution
- Result tracking and status reporting
- All CLI commands functional

archivebox-ts/.gitignore (vendored, new file, 8 lines)
@@ -0,0 +1,8 @@
node_modules/
dist/
*.log
.env
data/
*.db
*.sqlite
*.sqlite3

archivebox-ts/ARCHITECTURE.md (new file, 391 lines)
@@ -0,0 +1,391 @@
# ArchiveBox-TS Architecture

This document explains the architectural decisions and design philosophy of ArchiveBox-TS.

## Design Philosophy

### 1. Simplicity Over Flexibility

Rather than building a complex plugin system, we use the simplest possible approach: standalone executable files. This makes the system easier to understand, debug, and extend.

### 2. Convention Over Configuration

- Extractors are discovered by scanning a directory
- URL is always the first argument
- Output always goes to current directory
- Config always via environment variables

### 3. Language Agnostic

Extractors can be written in any language (bash, Python, Node.js, Go, Rust, etc.) as long as they follow the simple contract: executable with shebang, URL as $1, files to current dir.

### 4. Self-Contained

Each extractor is responsible for its own dependencies. This removes the need for a central dependency management system.

### 5. Fail Fast, Recover Gracefully

Individual extractor failures don't stop the whole archiving process. Each extractor runs independently and reports its own status.

## Core Components

### 1. Database Layer (`src/db.ts`)

Uses better-sqlite3 for synchronous SQLite operations.

**Key Design Decisions:**

- **Synchronous API**: Simpler to use than async for CLI applications
- **WAL Mode**: Better concurrency support
- **Compatible Schema**: Matches ArchiveBox's schema for potential data migration
- **JSON Storage**: Config and cmd stored as JSON for flexibility
- **UUID Primary Keys**: Globally unique, can be generated client-side

**Schema Simplifications:**

- No user system (single-user mode)
- No tags (can be added later)
- No crawls (can be added later)
- Simplified state machine (queued → started → sealed)
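
As an illustration, opening the database with these settings in better-sqlite3 looks roughly like this (a minimal sketch; the table definition is abridged and only hints at the schema described under Data Model below):

```typescript
import Database from 'better-sqlite3';

// Open (or create) the index and enable WAL for better concurrency.
const db = new Database('data/index.sqlite3');
db.pragma('journal_mode = WAL');

// Abridged sketch of the snapshots table -- see the Data Model section.
db.exec(`
  CREATE TABLE IF NOT EXISTS snapshots (
    id     TEXT PRIMARY KEY,               -- UUID, generated client-side
    url    TEXT UNIQUE NOT NULL,
    status TEXT NOT NULL DEFAULT 'queued', -- queued -> started -> sealed
    config TEXT                            -- JSON blob
  );
`);
```
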
### 2. Extractor Manager (`src/extractors.ts`)

Discovers and orchestrates extractor execution.

**Discovery Process:**

1. Scan extractors directory
2. Check file execute permissions
3. Register available extractors

**Execution Process:**

1. Create output directory
2. Spawn process with URL as first argument
3. Set working directory to output directory
4. Pass environment variables
5. Capture stdout (output file) and stderr (logs)
6. Record exit code (0 = success)

**Parallelization:**

Extractors run in parallel using Promise.all(). Each extractor is independent and failures are isolated.
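
A condensed sketch of discovery plus parallel execution (illustrative only -- the result shape is an assumption, and the real `src/extractors.ts` may differ in detail):

```typescript
import * as fs from 'node:fs';
import * as path from 'node:path';
import { spawn } from 'node:child_process';

// Discovery: every executable file in extractors/ is an extractor.
function discoverExtractors(dir: string): string[] {
  return fs.readdirSync(dir)
    .map((name) => path.join(dir, name))
    .filter((p) => {
      try { fs.accessSync(p, fs.constants.X_OK); return true; }
      catch { return false; }
    });
}

// Execution: URL as the first argument, cwd set to the output directory,
// environment passed through; stdout is the output file, stderr the logs.
function runExtractor(bin: string, url: string, outDir: string) {
  return new Promise<{ exitCode: number; output: string; logs: string }>((resolve) => {
    const proc = spawn(bin, [url], { cwd: outDir, env: process.env });
    let stdout = '';
    let stderr = '';
    proc.stdout.on('data', (d) => (stdout += d));
    proc.stderr.on('data', (d) => (stderr += d));
    proc.on('close', (code) =>
      resolve({ exitCode: code ?? 1, output: stdout.trim(), logs: stderr }));
  });
}

// Parallel and failure-isolated: each promise resolves with its own exit code.
async function runAll(url: string, outDir: string) {
  return Promise.all(
    discoverExtractors('extractors').map((bin) => runExtractor(bin, url, outDir)));
}
```
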
### 3. CLI (`src/cli.ts`)

Uses Commander.js for CLI argument parsing.

**Commands:**

- `init` - Set up data directory and database
- `add` - Archive a URL
- `list` - Show all snapshots
- `status` - Show snapshot details
- `extractors` - List available extractors

**Flow for `add` command:**

```
1. Parse arguments
2. Open database
3. Check if URL exists
4. Create snapshot (or reuse existing)
5. Determine extractors to run
6. Update snapshot status to 'started'
7. Create output directory
8. Create ArchiveResult records for each extractor
9. Run extractors in parallel
10. Update ArchiveResult records with results
11. Update snapshot status to 'sealed'
12. Close database
```
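
Wiring the `add` command up with Commander.js looks roughly like this (a sketch: the option names match the usage documented in the README, but the `archive()` helper is hypothetical):

```typescript
import { Command } from 'commander';

// Hypothetical helper that runs steps 1-12 above.
declare function archive(
  url: string,
  opts: { extractors?: string[]; title?: string }
): Promise<void>;

const program = new Command('archivebox-ts');

program
  .command('add <url>')
  .option('--extractors <names>', 'comma-separated extractors to run')
  .option('--title <title>', 'custom title for the snapshot')
  .action(async (url: string, opts: { extractors?: string; title?: string }) => {
    await archive(url, {
      extractors: opts.extractors?.split(','),
      title: opts.title,
    });
  });

program.parse();
```
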
## Data Model

### Snapshot

Represents a URL that has been (or is being) archived.

**States:**
- `queued` - Created but not started
- `started` - Currently archiving
- `sealed` - Archiving complete

**Key Fields:**
- `id` - UUID
- `abid` - ArchiveBox ID (for compatibility)
- `url` - The URL being archived
- `timestamp` - Unix timestamp string
- `output_dir` - Where files are stored

### ArchiveResult

Represents one extractor's result for one snapshot.

**States:**
- `queued` - Waiting to run
- `started` - Currently running
- `succeeded` - Completed successfully
- `failed` - Failed with error
- `skipped` - Intentionally skipped
- `backoff` - Waiting to retry

**Key Fields:**
- `id` - UUID
- `snapshot_id` - Foreign key to snapshot
- `extractor` - Name of extractor
- `cmd` - Command that was executed
- `output` - Main output file path
- `start_ts`, `end_ts` - Execution timing
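
In `src/models.ts` these map onto plain TypeScript interfaces, roughly like the following (abridged to the fields listed above):

```typescript
type SnapshotStatus = 'queued' | 'started' | 'sealed';
type ResultStatus =
  | 'queued' | 'started' | 'succeeded' | 'failed' | 'skipped' | 'backoff';

interface Snapshot {
  id: string;           // UUID
  abid: string;         // ArchiveBox ID, kept for compatibility
  url: string;
  timestamp: string;    // Unix timestamp string
  status: SnapshotStatus;
  output_dir: string;   // where files are stored
}

interface ArchiveResult {
  id: string;           // UUID
  snapshot_id: string;  // foreign key to Snapshot
  extractor: string;
  status: ResultStatus;
  cmd: string[];        // stored as JSON in SQLite
  output: string;       // main output file path
  start_ts: string;     // execution timing
  end_ts: string;
}
```
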
## Extractor Contract

### Input

1. **URL** - First positional argument (`$1`, `process.argv[2]`, `sys.argv[1]`)
2. **Environment Variables** - All configuration
3. **Working Directory** - Where to write output files

### Output

1. **Files** - Written to current working directory
2. **stdout** - Main output filename (e.g., "screenshot.png")
3. **stderr** - Logs, progress, errors
4. **Exit Code** - 0 for success, non-zero for failure

### Lifecycle

```
1. Extractor spawned by ExtractorManager
2. Changes to output directory
3. Reads config from environment
4. (Optional) Auto-installs dependencies
5. Processes URL
6. Writes output files
7. Prints main file to stdout
8. Logs to stderr
9. Exits with status code
```

## File Organization

```
archivebox-ts/
├── src/                   # TypeScript source
│   ├── cli.ts             # CLI entry point
│   ├── db.ts              # Database operations
│   ├── models.ts          # TypeScript types
│   └── extractors.ts      # Extractor orchestration
├── extractors/            # Extractor executables
│   ├── favicon            # Bash script
│   ├── title              # Node.js script
│   ├── headers            # Bash script
│   ├── wget               # Bash script
│   └── screenshot         # Python script
├── dist/                  # Compiled JavaScript (gitignored)
├── data/                  # Runtime data (gitignored)
│   ├── index.sqlite3      # Database
│   └── archive/           # Archived content
│       └── <timestamp>_<domain>/
│           ├── favicon.ico
│           ├── title.txt
│           └── ...
├── package.json
├── tsconfig.json
├── README.md
├── QUICKSTART.md
├── EXTRACTOR_GUIDE.md
└── ARCHITECTURE.md (this file)
```

## Output Directory Naming

Pattern: `<timestamp>_<domain>`

Example: `1762193664373_example.com`

**Why?**
- Timestamp ensures uniqueness
- Domain provides human-readable context
- Simple flat structure (no deep nesting)
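
Computing the name is a one-liner (sketch; `Date.now()` produces the millisecond timestamp seen in the example above):

```typescript
const dirName = `${Date.now()}_${new URL(url).hostname}`;
// e.g. "1762193664373_example.com"
```
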
## Comparison to Original ArchiveBox

### What We Kept

1. **Database Schema** - Compatible with ArchiveBox for potential migration
2. **Snapshot/ArchiveResult Model** - Same conceptual model
3. **Extractor Names** - Same names (favicon, title, headers, etc.)
4. **Output Structure** - Similar file organization

### What We Simplified

1. **Plugin System** → Executable files
2. **Configuration Files** → Environment variables
3. **Django ORM** → Raw SQLite
4. **Web UI** → CLI only (for now)
5. **Background Workers** → Direct execution
6. **Multi-user** → Single-user
7. **ABX Framework** → Simple directory scan

### What We Improved

1. **Easier to Extend** - Just drop an executable in a directory
2. **Language Agnostic** - Use any language for extractors
3. **Simpler Dependencies** - Each extractor manages its own
4. **Easier to Test** - Extractors can be tested standalone
5. **Smaller Codebase** - ~500 lines vs thousands

## Performance Characteristics

### Time Complexity

- **Add URL**: O(n) where n = number of extractors
- **List Snapshots**: O(n) where n = number of snapshots (with pagination)
- **Get Status**: O(1) for snapshot, O(m) for results where m = extractors used
- **Discover Extractors**: O(e) where e = files in extractors directory

### Space Complexity

- **Database**: O(n * m) where n = snapshots, m = extractors per snapshot
- **Archive Files**: Depends on content (potentially large)

### Concurrency

- **Extractors**: Run in parallel (Promise.all)
- **CLI Commands**: Sequential (SQLite has one writer)
- **Future**: Could add job queue for background processing

## Scaling Considerations

### Current Limits

- Single machine
- One CLI command at a time
- No distributed execution
- Limited by SQLite write throughput

### Future Enhancements

1. **Job Queue** - Redis or database-based queue
2. **Worker Processes** - Multiple workers processing queue
3. **Distributed Execution** - Run extractors on different machines
4. **Caching** - Cache extractor results
5. **Incremental Archiving** - Only run changed extractors

## Error Handling

### Extractor Failures

- Captured and stored in ArchiveResult.notes
- Don't stop other extractors
- Exit code determines success/failure
- stderr captured for debugging

### Database Errors

- Propagated to CLI
- Transaction rollback on failure
- Clear error messages
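
better-sqlite3 makes the rollback behavior easy to get right: `db.transaction()` wraps a function and rolls back automatically if anything inside throws. A sketch (the prepared statements and `snapshot` variable are assumptions):

```typescript
// Rolls back automatically if any statement inside throws.
const sealSnapshot = db.transaction((snapshotId: string) => {
  finalizeResults.run(snapshotId);     // assumed prepared statement
  setStatus.run('sealed', snapshotId); // assumed prepared statement
});

sealSnapshot(snapshot.id);
```
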
### Network Errors

- Handled by individual extractors
- Timeout via environment variables
- Retry logic in extractors (optional)

## Testing Strategy

### Unit Tests (Future)

- Database operations
- Extractor discovery
- Model validation

### Integration Tests (Future)

- Full CLI commands
- Database + extractors
- Error scenarios

### Extractor Tests

- Manual testing (run standalone)
- Test with various URLs
- Test error conditions
- Test configuration options

## Security Considerations

### Current State

- Runs with user permissions
- No input sanitization (URLs passed directly)
- Extractors can run arbitrary code
- No sandbox

### Recommendations for Production

1. **Input Validation** - Validate and sanitize URLs
2. **Sandboxing** - Run extractors in containers/VMs
3. **Resource Limits** - CPU, memory, disk quotas
4. **Authentication** - Add user system for web UI
5. **HTTPS Only** - Validate SSL certificates
6. **Rate Limiting** - Prevent abuse

## Future Architecture Enhancements

### 1. Background Processing

```typescript
// Job queue pattern
interface Job {
  id: string;
  snapshot_id: string;
  extractor: string;
  status: 'pending' | 'running' | 'completed';
}

class JobQueue {
  enqueue(snapshot_id: string, extractor: string): Job;
  dequeue(): Job | null;
  complete(job_id: string, result: ExtractorResult): void;
}
```

### 2. Web UI

- Express/Fastify server
- Browse archived snapshots
- Trigger new archives
- View extractor results
- Search functionality

### 3. API

- RESTful API
- POST /snapshots - Create snapshot
- GET /snapshots - List snapshots
- GET /snapshots/:id - Get snapshot details
- POST /snapshots/:id/extract - Run extractors

### 4. Plugins

While keeping the extractor model simple, could add:
- Pre-processors (URL transformation)
- Post-processors (Content analysis)
- Notifications (Email, webhook)
- Storage backends (S3, B2)

### 5. Distributed Execution

- Extract coordinator and workers
- gRPC or HTTP API between coordinator/workers
- Shared database or message queue
- Worker pools by extractor type

## Conclusion

ArchiveBox-TS demonstrates that complex functionality can be achieved with simple, composable components. By embracing Unix philosophy (do one thing well, text streams, exit codes), we've created a system that's both powerful and easy to understand.

The key insight is that **extractors don't need to be plugins** - they can be simple executables that follow a convention. This drastically simplifies the architecture while maintaining flexibility and extensibility.

archivebox-ts/EXTRACTOR_GUIDE.md (new file, 487 lines)
@@ -0,0 +1,487 @@
# Extractor Development Guide

This guide explains how to create custom extractors for ArchiveBox-TS.

## What is an Extractor?

An extractor is a standalone executable program that:
1. Takes a URL as input
2. Processes/downloads content from that URL
3. Saves output files to the current directory
4. Reports success/failure via exit code

## Extractor Contract

Every extractor must follow these rules:

### 1. File Location
- Place the extractor file in the `extractors/` directory
- The filename becomes the extractor name (e.g., `extractors/myextractor` → `myextractor`)

### 2. Executable Permissions
```bash
chmod +x extractors/myextractor
```

### 3. Shebang Line
Start your file with the appropriate shebang:
- Bash: `#!/bin/bash`
- Node.js: `#!/usr/bin/env node`
- Python: `#!/usr/bin/env python3`
- Ruby: `#!/usr/bin/env ruby`
- Any other: `#!/usr/bin/env <interpreter>`

### 4. URL Input
The URL is passed as the first command-line argument:
- Bash: `$1`
- Node.js: `process.argv[2]`
- Python: `sys.argv[1]`

### 5. Output Directory
The extractor runs in the output directory. Write all files to the current directory (`.`).

### 6. Configuration
All configuration via environment variables:
- Read config: `${VAR_NAME:-default}` (bash) or `process.env.VAR_NAME` (Node.js)
- Name variables after your extractor: `MYEXTRACTOR_TIMEOUT`, `MYEXTRACTOR_WIDTH`, etc.
- Provide sensible defaults
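
In a Node.js extractor, for example, that convention might look like this (the `MYEXTRACTOR_*` variables are hypothetical placeholders, as above):

```typescript
// Read config from the environment, falling back to sensible defaults.
const TIMEOUT = parseInt(process.env.MYEXTRACTOR_TIMEOUT ?? '30', 10);
const WIDTH = parseInt(process.env.MYEXTRACTOR_WIDTH ?? '1920', 10);
```
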
### 7. Standard Output (stdout)
Print the main output file path to stdout:
```bash
echo "output.html"
```

### 8. Standard Error (stderr)
Use stderr for all logging, progress, and error messages:
```bash
echo "Downloading..." >&2
echo "Error: Failed" >&2
```

### 9. Exit Code
- `0` = Success
- Non-zero = Failure

### 10. Auto-Install Dependencies (Optional)
Your extractor can check for and install its dependencies:

```bash
if ! command -v mytool &> /dev/null; then
    echo "Installing mytool..." >&2
    # Install command here
fi
```

## Complete Examples

### Bash Extractor: HTML Downloader

```bash
#!/bin/bash
#
# HTML Extractor
# Downloads the raw HTML of a page
#
# Config:
#   HTML_TIMEOUT - Timeout in seconds (default: 30)
#   HTML_USER_AGENT - User agent string
#

set -e  # Exit on error

URL="$1"

# Validate input
if [ -z "$URL" ]; then
    echo "Error: URL argument required" >&2
    exit 1
fi

# Auto-install curl if needed
if ! command -v curl &> /dev/null; then
    echo "Installing curl..." >&2
    sudo apt-get update && sudo apt-get install -y curl
fi

# Read config from environment
TIMEOUT="${HTML_TIMEOUT:-30}"
USER_AGENT="${HTML_USER_AGENT:-Mozilla/5.0 (compatible; ArchiveBox-TS/0.1)}"

# Log to stderr
echo "Downloading HTML from: $URL" >&2

# Download HTML
if curl -L -s --max-time "$TIMEOUT" --user-agent "$USER_AGENT" -o index.html "$URL"; then
    echo "✓ Downloaded HTML" >&2
    echo "index.html"  # Output file to stdout
    exit 0
else
    echo "Error: Failed to download HTML" >&2
    exit 1
fi
```

### Node.js Extractor: JSON Metadata

```javascript
#!/usr/bin/env node
//
// Metadata Extractor
// Extracts metadata from a page and saves as JSON
//
// Config:
//   METADATA_TIMEOUT - Timeout in milliseconds (default: 10000)
//

const https = require('https');
const http = require('http');
const fs = require('fs');
const { URL } = require('url');

// Get URL from first argument
const url = process.argv[2];
if (!url) {
  console.error('Error: URL argument required');
  process.exit(1);
}

// Configuration
const TIMEOUT = parseInt(process.env.METADATA_TIMEOUT || '10000', 10);

console.error(`Extracting metadata from: ${url}`);

// Parse URL
let parsedUrl;
try {
  parsedUrl = new URL(url);
} catch (err) {
  console.error(`Error: Invalid URL: ${err.message}`);
  process.exit(1);
}

// Choose protocol
const client = parsedUrl.protocol === 'https:' ? https : http;

// Make request
const options = {
  timeout: TIMEOUT,
  headers: {
    'User-Agent': 'Mozilla/5.0 (compatible; ArchiveBox-TS/0.1)'
  }
};

client.get(url, options, (res) => {
  let html = '';

  res.on('data', (chunk) => {
    html += chunk;
  });

  res.on('end', () => {
    // Extract metadata
    const metadata = {
      url: url,
      status: res.statusCode,
      headers: res.headers,
      title: extractTitle(html),
      description: extractMeta(html, 'description'),
      keywords: extractMeta(html, 'keywords'),
      author: extractMeta(html, 'author'),
      timestamp: new Date().toISOString()
    };

    // Write to file
    fs.writeFileSync('metadata.json', JSON.stringify(metadata, null, 2));

    console.error('✓ Extracted metadata');
    console.log('metadata.json');
    process.exit(0);
  });
}).on('error', (err) => {
  console.error(`Error: ${err.message}`);
  process.exit(1);
});

function extractTitle(html) {
  const match = html.match(/<title[^>]*>(.*?)<\/title>/is);
  return match ? match[1].trim() : null;
}

function extractMeta(html, name) {
  const regex = new RegExp(`<meta[^>]*name=["']${name}["'][^>]*content=["']([^"']*)["']`, 'i');
  const match = html.match(regex);
  return match ? match[1] : null;
}
```

### Python Extractor: Link Extractor

```python
#!/usr/bin/env python3
#
# Links Extractor
# Extracts all links from a page
#
# Config:
#   LINKS_TIMEOUT - Timeout in seconds (default: 30)
#   LINKS_MAX - Maximum links to extract (default: 1000)
#

import sys
import os
import subprocess
import re
from urllib.request import urlopen, Request
from urllib.parse import urljoin, urlparse

def ensure_deps():
    """Auto-install dependencies"""
    # For this simple example, we use stdlib only
    pass

def main():
    # Validate input
    if len(sys.argv) < 2:
        print("Error: URL argument required", file=sys.stderr)
        sys.exit(1)

    url = sys.argv[1]

    # Configuration
    timeout = int(os.environ.get('LINKS_TIMEOUT', '30'))
    max_links = int(os.environ.get('LINKS_MAX', '1000'))

    print(f"Extracting links from: {url}", file=sys.stderr)

    ensure_deps()

    try:
        # Fetch HTML
        req = Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (compatible; ArchiveBox-TS/0.1)'
        })
        with urlopen(req, timeout=timeout) as response:
            html = response.read().decode('utf-8', errors='ignore')

        # Extract links using regex (simple approach)
        # In production, use BeautifulSoup or lxml
        links = set()

        # Find <a href="...">
        for match in re.finditer(r'<a[^>]+href=["\'](.*?)["\']', html, re.IGNORECASE):
            href = match.group(1)
            # Convert relative to absolute
            absolute_url = urljoin(url, href)
            links.add(absolute_url)

            if len(links) >= max_links:
                break

        # Write to file
        with open('links.txt', 'w') as f:
            for link in sorted(links):
                f.write(link + '\n')

        print(f"✓ Extracted {len(links)} links", file=sys.stderr)
        print("links.txt")
        sys.exit(0)

    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

if __name__ == '__main__':
    main()
```

## Testing Your Extractor

### Manual Testing

1. Create a test directory:
```bash
mkdir test-output
cd test-output
```

2. Run your extractor:
```bash
/path/to/extractors/myextractor "https://example.com"
```

3. Check output:
```bash
ls -la
cat output-file.txt
```

### Environment Variable Testing

```bash
# Set config
export MYEXTRACTOR_TIMEOUT=60
export MYEXTRACTOR_DEBUG=true

# Run
/path/to/extractors/myextractor "https://example.com"
```

### Error Handling Testing

Test your extractor with:
- Invalid URLs
- URLs that timeout
- URLs that return 404
- URLs with special characters
- Very large pages
- Redirects

## Best Practices

### 1. Error Handling
- Always validate the URL argument
- Handle network errors gracefully
- Provide clear error messages to stderr
- Exit with non-zero code on failure

### 2. Timeouts
- Always set timeouts for network requests
- Make timeouts configurable via environment variables
- Use reasonable defaults (10-60 seconds)
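
In a Node.js extractor on Node 18+ (which the core already requires), `AbortSignal.timeout` keeps this concise (a sketch; `MYEXT_TIMEOUT` is a hypothetical config variable):

```typescript
// Abort the request if it runs past the configured timeout.
const timeoutMs = parseInt(process.env.MYEXT_TIMEOUT ?? '30', 10) * 1000;
const res = await fetch(url, { signal: AbortSignal.timeout(timeoutMs) });
```
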
### 3. Output
- Create files with clear, descriptive names
- Use standard formats (JSON, HTML, TXT, PNG, etc.)
- Print the main output filename to stdout
- If multiple files, print the primary one or a summary file

### 4. Logging
- Log progress to stderr
- Use prefixes: `✓` for success, `✗` for errors, `→` for progress
- Include URL in log messages
- Don't be too verbose (user can redirect if needed)

### 5. Performance
- Stream large files, don't load everything in memory
- Use parallel downloads when appropriate
- Respect robots.txt (optional but recommended)
- Add delays for rate limiting if needed

### 6. Configuration
- Use descriptive environment variable names
- Prefix with your extractor name: `MYEXT_VAR_NAME`
- Provide defaults for all settings
- Document all config options in comments

### 7. Dependencies
- Auto-install common dependencies when possible
- Detect OS and use appropriate package manager
- Provide clear error if dependency can't be installed
- Test with fresh environment

### 8. Idempotency
- Running twice should produce the same result
- Overwrite existing files
- Don't append to files
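
For Node.js extractors this mostly comes for free: `fs.writeFileSync` truncates and overwrites by default, so repeated runs converge on the same output:

```typescript
import * as fs from 'node:fs';

const result = '...extracted content...'; // placeholder

// Overwrites output.txt on every run -- no appending, no stale leftovers.
fs.writeFileSync('output.txt', result, 'utf8');
```
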
### 9. Security
- Validate and sanitize URLs
- Don't execute arbitrary code from fetched content
- Be careful with file paths (prevent directory traversal)
- Limit resource usage (file size, memory, etc.)

## Common Patterns

### Retry Logic

```bash
MAX_RETRIES=3
RETRY_DELAY=2

for i in $(seq 1 $MAX_RETRIES); do
    if download_url "$URL"; then
        break
    fi

    if [ $i -lt $MAX_RETRIES ]; then
        echo "Retry $i/$MAX_RETRIES in ${RETRY_DELAY}s..." >&2
        sleep $RETRY_DELAY
    fi
done
```

### Progress Reporting

```javascript
const total = 100;
let done = 0;

function updateProgress() {
  done++;
  console.error(`Progress: ${done}/${total} (${Math.round(done/total*100)}%)`);
}
```

### Conditional Extraction

```python
# Only extract if page is HTML
content_type = response.headers.get('content-type', '')
if 'text/html' not in content_type.lower():
    print(f"Skipping non-HTML content: {content_type}", file=sys.stderr)
    sys.exit(0)  # Success but skipped
```

## Debugging

### Enable Verbose Output

Add a DEBUG environment variable:

```bash
if [ "${MYEXT_DEBUG}" = "true" ]; then
    set -x  # Print commands
fi
```

### Test in Isolation

```bash
# Run in a clean environment
env -i \
  PATH=/usr/bin:/bin \
  MYEXT_TIMEOUT=30 \
  /path/to/extractors/myextractor "https://example.com"
```

### Check Exit Codes

```bash
/path/to/extractors/myextractor "https://example.com"
echo "Exit code: $?"
```

## Examples of Extractor Ideas

- **RSS Feed**: Extract RSS/Atom feed
- **Images**: Download all images from page
- **Video**: Extract video using yt-dlp
- **Archive.org**: Submit to Internet Archive
- **PDF**: Convert page to PDF using wkhtmltopdf
- **Reader Mode**: Extract main content using readability
- **Git**: Clone git repository
- **Twitter Thread**: Unroll and save thread
- **Mastodon Post**: Archive toot with media
- **GitHub Repo**: Archive repository with stars/forks
- **HackerNews Thread**: Save discussion thread
- **Reddit Thread**: Archive post and comments

## Next Steps

1. Create your extractor file in `extractors/`
2. Make it executable: `chmod +x extractors/yourextractor`
3. Test it manually
4. Use it with archivebox-ts: `node dist/cli.js add --extractors yourextractor https://example.com`

Happy extracting! 🎉

archivebox-ts/QUICKSTART.md (new file, 250 lines)
@@ -0,0 +1,250 @@
# ArchiveBox-TS Quick Start Guide

Get up and running with ArchiveBox-TS in 5 minutes!

## Installation

```bash
cd archivebox-ts

# Install dependencies
npm install

# Build TypeScript
npm run build

# Initialize database and data directory
node dist/cli.js init
```

## Basic Usage

### Archive a URL

```bash
# Archive with all available extractors
node dist/cli.js add "https://example.com"

# Archive with specific extractors
node dist/cli.js add "https://github.com" --extractors title,headers,favicon

# Add a custom title
node dist/cli.js add "https://example.com" --title "Example Domain"
```

### List Archives

```bash
# List all snapshots
node dist/cli.js list

# With pagination
node dist/cli.js list --limit 10 --offset 0
```

### Check Status

```bash
# Get the snapshot ID from list command
node dist/cli.js status <snapshot-id>
```

### List Extractors

```bash
# See what extractors are available
node dist/cli.js extractors
```

## Directory Structure After Init

```
archivebox-ts/
├── data/
│   ├── index.sqlite3              # SQLite database
│   └── archive/                   # Archived content
│       └── <timestamp>_<domain>/  # Individual snapshot directories
│           ├── headers.json
│           ├── title.txt
│           ├── favicon.ico
│           └── ...
```

## Environment Variables

Configure extractors using environment variables:

```bash
# Set favicon timeout
export FAVICON_TIMEOUT=30

# Set screenshot dimensions
export SCREENSHOT_WIDTH=1920
export SCREENSHOT_HEIGHT=1080

# Run with custom config
node dist/cli.js add "https://example.com" --extractors screenshot
```

## Example Session

```bash
# Initialize
$ node dist/cli.js init
Initializing ArchiveBox...
Data directory: /path/to/data
Database: /path/to/data/index.sqlite3
Archive directory: /path/to/data/archive
✓ Initialization complete!

# Add a URL
$ node dist/cli.js add "https://example.com"
Adding URL: https://example.com
✓ Created snapshot: 488ef9a8-fcd3-40c7-a209-3ab3b0a0eb71
Running extractors: favicon, title, headers, wget, screenshot
Output directory: /path/to/data/archive/1762193664373_example.com
✓ favicon: succeeded
✓ title: succeeded
✓ headers: succeeded
✓ wget: succeeded
✓ screenshot: succeeded
✓ Archiving complete!

# Check status
$ node dist/cli.js status 488ef9a8-fcd3-40c7-a209-3ab3b0a0eb71
Snapshot: 488ef9a8-fcd3-40c7-a209-3ab3b0a0eb71
URL: https://example.com
Title: Example Domain
Status: sealed
Created: 2025-11-03T18:14:04.279Z
Downloaded: 2025-11-03T18:14:25.273Z
Output: /path/to/data/archive/1762193664373_example.com

Archive Results (5):
✓ favicon: succeeded
  Output: favicon.ico
✓ title: succeeded
  Output: title.txt
✓ headers: succeeded
  Output: headers.json
✓ wget: succeeded
  Output: warc/archive.warc.gz
✓ screenshot: succeeded
  Output: screenshot.png

# List all snapshots
$ node dist/cli.js list
Found 1 snapshot(s):

ID: 488ef9a8-fcd3-40c7-a209-3ab3b0a0eb71
URL: https://example.com
Title: Example Domain
Status: sealed
Created: 2025-11-03T18:14:04.279Z
Output: /path/to/data/archive/1762193664373_example.com
---
```

## Creating Your First Extractor

Create a simple bash extractor:

```bash
# Create the extractor file
cat > extractors/myextractor << 'EOF'
#!/bin/bash
set -e

URL="$1"
if [ -z "$URL" ]; then
    echo "Error: URL required" >&2
    exit 1
fi

echo "Processing $URL..." >&2
echo "Hello from myextractor!" > output.txt
echo "✓ Done" >&2
echo "output.txt"
EOF

# Make it executable
chmod +x extractors/myextractor

# Test it
node dist/cli.js add "https://example.com" --extractors myextractor
```

See [EXTRACTOR_GUIDE.md](EXTRACTOR_GUIDE.md) for detailed information on creating extractors.

## Troubleshooting

### "No extractors available"

Make sure the extractor files are executable:
```bash
chmod +x extractors/*
```

### "Extractor failed"

Check the error message in the status output:
```bash
node dist/cli.js status <snapshot-id>
```

Common issues:
- Missing dependencies (extractor should auto-install)
- Network timeout (increase timeout via environment variable)
- Invalid URL format

### "Database locked"

Only one CLI command can run at a time. Wait for the current command to finish.

## Next Steps

- Read the [README.md](README.md) for architecture details
- Check out [EXTRACTOR_GUIDE.md](EXTRACTOR_GUIDE.md) to create custom extractors
- Browse the `extractors/` directory for examples
- Explore the TypeScript source code in `src/`

## Performance Tips

1. **Parallel Archiving**: Run multiple CLI instances with different data directories
2. **Selective Extractors**: Use `--extractors` flag to only run needed extractors
3. **Adjust Timeouts**: Increase timeouts for slow sites via environment variables
4. **Large Sites**: Use wget extractor for comprehensive archiving

## Data Management

### Backup

```bash
# Backup database
cp data/index.sqlite3 data/index.sqlite3.backup

# Backup everything
tar czf archivebox-backup.tar.gz data/
```

### Export

```bash
# Export database to SQL
sqlite3 data/index.sqlite3 .dump > export.sql

# Query snapshots
sqlite3 data/index.sqlite3 "SELECT url, title, status FROM snapshots;"
```

### Clean Up

```bash
# Remove old archives (manual)
rm -rf data/archive/<old-timestamp>_*

# Remove from database
sqlite3 data/index.sqlite3 "DELETE FROM snapshots WHERE url = 'https://example.com';"
```

Happy archiving! 🎉

archivebox-ts/README.md (new file, 380 lines)
@@ -0,0 +1,380 @@
# ArchiveBox TypeScript

A TypeScript-based version of ArchiveBox with a simplified, modular architecture.

## Overview

This is a reimplementation of ArchiveBox using TypeScript with a focus on simplicity and modularity. The key architectural changes are:

1. **Standalone Extractors**: Each extractor is a standalone executable (bash, Node.js, or Python with shebang) that can run independently
2. **Auto-Installing Dependencies**: Extractors automatically install their own dependencies when first run
3. **Simple Interface**: Extractors receive URL as `$1` CLI argument and output files to current working directory
4. **Environment-Based Config**: All configuration passed via environment variables, no CLI flags
5. **SQLite Database**: Uses SQLite with schema matching the original ArchiveBox

## Directory Structure

```
archivebox-ts/
├── src/
│   ├── cli.ts         # Main CLI entry point
│   ├── db.ts          # SQLite database operations
│   ├── models.ts      # TypeScript interfaces
│   └── extractors.ts  # Extractor orchestration
├── extractors/        # Standalone extractor executables
│   ├── favicon        # Bash script to download favicon
│   ├── title          # Node.js script to extract title
│   ├── headers        # Bash script to extract HTTP headers
│   ├── wget           # Bash script for full page download
│   └── screenshot     # Python script for screenshots
├── data/              # Created on init
│   ├── index.sqlite3  # SQLite database
│   └── archive/       # Archived snapshots
├── package.json
├── tsconfig.json
└── README.md
```

## Installation

### Prerequisites

- Node.js 18+ and npm
- For specific extractors:
  - `wget` extractor: wget
  - `screenshot` extractor: Python 3 + Playwright

### Setup

```bash
cd archivebox-ts

# Install dependencies
npm install

# Build TypeScript
npm run build

# Initialize ArchiveBox
node dist/cli.js init
```

## Usage

### Initialize

Create the data directory and database:

```bash
node dist/cli.js init
```

### Add a URL

Archive a URL with all available extractors:

```bash
node dist/cli.js add https://example.com
```

Archive with specific extractors:

```bash
node dist/cli.js add https://example.com --extractors favicon,title,headers
```

Add with custom title:

```bash
node dist/cli.js add https://example.com --title "Example Domain"
```

### List Snapshots

List all archived snapshots:

```bash
node dist/cli.js list
```

With pagination:

```bash
node dist/cli.js list --limit 10 --offset 20
```

### Check Status

View detailed status of a snapshot:

```bash
node dist/cli.js status <snapshot-id>
```

### List Extractors

See all available extractors:

```bash
node dist/cli.js extractors
```

## Database Schema

The SQLite database uses a schema compatible with ArchiveBox:

### Snapshots Table

Represents a single URL being archived.

| Column | Type | Description |
|--------|------|-------------|
| id | TEXT (UUID) | Primary key |
| abid | TEXT | ArchiveBox ID (snp_...) |
| url | TEXT | URL being archived (unique) |
| timestamp | TEXT | Unix timestamp string |
| title | TEXT | Page title |
| created_at | TEXT | ISO datetime |
| bookmarked_at | TEXT | ISO datetime |
| downloaded_at | TEXT | ISO datetime when complete |
| modified_at | TEXT | ISO datetime |
| status | TEXT | queued, started, sealed |
| retry_at | TEXT | ISO datetime for retry |
| config | TEXT (JSON) | Configuration |
| notes | TEXT | Extra notes |
| output_dir | TEXT | Path to output directory |

### Archive Results Table

Represents the result of running one extractor on one snapshot.

| Column | Type | Description |
|--------|------|-------------|
| id | TEXT (UUID) | Primary key |
| abid | TEXT | ArchiveBox ID (res_...) |
| snapshot_id | TEXT | Foreign key to snapshot |
| extractor | TEXT | Extractor name |
| status | TEXT | queued, started, succeeded, failed, skipped, backoff |
| created_at | TEXT | ISO datetime |
| modified_at | TEXT | ISO datetime |
| start_ts | TEXT | ISO datetime when started |
| end_ts | TEXT | ISO datetime when finished |
| cmd | TEXT (JSON) | Command executed |
| pwd | TEXT | Working directory |
| cmd_version | TEXT | Binary version |
| output | TEXT | Output file path or result |
| retry_at | TEXT | ISO datetime for retry |
| config | TEXT (JSON) | Configuration |
| notes | TEXT | Extra notes |
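
As a sketch of how `src/db.ts` might write one of these rows with better-sqlite3 (column names from the table above; the `archive_results` table name and the surrounding `db`, `snapshot`, and `url` variables are assumptions):

```typescript
import { randomUUID } from 'node:crypto';

const insertResult = db.prepare(`
  INSERT INTO archive_results (id, snapshot_id, extractor, status, cmd, created_at)
  VALUES (@id, @snapshot_id, @extractor, @status, @cmd, @created_at)
`);

insertResult.run({
  id: randomUUID(),
  snapshot_id: snapshot.id,
  extractor: 'title',
  status: 'queued',
  cmd: JSON.stringify(['extractors/title', url]), // cmd is stored as JSON
  created_at: new Date().toISOString(),
});
```
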
## Creating Custom Extractors

Extractors are standalone executable files in the `extractors/` directory.

### Extractor Contract

1. **Executable**: File must have execute permissions (`chmod +x`)
2. **Shebang**: Must start with shebang (e.g., `#!/bin/bash`, `#!/usr/bin/env node`)
3. **First Argument**: Receives URL as `$1` (bash) or `process.argv[2]` (Node.js) or `sys.argv[1]` (Python)
4. **Working Directory**: Run in the output directory, write files there
5. **Environment Config**: Read all config from environment variables
6. **Exit Code**: Return 0 for success, non-zero for failure
7. **Output**: Print the main output file path to stdout
8. **Logging**: Print progress/errors to stderr
9. **Auto-Install**: Optionally auto-install dependencies on first run

### Example Bash Extractor

```bash
#!/bin/bash
#
# My Custom Extractor
# Description of what it does
#
# Config via environment variables:
#   MY_TIMEOUT - Timeout in seconds (default: 30)
#

set -e

URL="$1"

if [ -z "$URL" ]; then
    echo "Error: URL argument required" >&2
    exit 1
fi

# Auto-install dependencies (optional)
if ! command -v some-tool &> /dev/null; then
    echo "Installing some-tool..." >&2
    sudo apt-get install -y some-tool
fi

# Get config from environment
TIMEOUT="${MY_TIMEOUT:-30}"

echo "Processing $URL..." >&2

# Do the extraction work
some-tool --timeout "$TIMEOUT" "$URL" > output.txt

echo "✓ Done" >&2
echo "output.txt"
exit 0
```

### Example Node.js Extractor

```javascript
#!/usr/bin/env node
//
// My Custom Extractor
// Config via environment variables:
//   MY_TIMEOUT - Timeout in ms
//

const url = process.argv[2];
if (!url) {
  console.error('Error: URL argument required');
  process.exit(1);
}

const timeout = parseInt(process.env.MY_TIMEOUT || '10000', 10);

console.error(`Processing ${url}...`);

// Do extraction work
// Write files to current directory

console.error('✓ Done');
console.log('output.txt');
```

### Example Python Extractor

```python
#!/usr/bin/env python3
#
# My Custom Extractor
# Config via environment variables:
#   MY_TIMEOUT - Timeout in seconds
#

import sys
import os

url = sys.argv[1] if len(sys.argv) > 1 else None
if not url:
    print("Error: URL argument required", file=sys.stderr)
    sys.exit(1)

timeout = int(os.environ.get('MY_TIMEOUT', '30'))

print(f"Processing {url}...", file=sys.stderr)

# Do extraction work
# Write files to current directory

print("✓ Done", file=sys.stderr)
print("output.txt")
```

## Available Extractors

### favicon
- **Language**: Bash
- **Dependencies**: curl (auto-installed)
- **Output**: `favicon.ico` or `favicon.png`
- **Config**:
  - `FAVICON_TIMEOUT` - Timeout in seconds (default: 10)

### title
- **Language**: Node.js
- **Dependencies**: Built-in Node.js modules
- **Output**: `title.txt`
- **Config**:
  - `TITLE_TIMEOUT` - Timeout in milliseconds (default: 10000)
  - `TITLE_USER_AGENT` - User agent string

### headers
- **Language**: Bash
- **Dependencies**: curl (auto-installed)
- **Output**: `headers.json`
- **Config**:
  - `HEADERS_TIMEOUT` - Timeout in seconds (default: 10)
  - `HEADERS_USER_AGENT` - User agent string

### wget
- **Language**: Bash
- **Dependencies**: wget (auto-installed)
- **Output**: `warc/archive.warc.gz` and downloaded files
- **Config**:
  - `WGET_TIMEOUT` - Timeout in seconds (default: 60)
  - `WGET_USER_AGENT` - User agent string
  - `WGET_ARGS` - Additional wget arguments

### screenshot
- **Language**: Python
- **Dependencies**: playwright (auto-installed)
- **Output**: `screenshot.png`
- **Config**:
  - `SCREENSHOT_TIMEOUT` - Timeout in milliseconds (default: 30000)
  - `SCREENSHOT_WIDTH` - Viewport width (default: 1920)
  - `SCREENSHOT_HEIGHT` - Viewport height (default: 1080)
  - `SCREENSHOT_WAIT` - Wait time before screenshot in ms (default: 1000)

## Development

### Build

```bash
npm run build
```

### Watch Mode

```bash
npm run dev
```

### Project Structure

- `src/models.ts` - TypeScript interfaces matching the database schema
- `src/db.ts` - Database layer with SQLite operations
- `src/extractors.ts` - Extractor discovery and orchestration
- `src/cli.ts` - CLI commands and application logic

## Differences from Original ArchiveBox

### Simplified

1. **No Plugin System**: Instead of a complex ABX plugin framework, extractors are simple executable files
2. **Simpler Config**: Only environment variables, no configuration file parsing
3. **No Web UI**: Command-line only (for now)
4. **No Background Workers**: Direct execution (could be added)
5. **No User System**: Single-user mode

### Architecture Improvements

1. **Extractors are Standalone**: Each extractor can be tested independently
2. **Language Agnostic**: Write extractors in any language (bash, Python, Node.js, Go, etc.)
3. **Easy to Extend**: Just drop an executable file in `extractors/` directory
4. **Minimal Dependencies**: Core system only needs Node.js and SQLite

## Future Enhancements

- [ ] Background job queue for processing
- [ ] Web UI for browsing archives
- [ ] Search functionality
- [ ] More extractors (pdf, dom, singlefile, readability, etc.)
- [ ] Import/export functionality
- [ ] Schedule automatic archiving
- [ ] Browser extension integration

## License

MIT

## Credits

Based on [ArchiveBox](https://github.com/ArchiveBox/ArchiveBox) by Nick Sweeting and contributors.

archivebox-ts/extractors/favicon (new executable file, 79 lines)
@@ -0,0 +1,79 @@
#!/bin/bash
#
# Favicon Extractor
# Downloads the favicon for a given URL
#
# Usage: favicon <url>
# Output: favicon.ico or favicon.png in current directory
# Config: All configuration via environment variables
#   FAVICON_TIMEOUT - Timeout in seconds (default: 10)
#

set -e

URL="$1"

if [ -z "$URL" ]; then
    echo "Error: URL argument required" >&2
    exit 1
fi

# Auto-install dependencies
if ! command -v curl &> /dev/null; then
    echo "Installing curl..." >&2
    if command -v apt-get &> /dev/null; then
        sudo apt-get update && sudo apt-get install -y curl
    elif command -v yum &> /dev/null; then
        sudo yum install -y curl
    elif command -v brew &> /dev/null; then
        brew install curl
    else
        echo "Error: Cannot install curl. Please install manually." >&2
        exit 1
    fi
fi

# Configuration from environment
TIMEOUT="${FAVICON_TIMEOUT:-10}"

# Extract domain from URL
DOMAIN=$(echo "$URL" | sed -e 's|^[^/]*//||' -e 's|/.*$||')

echo "Extracting favicon for $DOMAIN..." >&2

# Try common favicon locations
FAVICON_URLS=(
    "https://${DOMAIN}/favicon.ico"
    "http://${DOMAIN}/favicon.ico"
    "https://${DOMAIN}/favicon.png"
    "http://${DOMAIN}/favicon.png"
)

SUCCESS=0

for FAVICON_URL in "${FAVICON_URLS[@]}"; do
    echo "Trying: $FAVICON_URL" >&2

    # Determine output filename
    EXT="${FAVICON_URL##*.}"
    OUTPUT="favicon.${EXT}"

    if curl -L -f -s --max-time "$TIMEOUT" -o "$OUTPUT" "$FAVICON_URL" 2>/dev/null; then
        # Check if file is not empty
        if [ -s "$OUTPUT" ]; then
            echo "✓ Downloaded favicon: $OUTPUT" >&2
            echo "$OUTPUT"
            SUCCESS=1
            break
        else
            rm -f "$OUTPUT"
        fi
    fi
done

if [ "$SUCCESS" -eq 0 ]; then
    echo "Warning: Could not download favicon" >&2
    exit 1
fi

exit 0

archivebox-ts/extractors/headers (new executable file, 85 lines)
@@ -0,0 +1,85 @@
#!/bin/bash
#
# Headers Extractor
# Extracts HTTP headers from a given URL
#
# Usage: headers <url>
# Output: headers.json in current directory
# Config: All configuration via environment variables
#   HEADERS_TIMEOUT - Timeout in seconds (default: 10)
#   HEADERS_USER_AGENT - User agent string
#

set -e

URL="$1"

if [ -z "$URL" ]; then
    echo "Error: URL argument required" >&2
    exit 1
fi

# Auto-install dependencies
if ! command -v curl &> /dev/null; then
    echo "Installing curl..." >&2
    if command -v apt-get &> /dev/null; then
        sudo apt-get update && sudo apt-get install -y curl
    elif command -v yum &> /dev/null; then
        sudo yum install -y curl
    elif command -v brew &> /dev/null; then
        brew install curl
    else
        echo "Error: Cannot install curl. Please install manually." >&2
        exit 1
    fi
fi

# Configuration from environment
TIMEOUT="${HEADERS_TIMEOUT:-10}"
USER_AGENT="${HEADERS_USER_AGENT:-Mozilla/5.0 (compatible; ArchiveBox-TS/0.1)}"

echo "Extracting headers from: $URL" >&2

# Get headers using curl
HEADERS=$(curl -I -L -s --max-time "$TIMEOUT" --user-agent "$USER_AGENT" "$URL" 2>&1 || echo "")

if [ -z "$HEADERS" ]; then
    echo "Error: Failed to fetch headers" >&2
    exit 1
fi

# Convert headers to JSON format (simple key-value pairs)
echo "{" > headers.json

# Parse headers line by line
FIRST=1
while IFS=: read -r key value; do
    # Skip empty lines and HTTP status line
    if [ -z "$key" ] || [[ "$key" =~ ^HTTP ]]; then
        continue
    fi

    # Clean up key and value
    key=$(echo "$key" | tr -d '\r\n' | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
    value=$(echo "$value" | tr -d '\r\n' | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')

    if [ -n "$key" ] && [ -n "$value" ]; then
        # Escape quotes in value
        value=$(echo "$value" | sed 's/"/\\"/g')

        # Add comma if not first entry
        if [ "$FIRST" -eq 0 ]; then
            echo "," >> headers.json
        fi

        echo -n " \"$key\": \"$value\"" >> headers.json
        FIRST=0
    fi
done <<< "$HEADERS"

echo "" >> headers.json
echo "}" >> headers.json

echo "✓ Extracted headers to headers.json" >&2
echo "headers.json"
exit 0

archivebox-ts/extractors/screenshot (new executable file, 77 lines)
@@ -0,0 +1,77 @@
#!/usr/bin/env python3
#
# Screenshot Extractor
# Captures a screenshot of a given URL using Playwright
#
# Usage: screenshot <url>
# Output: screenshot.png in current directory
# Config: All configuration via environment variables
#   SCREENSHOT_TIMEOUT - Timeout in milliseconds (default: 30000)
#   SCREENSHOT_WIDTH - Viewport width (default: 1920)
#   SCREENSHOT_HEIGHT - Viewport height (default: 1080)
#   SCREENSHOT_WAIT - Time to wait before screenshot in ms (default: 1000)
#

import sys
import os
import subprocess

def ensure_playwright():
    """Auto-install playwright if not available"""
    try:
        from playwright.sync_api import sync_playwright
        return True
    except ImportError:
        print("Installing playwright...", file=sys.stderr)
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", "playwright"])
            subprocess.check_call([sys.executable, "-m", "playwright", "install", "chromium"])
            from playwright.sync_api import sync_playwright
            return True
        except Exception as e:
            print(f"Error installing playwright: {e}", file=sys.stderr)
            return False

def main():
    if len(sys.argv) < 2:
        print("Error: URL argument required", file=sys.stderr)
        sys.exit(1)

    url = sys.argv[1]

    # Configuration from environment
    timeout = int(os.environ.get('SCREENSHOT_TIMEOUT', '30000'))
    width = int(os.environ.get('SCREENSHOT_WIDTH', '1920'))
    height = int(os.environ.get('SCREENSHOT_HEIGHT', '1080'))
    wait = int(os.environ.get('SCREENSHOT_WAIT', '1000'))

    print(f"Capturing screenshot of: {url}", file=sys.stderr)

    # Ensure playwright is installed
    if not ensure_playwright():
        sys.exit(1)

    from playwright.sync_api import sync_playwright

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch()
            page = browser.new_page(viewport={'width': width, 'height': height})
            page.goto(url, timeout=timeout, wait_until='networkidle')

            # Wait a bit for any dynamic content
            page.wait_for_timeout(wait)

            page.screenshot(path='screenshot.png', full_page=True)
            browser.close()

        print("✓ Captured screenshot: screenshot.png", file=sys.stderr)
        print("screenshot.png")
        sys.exit(0)

    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

if __name__ == '__main__':
    main()

archivebox-ts/extractors/title (new executable file, 89 lines)
@@ -0,0 +1,89 @@
#!/usr/bin/env node
|
||||
//
|
||||
// Title Extractor
|
||||
// Extracts the page title from a given URL
|
||||
//
|
||||
// Usage: title <url>
|
||||
// Output: title.txt in current directory
|
||||
// Config: All configuration via environment variables
|
||||
// TITLE_TIMEOUT - Timeout in milliseconds (default: 10000)
|
||||
// TITLE_USER_AGENT - User agent string
|
||||
//
|
||||
|
||||
const https = require('https');
|
||||
const http = require('http');
|
||||
const fs = require('fs');
|
||||
const { URL } = require('url');
|
||||
|
||||
const url = process.argv[2];
|
||||
|
||||
if (!url) {
|
||||
console.error('Error: URL argument required');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
// Configuration from environment
|
||||
const TIMEOUT = parseInt(process.env.TITLE_TIMEOUT || '10000', 10);
|
||||
const USER_AGENT = process.env.TITLE_USER_AGENT || 'Mozilla/5.0 (compatible; ArchiveBox-TS/0.1)';
|
||||
|
||||
console.error(`Extracting title from: ${url}`);
|
||||
|
||||
// Parse URL
|
||||
let parsedUrl;
|
||||
try {
|
||||
parsedUrl = new URL(url);
|
||||
} catch (err) {
|
||||
console.error(`Error: Invalid URL: ${err.message}`);
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
// Choose http or https module
|
||||
const client = parsedUrl.protocol === 'https:' ? https : http;
|
||||
|
||||
// Make request
|
||||
const options = {
|
||||
headers: {
|
||||
'User-Agent': USER_AGENT,
|
||||
},
|
||||
timeout: TIMEOUT,
|
||||
};
|
||||
|
||||
client.get(url, options, (res) => {
|
||||
let html = '';
|
||||
|
||||
res.on('data', (chunk) => {
|
||||
html += chunk;
|
||||
|
||||
// Early exit if we found the title (optimization)
|
||||
if (html.includes('</title>')) {
|
||||
res.destroy();
|
||||
}
|
||||
});
|
||||
|
||||
res.on('end', () => {
|
||||
// Extract title using regex
|
||||
const titleMatch = html.match(/<title[^>]*>(.*?)<\/title>/is);
|
||||
|
||||
if (titleMatch && titleMatch[1]) {
|
||||
const title = titleMatch[1]
|
||||
.replace(/<[^>]*>/g, '') // Remove any HTML tags
|
||||
.replace(/\s+/g, ' ') // Normalize whitespace
|
||||
.trim();
|
||||
|
||||
// Write to file
|
||||
fs.writeFileSync('title.txt', title, 'utf8');
|
||||
console.error(`✓ Extracted title: ${title}`);
|
||||
console.log('title.txt');
|
||||
process.exit(0);
|
||||
} else {
|
||||
console.error('Warning: Could not find title tag');
|
||||
process.exit(1);
|
||||
}
|
||||
});
|
||||
}).on('error', (err) => {
|
||||
console.error(`Error: ${err.message}`);
|
||||
process.exit(1);
|
||||
}).on('timeout', () => {
|
||||
console.error('Error: Request timeout');
|
||||
process.exit(1);
|
||||
});
|
||||
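A standalone smoke test (the URL is just an example; the output shown is what the script prints on success):

$ cd "$(mktemp -d)"
$ /path/to/extractors/title https://example.com
Extracting title from: https://example.com
✓ Extracted title: Example Domain
title.txt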
archivebox-ts/extractors/wget (Executable file, 75 lines)
@@ -0,0 +1,75 @@
#!/bin/bash
#
# Wget Extractor
# Downloads a complete copy of the page using wget
#
# Usage: wget <url>
# Output: Files in current directory
# Config: All configuration via environment variables
#   WGET_TIMEOUT    - Timeout in seconds (default: 60)
#   WGET_USER_AGENT - User agent string
#   WGET_ARGS       - Additional wget arguments
#

set -e

URL="$1"

if [ -z "$URL" ]; then
    echo "Error: URL argument required" >&2
    exit 1
fi

# Auto-install dependencies
if ! command -v wget &> /dev/null; then
    echo "Installing wget..." >&2
    if command -v apt-get &> /dev/null; then
        sudo apt-get update && sudo apt-get install -y wget
    elif command -v yum &> /dev/null; then
        sudo yum install -y wget
    elif command -v brew &> /dev/null; then
        brew install wget
    else
        echo "Error: Cannot install wget. Please install manually." >&2
        exit 1
    fi
fi

# Configuration from environment
TIMEOUT="${WGET_TIMEOUT:-60}"
USER_AGENT="${WGET_USER_AGENT:-Mozilla/5.0 (compatible; ArchiveBox-TS/0.1)}"
EXTRA_ARGS="${WGET_ARGS:-}"

echo "Downloading with wget: $URL" >&2

# Create warc directory
mkdir -p warc

# Run wget with WARC output ($EXTRA_ARGS is intentionally left unquoted so
# it word-splits into separate arguments)
wget \
    --timeout="$TIMEOUT" \
    --user-agent="$USER_AGENT" \
    --adjust-extension \
    --convert-links \
    --page-requisites \
    --span-hosts \
    --no-parent \
    --warc-file="warc/archive" \
    --warc-cdx \
    $EXTRA_ARGS \
    "$URL" 2>&1 || true

if [ -f "warc/archive.warc.gz" ]; then
    echo "✓ Created WARC archive" >&2
    echo "warc/archive.warc.gz"
    exit 0
else
    echo "Warning: WARC file not created" >&2
    # Still succeed if we downloaded something
    if [ -n "$(ls -A 2>/dev/null)" ]; then
        echo "✓ Downloaded files" >&2
        echo "."
        exit 0
    fi
    exit 1
fi
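Extra flags thread through WGET_ARGS as plain words, e.g. (hypothetical path):

$ mkdir -p /tmp/wget-test && cd /tmp/wget-test
$ WGET_TIMEOUT=30 WGET_ARGS="--quiet --tries=2" /path/to/extractors/wget https://example.com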
archivebox-ts/package-lock.json (generated, Normal file, 542 lines)
@@ -0,0 +1,542 @@
{
  "name": "archivebox-ts",
  "version": "0.1.0",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "name": "archivebox-ts",
      "version": "0.1.0",
      "license": "MIT",
      "dependencies": {
        "better-sqlite3": "^11.0.0",
        "commander": "^12.0.0",
        "nanoid": "^3.3.7"
      },
      "bin": {
        "archivebox-ts": "dist/cli.js"
      },
      "devDependencies": {
        "@types/better-sqlite3": "^7.6.9",
        "@types/node": "^20.11.0",
        "typescript": "^5.3.3"
      }
    },
    "node_modules/@types/better-sqlite3": {
      "version": "7.6.13",
      "resolved": "https://registry.npmjs.org/@types/better-sqlite3/-/better-sqlite3-7.6.13.tgz",
      "integrity": "sha512-NMv9ASNARoKksWtsq/SHakpYAYnhBrQgGD8zkLYk/jaK8jUGn08CfEdTRgYhMypUQAfzSP8W6gNLe0q19/t4VA==",
      "dev": true,
      "license": "MIT",
      "dependencies": {
        "@types/node": "*"
      }
    },
    "node_modules/@types/node": {
      "version": "20.19.24",
      "resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.24.tgz",
      "integrity": "sha512-FE5u0ezmi6y9OZEzlJfg37mqqf6ZDSF2V/NLjUyGrR9uTZ7Sb9F7bLNZ03S4XVUNRWGA7Ck4c1kK+YnuWjl+DA==",
      "dev": true,
      "license": "MIT",
      "dependencies": {
        "undici-types": "~6.21.0"
      }
    },
    "node_modules/base64-js": {
      "version": "1.5.1",
      "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz",
      "integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==",
      "funding": [
        {
          "type": "github",
          "url": "https://github.com/sponsors/feross"
        },
        {
          "type": "patreon",
          "url": "https://www.patreon.com/feross"
        },
        {
          "type": "consulting",
          "url": "https://feross.org/support"
        }
      ],
      "license": "MIT"
    },
    "node_modules/better-sqlite3": {
      "version": "11.10.0",
      "resolved": "https://registry.npmjs.org/better-sqlite3/-/better-sqlite3-11.10.0.tgz",
      "integrity": "sha512-EwhOpyXiOEL/lKzHz9AW1msWFNzGc/z+LzeB3/jnFJpxu+th2yqvzsSWas1v9jgs9+xiXJcD5A8CJxAG2TaghQ==",
      "hasInstallScript": true,
      "license": "MIT",
      "dependencies": {
        "bindings": "^1.5.0",
        "prebuild-install": "^7.1.1"
      }
    },
    "node_modules/bindings": {
      "version": "1.5.0",
      "resolved": "https://registry.npmjs.org/bindings/-/bindings-1.5.0.tgz",
      "integrity": "sha512-p2q/t/mhvuOj/UeLlV6566GD/guowlr0hHxClI0W9m7MWYkL1F0hLo+0Aexs9HSPCtR1SXQ0TD3MMKrXZajbiQ==",
      "license": "MIT",
      "dependencies": {
        "file-uri-to-path": "1.0.0"
      }
    },
    "node_modules/bl": {
      "version": "4.1.0",
      "resolved": "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz",
      "integrity": "sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==",
      "license": "MIT",
      "dependencies": {
        "buffer": "^5.5.0",
        "inherits": "^2.0.4",
        "readable-stream": "^3.4.0"
      }
    },
    "node_modules/buffer": {
      "version": "5.7.1",
      "resolved": "https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz",
      "integrity": "sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==",
      "funding": [
        {
          "type": "github",
          "url": "https://github.com/sponsors/feross"
        },
        {
          "type": "patreon",
          "url": "https://www.patreon.com/feross"
        },
        {
          "type": "consulting",
          "url": "https://feross.org/support"
        }
      ],
      "license": "MIT",
      "dependencies": {
        "base64-js": "^1.3.1",
        "ieee754": "^1.1.13"
      }
    },
    "node_modules/chownr": {
      "version": "1.1.4",
      "resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz",
      "integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==",
      "license": "ISC"
    },
    "node_modules/commander": {
      "version": "12.1.0",
      "resolved": "https://registry.npmjs.org/commander/-/commander-12.1.0.tgz",
      "integrity": "sha512-Vw8qHK3bZM9y/P10u3Vib8o/DdkvA2OtPtZvD871QKjy74Wj1WSKFILMPRPSdUSx5RFK1arlJzEtA4PkFgnbuA==",
      "license": "MIT",
      "engines": {
        "node": ">=18"
      }
    },
    "node_modules/decompress-response": {
      "version": "6.0.0",
      "resolved": "https://registry.npmjs.org/decompress-response/-/decompress-response-6.0.0.tgz",
      "integrity": "sha512-aW35yZM6Bb/4oJlZncMH2LCoZtJXTRxES17vE3hoRiowU2kWHaJKFkSBDnDR+cm9J+9QhXmREyIfv0pji9ejCQ==",
      "license": "MIT",
      "dependencies": {
        "mimic-response": "^3.1.0"
      },
      "engines": {
        "node": ">=10"
      },
      "funding": {
        "url": "https://github.com/sponsors/sindresorhus"
      }
    },
    "node_modules/deep-extend": {
      "version": "0.6.0",
      "resolved": "https://registry.npmjs.org/deep-extend/-/deep-extend-0.6.0.tgz",
      "integrity": "sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA==",
      "license": "MIT",
      "engines": {
        "node": ">=4.0.0"
      }
    },
    "node_modules/detect-libc": {
      "version": "2.1.2",
      "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.1.2.tgz",
      "integrity": "sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ==",
      "license": "Apache-2.0",
      "engines": {
        "node": ">=8"
      }
    },
    "node_modules/end-of-stream": {
      "version": "1.4.5",
      "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz",
      "integrity": "sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg==",
      "license": "MIT",
      "dependencies": {
        "once": "^1.4.0"
      }
    },
    "node_modules/expand-template": {
      "version": "2.0.3",
      "resolved": "https://registry.npmjs.org/expand-template/-/expand-template-2.0.3.tgz",
      "integrity": "sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==",
      "license": "(MIT OR WTFPL)",
      "engines": {
        "node": ">=6"
      }
    },
    "node_modules/file-uri-to-path": {
      "version": "1.0.0",
      "resolved": "https://registry.npmjs.org/file-uri-to-path/-/file-uri-to-path-1.0.0.tgz",
      "integrity": "sha512-0Zt+s3L7Vf1biwWZ29aARiVYLx7iMGnEUl9x33fbB/j3jR81u/O2LbqK+Bm1CDSNDKVtJ/YjwY7TUd5SkeLQLw==",
      "license": "MIT"
    },
    "node_modules/fs-constants": {
      "version": "1.0.0",
      "resolved": "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz",
      "integrity": "sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==",
      "license": "MIT"
    },
    "node_modules/github-from-package": {
      "version": "0.0.0",
      "resolved": "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz",
      "integrity": "sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==",
      "license": "MIT"
    },
    "node_modules/ieee754": {
      "version": "1.2.1",
      "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz",
      "integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==",
      "funding": [
        {
          "type": "github",
          "url": "https://github.com/sponsors/feross"
        },
        {
          "type": "patreon",
          "url": "https://www.patreon.com/feross"
        },
        {
          "type": "consulting",
          "url": "https://feross.org/support"
        }
      ],
      "license": "BSD-3-Clause"
    },
    "node_modules/inherits": {
      "version": "2.0.4",
      "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz",
      "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==",
      "license": "ISC"
    },
    "node_modules/ini": {
      "version": "1.3.8",
      "resolved": "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz",
      "integrity": "sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==",
      "license": "ISC"
    },
    "node_modules/mimic-response": {
      "version": "3.1.0",
      "resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz",
      "integrity": "sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==",
      "license": "MIT",
      "engines": {
        "node": ">=10"
      },
      "funding": {
        "url": "https://github.com/sponsors/sindresorhus"
      }
    },
    "node_modules/minimist": {
      "version": "1.2.8",
      "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz",
      "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==",
      "license": "MIT",
      "funding": {
        "url": "https://github.com/sponsors/ljharb"
      }
    },
    "node_modules/mkdirp-classic": {
      "version": "0.5.3",
      "resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz",
      "integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==",
      "license": "MIT"
    },
    "node_modules/nanoid": {
      "version": "3.3.11",
      "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz",
      "integrity": "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==",
      "funding": [
        {
          "type": "github",
          "url": "https://github.com/sponsors/ai"
        }
      ],
      "license": "MIT",
      "bin": {
        "nanoid": "bin/nanoid.cjs"
      },
      "engines": {
        "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1"
      }
    },
    "node_modules/napi-build-utils": {
      "version": "2.0.0",
      "resolved": "https://registry.npmjs.org/napi-build-utils/-/napi-build-utils-2.0.0.tgz",
      "integrity": "sha512-GEbrYkbfF7MoNaoh2iGG84Mnf/WZfB0GdGEsM8wz7Expx/LlWf5U8t9nvJKXSp3qr5IsEbK04cBGhol/KwOsWA==",
      "license": "MIT"
    },
    "node_modules/node-abi": {
      "version": "3.80.0",
      "resolved": "https://registry.npmjs.org/node-abi/-/node-abi-3.80.0.tgz",
      "integrity": "sha512-LyPuZJcI9HVwzXK1GPxWNzrr+vr8Hp/3UqlmWxxh8p54U1ZbclOqbSog9lWHaCX+dBaiGi6n/hIX+mKu74GmPA==",
      "license": "MIT",
      "dependencies": {
        "semver": "^7.3.5"
      },
      "engines": {
        "node": ">=10"
      }
    },
    "node_modules/once": {
      "version": "1.4.0",
      "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz",
      "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==",
      "license": "ISC",
      "dependencies": {
        "wrappy": "1"
      }
    },
    "node_modules/prebuild-install": {
      "version": "7.1.3",
      "resolved": "https://registry.npmjs.org/prebuild-install/-/prebuild-install-7.1.3.tgz",
      "integrity": "sha512-8Mf2cbV7x1cXPUILADGI3wuhfqWvtiLA1iclTDbFRZkgRQS0NqsPZphna9V+HyTEadheuPmjaJMsbzKQFOzLug==",
      "license": "MIT",
      "dependencies": {
        "detect-libc": "^2.0.0",
        "expand-template": "^2.0.3",
        "github-from-package": "0.0.0",
        "minimist": "^1.2.3",
        "mkdirp-classic": "^0.5.3",
        "napi-build-utils": "^2.0.0",
        "node-abi": "^3.3.0",
        "pump": "^3.0.0",
        "rc": "^1.2.7",
        "simple-get": "^4.0.0",
        "tar-fs": "^2.0.0",
        "tunnel-agent": "^0.6.0"
      },
      "bin": {
        "prebuild-install": "bin.js"
      },
      "engines": {
        "node": ">=10"
      }
    },
    "node_modules/pump": {
      "version": "3.0.3",
      "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.3.tgz",
      "integrity": "sha512-todwxLMY7/heScKmntwQG8CXVkWUOdYxIvY2s0VWAAMh/nd8SoYiRaKjlr7+iCs984f2P8zvrfWcDDYVb73NfA==",
      "license": "MIT",
      "dependencies": {
        "end-of-stream": "^1.1.0",
        "once": "^1.3.1"
      }
    },
    "node_modules/rc": {
      "version": "1.2.8",
      "resolved": "https://registry.npmjs.org/rc/-/rc-1.2.8.tgz",
      "integrity": "sha512-y3bGgqKj3QBdxLbLkomlohkvsA8gdAiUQlSBJnBhfn+BPxg4bc62d8TcBW15wavDfgexCgccckhcZvywyQYPOw==",
      "license": "(BSD-2-Clause OR MIT OR Apache-2.0)",
      "dependencies": {
        "deep-extend": "^0.6.0",
        "ini": "~1.3.0",
        "minimist": "^1.2.0",
        "strip-json-comments": "~2.0.1"
      },
      "bin": {
        "rc": "cli.js"
      }
    },
    "node_modules/readable-stream": {
      "version": "3.6.2",
      "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz",
      "integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==",
      "license": "MIT",
      "dependencies": {
        "inherits": "^2.0.3",
        "string_decoder": "^1.1.1",
        "util-deprecate": "^1.0.1"
      },
      "engines": {
        "node": ">= 6"
      }
    },
    "node_modules/safe-buffer": {
      "version": "5.2.1",
      "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
      "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==",
      "funding": [
        {
          "type": "github",
          "url": "https://github.com/sponsors/feross"
        },
        {
          "type": "patreon",
          "url": "https://www.patreon.com/feross"
        },
        {
          "type": "consulting",
          "url": "https://feross.org/support"
        }
      ],
      "license": "MIT"
    },
    "node_modules/semver": {
      "version": "7.7.3",
      "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz",
      "integrity": "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==",
      "license": "ISC",
      "bin": {
        "semver": "bin/semver.js"
      },
      "engines": {
        "node": ">=10"
      }
    },
    "node_modules/simple-concat": {
      "version": "1.0.1",
      "resolved": "https://registry.npmjs.org/simple-concat/-/simple-concat-1.0.1.tgz",
      "integrity": "sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==",
      "funding": [
        {
          "type": "github",
          "url": "https://github.com/sponsors/feross"
        },
        {
          "type": "patreon",
          "url": "https://www.patreon.com/feross"
        },
        {
          "type": "consulting",
          "url": "https://feross.org/support"
        }
      ],
      "license": "MIT"
    },
    "node_modules/simple-get": {
      "version": "4.0.1",
      "resolved": "https://registry.npmjs.org/simple-get/-/simple-get-4.0.1.tgz",
      "integrity": "sha512-brv7p5WgH0jmQJr1ZDDfKDOSeWWg+OVypG99A/5vYGPqJ6pxiaHLy8nxtFjBA7oMa01ebA9gfh1uMCFqOuXxvA==",
      "funding": [
        {
          "type": "github",
          "url": "https://github.com/sponsors/feross"
        },
        {
          "type": "patreon",
          "url": "https://www.patreon.com/feross"
        },
        {
          "type": "consulting",
          "url": "https://feross.org/support"
        }
      ],
      "license": "MIT",
      "dependencies": {
        "decompress-response": "^6.0.0",
        "once": "^1.3.1",
        "simple-concat": "^1.0.0"
      }
    },
    "node_modules/string_decoder": {
      "version": "1.3.0",
      "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz",
      "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==",
      "license": "MIT",
      "dependencies": {
        "safe-buffer": "~5.2.0"
      }
    },
    "node_modules/strip-json-comments": {
      "version": "2.0.1",
      "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-2.0.1.tgz",
      "integrity": "sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ==",
      "license": "MIT",
      "engines": {
        "node": ">=0.10.0"
      }
    },
    "node_modules/tar-fs": {
      "version": "2.1.4",
      "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.4.tgz",
      "integrity": "sha512-mDAjwmZdh7LTT6pNleZ05Yt65HC3E+NiQzl672vQG38jIrehtJk/J3mNwIg+vShQPcLF/LV7CMnDW6vjj6sfYQ==",
      "license": "MIT",
      "dependencies": {
        "chownr": "^1.1.1",
        "mkdirp-classic": "^0.5.2",
        "pump": "^3.0.0",
        "tar-stream": "^2.1.4"
      }
    },
    "node_modules/tar-stream": {
      "version": "2.2.0",
      "resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-2.2.0.tgz",
      "integrity": "sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==",
      "license": "MIT",
      "dependencies": {
        "bl": "^4.0.3",
        "end-of-stream": "^1.4.1",
        "fs-constants": "^1.0.0",
        "inherits": "^2.0.3",
        "readable-stream": "^3.1.1"
      },
      "engines": {
        "node": ">=6"
      }
    },
    "node_modules/tunnel-agent": {
      "version": "0.6.0",
      "resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz",
      "integrity": "sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==",
      "license": "Apache-2.0",
      "dependencies": {
        "safe-buffer": "^5.0.1"
      },
      "engines": {
        "node": "*"
      }
    },
    "node_modules/typescript": {
      "version": "5.9.3",
      "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz",
      "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==",
      "dev": true,
      "license": "Apache-2.0",
      "bin": {
        "tsc": "bin/tsc",
        "tsserver": "bin/tsserver"
      },
      "engines": {
        "node": ">=14.17"
      }
    },
    "node_modules/undici-types": {
      "version": "6.21.0",
      "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz",
      "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==",
      "dev": true,
      "license": "MIT"
    },
    "node_modules/util-deprecate": {
      "version": "1.0.2",
      "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",
      "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==",
      "license": "MIT"
    },
    "node_modules/wrappy": {
      "version": "1.0.2",
      "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
      "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==",
      "license": "ISC"
    }
  }
}
archivebox-ts/package.json (Normal file, 32 lines)
@@ -0,0 +1,32 @@
{
  "name": "archivebox-ts",
  "version": "0.1.0",
  "description": "TypeScript-based version of ArchiveBox with simplified architecture",
  "main": "dist/cli.js",
  "bin": {
    "archivebox-ts": "./dist/cli.js"
  },
  "scripts": {
    "build": "tsc",
    "dev": "tsc --watch",
    "start": "node dist/cli.js",
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "keywords": [
    "archiving",
    "web-archiving",
    "snapshot"
  ],
  "author": "",
  "license": "MIT",
  "dependencies": {
    "better-sqlite3": "^11.0.0",
    "commander": "^12.0.0",
    "nanoid": "^3.3.7"
  },
  "devDependencies": {
    "@types/better-sqlite3": "^7.6.9",
    "@types/node": "^20.11.0",
    "typescript": "^5.3.3"
  }
}
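The scripts above give the usual build loop (commander 12 requires Node 18+):

$ npm install
$ npm run build
$ node dist/cli.js --help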
archivebox-ts/src/cli.ts (Normal file, 312 lines)
@@ -0,0 +1,312 @@
#!/usr/bin/env node

/**
 * ArchiveBox TypeScript - Main CLI
 */

import { Command } from 'commander';
import * as path from 'path';
import * as fs from 'fs';
import { ArchiveDatabase } from './db';
import { ExtractorManager } from './extractors';
import type { ExtractorName } from './models';

const program = new Command();

// Default paths
const DEFAULT_DATA_DIR = path.join(process.cwd(), 'data');
const DEFAULT_DB_PATH = path.join(DEFAULT_DATA_DIR, 'index.sqlite3');
const DEFAULT_ARCHIVE_DIR = path.join(DEFAULT_DATA_DIR, 'archive');
const EXTRACTORS_DIR = path.join(__dirname, '..', 'extractors');

// Helper to ensure data directory exists
function ensureDataDir(dataDir: string): void {
  if (!fs.existsSync(dataDir)) {
    fs.mkdirSync(dataDir, { recursive: true });
  }
}

// Helper to get snapshot output directory
function getSnapshotOutputDir(archiveDir: string, snapshotId: string, url: string): string {
  const urlObj = new URL(url);
  const domain = urlObj.hostname;
  const timestamp = Date.now().toString();

  // Create directory structure: archive/<timestamp>_<domain>
  const dirName = `${timestamp}_${domain}`;
  const outputDir = path.join(archiveDir, dirName);

  return outputDir;
}

program
  .name('archivebox-ts')
  .description('TypeScript-based version of ArchiveBox with simplified architecture')
  .version('0.1.0');

// Initialize command
program
  .command('init')
  .description('Initialize ArchiveBox data directory and database')
  .option('-d, --data-dir <path>', 'Data directory path', DEFAULT_DATA_DIR)
  .action((options) => {
    const dataDir = options.dataDir;
    const dbPath = path.join(dataDir, 'index.sqlite3');
    const archiveDir = path.join(dataDir, 'archive');

    console.log('Initializing ArchiveBox...');
    console.log(`Data directory: ${dataDir}`);
    console.log(`Database: ${dbPath}`);
    console.log(`Archive directory: ${archiveDir}`);

    ensureDataDir(dataDir);
    ensureDataDir(archiveDir);

    const db = new ArchiveDatabase(dbPath);
    db.close();

    console.log('✓ Initialization complete!');
  });

// Add command
program
  .command('add')
  .description('Add a URL to archive')
  .argument('<url>', 'URL to archive')
  .option('-d, --data-dir <path>', 'Data directory path', DEFAULT_DATA_DIR)
  .option('-e, --extractors <names>', 'Comma-separated list of extractors to run (default: all)')
  .option('--title <title>', 'Page title')
  .action(async (url, options) => {
    const dataDir = options.dataDir;
    const dbPath = path.join(dataDir, 'index.sqlite3');
    const archiveDir = path.join(dataDir, 'archive');

    ensureDataDir(dataDir);
    ensureDataDir(archiveDir);

    const db = new ArchiveDatabase(dbPath);
    const extractorManager = new ExtractorManager(EXTRACTORS_DIR);

    try {
      console.log(`Adding URL: ${url}`);

      // Check if URL already exists
      let snapshot = db.getSnapshotByUrl(url);
      if (snapshot) {
        console.log(`URL already exists with ID: ${snapshot.id}`);
      } else {
        // Create new snapshot
        snapshot = db.createSnapshot({
          url,
          title: options.title,
        });
        console.log(`✓ Created snapshot: ${snapshot.id}`);
      }

      // Determine which extractors to run
      const availableExtractors = extractorManager.getAvailableExtractors();
      let extractorsToRun: ExtractorName[];

      if (options.extractors) {
        extractorsToRun = options.extractors.split(',').map((e: string) => e.trim() as ExtractorName);
        // Validate extractors
        for (const extractor of extractorsToRun) {
          if (!extractorManager.hasExtractor(extractor)) {
            console.warn(`Warning: Extractor not found: ${extractor}`);
          }
        }
      } else {
        extractorsToRun = availableExtractors;
      }

      if (extractorsToRun.length === 0) {
        console.log('No extractors available. Place extractor executables in the extractors/ directory.');
        db.close();
        return;
      }

      console.log(`Running extractors: ${extractorsToRun.join(', ')}`);

      // Update snapshot status
      db.updateSnapshotStatus(snapshot.id, 'started');

      // Create output directory
      const outputDir = getSnapshotOutputDir(archiveDir, snapshot.id, url);
      fs.mkdirSync(outputDir, { recursive: true });
      db.setSnapshotOutputDir(snapshot.id, outputDir);

      console.log(`Output directory: ${outputDir}`);

      // Create archive results for each extractor
      const archiveResults = new Map<ExtractorName, string>();
      for (const extractor of extractorsToRun) {
        if (extractorManager.hasExtractor(extractor)) {
          const result = db.createArchiveResult({
            snapshot_id: snapshot.id,
            extractor,
          });
          archiveResults.set(extractor, result.id);
        }
      }

      // Run extractors
      const results = await extractorManager.runExtractors(
        extractorsToRun,
        url,
        outputDir,
        {} // Environment variables can be passed here
      );

      // Update archive results
      for (const [extractorName, result] of results.entries()) {
        const resultId = archiveResults.get(extractorName);
        if (resultId) {
          db.updateArchiveResult(resultId, {
            status: result.success ? 'succeeded' : 'failed',
            start_ts: result.start_ts,
            end_ts: result.end_ts,
            cmd: result.cmd,
            pwd: result.pwd,
            output: result.output,
            notes: result.error || '',
          });

          const status = result.success ? '✓' : '✗';
          console.log(`  ${status} ${extractorName}: ${result.success ? 'succeeded' : 'failed'}`);
          if (result.error) {
            console.log(`    Error: ${result.error}`);
          }
        }
      }

      // Update snapshot status
      db.updateSnapshotStatus(snapshot.id, 'sealed', new Date().toISOString());

      console.log(`✓ Archiving complete!`);
      console.log(`Snapshot ID: ${snapshot.id}`);
      console.log(`Output: ${outputDir}`);

    } catch (err) {
      console.error('Error:', err instanceof Error ? err.message : err);
      process.exit(1);
    } finally {
      db.close();
    }
  });

// List command
program
  .command('list')
  .description('List all archived snapshots')
  .option('-d, --data-dir <path>', 'Data directory path', DEFAULT_DATA_DIR)
  .option('-l, --limit <number>', 'Number of snapshots to show', '20')
  .option('-o, --offset <number>', 'Offset for pagination', '0')
  .action((options) => {
    const dataDir = options.dataDir;
    const dbPath = path.join(dataDir, 'index.sqlite3');

    const db = new ArchiveDatabase(dbPath);

    try {
      const snapshots = db.getAllSnapshots(
        parseInt(options.limit),
        parseInt(options.offset)
      );

      if (snapshots.length === 0) {
        console.log('No snapshots found.');
        return;
      }

      console.log(`\nFound ${snapshots.length} snapshot(s):\n`);

      for (const snapshot of snapshots) {
        console.log(`ID: ${snapshot.id}`);
        console.log(`URL: ${snapshot.url}`);
        console.log(`Title: ${snapshot.title || '(none)'}`);
        console.log(`Status: ${snapshot.status}`);
        console.log(`Created: ${snapshot.created_at}`);
        console.log(`Output: ${snapshot.output_dir || '(none)'}`);
        console.log('---');
      }
    } catch (err) {
      console.error('Error:', err instanceof Error ? err.message : err);
      process.exit(1);
    } finally {
      db.close();
    }
  });

// Status command
program
  .command('status')
  .description('Show status of a snapshot')
  .argument('<id>', 'Snapshot ID')
  .option('-d, --data-dir <path>', 'Data directory path', DEFAULT_DATA_DIR)
  .action((id, options) => {
    const dataDir = options.dataDir;
    const dbPath = path.join(dataDir, 'index.sqlite3');

    const db = new ArchiveDatabase(dbPath);

    try {
      const snapshot = db.getSnapshot(id);
      if (!snapshot) {
        console.error(`Snapshot not found: ${id}`);
        process.exit(1);
      }

      console.log(`\nSnapshot: ${snapshot.id}`);
      console.log(`URL: ${snapshot.url}`);
      console.log(`Title: ${snapshot.title || '(none)'}`);
      console.log(`Status: ${snapshot.status}`);
      console.log(`Created: ${snapshot.created_at}`);
      console.log(`Downloaded: ${snapshot.downloaded_at || '(in progress)'}`);
      console.log(`Output: ${snapshot.output_dir || '(none)'}`);

      const results = db.getArchiveResults(snapshot.id);
      if (results.length > 0) {
        console.log(`\nArchive Results (${results.length}):`);
        for (const result of results) {
          const statusIcon = result.status === 'succeeded' ? '✓' :
                             result.status === 'failed' ? '✗' :
                             result.status === 'started' ? '⋯' : '○';
          console.log(`  ${statusIcon} ${result.extractor}: ${result.status}`);
          if (result.output) {
            console.log(`    Output: ${result.output}`);
          }
          if (result.notes) {
            console.log(`    Notes: ${result.notes}`);
          }
        }
      }
    } catch (err) {
      console.error('Error:', err instanceof Error ? err.message : err);
      process.exit(1);
    } finally {
      db.close();
    }
  });

// Extractors command
program
  .command('extractors')
  .description('List available extractors')
  .action(() => {
    const extractorManager = new ExtractorManager(EXTRACTORS_DIR);
    const extractors = extractorManager.getAvailableExtractors();

    if (extractors.length === 0) {
      console.log('No extractors found.');
      console.log(`Place executable files in: ${EXTRACTORS_DIR}`);
      return;
    }

    console.log(`\nAvailable extractors (${extractors.length}):\n`);
    for (const extractor of extractors) {
      console.log(`  - ${extractor}`);
    }
    console.log();
  });

program.parse();
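A typical end-to-end session with this CLI (IDs and paths will differ):

$ node dist/cli.js init
$ node dist/cli.js add https://example.com --extractors title,headers
$ node dist/cli.js list --limit 5
$ node dist/cli.js status <snapshot-uuid>
$ node dist/cli.js extractors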
archivebox-ts/src/db.ts (Normal file, 373 lines)
@@ -0,0 +1,373 @@
/**
 * Database layer using SQLite with schema matching ArchiveBox
 */

import Database from 'better-sqlite3';
import { randomUUID } from 'crypto';
import { nanoid } from 'nanoid';
import * as path from 'path';
import * as fs from 'fs';
import type {
  Snapshot,
  ArchiveResult,
  CreateSnapshotInput,
  CreateArchiveResultInput,
  SnapshotStatus,
  ArchiveResultStatus,
} from './models';

export class ArchiveDatabase {
  private db: Database.Database;

  constructor(dbPath: string) {
    // Ensure the directory exists
    const dir = path.dirname(dbPath);
    if (!fs.existsSync(dir)) {
      fs.mkdirSync(dir, { recursive: true });
    }

    this.db = new Database(dbPath);
    this.db.pragma('journal_mode = WAL');
    this.initSchema();
  }

  private initSchema(): void {
    // Create snapshots table (simplified from ArchiveBox schema)
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS snapshots (
        id TEXT PRIMARY KEY,
        abid TEXT NOT NULL UNIQUE,
        url TEXT NOT NULL UNIQUE,
        timestamp TEXT NOT NULL UNIQUE,
        title TEXT,
        created_at TEXT NOT NULL,
        bookmarked_at TEXT NOT NULL,
        downloaded_at TEXT,
        modified_at TEXT NOT NULL,
        status TEXT NOT NULL DEFAULT 'queued',
        retry_at TEXT NOT NULL,
        config TEXT NOT NULL DEFAULT '{}',
        notes TEXT NOT NULL DEFAULT '',
        output_dir TEXT
      );

      CREATE INDEX IF NOT EXISTS idx_snapshots_url ON snapshots(url);
      CREATE INDEX IF NOT EXISTS idx_snapshots_timestamp ON snapshots(timestamp);
      CREATE INDEX IF NOT EXISTS idx_snapshots_created_at ON snapshots(created_at);
      CREATE INDEX IF NOT EXISTS idx_snapshots_status ON snapshots(status);
    `);

    // Create archive_results table (simplified from ArchiveBox schema)
    this.db.exec(`
      CREATE TABLE IF NOT EXISTS archive_results (
        id TEXT PRIMARY KEY,
        abid TEXT NOT NULL UNIQUE,
        snapshot_id TEXT NOT NULL,
        extractor TEXT NOT NULL,
        status TEXT NOT NULL DEFAULT 'queued',
        created_at TEXT NOT NULL,
        modified_at TEXT NOT NULL,
        start_ts TEXT,
        end_ts TEXT,
        cmd TEXT,
        pwd TEXT,
        cmd_version TEXT,
        output TEXT,
        retry_at TEXT NOT NULL,
        config TEXT NOT NULL DEFAULT '{}',
        notes TEXT NOT NULL DEFAULT '',
        FOREIGN KEY (snapshot_id) REFERENCES snapshots(id) ON DELETE CASCADE
      );

      CREATE INDEX IF NOT EXISTS idx_archive_results_snapshot_id ON archive_results(snapshot_id);
      CREATE INDEX IF NOT EXISTS idx_archive_results_extractor ON archive_results(extractor);
      CREATE INDEX IF NOT EXISTS idx_archive_results_status ON archive_results(status);
      CREATE INDEX IF NOT EXISTS idx_archive_results_created_at ON archive_results(created_at);
    `);
  }

  /**
   * Generate ABID (Archivable Bytes Identifier) similar to ArchiveBox
   */
  private generateABID(prefix: string, url: string): string {
    const randomPart = nanoid(8);
    return `${prefix}${randomPart}`;
  }

  /**
   * Create a new snapshot
   */
  createSnapshot(input: CreateSnapshotInput): Snapshot {
    const now = new Date().toISOString();
    const timestamp = Date.now().toString();
    const id = randomUUID();
    const abid = this.generateABID('snp_', input.url);

    const snapshot: Snapshot = {
      id,
      abid,
      url: input.url,
      timestamp,
      title: input.title || null,
      created_at: now,
      bookmarked_at: input.bookmarked_at || now,
      downloaded_at: null,
      modified_at: now,
      status: 'queued',
      retry_at: now,
      config: JSON.stringify(input.config || {}),
      notes: input.notes || '',
      output_dir: null,
    } as any;

    const stmt = this.db.prepare(`
      INSERT INTO snapshots (
        id, abid, url, timestamp, title, created_at, bookmarked_at,
        downloaded_at, modified_at, status, retry_at, config, notes, output_dir
      ) VALUES (
        @id, @abid, @url, @timestamp, @title, @created_at, @bookmarked_at,
        @downloaded_at, @modified_at, @status, @retry_at, @config, @notes, @output_dir
      )
    `);

    stmt.run(snapshot);
    return this.getSnapshot(id)!;
  }

  /**
   * Get a snapshot by ID
   */
  getSnapshot(id: string): Snapshot | null {
    const stmt = this.db.prepare('SELECT * FROM snapshots WHERE id = ?');
    const row = stmt.get(id) as any;
    if (!row) return null;

    return {
      ...row,
      config: JSON.parse(row.config || '{}'),
    } as Snapshot;
  }

  /**
   * Get a snapshot by URL
   */
  getSnapshotByUrl(url: string): Snapshot | null {
    const stmt = this.db.prepare('SELECT * FROM snapshots WHERE url = ?');
    const row = stmt.get(url) as any;
    if (!row) return null;

    return {
      ...row,
      config: JSON.parse(row.config || '{}'),
    } as Snapshot;
  }

  /**
   * Get all snapshots
   */
  getAllSnapshots(limit: number = 100, offset: number = 0): Snapshot[] {
    const stmt = this.db.prepare(
      'SELECT * FROM snapshots ORDER BY created_at DESC LIMIT ? OFFSET ?'
    );
    const rows = stmt.all(limit, offset) as any[];

    return rows.map(row => ({
      ...row,
      config: JSON.parse(row.config || '{}'),
    })) as Snapshot[];
  }

  /**
   * Update snapshot status
   */
  updateSnapshotStatus(id: string, status: SnapshotStatus, downloaded_at?: string): void {
    const modified_at = new Date().toISOString();

    if (downloaded_at) {
      const stmt = this.db.prepare(`
        UPDATE snapshots
        SET status = ?, modified_at = ?, downloaded_at = ?
        WHERE id = ?
      `);
      stmt.run(status, modified_at, downloaded_at, id);
    } else {
      const stmt = this.db.prepare(`
        UPDATE snapshots
        SET status = ?, modified_at = ?
        WHERE id = ?
      `);
      stmt.run(status, modified_at, id);
    }
  }

  /**
   * Set snapshot output directory
   */
  setSnapshotOutputDir(id: string, output_dir: string): void {
    const stmt = this.db.prepare(`
      UPDATE snapshots SET output_dir = ?, modified_at = ? WHERE id = ?
    `);
    stmt.run(output_dir, new Date().toISOString(), id);
  }

  /**
   * Create a new archive result
   */
  createArchiveResult(input: CreateArchiveResultInput): ArchiveResult {
    const now = new Date().toISOString();
    const id = randomUUID();
    const snapshot = this.getSnapshot(input.snapshot_id);
    if (!snapshot) {
      throw new Error(`Snapshot ${input.snapshot_id} not found`);
    }

    const abid = this.generateABID('res_', snapshot.url);

    const result: ArchiveResult = {
      id,
      abid,
      snapshot_id: input.snapshot_id,
      extractor: input.extractor,
      status: 'queued',
      created_at: now,
      modified_at: now,
      start_ts: null,
      end_ts: null,
      cmd: null,
      pwd: null,
      cmd_version: null,
      output: null,
      retry_at: now,
      config: JSON.stringify(input.config || {}),
      notes: input.notes || '',
    } as any;

    const stmt = this.db.prepare(`
      INSERT INTO archive_results (
        id, abid, snapshot_id, extractor, status, created_at, modified_at,
        start_ts, end_ts, cmd, pwd, cmd_version, output, retry_at, config, notes
      ) VALUES (
        @id, @abid, @snapshot_id, @extractor, @status, @created_at, @modified_at,
        @start_ts, @end_ts, @cmd, @pwd, @cmd_version, @output, @retry_at, @config, @notes
      )
    `);

    stmt.run(result);
    return this.getArchiveResult(id)!;
  }

  /**
   * Get an archive result by ID
   */
  getArchiveResult(id: string): ArchiveResult | null {
    const stmt = this.db.prepare('SELECT * FROM archive_results WHERE id = ?');
    const row = stmt.get(id) as any;
    if (!row) return null;

    return {
      ...row,
      cmd: row.cmd ? JSON.parse(row.cmd) : null,
      config: JSON.parse(row.config || '{}'),
    } as ArchiveResult;
  }

  /**
   * Get all archive results for a snapshot
   */
  getArchiveResults(snapshot_id: string): ArchiveResult[] {
    const stmt = this.db.prepare(
      'SELECT * FROM archive_results WHERE snapshot_id = ? ORDER BY created_at ASC'
    );
    const rows = stmt.all(snapshot_id) as any[];

    return rows.map(row => ({
      ...row,
      cmd: row.cmd ? JSON.parse(row.cmd) : null,
      config: JSON.parse(row.config || '{}'),
    })) as ArchiveResult[];
  }

  /**
   * Get archive results by status
   */
  getArchiveResultsByStatus(status: ArchiveResultStatus): ArchiveResult[] {
    const stmt = this.db.prepare(
      'SELECT * FROM archive_results WHERE status = ? ORDER BY created_at ASC'
    );
    const rows = stmt.all(status) as any[];

    return rows.map(row => ({
      ...row,
      cmd: row.cmd ? JSON.parse(row.cmd) : null,
      config: JSON.parse(row.config || '{}'),
    })) as ArchiveResult[];
  }

  /**
   * Update archive result
   */
  updateArchiveResult(
    id: string,
    updates: {
      status?: ArchiveResultStatus;
      start_ts?: string;
      end_ts?: string;
      cmd?: string[];
      pwd?: string;
      cmd_version?: string;
      output?: string;
      notes?: string;
    }
  ): void {
    const fields: string[] = ['modified_at = ?'];
    const values: any[] = [new Date().toISOString()];

    if (updates.status !== undefined) {
      fields.push('status = ?');
      values.push(updates.status);
    }
    if (updates.start_ts !== undefined) {
      fields.push('start_ts = ?');
      values.push(updates.start_ts);
    }
    if (updates.end_ts !== undefined) {
      fields.push('end_ts = ?');
      values.push(updates.end_ts);
    }
    if (updates.cmd !== undefined) {
      fields.push('cmd = ?');
      values.push(JSON.stringify(updates.cmd));
    }
    if (updates.pwd !== undefined) {
      fields.push('pwd = ?');
      values.push(updates.pwd);
    }
    if (updates.cmd_version !== undefined) {
      fields.push('cmd_version = ?');
      values.push(updates.cmd_version);
    }
    if (updates.output !== undefined) {
      fields.push('output = ?');
      values.push(updates.output);
    }
    if (updates.notes !== undefined) {
      fields.push('notes = ?');
      values.push(updates.notes);
    }

    values.push(id);

    const stmt = this.db.prepare(`
      UPDATE archive_results SET ${fields.join(', ')} WHERE id = ?
    `);

    stmt.run(...values);
  }

  /**
   * Close the database connection
   */
  close(): void {
    this.db.close();
  }
}
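Because the schema mirrors ArchiveBox's index.sqlite3, the index can be inspected directly with the sqlite3 shell (assuming it is installed; the uuid is a placeholder):

$ sqlite3 data/index.sqlite3 "SELECT id, url, status FROM snapshots ORDER BY created_at DESC LIMIT 5;"
$ sqlite3 data/index.sqlite3 "SELECT extractor, status, output FROM archive_results WHERE snapshot_id = '<uuid>';"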
archivebox-ts/src/extractors.ts (Normal file, 205 lines)
@@ -0,0 +1,205 @@
/**
 * Extractor orchestration system
 * Discovers and runs standalone extractor executables
 */

import * as fs from 'fs';
import * as path from 'path';
import { spawn } from 'child_process';
import type { ExtractorName } from './models';

export interface ExtractorInfo {
  name: ExtractorName;
  path: string;
  executable: boolean;
}

export interface ExtractorResult {
  success: boolean;
  output?: string;
  error?: string;
  cmd: string[];
  cmd_version?: string;
  start_ts: string;
  end_ts: string;
  pwd: string;
}

export class ExtractorManager {
  private extractorsDir: string;
  private availableExtractors: Map<ExtractorName, ExtractorInfo>;

  constructor(extractorsDir: string) {
    this.extractorsDir = extractorsDir;
    this.availableExtractors = new Map();
    this.discoverExtractors();
  }

  /**
   * Discover all available extractors in the extractors directory
   */
  private discoverExtractors(): void {
    if (!fs.existsSync(this.extractorsDir)) {
      console.warn(`Extractors directory not found: ${this.extractorsDir}`);
      return;
    }

    const files = fs.readdirSync(this.extractorsDir);

    for (const file of files) {
      const filePath = path.join(this.extractorsDir, file);
      const stats = fs.statSync(filePath);

      // Skip directories and non-executable files
      if (stats.isDirectory()) continue;

      // Check if file is executable
      try {
        fs.accessSync(filePath, fs.constants.X_OK);
        const name = file as ExtractorName;

        this.availableExtractors.set(name, {
          name,
          path: filePath,
          executable: true,
        });

        console.log(`Discovered extractor: ${name}`);
      } catch (err) {
        // File is not executable, skip it
        console.warn(`Skipping non-executable file: ${file}`);
      }
    }
  }

  /**
   * Get list of available extractors
   */
  getAvailableExtractors(): ExtractorName[] {
    return Array.from(this.availableExtractors.keys());
  }

  /**
   * Check if an extractor is available
   */
  hasExtractor(name: ExtractorName): boolean {
    return this.availableExtractors.has(name);
  }

  /**
   * Run an extractor on a URL
   * @param extractorName Name of the extractor to run
   * @param url URL to extract
   * @param outputDir Directory where extractor should output files
   * @param env Environment variables to pass to the extractor
   * @returns Promise with the extraction result
   */
  async runExtractor(
    extractorName: ExtractorName,
    url: string,
    outputDir: string,
    env: Record<string, string> = {}
  ): Promise<ExtractorResult> {
    const extractor = this.availableExtractors.get(extractorName);

    if (!extractor) {
      throw new Error(`Extractor not found: ${extractorName}`);
    }

    // Ensure output directory exists
    if (!fs.existsSync(outputDir)) {
      fs.mkdirSync(outputDir, { recursive: true });
    }

    const start_ts = new Date().toISOString();
    const cmd = [extractor.path, url];

    // Merge environment variables
    const processEnv = {
      ...process.env,
      ...env,
      ARCHIVEBOX_OUTPUT_DIR: outputDir,
    };

    return new Promise((resolve) => {
      let stdout = '';
      let stderr = '';

      const child = spawn(extractor.path, [url], {
        cwd: outputDir,
        env: processEnv,
        stdio: ['ignore', 'pipe', 'pipe'],
      });

      child.stdout?.on('data', (data) => {
        stdout += data.toString();
      });

      child.stderr?.on('data', (data) => {
        stderr += data.toString();
      });

      child.on('close', (code) => {
        const end_ts = new Date().toISOString();

        const result: ExtractorResult = {
          success: code === 0,
          output: stdout.trim(),
          error: stderr.trim() || undefined,
          cmd,
          start_ts,
          end_ts,
          pwd: outputDir,
        };

        resolve(result);
      });

      child.on('error', (err) => {
        const end_ts = new Date().toISOString();

        const result: ExtractorResult = {
          success: false,
          error: err.message,
          cmd,
          start_ts,
          end_ts,
          pwd: outputDir,
        };

        resolve(result);
      });
    });
  }

  /**
   * Run multiple extractors in parallel
   */
  async runExtractors(
    extractorNames: ExtractorName[],
    url: string,
    outputDir: string,
    env: Record<string, string> = {}
  ): Promise<Map<ExtractorName, ExtractorResult>> {
    const results = new Map<ExtractorName, ExtractorResult>();

    const promises = extractorNames.map(async (name) => {
      try {
        const result = await this.runExtractor(name, url, outputDir, env);
        results.set(name, result);
      } catch (err) {
        results.set(name, {
          success: false,
          error: err instanceof Error ? err.message : String(err),
          cmd: [name, url],
          start_ts: new Date().toISOString(),
          end_ts: new Date().toISOString(),
          pwd: outputDir,
        });
      }
    });

    await Promise.all(promises);
    return results;
  }
}
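Given this discovery contract (any executable file: URL as $1, files written to cwd, main output path on stdout, exit 0/1), a new extractor is one chmod away. A minimal hypothetical sketch named robots, saving a site's robots.txt:

#!/bin/bash
# robots - hypothetical example extractor: saves the site's robots.txt
URL="$1"
if [ -z "$URL" ]; then echo "Error: URL argument required" >&2; exit 1; fi
# Reduce the URL to its scheme://host origin
ORIGIN="$(echo "$URL" | sed -E 's|^(https?://[^/]+).*|\1|')"
curl -fsSL "$ORIGIN/robots.txt" -o robots.txt || exit 1
echo "robots.txt"   # main output path on stdout, recorded by the manager
exit 0

Saved as extractors/robots and marked executable (chmod +x), it is picked up automatically by discoverExtractors().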
archivebox-ts/src/models.ts (Normal file, 85 lines)
@@ -0,0 +1,85 @@
/**
 * TypeScript models matching ArchiveBox database schema
 */

export type SnapshotStatus = 'queued' | 'started' | 'sealed';
export type ArchiveResultStatus = 'queued' | 'started' | 'backoff' | 'succeeded' | 'failed' | 'skipped';

export type ExtractorName =
  | 'favicon'
  | 'title'
  | 'headers'
  | 'screenshot'
  | 'pdf'
  | 'dom'
  | 'singlefile'
  | 'wget'
  | 'readability'
  | 'mercury'
  | 'htmltotext'
  | 'git'
  | 'media'
  | 'archive_org';

/**
 * Snapshot represents a single URL being archived
 */
export interface Snapshot {
  id: string;                   // UUID primary key
  abid: string;                 // ABID identifier (snp_...)
  url: string;                  // The URL being archived (unique)
  timestamp: string;            // Unix timestamp string (unique)
  title: string | null;         // Page title
  created_at: string;           // ISO datetime
  bookmarked_at: string;        // ISO datetime
  downloaded_at: string | null; // ISO datetime when archiving completed
  modified_at: string;          // ISO datetime
  status: SnapshotStatus;       // Current status
  retry_at: string;             // ISO datetime for retry logic
  config: Record<string, any>;  // JSON configuration
  notes: string;                // Extra notes
  output_dir: string | null;    // Path to output directory
}

/**
 * ArchiveResult represents the result of running one extractor on one Snapshot
 */
export interface ArchiveResult {
  id: string;                   // UUID primary key
  abid: string;                 // ABID identifier (res_...)
  snapshot_id: string;          // Foreign key to Snapshot
  extractor: ExtractorName;     // Name of the extractor
  status: ArchiveResultStatus;  // Current status
  created_at: string;           // ISO datetime
  modified_at: string;          // ISO datetime
  start_ts: string | null;      // ISO datetime when extraction started
  end_ts: string | null;        // ISO datetime when extraction ended
  cmd: string[] | null;         // Command that was executed
  pwd: string | null;           // Working directory
  cmd_version: string | null;   // Version of the binary used
  output: string | null;        // Main output file path or result
  retry_at: string;             // ISO datetime for retry logic
  config: Record<string, any>;  // JSON configuration
  notes: string;                // Extra notes
}

/**
 * Simplified snapshot for creation
 */
export interface CreateSnapshotInput {
  url: string;
  title?: string | null;
  bookmarked_at?: string;
  config?: Record<string, any>;
  notes?: string;
}

/**
 * Simplified archive result for creation
 */
export interface CreateArchiveResultInput {
  snapshot_id: string;
  extractor: ExtractorName;
  config?: Record<string, any>;
  notes?: string;
}
archivebox-ts/tsconfig.json (Normal file, 20 lines)
@@ -0,0 +1,20 @@
{
  "compilerOptions": {
    "target": "ES2022",
    "module": "commonjs",
    "lib": ["ES2022"],
    "outDir": "./dist",
    "rootDir": "./src",
    "strict": true,
    "esModuleInterop": true,
    "skipLibCheck": true,
    "forceConsistentCasingInFileNames": true,
    "resolveJsonModule": true,
    "declaration": true,
    "declarationMap": true,
    "sourceMap": true,
    "moduleResolution": "node"
  },
  "include": ["src/**/*"],
  "exclude": ["node_modules", "dist"]
}