Add outlinks extractor to extract URLs from all output files

- New extractor: outlinks
  - Runs at the end after all other extractors
  - Finds all URLs in output files using grep
  - Truncates URLs at 2000 characters
  - Deduplicates and sorts output
  - Outputs to outlinks.txt (one URL per line)

- Updated ExtractorName type in models.ts to include 'outlinks'
- Updated EXTRACTOR_ORDER in extractors.ts to run outlinks last
- Tested with sample HTML and text files containing URLs

The extractor uses a find + grep + cut + sort -u pipeline to extract
all http(s):// URLs from any output files in the snapshot directory.
This commit is contained in:
Claude
2025-11-15 20:59:08 +00:00
parent 8a0dfa9b5f
commit 57072382bb
3 changed files with 39 additions and 1 deletions

View File

@@ -0,0 +1,35 @@
#!/usr/bin/env bash
# Extractor: outlinks
# Description: Extracts all URLs from output files in the snapshot directory
# Dependencies: find, grep, cut, sort, wc
# Outputs: outlinks.txt (one URL per line, sorted and deduplicated)
set -e

# CWD is the snapshot directory; all paths below are relative to it.
OUTPUT_FILE="outlinks.txt"

# Fairly permissive pattern that matches http(s):// URLs in various formats.
URL_PATTERN='https?://[a-zA-Z0-9@:%._+~#?&/=-]+'

# Pipeline:
#   find  - every regular file, excluding our own output and .env
#           (.env may contain secrets such as the CDP URL)
#   grep  - extract URL matches; -a treats binary files as text so GNU grep
#           never injects a "Binary file ... matches" notice into the output
#   cut   - truncate each URL at 2000 characters
#   sort  - -u sorts and deduplicates in one pass
# The trailing '|| true' keeps 'set -e' from aborting the extractor when no
# URLs are found (grep exits non-zero on zero matches).
find . -type f ! -name "$OUTPUT_FILE" ! -name ".env" -exec cat {} + 2>/dev/null | \
  grep -aoE "$URL_PATTERN" 2>/dev/null | \
  cut -c 1-2000 | \
  sort -u > "$OUTPUT_FILE" || true

# Report how many unique URLs were captured (tr strips wc's padding on BSD/macOS).
URL_COUNT=$(wc -l < "$OUTPUT_FILE" | tr -d ' ')
echo "[outlinks] Found $URL_COUNT unique URLs"

# Print the output filename so the caller can locate the result.
echo "$OUTPUT_FILE"

View File

@@ -12,6 +12,7 @@ import type { ExtractorName } from './models';
// 2captcha must run FIRST to download/configure extensions before Chrome launches
// puppeteer runs second to launch Chrome with the extensions
// downloads, images, and infiniscroll run early to capture dynamic content
// outlinks runs LAST to extract URLs from all output files
export const EXTRACTOR_ORDER: string[] = [
'2captcha', // Downloads and configures Chrome extensions (2captcha, singlefile, etc.)
'puppeteer', // Launches Chrome with extensions and writes CDP URL to .env
@@ -31,6 +32,7 @@ export const EXTRACTOR_ORDER: string[] = [
'git', // Clones git repository (independent)
'media', // Downloads media with yt-dlp (independent)
'archive_org', // Submits to Internet Archive (independent)
'outlinks', // Extracts URLs from all output files (runs last)
];
export interface ExtractorInfo {

View File

@@ -23,7 +23,8 @@ export type ExtractorName =
| 'wget'
| 'git'
| 'media'
| 'archive_org';
| 'archive_org'
| 'outlinks';
/**
* Snapshot represents a single URL being archived