From 57072382bbf96d301784d2e30c0b483800738ad1 Mon Sep 17 00:00:00 2001
From: Claude
Date: Sat, 15 Nov 2025 20:59:08 +0000
Subject: [PATCH] Add outlinks extractor to extract URLs from all output files

- New extractor: outlinks
- Runs at the end after all other extractors
- Finds all URLs in output files using grep
- Truncates URLs at 2000 characters
- Deduplicates and sorts output
- Outputs to outlinks.txt (one URL per line)
- Updated ExtractorName type in models.ts to include 'outlinks'
- Updated EXTRACTOR_ORDER in extractors.ts to run outlinks last
- Tested with sample HTML and text files containing URLs

The extractor uses a find + grep + sort + uniq pipeline to extract all
http(s):// URLs from any output files in the snapshot directory.
---
 archivebox-ts/extractors/outlinks | 35 +++++++++++++++++++++++++++++++
 archivebox-ts/src/extractors.ts   |  2 ++
 archivebox-ts/src/models.ts       |  3 ++-
 3 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100755 archivebox-ts/extractors/outlinks

diff --git a/archivebox-ts/extractors/outlinks b/archivebox-ts/extractors/outlinks
new file mode 100755
index 00000000..9bd3a651
--- /dev/null
+++ b/archivebox-ts/extractors/outlinks
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# Extractor: outlinks
+# Description: Extracts all URLs from output files in the snapshot directory
+# Dependencies: grep, sort, uniq
+# Outputs: outlinks.txt (one URL per line, sorted and deduplicated)
+
+set -e
+
+# CWD is the snapshot directory
+SNAPSHOT_DIR="$(pwd)"
+OUTPUT_FILE="outlinks.txt"
+
+# URL regex pattern that matches http(s):// URLs
+# This pattern is fairly permissive to catch various URL formats
+URL_PATTERN='https?://[a-zA-Z0-9@:%._+~#?&/=-]+'
+
+# Find all files in the snapshot directory (excluding outlinks.txt itself)
+# Pass them through grep to find URLs
+# Truncate each URL at 2000 chars
+# Sort and deduplicate
+# Output to outlinks.txt
+
+# Find all files and extract URLs
+find . -type f ! -name "outlinks.txt" ! -name ".env" -exec cat {} + 2>/dev/null | \
+  grep -oE "$URL_PATTERN" 2>/dev/null | \
+  cut -c 1-2000 | \
+  sort -u > "$OUTPUT_FILE" || true
+
+# Count how many URLs were found
+URL_COUNT=$(wc -l < "$OUTPUT_FILE" | tr -d ' ')
+
+echo "[outlinks] Found $URL_COUNT unique URLs"
+
+# Output success
+echo "$OUTPUT_FILE"
diff --git a/archivebox-ts/src/extractors.ts b/archivebox-ts/src/extractors.ts
index 5d6fa4f3..b20086fb 100644
--- a/archivebox-ts/src/extractors.ts
+++ b/archivebox-ts/src/extractors.ts
@@ -12,6 +12,7 @@ import type { ExtractorName } from './models';
 // 2captcha must run FIRST to download/configure extensions before Chrome launches
 // puppeteer runs second to launch Chrome with the extensions
 // downloads, images, and infiniscroll run early to capture dynamic content
+// outlinks runs LAST to extract URLs from all output files
 export const EXTRACTOR_ORDER: string[] = [
   '2captcha', // Downloads and configures Chrome extensions (2captcha, singlefile, etc.)
   'puppeteer', // Launches Chrome with extensions and writes CDP URL to .env
@@ -31,6 +32,7 @@ export const EXTRACTOR_ORDER: string[] = [
   'git', // Clones git repository (independent)
   'media', // Downloads media with yt-dlp (independent)
   'archive_org', // Submits to Internet Archive (independent)
+  'outlinks', // Extracts URLs from all output files (runs last)
 ];
 
 export interface ExtractorInfo {
diff --git a/archivebox-ts/src/models.ts b/archivebox-ts/src/models.ts
index ec14f9b7..4b81578f 100644
--- a/archivebox-ts/src/models.ts
+++ b/archivebox-ts/src/models.ts
@@ -23,7 +23,8 @@ export type ExtractorName =
   | 'wget'
   | 'git'
   | 'media'
-  | 'archive_org';
+  | 'archive_org'
+  | 'outlinks';
 
 /**
  * Snapshot represents a single URL being archived
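
For reference, a minimal sketch of how the extractor's pipeline can be smoke-tested by hand, in the spirit of the "tested with sample HTML and text files" note in the commit message. The temp directory and sample file names are hypothetical; the find/grep/cut/sort pipeline is taken verbatim from the extractor above, with URL_PATTERN inlined.

# Hypothetical smoke test (not part of the patch); paths and file names are made up.
mkdir -p /tmp/outlinks-test && cd /tmp/outlinks-test

# Sample HTML file containing two URLs
cat > index.html <<'EOF'
<a href="https://example.com/page">link</a>
<img src="https://example.com/img.png">
EOF

# Sample text file with the same URL twice, to exercise deduplication
printf 'see https://example.org/a and again https://example.org/a\n' > notes.txt

# The extractor's pipeline, with URL_PATTERN inlined
find . -type f ! -name "outlinks.txt" ! -name ".env" -exec cat {} + 2>/dev/null | \
  grep -oE 'https?://[a-zA-Z0-9@:%._+~#?&/=-]+' | \
  cut -c 1-2000 | \
  sort -u > outlinks.txt

cat outlinks.txt
# https://example.com/img.png
# https://example.com/page
# https://example.org/a

Note that the double quotes around the href/src attributes are not in the character class of URL_PATTERN, so grep stops each match at the closing quote, and sort -u collapses the duplicate example.org URL to a single line.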