Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2026-01-03 01:15:57 +10:00)
Add outlinks extractor to extract URLs from all output files
- New extractor: outlinks
- Runs at the end after all other extractors
- Finds all URLs in output files using grep
- Truncates URLs at 2000 characters
- Deduplicates and sorts output
- Outputs to outlinks.txt (one URL per line)
- Updated ExtractorName type in models.ts to include 'outlinks'
- Updated EXTRACTOR_ORDER in extractors.ts to run outlinks last
- Tested with sample HTML and text files containing URLs

The extractor uses a find + grep + sort + uniq pipeline to extract all http(s):// URLs from any output files in the snapshot directory.
archivebox-ts/extractors/outlinks (new executable file, 35 lines)
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# Extractor: outlinks
+# Description: Extracts all URLs from output files in the snapshot directory
+# Dependencies: grep, sort, uniq
+# Outputs: outlinks.txt (one URL per line, sorted and deduplicated)
+
+set -e
+
+# CWD is the snapshot directory
+SNAPSHOT_DIR="$(pwd)"
+OUTPUT_FILE="outlinks.txt"
+
+# URL regex pattern that matches http(s):// URLs
+# This pattern is fairly permissive to catch various URL formats
+URL_PATTERN='https?://[a-zA-Z0-9@:%._+~#?&/=-]+'
+
+# Find all files in the snapshot directory (excluding outlinks.txt itself)
+# Pass them through grep to find URLs
+# Truncate each URL at 2000 chars
+# Sort and deduplicate
+# Output to outlinks.txt
+
+# Find all files and extract URLs
+find . -type f ! -name "outlinks.txt" ! -name ".env" -exec cat {} + 2>/dev/null | \
+    grep -oE "$URL_PATTERN" 2>/dev/null | \
+    cut -c 1-2000 | \
+    sort -u > "$OUTPUT_FILE" || true
+
+# Count how many URLs were found
+URL_COUNT=$(wc -l < "$OUTPUT_FILE" | tr -d ' ')
+
+echo "[outlinks] Found $URL_COUNT unique URLs"
+
+# Output success
+echo "$OUTPUT_FILE"
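For readers who want the pipeline's logic in one place, here is a rough TypeScript rendering of what the script above does: walk the snapshot directory, regex-match http(s) URLs, truncate each match at 2000 characters, deduplicate, sort, and write outlinks.txt. This is an illustration only, not part of the commit; collectOutlinks is a made-up name, and reading files as latin1 is an assumption standing in for cat's byte-level behavior.

// Illustrative sketch only (not from this commit): a TypeScript equivalent
// of the shell pipeline in the outlinks extractor above.
import { readdirSync, readFileSync, writeFileSync } from 'node:fs';
import { join } from 'node:path';

// Same permissive pattern as the script's URL_PATTERN
const URL_PATTERN = /https?:\/\/[a-zA-Z0-9@:%._+~#?&\/=-]+/g;

function collectOutlinks(snapshotDir: string): string[] {
  const urls = new Set<string>();
  // Walk the snapshot directory recursively, skipping the output file
  // and .env, just as the find invocation does
  const walk = (dir: string): void => {
    for (const entry of readdirSync(dir, { withFileTypes: true })) {
      const full = join(dir, entry.name);
      if (entry.isDirectory()) {
        walk(full);
      } else if (entry.name !== 'outlinks.txt' && entry.name !== '.env') {
        // latin1 maps bytes 1:1, roughly approximating `cat` on binary files
        const text = readFileSync(full, 'latin1');
        for (const match of text.match(URL_PATTERN) ?? []) {
          urls.add(match.slice(0, 2000)); // truncate like `cut -c 1-2000`
        }
      }
    }
  };
  walk(snapshotDir);
  return [...urls].sort(); // dedupe + sort, like `sort -u`
}

const found = collectOutlinks('.');
writeFileSync('outlinks.txt', found.join('\n') + '\n');
console.log(`[outlinks] Found ${found.length} unique URLs`);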
extractors.ts:

@@ -12,6 +12,7 @@ import type { ExtractorName } from './models';
 // 2captcha must run FIRST to download/configure extensions before Chrome launches
 // puppeteer runs second to launch Chrome with the extensions
 // downloads, images, and infiniscroll run early to capture dynamic content
+// outlinks runs LAST to extract URLs from all output files
 export const EXTRACTOR_ORDER: string[] = [
   '2captcha', // Downloads and configures Chrome extensions (2captcha, singlefile, etc.)
   'puppeteer', // Launches Chrome with extensions and writes CDP URL to .env
@@ -31,6 +32,7 @@ export const EXTRACTOR_ORDER: string[] = [
   'git', // Clones git repository (independent)
   'media', // Downloads media with yt-dlp (independent)
   'archive_org', // Submits to Internet Archive (independent)
+  'outlinks', // Extracts URLs from all output files (runs last)
 ];

 export interface ExtractorInfo {
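The runner that consumes EXTRACTOR_ORDER is not part of this diff, so the following is only a sketch under assumptions: each extractor is a standalone executable under extractors/, run sequentially with the snapshot directory as its working directory. That sequencing is what lets outlinks, being last, see every other extractor's output files. The runExtractors name, the path layout, and the error handling here are hypothetical, not the actual ArchiveBox code.

// Hypothetical runner sketch (assumed, not from this commit): run each
// extractor in EXTRACTOR_ORDER sequentially inside the snapshot directory.
import { execFileSync } from 'node:child_process';
import { resolve } from 'node:path';
import { EXTRACTOR_ORDER } from './extractors';

function runExtractors(snapshotDir: string): void {
  for (const name of EXTRACTOR_ORDER) {
    // Assumed layout: extractor scripts live next to this file in extractors/
    const script = resolve(__dirname, 'extractors', name);
    try {
      // Each extractor treats its CWD as the snapshot directory
      execFileSync(script, [], { cwd: snapshotDir, stdio: 'inherit' });
    } catch {
      // One failing extractor shouldn't abort the rest of the pipeline
      console.error(`[runner] extractor ${name} failed, continuing`);
    }
  }
}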
models.ts:

@@ -23,7 +23,8 @@ export type ExtractorName =
   | 'wget'
   | 'git'
   | 'media'
-  | 'archive_org';
+  | 'archive_org'
+  | 'outlinks';

 /**
  * Snapshot represents a single URL being archived
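One payoff of widening the ExtractorName union rather than passing bare strings: any table or function keyed by extractor name is checked at compile time. A small hypothetical illustration follows; PRIMARY_OUTPUT and its file paths are not from the codebase.

import type { ExtractorName } from './models';

// Hypothetical: map extractor names to their primary output path.
// A typo like 'outlink' would be a compile-time error, not a silent miss.
const PRIMARY_OUTPUT: Partial<Record<ExtractorName, string>> = {
  media: 'media/',
  outlinks: 'outlinks.txt',
};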