From 57072382bbf96d301784d2e30c0b483800738ad1 Mon Sep 17 00:00:00 2001
From: Claude
Date: Sat, 15 Nov 2025 20:59:08 +0000
Subject: [PATCH] Add outlinks extractor to extract URLs from all output files

- New extractor: outlinks
- Runs at the end after all other extractors
- Finds all URLs in output files using grep
- Truncates URLs at 2000 characters
- Deduplicates and sorts output
- Outputs to outlinks.txt (one URL per line)
- Updated ExtractorName type in models.ts to include 'outlinks'
- Updated EXTRACTOR_ORDER in extractors.ts to run outlinks last
- Tested with sample HTML and text files containing URLs

The extractor uses a find + grep + sort + uniq pipeline to extract all
http(s):// URLs from any output files in the snapshot directory.
---
 archivebox-ts/extractors/outlinks | 35 +++++++++++++++++++++++++++++++
 archivebox-ts/src/extractors.ts   |  2 ++
 archivebox-ts/src/models.ts       |  3 ++-
 3 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100755 archivebox-ts/extractors/outlinks

diff --git a/archivebox-ts/extractors/outlinks b/archivebox-ts/extractors/outlinks
new file mode 100755
index 00000000..9bd3a651
--- /dev/null
+++ b/archivebox-ts/extractors/outlinks
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# Extractor: outlinks
+# Description: Extracts all URLs from output files in the snapshot directory
+# Dependencies: grep, sort, uniq
+# Outputs: outlinks.txt (one URL per line, sorted and deduplicated)
+
+set -e
+
+# CWD is the snapshot directory
+SNAPSHOT_DIR="$(pwd)"
+OUTPUT_FILE="outlinks.txt"
+
+# URL regex pattern that matches http(s):// URLs
+# This pattern is fairly permissive to catch various URL formats
+URL_PATTERN='https?://[a-zA-Z0-9@:%._+~#?&/=-]+'
+
+# Find all files in the snapshot directory (excluding outlinks.txt itself)
+# Pass them through grep to find URLs
+# Truncate each URL at 2000 chars
+# Sort and deduplicate
+# Output to outlinks.txt
+
+# Find all files and extract URLs
+find . -type f ! -name "outlinks.txt" ! -name ".env" -exec cat {} + 2>/dev/null | \
+  grep -oE "$URL_PATTERN" 2>/dev/null | \
+  cut -c 1-2000 | \
+  sort -u > "$OUTPUT_FILE" || true
+
+# Count how many URLs were found
+URL_COUNT=$(wc -l < "$OUTPUT_FILE" | tr -d ' ')
+
+echo "[outlinks] Found $URL_COUNT unique URLs"
+
+# Output success
+echo "$OUTPUT_FILE"
diff --git a/archivebox-ts/src/extractors.ts b/archivebox-ts/src/extractors.ts
index 5d6fa4f3..b20086fb 100644
--- a/archivebox-ts/src/extractors.ts
+++ b/archivebox-ts/src/extractors.ts
@@ -12,6 +12,7 @@ import type { ExtractorName } from './models';
 // 2captcha must run FIRST to download/configure extensions before Chrome launches
 // puppeteer runs second to launch Chrome with the extensions
 // downloads, images, and infiniscroll run early to capture dynamic content
+// outlinks runs LAST to extract URLs from all output files
 export const EXTRACTOR_ORDER: string[] = [
   '2captcha', // Downloads and configures Chrome extensions (2captcha, singlefile, etc.)
   'puppeteer', // Launches Chrome with extensions and writes CDP URL to .env
@@ -31,6 +32,7 @@ export const EXTRACTOR_ORDER: string[] = [
   'git', // Clones git repository (independent)
   'media', // Downloads media with yt-dlp (independent)
   'archive_org', // Submits to Internet Archive (independent)
+  'outlinks', // Extracts URLs from all output files (runs last)
 ];
 
 export interface ExtractorInfo {
diff --git a/archivebox-ts/src/models.ts b/archivebox-ts/src/models.ts
index ec14f9b7..4b81578f 100644
--- a/archivebox-ts/src/models.ts
+++ b/archivebox-ts/src/models.ts
@@ -23,7 +23,8 @@ export type ExtractorName =
   | 'wget'
   | 'git'
   | 'media'
-  | 'archive_org';
+  | 'archive_org'
+  | 'outlinks';
 
 /**
  * Snapshot represents a single URL being archived
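
For reference, a minimal sketch of how the extractor's pipeline can be smoke-tested by hand, in the spirit of the "tested with sample HTML and text files" note in the commit message. The temp directory and sample file names are hypothetical; the find/grep/cut/sort pipeline is taken verbatim from the extractor above, with URL_PATTERN inlined.

# Hypothetical smoke test (not part of the patch); paths and file names are made up.
mkdir -p /tmp/outlinks-test && cd /tmp/outlinks-test

# Sample HTML file containing two URLs
cat > index.html <<'EOF'
<a href="https://example.com/page">link</a>
<img src="https://example.com/img.png">
EOF

# Sample text file with the same URL twice, to exercise deduplication
printf 'see https://example.org/a and again https://example.org/a\n' > notes.txt

# The extractor's pipeline, with URL_PATTERN inlined
find . -type f ! -name "outlinks.txt" ! -name ".env" -exec cat {} + 2>/dev/null | \
  grep -oE 'https?://[a-zA-Z0-9@:%._+~#?&/=-]+' | \
  cut -c 1-2000 | \
  sort -u > outlinks.txt

cat outlinks.txt
# https://example.com/img.png
# https://example.com/page
# https://example.org/a

Note that the double quotes around the href/src attributes are not in the character class of URL_PATTERN, so grep stops each match at the closing quote, and sort -u collapses the duplicate example.org URL to a single line.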