Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2026-01-03 01:15:57 +10:00)
Add outlinks extractor to extract URLs from all output files
- New extractor: outlinks
- Runs at the end after all other extractors
- Finds all URLs in output files using grep
- Truncates URLs at 2000 characters
- Deduplicates and sorts output
- Outputs to outlinks.txt (one URL per line)
- Updated ExtractorName type in models.ts to include 'outlinks'
- Updated EXTRACTOR_ORDER in extractors.ts to run outlinks last
- Tested with sample HTML and text files containing URLs

The extractor uses a find + grep + sort + uniq pipeline to extract all http(s):// URLs from any output files in the snapshot directory.
archivebox-ts/extractors/outlinks (new executable file, 35 lines)
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# Extractor: outlinks
+# Description: Extracts all URLs from output files in the snapshot directory
+# Dependencies: grep, sort, uniq
+# Outputs: outlinks.txt (one URL per line, sorted and deduplicated)
+
+set -e
+
+# CWD is the snapshot directory
+SNAPSHOT_DIR="$(pwd)"
+OUTPUT_FILE="outlinks.txt"
+
+# URL regex pattern that matches http(s):// URLs
+# This pattern is fairly permissive to catch various URL formats
+URL_PATTERN='https?://[a-zA-Z0-9@:%._+~#?&/=-]+'
+
+# Find all files in the snapshot directory (excluding outlinks.txt itself)
+# Pass them through grep to find URLs
+# Truncate each URL at 2000 chars
+# Sort and deduplicate
+# Output to outlinks.txt
+
+# Find all files and extract URLs
+find . -type f ! -name "outlinks.txt" ! -name ".env" -exec cat {} + 2>/dev/null | \
+    grep -oE "$URL_PATTERN" 2>/dev/null | \
+    cut -c 1-2000 | \
+    sort -u > "$OUTPUT_FILE" || true
+
+# Count how many URLs were found
+URL_COUNT=$(wc -l < "$OUTPUT_FILE" | tr -d ' ')
+
+echo "[outlinks] Found $URL_COUNT unique URLs"
+
+# Output success
+echo "$OUTPUT_FILE"
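For readers who want the pipeline's logic in one place, here is a rough TypeScript rendering of what the script above does: walk the snapshot directory, regex-match http(s) URLs, truncate each match at 2000 characters, deduplicate, sort, and write outlinks.txt. This is an illustration only, not part of the commit; collectOutlinks is a made-up name, and reading files as latin1 is an assumption standing in for cat's byte-level behavior.

// Illustrative sketch only (not from this commit): a TypeScript equivalent
// of the shell pipeline in the outlinks extractor above.
import { readdirSync, readFileSync, writeFileSync } from 'node:fs';
import { join } from 'node:path';

// Same permissive pattern as the script's URL_PATTERN
const URL_PATTERN = /https?:\/\/[a-zA-Z0-9@:%._+~#?&\/=-]+/g;

function collectOutlinks(snapshotDir: string): string[] {
  const urls = new Set<string>();
  // Walk the snapshot directory recursively, skipping the output file
  // and .env, just as the find invocation does
  const walk = (dir: string): void => {
    for (const entry of readdirSync(dir, { withFileTypes: true })) {
      const full = join(dir, entry.name);
      if (entry.isDirectory()) {
        walk(full);
      } else if (entry.name !== 'outlinks.txt' && entry.name !== '.env') {
        // latin1 maps bytes 1:1, roughly approximating `cat` on binary files
        const text = readFileSync(full, 'latin1');
        for (const match of text.match(URL_PATTERN) ?? []) {
          urls.add(match.slice(0, 2000)); // truncate like `cut -c 1-2000`
        }
      }
    }
  };
  walk(snapshotDir);
  return [...urls].sort(); // dedupe + sort, like `sort -u`
}

const found = collectOutlinks('.');
writeFileSync('outlinks.txt', found.join('\n') + '\n');
console.log(`[outlinks] Found ${found.length} unique URLs`);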
extractors.ts:

@@ -12,6 +12,7 @@ import type { ExtractorName } from './models';
 // 2captcha must run FIRST to download/configure extensions before Chrome launches
 // puppeteer runs second to launch Chrome with the extensions
 // downloads, images, and infiniscroll run early to capture dynamic content
+// outlinks runs LAST to extract URLs from all output files
 export const EXTRACTOR_ORDER: string[] = [
   '2captcha', // Downloads and configures Chrome extensions (2captcha, singlefile, etc.)
   'puppeteer', // Launches Chrome with extensions and writes CDP URL to .env
@@ -31,6 +32,7 @@ export const EXTRACTOR_ORDER: string[] = [
   'git', // Clones git repository (independent)
   'media', // Downloads media with yt-dlp (independent)
   'archive_org', // Submits to Internet Archive (independent)
+  'outlinks', // Extracts URLs from all output files (runs last)
 ];

 export interface ExtractorInfo {
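The runner that consumes EXTRACTOR_ORDER is not part of this diff, so the following is only a sketch under assumptions: each extractor is a standalone executable under extractors/, run sequentially with the snapshot directory as its working directory. That sequencing is what lets outlinks, being last, see every other extractor's output files. The runExtractors name, the path layout, and the error handling here are hypothetical, not the actual ArchiveBox code.

// Hypothetical runner sketch (assumed, not from this commit): run each
// extractor in EXTRACTOR_ORDER sequentially inside the snapshot directory.
import { execFileSync } from 'node:child_process';
import { resolve } from 'node:path';
import { EXTRACTOR_ORDER } from './extractors';

function runExtractors(snapshotDir: string): void {
  for (const name of EXTRACTOR_ORDER) {
    // Assumed layout: extractor scripts live next to this file in extractors/
    const script = resolve(__dirname, 'extractors', name);
    try {
      // Each extractor treats its CWD as the snapshot directory
      execFileSync(script, [], { cwd: snapshotDir, stdio: 'inherit' });
    } catch {
      // One failing extractor shouldn't abort the rest of the pipeline
      console.error(`[runner] extractor ${name} failed, continuing`);
    }
  }
}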
models.ts:

@@ -23,7 +23,8 @@ export type ExtractorName =
   | 'wget'
   | 'git'
   | 'media'
-  | 'archive_org';
+  | 'archive_org'
+  | 'outlinks';

 /**
  * Snapshot represents a single URL being archived
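One payoff of widening the ExtractorName union rather than passing bare strings: any table or function keyed by extractor name is checked at compile time. A small hypothetical illustration follows; PRIMARY_OUTPUT and its file paths are not from the codebase.

import type { ExtractorName } from './models';

// Hypothetical: map extractor names to their primary output path.
// A typo like 'outlink' would be a compile-time error, not a silent miss.
const PRIMARY_OUTPUT: Partial<Record<ExtractorName, string>> = {
  media: 'media/',
  outlinks: 'outlinks.txt',
};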