mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-04 06:47:57 +10:00
continue renaming extractor to plugin, add plan for hook concurrency, add chrome kill helper script
This commit is contained in:
@@ -20,7 +20,7 @@ const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'accessibility';
|
||||
const PLUGIN_NAME = 'accessibility';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'accessibility.json';
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
@@ -223,10 +223,14 @@ async function main() {
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
// Wait for page to be fully loaded
|
||||
const pageLoaded = await waitForChromeTabLoaded(60000);
|
||||
if (!pageLoaded) {
|
||||
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
|
||||
// Check if Chrome session exists, then wait for page load
|
||||
const cdpUrl = getCdpUrl();
|
||||
if (cdpUrl) {
|
||||
// Wait for page to be fully loaded
|
||||
const pageLoaded = await waitForChromeTabLoaded(60000);
|
||||
if (!pageLoaded) {
|
||||
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
|
||||
}
|
||||
}
|
||||
|
||||
const result = await extractAccessibility(url);
|
||||
|
||||
@@ -25,7 +25,7 @@ import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'archive_org'
|
||||
PLUGIN_NAME = 'archive_org'
|
||||
OUTPUT_DIR = '.'
|
||||
OUTPUT_FILE = 'archive.org.txt'
|
||||
|
||||
|
||||
@@ -26,101 +26,23 @@ const { spawn } = require('child_process');
|
||||
const http = require('http');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'chrome_launch';
|
||||
const PLUGIN_NAME = 'chrome_launch';
|
||||
const OUTPUT_DIR = 'chrome';
|
||||
|
||||
// Helper: Write PID file with mtime set to process start time
|
||||
// Helpers for PID file creation
|
||||
/**
 * Write a PID file and stamp it with the process start time.
 *
 * The file content is the decimal PID. When a usable start time is supplied,
 * both atime and mtime are set to it so the file can later be compared
 * against the start time of whatever process currently owns that PID.
 *
 * @param {string} filePath - Destination PID file.
 * @param {number} pid - Process ID to record.
 * @param {number|null|undefined} startTimeSeconds - Process start time in
 *   epoch seconds. When it is missing or not a finite number, the timestamps
 *   are left at the moment of writing instead of being forced to 1970-01-01.
 */
function writePidWithMtime(filePath, pid, startTimeSeconds) {
    fs.writeFileSync(filePath, String(pid));

    // null * 1000 === 0, which would silently stamp the file with the Unix
    // epoch; treat "unknown start time" as "keep the write-time mtime".
    const seconds = startTimeSeconds == null ? NaN : Number(startTimeSeconds);
    if (!Number.isFinite(seconds)) {
        return;
    }

    const startTime = new Date(seconds * 1000);
    fs.utimesSync(filePath, startTime, startTime);
}
|
||||
|
||||
// Helper: Write command script for validation
|
||||
/**
 * Write an executable bash script that records the command used to launch a process.
 *
 * The script consists of a `#!/bin/bash` line followed by the binary and its
 * arguments on one line, and is made executable (mode 0755).
 *
 * @param {string} filePath - Destination script path.
 * @param {string} binary - Executable path; quoted when it contains characters
 *   that are special to the shell (e.g. spaces).
 * @param {Array<string|number>} args - Command-line arguments.
 */
function writeCmdScript(filePath, binary, args) {
    // Words made only of these characters need no quoting in a POSIX shell.
    const SAFE_WORD = /^[A-Za-z0-9_\/:=.,@%+-]+$/;

    // Single quotes suppress every expansion ($VAR, `cmd`, \, globbing), which
    // double quotes do not; an embedded single quote is written as '\''.
    const shellQuote = (value) => {
        const word = String(value);
        return SAFE_WORD.test(word) ? word : `'${word.replace(/'/g, `'\\''`)}'`;
    };

    const commandLine = [binary, ...args].map(shellQuote).join(' ');
    fs.writeFileSync(filePath, `#!/bin/bash\n${commandLine}\n`);
    fs.chmodSync(filePath, 0o755);
}
|
||||
|
||||
// Helper: Get process start time (cross-platform)
|
||||
/**
 * Look up when a process started.
 *
 * On macOS the start time is taken from `ps -p PID -o lstart=`. On other
 * platforms it is derived from field 22 (`starttime`, in clock ticks since
 * boot) of `/proc/PID/stat` combined with `/proc/uptime`.
 *
 * @param {number} pid - Process ID to inspect.
 * @returns {number|null} Start time in epoch seconds, or null when the pid is
 *   not a positive integer or the start time cannot be determined.
 */
function getProcessStartTime(pid) {
    // The pid is interpolated into a shell command and a /proc path below, so
    // refuse anything that is not a plain positive integer.
    const numericPid = Number(pid);
    if (!Number.isInteger(numericPid) || numericPid <= 0) {
        return null;
    }

    try {
        if (process.platform === 'darwin') {
            const { execSync } = require('child_process');
            const output = execSync(`ps -p ${numericPid} -o lstart=`, { encoding: 'utf8', timeout: 1000 });
            const startMs = Date.parse(output.trim());
            // Date.parse yields NaN for output it cannot interpret
            return Number.isFinite(startMs) ? startMs / 1000 : null;
        }

        const stat = fs.readFileSync(`/proc/${numericPid}/stat`, 'utf8');
        // Field 2 (comm) is wrapped in parentheses and may itself contain
        // spaces or ')', so split only what follows the LAST ')'. That tail
        // begins at field 3 (state), which puts field 22 at index 19.
        const fieldsAfterComm = stat.slice(stat.lastIndexOf(')') + 1).trim().split(/\s+/);
        const startTicks = Number.parseInt(fieldsAfterComm[19], 10);
        if (!Number.isFinite(startTicks)) {
            return null;
        }

        const uptimeSeconds = parseFloat(fs.readFileSync('/proc/uptime', 'utf8').split(' ')[0]);
        if (!Number.isFinite(uptimeSeconds)) {
            return null;
        }

        const bootTime = Date.now() / 1000 - uptimeSeconds;
        // Assumes 100 clock ticks per second (the usual Linux USER_HZ).
        return bootTime + (startTicks / 100);
    } catch (e) {
        // Process gone, /proc unavailable, ps failed or timed out
        return null;
    }
}
|
||||
|
||||
// Helper: Validate PID using mtime and command
|
||||
/**
 * Check that `pid` still refers to the Chrome process a PID file was written for.
 *
 * Validation steps, in order:
 *   1. the process can be signalled (signal 0);
 *   2. when the process start time can be determined, it is within 5 seconds
 *      of the PID file's mtime (otherwise the PID is treated as reused);
 *   3. when `cmdFile` exists, its text mentions chrome/chromium
 *      (case-insensitively) and `--remote-debugging-port`.
 *
 * @param {number} pid - Process ID read from the PID file.
 * @param {string} pidFile - Path of the PID file whose mtime holds the start time.
 * @param {string} cmdFile - Path of the recorded launch command script.
 * @returns {boolean} true when every applicable check passes; false when a
 *   check fails or any file operation throws.
 */
function validatePid(pid, pidFile, cmdFile) {
    try {
        try {
            process.kill(pid, 0); // Signal 0 = existence/permission probe only
        } catch (e) {
            return false; // Process doesn't exist (or cannot be signalled by us)
        }

        const fileMtime = fs.statSync(pidFile).mtimeMs / 1000; // seconds
        const procStartTime = getProcessStartTime(pid);

        // Start time unavailable: skip only the mtime comparison and still
        // run the command check below rather than accepting the PID outright.
        if (procStartTime !== null && Math.abs(fileMtime - procStartTime) > 5) {
            return false; // PID was reused by a different process
        }

        if (fs.existsSync(cmdFile)) {
            // Lower-case so "Google Chrome" / "Chromium" binaries are recognised
            const cmd = fs.readFileSync(cmdFile, 'utf8').toLowerCase();
            const looksLikeChrome = cmd.includes('chrome') || cmd.includes('chromium');
            if (!looksLikeChrome || !cmd.includes('--remote-debugging-port')) {
                return false;
            }
        }

        return true;
    } catch (e) {
        return false;
    }
}
|
||||
|
||||
// Global state for cleanup
|
||||
let chromePid = null;
|
||||
|
||||
@@ -332,20 +254,20 @@ function killZombieChrome() {
|
||||
const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
|
||||
if (isNaN(pid) || pid <= 0) continue;
|
||||
|
||||
// Validate PID before killing
|
||||
const cmdFile = path.join(chromeDir, 'cmd.sh');
|
||||
if (!validatePid(pid, pidFile, cmdFile)) {
|
||||
// PID reused or validation failed
|
||||
console.error(`[!] PID ${pid} failed validation (reused or wrong process) - cleaning up`);
|
||||
// Check if process exists (simple check, Python will validate properly)
|
||||
try {
|
||||
process.kill(pid, 0);
|
||||
} catch (e) {
|
||||
// Process dead, remove stale PID file
|
||||
try { fs.unlinkSync(pidFile); } catch (e) {}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Process alive, validated, and crawl is stale - zombie!
|
||||
console.error(`[!] Found validated zombie (PID ${pid}) from stale crawl ${crawl.name}`);
|
||||
// Process alive and crawl is stale - zombie!
|
||||
console.error(`[!] Found zombie (PID ${pid}) from stale crawl ${crawl.name}`);
|
||||
|
||||
try {
|
||||
// Kill process group first
|
||||
// Kill process group
|
||||
try {
|
||||
process.kill(-pid, 'SIGKILL');
|
||||
} catch (e) {
|
||||
@@ -354,14 +276,10 @@ function killZombieChrome() {
|
||||
|
||||
killed++;
|
||||
console.error(`[+] Killed zombie (PID ${pid})`);
|
||||
|
||||
// Remove PID file
|
||||
try { fs.unlinkSync(pidFile); } catch (e) {}
|
||||
|
||||
} catch (e) {
|
||||
console.error(`[!] Failed to kill PID ${pid}: ${e.message}`);
|
||||
}
|
||||
|
||||
} catch (e) {
|
||||
// Skip invalid PID files
|
||||
}
|
||||
|
||||
@@ -29,7 +29,7 @@ const http = require('http');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'chrome_tab';
|
||||
const PLUGIN_NAME = 'chrome_tab';
|
||||
const OUTPUT_DIR = '.'; // Hook already runs in chrome/ output directory
|
||||
const CHROME_SESSION_DIR = '.';
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ const fs = require('fs');
|
||||
const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
const EXTRACTOR_NAME = 'chrome_navigate';
|
||||
const PLUGIN_NAME = 'chrome_navigate';
|
||||
const CHROME_SESSION_DIR = '.';
|
||||
const OUTPUT_DIR = '.';
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@ const fs = require('fs');
|
||||
const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
const EXTRACTOR_NAME = 'consolelog';
|
||||
const PLUGIN_NAME = 'consolelog';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'console.jsonl';
|
||||
const PID_FILE = 'hook.pid';
|
||||
|
||||
@@ -23,7 +23,7 @@ const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'dom';
|
||||
const PLUGIN_NAME = 'dom';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'output.html';
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
@@ -252,10 +252,14 @@ async function main() {
|
||||
}));
|
||||
process.exit(0);
|
||||
} else {
|
||||
// Wait for page to be fully loaded
|
||||
const pageLoaded = await waitForChromeTabLoaded(60000);
|
||||
if (!pageLoaded) {
|
||||
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
|
||||
// Only wait for page load if using shared Chrome session
|
||||
const cdpUrl = getCdpUrl();
|
||||
if (cdpUrl) {
|
||||
// Wait for page to be fully loaded
|
||||
const pageLoaded = await waitForChromeTabLoaded(60000);
|
||||
if (!pageLoaded) {
|
||||
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
|
||||
}
|
||||
}
|
||||
|
||||
const result = await dumpDom(url);
|
||||
|
||||
@@ -27,7 +27,7 @@ import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'favicon'
|
||||
PLUGIN_NAME = 'favicon'
|
||||
OUTPUT_DIR = '.'
|
||||
OUTPUT_FILE = 'favicon.ico'
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'forumdl'
|
||||
PLUGIN_NAME = 'forumdl'
|
||||
BIN_NAME = 'forum-dl'
|
||||
BIN_PROVIDERS = 'pip,env'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
@@ -32,7 +32,7 @@ import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'gallerydl'
|
||||
PLUGIN_NAME = 'gallerydl'
|
||||
BIN_NAME = 'gallery-dl'
|
||||
BIN_PROVIDERS = 'pip,env'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
@@ -24,7 +24,7 @@ import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'git'
|
||||
PLUGIN_NAME = 'git'
|
||||
BIN_NAME = 'git'
|
||||
BIN_PROVIDERS = 'apt,brew,env'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
@@ -21,7 +21,7 @@ const https = require('https');
|
||||
const http = require('http');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'headers';
|
||||
const PLUGIN_NAME = 'headers';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'headers.json';
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
|
||||
@@ -26,7 +26,7 @@ import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'htmltotext'
|
||||
PLUGIN_NAME = 'htmltotext'
|
||||
OUTPUT_DIR = '.'
|
||||
OUTPUT_FILE = 'htmltotext.txt'
|
||||
|
||||
|
||||
@@ -34,7 +34,7 @@ import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'media'
|
||||
PLUGIN_NAME = 'media'
|
||||
BIN_NAME = 'yt-dlp'
|
||||
BIN_PROVIDERS = 'pip,apt,brew,env'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
@@ -25,7 +25,7 @@ import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'mercury'
|
||||
PLUGIN_NAME = 'mercury'
|
||||
BIN_NAME = 'postlight-parser'
|
||||
BIN_PROVIDERS = 'npm,env'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
@@ -28,7 +28,7 @@ import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'papersdl'
|
||||
PLUGIN_NAME = 'papersdl'
|
||||
BIN_NAME = 'papers-dl'
|
||||
BIN_PROVIDERS = 'pip,env'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
@@ -23,7 +23,7 @@ const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'parse_dom_outlinks';
|
||||
const PLUGIN_NAME = 'parse_dom_outlinks';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'outlinks.json';
|
||||
const URLS_FILE = 'urls.jsonl'; // For crawl system
|
||||
@@ -190,7 +190,7 @@ async function extractOutlinks(url) {
|
||||
const urlsJsonl = crawlableUrls.map(href => JSON.stringify({
|
||||
type: 'Snapshot',
|
||||
url: href,
|
||||
via_extractor: EXTRACTOR_NAME,
|
||||
plugin: PLUGIN_NAME,
|
||||
})).join('\n');
|
||||
|
||||
if (urlsJsonl) {
|
||||
@@ -236,10 +236,14 @@ async function main() {
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
// Wait for page to be fully loaded
|
||||
const pageLoaded = await waitForChromeTabLoaded(60000);
|
||||
if (!pageLoaded) {
|
||||
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
|
||||
// Check if Chrome session exists, then wait for page load
|
||||
const cdpUrl = getCdpUrl();
|
||||
if (cdpUrl) {
|
||||
// Wait for page to be fully loaded
|
||||
const pageLoaded = await waitForChromeTabLoaded(60000);
|
||||
if (!pageLoaded) {
|
||||
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
|
||||
}
|
||||
}
|
||||
|
||||
const result = await extractOutlinks(url);
|
||||
|
||||
@@ -28,7 +28,7 @@ from urllib.parse import urljoin, urlparse
|
||||
|
||||
import rich_click as click
|
||||
|
||||
EXTRACTOR_NAME = 'parse_html_urls'
|
||||
PLUGIN_NAME = 'parse_html_urls'
|
||||
|
||||
# Check if parse_dom_outlinks extractor already ran
|
||||
DOM_OUTLINKS_URLS_FILE = Path('parse_dom_outlinks/urls.jsonl')
|
||||
@@ -179,7 +179,7 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
|
||||
record = {
|
||||
'type': 'Snapshot',
|
||||
'url': found_url,
|
||||
'via_extractor': EXTRACTOR_NAME,
|
||||
'plugin': PLUGIN_NAME,
|
||||
'depth': depth + 1,
|
||||
}
|
||||
if snapshot_id:
|
||||
|
||||
@@ -233,7 +233,7 @@ class TestParseHtmlUrls:
|
||||
entry = json.loads(output_file.read_text().strip())
|
||||
assert entry['url'] == 'https://example.com'
|
||||
assert 'type' in entry
|
||||
assert 'via_extractor' in entry
|
||||
assert 'plugin' in entry
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -24,7 +24,7 @@ from urllib.parse import urlparse
|
||||
|
||||
import rich_click as click
|
||||
|
||||
EXTRACTOR_NAME = 'parse_jsonl_urls'
|
||||
PLUGIN_NAME = 'parse_jsonl_urls'
|
||||
|
||||
|
||||
def parse_bookmarked_at(link: dict) -> str | None:
|
||||
@@ -75,7 +75,7 @@ def json_object_to_entry(link: dict) -> dict | None:
|
||||
entry = {
|
||||
'type': 'Snapshot',
|
||||
'url': unescape(url),
|
||||
'via_extractor': EXTRACTOR_NAME,
|
||||
'plugin': PLUGIN_NAME,
|
||||
}
|
||||
|
||||
# Parse title
|
||||
|
||||
@@ -265,7 +265,7 @@ class TestParseJsonlUrls:
|
||||
entry = json.loads(output_file.read_text().strip())
|
||||
assert entry['url'] == 'https://example.com'
|
||||
assert 'type' in entry
|
||||
assert 'via_extractor' in entry
|
||||
assert 'plugin' in entry
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -22,7 +22,7 @@ from urllib.parse import urlparse
|
||||
|
||||
import rich_click as click
|
||||
|
||||
EXTRACTOR_NAME = 'parse_netscape_urls'
|
||||
PLUGIN_NAME = 'parse_netscape_urls'
|
||||
|
||||
# Constants for timestamp epoch detection
|
||||
UNIX_EPOCH = 0 # 1970-01-01 00:00:00 UTC
|
||||
@@ -187,7 +187,7 @@ def main(url: str, snapshot_id: str = None):
|
||||
entry = {
|
||||
'type': 'Snapshot',
|
||||
'url': unescape(bookmark_url),
|
||||
'via_extractor': EXTRACTOR_NAME,
|
||||
'plugin': PLUGIN_NAME,
|
||||
}
|
||||
if title:
|
||||
entry['title'] = unescape(title)
|
||||
|
||||
@@ -23,7 +23,7 @@ from urllib.parse import urlparse
|
||||
|
||||
import rich_click as click
|
||||
|
||||
EXTRACTOR_NAME = 'parse_rss_urls'
|
||||
PLUGIN_NAME = 'parse_rss_urls'
|
||||
|
||||
try:
|
||||
import feedparser
|
||||
@@ -107,7 +107,7 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
|
||||
entry = {
|
||||
'type': 'Snapshot',
|
||||
'url': unescape(item_url),
|
||||
'via_extractor': EXTRACTOR_NAME,
|
||||
'plugin': PLUGIN_NAME,
|
||||
'depth': depth + 1,
|
||||
}
|
||||
if snapshot_id:
|
||||
|
||||
@@ -47,7 +47,7 @@ class TestRssVariants:
|
||||
|
||||
assert entry['url'] == 'https://example.com/article1'
|
||||
assert entry['title'] == 'RSS 0.91 Article'
|
||||
assert entry['via_extractor'] == 'parse_rss_urls'
|
||||
assert entry['plugin'] == 'parse_rss_urls'
|
||||
|
||||
def test_rss_10_rdf(self, tmp_path):
|
||||
"""Test RSS 1.0 (RDF) format."""
|
||||
|
||||
@@ -25,7 +25,7 @@ from urllib.request import urlopen
|
||||
|
||||
import rich_click as click
|
||||
|
||||
EXTRACTOR_NAME = 'parse_txt_urls'
|
||||
PLUGIN_NAME = 'parse_txt_urls'
|
||||
|
||||
# URL regex from archivebox/misc/util.py
|
||||
# https://mathiasbynens.be/demo/url-regex
|
||||
@@ -127,7 +127,7 @@ def main(url: str, snapshot_id: str = None):
|
||||
f.write(json.dumps({
|
||||
'type': 'Snapshot',
|
||||
'url': found_url,
|
||||
'via_extractor': EXTRACTOR_NAME,
|
||||
'plugin': PLUGIN_NAME,
|
||||
}) + '\n')
|
||||
|
||||
click.echo(f'Found {len(urls_found)} URLs')
|
||||
|
||||
@@ -186,7 +186,7 @@ https://other.com
|
||||
entry = json.loads(output_file.read_text().strip())
|
||||
assert entry['url'] == 'https://example.com'
|
||||
assert 'type' in entry
|
||||
assert 'via_extractor' in entry
|
||||
assert 'plugin' in entry
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -22,7 +22,7 @@ const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'pdf';
|
||||
const PLUGIN_NAME = 'pdf';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'output.pdf';
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
@@ -254,10 +254,14 @@ async function main() {
|
||||
}));
|
||||
process.exit(0); // Permanent skip - staticfile already handled
|
||||
} else {
|
||||
// Wait for page to be fully loaded
|
||||
const pageLoaded = await waitForChromeTabLoaded(60000);
|
||||
if (!pageLoaded) {
|
||||
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
|
||||
// Only wait for page load if using shared Chrome session
|
||||
const cdpUrl = getCdpUrl();
|
||||
if (cdpUrl) {
|
||||
// Wait for page to be fully loaded
|
||||
const pageLoaded = await waitForChromeTabLoaded(60000);
|
||||
if (!pageLoaded) {
|
||||
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
|
||||
}
|
||||
}
|
||||
|
||||
const result = await printToPdf(url);
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Shared utilities for extractor hooks.
|
||||
Shared utilities for extractor plugin hooks.
|
||||
|
||||
This module provides common functionality for all extractors to ensure
|
||||
This module provides common functionality for all extractor plugins to ensure
|
||||
consistent behavior, output format, error handling, and timing.
|
||||
|
||||
All extractors should:
|
||||
All extractor plugins should:
|
||||
1. Import and use these utilities
|
||||
2. Output consistent metadata (CMD, VERSION, OUTPUT, timing)
|
||||
3. Write all files to $PWD
|
||||
@@ -35,7 +35,7 @@ STATIC_EXTENSIONS = (
|
||||
|
||||
|
||||
def is_static_file(url: str) -> bool:
|
||||
"""Check if URL points to a static file that may not need browser extraction."""
|
||||
"""Check if URL points to a static file that may not need browser-based extractor plugins."""
|
||||
return url.lower().split('?')[0].split('#')[0].endswith(STATIC_EXTENSIONS)
|
||||
|
||||
|
||||
@@ -96,7 +96,7 @@ def get_version(binary: str, version_args: list[str] | None = None) -> str:
|
||||
|
||||
class ExtractorResult:
|
||||
"""
|
||||
Tracks extractor execution and produces consistent output.
|
||||
Tracks extractor plugin execution and produces consistent output.
|
||||
|
||||
Usage:
|
||||
result = ExtractorResult(name='wget', url=url)
|
||||
@@ -152,7 +152,7 @@ class ExtractorResult:
|
||||
return 1
|
||||
|
||||
def finish(self, status: str | None = None):
|
||||
"""Mark extraction as finished and print results."""
|
||||
"""Mark extractor plugin execution as finished and print results."""
|
||||
self.end_ts = datetime.now(timezone.utc)
|
||||
if status:
|
||||
self.status = status
|
||||
@@ -27,7 +27,7 @@ import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'readability'
|
||||
PLUGIN_NAME = 'readability'
|
||||
BIN_NAME = 'readability-extractor'
|
||||
BIN_PROVIDERS = 'npm,env'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
@@ -14,7 +14,7 @@ const fs = require('fs');
|
||||
const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
const EXTRACTOR_NAME = 'redirects';
|
||||
const PLUGIN_NAME = 'redirects';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'redirects.jsonl';
|
||||
const PID_FILE = 'hook.pid';
|
||||
@@ -235,7 +235,7 @@ function handleShutdown(signal) {
|
||||
type: 'ArchiveResult',
|
||||
status: 'succeeded',
|
||||
output_str: OUTPUT_FILE,
|
||||
extractor: EXTRACTOR_NAME,
|
||||
plugin: PLUGIN_NAME,
|
||||
original_url: originalUrl,
|
||||
final_url: finalUrl || originalUrl,
|
||||
redirect_count: redirectChain.length,
|
||||
|
||||
@@ -15,7 +15,7 @@ const path = require('path');
|
||||
const crypto = require('crypto');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
const EXTRACTOR_NAME = 'responses';
|
||||
const PLUGIN_NAME = 'responses';
|
||||
const OUTPUT_DIR = '.';
|
||||
const PID_FILE = 'hook.pid';
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
|
||||
@@ -22,7 +22,7 @@ const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'screenshot';
|
||||
const PLUGIN_NAME = 'screenshot';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'screenshot.png';
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
@@ -250,10 +250,14 @@ async function main() {
|
||||
}));
|
||||
process.exit(0); // Permanent skip - staticfile already handled
|
||||
} else {
|
||||
// Wait for page to be fully loaded
|
||||
const pageLoaded = await waitForChromeTabLoaded(60000);
|
||||
if (!pageLoaded) {
|
||||
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
|
||||
// Only wait for page load if using shared Chrome session
|
||||
const cdpUrl = getCdpUrl();
|
||||
if (cdpUrl) {
|
||||
// Wait for page to be fully loaded
|
||||
const pageLoaded = await waitForChromeTabLoaded(60000);
|
||||
if (!pageLoaded) {
|
||||
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
|
||||
}
|
||||
}
|
||||
|
||||
const result = await takeScreenshot(url);
|
||||
|
||||
@@ -27,7 +27,7 @@ import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'index_sonic'
|
||||
PLUGIN_NAME = 'index_sonic'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
# Text file patterns to index
|
||||
@@ -83,14 +83,14 @@ def find_indexable_content() -> list[tuple[str, str]]:
|
||||
cwd = Path.cwd()
|
||||
|
||||
for extractor, file_pattern in INDEXABLE_FILES:
|
||||
extractor_dir = cwd / extractor
|
||||
if not extractor_dir.exists():
|
||||
plugin_dir = cwd / extractor
|
||||
if not plugin_dir.exists():
|
||||
continue
|
||||
|
||||
if '*' in file_pattern:
|
||||
matches = list(extractor_dir.glob(file_pattern))
|
||||
matches = list(plugin_dir.glob(file_pattern))
|
||||
else:
|
||||
match = extractor_dir / file_pattern
|
||||
match = plugin_dir / file_pattern
|
||||
matches = [match] if match.exists() else []
|
||||
|
||||
for match in matches:
|
||||
|
||||
@@ -25,7 +25,7 @@ import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'index_sqlite'
|
||||
PLUGIN_NAME = 'index_sqlite'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
# Text file patterns to index, in priority order
|
||||
@@ -74,14 +74,14 @@ def find_indexable_content() -> list[tuple[str, str]]:
|
||||
cwd = Path.cwd()
|
||||
|
||||
for extractor, file_pattern in INDEXABLE_FILES:
|
||||
extractor_dir = cwd / extractor
|
||||
if not extractor_dir.exists():
|
||||
plugin_dir = cwd / extractor
|
||||
if not plugin_dir.exists():
|
||||
continue
|
||||
|
||||
if '*' in file_pattern:
|
||||
matches = list(extractor_dir.glob(file_pattern))
|
||||
matches = list(plugin_dir.glob(file_pattern))
|
||||
else:
|
||||
match = extractor_dir / file_pattern
|
||||
match = plugin_dir / file_pattern
|
||||
matches = [match] if match.exists() else []
|
||||
|
||||
for match in matches:
|
||||
|
||||
@@ -20,7 +20,7 @@ const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'seo';
|
||||
const PLUGIN_NAME = 'seo';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'seo.json';
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
@@ -177,10 +177,14 @@ async function main() {
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
// Wait for page to be fully loaded
|
||||
const pageLoaded = await waitForChromeTabLoaded(60000);
|
||||
if (!pageLoaded) {
|
||||
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
|
||||
// Check if Chrome session exists, then wait for page load
|
||||
const cdpUrl = getCdpUrl();
|
||||
if (cdpUrl) {
|
||||
// Wait for page to be fully loaded
|
||||
const pageLoaded = await waitForChromeTabLoaded(60000);
|
||||
if (!pageLoaded) {
|
||||
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
|
||||
}
|
||||
}
|
||||
|
||||
const result = await extractSeo(url);
|
||||
|
||||
@@ -36,7 +36,7 @@ import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'singlefile'
|
||||
PLUGIN_NAME = 'singlefile'
|
||||
BIN_NAME = 'single-file'
|
||||
BIN_PROVIDERS = 'npm,env'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
@@ -14,7 +14,7 @@ const fs = require('fs');
|
||||
const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
const EXTRACTOR_NAME = 'ssl';
|
||||
const PLUGIN_NAME = 'ssl';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'ssl.jsonl';
|
||||
const PID_FILE = 'hook.pid';
|
||||
|
||||
@@ -14,7 +14,7 @@ const fs = require('fs');
|
||||
const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
const EXTRACTOR_NAME = 'staticfile';
|
||||
const PLUGIN_NAME = 'staticfile';
|
||||
const OUTPUT_DIR = '.';
|
||||
const PID_FILE = 'hook.pid';
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
@@ -326,7 +326,7 @@ function handleShutdown(signal) {
|
||||
type: 'ArchiveResult',
|
||||
status: 'skipped',
|
||||
output_str: 'No Content-Type detected',
|
||||
extractor: EXTRACTOR_NAME,
|
||||
plugin: PLUGIN_NAME,
|
||||
};
|
||||
} else if (!isStaticFile) {
|
||||
// Not a static file (normal case for HTML pages)
|
||||
@@ -334,7 +334,7 @@ function handleShutdown(signal) {
|
||||
type: 'ArchiveResult',
|
||||
status: 'skipped',
|
||||
output_str: `Not a static file (Content-Type: ${detectedContentType})`,
|
||||
extractor: EXTRACTOR_NAME,
|
||||
plugin: PLUGIN_NAME,
|
||||
content_type: detectedContentType,
|
||||
};
|
||||
} else if (downloadError) {
|
||||
@@ -343,7 +343,7 @@ function handleShutdown(signal) {
|
||||
type: 'ArchiveResult',
|
||||
status: 'failed',
|
||||
output_str: downloadError,
|
||||
extractor: EXTRACTOR_NAME,
|
||||
plugin: PLUGIN_NAME,
|
||||
content_type: detectedContentType,
|
||||
};
|
||||
} else if (downloadedFilePath) {
|
||||
@@ -352,7 +352,7 @@ function handleShutdown(signal) {
|
||||
type: 'ArchiveResult',
|
||||
status: 'succeeded',
|
||||
output_str: downloadedFilePath,
|
||||
extractor: EXTRACTOR_NAME,
|
||||
plugin: PLUGIN_NAME,
|
||||
content_type: detectedContentType,
|
||||
};
|
||||
} else {
|
||||
@@ -361,7 +361,7 @@ function handleShutdown(signal) {
|
||||
type: 'ArchiveResult',
|
||||
status: 'failed',
|
||||
output_str: 'Static file detected but download did not complete',
|
||||
extractor: EXTRACTOR_NAME,
|
||||
plugin: PLUGIN_NAME,
|
||||
content_type: detectedContentType,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -20,7 +20,7 @@ const https = require('https');
|
||||
const http = require('http');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'title';
|
||||
const PLUGIN_NAME = 'title';
|
||||
const OUTPUT_DIR = '.';
|
||||
const OUTPUT_FILE = 'title.txt';
|
||||
const CHROME_SESSION_DIR = '../chrome';
|
||||
|
||||
@@ -39,7 +39,7 @@ import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'wget'
|
||||
PLUGIN_NAME = 'wget'
|
||||
BIN_NAME = 'wget'
|
||||
BIN_PROVIDERS = 'apt,brew,env'
|
||||
OUTPUT_DIR = '.'
|
||||
|
||||
Reference in New Issue
Block a user