continue renaming extractor to plugin, add plan for hook concurrency, add chrome kill helper script

This commit is contained in:
Nick Sweeting
2025-12-28 05:29:24 -08:00
parent d2e65cfd38
commit 4ccb0863bb
53 changed files with 456 additions and 493 deletions

View File

@@ -20,7 +20,7 @@ const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'accessibility';
const PLUGIN_NAME = 'accessibility';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'accessibility.json';
const CHROME_SESSION_DIR = '../chrome';
@@ -223,10 +223,14 @@ async function main() {
process.exit(0);
}
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
// Check if Chrome session exists, then wait for page load
const cdpUrl = getCdpUrl();
if (cdpUrl) {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
}
const result = await extractAccessibility(url);

View File

@@ -25,7 +25,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'archive_org'
PLUGIN_NAME = 'archive_org'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'archive.org.txt'

View File

@@ -26,101 +26,23 @@ const { spawn } = require('child_process');
const http = require('http');
// Extractor metadata
const EXTRACTOR_NAME = 'chrome_launch';
const PLUGIN_NAME = 'chrome_launch';
const OUTPUT_DIR = 'chrome';
// Helper: Write PID file with mtime set to process start time
// Helpers for PID file creation
function writePidWithMtime(filePath, pid, startTimeSeconds) {
fs.writeFileSync(filePath, String(pid));
// Set both atime and mtime to process start time for validation
const startTimeMs = startTimeSeconds * 1000;
fs.utimesSync(filePath, new Date(startTimeMs), new Date(startTimeMs));
}
// Helper: Write an executable cmd.sh recording the exact Chrome launch command,
// used later by validatePid to confirm a PID still belongs to this invocation.
// NOTE: the previous version built the script twice (diff artifact); this
// writes the file exactly once.
function writeCmdScript(filePath, binary, args) {
  // Shell-escape any argument containing spaces or characters special to bash
  const escape = arg => (arg.includes(' ') || arg.includes('"') || arg.includes('$'))
    ? `"${arg.replace(/"/g, '\\"')}"` : arg;
  fs.writeFileSync(filePath, `#!/bin/bash\n${binary} ${args.map(escape).join(' ')}\n`);
  fs.chmodSync(filePath, 0o755); // must be executable to be a valid script
}
// Helper: Get a process's start time in epoch seconds (cross-platform).
// Returns null when the start time cannot be determined (process gone,
// unsupported platform, or unparseable /proc data).
function getProcessStartTime(pid) {
  try {
    if (process.platform === 'darwin') {
      // macOS: `ps -o lstart=` prints a date string Date.parse understands
      const { execSync } = require('child_process');
      const output = execSync(`ps -p ${pid} -o lstart=`, { encoding: 'utf8', timeout: 1000 });
      return Date.parse(output.trim()) / 1000; // convert ms -> epoch seconds
    }
    // Linux: /proc/PID/stat field 22 is starttime in clock ticks since boot.
    // The comm field (2) may itself contain spaces or ')', so split on the
    // LAST ')' — rest[0] is then state (field 3), making starttime rest[19].
    // (The old regex here grabbed field 4 (ppid) by mistake.)
    const stat = fs.readFileSync(`/proc/${pid}/stat`, 'utf8');
    const rest = stat.slice(stat.lastIndexOf(')') + 2).trim().split(/\s+/);
    const startTicks = parseInt(rest[19], 10);
    if (Number.isNaN(startTicks)) {
      return null;
    }
    // Convert ticks to seconds; USER_HZ is 100 on effectively all Linux systems
    const uptimeSeconds = parseFloat(fs.readFileSync('/proc/uptime', 'utf8').split(' ')[0]);
    const bootTime = Date.now() / 1000 - uptimeSeconds;
    return bootTime + (startTicks / 100);
  } catch (e) {
    // Process doesn't exist or platform lacks /proc and ps support
    return null;
  }
}
// Helper: Verify that `pid` still refers to the Chrome process that wrote
// `pidFile`. Three checks, strongest available wins:
//   1. the process must exist (kill with signal 0),
//   2. the PID file's mtime must match the process start time within 5s
//      (detects the PID being recycled by an unrelated process),
//   3. if cmd.sh exists, it must look like a Chrome/Chromium launch with a
//      remote debugging port.
function validatePid(pid, pidFile, cmdFile) {
  try {
    try {
      process.kill(pid, 0); // signal 0 = existence probe, sends nothing
    } catch (e) {
      return false; // no such process
    }
    const mtimeSeconds = fs.statSync(pidFile).mtimeMs / 1000;
    const startedAt = getProcessStartTime(pid);
    if (startedAt === null) {
      // Start time unavailable - degrade gracefully to the existence check
      return true;
    }
    if (Math.abs(mtimeSeconds - startedAt) > 5) {
      return false; // mtime drifted from start time: PID was reused
    }
    if (fs.existsSync(cmdFile)) {
      const cmd = fs.readFileSync(cmdFile, 'utf8');
      const looksLikeChrome = cmd.includes('chrome') || cmd.includes('chromium');
      if (!looksLikeChrome || !cmd.includes('--remote-debugging-port')) {
        return false;
      }
    }
    return true;
  } catch (e) {
    // Any filesystem error (e.g. pidFile vanished) counts as validation failure
    return false;
  }
}
// Global state for cleanup
let chromePid = null;
@@ -332,20 +254,20 @@ function killZombieChrome() {
const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
if (isNaN(pid) || pid <= 0) continue;
// Validate PID before killing
const cmdFile = path.join(chromeDir, 'cmd.sh');
if (!validatePid(pid, pidFile, cmdFile)) {
// PID reused or validation failed
console.error(`[!] PID ${pid} failed validation (reused or wrong process) - cleaning up`);
// Check if process exists (simple check, Python will validate properly)
try {
process.kill(pid, 0);
} catch (e) {
// Process dead, remove stale PID file
try { fs.unlinkSync(pidFile); } catch (e) {}
continue;
}
// Process alive, validated, and crawl is stale - zombie!
console.error(`[!] Found validated zombie (PID ${pid}) from stale crawl ${crawl.name}`);
// Process alive and crawl is stale - zombie!
console.error(`[!] Found zombie (PID ${pid}) from stale crawl ${crawl.name}`);
try {
// Kill process group first
// Kill process group
try {
process.kill(-pid, 'SIGKILL');
} catch (e) {
@@ -354,14 +276,10 @@ function killZombieChrome() {
killed++;
console.error(`[+] Killed zombie (PID ${pid})`);
// Remove PID file
try { fs.unlinkSync(pidFile); } catch (e) {}
} catch (e) {
console.error(`[!] Failed to kill PID ${pid}: ${e.message}`);
}
} catch (e) {
// Skip invalid PID files
}

View File

@@ -29,7 +29,7 @@ const http = require('http');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'chrome_tab';
const PLUGIN_NAME = 'chrome_tab';
const OUTPUT_DIR = '.'; // Hook already runs in chrome/ output directory
const CHROME_SESSION_DIR = '.';

View File

@@ -19,7 +19,7 @@ const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
const EXTRACTOR_NAME = 'chrome_navigate';
const PLUGIN_NAME = 'chrome_navigate';
const CHROME_SESSION_DIR = '.';
const OUTPUT_DIR = '.';

View File

@@ -14,7 +14,7 @@ const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
const EXTRACTOR_NAME = 'consolelog';
const PLUGIN_NAME = 'consolelog';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'console.jsonl';
const PID_FILE = 'hook.pid';

View File

@@ -23,7 +23,7 @@ const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'dom';
const PLUGIN_NAME = 'dom';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'output.html';
const CHROME_SESSION_DIR = '../chrome';
@@ -252,10 +252,14 @@ async function main() {
}));
process.exit(0);
} else {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
// Only wait for page load if using shared Chrome session
const cdpUrl = getCdpUrl();
if (cdpUrl) {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
}
const result = await dumpDom(url);

View File

@@ -27,7 +27,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'favicon'
PLUGIN_NAME = 'favicon'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'favicon.ico'

View File

@@ -31,7 +31,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'forumdl'
PLUGIN_NAME = 'forumdl'
BIN_NAME = 'forum-dl'
BIN_PROVIDERS = 'pip,env'
OUTPUT_DIR = '.'

View File

@@ -32,7 +32,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'gallerydl'
PLUGIN_NAME = 'gallerydl'
BIN_NAME = 'gallery-dl'
BIN_PROVIDERS = 'pip,env'
OUTPUT_DIR = '.'

View File

@@ -24,7 +24,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'git'
PLUGIN_NAME = 'git'
BIN_NAME = 'git'
BIN_PROVIDERS = 'apt,brew,env'
OUTPUT_DIR = '.'

View File

@@ -21,7 +21,7 @@ const https = require('https');
const http = require('http');
// Extractor metadata
const EXTRACTOR_NAME = 'headers';
const PLUGIN_NAME = 'headers';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'headers.json';
const CHROME_SESSION_DIR = '../chrome';

View File

@@ -26,7 +26,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'htmltotext'
PLUGIN_NAME = 'htmltotext'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'htmltotext.txt'

View File

@@ -34,7 +34,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'media'
PLUGIN_NAME = 'media'
BIN_NAME = 'yt-dlp'
BIN_PROVIDERS = 'pip,apt,brew,env'
OUTPUT_DIR = '.'

View File

@@ -25,7 +25,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'mercury'
PLUGIN_NAME = 'mercury'
BIN_NAME = 'postlight-parser'
BIN_PROVIDERS = 'npm,env'
OUTPUT_DIR = '.'

View File

@@ -28,7 +28,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'papersdl'
PLUGIN_NAME = 'papersdl'
BIN_NAME = 'papers-dl'
BIN_PROVIDERS = 'pip,env'
OUTPUT_DIR = '.'

View File

@@ -23,7 +23,7 @@ const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'parse_dom_outlinks';
const PLUGIN_NAME = 'parse_dom_outlinks';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'outlinks.json';
const URLS_FILE = 'urls.jsonl'; // For crawl system
@@ -190,7 +190,7 @@ async function extractOutlinks(url) {
const urlsJsonl = crawlableUrls.map(href => JSON.stringify({
type: 'Snapshot',
url: href,
via_extractor: EXTRACTOR_NAME,
plugin: PLUGIN_NAME,
})).join('\n');
if (urlsJsonl) {
@@ -236,10 +236,14 @@ async function main() {
process.exit(0);
}
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
// Check if Chrome session exists, then wait for page load
const cdpUrl = getCdpUrl();
if (cdpUrl) {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
}
const result = await extractOutlinks(url);

View File

@@ -28,7 +28,7 @@ from urllib.parse import urljoin, urlparse
import rich_click as click
EXTRACTOR_NAME = 'parse_html_urls'
PLUGIN_NAME = 'parse_html_urls'
# Check if parse_dom_outlinks extractor already ran
DOM_OUTLINKS_URLS_FILE = Path('parse_dom_outlinks/urls.jsonl')
@@ -179,7 +179,7 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
record = {
'type': 'Snapshot',
'url': found_url,
'via_extractor': EXTRACTOR_NAME,
'plugin': PLUGIN_NAME,
'depth': depth + 1,
}
if snapshot_id:

View File

@@ -233,7 +233,7 @@ class TestParseHtmlUrls:
entry = json.loads(output_file.read_text().strip())
assert entry['url'] == 'https://example.com'
assert 'type' in entry
assert 'via_extractor' in entry
assert 'plugin' in entry
if __name__ == '__main__':

View File

@@ -24,7 +24,7 @@ from urllib.parse import urlparse
import rich_click as click
EXTRACTOR_NAME = 'parse_jsonl_urls'
PLUGIN_NAME = 'parse_jsonl_urls'
def parse_bookmarked_at(link: dict) -> str | None:
@@ -75,7 +75,7 @@ def json_object_to_entry(link: dict) -> dict | None:
entry = {
'type': 'Snapshot',
'url': unescape(url),
'via_extractor': EXTRACTOR_NAME,
'plugin': PLUGIN_NAME,
}
# Parse title

View File

@@ -265,7 +265,7 @@ class TestParseJsonlUrls:
entry = json.loads(output_file.read_text().strip())
assert entry['url'] == 'https://example.com'
assert 'type' in entry
assert 'via_extractor' in entry
assert 'plugin' in entry
if __name__ == '__main__':

View File

@@ -22,7 +22,7 @@ from urllib.parse import urlparse
import rich_click as click
EXTRACTOR_NAME = 'parse_netscape_urls'
PLUGIN_NAME = 'parse_netscape_urls'
# Constants for timestamp epoch detection
UNIX_EPOCH = 0 # 1970-01-01 00:00:00 UTC
@@ -187,7 +187,7 @@ def main(url: str, snapshot_id: str = None):
entry = {
'type': 'Snapshot',
'url': unescape(bookmark_url),
'via_extractor': EXTRACTOR_NAME,
'plugin': PLUGIN_NAME,
}
if title:
entry['title'] = unescape(title)

View File

@@ -23,7 +23,7 @@ from urllib.parse import urlparse
import rich_click as click
EXTRACTOR_NAME = 'parse_rss_urls'
PLUGIN_NAME = 'parse_rss_urls'
try:
import feedparser
@@ -107,7 +107,7 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
entry = {
'type': 'Snapshot',
'url': unescape(item_url),
'via_extractor': EXTRACTOR_NAME,
'plugin': PLUGIN_NAME,
'depth': depth + 1,
}
if snapshot_id:

View File

@@ -47,7 +47,7 @@ class TestRssVariants:
assert entry['url'] == 'https://example.com/article1'
assert entry['title'] == 'RSS 0.91 Article'
assert entry['via_extractor'] == 'parse_rss_urls'
assert entry['plugin'] == 'parse_rss_urls'
def test_rss_10_rdf(self, tmp_path):
"""Test RSS 1.0 (RDF) format."""

View File

@@ -25,7 +25,7 @@ from urllib.request import urlopen
import rich_click as click
EXTRACTOR_NAME = 'parse_txt_urls'
PLUGIN_NAME = 'parse_txt_urls'
# URL regex from archivebox/misc/util.py
# https://mathiasbynens.be/demo/url-regex
@@ -127,7 +127,7 @@ def main(url: str, snapshot_id: str = None):
f.write(json.dumps({
'type': 'Snapshot',
'url': found_url,
'via_extractor': EXTRACTOR_NAME,
'plugin': PLUGIN_NAME,
}) + '\n')
click.echo(f'Found {len(urls_found)} URLs')

View File

@@ -186,7 +186,7 @@ https://other.com
entry = json.loads(output_file.read_text().strip())
assert entry['url'] == 'https://example.com'
assert 'type' in entry
assert 'via_extractor' in entry
assert 'plugin' in entry
if __name__ == '__main__':

View File

@@ -22,7 +22,7 @@ const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'pdf';
const PLUGIN_NAME = 'pdf';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'output.pdf';
const CHROME_SESSION_DIR = '../chrome';
@@ -254,10 +254,14 @@ async function main() {
}));
process.exit(0); // Permanent skip - staticfile already handled
} else {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
// Only wait for page load if using shared Chrome session
const cdpUrl = getCdpUrl();
if (cdpUrl) {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
}
const result = await printToPdf(url);

View File

@@ -1,11 +1,11 @@
#!/usr/bin/env python3
"""
Shared utilities for extractor hooks.
Shared utilities for extractor plugin hooks.
This module provides common functionality for all extractors to ensure
This module provides common functionality for all extractor plugins to ensure
consistent behavior, output format, error handling, and timing.
All extractors should:
All extractor plugins should:
1. Import and use these utilities
2. Output consistent metadata (CMD, VERSION, OUTPUT, timing)
3. Write all files to $PWD
@@ -35,7 +35,7 @@ STATIC_EXTENSIONS = (
def is_static_file(url: str) -> bool:
"""Check if URL points to a static file that may not need browser extraction."""
"""Check if URL points to a static file that may not need browser-based extractor plugins."""
return url.lower().split('?')[0].split('#')[0].endswith(STATIC_EXTENSIONS)
@@ -96,7 +96,7 @@ def get_version(binary: str, version_args: list[str] | None = None) -> str:
class ExtractorResult:
"""
Tracks extractor execution and produces consistent output.
Tracks extractor plugin execution and produces consistent output.
Usage:
result = ExtractorResult(name='wget', url=url)
@@ -152,7 +152,7 @@ class ExtractorResult:
return 1
def finish(self, status: str | None = None):
"""Mark extraction as finished and print results."""
"""Mark extractor plugin execution as finished and print results."""
self.end_ts = datetime.now(timezone.utc)
if status:
self.status = status

View File

@@ -27,7 +27,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'readability'
PLUGIN_NAME = 'readability'
BIN_NAME = 'readability-extractor'
BIN_PROVIDERS = 'npm,env'
OUTPUT_DIR = '.'

View File

@@ -14,7 +14,7 @@ const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
const EXTRACTOR_NAME = 'redirects';
const PLUGIN_NAME = 'redirects';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'redirects.jsonl';
const PID_FILE = 'hook.pid';
@@ -235,7 +235,7 @@ function handleShutdown(signal) {
type: 'ArchiveResult',
status: 'succeeded',
output_str: OUTPUT_FILE,
extractor: EXTRACTOR_NAME,
plugin: PLUGIN_NAME,
original_url: originalUrl,
final_url: finalUrl || originalUrl,
redirect_count: redirectChain.length,

View File

@@ -15,7 +15,7 @@ const path = require('path');
const crypto = require('crypto');
const puppeteer = require('puppeteer-core');
const EXTRACTOR_NAME = 'responses';
const PLUGIN_NAME = 'responses';
const OUTPUT_DIR = '.';
const PID_FILE = 'hook.pid';
const CHROME_SESSION_DIR = '../chrome';

View File

@@ -22,7 +22,7 @@ const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'screenshot';
const PLUGIN_NAME = 'screenshot';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'screenshot.png';
const CHROME_SESSION_DIR = '../chrome';
@@ -250,10 +250,14 @@ async function main() {
}));
process.exit(0); // Permanent skip - staticfile already handled
} else {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
// Only wait for page load if using shared Chrome session
const cdpUrl = getCdpUrl();
if (cdpUrl) {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
}
const result = await takeScreenshot(url);

View File

@@ -27,7 +27,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'index_sonic'
PLUGIN_NAME = 'index_sonic'
OUTPUT_DIR = '.'
# Text file patterns to index
@@ -83,14 +83,14 @@ def find_indexable_content() -> list[tuple[str, str]]:
cwd = Path.cwd()
for extractor, file_pattern in INDEXABLE_FILES:
extractor_dir = cwd / extractor
if not extractor_dir.exists():
plugin_dir = cwd / extractor
if not plugin_dir.exists():
continue
if '*' in file_pattern:
matches = list(extractor_dir.glob(file_pattern))
matches = list(plugin_dir.glob(file_pattern))
else:
match = extractor_dir / file_pattern
match = plugin_dir / file_pattern
matches = [match] if match.exists() else []
for match in matches:

View File

@@ -25,7 +25,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'index_sqlite'
PLUGIN_NAME = 'index_sqlite'
OUTPUT_DIR = '.'
# Text file patterns to index, in priority order
@@ -74,14 +74,14 @@ def find_indexable_content() -> list[tuple[str, str]]:
cwd = Path.cwd()
for extractor, file_pattern in INDEXABLE_FILES:
extractor_dir = cwd / extractor
if not extractor_dir.exists():
plugin_dir = cwd / extractor
if not plugin_dir.exists():
continue
if '*' in file_pattern:
matches = list(extractor_dir.glob(file_pattern))
matches = list(plugin_dir.glob(file_pattern))
else:
match = extractor_dir / file_pattern
match = plugin_dir / file_pattern
matches = [match] if match.exists() else []
for match in matches:

View File

@@ -20,7 +20,7 @@ const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'seo';
const PLUGIN_NAME = 'seo';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'seo.json';
const CHROME_SESSION_DIR = '../chrome';
@@ -177,10 +177,14 @@ async function main() {
process.exit(0);
}
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
// Check if Chrome session exists, then wait for page load
const cdpUrl = getCdpUrl();
if (cdpUrl) {
// Wait for page to be fully loaded
const pageLoaded = await waitForChromeTabLoaded(60000);
if (!pageLoaded) {
throw new Error('Page not loaded after 60s (chrome_navigate must complete first)');
}
}
const result = await extractSeo(url);

View File

@@ -36,7 +36,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'singlefile'
PLUGIN_NAME = 'singlefile'
BIN_NAME = 'single-file'
BIN_PROVIDERS = 'npm,env'
OUTPUT_DIR = '.'

View File

@@ -14,7 +14,7 @@ const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
const EXTRACTOR_NAME = 'ssl';
const PLUGIN_NAME = 'ssl';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'ssl.jsonl';
const PID_FILE = 'hook.pid';

View File

@@ -14,7 +14,7 @@ const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
const EXTRACTOR_NAME = 'staticfile';
const PLUGIN_NAME = 'staticfile';
const OUTPUT_DIR = '.';
const PID_FILE = 'hook.pid';
const CHROME_SESSION_DIR = '../chrome';
@@ -326,7 +326,7 @@ function handleShutdown(signal) {
type: 'ArchiveResult',
status: 'skipped',
output_str: 'No Content-Type detected',
extractor: EXTRACTOR_NAME,
plugin: PLUGIN_NAME,
};
} else if (!isStaticFile) {
// Not a static file (normal case for HTML pages)
@@ -334,7 +334,7 @@ function handleShutdown(signal) {
type: 'ArchiveResult',
status: 'skipped',
output_str: `Not a static file (Content-Type: ${detectedContentType})`,
extractor: EXTRACTOR_NAME,
plugin: PLUGIN_NAME,
content_type: detectedContentType,
};
} else if (downloadError) {
@@ -343,7 +343,7 @@ function handleShutdown(signal) {
type: 'ArchiveResult',
status: 'failed',
output_str: downloadError,
extractor: EXTRACTOR_NAME,
plugin: PLUGIN_NAME,
content_type: detectedContentType,
};
} else if (downloadedFilePath) {
@@ -352,7 +352,7 @@ function handleShutdown(signal) {
type: 'ArchiveResult',
status: 'succeeded',
output_str: downloadedFilePath,
extractor: EXTRACTOR_NAME,
plugin: PLUGIN_NAME,
content_type: detectedContentType,
};
} else {
@@ -361,7 +361,7 @@ function handleShutdown(signal) {
type: 'ArchiveResult',
status: 'failed',
output_str: 'Static file detected but download did not complete',
extractor: EXTRACTOR_NAME,
plugin: PLUGIN_NAME,
content_type: detectedContentType,
};
}

View File

@@ -20,7 +20,7 @@ const https = require('https');
const http = require('http');
// Extractor metadata
const EXTRACTOR_NAME = 'title';
const PLUGIN_NAME = 'title';
const OUTPUT_DIR = '.';
const OUTPUT_FILE = 'title.txt';
const CHROME_SESSION_DIR = '../chrome';

View File

@@ -39,7 +39,7 @@ import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'wget'
PLUGIN_NAME = 'wget'
BIN_NAME = 'wget'
BIN_PROVIDERS = 'apt,brew,env'
OUTPUT_DIR = '.'