wip major changes

This commit is contained in:
Nick Sweeting
2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions

View File

@@ -0,0 +1,266 @@
#!/usr/bin/env node
/**
* Extract accessibility tree and page outline from a URL.
*
* Extracts:
* - Page outline (headings h1-h6, sections, articles)
* - Iframe tree
* - Accessibility snapshot
* - ARIA labels and roles
*
* Usage: on_Snapshot__18_accessibility.js --url=<url> --snapshot-id=<uuid>
* Output: Writes accessibility/accessibility.json
*
* Environment variables:
* SAVE_ACCESSIBILITY: Enable accessibility extraction (default: true)
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'accessibility';      // name reported in RESULT_JSON
const OUTPUT_DIR = 'accessibility';          // output subdirectory (relative to cwd)
const OUTPUT_FILE = 'accessibility.json';    // file written inside OUTPUT_DIR
const CHROME_SESSION_DIR = 'chrome_session'; // where the chrome_session extractor stores cdp_url.txt
// Parse `--key=value` command line arguments into a plain object.
// Dashes in key names become underscores; a bare `--flag` (no `=value`,
// or an empty value) is stored as boolean true.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const eqIdx = token.indexOf('=');
    const key = (eqIdx === -1 ? token.slice(2) : token.slice(2, eqIdx)).replace(/-/g, '_');
    const value = eqIdx === -1 ? '' : token.slice(eqIdx + 1);
    parsed[key] = value || true;
  }
  return parsed;
}
// Read an environment variable with a fallback; result is whitespace-trimmed.
// Unset OR empty-string values fall back to defaultValue (|| semantics).
function getEnv(name, defaultValue = '') {
  const value = process.env[name] || defaultValue;
  return value.trim();
}
// Interpret an environment variable as a boolean flag.
// Accepts true/1/yes/on and false/0/no/off (case-insensitive, whitespace
// ignored); anything else — including an unset variable — yields defaultValue.
function getEnvBool(name, defaultValue = false) {
  const normalized = (process.env[name] || '').trim().toLowerCase();
  const TRUTHY = ['true', '1', 'yes', 'on'];
  const FALSY = ['false', '0', 'no', 'off'];
  if (TRUTHY.includes(normalized)) return true;
  if (FALSY.includes(normalized)) return false;
  return defaultValue;
}
// Get CDP URL from chrome_session.
// Returns the trimmed DevTools websocket URL written by the chrome_session
// extractor, or null when the file is missing (i.e. that extractor hasn't run).
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (fs.existsSync(cdpFile)) {
    return fs.readFileSync(cdpFile, 'utf8').trim();
  }
  return null;
}
// Extract accessibility info
/**
 * Extract the accessibility tree, a heading/landmark outline, and the iframe
 * tree from the page currently open in the shared Chrome session, and write
 * the combined result to accessibility/accessibility.json.
 *
 * @param {string} url - URL being archived (recorded in the output JSON).
 * @returns {Promise<{success: boolean, output?: string, accessibilityData?: object, error?: string}>}
 */
async function extractAccessibility(url) {
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  let browser = null;
  try {
    // Connect to existing Chrome session
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
    }
    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });
    // Get the page (prefer the first http(s) tab, fall back to any open page)
    const pages = await browser.pages();
    const page = pages.find(p => p.url().startsWith('http')) || pages[0];
    if (!page) {
      return { success: false, error: 'No page found in Chrome session' };
    }
    // Get accessibility snapshot (interestingOnly filters out nodes with no a11y relevance)
    const accessibilityTree = await page.accessibility.snapshot({ interestingOnly: true });
    // Extract page outline (headings, sections, etc.) — this callback runs inside the page
    const outline = await page.evaluate(() => {
      const headings = [];
      const elements = document.querySelectorAll(
        'h1, h2, h3, h4, h5, h6, a[name], header, footer, article, main, aside, nav, section, figure, summary, table, form, iframe'
      );
      elements.forEach(elem => {
        // Skip unnamed anchors
        if (elem.tagName.toLowerCase() === 'a' && !elem.name) return;
        const tagName = elem.tagName.toLowerCase();
        // Best available identifier: id, name, aria-label, or role (in that order)
        const elemId = elem.id || elem.name || elem.getAttribute('aria-label') || elem.role || '';
        const elemClasses = (elem.className || '').toString().trim().split(/\s+/).slice(0, 3).join(' .');
        const action = elem.action?.split('/').pop() || '';
        // First 128 chars of visible text as a summary
        let summary = (elem.innerText || '').slice(0, 128);
        if (summary.length >= 128) summary += '...';
        let prefix = '';
        let title = '';
        // Format headings with # prefix (markdown-style, one # per heading level)
        const level = parseInt(tagName.replace('h', ''));
        if (!isNaN(level)) {
          prefix = '#'.repeat(level);
          title = elem.innerText || elemId || elemClasses;
        } else {
          // For other elements, create a breadcrumb path of up to 5 ancestors,
          // blanking out generic wrappers (div/span/p/body/html)
          const parents = [tagName];
          let node = elem.parentNode;
          while (node && parents.length < 5) {
            if (node.tagName) {
              const tag = node.tagName.toLowerCase();
              if (!['div', 'span', 'p', 'body', 'html'].includes(tag)) {
                parents.unshift(tag);
              } else {
                parents.unshift('');
              }
            }
            node = node.parentNode;
          }
          prefix = parents.join('>');
          title = elemId ? `#${elemId}` : '';
          if (!title && elemClasses) title = `.${elemClasses}`;
          if (action) title += ` /${action}`;
          if (summary && !title.includes(summary)) title += `: ${summary}`;
        }
        // Clean up title (collapse whitespace runs)
        title = title.replace(/\s+/g, ' ').trim();
        if (prefix) {
          headings.push(`${prefix} ${title}`);
        }
      });
      return headings;
    });
    // Get iframe tree (nesting depth encoded by the number of '>' prefixes)
    const iframes = [];
    function dumpFrameTree(frame, indent = '>') {
      iframes.push(indent + frame.url());
      for (const child of frame.childFrames()) {
        dumpFrameTree(child, indent + '>');
      }
    }
    dumpFrameTree(page.mainFrame(), '');
    const accessibilityData = {
      url,
      headings: outline,
      iframes,
      tree: accessibilityTree,
    };
    // Write output
    fs.writeFileSync(outputPath, JSON.stringify(accessibilityData, null, 2));
    return { success: true, output: outputPath, accessibilityData };
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Disconnect (not close): the Chrome session is shared with other extractors
    if (browser) {
      browser.disconnect();
    }
  }
}
/**
 * CLI entry point: parse --url/--snapshot-id, honor SAVE_ACCESSIBILITY,
 * run the extraction, and print the START_TS/END_TS/DURATION/OUTPUT/STATUS/
 * RESULT_JSON lines the orchestrator parses from stdout.
 * Exits 0 on success or skip, 1 on failure.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__18_accessibility.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  try {
    // Check if enabled (defaults to on)
    if (!getEnvBool('SAVE_ACCESSIBILITY', true)) {
      console.log('Skipping accessibility (SAVE_ACCESSIBILITY=False)');
      status = 'skipped';
      // Skipped runs report a minimal result and exit 0 immediately
      const endTs = new Date();
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${endTs.toISOString()}`);
      console.log(`STATUS=${status}`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
      process.exit(0);
    }
    const result = await extractAccessibility(url);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      const headingCount = result.accessibilityData.headings.length;
      const iframeCount = result.accessibilityData.iframes.length;
      console.log(`Accessibility extracted: ${headingCount} headings, ${iframeCount} iframes`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results (orchestrator parses these KEY=VALUE lines)
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' ? 0 : 1);
}

// Top-level invocation; any rejection not handled inside main() exits 1.
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,77 @@
#!/usr/bin/env python3
"""
Install a binary using apt package manager.
Usage: on_Dependency__install_using_apt_provider.py --dependency-id=<uuid> --bin-name=<name> [--custom-cmd=<cmd>]
Output: InstalledBinary JSONL record to stdout after installation
Environment variables:
MACHINE_ID: Machine UUID (set by orchestrator)
"""
import json
import os
import sys
import rich_click as click
from abx_pkg import Binary, AptProvider, BinProviderOverrides
# Fix pydantic forward reference issue
AptProvider.model_rebuild()

@click.command()
@click.option('--dependency-id', required=True, help="Dependency UUID")
@click.option('--bin-name', required=True, help="Binary name to install")
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', default=None, help="Custom install command (overrides default)")
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None):
    """Install binary using apt package manager.

    Emits a single InstalledBinary JSONL record on stdout (machine-parsed by
    the orchestrator); human-readable progress goes to stderr.
    Exit codes: 0 = installed, or apt not in the allowed provider list (a
    skip, not an error); 1 = apt unavailable or the install failed.

    NOTE(review): --custom-cmd is accepted but never referenced below —
    confirm whether a custom command should override the default apt install.
    """
    # Check if apt provider is allowed for this dependency
    if bin_providers != '*' and 'apt' not in bin_providers.split(','):
        click.echo(f"apt provider not allowed for {bin_name}", err=True)
        sys.exit(0)  # Not an error, just skip
    # Use abx-pkg AptProvider to install binary
    provider = AptProvider()
    if not provider.INSTALLER_BIN:
        click.echo("apt not available on this system", err=True)
        sys.exit(1)
    click.echo(f"Installing {bin_name} via apt...", err=True)
    try:
        binary = Binary(name=bin_name, binproviders=[provider]).install()
    except Exception as e:
        click.echo(f"apt install failed: {e}", err=True)
        sys.exit(1)
    if not binary.abspath:
        click.echo(f"{bin_name} not found after apt install", err=True)
        sys.exit(1)
    machine_id = os.environ.get('MACHINE_ID', '')
    # Output InstalledBinary JSONL record to stdout
    record = {
        'type': 'InstalledBinary',
        'name': bin_name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'apt',
        'machine_id': machine_id,
        'dependency_id': dependency_id,
    }
    print(json.dumps(record))
    # Log human-readable info to stderr
    click.echo(f"Installed {bin_name} at {binary.abspath}", err=True)
    click.echo(f" version: {binary.version}", err=True)
    sys.exit(0)

if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,26 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_ARCHIVE_DOT_ORG": {
"type": "boolean",
"default": true,
"x-aliases": ["SUBMIT_ARCHIVE_DOT_ORG"],
"description": "Submit URLs to archive.org Wayback Machine"
},
"ARCHIVE_ORG_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 10,
"x-fallback": "TIMEOUT",
"description": "Timeout for archive.org submission in seconds"
},
"ARCHIVE_ORG_USER_AGENT": {
"type": "string",
"default": "",
"x-fallback": "USER_AGENT",
"description": "User agent string"
}
}
}

View File

@@ -0,0 +1,156 @@
#!/usr/bin/env python3
"""
Submit a URL to archive.org for archiving.
Usage: on_Snapshot__archive_org.py --url=<url> --snapshot-id=<uuid>
Output: Writes archive.org.txt to $PWD with the archived URL
Environment variables:
TIMEOUT: Timeout in seconds (default: 60)
USER_AGENT: User agent string
Note: This extractor uses the 'requests' library which is bundled with ArchiveBox.
It can run standalone if requests is installed: pip install requests
"""
import json
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'archive_org'    # name reported in RESULT_JSON
# NOTE(review): OUTPUT_DIR appears unused — submit_to_archive_org() writes
# OUTPUT_FILE directly into $PWD, not into this directory. Confirm intent.
OUTPUT_DIR = 'archive_org'
OUTPUT_FILE = 'archive.org.txt'   # file containing the resulting archive URL
def get_env(name: str, default: str = '') -> str:
    """Return environment variable ``name`` (stripped), or ``default`` when unset."""
    value = os.environ.get(name, default)
    return value.strip()
def get_env_int(name: str, default: int = 0) -> int:
    """Parse environment variable ``name`` as an int.

    Falls back to ``default`` when the variable is unset or not a valid
    integer (surrounding whitespace is tolerated).
    """
    raw = os.environ.get(name, str(default)).strip()
    try:
        return int(raw)
    except ValueError:
        return default
def submit_to_archive_org(url: str) -> tuple[bool, str | None, str]:
    """
    Submit URL to archive.org Wayback Machine via https://web.archive.org/save/.

    On success, writes the resulting archive URL (or, as a fallback, the
    submit URL itself for manual retry) to OUTPUT_FILE in the current
    directory.

    Returns: (success, output_path, error_message)
    """
    # Imported lazily so a missing 'requests' dependency produces a clean
    # error tuple instead of crashing the whole script at import time.
    try:
        import requests
    except ImportError:
        return False, None, 'requests library not installed'
    timeout = get_env_int('TIMEOUT', 60)
    user_agent = get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    submit_url = f'https://web.archive.org/save/{url}'
    try:
        response = requests.get(
            submit_url,
            timeout=timeout,
            headers={'User-Agent': user_agent},
            allow_redirects=True,
        )
        # Check for successful archive
        content_location = response.headers.get('Content-Location', '')
        x_archive_orig_url = response.headers.get('X-Archive-Orig-Url', '')  # NOTE(review): read but never used
        # Build archive URL by prefixing the host onto the Content-Location path
        if content_location:
            archive_url = f'https://web.archive.org{content_location}'
            Path(OUTPUT_FILE).write_text(archive_url, encoding='utf-8')
            return True, OUTPUT_FILE, ''
        elif 'web.archive.org' in response.url:
            # We were redirected to an archive page
            Path(OUTPUT_FILE).write_text(response.url, encoding='utf-8')
            return True, OUTPUT_FILE, ''
        else:
            # Check for errors in response
            if 'RobotAccessControlException' in response.text:
                # Blocked by robots.txt - save submit URL for manual retry
                Path(OUTPUT_FILE).write_text(submit_url, encoding='utf-8')
                return True, OUTPUT_FILE, ''  # Consider this a soft success
            elif response.status_code >= 400:
                return False, None, f'HTTP {response.status_code}'
            else:
                # Save submit URL anyway
                Path(OUTPUT_FILE).write_text(submit_url, encoding='utf-8')
                return True, OUTPUT_FILE, ''
    except requests.Timeout:
        return False, None, f'Request timed out after {timeout} seconds'
    except requests.RequestException as e:
        return False, None, f'{type(e).__name__}: {e}'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='URL to submit to archive.org')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Submit a URL to archive.org for archiving.

    Prints START_TS/END_TS/DURATION/OUTPUT/STATUS plus a RESULT_JSON line
    for the orchestrator to parse; exits 0 on success, 1 on failure.
    """
    start_ts = datetime.now(timezone.utc)
    output = None
    status = 'failed'
    error = ''
    try:
        # Run extraction (writes archive.org.txt on success)
        success, output, error = submit_to_archive_org(url)
        status = 'succeeded' if success else 'failed'
        if success:
            archive_url = Path(output).read_text().strip()
            print(f'Archived at: {archive_url}')
    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    # Print results (orchestrator parses these KEY=VALUE lines from stdout)
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)

if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,76 @@
#!/usr/bin/env python3
"""
Install a binary using Homebrew package manager.
Usage: on_Dependency__install_using_brew_provider.py --dependency-id=<uuid> --bin-name=<name> [--custom-cmd=<cmd>]
Output: InstalledBinary JSONL record to stdout after installation
Environment variables:
MACHINE_ID: Machine UUID (set by orchestrator)
"""
import json
import os
import sys
import rich_click as click
from abx_pkg import Binary, BrewProvider, BinProviderOverrides
# Fix pydantic forward reference issue
BrewProvider.model_rebuild()

@click.command()
@click.option('--dependency-id', required=True, help="Dependency UUID")
@click.option('--bin-name', required=True, help="Binary name to install")
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', default=None, help="Custom install command")
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None):
    """Install binary using Homebrew.

    Emits a single InstalledBinary JSONL record on stdout (machine-parsed by
    the orchestrator); human-readable progress goes to stderr.
    Exit codes: 0 = installed, or brew not in the allowed provider list (a
    skip, not an error); 1 = brew unavailable or the install failed.

    NOTE(review): --custom-cmd is accepted but never referenced below —
    confirm whether a custom command should override the default brew install.
    """
    # Check if brew provider is allowed for this dependency
    if bin_providers != '*' and 'brew' not in bin_providers.split(','):
        click.echo(f"brew provider not allowed for {bin_name}", err=True)
        sys.exit(0)
    # Use abx-pkg BrewProvider to install binary
    provider = BrewProvider()
    if not provider.INSTALLER_BIN:
        click.echo("brew not available on this system", err=True)
        sys.exit(1)
    click.echo(f"Installing {bin_name} via brew...", err=True)
    try:
        binary = Binary(name=bin_name, binproviders=[provider]).install()
    except Exception as e:
        click.echo(f"brew install failed: {e}", err=True)
        sys.exit(1)
    if not binary.abspath:
        click.echo(f"{bin_name} not found after brew install", err=True)
        sys.exit(1)
    machine_id = os.environ.get('MACHINE_ID', '')
    # Output InstalledBinary JSONL record to stdout
    record = {
        'type': 'InstalledBinary',
        'name': bin_name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'brew',
        'machine_id': machine_id,
        'dependency_id': dependency_id,
    }
    print(json.dumps(record))
    # Log human-readable info to stderr
    click.echo(f"Installed {bin_name} at {binary.abspath}", err=True)
    click.echo(f" version: {binary.version}", err=True)
    sys.exit(0)

if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,240 @@
#!/usr/bin/env python3
"""
Create symlinks from plugin outputs to canonical legacy locations.
This plugin runs after all extractors complete and creates symlinks from the
new plugin-based output structure to the legacy canonical output paths that
ArchiveBox has historically used. This maintains backward compatibility with
existing tools and scripts that expect outputs at specific locations.
Canonical output paths (from Snapshot.canonical_outputs()):
- favicon.ico → favicon/favicon.ico
- singlefile.html → singlefile/singlefile.html
- readability/content.html → readability/content.html
- mercury/content.html → mercury/content.html
- htmltotext.txt → htmltotext/htmltotext.txt
- output.pdf → pdf/output.pdf
- screenshot.png → screenshot/screenshot.png
- output.html → dom/output.html
- headers.json → headers/headers.json
- warc/{timestamp} → wget/warc/{timestamp}
New plugin outputs:
- ssl.json → ssl/ssl.json
- seo.json → seo/seo.json
- accessibility.json → accessibility/accessibility.json
- outlinks.json → outlinks/outlinks.json
- redirects.json → redirects/redirects.json
- console.jsonl → consolelog/console.jsonl
Usage: on_Snapshot__91_canonical_outputs.py --url=<url> --snapshot-id=<uuid>
Environment variables:
SAVE_CANONICAL_SYMLINKS: Enable canonical symlinks (default: true)
"""
__package__ = 'archivebox.plugins.canonical_outputs'
import os
import sys
from pathlib import Path
from typing import Dict, Optional
# Configure Django if running standalone: hook scripts are normally invoked
# with Django already configured, so this bootstrap only runs on direct CLI
# execution. It puts the archivebox package root on sys.path and initializes
# Django so the ORM (Snapshot model) is importable inside main().
if __name__ == '__main__':
    parent_dir = str(Path(__file__).resolve().parent.parent.parent)
    if parent_dir not in sys.path:
        sys.path.insert(0, parent_dir)
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
    import django
    django.setup()
import rich_click as click
# Mapping from canonical (legacy) output path -> plugin output path.
# Keys are the legacy paths created as symlinks; values are the actual files
# written by each plugin. Both are relative to the snapshot directory.
CANONICAL_MAPPINGS = {
    # Legacy extractors
    'favicon.ico': 'favicon/favicon.ico',
    'singlefile.html': 'singlefile/singlefile.html',
    'readability/content.html': 'readability/content.html',
    'mercury/content.html': 'mercury/content.html',
    'htmltotext.txt': 'htmltotext/htmltotext.txt',
    'output.pdf': 'pdf/output.pdf',
    'screenshot.png': 'screenshot/screenshot.png',
    'output.html': 'dom/output.html',
    'headers.json': 'headers/headers.json',
    # New plugins
    'ssl.json': 'ssl/ssl.json',
    'seo.json': 'seo/seo.json',
    'accessibility.json': 'accessibility/accessibility.json',
    # NOTE(review): the module docstring says outlinks.json -> outlinks/outlinks.json,
    # but the source directory here is parse_dom_outlinks/ — confirm which is correct.
    'outlinks.json': 'parse_dom_outlinks/outlinks.json',
    'redirects.json': 'redirects/redirects.json',
    'console.jsonl': 'consolelog/console.jsonl',
}
def create_symlink(target: Path, link: Path, relative: bool = True) -> bool:
    """
    Create a symlink at ``link`` pointing to ``target``.

    Args:
        target: The actual file/directory the symlink should point to.
        link: The symlink path to create (parent dirs created as needed).
        relative: Create a relative symlink (default) instead of absolute.

    Returns:
        True if the symlink was created or already points at ``target``,
        False when ``target`` does not exist or the link could not be made.
    """
    try:
        # Nothing to link to: the plugin never produced this output.
        if not target.exists():
            return False
        # Replace any stale file/symlink already occupying the link path.
        # (link.exists() is False for broken symlinks, hence the extra check.)
        if link.exists() or link.is_symlink():
            if link.is_symlink() and link.resolve() == target.resolve():
                # Already correctly symlinked
                return True
            link.unlink()
        # Create parent directory
        link.parent.mkdir(parents=True, exist_ok=True)
        if relative:
            # Relative links keep working if the snapshot dir is moved wholesale
            link.symlink_to(os.path.relpath(target, link.parent))
        else:
            link.symlink_to(target)
        return True
    except OSError:
        # FileNotFoundError and PermissionError are OSError subclasses, so a
        # single except clause covers them all; any filesystem failure means
        # we simply skip this symlink.
        return False
def create_canonical_symlinks(snapshot_dir: Path) -> Dict[str, bool]:
    """
    Create every canonical symlink for one snapshot directory.

    Args:
        snapshot_dir: The snapshot directory (e.g. archive/<timestamp>/).

    Returns:
        Dict mapping each canonical path to whether its symlink succeeded.
    """
    results = {
        canonical: create_symlink(snapshot_dir / plugin_path, snapshot_dir / canonical, relative=True)
        for canonical, plugin_path in CANONICAL_MAPPINGS.items()
    }
    # wget writes WARCs under wget/warc/, but the legacy layout expects a
    # warc/ directory at the snapshot root — symlink the whole directory.
    wget_warc_dir = snapshot_dir / 'wget' / 'warc'
    if wget_warc_dir.exists():
        results['warc/'] = create_symlink(wget_warc_dir, snapshot_dir / 'warc', relative=True)
    return results
@click.command()
@click.option('--url', required=True, help='URL being archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Create symlinks from plugin outputs to canonical legacy locations.

    Looks up the Snapshot by UUID, then symlinks each plugin output to the
    legacy canonical path (see CANONICAL_MAPPINGS). Prints START_TS/END_TS/
    DURATION/OUTPUT/STATUS/RESULT_JSON lines for the orchestrator and exits
    0 on success or skip, 1 on failure.
    """
    from datetime import datetime
    from archivebox.core.models import Snapshot
    start_ts = datetime.now()
    status = 'failed'
    output = None
    error = ''
    symlinks_created = 0
    try:
        # Check if enabled (truthy values: true/1/yes/on, case-insensitive)
        from archivebox.config import CONSTANTS
        save_canonical = os.getenv('SAVE_CANONICAL_SYMLINKS', 'true').lower() in ('true', '1', 'yes', 'on')
        if not save_canonical:
            click.echo('Skipping canonical symlinks (SAVE_CANONICAL_SYMLINKS=False)')
            status = 'skipped'
            end_ts = datetime.now()
            click.echo(f'START_TS={start_ts.isoformat()}')
            click.echo(f'END_TS={end_ts.isoformat()}')
            click.echo(f'STATUS={status}')
            # NOTE(review): this RESULT_JSON is hand-built with an f-string, so a url
            # containing a double quote would emit invalid JSON — consider json.dumps.
            click.echo(f'RESULT_JSON={{"extractor": "canonical_outputs", "status": "{status}", "url": "{url}", "snapshot_id": "{snapshot_id}"}}')
            sys.exit(0)
        # Get snapshot
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
        except Snapshot.DoesNotExist:
            error = f'Snapshot {snapshot_id} not found'
            raise ValueError(error)
        # Get snapshot directory
        snapshot_dir = Path(snapshot.output_dir)
        if not snapshot_dir.exists():
            error = f'Snapshot directory not found: {snapshot_dir}'
            raise FileNotFoundError(error)
        # Create canonical symlinks
        results = create_canonical_symlinks(snapshot_dir)
        # Count successful symlinks
        symlinks_created = sum(1 for success in results.values() if success)
        total_mappings = len(results)
        status = 'succeeded'
        output = str(snapshot_dir)
        click.echo(f'Created {symlinks_created}/{total_mappings} canonical symlinks')
    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
        click.echo(f'Error: {error}', err=True)
    end_ts = datetime.now()
    duration = (end_ts - start_ts).total_seconds()
    # Print results (orchestrator parses these KEY=VALUE lines from stdout)
    click.echo(f'START_TS={start_ts.isoformat()}')
    click.echo(f'END_TS={end_ts.isoformat()}')
    click.echo(f'DURATION={duration:.2f}')
    if output:
        click.echo(f'OUTPUT={output}')
    click.echo(f'STATUS={status}')
    if error:
        click.echo(f'ERROR={error}', err=True)
    # Print JSON result
    import json
    result_json = {
        'extractor': 'canonical_outputs',
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'symlinks_created': symlinks_created,
        'error': error or None,
    }
    click.echo(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)

if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,121 @@
#!/usr/bin/env node
/**
* 2Captcha Extension Plugin
*
* Installs and configures the 2captcha Chrome extension for automatic
* CAPTCHA solving during page archiving.
*
* Extension: https://chromewebstore.google.com/detail/ifibfemgeogfhoebkmokieepdoobkbpo
* Documentation: https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer
*
* Priority: 01 (early) - Must install before Chrome session starts
* Hook: on_Snapshot
*
* Requirements:
* - API_KEY_2CAPTCHA environment variable must be set
* - Extension will automatically solve reCAPTCHA, hCaptcha, Cloudflare Turnstile, etc.
*/
const path = require('path');
const fs = require('fs');
// Import extension utilities
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
// Extension metadata
const EXTENSION = {
  webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo', // 2captcha solver on the Chrome Web Store
  name: 'captcha2',
};
// Get extensions directory from environment or use default
// (data/personas/<ACTIVE_PERSONA>/chrome_extensions unless CHROME_EXTENSIONS_DIR overrides it)
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
  path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
/**
 * Install and configure the 2captcha extension.
 *
 * Delegates the download/unpack to chrome_extension_utils, then warns
 * (without failing) when API_KEY_2CAPTCHA is unset or left at placeholder.
 *
 * @returns {Promise<object|null>} extension metadata, or null on install failure
 */
async function installCaptchaExtension() {
  console.log('[*] Installing 2captcha extension...');
  // Install the extension into EXTENSIONS_DIR
  const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
  if (!extension) {
    console.error('[❌] Failed to install 2captcha extension');
    return null;
  }
  // Check if API key is configured (the placeholder value counts as unset)
  const apiKey = process.env.API_KEY_2CAPTCHA;
  if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
    console.warn('[⚠️] 2captcha extension installed but API_KEY_2CAPTCHA not configured');
    console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
  } else {
    console.log('[+] 2captcha extension installed and API key configured');
  }
  return extension;
}
/**
 * Note: 2captcha configuration is now handled by chrome_session plugin
 * during first-time browser setup to avoid repeated configuration on every snapshot.
 * The API key is injected via chrome.storage API once per browser session.
 */
/**
 * Main entry point - install the extension (or reuse a cached install) before
 * archiving, and write its metadata where chrome_session can read it.
 *
 * @returns {Promise<object|null>} extension metadata, or null on failure
 */
async function main() {
  // Check if extension is already cached
  const cacheFile = path.join(EXTENSIONS_DIR, 'captcha2.extension.json');
  if (fs.existsSync(cacheFile)) {
    try {
      const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
      const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
      // Only trust the cache if the unpacked extension is still on disk
      if (fs.existsSync(manifestPath)) {
        console.log('[*] 2captcha extension already installed (using cache)');
        return cached;
      }
    } catch (e) {
      // Cache file corrupted, re-install
      console.warn('[⚠️] Extension cache corrupted, re-installing...');
    }
  }
  // Install extension
  const extension = await installCaptchaExtension();
  // Export extension metadata for chrome_session to load
  if (extension) {
    // Write extension info to a cache file that chrome_session can read
    await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
    await fs.promises.writeFile(
      cacheFile,
      JSON.stringify(extension, null, 2)
    );
    console.log(`[+] Extension metadata written to ${cacheFile}`);
  }
  return extension;
}
// Export functions for use by other plugins
module.exports = {
  EXTENSION,
  installCaptchaExtension,
};
// Run if executed directly.
// NOTE(review): main() resolves (with null) even when the install fails, so a
// direct run exits 0 in that case — only thrown/rejected errors exit 1.
if (require.main === module) {
  main().then(() => {
    console.log('[✓] 2captcha extension setup complete');
    process.exit(0);
  }).catch(err => {
    console.error('[❌] 2captcha extension setup failed:', err);
    process.exit(1);
  });
}

View File

@@ -0,0 +1,284 @@
#!/usr/bin/env node
/**
* 2Captcha Extension Configuration
*
* Configures the 2captcha extension with API key after Chrome session starts.
* Runs once per browser session to inject API key into extension storage.
*
* Priority: 21 (after chrome_session at 20, before navigation at 30)
* Hook: on_Snapshot
*
* Requirements:
* - API_KEY_2CAPTCHA environment variable must be set
* - chrome_session must have loaded extensions (extensions.json must exist)
*/
const path = require('path');
const fs = require('fs');
const puppeteer = require('puppeteer-core');
// Paths shared with the chrome_session plugin (relative to $PWD).
const OUTPUT_DIR = 'chrome_session';
// Marker file whose presence means the API key was already injected this session.
const CONFIG_MARKER = path.join(OUTPUT_DIR, '.captcha2_configured');
// Look up an environment variable; unset or empty values fall back to
// defaultValue. The returned string is always whitespace-trimmed.
function getEnv(name, defaultValue = '') {
  return String(process.env[name] || defaultValue).trim();
}
// Parse command line arguments (`--key=value` / bare `--flag`) into an
// object keyed by snake_cased names; valueless flags become boolean true.
function parseArgs() {
  const result = {};
  process.argv.slice(2).filter(a => a.startsWith('--')).forEach(a => {
    const [name, ...rest] = a.slice(2).split('=');
    const value = rest.join('=');
    result[name.replace(/-/g, '_')] = value === '' ? true : value;
  });
  return result;
}
/**
 * Inject the 2captcha API key into the extension loaded in the shared Chrome
 * session, trying (1) the extension's background page/service-worker storage,
 * then (2) its options.html page. A marker file (CONFIG_MARKER) prevents
 * repeating the configuration for the same browser session.
 *
 * @returns {Promise<{success: boolean, skipped?: boolean, method?: string, error?: string}>}
 */
async function configure2Captcha() {
  // Check if already configured in this session
  if (fs.existsSync(CONFIG_MARKER)) {
    console.log('[*] 2captcha already configured in this browser session');
    return { success: true, skipped: true };
  }
  // Check if API key is set (the placeholder value counts as unset)
  const apiKey = getEnv('API_KEY_2CAPTCHA');
  if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
    console.warn('[⚠️] 2captcha extension loaded but API_KEY_2CAPTCHA not configured');
    console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
    return { success: false, error: 'API_KEY_2CAPTCHA not configured' };
  }
  // Load extensions metadata written by the chrome_session plugin
  const extensionsFile = path.join(OUTPUT_DIR, 'extensions.json');
  if (!fs.existsSync(extensionsFile)) {
    return { success: false, error: 'extensions.json not found - chrome_session must run first' };
  }
  const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8'));
  const captchaExt = extensions.find(ext => ext.name === 'captcha2');
  if (!captchaExt) {
    console.log('[*] 2captcha extension not installed, skipping configuration');
    return { success: true, skipped: true };
  }
  console.log('[*] Configuring 2captcha extension with API key...');
  try {
    // Connect to the existing Chrome session via CDP
    const cdpFile = path.join(OUTPUT_DIR, 'cdp_url.txt');
    if (!fs.existsSync(cdpFile)) {
      return { success: false, error: 'CDP URL not found - chrome_session must run first' };
    }
    const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim();
    const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
    try {
      // Method 1: Try to inject via extension background page
      if (captchaExt.target && captchaExt.target_ctx) {
        console.log('[*] Attempting to configure via extension background page...');
        // Reconnect to the browser to get fresh target context
        const targets = await browser.targets();
        const extTarget = targets.find(t =>
          t.url().startsWith(`chrome-extension://${captchaExt.id}`)
        );
        if (extTarget) {
          // worker() resolves for service-worker targets, page() for background pages
          const extContext = await extTarget.worker() || await extTarget.page();
          if (extContext) {
            await extContext.evaluate((key) => {
              // Try all common storage patterns — the exact key name the
              // extension reads varies between versions, so set them all
              if (typeof chrome !== 'undefined' && chrome.storage) {
                chrome.storage.local.set({
                  apiKey: key,
                  api_key: key,
                  '2captcha_apikey': key,
                  apikey: key,
                  'solver-api-key': key,
                });
                chrome.storage.sync.set({
                  apiKey: key,
                  api_key: key,
                  '2captcha_apikey': key,
                  apikey: key,
                  'solver-api-key': key,
                });
              }
              // Also try localStorage as fallback
              if (typeof localStorage !== 'undefined') {
                localStorage.setItem('apiKey', key);
                localStorage.setItem('2captcha_apikey', key);
                localStorage.setItem('solver-api-key', key);
              }
            }, apiKey);
            console.log('[+] 2captcha API key configured successfully via background page');
            // Mark as configured
            fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());
            return { success: true, method: 'background_page' };
          }
        }
      }
      // Method 2: Try to configure via options page
      console.log('[*] Attempting to configure via options page...');
      const optionsUrl = `chrome-extension://${captchaExt.id}/options.html`;
      const configPage = await browser.newPage();
      try {
        await configPage.goto(optionsUrl, { waitUntil: 'networkidle0', timeout: 10000 });
        const configured = await configPage.evaluate((key) => {
          // Try to find API key input field
          const selectors = [
            'input[name*="apikey" i]',
            'input[id*="apikey" i]',
            'input[name*="api-key" i]',
            'input[id*="api-key" i]',
            'input[name*="key" i]',
            'input[placeholder*="api" i]',
            'input[type="text"]',
          ];
          for (const selector of selectors) {
            const input = document.querySelector(selector);
            if (input) {
              input.value = key;
              input.dispatchEvent(new Event('input', { bubbles: true }));
              input.dispatchEvent(new Event('change', { bubbles: true }));
              // Try to find and click save button
              // NOTE(review): ':contains(...)' is not standard CSS — querySelector
              // throws on those two selectors, so only the first two can ever match
              // and reaching a ':contains' entry aborts this evaluate with an error.
              const saveSelectors = [
                'button[type="submit"]',
                'input[type="submit"]',
                'button:contains("Save")',
                'button:contains("Apply")',
              ];
              for (const btnSel of saveSelectors) {
                const btn = document.querySelector(btnSel);
                if (btn) {
                  btn.click();
                  break;
                }
              }
              // Also save to storage
              if (typeof chrome !== 'undefined' && chrome.storage) {
                chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
                chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
              }
              return true;
            }
          }
          // Fallback: Just save to storage
          if (typeof chrome !== 'undefined' && chrome.storage) {
            chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
            chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
            return true;
          }
          return false;
        }, apiKey);
        await configPage.close();
        if (configured) {
          console.log('[+] 2captcha API key configured successfully via options page');
          // Mark as configured
          fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());
          return { success: true, method: 'options_page' };
        }
      } catch (e) {
        console.warn(`[⚠️] Failed to configure via options page: ${e.message}`);
        try {
          await configPage.close();
        } catch (e2) {}
      }
      return { success: false, error: 'Could not configure via any method' };
    } finally {
      // Disconnect (not close) — the Chrome session is shared with other plugins
      browser.disconnect();
    }
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  }
}
/**
 * CLI entrypoint: run the 2captcha configuration step and report results.
 *
 * Emits the runner protocol lines (START_TS / END_TS / DURATION / STATUS,
 * plus a single-line RESULT_JSON blob), then exits 0 when the step
 * succeeded or was skipped, 1 otherwise.
 */
async function main() {
  const cliArgs = parseArgs();
  const { url, snapshot_id: snapshotId } = cliArgs;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__21_captcha2_config.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }

  const startTs = new Date();
  let status = 'failed';
  let error = '';

  try {
    const outcome = await configure2Captcha();
    if (outcome.skipped) {
      status = 'skipped';
    } else if (outcome.success) {
      status = 'succeeded';
    } else {
      error = outcome.error || 'Configuration failed';
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
  }

  const endTs = new Date();
  const durationSec = (endTs - startTs) / 1000;

  // Runner protocol: key=value lines on stdout, errors on stderr
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${durationSec.toFixed(2)}`);
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }

  // Structured summary for machine consumers
  const resultJson = {
    extractor: 'captcha2_config',
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(durationSec * 100) / 100,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' || status === 'skipped' ? 0 : 1);
}
// Top-level runner: treat any unhandled rejection from main() as fatal.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,184 @@
"""
Unit tests for captcha2 plugin
Tests invoke the plugin hooks as external processes and verify outputs/side effects.
"""
import json
import os
import subprocess
import tempfile
from pathlib import Path
import pytest
# Paths to the plugin's hook scripts, resolved relative to this test file
# (tests/ lives one level below the plugin directory).
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__01_captcha2.js"
CONFIG_SCRIPT = PLUGIN_DIR / "on_Snapshot__21_captcha2_config.js"
def test_install_script_exists():
    """The on_Snapshot__01 install hook must ship with the plugin."""
    assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}"
def test_config_script_exists():
    """The on_Snapshot__21 config hook must ship with the plugin."""
    assert CONFIG_SCRIPT.exists(), f"Config script not found: {CONFIG_SCRIPT}"
def test_extension_metadata():
    """The install script must export EXTENSION metadata with the expected id/name."""
    with tempfile.TemporaryDirectory() as tmpdir:
        child_env = os.environ.copy()
        child_env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")
        # Load the module in node and dump its EXTENSION export as JSON
        loader = f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"
        result = subprocess.run(
            ["node", "-e", loader],
            capture_output=True,
            text=True,
            env=child_env,
        )
        assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"
        metadata = json.loads(result.stdout)
        assert metadata["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
        assert metadata["name"] == "captcha2"
def test_install_creates_cache():
    """Running the install hook should write captcha2.extension.json into the extensions dir."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        child_env = os.environ.copy()
        child_env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        child_env["API_KEY_2CAPTCHA"] = "test_api_key"
        # Run the install hook exactly as the plugin runner would
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=child_env,
            timeout=60,
        )
        # The script logs either a fresh install or a cache hit
        assert "[*] Installing 2captcha extension" in result.stdout or "[*] 2captcha extension already installed" in result.stdout
        cache_file = ext_dir / "captcha2.extension.json"
        assert cache_file.exists(), "Cache file should be created"
        # The cache must round-trip the extension metadata
        cache_data = json.loads(cache_file.read_text())
        assert cache_data["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
        assert cache_data["name"] == "captcha2"
        assert "unpacked_path" in cache_data
        assert "version" in cache_data
def test_install_uses_existing_cache():
    """With a pre-populated cache file + unpacked dir, the install hook should not reinstall."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        # Fabricate a previously-installed extension: unpacked dir + manifest + cache record
        fake_extension_dir = ext_dir / "ifibfemgeogfhoebkmokieepdoobkbpo__captcha2"
        fake_extension_dir.mkdir(parents=True)
        (fake_extension_dir / "manifest.json").write_text(
            json.dumps({"version": "3.7.0", "name": "2Captcha Solver"})
        )
        (ext_dir / "captcha2.extension.json").write_text(json.dumps({
            "webstore_id": "ifibfemgeogfhoebkmokieepdoobkbpo",
            "name": "captcha2",
            "unpacked_path": str(fake_extension_dir),
            "version": "3.7.0",
        }))
        child_env = os.environ.copy()
        child_env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        child_env["API_KEY_2CAPTCHA"] = "test_api_key"
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=child_env,
            timeout=30,
        )
        # Should use cache
        assert "already installed (using cache)" in result.stdout or "Installed extension captcha2" in result.stdout
def test_install_warns_without_api_key():
    """When API_KEY_2CAPTCHA is unset, the install hook must warn about it."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        # Deliberately omit API_KEY_2CAPTCHA from the child environment
        child_env = os.environ.copy()
        child_env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=child_env,
            timeout=60,
        )
        # The warning may land on stdout or stderr, so check both
        combined_output = result.stdout + result.stderr
        assert "API_KEY_2CAPTCHA not configured" in combined_output or "Set API_KEY_2CAPTCHA" in combined_output
def test_install_success_with_api_key():
    """When API_KEY_2CAPTCHA is set, the install hook should acknowledge the key."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        child_env = os.environ.copy()
        child_env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        child_env["API_KEY_2CAPTCHA"] = "test_valid_api_key_123"
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=child_env,
            timeout=60,
        )
        # The acknowledgement may land on stdout or stderr, so check both
        combined_output = result.stdout + result.stderr
        assert "API key configured" in combined_output or "API_KEY_2CAPTCHA" in combined_output
def test_config_script_structure():
    """Static sanity-check of the config script's contents (no execution)."""
    content = CONFIG_SCRIPT.read_text()
    # Should mention configuration marker file
    assert "CONFIG_MARKER" in content or "captcha2_configured" in content
    # Should mention API key
    assert "API_KEY_2CAPTCHA" in content
    # Should have main function or be executable
    assert "async function" in content or "main" in content

View File

@@ -0,0 +1,158 @@
#!/usr/bin/env python3
"""
Clean up Chrome browser session started by chrome_session extractor.
This extractor runs after all Chrome-based extractors (screenshot, pdf, dom)
to terminate the Chrome process and clean up any leftover files.
Usage: on_Snapshot__24_chrome_cleanup.py --url=<url> --snapshot-id=<uuid>
Output: Terminates Chrome process and removes lock files
Environment variables:
CHROME_USER_DATA_DIR: Chrome profile directory (for lock file cleanup)
CHROME_PROFILE_NAME: Chrome profile name (default: Default)
"""
import json
import os
import signal
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'chrome_cleanup'
CHROME_SESSION_DIR = 'chrome_session'  # directory written by the chrome_session extractor (pid.txt lives here)
def get_env(name: str, default: str = '') -> str:
    """Read an environment variable, stripping surrounding whitespace (the default is stripped too)."""
    value = os.environ.get(name, default)
    return value.strip()
def cleanup_chrome_session() -> tuple[bool, str | None, str]:
    """
    Clean up the Chrome session started by the chrome_session extractor.

    Reads the Chrome PID from chrome_session/pid.txt, sends SIGTERM, waits up
    to ~1s for a graceful exit, then escalates to SIGKILL. Also removes Chrome
    profile SingletonLock files when CHROME_USER_DATA_DIR is configured.

    Returns: (success, output_info, error_message)
    """
    session_dir = Path(CHROME_SESSION_DIR)
    if not session_dir.exists():
        return True, 'No chrome_session directory found', ''
    pid_file = session_dir / 'pid.txt'
    killed = False
    if pid_file.exists():
        try:
            pid = int(pid_file.read_text().strip())
            # Try graceful termination first
            try:
                os.kill(pid, signal.SIGTERM)
                killed = True
                # Wait briefly (up to ~1s) for graceful shutdown
                for _ in range(10):
                    try:
                        os.kill(pid, 0)  # signal 0 = existence check only
                        time.sleep(0.1)
                    except OSError:
                        break  # Process is gone
                else:
                    # Still alive after the grace period: force kill
                    try:
                        os.kill(pid, signal.SIGKILL)
                    except OSError:
                        pass
            except ProcessLookupError:
                # Process already dead, that's fine.
                # (Replaces the previous hardcoded `e.errno == 3` check:
                # ProcessLookupError is the errno.ESRCH subclass of OSError,
                # and is portable across platforms.)
                pass
            except OSError as e:
                return False, None, f'Failed to kill Chrome PID {pid}: {e}'
        except ValueError:
            return False, None, f'Invalid PID in {pid_file}'
        except Exception as e:
            return False, None, f'{type(e).__name__}: {e}'
    # Clean up Chrome profile lock files if configured
    user_data_dir = get_env('CHROME_USER_DATA_DIR', '')
    profile_name = get_env('CHROME_PROFILE_NAME', 'Default')
    if user_data_dir:
        user_data_path = Path(user_data_dir)
        for lockfile in [
            user_data_path / 'SingletonLock',
            user_data_path / profile_name / 'SingletonLock',
        ]:
            try:
                lockfile.unlink(missing_ok=True)
            except Exception:
                pass  # Best effort cleanup
    result_info = f'Chrome cleanup: PID {"killed" if killed else "not found"}'
    return True, result_info, ''
@click.command()
@click.option('--url', required=True, help='URL that was loaded')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Clean up Chrome browser session."""
    start_ts = datetime.now(timezone.utc)
    status = 'failed'
    output = None
    error = ''
    try:
        success, output, error = cleanup_chrome_session()
        status = 'succeeded' if success else 'failed'
        if success:
            print(f'Chrome cleanup completed: {output}')
    except Exception as e:
        status = 'failed'
        error = f'{type(e).__name__}: {e}'
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    # Emit the machine-readable key=value lines consumed by the plugin runner
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Single-line JSON summary for structured consumers
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,483 @@
#!/usr/bin/env node
/**
* Chrome Extension Management Utilities
*
* Handles downloading, installing, and managing Chrome extensions for browser automation.
* Ported from the TypeScript implementation in archivebox.ts
*/
const fs = require('fs');
const path = require('path');
const crypto = require('crypto');
const { exec } = require('child_process');
const { promisify } = require('util');
const { Readable } = require('stream');
const { finished } = require('stream/promises');
const execAsync = promisify(exec);
// Try to import unzipper, fallback to system unzip if not available.
// `unzip` stays null when the library is missing; installExtension() then
// relies solely on the /usr/bin/unzip binary.
let unzip = null;
try {
  const unzipper = require('unzipper');
  // Wrap unzipper's stream API into a simple (sourcePath, destPath) => Promise helper
  unzip = async (sourcePath, destPath) => {
    const stream = fs.createReadStream(sourcePath).pipe(unzipper.Extract({ path: destPath }));
    return stream.promise();
  };
} catch (err) {
  // Will use system unzip command as fallback
}
/**
 * Compute the dynamic extension ID Chrome assigns to an unpacked extension.
 *
 * Chrome hashes the unpacked directory path with SHA256 and remaps the first
 * 32 hex digits onto the letters 'a'-'p' (hex 0 -> 'a', ... hex f -> 'p').
 *
 * @param {string} unpacked_path - Path to the unpacked extension directory
 * @returns {string} - 32-character extension ID (characters a-p only)
 */
function getExtensionId(unpacked_path) {
  const digest = crypto
    .createHash('sha256')
    .update(Buffer.from(unpacked_path, 'utf-8'))
    .digest('hex');
  const BASE = 'a'.charCodeAt(0);
  let id = '';
  for (const hexChar of digest.slice(0, 32)) {
    id += String.fromCharCode(BASE + parseInt(hexChar, 16));
  }
  return id;
}
/**
 * Download and install a Chrome extension from the Chrome Web Store.
 *
 * Downloads the CRX to extension.crx_path (only when neither the unpacked
 * manifest nor the CRX already exists), then extracts it into
 * extension.unpacked_path via the system unzip binary, falling back to the
 * optional unzipper library when available.
 *
 * @param {Object} extension - Extension metadata object
 * @param {string} extension.webstore_id - Chrome Web Store extension ID
 * @param {string} extension.name - Human-readable extension name
 * @param {string} extension.crx_url - URL to download the CRX file
 * @param {string} extension.crx_path - Local path to save the CRX file
 * @param {string} extension.unpacked_path - Path to extract the extension
 * @returns {Promise<boolean>} - True if installation succeeded
 */
async function installExtension(extension) {
  const manifest_path = path.join(extension.unpacked_path, 'manifest.json');
  // Download CRX file if not already downloaded
  if (!fs.existsSync(manifest_path) && !fs.existsSync(extension.crx_path)) {
    console.log(`[🛠️] Downloading missing extension ${extension.name} ${extension.webstore_id} -> ${extension.crx_path}`);
    try {
      // Ensure parent directory exists
      const crxDir = path.dirname(extension.crx_path);
      if (!fs.existsSync(crxDir)) {
        fs.mkdirSync(crxDir, { recursive: true });
      }
      // Download CRX file from Chrome Web Store
      const response = await fetch(extension.crx_url);
      if (!response.ok) {
        console.warn(`[⚠️] Failed to download extension ${extension.name}: HTTP ${response.status}`);
        return false;
      }
      if (response.body) {
        // Stream the response body straight to disk
        const crx_file = fs.createWriteStream(extension.crx_path);
        const crx_stream = Readable.fromWeb(response.body);
        await finished(crx_stream.pipe(crx_file));
      } else {
        console.warn(`[⚠️] Failed to download extension ${extension.name}: No response body`);
        return false;
      }
    } catch (err) {
      console.error(`[❌] Failed to download extension ${extension.name}:`, err);
      return false;
    }
  }
  // Unzip CRX file to unpacked_path
  await fs.promises.mkdir(extension.unpacked_path, { recursive: true });
  try {
    // Try system unzip command first.
    // Paths are double-quoted so spaces don't split arguments; paths still must
    // not contain double quotes themselves (they come from our own config).
    await execAsync(`/usr/bin/unzip -o "${extension.crx_path}" -d "${extension.unpacked_path}"`);
  } catch (err1) {
    if (unzip) {
      // Fallback to unzipper library
      try {
        await unzip(extension.crx_path, extension.unpacked_path);
      } catch (err2) {
        // Report both failures: err1 = system unzip, err2 = unzipper fallback
        // (previously only err1 was logged here, hiding the fallback's error)
        console.error(`[❌] Failed to unzip ${extension.crx_path}:`, err1.message, err2.message);
        return false;
      }
    } else {
      console.error(`[❌] Failed to unzip ${extension.crx_path}:`, err1.message);
      return false;
    }
  }
  if (!fs.existsSync(manifest_path)) {
    console.error(`[❌] Failed to install ${extension.crx_path}: could not find manifest.json in unpacked_path`);
    return false;
  }
  return true;
}
/**
 * Load or install a Chrome extension, computing all metadata.
 *
 * Mutates and returns `ext`: fills in webstore/crx URLs and paths, installs
 * the extension on first use, and derives the dynamic `id` from the unpacked
 * path (unpacked extensions don't have stable webstore IDs at runtime).
 *
 * @param {Object} ext - Partial extension metadata (at minimum: webstore_id or unpacked_path)
 * @param {string} [ext.webstore_id] - Chrome Web Store extension ID
 * @param {string} [ext.name] - Human-readable extension name
 * @param {string} [ext.unpacked_path] - Path to unpacked extension
 * @param {string} [extensions_dir] - Directory to store extensions
 * @returns {Promise<Object>} - Complete extension metadata object
 */
async function loadOrInstallExtension(ext, extensions_dir = null) {
  if (!ext.webstore_id && !ext.unpacked_path) {
    throw new Error('Extension must have either {webstore_id} or {unpacked_path}');
  }
  // Resolve the extensions directory: explicit arg > env var > default
  const EXTENSIONS_DIR = extensions_dir || process.env.CHROME_EXTENSIONS_DIR || './data/chrome_extensions';
  // Fill in statically-derivable metadata, keeping any caller-provided values
  ext.webstore_id = ext.webstore_id || ext.id;
  ext.name = ext.name || ext.webstore_id;
  ext.webstore_url = ext.webstore_url || `https://chromewebstore.google.com/detail/${ext.webstore_id}`;
  ext.crx_url = ext.crx_url || `https://clients2.google.com/service/update2/crx?response=redirect&prodversion=1230&acceptformat=crx3&x=id%3D${ext.webstore_id}%26uc`;
  ext.crx_path = ext.crx_path || path.join(EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}.crx`);
  ext.unpacked_path = ext.unpacked_path || path.join(EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}`);
  const manifest_path = path.join(ext.unpacked_path, 'manifest.json');
  // Lazy readers so callers can re-check the manifest after installation
  ext.read_manifest = () => JSON.parse(fs.readFileSync(manifest_path, 'utf-8'));
  ext.read_version = () => (fs.existsSync(manifest_path) ? ext.read_manifest()?.version : null) || null;
  // Download + unpack on first use
  if (!ext.read_version()) {
    await installExtension(ext);
  }
  // Autodetect ID from filesystem path (unpacked extensions don't have stable IDs)
  ext.id = getExtensionId(ext.unpacked_path);
  ext.version = ext.read_version();
  if (ext.version) {
    console.log(`[] Installed extension ${ext.name} (${ext.version})... ${ext.unpacked_path}`);
  } else {
    console.warn(`[❌] Unable to detect ID and version of installed extension ${ext.unpacked_path}`);
  }
  return ext;
}
/**
 * Check if a Puppeteer target is an extension background page/service worker.
 *
 * Tolerates targets that close mid-inspection (a harmless race): those are
 * reported as type 'closed' with url 'about:closed' instead of throwing.
 *
 * @param {Object} target - Puppeteer target object
 * @returns {Promise<Object>} - Object with target_is_bg, extension_id, manifest_version, etc.
 */
async function isTargetExtension(target) {
  let target_type;
  let target_ctx;
  let target_url;
  try {
    target_type = target.type();
    // Prefer the service-worker context, fall back to the page context
    target_ctx = (await target.worker()) || (await target.page()) || null;
    target_url = target.url() || target_ctx?.url() || null;
  } catch (err) {
    if (String(err).includes('No target with given id found')) {
      // Target closed during check, ignore harmless race condition
      target_type = 'closed';
      target_ctx = null;
      target_url = 'about:closed';
    } else {
      throw err;
    }
  }
  // Check if this is an extension background page or service worker
  const is_chrome_extension = target_url?.startsWith('chrome-extension://');
  const is_background_page = target_type === 'background_page';
  const is_service_worker = target_type === 'service_worker';
  const target_is_bg = is_chrome_extension && (is_background_page || is_service_worker);
  let extension_id = null;
  let manifest_version = null;
  const target_is_extension = is_chrome_extension || target_is_bg;
  if (target_is_extension) {
    try {
      // chrome-extension://<id>/... -> <id>
      extension_id = target_url?.split('://')[1]?.split('/')[0] || null;
      if (target_ctx) {
        const manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest());
        manifest_version = manifest?.manifest_version || null;
      }
    } catch (err) {
      // Failed to get extension metadata; leave extension_id/manifest_version null
    }
  }
  return {
    target_is_extension,
    target_is_bg,
    target_type,
    target_ctx,
    target_url,
    extension_id,
    manifest_version,
  };
}
/**
 * Load extension metadata and connection handlers from a browser target.
 *
 * Only acts on extension background pages / service workers (as reported by
 * isTargetExtension); all other targets return null. On a match, the entry
 * in `extensions` is mutated in place (via Object.assign) and the enriched
 * object is also returned.
 *
 * @param {Array} extensions - Array of extension metadata objects to update
 * @param {Object} target - Puppeteer target object
 * @returns {Promise<Object|null>} - Updated extension object or null if not an extension
 */
async function loadExtensionFromTarget(extensions, target) {
  const {
    target_is_bg,
    target_is_extension,
    target_type,
    target_ctx,
    target_url,
    extension_id,
    manifest_version,
  } = await isTargetExtension(target);
  // Only background contexts with a resolvable id are connectable
  if (!(target_is_bg && extension_id && target_ctx)) {
    return null;
  }
  // Find matching extension in our list (matched by dynamic extension id)
  const extension = extensions.find(ext => ext.id === extension_id);
  if (!extension) {
    console.warn(`[⚠️] Found loaded extension ${extension_id} that's not in CHROME_EXTENSIONS list`);
    return null;
  }
  // Load manifest from the extension context
  let manifest = null;
  try {
    manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest());
  } catch (err) {
    console.error(`[❌] Failed to read manifest for extension ${extension_id}:`, err);
    return null;
  }
  // Create dispatch methods for communicating with the extension
  const new_extension = {
    ...extension,
    target,
    target_type,
    target_url,
    manifest,
    manifest_version,
    // Trigger extension toolbar button click.
    // NOTE(review): every call registers a fresh onClicked listener that is
    // never removed, and the passed tabId is unused inside the evaluate —
    // confirm whether that's intentional.
    dispatchAction: async (tab) => {
      return await target_ctx.evaluate((tabId) => {
        return new Promise((resolve) => {
          chrome.action.onClicked.addListener((tab) => {
            resolve({ success: true, tab });
          });
          chrome.action.openPopup();
        });
      }, tab?.id || null);
    },
    // Send message to extension via chrome.runtime.sendMessage
    dispatchMessage: async (message, options = {}) => {
      return await target_ctx.evaluate((msg, opts) => {
        return new Promise((resolve) => {
          chrome.runtime.sendMessage(msg, opts, (response) => {
            resolve(response);
          });
        });
      }, message, options);
    },
    // Trigger extension command (keyboard shortcut).
    // NOTE(review): this promise only settles if something else fires the
    // command, so callers awaiting it may hang — see the note below.
    dispatchCommand: async (command) => {
      return await target_ctx.evaluate((cmd) => {
        return new Promise((resolve) => {
          chrome.commands.onCommand.addListener((receivedCommand) => {
            if (receivedCommand === cmd) {
              resolve({ success: true, command: receivedCommand });
            }
          });
          // Note: Actually triggering commands programmatically is not directly supported
          // This would need to be done via CDP or keyboard simulation
        });
      }, command);
    },
  };
  // Update the extension in the array
  Object.assign(extension, new_extension);
  console.log(`[🔌] Connected to extension ${extension.name} (${extension.version})`);
  return new_extension;
}
/**
 * Ensure every extension in the list is installed locally.
 * Installs sequentially (downloads + unzips are disk/network bound).
 *
 * @param {Array} extensions - Array of extension metadata objects
 * @param {string} [extensions_dir] - Directory to store extensions
 * @returns {Promise<Array>} - Array of installed extension objects
 */
async function installAllExtensions(extensions, extensions_dir = null) {
  console.log(`[⚙️] Installing ${extensions.length} chrome extensions...`);
  for (let idx = 0; idx < extensions.length; idx++) {
    await loadOrInstallExtension(extensions[idx], extensions_dir);
  }
  return extensions;
}
/**
 * Load and connect to all extensions from a running browser.
 * Scans every browser target and wires up connection handlers for any
 * that match an entry in `extensions`.
 *
 * @param {Object} browser - Puppeteer browser instance
 * @param {Array} extensions - Array of extension metadata objects
 * @returns {Promise<Array>} - Array of loaded extension objects with connection handlers
 */
async function loadAllExtensionsFromBrowser(browser, extensions) {
  console.log(`[⚙️] Loading ${extensions.length} chrome extensions from browser...`);
  const targets = browser.targets();
  for (const browserTarget of targets) {
    await loadExtensionFromTarget(extensions, browserTarget);
  }
  return extensions;
}
/**
 * Read and parse an extension's manifest.json.
 *
 * @param {string} unpacked_path - Path to unpacked extension directory
 * @returns {object|null} - Parsed manifest object, or null when the file is
 *                          missing, unreadable, or not valid JSON
 */
function loadExtensionManifest(unpacked_path) {
  const manifestFile = path.join(unpacked_path, 'manifest.json');
  try {
    const raw = fs.readFileSync(manifestFile, 'utf-8');
    return JSON.parse(raw);
  } catch (error) {
    // Missing file, read error, and invalid JSON all yield null
    return null;
  }
}
/**
 * Generate Chrome launch arguments for loading extensions.
 * Extensions without an unpacked_path are skipped; ids fall back from
 * webstore_id to the computed dynamic id.
 *
 * @param {Array} extensions - Array of extension metadata objects
 * @returns {Array<string>} - Chrome CLI arguments for loading extensions
 */
function getExtensionLaunchArgs(extensions) {
  if (!extensions?.length) {
    return [];
  }
  // Only extensions that are actually unpacked on disk can be loaded
  const usable = extensions.filter((ext) => ext.unpacked_path);
  const unpackedPaths = [];
  const extensionIds = [];
  for (const ext of usable) {
    unpackedPaths.push(ext.unpacked_path);
    extensionIds.push(ext.webstore_id || ext.id);
  }
  return [
    `--load-extension=${unpackedPaths.join(',')}`,
    `--allowlisted-extension-id=${extensionIds.join(',')}`,
    '--allow-legacy-extension-manifests',
    '--disable-extensions-auto-update',
  ];
}
// Export all functions (public API of this module)
module.exports = {
  getExtensionId,
  loadExtensionManifest,
  installExtension,
  loadOrInstallExtension,
  isTargetExtension,
  loadExtensionFromTarget,
  installAllExtensions,
  loadAllExtensionsFromBrowser,
  getExtensionLaunchArgs,
};
// CLI usage: expose a subset of helpers for shelling out from other scripts
if (require.main === module) {
  const args = process.argv.slice(2);
  if (args.length === 0) {
    console.log('Usage: chrome_extension_utils.js <command> [args...]');
    console.log('');
    console.log('Commands:');
    console.log('  getExtensionId <path>');
    console.log('  loadExtensionManifest <path>');
    console.log('  getExtensionLaunchArgs <extensions_json>');
    console.log('  loadOrInstallExtension <webstore_id> <name> [extensions_dir]');
    process.exit(1);
  }
  const [command, ...commandArgs] = args;
  (async () => {
    try {
      switch (command) {
        case 'getExtensionId': {
          const [unpacked_path] = commandArgs;
          const id = getExtensionId(unpacked_path);
          console.log(id);
          break;
        }
        case 'loadExtensionManifest': {
          const [unpacked_path] = commandArgs;
          const manifest = loadExtensionManifest(unpacked_path);
          console.log(JSON.stringify(manifest));
          break;
        }
        case 'getExtensionLaunchArgs': {
          const [extensions_json] = commandArgs;
          const extensions = JSON.parse(extensions_json);
          // shadows the outer `args` intentionally (block-scoped)
          const args = getExtensionLaunchArgs(extensions);
          console.log(JSON.stringify(args));
          break;
        }
        case 'loadOrInstallExtension': {
          const [webstore_id, name, extensions_dir] = commandArgs;
          const ext = await loadOrInstallExtension({ webstore_id, name }, extensions_dir);
          console.log(JSON.stringify(ext, null, 2));
          break;
        }
        default:
          console.error(`Unknown command: ${command}`);
          process.exit(1);
      }
    } catch (error) {
      console.error(`Error: ${error.message}`);
      process.exit(1);
    }
  })();
}

View File

@@ -0,0 +1,329 @@
/**
* Unit tests for chrome_extension_utils.js
*
* Run with: npm test
* Or: node --test tests/test_chrome_extension_utils.js
*/
const assert = require('assert');
const fs = require('fs');
const path = require('path');
const { describe, it, before, after, beforeEach, afterEach } = require('node:test');
// Import module under test
const extensionUtils = require('../chrome_extension_utils.js');
// Test fixtures: scratch directories created in before() and removed in after()
const TEST_DIR = path.join(__dirname, '.test_fixtures');
const TEST_EXTENSIONS_DIR = path.join(TEST_DIR, 'chrome_extensions');
describe('chrome_extension_utils', () => {
before(() => {
  // Create the shared fixtures directory once for the whole suite
  if (!fs.existsSync(TEST_DIR)) {
    fs.mkdirSync(TEST_DIR, { recursive: true });
  }
});
after(() => {
  // Remove all fixtures after the suite finishes
  if (fs.existsSync(TEST_DIR)) {
    fs.rmSync(TEST_DIR, { recursive: true, force: true });
  }
});
describe('getExtensionId', () => {
  it('should compute extension ID from path', () => {
    // 32 chars drawn from 'a'..'p' (hex digits remapped)
    const id = extensionUtils.getExtensionId('/path/to/extension');
    assert.strictEqual(typeof id, 'string');
    assert.strictEqual(id.length, 32);
    assert.match(id, /^[a-p]+$/);
  });
  it('should compute ID even for non-existent paths', () => {
    // The ID is derived purely from the path string, not the filesystem
    const id = extensionUtils.getExtensionId('/nonexistent/path');
    assert.strictEqual(typeof id, 'string');
    assert.strictEqual(id.length, 32);
    assert.match(id, /^[a-p]+$/);
  });
  it('should return consistent ID for same path', () => {
    const samePath = '/path/to/extension';
    assert.strictEqual(
      extensionUtils.getExtensionId(samePath),
      extensionUtils.getExtensionId(samePath)
    );
  });
  it('should return different IDs for different paths', () => {
    assert.notStrictEqual(
      extensionUtils.getExtensionId('/path/to/extension1'),
      extensionUtils.getExtensionId('/path/to/extension2')
    );
  });
});
describe('loadExtensionManifest', () => {
  const testExtDir = path.join(TEST_DIR, 'test_extension');
  beforeEach(() => {
    // Fixture: an unpacked extension directory with a valid manifest.json
    fs.mkdirSync(testExtDir, { recursive: true });
    fs.writeFileSync(
      path.join(testExtDir, 'manifest.json'),
      JSON.stringify({ manifest_version: 3, name: "Test Extension", version: "1.0.0" })
    );
  });
  afterEach(() => {
    if (fs.existsSync(testExtDir)) {
      fs.rmSync(testExtDir, { recursive: true });
    }
  });
  it('should load valid manifest.json', () => {
    const manifest = extensionUtils.loadExtensionManifest(testExtDir);
    assert.notStrictEqual(manifest, null);
    assert.strictEqual(manifest.manifest_version, 3);
    assert.strictEqual(manifest.name, "Test Extension");
    assert.strictEqual(manifest.version, "1.0.0");
  });
  it('should return null for missing manifest', () => {
    assert.strictEqual(
      extensionUtils.loadExtensionManifest(path.join(TEST_DIR, 'nonexistent')),
      null
    );
  });
  it('should handle invalid JSON gracefully', () => {
    const brokenDir = path.join(TEST_DIR, 'invalid_extension');
    fs.mkdirSync(brokenDir, { recursive: true });
    fs.writeFileSync(path.join(brokenDir, 'manifest.json'), 'invalid json content');
    assert.strictEqual(extensionUtils.loadExtensionManifest(brokenDir), null);
    fs.rmSync(brokenDir, { recursive: true });
  });
});
describe('getExtensionLaunchArgs', () => {
  it('should return empty array for no extensions', () => {
    assert.deepStrictEqual(extensionUtils.getExtensionLaunchArgs([]), []);
  });
  it('should generate correct launch args for single extension', () => {
    const launchArgs = extensionUtils.getExtensionLaunchArgs([
      { webstore_id: 'abcd1234', unpacked_path: '/path/to/extension' },
    ]);
    assert.deepStrictEqual(launchArgs, [
      '--load-extension=/path/to/extension',
      '--allowlisted-extension-id=abcd1234',
      '--allow-legacy-extension-manifests',
      '--disable-extensions-auto-update',
    ]);
  });
  it('should generate correct launch args for multiple extensions', () => {
    const launchArgs = extensionUtils.getExtensionLaunchArgs([
      { webstore_id: 'ext1', unpacked_path: '/path/ext1' },
      { webstore_id: 'ext2', unpacked_path: '/path/ext2' },
      { webstore_id: 'ext3', unpacked_path: '/path/ext3' },
    ]);
    assert.strictEqual(launchArgs.length, 4);
    assert.strictEqual(launchArgs[0], '--load-extension=/path/ext1,/path/ext2,/path/ext3');
    assert.strictEqual(launchArgs[1], '--allowlisted-extension-id=ext1,ext2,ext3');
  });
  it('should handle extensions with id instead of webstore_id', () => {
    // Falls back to the computed dynamic `id` when webstore_id is absent
    const launchArgs = extensionUtils.getExtensionLaunchArgs([
      { id: 'computed_id', unpacked_path: '/path/to/extension' },
    ]);
    assert.strictEqual(launchArgs[1], '--allowlisted-extension-id=computed_id');
  });
  it('should filter out extensions without paths', () => {
    const launchArgs = extensionUtils.getExtensionLaunchArgs([
      { webstore_id: 'ext1', unpacked_path: '/path/ext1' },
      { webstore_id: 'ext2', unpacked_path: null },
      { webstore_id: 'ext3', unpacked_path: '/path/ext3' },
    ]);
    assert.strictEqual(launchArgs[0], '--load-extension=/path/ext1,/path/ext3');
    assert.strictEqual(launchArgs[1], '--allowlisted-extension-id=ext1,ext3');
  });
});
describe('loadOrInstallExtension', () => {
  beforeEach(() => {
    // Create test extensions directory
    if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
      fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
    }
  });
  afterEach(() => {
    // Cleanup test extensions directory
    if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
      fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
    }
  });
  it('should throw error if neither webstore_id nor unpacked_path provided', async () => {
    await assert.rejects(
      async () => {
        await extensionUtils.loadOrInstallExtension({}, TEST_EXTENSIONS_DIR);
      },
      /Extension must have either/
    );
  });
  it('should set correct default values for extension metadata', async () => {
    const input = {
      webstore_id: 'test123',
      name: 'test_extension'
    };
    // FIX: previously this test reassigned extensionUtils.installExtension as
    // a "mock", but loadOrInstallExtension calls the module-internal binding,
    // not the exported property — so the mock never ran and the test kicked
    // off a real network download. Pre-creating the unpacked manifest makes
    // read_version() succeed and skips installation entirely.
    const extDir = path.join(TEST_EXTENSIONS_DIR, 'test123__test_extension');
    fs.mkdirSync(extDir, { recursive: true });
    fs.writeFileSync(
      path.join(extDir, 'manifest.json'),
      JSON.stringify({ version: '1.0.0' })
    );
    const ext = await extensionUtils.loadOrInstallExtension(input, TEST_EXTENSIONS_DIR);
    assert.strictEqual(ext.webstore_id, 'test123');
    assert.strictEqual(ext.name, 'test_extension');
    assert.ok(ext.webstore_url.includes(ext.webstore_id));
    assert.ok(ext.crx_url.includes(ext.webstore_id));
    assert.ok(ext.crx_path.includes('test123__test_extension.crx'));
    assert.ok(ext.unpacked_path.includes('test123__test_extension'));
  });
  it('should detect version from manifest after installation', async () => {
    const input = {
      webstore_id: 'test456',
      name: 'versioned_extension'
    };
    // Create pre-installed extension so no download is attempted
    const extDir = path.join(TEST_EXTENSIONS_DIR, 'test456__versioned_extension');
    fs.mkdirSync(extDir, { recursive: true });
    fs.writeFileSync(
      path.join(extDir, 'manifest.json'),
      JSON.stringify({
        manifest_version: 3,
        name: "Versioned Extension",
        version: "2.5.1"
      })
    );
    const ext = await extensionUtils.loadOrInstallExtension(input, TEST_EXTENSIONS_DIR);
    assert.strictEqual(ext.version, '2.5.1');
  });
});
describe('isTargetExtension', () => {
  // Build a minimal stand-in for a Puppeteer Target object.
  const makeTarget = ({ type, url }) => ({
    type: () => type,
    url: () => url,
    worker: async () => null,
    page: async () => null
  });
  it('should identify extension targets by URL', async () => {
    const target = makeTarget({
      type: 'service_worker',
      url: 'chrome-extension://abcdefgh/background.js'
    });
    const result = await extensionUtils.isTargetExtension(target);
    assert.strictEqual(result.target_is_extension, true);
    assert.strictEqual(result.target_is_bg, true);
    assert.strictEqual(result.extension_id, 'abcdefgh');
  });
  it('should not identify non-extension targets', async () => {
    const target = makeTarget({ type: 'page', url: 'https://example.com' });
    const result = await extensionUtils.isTargetExtension(target);
    assert.strictEqual(result.target_is_extension, false);
    assert.strictEqual(result.target_is_bg, false);
    assert.strictEqual(result.extension_id, null);
  });
  it('should handle closed targets gracefully', async () => {
    // Every accessor throws the way a detached CDP target does.
    const boom = () => { throw new Error('No target with given id found'); };
    const target = {
      type: boom,
      url: boom,
      worker: async () => boom(),
      page: async () => boom()
    };
    const result = await extensionUtils.isTargetExtension(target);
    assert.strictEqual(result.target_type, 'closed');
    assert.strictEqual(result.target_url, 'about:closed');
  });
});
});
// Run tests if executed directly
if (require.main === module) {
  [
    'Run tests with: npm test',
    'Or: node --test tests/test_chrome_extension_utils.js',
  ].forEach((line) => console.log(line));
}

View File

@@ -0,0 +1,224 @@
"""
Unit tests for chrome_extension_utils.js
Tests invoke the script as an external process and verify outputs/side effects.
"""
import json
import subprocess
import tempfile
from pathlib import Path
import pytest
# Path to the Node.js script under test (a sibling of the tests/ directory).
SCRIPT_PATH = Path(__file__).parent.parent / "chrome_extension_utils.js"
def test_script_exists():
    """Verify the script file exists and is executable via node"""
    script = SCRIPT_PATH
    assert script.exists(), f"Script not found: {script}"
def test_get_extension_id():
    """Test extension ID computation from path"""
    # The ID is derived purely from the path string, so no temporary
    # directory is needed (the previous one was created but never used).
    test_path = "/path/to/extension"
    # Run script with test path
    result = subprocess.run(
        ["node", str(SCRIPT_PATH), "getExtensionId", test_path],
        capture_output=True,
        text=True
    )
    assert result.returncode == 0, f"Script failed: {result.stderr}"
    extension_id = result.stdout.strip()
    # Chrome extension IDs are 32 chars drawn only from the a-p alphabet.
    assert len(extension_id) == 32
    assert all(c in 'abcdefghijklmnop' for c in extension_id)
def test_get_extension_id_consistency():
    """Test that same path produces same ID"""
    cmd = ["node", str(SCRIPT_PATH), "getExtensionId", "/path/to/extension"]
    first = subprocess.run(cmd, capture_output=True, text=True)
    second = subprocess.run(cmd, capture_output=True, text=True)
    assert first.returncode == 0
    assert second.returncode == 0
    # Derivation must be deterministic across invocations.
    assert first.stdout.strip() == second.stdout.strip()
def test_get_extension_id_different_paths():
    """Test that different paths produce different IDs"""
    results = [
        subprocess.run(
            ["node", str(SCRIPT_PATH), "getExtensionId", p],
            capture_output=True,
            text=True,
        )
        for p in ("/path1", "/path2")
    ]
    assert all(r.returncode == 0 for r in results)
    assert results[0].stdout.strip() != results[1].stdout.strip()
def test_load_extension_manifest():
    """Test loading extension manifest.json"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "test_extension"
        ext_dir.mkdir()
        # Write a minimal MV3 manifest for the script to read back.
        (ext_dir / "manifest.json").write_text(json.dumps({
            "manifest_version": 3,
            "name": "Test Extension",
            "version": "1.0.0",
        }))
        proc = subprocess.run(
            ["node", str(SCRIPT_PATH), "loadExtensionManifest", str(ext_dir)],
            capture_output=True,
            text=True,
        )
        assert proc.returncode == 0
        loaded = json.loads(proc.stdout)
        assert loaded["manifest_version"] == 3
        assert loaded["name"] == "Test Extension"
        assert loaded["version"] == "1.0.0"
def test_load_extension_manifest_missing():
    """Test loading manifest from non-existent directory"""
    with tempfile.TemporaryDirectory() as tmpdir:
        missing_dir = Path(tmpdir) / "nonexistent"
        proc = subprocess.run(
            ["node", str(SCRIPT_PATH), "loadExtensionManifest", str(missing_dir)],
            capture_output=True,
            text=True,
        )
        # A missing manifest is not an error: the script reports null/empty.
        assert proc.returncode == 0
        assert proc.stdout.strip() in ("null", "")
def test_load_extension_manifest_invalid_json():
    """Test handling of invalid JSON in manifest"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "test_extension"
        ext_dir.mkdir()
        # Write invalid JSON
        (ext_dir / "manifest.json").write_text("invalid json content")
        proc = subprocess.run(
            ["node", str(SCRIPT_PATH), "loadExtensionManifest", str(ext_dir)],
            capture_output=True,
            text=True,
        )
        # Unparseable manifests must be handled gracefully, not crash.
        assert proc.returncode == 0
        assert proc.stdout.strip() in ("null", "")
def test_get_extension_launch_args_empty():
    """Test launch args with no extensions"""
    proc = subprocess.run(
        ["node", str(SCRIPT_PATH), "getExtensionLaunchArgs", "[]"],
        capture_output=True,
        text=True,
    )
    assert proc.returncode == 0
    assert json.loads(proc.stdout) == []
def test_get_extension_launch_args_single():
    """Test launch args with single extension"""
    payload = json.dumps([{
        "webstore_id": "abcd1234",
        "unpacked_path": "/path/to/extension",
    }])
    proc = subprocess.run(
        ["node", str(SCRIPT_PATH), "getExtensionLaunchArgs", payload],
        capture_output=True,
        text=True,
    )
    assert proc.returncode == 0
    args = json.loads(proc.stdout)
    assert args == [
        "--load-extension=/path/to/extension",
        "--allowlisted-extension-id=abcd1234",
        "--allow-legacy-extension-manifests",
        "--disable-extensions-auto-update",
    ]
def test_get_extension_launch_args_multiple():
    """Test launch args with multiple extensions"""
    payload = json.dumps([
        {"webstore_id": "ext1", "unpacked_path": "/path/ext1"},
        {"webstore_id": "ext2", "unpacked_path": "/path/ext2"},
        {"webstore_id": "ext3", "unpacked_path": "/path/ext3"},
    ])
    proc = subprocess.run(
        ["node", str(SCRIPT_PATH), "getExtensionLaunchArgs", payload],
        capture_output=True,
        text=True,
    )
    assert proc.returncode == 0
    args = json.loads(proc.stdout)
    assert args[0] == "--load-extension=/path/ext1,/path/ext2,/path/ext3"
    assert args[1] == "--allowlisted-extension-id=ext1,ext2,ext3"
def test_get_extension_launch_args_filter_null_paths():
    """Test that extensions without paths are filtered out"""
    payload = json.dumps([
        {"webstore_id": "ext1", "unpacked_path": "/path/ext1"},
        {"webstore_id": "ext2", "unpacked_path": None},
        {"webstore_id": "ext3", "unpacked_path": "/path/ext3"},
    ])
    proc = subprocess.run(
        ["node", str(SCRIPT_PATH), "getExtensionLaunchArgs", payload],
        capture_output=True,
        text=True,
    )
    assert proc.returncode == 0
    args = json.loads(proc.stdout)
    assert args[0] == "--load-extension=/path/ext1,/path/ext3"
    assert args[1] == "--allowlisted-extension-id=ext1,ext3"

View File

@@ -0,0 +1,309 @@
#!/usr/bin/env node
/**
* Navigate the Chrome browser to the target URL.
*
* This extractor runs AFTER pre-load extractors (21-29) have registered their
* CDP listeners. It connects to the existing Chrome session, navigates to the URL,
* waits for page load, and captures response headers.
*
* Usage: on_Snapshot__30_chrome_navigate.js --url=<url> --snapshot-id=<uuid>
* Output: Writes to chrome_session/:
* - response_headers.json: HTTP response headers from main document
* - final_url.txt: Final URL after any redirects
* - page_loaded.txt: Marker file indicating navigation is complete
*
* Environment variables:
* CHROME_PAGELOAD_TIMEOUT: Timeout for page load in seconds (default: 60)
* CHROME_DELAY_AFTER_LOAD: Extra delay after load in seconds (default: 0)
* CHROME_WAIT_FOR: Wait condition (default: networkidle2)
* - domcontentloaded: DOM is ready, resources may still load
* - load: Page fully loaded including resources
* - networkidle0: No network activity for 500ms (strictest)
* - networkidle2: At most 2 network connections for 500ms
*
* # Fallbacks
* TIMEOUT: Fallback timeout
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'chrome_navigate'; // name reported in the RESULT_JSON line
const CHROME_SESSION_DIR = 'chrome_session'; // shared dir created by the chrome_session extractor
/**
 * Parse `--key=value` CLI flags into an object.
 * Dashes in key names become underscores; a bare `--flag` maps to `true`.
 */
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...rest] = token.slice(2).split('=');
    // Re-join so values containing '=' survive intact.
    parsed[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return parsed;
}
// Get environment variable with default
// (empty-string env values fall back to the default, matching `||` semantics).
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}
// Parse a boolean env var; unrecognized values yield the default.
function getEnvBool(name, defaultValue = false) {
  switch (getEnv(name, '').toLowerCase()) {
    case 'true':
    case '1':
    case 'yes':
    case 'on':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
// Parse an integer env var; non-numeric values yield the default.
function getEnvInt(name, defaultValue = 0) {
  const parsed = Number.parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
// Parse a float env var; non-numeric values yield the default.
function getEnvFloat(name, defaultValue = 0) {
  const parsed = Number.parseFloat(getEnv(name, String(defaultValue)));
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
// Read CDP URL from chrome_session; null when the session was never started.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  return fs.existsSync(cdpFile) ? fs.readFileSync(cdpFile, 'utf8').trim() : null;
}
// Read URL from chrome_session (set by chrome_session extractor); null if absent.
function getTargetUrl() {
  const urlFile = path.join(CHROME_SESSION_DIR, 'url.txt');
  return fs.existsSync(urlFile) ? fs.readFileSync(urlFile, 'utf8').trim() : null;
}
// Validate CHROME_WAIT_FOR against Puppeteer's accepted waitUntil values,
// warning and falling back to networkidle2 on anything else.
function getWaitCondition() {
  const requested = getEnv('CHROME_WAIT_FOR', 'networkidle2').toLowerCase();
  const validConditions = ['domcontentloaded', 'load', 'networkidle0', 'networkidle2'];
  if (!validConditions.includes(requested)) {
    console.error(`Warning: Invalid CHROME_WAIT_FOR="${requested}", using networkidle2`);
    return 'networkidle2';
  }
  return requested;
}
// Sleep helper: resolve after `ms` milliseconds.
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Navigate the shared Chrome session's page to `url` and record the results.
 *
 * Connects to the already-running browser over CDP, navigates the most
 * recently created page, captures the main document's response headers and
 * redirect chain, writes response_headers.json / final_url.txt /
 * page_loaded.txt into chrome_session/, then disconnects (Chrome keeps
 * running for post-load extractors).
 *
 * @param {string} url - Target URL to load.
 * @param {string} cdpUrl - WebSocket endpoint of the running Chrome instance.
 * @returns {Promise<object>} `{success, output, finalUrl, status, redirectCount}`
 *   on success, `{success: false, error}` on failure.
 */
async function navigateToUrl(url, cdpUrl) {
  // Timeout precedence: CHROME_PAGELOAD_TIMEOUT > CHROME_TIMEOUT > TIMEOUT (default 60s).
  const timeout = (getEnvInt('CHROME_PAGELOAD_TIMEOUT') || getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
  const delayAfterLoad = getEnvFloat('CHROME_DELAY_AFTER_LOAD', 0) * 1000;
  const waitUntil = getWaitCondition();
  let browser = null;
  let responseHeaders = {};
  let redirectChain = [];
  let finalUrl = url;
  try {
    // Connect to existing browser
    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });
    // Get all pages and find our target page
    const pages = await browser.pages();
    if (pages.length === 0) {
      return { success: false, error: 'No pages found in browser' };
    }
    // Use the last created page (most likely the one chrome_session created)
    const page = pages[pages.length - 1];
    // Set up response interception to capture headers and redirects.
    // Must be registered BEFORE page.goto() so the main-document response is seen.
    page.on('response', async (response) => {
      const request = response.request();
      // Track redirects
      if (response.status() >= 300 && response.status() < 400) {
        redirectChain.push({
          url: response.url(),
          status: response.status(),
          location: response.headers()['location'] || null,
        });
      }
      // Capture headers from the main document request
      if (request.isNavigationRequest() && request.frame() === page.mainFrame()) {
        try {
          responseHeaders = {
            url: response.url(),
            status: response.status(),
            statusText: response.statusText(),
            headers: response.headers(),
          };
          finalUrl = response.url();
        } catch (e) {
          // Ignore errors capturing headers
        }
      }
    });
    // Navigate to URL and wait for load
    console.log(`Navigating to ${url} (wait: ${waitUntil}, timeout: ${timeout}ms)`);
    const response = await page.goto(url, {
      waitUntil,
      timeout,
    });
    // Capture final response if not already captured by the listener above
    if (response && Object.keys(responseHeaders).length === 0) {
      responseHeaders = {
        url: response.url(),
        status: response.status(),
        statusText: response.statusText(),
        headers: response.headers(),
      };
      finalUrl = response.url();
    }
    // Apply optional delay after load
    if (delayAfterLoad > 0) {
      console.log(`Waiting ${delayAfterLoad}ms after load...`);
      await sleep(delayAfterLoad);
    }
    // Write response headers
    if (Object.keys(responseHeaders).length > 0) {
      // Add redirect chain to headers
      responseHeaders.redirect_chain = redirectChain;
      fs.writeFileSync(
        path.join(CHROME_SESSION_DIR, 'response_headers.json'),
        JSON.stringify(responseHeaders, null, 2)
      );
    }
    // Write final URL (after redirects)
    fs.writeFileSync(path.join(CHROME_SESSION_DIR, 'final_url.txt'), finalUrl);
    // Write marker file indicating page is loaded
    fs.writeFileSync(
      path.join(CHROME_SESSION_DIR, 'page_loaded.txt'),
      new Date().toISOString()
    );
    // Disconnect but leave browser running for post-load extractors
    browser.disconnect();
    return {
      success: true,
      output: CHROME_SESSION_DIR,
      finalUrl,
      status: responseHeaders.status,
      redirectCount: redirectChain.length,
    };
  } catch (e) {
    // Don't close browser on error - let cleanup handle it
    if (browser) {
      try {
        browser.disconnect();
      } catch (disconnectErr) {
        // Ignore
      }
    }
    return { success: false, error: `${e.name}: ${e.message}` };
  }
}
/**
 * Entry point: parse CLI args, require an existing chrome_session, navigate,
 * then emit the extractor result protocol (START_TS/END_TS/DURATION/OUTPUT/
 * STATUS/ERROR lines plus a final RESULT_JSON line) and exit 0 on success,
 * 1 on failure. The output lines are machine-parsed — do not reorder them.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__30_chrome_navigate.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  try {
    // Check for chrome_session (this extractor cannot run standalone)
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      console.error('ERROR: chrome_session not found (cdp_url.txt missing)');
      console.error('chrome_navigate requires chrome_session to run first');
      process.exit(1);
    }
    // Get URL from chrome_session or use provided URL
    const targetUrl = getTargetUrl() || url;
    const result = await navigateToUrl(targetUrl, cdpUrl);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      console.log(`Page loaded: ${result.finalUrl}`);
      console.log(`HTTP status: ${result.status}`);
      if (result.redirectCount > 0) {
        console.log(`Redirects: ${result.redirectCount}`);
      }
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results (line-oriented protocol consumed by the orchestrator)
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Top-level error barrier: any unhandled rejection exits non-zero.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,80 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"CHROME_BINARY": {
"type": "string",
"default": "chromium",
"x-aliases": ["CHROMIUM_BINARY", "GOOGLE_CHROME_BINARY"],
"description": "Path to Chrome/Chromium binary"
},
"NODE_BINARY": {
"type": "string",
"default": "node",
"x-aliases": ["NODEJS_BINARY"],
"description": "Path to Node.js binary (for Puppeteer)"
},
"CHROME_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for Chrome operations in seconds"
},
"CHROME_HEADLESS": {
"type": "boolean",
"default": true,
"description": "Run Chrome in headless mode"
},
"CHROME_SANDBOX": {
"type": "boolean",
"default": true,
"description": "Enable Chrome sandbox (disable in Docker with --no-sandbox)"
},
"CHROME_RESOLUTION": {
"type": "string",
"default": "1440,2000",
"pattern": "^\\d+,\\d+$",
"x-fallback": "RESOLUTION",
"description": "Browser viewport resolution (width,height)"
},
"CHROME_USER_DATA_DIR": {
"type": "string",
"default": "",
"description": "Path to Chrome user data directory for persistent sessions"
},
"CHROME_USER_AGENT": {
"type": "string",
"default": "",
"x-fallback": "USER_AGENT",
"description": "User agent string for Chrome"
},
"CHROME_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra command-line arguments for Chrome (space-separated)"
},
"CHROME_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates"
},
"SAVE_SCREENSHOT": {
"type": "boolean",
"default": true,
"description": "Enable screenshot capture"
},
"SAVE_PDF": {
"type": "boolean",
"default": true,
"description": "Enable PDF generation"
},
"SAVE_DOM": {
"type": "boolean",
"default": true,
"description": "Enable DOM capture"
}
}
}

View File

@@ -0,0 +1,150 @@
#!/usr/bin/env python3
"""
Validation hook for Chrome/Chromium binary.
Runs at crawl start to verify Chrome is available.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
# Common Chrome/Chromium binary names and paths
# Names tried via PATH lookup, in preference order.
CHROME_NAMES = [
    'chromium',
    'chromium-browser',
    'google-chrome',
    'google-chrome-stable',
    'chrome',
]
# Absolute install locations checked when PATH lookup fails
# (macOS app bundles, Linux distro packages, snap).
CHROME_PATHS = [
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/usr/bin/chromium',
    '/usr/bin/chromium-browser',
    '/snap/bin/chromium',
    '/opt/google/chrome/chrome',
]
def get_binary_version(abspath: str) -> str | None:
"""Get version string from Chrome binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=10,
)
if result.returncode == 0 and result.stdout:
# Chrome version string: "Google Chrome 120.0.6099.109" or "Chromium 120.0.6099.109"
first_line = result.stdout.strip().split('\n')[0]
parts = first_line.split()
# Find version number (looks like 120.0.6099.109)
for part in parts:
if '.' in part and part[0].isdigit():
return part
return first_line[:32]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_chrome() -> dict | None:
    """Find Chrome/Chromium binary.

    Search order:
      1. CHROME_BINARY env var (if it points at an existing file)
      2. PATH lookup of each name in CHROME_NAMES
      3. Well-known absolute locations in CHROME_PATHS

    Returns:
        dict with name/abspath/version/sha256/binprovider keys, or None
        when no binary is found.
    """
    def _describe(abspath: str) -> dict:
        # All three discovery methods report the same record shape,
        # so build it in one place instead of duplicating the dict.
        return {
            'name': 'chrome',
            'abspath': abspath,
            'version': get_binary_version(abspath),
            'sha256': get_binary_hash(abspath),
            'binprovider': 'env',
        }

    # Check env var first
    env_path = os.environ.get('CHROME_BINARY', '')
    if env_path and Path(env_path).is_file():
        return _describe(env_path)
    # Try shutil.which for various names
    for name in CHROME_NAMES:
        abspath = shutil.which(name)
        if abspath:
            return _describe(abspath)
    # Check common paths
    for path in CHROME_PATHS:
        if Path(path).is_file():
            return _describe(path)
    return None
def main():
    """Report the discovered Chrome binary as JSONL, or request installation.

    Emits InstalledBinary + Machine config-update records when found
    (exit 0); otherwise emits a Dependency record and exits 1.
    """
    result = find_chrome()
    if result and result.get('abspath'):
        # Record the discovered binary itself.
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))
        # Persist the resolved path into the machine config.
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/CHROME_BINARY',
            'value': result['abspath'],
        }))
        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/CHROME_VERSION',
                'value': result['version'],
            }))
        sys.exit(0)
    else:
        # Ask the orchestrator to install Chrome via a known provider.
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'chrome',
            'bin_providers': 'apt,brew,env',
        }))
        # NOTE: was f-string with no placeholders; plain literal is correct.
        print("Chrome/Chromium binary not found", file=sys.stderr)
        sys.exit(1)
# Run as a standalone validation hook.
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,172 @@
#!/usr/bin/env python3
"""
Validate and compute derived Chrome config values.
This hook runs early in the Crawl lifecycle to:
1. Auto-detect Chrome binary location
2. Compute sandbox settings based on Docker detection
3. Validate binary availability and version
4. Set computed env vars for subsequent hooks
Output:
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
- InstalledBinary JSONL records to stdout when binaries are found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
# Chrome binary search order
# Candidate names resolved through the env PATH provider, in order.
CHROME_BINARY_NAMES = [
    'chromium',
    'chromium-browser',
    'google-chrome',
    'google-chrome-stable',
    'chrome',
]
def get_env(name: str, default: str = '') -> str:
    """Read an environment variable, stripped, falling back to ``default``."""
    return os.environ.get(name, default).strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean env var; unrecognized or missing values yield ``default``."""
    value = get_env(name, '').lower()
    if value in ('true', '1', 'yes', 'on'):
        return True
    if value in ('false', '0', 'no', 'off'):
        return False
    return default
def detect_docker() -> bool:
    """Detect if running inside Docker container.

    Checks the /.dockerenv marker, the IN_DOCKER env override, and the
    podman-style /run/.containerenv marker.
    """
    if os.path.exists('/.dockerenv'):
        return True
    if os.environ.get('IN_DOCKER', '').lower() in ('true', '1', 'yes'):
        return True
    return os.path.exists('/run/.containerenv')
def find_chrome_binary(configured: str, provider: EnvProvider) -> Binary | None:
    """Find Chrome binary using abx-pkg, checking configured path first.

    Tries ``configured`` (when non-empty) followed by CHROME_BINARY_NAMES;
    returns the first Binary that resolves to a real path, else None.
    """
    candidates = ([configured] if configured else []) + CHROME_BINARY_NAMES
    for name in candidates:
        try:
            binary = Binary(name=name, binproviders=[provider]).load()
        except Exception:
            continue
        if binary.abspath:
            return binary
    return None
def output_installed_binary(binary: Binary, name: str):
    """Emit an InstalledBinary JSONL record for ``binary`` on stdout."""
    machine_id = os.environ.get('MACHINE_ID', '')
    print(json.dumps({
        'type': 'InstalledBinary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'env',
        'machine_id': machine_id,
    }))
def main():
    """Validate Chrome config and emit derived values.

    Prints COMPUTED:KEY=VALUE lines (parsed by hooks.py into the env),
    InstalledBinary JSONL records for found binaries, WARNING:/ERROR: lines
    on stderr, and exits 1 when any error was recorded.
    """
    warnings = []
    errors = []
    computed = {}
    # Get config values
    chrome_binary = get_env('CHROME_BINARY', 'chromium')
    chrome_sandbox = get_env_bool('CHROME_SANDBOX', True)
    save_screenshot = get_env_bool('SAVE_SCREENSHOT', True)
    save_pdf = get_env_bool('SAVE_PDF', True)
    save_dom = get_env_bool('SAVE_DOM', True)
    # Compute USE_CHROME (derived from SAVE_* flags)
    use_chrome = save_screenshot or save_pdf or save_dom
    computed['USE_CHROME'] = str(use_chrome).lower()
    # Detect Docker and adjust sandbox
    in_docker = detect_docker()
    computed['IN_DOCKER'] = str(in_docker).lower()
    if in_docker and chrome_sandbox:
        warnings.append(
            "Running in Docker with CHROME_SANDBOX=true. "
            "Chrome may fail to start. Consider setting CHROME_SANDBOX=false."
        )
        # Auto-disable sandbox in Docker unless explicitly set
        # (empty get_env means the True above came from the default,
        # not from the user's environment).
        if not get_env('CHROME_SANDBOX'):
            computed['CHROME_SANDBOX'] = 'false'
    # Find Chrome binary using abx-pkg
    provider = EnvProvider()
    if use_chrome:
        chrome = find_chrome_binary(chrome_binary, provider)
        if not chrome or not chrome.abspath:
            errors.append(
                f"Chrome binary not found (tried: {chrome_binary}). "
                "Install Chrome/Chromium or set CHROME_BINARY path."
            )
            computed['CHROME_BINARY'] = ''
        else:
            computed['CHROME_BINARY'] = str(chrome.abspath)
            computed['CHROME_VERSION'] = str(chrome.version) if chrome.version else 'unknown'
            # Output InstalledBinary JSONL record for Chrome
            output_installed_binary(chrome, name='chrome')
    # Check Node.js for Puppeteer
    node_binary_name = get_env('NODE_BINARY', 'node')
    try:
        node = Binary(name=node_binary_name, binproviders=[provider]).load()
        node_path = str(node.abspath) if node.abspath else ''
    except Exception:
        node = None
        node_path = ''
    if use_chrome and not node_path:
        errors.append(
            f"Node.js not found (tried: {node_binary_name}). "
            "Install Node.js or set NODE_BINARY path for Puppeteer."
        )
    else:
        computed['NODE_BINARY'] = node_path
        if node and node.abspath:
            # Output InstalledBinary JSONL record for Node
            output_installed_binary(node, name='node')
    # Output computed values
    for key, value in computed.items():
        print(f"COMPUTED:{key}={value}")
    for warning in warnings:
        print(f"WARNING:{warning}", file=sys.stderr)
    for error in errors:
        print(f"ERROR:{error}", file=sys.stderr)
    sys.exit(1 if errors else 0)
# Run as a standalone validation hook.
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,350 @@
#!/usr/bin/env node
/**
* Start a Chrome browser session for use by other extractors.
*
* This extractor ONLY launches Chrome and creates a blank page - it does NOT navigate.
* Pre-load extractors (21-29) can connect via CDP to register listeners before navigation.
* The chrome_navigate extractor (30) performs the actual page load.
*
* Usage: on_Snapshot__20_chrome_session.js --url=<url> --snapshot-id=<uuid>
* Output: Creates chrome_session/ with:
* - cdp_url.txt: WebSocket URL for CDP connection
* - pid.txt: Chrome process ID (for cleanup)
* - page_id.txt: Target ID of the page for other extractors to use
* - url.txt: The URL to be navigated to (for chrome_navigate)
*
* Environment variables:
* CHROME_BINARY: Path to Chrome/Chromium binary
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
* CHROME_USER_AGENT: User agent string (optional)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
* CHROME_HEADLESS: Run in headless mode (default: true)
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Import extension utilities
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
// Extractor metadata
const EXTRACTOR_NAME = 'chrome_session';
const OUTPUT_DIR = 'chrome_session';
// Get extensions directory from environment or use default
// (<DATA_DIR>/personas/<ACTIVE_PERSONA>/chrome_extensions)
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
  path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
/**
 * Parse `--key=value` CLI flags into an object.
 * Dashes in key names become underscores; a bare `--flag` maps to `true`.
 */
function parseArgs() {
  return process.argv.slice(2).reduce((acc, token) => {
    if (token.startsWith('--')) {
      const [rawKey, ...rest] = token.slice(2).split('=');
      // Re-join so values containing '=' survive intact.
      acc[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
    }
    return acc;
  }, {});
}
// Get environment variable with default
// (empty-string env values fall back to the default, matching `||` semantics).
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}
// Parse a boolean env var; unrecognized values yield the default.
function getEnvBool(name, defaultValue = false) {
  switch (getEnv(name, '').toLowerCase()) {
    case 'true':
    case '1':
    case 'yes':
    case 'on':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
// Parse an integer env var; non-numeric values yield the default.
function getEnvInt(name, defaultValue = 0) {
  const parsed = Number.parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
// Find Chrome binary: CHROME_BINARY env var first, then well-known install
// locations. Returns an absolute path, or null when nothing exists.
function findChrome() {
  const configured = getEnv('CHROME_BINARY');
  if (configured && fs.existsSync(configured)) {
    return configured;
  }
  const candidates = [
    // Linux
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/usr/bin/chromium',
    '/usr/bin/chromium-browser',
    // macOS
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
  ];
  return candidates.find((candidate) => fs.existsSync(candidate)) || null;
}
// Parse a "width,height" resolution string; malformed or missing components
// fall back to 1440x2000.
function parseResolution(resolution) {
  const parts = resolution.split(',');
  const width = parseInt(parts[0].trim(), 10) || 1440;
  const height = parseInt((parts[1] ?? '').trim(), 10) || 2000;
  return { width, height };
}
// Load installed extensions from cache files.
// Scans EXTENSIONS_DIR for *.extension.json records written by extension
// plugins, keeping only entries whose unpacked manifest actually exists.
function loadInstalledExtensions() {
  if (!fs.existsSync(EXTENSIONS_DIR)) {
    return [];
  }
  const loaded = [];
  for (const file of fs.readdirSync(EXTENSIONS_DIR)) {
    if (!file.endsWith('.extension.json')) continue;
    try {
      const raw = fs.readFileSync(path.join(EXTENSIONS_DIR, file), 'utf-8');
      const extension = JSON.parse(raw);
      // Verify extension is actually installed
      const manifestPath = path.join(extension.unpacked_path, 'manifest.json');
      if (fs.existsSync(manifestPath)) {
        loaded.push(extension);
        console.log(`[+] Loaded extension: ${extension.name} (${extension.webstore_id})`);
      }
    } catch (e) {
      console.warn(`[⚠️] Failed to load extension from ${file}: ${e.message}`);
    }
  }
  return loaded;
}
/**
 * Launch a Chrome instance for this snapshot and leave it running.
 *
 * Creates a blank page but does NOT navigate to `url` — navigation happens
 * later in chrome_navigate, after pre-load extractors register CDP listeners.
 * Writes handshake files (cdp_url.txt, pid.txt, page_id.txt, url.txt, and
 * optionally extensions.json) into OUTPUT_DIR for subsequent extractors.
 *
 * @param {string} url - URL this snapshot will eventually visit (recorded to url.txt only).
 * @param {string} binary - Path to the Chrome/Chromium executable to launch.
 * @returns {Promise<{success: boolean, output?: string, cdpUrl?: string, targetId?: string, error?: string}>}
 */
async function startChromeSession(url, binary) {
  // CHROME_*-prefixed env vars win over the generic fallbacks.
  const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
  const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
  const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
  const headless = getEnvBool('CHROME_HEADLESS', true);
  const { width, height } = parseResolution(resolution);
  // Load installed extensions
  const extensions = loadInstalledExtensions();
  const extensionArgs = extensionUtils.getExtensionLaunchArgs(extensions);
  if (extensions.length > 0) {
    console.log(`[*] Loading ${extensions.length} Chrome extensions...`);
  }
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  let browser = null;
  try {
    // Launch browser with Puppeteer
    browser = await puppeteer.launch({
      executablePath: binary,
      headless: headless ? 'new' : false,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-sync',
        '--no-first-run',
        '--no-default-browser-check',
        '--disable-default-apps',
        '--disable-infobars',
        '--disable-blink-features=AutomationControlled',
        '--disable-component-update',
        '--disable-domain-reliability',
        '--disable-breakpad',
        '--disable-background-networking',
        '--disable-background-timer-throttling',
        '--disable-backgrounding-occluded-windows',
        '--disable-renderer-backgrounding',
        '--disable-ipc-flooding-protection',
        '--password-store=basic',
        '--use-mock-keychain',
        '--font-render-hinting=none',
        '--force-color-profile=srgb',
        `--window-size=${width},${height}`,
        ...(checkSsl ? [] : ['--ignore-certificate-errors']),
        ...extensionArgs,
      ],
      defaultViewport: { width, height },
    });
    // Get the WebSocket endpoint URL
    const cdpUrl = browser.wsEndpoint();
    fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), cdpUrl);
    // Write PID for cleanup
    const browserProcess = browser.process();
    if (browserProcess) {
      fs.writeFileSync(path.join(OUTPUT_DIR, 'pid.txt'), String(browserProcess.pid));
    }
    // Create a new page (but DON'T navigate yet)
    const page = await browser.newPage();
    // Set user agent if specified
    if (userAgent) {
      await page.setUserAgent(userAgent);
    }
    // Write the page target ID so other extractors can find this specific page
    // NOTE(review): target._targetId is a private puppeteer API — may break across versions; confirm against the pinned puppeteer release
    const target = page.target();
    const targetId = target._targetId;
    fs.writeFileSync(path.join(OUTPUT_DIR, 'page_id.txt'), targetId);
    // Write the URL for chrome_navigate to use
    fs.writeFileSync(path.join(OUTPUT_DIR, 'url.txt'), url);
    // Connect to loaded extensions at runtime (only if not already done)
    // extensions.json doubles as a "done" marker so re-runs skip this step.
    const extensionsFile = path.join(OUTPUT_DIR, 'extensions.json');
    if (extensions.length > 0 && !fs.existsSync(extensionsFile)) {
      console.log('[*] Connecting to loaded extensions (first time setup)...');
      try {
        const loadedExtensions = await extensionUtils.loadAllExtensionsFromBrowser(browser, extensions);
        // Write loaded extensions metadata for other extractors to use
        fs.writeFileSync(extensionsFile, JSON.stringify(loadedExtensions, null, 2));
        console.log(`[+] Extensions loaded and available at ${extensionsFile}`);
        console.log(`[+] ${loadedExtensions.length} extensions ready for configuration by subsequent plugins`);
      } catch (e) {
        // Best-effort: extension wiring failure shouldn't kill the session.
        console.warn(`[⚠️] Failed to load extensions from browser: ${e.message}`);
      }
    } else if (extensions.length > 0) {
      console.log('[*] Extensions already loaded from previous snapshot');
    }
    // Don't close browser - leave it running for other extractors
    // Detach puppeteer from browser so it stays running
    browser.disconnect();
    return { success: true, output: OUTPUT_DIR, cdpUrl, targetId };
  } catch (e) {
    // Kill browser if startup failed
    if (browser) {
      try {
        await browser.close();
      } catch (closeErr) {
        // Ignore
      }
    }
    return { success: false, error: `${e.name}: ${e.message}` };
  }
}
/**
 * CLI entry point for the chrome_session extractor.
 *
 * Parses --url/--snapshot-id, locates a Chrome binary, launches the session
 * via startChromeSession(), and prints KEY=VALUE lines plus a RESULT_JSON
 * record for the orchestrator to parse. Exits 0 on success, 1 on failure.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__20_chrome_session.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  let version = '';
  try {
    // chrome_session launches Chrome and creates a blank page
    // Pre-load extractors (21-29) register CDP listeners
    // chrome_navigate (30) performs actual navigation
    const binary = findChrome();
    if (!binary) {
      // Structured stderr lines tell the orchestrator how to install the dependency.
      console.error('ERROR: Chrome/Chromium binary not found');
      console.error('DEPENDENCY_NEEDED=chrome');
      console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew');
      console.error('INSTALL_HINT=npx @puppeteer/browsers install chrome@stable');
      process.exit(1);
    }
    // Get Chrome version (best effort; empty string on failure)
    try {
      const { execSync } = require('child_process');
      version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 }).trim().slice(0, 64);
    } catch (e) {
      version = '';
    }
    const result = await startChromeSession(url, binary);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      console.log(`Chrome session started (no navigation yet): ${result.cdpUrl}`);
      console.log(`Page target ID: ${result.targetId}`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results (KEY=VALUE lines are parsed by the orchestrator)
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (version) {
    console.log(`VERSION=${version}`);
  }
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    cmd_version: version,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Run the extractor; any unhandled rejection is fatal.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,297 @@
#!/usr/bin/env node
/**
* Capture console output from a page.
*
* Captures all console messages during page load:
* - log, warn, error, info, debug
* - Includes stack traces for errors
* - Timestamps for each message
*
* Usage: on_Snapshot__14_consolelog.js --url=<url> --snapshot-id=<uuid>
* Output: Writes consolelog/console.jsonl (one message per line)
*
* Environment variables:
* SAVE_CONSOLELOG: Enable console log capture (default: true)
* CONSOLELOG_TIMEOUT: Capture duration in seconds (default: 5)
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'consolelog';
const OUTPUT_DIR = 'consolelog';
const OUTPUT_FILE = 'console.jsonl';
const CHROME_SESSION_DIR = 'chrome_session';
// Parse `--key=value` CLI flags into a plain object; dashes in keys become
// underscores, and bare flags (no `=value`) map to `true`.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...rest] = token.slice(2).split('=');
    parsed[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return parsed;
}
// Read an environment variable (trimmed); empty/unset values fall back to defaultValue.
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}
// Interpret an env var as a boolean; unrecognized values yield defaultValue.
function getEnvBool(name, defaultValue = false) {
  switch (getEnv(name, '').toLowerCase()) {
    case 'true': case '1': case 'yes': case 'on':
      return true;
    case 'false': case '0': case 'no': case 'off':
      return false;
    default:
      return defaultValue;
  }
}
// Interpret an env var as a base-10 integer; non-numeric values yield defaultValue.
function getEnvInt(name, defaultValue = 0) {
  const parsed = Number.parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
// Read the CDP websocket URL written by the chrome_session extractor, or null if absent.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(cdpFile)) {
    return null;
  }
  return fs.readFileSync(cdpFile, 'utf8').trim();
}
// Best-effort conversion of Puppeteer JSHandle console args into JSON-safe
// values: jsonValue() first, then String(), then a placeholder.
async function serializeArgs(args) {
  const out = [];
  for (const handle of args) {
    try {
      out.push(await handle.jsonValue());
    } catch (jsonErr) {
      try {
        out.push(String(handle));
      } catch (strErr) {
        out.push('[Unserializable]');
      }
    }
  }
  return out;
}
// Capture console logs
/**
 * Attach console/error/request-failure listeners to the existing Chrome
 * session's page and record events for CONSOLELOG_TIMEOUT seconds.
 *
 * Each event is appended to consolelog/console.jsonl immediately (one JSON
 * object per line) so partial output survives a crash.
 *
 * @param {string} url - Snapshot URL (unused here; page comes from the shared session).
 * @returns {Promise<{success: boolean, output?: string, logCount?: number, logStats?: object, error?: string}>}
 */
async function captureConsoleLogs(url) {
  const captureTimeout = (getEnvInt('CONSOLELOG_TIMEOUT') || 5) * 1000;
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  // Clear existing file
  fs.writeFileSync(outputPath, '');
  let browser = null;
  const consoleLogs = [];
  try {
    // Connect to existing Chrome session
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
    }
    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });
    // Get the page (prefer one already navigated to an http(s) URL)
    const pages = await browser.pages();
    const page = pages.find(p => p.url().startsWith('http')) || pages[0];
    if (!page) {
      return { success: false, error: 'No page found in Chrome session' };
    }
    // Listen for console messages (log/warn/error/info/debug)
    page.on('console', async (msg) => {
      try {
        const type = msg.type();
        const text = msg.text();
        const location = msg.location();
        const args = await serializeArgs(msg.args());
        const logEntry = {
          timestamp: new Date().toISOString(),
          type,
          text,
          args,
          location: {
            url: location.url || '',
            lineNumber: location.lineNumber,
            columnNumber: location.columnNumber,
          },
        };
        // Write immediately to file
        fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
        consoleLogs.push(logEntry);
      } catch (e) {
        // Error processing console message, skip it
        console.error(`Error processing console message: ${e.message}`);
      }
    });
    // Listen for page errors (uncaught exceptions in the page)
    page.on('pageerror', (error) => {
      try {
        const logEntry = {
          timestamp: new Date().toISOString(),
          type: 'error',
          text: error.message,
          stack: error.stack || '',
          location: {},
        };
        fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
        consoleLogs.push(logEntry);
      } catch (e) {
        console.error(`Error processing page error: ${e.message}`);
      }
    });
    // Listen for request failures (network-level errors)
    page.on('requestfailed', (request) => {
      try {
        const failure = request.failure();
        const logEntry = {
          timestamp: new Date().toISOString(),
          type: 'request_failed',
          text: `Request failed: ${request.url()}`,
          error: failure ? failure.errorText : 'Unknown error',
          url: request.url(),
          location: {},
        };
        fs.appendFileSync(outputPath, JSON.stringify(logEntry) + '\n');
        consoleLogs.push(logEntry);
      } catch (e) {
        console.error(`Error processing request failure: ${e.message}`);
      }
    });
    // Wait to capture logs for the configured window
    await new Promise(resolve => setTimeout(resolve, captureTimeout));
    // Group logs by type for the summary message
    const logStats = consoleLogs.reduce((acc, log) => {
      acc[log.type] = (acc[log.type] || 0) + 1;
      return acc;
    }, {});
    return {
      success: true,
      output: outputPath,
      logCount: consoleLogs.length,
      logStats,
    };
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Detach only — the shared browser session must stay alive for other extractors.
    if (browser) {
      browser.disconnect();
    }
  }
}
/**
 * CLI entry point for the consolelog extractor.
 *
 * Validates --url/--snapshot-id, honors the SAVE_CONSOLELOG kill switch,
 * runs captureConsoleLogs(), and prints KEY=VALUE lines plus RESULT_JSON
 * for the orchestrator. Exits 0 on success/skip, 1 on failure.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__14_consolelog.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  let logCount = 0;
  try {
    // Check if enabled (feature flag — skipping is not a failure)
    if (!getEnvBool('SAVE_CONSOLELOG', true)) {
      console.log('Skipping console log (SAVE_CONSOLELOG=False)');
      status = 'skipped';
      const endTs = new Date();
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${endTs.toISOString()}`);
      console.log(`STATUS=${status}`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
      process.exit(0);
    }
    const result = await captureConsoleLogs(url);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      logCount = result.logCount || 0;
      // Human-readable per-type summary, e.g. "3 log, 1 error"
      const statsStr = Object.entries(result.logStats || {})
        .map(([type, count]) => `${count} ${type}`)
        .join(', ');
      console.log(`Captured ${logCount} console messages: ${statsStr}`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results (KEY=VALUE lines are parsed by the orchestrator)
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    log_count: logCount,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Run the extractor; any unhandled rejection is fatal.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,92 @@
#!/usr/bin/env python3
"""
Install a binary using a custom bash command.
This provider runs arbitrary shell commands to install binaries
that don't fit into standard package managers.
Usage: on_Dependency__install_using_custom_bash.py --dependency-id=<uuid> --bin-name=<name> --custom-cmd=<cmd>
Output: InstalledBinary JSONL record to stdout after installation
Environment variables:
MACHINE_ID: Machine UUID (set by orchestrator)
"""
import json
import os
import subprocess
import sys
import rich_click as click
from abx_pkg import Binary, EnvProvider
@click.command()
@click.option('--dependency-id', required=True, help="Dependency UUID")
@click.option('--bin-name', required=True, help="Binary name to install")
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', required=True, help="Custom bash command to run")
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str):
    """Install *bin_name* by running an arbitrary shell command.

    Runs --custom-cmd via the shell, then verifies the binary is reachable on
    PATH and prints an InstalledBinary JSONL record to stdout (all human
    logging goes to stderr). Exits 0 on success or allowed-provider skip,
    1 on any failure.
    """
    # Exit 0 (not an error) when the 'custom' provider isn't in the allowlist.
    if bin_providers != '*' and 'custom' not in bin_providers.split(','):
        click.echo(f"custom provider not allowed for {bin_name}", err=True)
        sys.exit(0)
    if not custom_cmd:
        click.echo("custom provider requires --custom-cmd", err=True)
        sys.exit(1)
    click.echo(f"Installing {bin_name} via custom command: {custom_cmd}", err=True)
    try:
        # NOTE: custom_cmd is operator-supplied and intentionally run through the shell.
        result = subprocess.run(
            custom_cmd,
            shell=True,
            capture_output=True,
            text=True,
            timeout=600,  # 10 minute timeout for custom installs
        )
        if result.returncode != 0:
            click.echo(f"Custom install failed: {result.stderr}", err=True)
            sys.exit(1)
    except subprocess.TimeoutExpired:
        click.echo("Custom install timed out", err=True)
        sys.exit(1)
    # Use abx-pkg to load the installed binary and get its info
    provider = EnvProvider()
    try:
        binary = Binary(name=bin_name, binproviders=[provider]).load()
    except Exception as e:
        click.echo(f"{bin_name} not found after custom install: {e}", err=True)
        sys.exit(1)
    if not binary.abspath:
        click.echo(f"{bin_name} not found after custom install", err=True)
        sys.exit(1)
    machine_id = os.environ.get('MACHINE_ID', '')
    # Output InstalledBinary JSONL record to stdout
    record = {
        'type': 'InstalledBinary',
        'name': bin_name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'custom',
        'machine_id': machine_id,
        'dependency_id': dependency_id,
    }
    print(json.dumps(record))
    # Log human-readable info to stderr
    click.echo(f"Installed {bin_name} at {binary.abspath}", err=True)
    click.echo(f" version: {binary.version}", err=True)
    sys.exit(0)
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,296 @@
#!/usr/bin/env node
/**
* Dump the DOM of a URL using Chrome/Puppeteer.
*
* If a Chrome session exists (from chrome_session extractor), connects to it via CDP.
* Otherwise launches a new Chrome instance.
*
* Usage: on_Snapshot__23_dom.js --url=<url> --snapshot-id=<uuid>
* Output: Writes dom/output.html
*
* Environment variables:
* CHROME_BINARY: Path to Chrome/Chromium binary
* CHROME_TIMEOUT: Timeout in seconds (default: 60)
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
* CHROME_USER_AGENT: User agent string (optional)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
* CHROME_HEADLESS: Run in headless mode (default: true)
* SAVE_DOM: Enable DOM extraction (default: true)
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'dom';
const OUTPUT_DIR = 'dom';
const OUTPUT_FILE = 'output.html';
const CHROME_SESSION_DIR = 'chrome_session';
// Parse command line arguments of the form --key[=value] into an object.
// Keys have dashes converted to underscores; valueless flags become `true`.
function parseArgs() {
  const result = {};
  process.argv.slice(2).forEach((token) => {
    if (!token.startsWith('--')) return;
    const pieces = token.slice(2).split('=');
    const key = pieces.shift().replace(/-/g, '_');
    result[key] = pieces.join('=') || true;
  });
  return result;
}
// Fetch a trimmed environment variable; empty/unset values use defaultValue.
function getEnv(name, defaultValue = '') {
  const value = process.env[name] || defaultValue;
  return value.trim();
}
// Interpret an env var as a boolean flag; unrecognized text yields defaultValue.
function getEnvBool(name, defaultValue = false) {
  const normalized = getEnv(name, '').toLowerCase();
  const TRUTHY = ['true', '1', 'yes', 'on'];
  const FALSY = ['false', '0', 'no', 'off'];
  if (TRUTHY.includes(normalized)) return true;
  if (FALSY.includes(normalized)) return false;
  return defaultValue;
}
// Interpret an env var as a base-10 integer, falling back on NaN.
function getEnvInt(name, defaultValue = 0) {
  const n = Number.parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(n) ? defaultValue : n;
}
// Check if staticfile extractor already downloaded this URL
const STATICFILE_DIR = 'staticfile';
function hasStaticFileOutput() {
  if (!fs.existsSync(STATICFILE_DIR)) {
    return false;
  }
  return fs.readdirSync(STATICFILE_DIR).length > 0;
}
// Return the CDP websocket URL left behind by chrome_session, or null.
function getCdpUrl() {
  const marker = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  return fs.existsSync(marker) ? fs.readFileSync(marker, 'utf8').trim() : null;
}
// Locate a Chrome/Chromium executable: $CHROME_BINARY override first,
// then a list of well-known absolute install paths. Returns null if none exist.
function findChrome() {
  const fromEnv = getEnv('CHROME_BINARY');
  if (fromEnv && fs.existsSync(fromEnv)) {
    return fromEnv;
  }
  const knownPaths = [
    // Linux
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/usr/bin/chromium',
    '/usr/bin/chromium-browser',
    // macOS
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
  ];
  for (const candidate of knownPaths) {
    if (candidate.startsWith('/') && fs.existsSync(candidate)) {
      return candidate;
    }
  }
  return null;
}
// Parse a "width,height" string; missing/invalid parts fall back to 1440x2000.
function parseResolution(resolution) {
  const parts = resolution.split(',').map((part) => parseInt(part.trim(), 10));
  return { width: parts[0] || 1440, height: parts[1] || 2000 };
}
/**
 * Save the page's rendered DOM to dom/output.html.
 *
 * Prefers connecting to the shared chrome_session browser over CDP (in which
 * case the page is assumed to be navigated already); otherwise launches a
 * fresh Chrome, navigates to `url` itself, and closes it afterwards.
 *
 * @param {string} url - URL to archive (navigated only in the fresh-launch path).
 * @returns {Promise<{success: boolean, output?: string, error?: string}>}
 */
async function dumpDom(url) {
  // CHROME_*-prefixed env vars win over the generic fallbacks.
  const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
  const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
  const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
  const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
  const headless = getEnvBool('CHROME_HEADLESS', true);
  const { width, height } = parseResolution(resolution);
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  let browser = null;
  let page = null;
  let connectedToSession = false;
  try {
    // Try to connect to existing Chrome session
    const cdpUrl = getCdpUrl();
    if (cdpUrl) {
      try {
        browser = await puppeteer.connect({
          browserWSEndpoint: cdpUrl,
          defaultViewport: { width, height },
        });
        connectedToSession = true;
        // Get existing pages or create new one
        const pages = await browser.pages();
        page = pages.find(p => p.url().startsWith('http')) || pages[0];
        if (!page) {
          page = await browser.newPage();
        }
        // Set viewport on the page
        await page.setViewport({ width, height });
      } catch (e) {
        // Connection failure is non-fatal: fall through to a fresh launch.
        console.error(`Failed to connect to CDP session: ${e.message}`);
        browser = null;
      }
    }
    // Fall back to launching new browser
    if (!browser) {
      const executablePath = findChrome();
      if (!executablePath) {
        return { success: false, error: 'Chrome binary not found' };
      }
      browser = await puppeteer.launch({
        executablePath,
        headless: headless ? 'new' : false,
        args: [
          '--no-sandbox',
          '--disable-setuid-sandbox',
          '--disable-dev-shm-usage',
          '--disable-gpu',
          `--window-size=${width},${height}`,
          ...(checkSsl ? [] : ['--ignore-certificate-errors']),
        ],
        defaultViewport: { width, height },
      });
      page = await browser.newPage();
      // Navigate to URL (only if we launched fresh browser)
      if (userAgent) {
        await page.setUserAgent(userAgent);
      }
      await page.goto(url, {
        waitUntil: 'networkidle2',
        timeout,
      });
    }
    // Get the full DOM content (serialized current document, not original response)
    const domContent = await page.content();
    // Sanity check: a real page should serialize to more than 100 bytes.
    if (domContent && domContent.length > 100) {
      fs.writeFileSync(outputPath, domContent, 'utf8');
      return { success: true, output: outputPath };
    } else {
      return { success: false, error: 'DOM content too short or empty' };
    }
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Only close browser if we launched it (not if we connected to session)
    if (browser && !connectedToSession) {
      await browser.close();
    }
  }
}
/**
 * CLI entry point for the dom extractor.
 *
 * Validates --url/--snapshot-id, honors the SAVE_DOM kill switch and the
 * staticfile short-circuit, runs dumpDom(), and prints KEY=VALUE lines plus
 * RESULT_JSON for the orchestrator. Exits 0 on success/skip, 1 on failure.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__23_dom.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  try {
    // Check if DOM is enabled (permanent skip - don't retry)
    if (!getEnvBool('SAVE_DOM', true)) {
      console.log('Skipping DOM (SAVE_DOM=False)');
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${new Date().toISOString()}`);
      console.log(`STATUS=skipped`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status: 'skipped', url, snapshot_id: snapshotId})}`);
      process.exit(0); // Permanent skip - feature disabled
    }
    // Check if staticfile extractor already handled this (permanent skip)
    if (hasStaticFileOutput()) {
      console.log(`Skipping DOM - staticfile extractor already downloaded this`);
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${new Date().toISOString()}`);
      console.log(`STATUS=skipped`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status: 'skipped', url, snapshot_id: snapshotId})}`);
      process.exit(0); // Permanent skip - staticfile already handled
    } else {
      const result = await dumpDom(url);
      if (result.success) {
        status = 'succeeded';
        output = result.output;
        const size = fs.statSync(output).size;
        console.log(`DOM saved (${size} bytes)`);
      } else {
        status = 'failed';
        error = result.error;
      }
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results (KEY=VALUE lines are parsed by the orchestrator)
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Run the extractor; any unhandled rejection is fatal.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,70 @@
#!/usr/bin/env python3
"""
Check if a binary is already available in the system PATH.
This is the simplest "provider" - it doesn't install anything,
it just discovers binaries that are already installed.
Usage: on_Dependency__install_using_env_provider.py --dependency-id=<uuid> --bin-name=<name>
Output: InstalledBinary JSONL record to stdout if binary found in PATH
Environment variables:
MACHINE_ID: Machine UUID (set by orchestrator)
"""
import json
import os
import sys
import rich_click as click
from abx_pkg import Binary, EnvProvider
@click.command()
@click.option('--dependency-id', required=True, help="Dependency UUID")
@click.option('--bin-name', required=True, help="Binary name to find")
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
def main(dependency_id: str, bin_name: str, bin_providers: str):
    """Discover *bin_name* on the system PATH and record it.

    Installs nothing: if the binary is already present, an InstalledBinary
    JSONL record is printed to stdout (human logging goes to stderr).
    Exits 0 on success or allowed-provider skip, 1 when not found.
    """
    # Check if env provider is allowed
    if bin_providers != '*' and 'env' not in bin_providers.split(','):
        click.echo(f"env provider not allowed for {bin_name}", err=True)
        sys.exit(0)  # Not an error, just skip
    # Use abx-pkg EnvProvider to find binary
    provider = EnvProvider()
    try:
        binary = Binary(name=bin_name, binproviders=[provider]).load()
    except Exception as e:
        click.echo(f"{bin_name} not found in PATH: {e}", err=True)
        sys.exit(1)
    if not binary.abspath:
        click.echo(f"{bin_name} not found in PATH", err=True)
        sys.exit(1)
    machine_id = os.environ.get('MACHINE_ID', '')
    # Output InstalledBinary JSONL record to stdout
    record = {
        'type': 'InstalledBinary',
        'name': bin_name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'env',
        'machine_id': machine_id,
        'dependency_id': dependency_id,
    }
    print(json.dumps(record))
    # Log human-readable info to stderr
    click.echo(f"Found {bin_name} at {binary.abspath}", err=True)
    click.echo(f" version: {binary.version}", err=True)
    sys.exit(0)
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,395 @@
#!/usr/bin/env python3
"""
Shared utilities for extractor hooks.
This module provides common functionality for all extractors to ensure
consistent behavior, output format, error handling, and timing.
All extractors should:
1. Import and use these utilities
2. Output consistent metadata (CMD, VERSION, OUTPUT, timing)
3. Write all files to $PWD
4. Return proper exit codes (0=success, 1=failure)
5. Be runnable standalone without any archivebox imports
"""
import json
import os
import shutil
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
# Static file extensions that generally don't need browser-based extraction
STATIC_EXTENSIONS = (
    '.pdf', '.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.ico',
    '.mp4', '.mp3', '.m4a', '.webm', '.mkv', '.avi', '.mov',
    '.zip', '.tar', '.gz', '.bz2', '.xz', '.7z', '.rar',
    '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
    '.exe', '.dmg', '.apk', '.deb', '.rpm',
)
def is_static_file(url: str) -> bool:
    """Return True when the URL path (query/fragment stripped, case-insensitive)
    ends in one of STATIC_EXTENSIONS."""
    path_only = url.lower().split('?')[0].split('#')[0]
    return path_only.endswith(STATIC_EXTENSIONS)
def get_env(name: str, default: str = '') -> str:
    """Return the environment variable *name* with whitespace stripped, or *default*."""
    value = os.environ.get(name, default)
    return value.strip()
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean env var: true/1/yes/on or false/0/no/off (case-insensitive);
    anything else yields *default*."""
    mapping = {
        'true': True, '1': True, 'yes': True, 'on': True,
        'false': False, '0': False, 'no': False, 'off': False,
    }
    return mapping.get(get_env(name, '').lower(), default)
def get_env_int(name: str, default: int = 0) -> int:
    """Parse an integer env var, falling back to *default* when missing or invalid."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
def find_binary(bin_name: str, env_var: str | None = None) -> str | None:
    """Resolve a binary path: explicit *env_var* override first, then PATH lookup."""
    if env_var:
        override = get_env(env_var)
        if override and os.path.isfile(override):
            return override
    return shutil.which(bin_name)
def get_version(binary: str, version_args: list[str] | None = None) -> str:
"""Get binary version string."""
if not binary or not os.path.isfile(binary):
return ''
args = version_args or ['--version']
try:
result = subprocess.run(
[binary] + args,
capture_output=True,
text=True,
timeout=10
)
# Return first non-empty line, truncated
for line in result.stdout.split('\n'):
line = line.strip()
if line:
return line[:64]
return ''
except Exception:
return ''
class ExtractorResult:
    """
    Tracks extractor execution and produces consistent output.
    Usage:
        result = ExtractorResult(name='wget', url=url)
        result.cmd = ['wget', url]
        result.version = '1.21'
        # ... do extraction ...
        result.output = 'example.com/index.html'
        result.status = 'succeeded'
        result.finish()
        sys.exit(result.exit_code)

    The KEY=VALUE lines and RESULT_JSON emitted by finish() are a parsing
    contract with hooks.py — do not change their names or order casually.
    """
    def __init__(self, name: str, url: str, snapshot_id: str = ''):
        # Identity of this extraction run
        self.name = name
        self.url = url
        self.snapshot_id = snapshot_id
        # Timing: start is recorded now; end is set by finish()
        self.start_ts = datetime.now(timezone.utc)
        self.end_ts: datetime | None = None
        # Command-under-test metadata (filled in by the extractor)
        self.cmd: list[str] = []
        self.version: str = ''
        self.output: str | Path | None = None
        self.status: str = 'failed'  # 'succeeded', 'failed', 'skipped'
        # Captured subprocess results
        self.stdout: str = ''
        self.stderr: str = ''
        self.returncode: int | None = None
        # Error description plus optional remediation hints
        self.error: str = ''
        self.hints: list[str] = []
        # Dependency info for missing binary
        self.dependency_needed: str = ''
        self.bin_providers: str = ''
    @property
    def duration(self) -> float:
        """Duration in seconds (live value until finish() is called)."""
        if self.end_ts:
            return (self.end_ts - self.start_ts).total_seconds()
        return (datetime.now(timezone.utc) - self.start_ts).total_seconds()
    @property
    def exit_code(self) -> int:
        """Exit code based on status."""
        if self.status == 'succeeded':
            return 0
        if self.status == 'skipped':
            return 0  # Skipped is not a failure
        return 1
    def finish(self, status: str | None = None):
        """Mark extraction as finished and print results."""
        self.end_ts = datetime.now(timezone.utc)
        if status:
            self.status = status
        self._print_results()
    def _print_results(self):
        """Print consistent output for hooks.py to parse."""
        import sys
        # Print timing
        print(f"START_TS={self.start_ts.isoformat()}")
        print(f"END_TS={self.end_ts.isoformat() if self.end_ts else ''}")
        print(f"DURATION={self.duration:.2f}")
        # Print command info
        if self.cmd:
            print(f"CMD={' '.join(str(c) for c in self.cmd)}")
        if self.version:
            print(f"VERSION={self.version}")
        # Print output path
        if self.output:
            print(f"OUTPUT={self.output}")
        # Print status
        print(f"STATUS={self.status}")
        # Print dependency info if needed (stderr, like errors)
        if self.dependency_needed:
            print(f"DEPENDENCY_NEEDED={self.dependency_needed}", file=sys.stderr)
            if self.bin_providers:
                print(f"BIN_PROVIDERS={self.bin_providers}", file=sys.stderr)
        # Print error info
        if self.error:
            print(f"ERROR={self.error}", file=sys.stderr)
        for hint in self.hints:
            print(f"HINT={hint}", file=sys.stderr)
        # Print JSON result for structured parsing
        result_json = {
            'extractor': self.name,
            'url': self.url,
            'snapshot_id': self.snapshot_id,
            'status': self.status,
            'start_ts': self.start_ts.isoformat(),
            'end_ts': self.end_ts.isoformat() if self.end_ts else None,
            'duration': round(self.duration, 2),
            'cmd': self.cmd,
            'cmd_version': self.version,
            'output': str(self.output) if self.output else None,
            'returncode': self.returncode,
            'error': self.error or None,
        }
        print(f"RESULT_JSON={json.dumps(result_json)}")
def run_shell_command(
cmd: list[str],
cwd: str | Path | None = None,
timeout: int = 60,
result: ExtractorResult | None = None,
) -> subprocess.CompletedProcess:
"""
Run a shell command with proper capturing and timing.
Updates result object if provided with stdout, stderr, returncode.
"""
cwd = cwd or Path.cwd()
try:
proc = subprocess.run(
cmd,
cwd=str(cwd),
capture_output=True,
timeout=timeout,
)
if result:
result.stdout = proc.stdout.decode('utf-8', errors='replace')
result.stderr = proc.stderr.decode('utf-8', errors='replace')
result.returncode = proc.returncode
return proc
except subprocess.TimeoutExpired as e:
if result:
result.error = f"Command timed out after {timeout} seconds"
result.stdout = e.stdout.decode('utf-8', errors='replace') if e.stdout else ''
result.stderr = e.stderr.decode('utf-8', errors='replace') if e.stderr else ''
raise
except Exception as e:
if result:
result.error = f"{type(e).__name__}: {e}"
raise
def chrome_args(
headless: bool = True,
sandbox: bool = False,
resolution: str = '1440,900',
user_agent: str = '',
check_ssl: bool = True,
user_data_dir: str = '',
profile_name: str = 'Default',
extra_args: list[str] | None = None,
) -> list[str]:
"""
Build Chrome/Chromium command line arguments.
Based on the old CHROME_CONFIG.chrome_args() implementation.
"""
args = [
# Disable unnecessary features
'--disable-sync',
'--no-pings',
'--no-first-run',
'--no-default-browser-check',
'--disable-default-apps',
'--disable-infobars',
'--disable-blink-features=AutomationControlled',
# Deterministic behavior
'--js-flags=--random-seed=1157259159',
'--deterministic-mode',
'--deterministic-fetch',
# Performance
'--disable-background-networking',
'--disable-background-timer-throttling',
'--disable-backgrounding-occluded-windows',
'--disable-renderer-backgrounding',
'--disable-ipc-flooding-protection',
# Disable prompts/popups
'--deny-permission-prompts',
'--disable-notifications',
'--disable-popup-blocking',
'--noerrdialogs',
# Security/privacy
'--disable-client-side-phishing-detection',
'--disable-domain-reliability',
'--disable-component-update',
'--safebrowsing-disable-auto-update',
'--password-store=basic',
'--use-mock-keychain',
# GPU/rendering
'--force-gpu-mem-available-mb=4096',
'--font-render-hinting=none',
'--force-color-profile=srgb',
'--disable-partial-raster',
'--disable-skia-runtime-opts',
'--disable-2d-canvas-clip-aa',
'--disable-lazy-loading',
# Media
'--use-fake-device-for-media-stream',
'--disable-gesture-requirement-for-media-playback',
]
if headless:
args.append('--headless=new')
if not sandbox:
args.extend([
'--no-sandbox',
'--no-zygote',
'--disable-dev-shm-usage',
'--disable-software-rasterizer',
])
if resolution:
args.append(f'--window-size={resolution}')
if not check_ssl:
args.extend([
'--disable-web-security',
'--ignore-certificate-errors',
])
if user_agent:
args.append(f'--user-agent={user_agent}')
if user_data_dir:
args.append(f'--user-data-dir={user_data_dir}')
args.append(f'--profile-directory={profile_name}')
if extra_args:
args.extend(extra_args)
return args
def chrome_cleanup_lockfile(user_data_dir: str | Path):
"""Remove Chrome SingletonLock file that can prevent browser from starting."""
if not user_data_dir:
return
lockfile = Path(user_data_dir) / 'SingletonLock'
try:
lockfile.unlink(missing_ok=True)
except Exception:
pass
# Common Chrome binary names to search for
# (tried in order on $PATH; first hit wins)
CHROME_BINARY_NAMES = [
    'google-chrome',
    'google-chrome-stable',
    'chromium',
    'chromium-browser',
    'chrome',
]
# Fixed macOS app-bundle install paths (these are not on $PATH by default)
CHROME_BINARY_NAMES_MACOS = [
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
]
def find_chrome() -> str | None:
    """Locate a Chrome/Chromium executable.

    Resolution order: explicit CHROME_BINARY env override, then common
    binary names on $PATH, then well-known macOS app-bundle paths.
    Returns the binary path, or None if nothing was found.
    """
    # 1. Explicit override via environment
    override = get_env('CHROME_BINARY')
    if override and os.path.isfile(override):
        return override
    # 2. Well-known names on $PATH
    for candidate in CHROME_BINARY_NAMES:
        found = shutil.which(candidate)
        if found:
            return found
    # 3. Fixed macOS install locations
    for app_path in CHROME_BINARY_NAMES_MACOS:
        if os.path.isfile(app_path):
            return app_path
    return None

View File

@@ -0,0 +1,31 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_FAVICON": {
"type": "boolean",
"default": true,
"description": "Enable favicon downloading"
},
"FAVICON_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for favicon fetch in seconds"
},
"FAVICON_USER_AGENT": {
"type": "string",
"default": "",
"x-fallback": "USER_AGENT",
"description": "User agent string"
},
"FAVICON_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates"
}
}
}

View File

@@ -0,0 +1,169 @@
#!/usr/bin/env python3
"""
Extract favicon from a URL.
Usage: on_Snapshot__favicon.py --url=<url> --snapshot-id=<uuid>
Output: Writes favicon.ico to $PWD
Environment variables:
TIMEOUT: Timeout in seconds (default: 30)
USER_AGENT: User agent string
Note: This extractor uses the 'requests' library which is bundled with ArchiveBox.
It can run standalone if requests is installed: pip install requests
"""
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urljoin, urlparse
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'favicon'
OUTPUT_DIR = 'favicon'
OUTPUT_FILE = 'favicon.ico'
def get_env(name: str, default: str = '') -> str:
    """Return env var `name` (or `default` when unset), with whitespace stripped."""
    raw = os.environ.get(name, default)
    return raw.strip()
def get_env_int(name: str, default: int = 0) -> int:
    """Return env var `name` parsed as an int, falling back to `default`
    when the variable is unset or not a valid integer."""
    raw = os.environ.get(name, str(default)).strip()
    try:
        return int(raw)
    except ValueError:
        return default
def get_favicon(url: str) -> tuple[bool, str | None, str]:
    """
    Fetch favicon from URL.

    Candidate order: icon URLs declared in the page's <link rel=icon> tags
    (highest priority), then the conventional well-known paths
    (/favicon.ico, /favicon.png, /apple-touch-icon.png), and finally
    Google's s2/favicons service. The first non-empty 2xx response is
    written to OUTPUT_FILE in the current working directory.

    Returns: (success, output_path, error_message)
    """
    try:
        import requests
    except ImportError:
        return False, None, 'requests library not installed'
    timeout = get_env_int('TIMEOUT', 30)
    user_agent = get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    headers = {'User-Agent': user_agent}
    # Build list of possible favicon URLs
    parsed = urlparse(url)
    base_url = f"{parsed.scheme}://{parsed.netloc}"
    favicon_urls = [
        urljoin(base_url, '/favicon.ico'),
        urljoin(base_url, '/favicon.png'),
        urljoin(base_url, '/apple-touch-icon.png'),
    ]
    # Try to extract favicon URL from HTML link tags
    try:
        response = requests.get(url, timeout=timeout, headers=headers)
        if response.ok:
            # Look for <link rel="icon" href="...">
            # (inserted at the front so declared icons beat the guessed paths)
            for match in re.finditer(
                r'<link[^>]+rel=["\'](?:shortcut )?icon["\'][^>]+href=["\']([^"\']+)["\']',
                response.text,
                re.I
            ):
                favicon_urls.insert(0, urljoin(url, match.group(1)))
            # Also check reverse order: href before rel
            for match in re.finditer(
                r'<link[^>]+href=["\']([^"\']+)["\'][^>]+rel=["\'](?:shortcut )?icon["\']',
                response.text,
                re.I
            ):
                favicon_urls.insert(0, urljoin(url, match.group(1)))
    except Exception:
        pass  # Continue with default favicon URLs
    # Try each URL until we find one that works
    for favicon_url in favicon_urls:
        try:
            # Per-candidate fetch has a fixed 15s budget, independent of TIMEOUT
            response = requests.get(favicon_url, timeout=15, headers=headers)
            if response.ok and len(response.content) > 0:
                Path(OUTPUT_FILE).write_bytes(response.content)
                return True, OUTPUT_FILE, ''
        except Exception:
            continue
    # Try Google's favicon service as fallback
    try:
        google_url = f'https://www.google.com/s2/favicons?domain={parsed.netloc}'
        response = requests.get(google_url, timeout=15, headers=headers)
        if response.ok and len(response.content) > 0:
            Path(OUTPUT_FILE).write_bytes(response.content)
            return True, OUTPUT_FILE, ''
    except Exception:
        pass
    return False, None, 'No favicon found'
@click.command()
@click.option('--url', required=True, help='URL to extract favicon from')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Extract favicon from a URL.

    Emits the extractor protocol on stdout (START_TS/END_TS/DURATION/
    OUTPUT/STATUS lines followed by a final RESULT_JSON line) and exits
    0 on success, 1 on failure. Errors go to stderr as ERROR=...
    """
    start_ts = datetime.now(timezone.utc)
    output = None
    status = 'failed'
    error = ''
    try:
        # Run extraction
        success, output, error = get_favicon(url)
        status = 'succeeded' if success else 'failed'
        if success:
            print(f'Favicon saved ({Path(output).stat().st_size} bytes)')
    except Exception as e:
        # Catch-all so a crash still produces the structured result lines below
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,262 @@
"""
Integration tests for favicon plugin
Tests verify:
1. Plugin script exists
2. requests library is available
3. Favicon extraction works for real example.com
4. Output file is actual image data
5. Tries multiple favicon URLs
6. Falls back to Google's favicon service
7. Config options work (TIMEOUT, USER_AGENT)
8. Handles failures gracefully
"""
import subprocess
import sys
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
FAVICON_HOOK = PLUGIN_DIR / 'on_Snapshot__11_favicon.py'
TEST_URL = 'https://example.com'
def test_hook_script_exists():
    """The favicon hook script must ship with the plugin."""
    hook = FAVICON_HOOK
    assert hook.exists(), f"Hook script not found: {hook}"
def test_requests_library_available():
    """`requests` must import cleanly and report a non-empty version string."""
    probe = subprocess.run(
        [sys.executable, '-c', 'import requests; print(requests.__version__)'],
        capture_output=True,
        text=True
    )
    if probe.returncode != 0:
        pytest.skip("requests library not installed")
    version_str = probe.stdout.strip()
    assert len(version_str) > 0, "Should report requests version"
def test_extracts_favicon_from_example_com():
    """Test full workflow: extract favicon from real example.com.
    Note: example.com doesn't have a favicon and Google's service may also fail,
    so we test that the extraction completes and reports appropriate status.
    """
    # NOTE(review): this is a live-network test; results vary with connectivity.
    # Check requests is available
    check_result = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True
    )
    if check_result.returncode != 0:
        pytest.skip("requests not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Run favicon extraction (hook writes favicon.ico into cwd)
        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        # May succeed (if Google service works) or fail (if no favicon)
        assert result.returncode in (0, 1), "Should complete extraction attempt"
        # Verify RESULT_JSON is present
        assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON"
        # If it succeeded, verify the favicon file
        if result.returncode == 0:
            assert 'STATUS=succeeded' in result.stdout, "Should report success"
            assert 'Favicon saved' in result.stdout, "Should report completion"
            favicon_file = tmpdir / 'favicon.ico'
            assert favicon_file.exists(), "favicon.ico not created"
            # Verify file is not empty and contains actual image data
            file_size = favicon_file.stat().st_size
            assert file_size > 0, "Favicon file should not be empty"
            assert file_size < 1024 * 1024, f"Favicon file suspiciously large: {file_size} bytes"
            # Check for common image magic bytes
            favicon_data = favicon_file.read_bytes()
            # ICO, PNG, GIF, JPEG, or WebP
            is_image = (
                favicon_data[:4] == b'\x00\x00\x01\x00' or  # ICO
                favicon_data[:8] == b'\x89PNG\r\n\x1a\n' or  # PNG
                favicon_data[:3] == b'GIF' or  # GIF
                favicon_data[:2] == b'\xff\xd8' or  # JPEG
                favicon_data[8:12] == b'WEBP'  # WebP
            )
            assert is_image, "Favicon file should be a valid image format"
        else:
            # Failed as expected
            assert 'STATUS=failed' in result.stdout
            assert 'No favicon found' in result.stdout or 'No favicon found' in result.stderr
def test_config_timeout_honored():
    """Test that TIMEOUT config is respected."""
    # NOTE(review): only proves the hook terminates within the 30s harness
    # timeout; it does not verify that the 5s TIMEOUT was actually applied.
    check_result = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True
    )
    if check_result.returncode != 0:
        pytest.skip("requests not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Set very short timeout (but example.com should still succeed)
        import os
        env = os.environ.copy()
        env['TIMEOUT'] = '5'
        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )
        # Should complete (success or fail, but not hang)
        assert result.returncode in (0, 1), "Should complete without hanging"
def test_config_user_agent():
    """Test that USER_AGENT config is used."""
    # NOTE(review): assertions only run on success, so a failure silently
    # passes; this test mainly guards against the env var crashing the hook.
    check_result = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True
    )
    if check_result.returncode != 0:
        pytest.skip("requests not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Set custom user agent
        import os
        env = os.environ.copy()
        env['USER_AGENT'] = 'TestBot/1.0'
        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'testua'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )
        # Should succeed (example.com doesn't block)
        if result.returncode == 0:
            assert 'STATUS=succeeded' in result.stdout
def test_handles_https_urls():
    """Test that HTTPS URLs work correctly."""
    # NOTE(review): all assertions are conditional on success and file
    # existence, so a network failure makes this test vacuously pass.
    check_result = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True
    )
    if check_result.returncode != 0:
        pytest.skip("requests not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', 'https://example.org', '--snapshot-id', 'testhttps'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        if result.returncode == 0:
            favicon_file = tmpdir / 'favicon.ico'
            if favicon_file.exists():
                assert favicon_file.stat().st_size > 0
def test_handles_missing_favicon_gracefully():
    """Test that favicon plugin handles sites without favicons gracefully.
    Note: The plugin falls back to Google's favicon service, which generates
    a generic icon even if the site doesn't have one, so extraction usually succeeds.
    """
    check_result = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True
    )
    if check_result.returncode != 0:
        pytest.skip("requests not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Try a URL that likely doesn't have a favicon
        result = subprocess.run(
            [sys.executable, str(FAVICON_HOOK), '--url', 'https://example.com/nonexistent', '--snapshot-id', 'test404'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        # May succeed (Google fallback) or fail gracefully
        assert result.returncode in (0, 1), "Should complete (may succeed or fail)"
        if result.returncode != 0:
            # On failure, the hook must have reported why (stdout or stderr)
            combined = result.stdout + result.stderr
            assert 'No favicon found' in combined or 'ERROR=' in combined
def test_reports_missing_requests_library():
    """Test that script reports error when requests library is missing."""
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Run with PYTHONPATH cleared to simulate missing requests
        import os
        env = os.environ.copy()
        # Keep only minimal PATH, clear PYTHONPATH
        env['PYTHONPATH'] = '/nonexistent'
        # -S skips site-packages, so third-party imports (requests, and
        # possibly rich_click itself) should fail inside the hook
        result = subprocess.run(
            [sys.executable, '-S', str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env
        )
        # Should fail and report missing requests
        if result.returncode != 0:
            combined = result.stdout + result.stderr
            # May report missing requests or other import errors
            assert 'requests' in combined.lower() or 'import' in combined.lower() or 'ERROR=' in combined
if __name__ == '__main__':
pytest.main([__file__, '-v'])

View File

@@ -0,0 +1,40 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_GIT": {
"type": "boolean",
"default": true,
"description": "Enable git repository cloning"
},
"GIT_BINARY": {
"type": "string",
"default": "git",
"description": "Path to git binary"
},
"GIT_TIMEOUT": {
"type": "integer",
"default": 120,
"minimum": 10,
"x-fallback": "TIMEOUT",
"description": "Timeout for git operations in seconds"
},
"GIT_DOMAINS": {
"type": "string",
"default": "github.com,gitlab.com,bitbucket.org,gist.github.com,codeberg.org,gitea.com,git.sr.ht",
"description": "Comma-separated list of domains to treat as git repositories"
},
"GIT_CLONE_DEPTH": {
"type": "integer",
"default": 1,
"minimum": 0,
"description": "Depth of git clone (0 for full history, 1 for shallow)"
},
"GIT_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for git clone"
}
}
}

View File

@@ -0,0 +1,126 @@
#!/usr/bin/env python3
"""
Validation hook for git binary.
Runs at crawl start to verify git is available.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str) -> str | None:
"""Get version string from binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
# git version string: "git version 2.43.0"
first_line = result.stdout.strip().split('\n')[0]
parts = first_line.split()
if len(parts) >= 3 and parts[0] == 'git':
return parts[2]
return first_line[:32]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_git() -> dict | None:
    """Locate the git binary and describe it.

    Prefers the abx_pkg Binary loader when that package is importable;
    otherwise falls back to a plain PATH search (or the GIT_BINARY env
    var). Returns a dict with name/abspath/version/sha256/binprovider
    keys, or None when git cannot be found.
    """
    try:
        from abx_pkg import Binary, EnvProvider

        class GitBinary(Binary):
            name: str = 'git'
            binproviders_supported = [EnvProvider()]

        loaded = GitBinary().load()
        if loaded and loaded.abspath:
            return {
                'name': 'git',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
                'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
            }
    except ImportError:
        pass  # abx_pkg not installed: use the env fallback below
    except Exception:
        pass  # any loader failure also falls through to the env fallback
    # Fallback: PATH lookup first, then an explicit GIT_BINARY override.
    candidate = shutil.which('git') or os.environ.get('GIT_BINARY', '')
    if not candidate or not Path(candidate).is_file():
        return None
    return {
        'name': 'git',
        'abspath': candidate,
        'version': get_binary_version(candidate),
        'sha256': get_binary_hash(candidate),
        'binprovider': 'env',
    }
def main():
    """Report git availability as JSONL records on stdout.

    Success: emits an InstalledBinary record plus Machine config updates
    (GIT_BINARY, and GIT_VERSION when known), then exits 0.
    Failure: emits a Dependency record so the orchestrator can attempt an
    install, prints a note to stderr, and exits 1.
    """
    result = find_git()
    if result and result.get('abspath'):
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/GIT_BINARY',
            'value': result['abspath'],
        }))
        # Only record the version when detection actually produced one
        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/GIT_VERSION',
                'value': result['version'],
            }))
        sys.exit(0)
    else:
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'git',
            'bin_providers': 'apt,brew,env',
        }))
        print(f"git binary not found", file=sys.stderr)
        sys.exit(1)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,193 @@
#!/usr/bin/env python3
"""
Clone a git repository from a URL.
Usage: on_Snapshot__git.py --url=<url> --snapshot-id=<uuid>
Output: Clones repository to $PWD/repo
Environment variables:
GIT_BINARY: Path to git binary
TIMEOUT: Timeout in seconds (default: 120)
GIT_ARGS: Extra arguments for git clone (space-separated)
"""
import json
import os
import shutil
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'git'
BIN_NAME = 'git'
BIN_PROVIDERS = 'apt,brew,env'
OUTPUT_DIR = 'repo'
def get_env(name: str, default: str = '') -> str:
    """Look up env var `name`; unset falls back to `default`. Result is stripped."""
    value = os.environ.get(name, default)
    return value.strip()
def get_env_int(name: str, default: int = 0) -> int:
    """Integer-valued env var lookup; unset or unparsable yields `default`."""
    raw = os.environ.get(name, str(default)).strip()
    try:
        return int(raw)
    except ValueError:
        return default
def is_git_url(url: str) -> bool:
    """Heuristically decide whether `url` points at a git repository.

    Matches known git hosting domains — kept in sync with the plugin's
    GIT_DOMAINS schema default (the original list was missing
    codeberg.org, gitea.com, and git.sr.ht) — plus obvious git markers
    like a '.git' suffix or git:// / ssh://git@ schemes.

    Purely substring-based, so it can over-match (e.g. any URL merely
    containing '.git'); callers treat the result as best-effort.
    """
    git_patterns = [
        '.git',
        'github.com',
        'gitlab.com',
        'bitbucket.org',
        'codeberg.org',
        'gitea.com',
        'git.sr.ht',
        'git://',
        'ssh://git@',
    ]
    lowered = url.lower()
    return any(pattern in lowered for pattern in git_patterns)
def find_git() -> str | None:
"""Find git binary."""
git = get_env('GIT_BINARY')
if git and os.path.isfile(git):
return git
return shutil.which('git')
def get_version(binary: str) -> str:
    """Return the first 64 chars of `<binary> --version` output, or '' on any failure."""
    try:
        proc = subprocess.run(
            [binary, '--version'],
            capture_output=True,
            text=True,
            timeout=10,
        )
    except Exception:
        return ''
    return proc.stdout.strip()[:64]
def clone_git(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Clone the git repository at `url` into OUTPUT_DIR using `binary`.

    Config (environment) — now honoring the names declared in this
    plugin's schema, with the old names kept as fallbacks:
      GIT_TIMEOUT (fallback TIMEOUT): timeout in seconds (default 120)
      GIT_CLONE_DEPTH: shallow-clone depth; 0 = full history (default 1,
          matching the previously hard-coded --depth=1)
      GIT_EXTRA_ARGS (fallback GIT_ARGS): extra space-separated clone args

    Returns: (success, output_path, error_message)
    """
    timeout = get_env_int('GIT_TIMEOUT', 0) or get_env_int('TIMEOUT', 120)
    depth = get_env_int('GIT_CLONE_DEPTH', 1)
    extra_args = get_env('GIT_EXTRA_ARGS') or get_env('GIT_ARGS')
    cmd = [
        binary,
        'clone',
        '--recursive',
    ]
    if depth > 0:
        # depth <= 0 means full history (no --depth flag at all)
        cmd.append(f'--depth={depth}')
    if extra_args:
        cmd.extend(extra_args.split())
    cmd.extend([url, OUTPUT_DIR])
    try:
        result = subprocess.run(cmd, capture_output=True, timeout=timeout)
        if result.returncode == 0 and Path(OUTPUT_DIR).is_dir():
            return True, OUTPUT_DIR, ''
        else:
            stderr = result.stderr.decode('utf-8', errors='replace')
            return False, None, f'git clone failed: {stderr[:200]}'
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='Git repository URL')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Clone a git repository from a URL.

    Emits the extractor protocol on stdout (START_TS/END_TS/DURATION/
    CMD/VERSION/OUTPUT/STATUS and a final RESULT_JSON line). Exit codes:
    0 on success or skip (non-git URL), 1 on failure or missing binary.
    """
    start_ts = datetime.now(timezone.utc)
    version = ''
    output = None
    status = 'failed'
    error = ''
    binary = None
    try:
        # Check if URL looks like a git repo
        if not is_git_url(url):
            # Non-git URLs are a clean skip, with a shortened protocol output
            print(f'Skipping git clone for non-git URL: {url}')
            status = 'skipped'
            end_ts = datetime.now(timezone.utc)
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={end_ts.isoformat()}')
            print(f'STATUS={status}')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url})}')
            # sys.exit raises SystemExit (a BaseException), so the
            # `except Exception` below does not intercept these exits
            sys.exit(0)
        # Find binary
        binary = find_git()
        if not binary:
            # Signal the orchestrator which dependency to install
            print(f'ERROR: git binary not found', file=sys.stderr)
            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
            sys.exit(1)
        version = get_version(binary)
        # Run extraction
        success, output, error = clone_git(url, binary)
        status = 'succeeded' if success else 'failed'
        if success:
            print(f'git clone completed')
    except Exception as e:
        # Catch-all so a crash still produces the structured result lines below
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if binary:
        print(f'CMD={binary} clone {url}')
    if version:
        print(f'VERSION={version}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,203 @@
#!/usr/bin/env node
/**
* Extract HTTP response headers for a URL.
*
* If a Chrome session exists (from chrome_session extractor), reads the captured
* response headers from chrome_session/response_headers.json.
* Otherwise falls back to making an HTTP HEAD request.
*
* Usage: on_Snapshot__12_headers.js --url=<url> --snapshot-id=<uuid>
* Output: Writes headers/headers.json
*
* Environment variables:
* TIMEOUT: Timeout in seconds (default: 30)
* USER_AGENT: User agent string (optional)
* CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
*/
const fs = require('fs');
const path = require('path');
const https = require('https');
const http = require('http');
// Extractor metadata
const EXTRACTOR_NAME = 'headers';
const OUTPUT_DIR = 'headers';
const OUTPUT_FILE = 'headers.json';
const CHROME_SESSION_DIR = 'chrome_session';
const CHROME_HEADERS_FILE = 'response_headers.json';
/**
 * Parse `--key=value` CLI flags from process.argv into a plain object.
 * Dashes in keys become underscores; a bare `--flag` maps to `true`.
 * @returns {Object<string, string|true>}
 */
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...rest] = token.slice(2).split('=');
    // Re-join so values containing '=' (e.g. URLs with queries) survive
    parsed[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return parsed;
}
/** Read env var `name`, falling back to `defaultValue`; result is trimmed. */
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}
/** Parse env var `name` as a boolean; unrecognized/unset yields `defaultValue`. */
function getEnvBool(name, defaultValue = false) {
  const val = (process.env[name] || '').trim().toLowerCase();
  const truthy = ['true', '1', 'yes', 'on'];
  const falsy = ['false', '0', 'no', 'off'];
  if (truthy.includes(val)) return true;
  if (falsy.includes(val)) return false;
  return defaultValue;
}
/** Parse env var `name` as a base-10 integer, else return `defaultValue`. */
function getEnvInt(name, defaultValue = 0) {
  const raw = (process.env[name] || String(defaultValue)).trim();
  const parsed = parseInt(raw, 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
/**
 * Load response headers previously captured by the chrome_session
 * extractor, if its output file exists. Returns the parsed JSON object,
 * or null when the file is missing or unparsable.
 */
function getHeadersFromChromeSession() {
  const headersFile = path.join(CHROME_SESSION_DIR, CHROME_HEADERS_FILE);
  if (!fs.existsSync(headersFile)) return null;
  try {
    return JSON.parse(fs.readFileSync(headersFile, 'utf8'));
  } catch (e) {
    // Corrupt or partial JSON from an interrupted chrome_session run
    return null;
  }
}
/**
 * Fetch response headers for `url` via an HTTP HEAD request (fallback
 * path when no chrome_session capture is available).
 *
 * Honors TIMEOUT (seconds), USER_AGENT, and CHECK_SSL_VALIDITY env vars.
 * Resolves with { url, status, statusText, headers }; rejects on
 * network error or timeout.
 *
 * @param {string} url - absolute http(s) URL
 * @returns {Promise<{url: string, status: number, statusText: string, headers: Object}>}
 */
function fetchHeaders(url) {
  return new Promise((resolve, reject) => {
    const timeout = getEnvInt('TIMEOUT', 30) * 1000;
    const userAgent = getEnv('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)');
    // FIX: was getEnvBool('CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true))
    // — the nested call was a redundant duplication; a single lookup is equivalent.
    const checkSsl = getEnvBool('CHECK_SSL_VALIDITY', true);
    const parsedUrl = new URL(url);
    const client = parsedUrl.protocol === 'https:' ? https : http;
    const options = {
      method: 'HEAD',
      hostname: parsedUrl.hostname,
      port: parsedUrl.port || (parsedUrl.protocol === 'https:' ? 443 : 80),
      path: parsedUrl.pathname + parsedUrl.search,
      headers: { 'User-Agent': userAgent },
      timeout,
      rejectUnauthorized: checkSsl, // only meaningful for https requests
    };
    const req = client.request(options, (res) => {
      resolve({
        url: url,
        status: res.statusCode,
        statusText: res.statusMessage,
        headers: res.headers,
      });
    });
    req.on('error', reject);
    req.on('timeout', () => {
      req.destroy();
      reject(new Error('Request timeout'));
    });
    req.end();
  });
}
/**
 * Write headers/headers.json for `url`, preferring the chrome_session
 * capture and falling back to a live HTTP HEAD request.
 * @returns {Promise<{success: boolean, output?: string, method?: string, status?: number, error?: string}>}
 */
async function extractHeaders(url) {
  // recursive mkdir is a no-op when the directory already exists
  fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  // Prefer headers already captured by the chrome_session extractor
  const cached = getHeadersFromChromeSession();
  if (cached && cached.headers) {
    fs.writeFileSync(outputPath, JSON.stringify(cached, null, 2), 'utf8');
    return { success: true, output: outputPath, method: 'chrome_session', status: cached.status };
  }
  // Fallback: make our own HEAD request
  try {
    const fetched = await fetchHeaders(url);
    fs.writeFileSync(outputPath, JSON.stringify(fetched, null, 2), 'utf8');
    return { success: true, output: outputPath, method: 'http', status: fetched.status };
  } catch (e) {
    return { success: false, error: e.message };
  }
}
/**
 * CLI entry point: extract headers and emit the extractor protocol on
 * stdout (START_TS/END_TS/DURATION/OUTPUT/STATUS plus a final
 * RESULT_JSON line). Exits 0 on success, 1 on failure/bad usage.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__12_headers.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  try {
    const result = await extractHeaders(url);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      console.log(`Headers extracted (${result.method}): HTTP ${result.status}`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    // Catch-all so a crash still produces the structured result lines below
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Top-level entry: any rejection escaping main() is fatal (exit code 1).
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,319 @@
"""
Integration tests for headers plugin
Tests verify:
1. Plugin script exists and is executable
2. Node.js is available
3. Headers extraction works for real example.com
4. Output JSON contains actual HTTP headers
5. Fallback to HTTP HEAD when chrome_session not available
6. Uses chrome_session headers when available
7. Config options work (TIMEOUT, USER_AGENT, CHECK_SSL_VALIDITY)
"""
import json
import shutil
import subprocess
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
HEADERS_HOOK = PLUGIN_DIR / 'on_Snapshot__33_headers.js'
TEST_URL = 'https://example.com'
def test_hook_script_exists():
    """The headers hook script must ship with the plugin."""
    hook_path = HEADERS_HOOK
    assert hook_path.exists(), f"Hook script not found: {hook_path}"
def test_node_is_available():
    """Node.js must be discoverable on PATH and runnable.

    Uses shutil.which() instead of shelling out to `which` so the probe
    is portable (Windows has no `which`) and consistent with the other
    tests in this file, which already gate on shutil.which('node').
    """
    binary_path = shutil.which('node')
    if not binary_path:
        pytest.skip("node not installed on system")
    assert Path(binary_path).exists(), f"Binary should exist at {binary_path}"
    # Verify the binary actually runs and reports a version like "v20.1.0"
    result = subprocess.run(
        ['node', '--version'],
        capture_output=True,
        text=True,
        timeout=10
    )
    assert result.returncode == 0, f"node not executable: {result.stderr}"
    assert result.stdout.startswith('v'), f"Unexpected node version format: {result.stdout}"
def test_extracts_headers_from_example_com():
    """Test full workflow: extract headers from real example.com."""
    # NOTE(review): live-network test; depends on example.com being reachable.
    # Check node is available
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Run headers extraction (no chrome_session dir -> hook uses HTTP HEAD)
        result = subprocess.run(
            ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        assert result.returncode == 0, f"Extraction failed: {result.stderr}"
        # Verify output in stdout
        assert 'STATUS=succeeded' in result.stdout, "Should report success"
        assert 'Headers extracted' in result.stdout, "Should report completion"
        # Verify output directory created
        headers_dir = tmpdir / 'headers'
        assert headers_dir.exists(), "Output directory not created"
        # Verify output file exists
        headers_file = headers_dir / 'headers.json'
        assert headers_file.exists(), "headers.json not created"
        # Verify headers JSON contains REAL example.com response
        headers_data = json.loads(headers_file.read_text())
        assert 'url' in headers_data, "Should have url field"
        assert headers_data['url'] == TEST_URL, f"URL should be {TEST_URL}"
        assert 'status' in headers_data, "Should have status field"
        assert headers_data['status'] in [200, 301, 302], \
            f"Should have valid HTTP status, got {headers_data['status']}"
        assert 'headers' in headers_data, "Should have headers field"
        assert isinstance(headers_data['headers'], dict), "Headers should be a dict"
        assert len(headers_data['headers']) > 0, "Headers dict should not be empty"
        # Verify common HTTP headers are present (case-insensitive lookup)
        headers_lower = {k.lower(): v for k, v in headers_data['headers'].items()}
        assert 'content-type' in headers_lower or 'content-length' in headers_lower, \
            "Should have at least one common HTTP header"
        # Verify RESULT_JSON is present and valid
        assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON"
        for line in result.stdout.split('\n'):
            if line.startswith('RESULT_JSON='):
                result_json = json.loads(line.replace('RESULT_JSON=', ''))
                assert result_json['extractor'] == 'headers'
                assert result_json['status'] == 'succeeded'
                assert result_json['url'] == TEST_URL
                assert result_json['snapshot_id'] == 'test789'
                assert 'duration' in result_json
                assert result_json['duration'] >= 0
                break
def test_uses_chrome_session_headers_when_available():
    """Test that headers plugin prefers chrome_session headers over HTTP HEAD."""
    # Fully offline: the mock chrome_session file should be used without
    # any network request being made by the hook.
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Create mock chrome_session directory with response_headers.json
        chrome_session_dir = tmpdir / 'chrome_session'
        chrome_session_dir.mkdir()
        mock_headers = {
            'url': TEST_URL,
            'status': 200,
            'statusText': 'OK',
            'headers': {
                'content-type': 'text/html; charset=UTF-8',
                'server': 'MockChromeServer',
                'x-test-header': 'from-chrome-session'
            }
        }
        headers_file = chrome_session_dir / 'response_headers.json'
        headers_file.write_text(json.dumps(mock_headers))
        # Run headers extraction
        result = subprocess.run(
            ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testchrome'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=30
        )
        assert result.returncode == 0, f"Extraction failed: {result.stderr}"
        assert 'STATUS=succeeded' in result.stdout, "Should report success"
        assert 'chrome_session' in result.stdout, "Should report using chrome_session method"
        # Verify it used chrome_session headers
        output_headers_file = tmpdir / 'headers' / 'headers.json'
        assert output_headers_file.exists(), "Output headers.json not created"
        output_data = json.loads(output_headers_file.read_text())
        assert output_data['headers']['x-test-header'] == 'from-chrome-session', \
            "Should use headers from chrome_session"
        assert output_data['headers']['server'] == 'MockChromeServer', \
            "Should use headers from chrome_session"
def test_falls_back_to_http_when_chrome_session_unavailable():
    """Test that headers plugin falls back to HTTP HEAD when chrome_session unavailable."""
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Don't create chrome_session directory - force HTTP fallback
        # Run headers extraction
        result = subprocess.run(
            ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        assert result.returncode == 0, f"Extraction failed: {result.stderr}"
        assert 'STATUS=succeeded' in result.stdout, "Should report success"
        # NOTE(review): this assertion is very weak — stdout always contains
        # 'http' (the URL itself), so the left side is effectively always true.
        # Consider asserting "(http)" (the method tag printed by the hook).
        assert 'http' in result.stdout.lower() or 'HEAD' not in result.stdout, \
            "Should use HTTP method"
        # Verify output exists and has real HTTP headers
        output_headers_file = tmpdir / 'headers' / 'headers.json'
        assert output_headers_file.exists(), "Output headers.json not created"
        output_data = json.loads(output_headers_file.read_text())
        assert output_data['url'] == TEST_URL
        assert output_data['status'] in [200, 301, 302]
        assert isinstance(output_data['headers'], dict)
        assert len(output_data['headers']) > 0
def test_config_timeout_honored():
    """Test that TIMEOUT config is respected."""
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Set very short timeout (but example.com should still succeed)
        import os
        env = os.environ.copy()
        env['TIMEOUT'] = '5'
        result = subprocess.run(
            ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            # Outer 30s guard: if the hook ignores TIMEOUT and hangs,
            # subprocess.run raises TimeoutExpired and fails the test.
            timeout=30
        )
        # Should complete (success or fail, but not hang)
        assert result.returncode in (0, 1), "Should complete without hanging"
def test_config_user_agent():
    """Test that USER_AGENT config is used."""
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Set custom user agent
        import os
        env = os.environ.copy()
        env['USER_AGENT'] = 'TestBot/1.0'
        result = subprocess.run(
            ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testua'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )
        # Should succeed (example.com doesn't block)
        # NOTE(review): this only checks the hook did not crash with a custom
        # UA; it does not verify the UA was actually sent on the wire.
        if result.returncode == 0:
            assert 'STATUS=succeeded' in result.stdout
def test_handles_https_urls():
    """Test that HTTPS URLs work correctly."""
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        result = subprocess.run(
            ['node', str(HEADERS_HOOK), '--url=https://example.org', '--snapshot-id=testhttps'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        # Best-effort network test: all assertions are gated on success so a
        # flaky network does not fail the suite.
        if result.returncode == 0:
            output_headers_file = tmpdir / 'headers' / 'headers.json'
            if output_headers_file.exists():
                output_data = json.loads(output_headers_file.read_text())
                assert output_data['url'] == 'https://example.org'
                assert output_data['status'] in [200, 301, 302]
def test_handles_404_gracefully():
    """Test that headers plugin handles 404s gracefully."""
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        result = subprocess.run(
            ['node', str(HEADERS_HOOK), '--url=https://example.com/nonexistent-page-404', '--snapshot-id=test404'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        # May succeed or fail depending on server behavior
        # If it succeeds, verify 404 status is captured
        if result.returncode == 0:
            output_headers_file = tmpdir / 'headers' / 'headers.json'
            if output_headers_file.exists():
                output_data = json.loads(output_headers_file.read_text())
                assert output_data['status'] == 404, "Should capture 404 status"
# Allow running this test module directly without the pytest CLI.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -0,0 +1,182 @@
#!/usr/bin/env python3
"""
Convert HTML to plain text for search indexing.
This extractor reads HTML from other extractors (wget, singlefile, dom)
and converts it to plain text for full-text search.
Usage: on_Snapshot__htmltotext.py --url=<url> --snapshot-id=<uuid>
Output: Writes htmltotext.txt to $PWD
Environment variables:
TIMEOUT: Timeout in seconds (not used, but kept for consistency)
Note: This extractor does not require any external binaries.
It uses Python's built-in html.parser module.
"""
import json
import os
import re
import sys
from datetime import datetime, timezone
from html.parser import HTMLParser
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'htmltotext'
OUTPUT_DIR = 'htmltotext'
OUTPUT_FILE = 'htmltotext.txt'
class HTMLTextExtractor(HTMLParser):
    """Extract visible text content from HTML, ignoring scripts/styles.

    Fix: the previous implementation only checked the *immediately* open tag,
    so text inside a child of a skipped region leaked through (e.g. the
    <title> inside <head>, or a <p> inside <noscript>). A depth counter now
    suppresses the entire subtree of each skipped container tag.
    """

    # Container tags whose whole subtree is skipped. Void tags such as
    # <meta>/<link> never get an end tag, so they must NOT be depth-tracked
    # (the counter would never unwind); they are handled by the per-tag
    # check in handle_data() instead.
    CONTAINER_SKIP_TAGS = {'script', 'style', 'head', 'noscript'}

    def __init__(self):
        super().__init__()
        self.result = []
        # Kept for backward compatibility with any external readers.
        self.skip_tags = {'script', 'style', 'head', 'meta', 'link', 'noscript'}
        self.current_tag = None
        self._skip_depth = 0  # >0 while inside a skipped subtree

    def handle_starttag(self, tag, attrs):
        self.current_tag = tag.lower()
        if self.current_tag in self.CONTAINER_SKIP_TAGS:
            self._skip_depth += 1
        elif self.current_tag == 'body':
            # Safety net for malformed HTML that never closes <head>:
            # entering <body> always re-enables text collection.
            self._skip_depth = 0

    def handle_endtag(self, tag):
        if tag.lower() in self.CONTAINER_SKIP_TAGS and self._skip_depth > 0:
            self._skip_depth -= 1
        self.current_tag = None

    def handle_data(self, data):
        # Suppress text inside skipped subtrees, plus text directly inside
        # a non-container skip tag (meta/link).
        if self._skip_depth == 0 and self.current_tag not in self.skip_tags:
            text = data.strip()
            if text:
                self.result.append(text)

    def get_text(self) -> str:
        return ' '.join(self.result)
def html_to_text(html: str) -> str:
    """Convert an HTML document to whitespace-joined plain text.

    Uses the stdlib HTML parser; if parsing blows up on pathological input,
    falls back to crude regex-based tag stripping.
    """
    extractor = HTMLTextExtractor()
    try:
        extractor.feed(html)
        return extractor.get_text()
    except Exception:
        # Fallback path: drop script/style bodies, then every remaining tag,
        # and collapse runs of whitespace.
        stripped = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
        stripped = re.sub(r'<style[^>]*>.*?</style>', '', stripped, flags=re.DOTALL | re.IGNORECASE)
        stripped = re.sub(r'<[^>]+>', ' ', stripped)
        stripped = re.sub(r'\s+', ' ', stripped)
        return stripped.strip()
def find_html_source() -> str | None:
    """Locate HTML saved by a sibling extractor and return its contents.

    Hooks run inside the snapshot directory, so sibling extractor outputs
    live in subdirectories; preferred sources first (singlefile > dom > wget).
    Returns None when no non-empty, readable HTML file is found.
    """
    patterns = (
        'singlefile/singlefile.html',
        'singlefile/*.html',
        'dom/output.html',
        'dom/*.html',
        'wget/**/*.html',
        'wget/**/*.htm',
    )
    root = Path.cwd()
    for pattern in patterns:
        for candidate in root.glob(pattern):
            if not candidate.is_file() or candidate.stat().st_size == 0:
                continue
            try:
                return candidate.read_text(errors='ignore')
            except Exception:
                # Unreadable file - keep scanning for another source.
                continue
    return None
def extract_htmltotext(url: str) -> tuple[bool, str | None, str]:
    """
    Extract plain text from HTML saved by earlier extractors.

    Returns: (success, output_path, error_message)
    """
    source_html = find_html_source()
    if not source_html:
        return False, None, 'No HTML source found (run singlefile, dom, or wget first)'
    plain_text = html_to_text(source_html)
    # Fewer than 10 characters is treated as "nothing useful to index".
    if not plain_text or len(plain_text) < 10:
        return False, None, 'No meaningful text extracted from HTML'
    destination_dir = Path(OUTPUT_DIR)
    destination_dir.mkdir(exist_ok=True)
    destination = destination_dir / OUTPUT_FILE
    destination.write_text(plain_text, encoding='utf-8')
    return True, str(destination), ''
@click.command()
@click.option('--url', required=True, help='URL that was archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Convert HTML to plain text for search indexing.

    Emits machine-parseable key=value lines (START_TS/END_TS/DURATION/
    OUTPUT/STATUS/RESULT_JSON) on stdout for the orchestrator; exits 0 on
    success and 1 on failure.
    """
    start_ts = datetime.now(timezone.utc)
    output = None
    status = 'failed'
    error = ''
    try:
        # Run extraction
        success, output, error = extract_htmltotext(url)
        status = 'succeeded' if success else 'failed'
        if success:
            # NOTE(review): st_size is a byte count, not a character count -
            # the message is approximate for non-ASCII text.
            text_len = Path(output).stat().st_size
            print(f'Extracted {text_len} characters of text')
    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)

View File

@@ -0,0 +1,115 @@
#!/usr/bin/env node
/**
* I Still Don't Care About Cookies Extension Plugin
*
* Installs and configures the "I still don't care about cookies" Chrome extension
* for automatic cookie consent banner dismissal during page archiving.
*
* Extension: https://chromewebstore.google.com/detail/edibdbjcniadpccecjdfdjjppcpchdlm
*
* Priority: 02 (early) - Must install before Chrome session starts
* Hook: on_Snapshot
*
* This extension automatically:
* - Dismisses cookie consent popups
* - Removes cookie banners
* - Accepts necessary cookies to proceed with browsing
* - Works on thousands of websites out of the box
*/
const path = require('path');
const fs = require('fs');
// Import extension utilities
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
// Extension metadata
const EXTENSION = {
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
name: 'istilldontcareaboutcookies',
};
// Get extensions directory from environment or use default
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
/**
 * Download (or load from cache) the "I still don't care about cookies"
 * extension via the shared chrome_extensions utilities.
 *
 * @returns {Promise<object|null>} extension metadata, or null on failure
 */
async function installCookiesExtension() {
  console.log('[*] Installing I Still Don\'t Care About Cookies extension...');
  const installed = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
  if (!installed) {
    console.error('[❌] Failed to install I Still Don\'t Care About Cookies extension');
    return null;
  }
  console.log('[+] I Still Don\'t Care About Cookies extension installed');
  console.log('[+] Cookie banners will be automatically dismissed during archiving');
  return installed;
}
/**
* Note: This extension works out of the box with no configuration needed.
* It automatically detects and dismisses cookie banners on page load.
*/
/**
 * Main entry point - install extension before archiving.
 *
 * Fast path: if a cache JSON exists and its unpacked_path still contains a
 * manifest.json, reuse it without touching the network. Otherwise install
 * fresh and write the metadata back so chrome_session can load it.
 *
 * @returns {Promise<object|null>} extension metadata, or null on failure
 */
async function main() {
  // Check if extension is already cached
  const cacheFile = path.join(EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
  if (fs.existsSync(cacheFile)) {
    try {
      const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
      // Cache is only trusted if the unpacked extension still has a manifest
      const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
      if (fs.existsSync(manifestPath)) {
        console.log('[*] I Still Don\'t Care About Cookies extension already installed (using cache)');
        return cached;
      }
    } catch (e) {
      // Cache file corrupted, re-install
      console.warn('[⚠️] Extension cache corrupted, re-installing...');
    }
  }
  // Install extension
  const extension = await installCookiesExtension();
  // Export extension metadata for chrome_session to load
  if (extension) {
    // Write extension info to a cache file that chrome_session can read
    await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
    await fs.promises.writeFile(
      cacheFile,
      JSON.stringify(extension, null, 2)
    );
    console.log(`[+] Extension metadata written to ${cacheFile}`);
  }
  return extension;
}
// Export functions for use by other plugins
module.exports = {
  EXTENSION,
  installCookiesExtension,
};
// Run if executed directly (hook-runner invokes this file as a script);
// exit code signals setup success/failure to the orchestrator.
if (require.main === module) {
  main().then(() => {
    console.log('[✓] I Still Don\'t Care About Cookies extension setup complete');
    process.exit(0);
  }).catch(err => {
    console.error('[❌] I Still Don\'t Care About Cookies extension setup failed:', err);
    process.exit(1);
  });
}

View File

@@ -0,0 +1,279 @@
/**
* Unit tests for istilldontcareaboutcookies plugin
*
* Run with: node --test tests/test_istilldontcareaboutcookies.js
*/
const assert = require('assert');
const fs = require('fs');
const path = require('path');
const { describe, it, before, after, beforeEach, afterEach } = require('node:test');
// Test fixtures
const TEST_DIR = path.join(__dirname, '.test_fixtures');
const TEST_EXTENSIONS_DIR = path.join(TEST_DIR, 'chrome_extensions');
describe('istilldontcareaboutcookies plugin', () => {
before(() => {
if (!fs.existsSync(TEST_DIR)) {
fs.mkdirSync(TEST_DIR, { recursive: true });
}
});
after(() => {
if (fs.existsSync(TEST_DIR)) {
fs.rmSync(TEST_DIR, { recursive: true, force: true });
}
});
describe('EXTENSION metadata', () => {
it('should have correct webstore_id', () => {
const { EXTENSION } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
assert.strictEqual(EXTENSION.webstore_id, 'edibdbjcniadpccecjdfdjjppcpchdlm');
});
it('should have correct name', () => {
const { EXTENSION } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
assert.strictEqual(EXTENSION.name, 'istilldontcareaboutcookies');
});
});
describe('installCookiesExtension', () => {
beforeEach(() => {
process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
});
afterEach(() => {
if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
delete process.env.CHROME_EXTENSIONS_DIR;
});
it('should use cached extension if available', async () => {
const { installCookiesExtension } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
// Create fake cache
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_cookies');
fs.mkdirSync(fakeExtensionDir, { recursive: true });
fs.writeFileSync(
path.join(fakeExtensionDir, 'manifest.json'),
JSON.stringify({ version: '1.1.8' })
);
const fakeCache = {
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
name: 'istilldontcareaboutcookies',
unpacked_path: fakeExtensionDir,
version: '1.1.8'
};
fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));
const result = await installCookiesExtension();
assert.notStrictEqual(result, null);
assert.strictEqual(result.webstore_id, 'edibdbjcniadpccecjdfdjjppcpchdlm');
});
it('should not require any configuration', async () => {
// This extension works out of the box
// No API keys or config needed
const { EXTENSION } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
assert.ok(EXTENSION);
// No config fields should be required
});
});
describe('cache file creation', () => {
beforeEach(() => {
process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
});
afterEach(() => {
if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
delete process.env.CHROME_EXTENSIONS_DIR;
});
it('should create cache file with correct extension name', async () => {
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
// Create mock extension
const mockExtension = {
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
name: 'istilldontcareaboutcookies',
version: '1.1.9'
};
await fs.promises.writeFile(cacheFile, JSON.stringify(mockExtension, null, 2));
assert.ok(fs.existsSync(cacheFile));
const cache = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
assert.strictEqual(cache.name, 'istilldontcareaboutcookies');
});
it('should use correct filename pattern', () => {
const expectedPattern = 'istilldontcareaboutcookies.extension.json';
const cacheFile = path.join(TEST_EXTENSIONS_DIR, expectedPattern);
// Pattern should match expected format
assert.ok(path.basename(cacheFile).endsWith('.extension.json'));
assert.ok(path.basename(cacheFile).includes('istilldontcareaboutcookies'));
});
});
describe('extension functionality', () => {
it('should work automatically without configuration', () => {
// This extension automatically dismisses cookie banners
// No manual trigger or configuration needed
const features = {
automaticBannerDismissal: true,
requiresConfiguration: false,
requiresApiKey: false,
requiresUserAction: false
};
assert.strictEqual(features.automaticBannerDismissal, true);
assert.strictEqual(features.requiresConfiguration, false);
assert.strictEqual(features.requiresApiKey, false);
assert.strictEqual(features.requiresUserAction, false);
});
it('should not require any runtime hooks', () => {
// Extension works purely via Chrome's content script injection
// No need for additional hooks or configuration
const requiresHooks = {
preNavigation: false,
postNavigation: false,
onPageLoad: false
};
assert.strictEqual(requiresHooks.preNavigation, false);
assert.strictEqual(requiresHooks.postNavigation, false);
assert.strictEqual(requiresHooks.onPageLoad, false);
});
});
describe('priority and execution order', () => {
it('should have priority 02 (early)', () => {
const filename = 'on_Snapshot__02_istilldontcareaboutcookies.js';
// Extract priority from filename
const match = filename.match(/on_Snapshot__(\d+)_/);
assert.ok(match);
const priority = parseInt(match[1]);
assert.strictEqual(priority, 2);
});
it('should run before chrome_session (priority 20)', () => {
const extensionPriority = 2;
const chromeSessionPriority = 20;
assert.ok(extensionPriority < chromeSessionPriority);
});
});
describe('error handling', () => {
beforeEach(() => {
process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
});
afterEach(() => {
if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
delete process.env.CHROME_EXTENSIONS_DIR;
});
it('should handle corrupted cache gracefully', async () => {
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
// Create corrupted cache
fs.writeFileSync(cacheFile, 'invalid json content');
// Should detect corruption and proceed with fresh install
const { installCookiesExtension } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
// Mock loadOrInstallExtension to avoid actual download
const extensionUtils = require('../../chrome_extensions/chrome_extension_utils.js');
const originalFunc = extensionUtils.loadOrInstallExtension;
extensionUtils.loadOrInstallExtension = async () => ({
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
name: 'istilldontcareaboutcookies',
version: '1.1.9'
});
const result = await installCookiesExtension();
extensionUtils.loadOrInstallExtension = originalFunc;
assert.notStrictEqual(result, null);
});
it('should handle missing manifest gracefully', async () => {
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_cookies_no_manifest');
// Create directory without manifest
fs.mkdirSync(fakeExtensionDir, { recursive: true });
const fakeCache = {
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
name: 'istilldontcareaboutcookies',
unpacked_path: fakeExtensionDir
};
fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));
const { installCookiesExtension } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
// Mock to return fresh extension when manifest missing
const extensionUtils = require('../../chrome_extensions/chrome_extension_utils.js');
const originalFunc = extensionUtils.loadOrInstallExtension;
let freshInstallCalled = false;
extensionUtils.loadOrInstallExtension = async () => {
freshInstallCalled = true;
return {
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
name: 'istilldontcareaboutcookies',
version: '1.1.9'
};
};
const result = await installCookiesExtension();
extensionUtils.loadOrInstallExtension = originalFunc;
// Should trigger fresh install when manifest missing
assert.ok(freshInstallCalled || result);
});
});
});

View File

@@ -0,0 +1,122 @@
"""
Unit tests for istilldontcareaboutcookies plugin
Tests invoke the plugin hook as an external process and verify outputs/side effects.
"""
import json
import os
import subprocess
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__02_istilldontcareaboutcookies.js"
def test_install_script_exists():
    """Verify install script exists"""
    # Sanity check: catches path drift if the hook file is renamed.
    assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}"
def test_extension_metadata():
    """Test that extension has correct metadata"""
    with tempfile.TemporaryDirectory() as tmpdir:
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")
        # Require the module from node and dump its EXTENSION export as JSON,
        # so the assertion runs against the real exported object.
        result = subprocess.run(
            ["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"],
            capture_output=True,
            text=True,
            env=env
        )
        assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"
        metadata = json.loads(result.stdout)
        assert metadata["webstore_id"] == "edibdbjcniadpccecjdfdjjppcpchdlm"
        assert metadata["name"] == "istilldontcareaboutcookies"
def test_install_creates_cache():
    """Test that install creates extension cache"""
    # NOTE(review): with no cache pre-seeded, this runs a real install which
    # may hit the network - confirm this is acceptable in CI.
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )
        # Check output mentions installation
        assert "Installing" in result.stdout or "installed" in result.stdout or "istilldontcareaboutcookies" in result.stdout
        # Check cache file was created
        cache_file = ext_dir / "istilldontcareaboutcookies.extension.json"
        assert cache_file.exists(), "Cache file should be created"
        # Verify cache content
        cache_data = json.loads(cache_file.read_text())
        assert cache_data["webstore_id"] == "edibdbjcniadpccecjdfdjjppcpchdlm"
        assert cache_data["name"] == "istilldontcareaboutcookies"
def test_install_uses_existing_cache():
    """Test that install uses existing cache when available.

    The install script's fast path reads istilldontcareaboutcookies.extension.json
    and reuses the unpacked extension when its manifest.json still exists.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        # Create fake unpacked extension with a manifest
        fake_extension_dir = ext_dir / "edibdbjcniadpccecjdfdjjppcpchdlm__istilldontcareaboutcookies"
        fake_extension_dir.mkdir(parents=True)
        manifest = {"version": "1.1.8", "name": "I still don't care about cookies"}
        (fake_extension_dir / "manifest.json").write_text(json.dumps(manifest))
        # FIX: the script's cache check reads this JSON (with a valid
        # unpacked_path); previously the test never wrote it, so the cache
        # path was never actually exercised and a fresh install could run.
        cache = {
            "webstore_id": "edibdbjcniadpccecjdfdjjppcpchdlm",
            "name": "istilldontcareaboutcookies",
            "unpacked_path": str(fake_extension_dir),
            "version": "1.1.8",
        }
        (ext_dir / "istilldontcareaboutcookies.extension.json").write_text(json.dumps(cache))
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )
        # Should short-circuit on the cache and exit cleanly
        assert result.returncode == 0
        assert "using cache" in result.stdout
def test_no_configuration_required():
    """Test that extension works without any configuration"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        # No special env vars needed - works out of the box
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )
        # Should not require any API keys or configuration
        # NOTE(review): this disjunction is nearly vacuous - it passes whenever
        # the script exits 0, regardless of output; consider asserting that
        # the script never prompts for credentials instead.
        assert "API" not in (result.stdout + result.stderr) or result.returncode == 0

View File

@@ -0,0 +1,55 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_MEDIA": {
"type": "boolean",
"default": true,
"x-aliases": ["USE_YTDLP", "FETCH_MEDIA"],
"description": "Enable media downloading with yt-dlp"
},
"YOUTUBEDL_BINARY": {
"type": "string",
"default": "yt-dlp",
"x-aliases": ["YTDLP_BINARY", "YOUTUBE_DL_BINARY"],
"description": "Path to yt-dlp binary"
},
"MEDIA_TIMEOUT": {
"type": "integer",
"default": 3600,
"minimum": 30,
"x-fallback": "TIMEOUT",
"description": "Timeout for media downloads in seconds"
},
"MEDIA_MAX_SIZE": {
"type": "string",
"default": "750m",
"pattern": "^\\d+[kmgKMG]?$",
"description": "Maximum file size for media downloads"
},
"YTDLP_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"description": "Whether to verify SSL certificates"
},
"YTDLP_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [
"--write-info-json",
"--write-thumbnail",
"--write-sub",
"--embed-subs",
"--write-auto-sub"
],
"description": "Default yt-dlp arguments"
},
"YTDLP_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for yt-dlp (space-separated)"
}
}
}

View File

@@ -0,0 +1,306 @@
#!/usr/bin/env python3
"""
Download media from a URL using yt-dlp.
Usage: on_Snapshot__media.py --url=<url> --snapshot-id=<uuid>
Output: Downloads media files to $PWD/media/
Environment variables:
YTDLP_BINARY: Path to yt-dlp binary
YTDLP_TIMEOUT: Timeout in seconds (default: 3600 for large media)
YTDLP_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
YTDLP_EXTRA_ARGS: Extra arguments for yt-dlp (space-separated)
# Media feature toggles
USE_YTDLP: Enable yt-dlp media extraction (default: True)
SAVE_MEDIA: Alias for USE_YTDLP
# Media size limits
MEDIA_MAX_SIZE: Maximum media file size (default: 750m)
# Fallback to ARCHIVING_CONFIG values if YTDLP_* not set:
MEDIA_TIMEOUT: Fallback timeout for media
TIMEOUT: Fallback timeout
CHECK_SSL_VALIDITY: Fallback SSL check
"""
import json
import os
import shutil
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'media'
BIN_NAME = 'yt-dlp'
BIN_PROVIDERS = 'pip,apt,brew,env'
OUTPUT_DIR = 'media'
def get_env(name: str, default: str = '') -> str:
    """Read an environment variable, stripped of surrounding whitespace."""
    return os.environ.get(name, default).strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean-ish env var; unset/unrecognized values yield `default`."""
    value = get_env(name).lower()
    truthy = ('true', '1', 'yes', 'on')
    falsy = ('false', '0', 'no', 'off')
    if value in truthy:
        return True
    if value in falsy:
        return False
    return default


def get_env_int(name: str, default: int = 0) -> int:
    """Parse an integer env var, falling back to `default` on parse errors."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
STATICFILE_DIR = 'staticfile'


def has_staticfile_output() -> bool:
    """Return True if the staticfile extractor already saved files for this URL.

    An existing-but-empty directory counts as "no output".
    """
    candidate = Path(STATICFILE_DIR)
    if not candidate.exists():
        return False
    return any(candidate.iterdir())
def find_ytdlp() -> str | None:
    """Locate a yt-dlp (or legacy youtube-dl) executable.

    Resolution order: explicit YTDLP_BINARY / YOUTUBEDL_BINARY override
    (must point at an existing file), then PATH lookup.
    """
    override = get_env('YTDLP_BINARY') or get_env('YOUTUBEDL_BINARY')
    if override and os.path.isfile(override):
        return override
    for candidate in ('yt-dlp', 'youtube-dl'):
        resolved = shutil.which(candidate)
        if resolved:
            return resolved
    return None
def get_version(binary: str) -> str:
    """Return the tool's reported version (first 64 chars), or '' on any failure."""
    try:
        proc = subprocess.run(
            [binary, '--version'],
            capture_output=True,
            text=True,
            timeout=10,
        )
        return proc.stdout.strip()[:64]
    except Exception:
        # Missing binary, timeout, etc. - version is best-effort metadata.
        return ''
# Default yt-dlp args (from old YTDLP_CONFIG)
def get_ytdlp_default_args(media_max_size: str = '750m') -> list[str]:
    """Build the baseline yt-dlp argument list (from the old YTDLP_CONFIG).

    The format selector prefers best video+audio under `media_max_size`,
    falling back to best available when no size-capped stream exists.
    """
    size_cap = f'[filesize<={media_max_size}][filesize_approx<=?{media_max_size}]'
    return [
        '--restrict-filenames',
        '--trim-filenames', '128',
        '--write-description',
        '--write-info-json',
        '--write-annotations',
        '--write-thumbnail',
        '--no-call-home',
        '--write-sub',
        '--write-auto-subs',
        '--convert-subs=srt',
        '--yes-playlist',
        '--continue',
        '--no-abort-on-error',
        '--ignore-errors',
        '--geo-bypass',
        '--add-metadata',
        f'--format=(bv*+ba/b){size_cap}/(bv*+ba/b)',
    ]
def save_media(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Download media using yt-dlp.
    Returns: (success, output_path, error_message)

    "Nothing to download" (unsupported URL, clean exit with no files) is
    treated as success with output=None; only genuine failures (HTTP errors,
    extraction failures, timeouts) return success=False.
    """
    # Get config from env (with YTDLP_ prefix or fallback to ARCHIVING_CONFIG style)
    timeout = get_env_int('YTDLP_TIMEOUT') or get_env_int('MEDIA_TIMEOUT') or get_env_int('TIMEOUT', 3600)
    check_ssl = get_env_bool('YTDLP_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
    extra_args = get_env('YTDLP_EXTRA_ARGS') or get_env('YOUTUBEDL_EXTRA_ARGS', '')
    media_max_size = get_env('MEDIA_MAX_SIZE', '750m')
    # Create output directory
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(exist_ok=True)
    # Build command (later options take precedence)
    cmd = [
        binary,
        *get_ytdlp_default_args(media_max_size),
        '--no-progress',
        '-o', f'{OUTPUT_DIR}/%(title)s.%(ext)s',
    ]
    if not check_ssl:
        cmd.append('--no-check-certificate')
    if extra_args:
        # NOTE(review): naive whitespace split - extra args containing quoted
        # spaces will be mangled; confirm callers never need those.
        cmd.extend(extra_args.split())
    cmd.append(url)
    try:
        result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)
        # Check if any media files were downloaded
        media_extensions = (
            '.mp4', '.webm', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.m4v',
            '.mp3', '.m4a', '.ogg', '.wav', '.flac', '.aac', '.opus',
            '.json', '.jpg', '.png', '.webp', '.jpeg',
            '.vtt', '.srt', '.ass', '.lrc',
            '.description',
        )
        downloaded_files = [
            f for f in output_dir.glob('*')
            if f.is_file() and f.suffix.lower() in media_extensions
        ]
        if downloaded_files:
            # Return first video/audio file, or first file if no media
            video_audio = [
                f for f in downloaded_files
                if f.suffix.lower() in ('.mp4', '.webm', '.mkv', '.avi', '.mov', '.mp3', '.m4a', '.ogg', '.wav', '.flac')
            ]
            output = str(video_audio[0]) if video_audio else str(downloaded_files[0])
            return True, output, ''
        else:
            stderr = result.stderr
            # These are NOT errors - page simply has no downloadable media
            # Return success with no output (legitimate "nothing to download")
            if 'ERROR: Unsupported URL' in stderr:
                return True, None, ''  # Not a media site - success, no output
            if 'URL could be a direct video link' in stderr:
                return True, None, ''  # Not a supported media URL - success, no output
            if result.returncode == 0:
                return True, None, ''  # yt-dlp exited cleanly, just no media - success
            # These ARE errors - something went wrong
            if 'HTTP Error 404' in stderr:
                return False, None, '404 Not Found'
            if 'HTTP Error 403' in stderr:
                return False, None, '403 Forbidden'
            if 'Unable to extract' in stderr:
                return False, None, 'Unable to extract media info'
            # Unclassified failure: surface a truncated stderr excerpt
            return False, None, f'yt-dlp error: {stderr[:200]}'
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='URL to download media from')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Download media from a URL using yt-dlp.

    Emits machine-parseable key=value lines (START_TS/END_TS/STATUS/
    RESULT_JSON, ...) on stdout for the orchestrator; exits 0 on success
    or skip, 1 on failure.
    """
    start_ts = datetime.now(timezone.utc)
    version = ''
    output = None
    status = 'failed'
    error = ''
    binary = None
    cmd_str = ''
    try:
        # Feature toggles: skip cleanly if media archiving is disabled.
        if not (get_env_bool('USE_YTDLP', True) and get_env_bool('SAVE_MEDIA', True)):
            print('Skipping media (USE_YTDLP=False or SAVE_MEDIA=False)')
            status = 'skipped'
            end_ts = datetime.now(timezone.utc)
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={end_ts.isoformat()}')
            print(f'STATUS={status}')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)
        # Check if staticfile extractor already handled this (permanent skip)
        if has_staticfile_output():
            # BUG FIX: status was previously left at its initial 'failed'
            # value here, so this branch printed STATUS=failed while
            # exiting 0. It is a skip, so report it as one.
            status = 'skipped'
            print('Skipping media - staticfile extractor already downloaded this')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS={status}')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)
        # Find binary; report dependency hints for the orchestrator on failure
        binary = find_ytdlp()
        if not binary:
            print(f'ERROR: {BIN_NAME} binary not found', file=sys.stderr)
            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
            print('INSTALL_HINT=pip install yt-dlp OR brew install yt-dlp', file=sys.stderr)
            sys.exit(1)
        version = get_version(binary)
        cmd_str = f'{binary} {url}'
        # Run extraction
        success, output, error = save_media(url, binary)
        status = 'succeeded' if success else 'failed'
        if success:
            output_dir = Path(OUTPUT_DIR)
            files = list(output_dir.glob('*'))
            file_count = len([f for f in files if f.is_file()])
            if file_count > 0:
                print(f'yt-dlp completed: {file_count} files downloaded')
            else:
                # Success with zero files: the page simply has no media.
                print('yt-dlp completed: no media found on page (this is normal)')
    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if cmd_str:
        print(f'CMD={cmd_str}')
    if version:
        print(f'VERSION={version}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)

View File

@@ -0,0 +1,30 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_MERCURY": {
"type": "boolean",
"default": true,
"description": "Enable Mercury text extraction"
},
"MERCURY_BINARY": {
"type": "string",
"default": "postlight-parser",
"x-aliases": ["POSTLIGHT_PARSER_BINARY"],
"description": "Path to Mercury/Postlight parser binary"
},
"NODE_BINARY": {
"type": "string",
"default": "node",
"description": "Path to Node.js binary"
},
"MERCURY_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for Mercury in seconds"
}
}
}

View File

@@ -0,0 +1,201 @@
#!/usr/bin/env python3
"""
Extract article content using Postlight's Mercury Parser.
Usage: on_Snapshot__mercury.py --url=<url> --snapshot-id=<uuid>
Output: Creates mercury/ directory with content.html, content.txt, article.json
Environment variables:
MERCURY_BINARY: Path to mercury-parser binary
TIMEOUT: Timeout in seconds (default: 60)
Note: Requires mercury-parser: npm install -g @postlight/mercury-parser
"""
import json
import os
import shutil
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'mercury'
BIN_NAME = 'mercury-parser'
BIN_PROVIDERS = 'npm,env'
OUTPUT_DIR = 'mercury'
def get_env(name: str, default: str = '') -> str:
    """Read an environment variable, falling back to `default`, with surrounding whitespace stripped."""
    value = os.environ.get(name, default)
    return value.strip()
def get_env_int(name: str, default: int = 0) -> int:
    """Read an integer environment variable; return `default` if unset or unparsable."""
    raw = os.environ.get(name, str(default)).strip()
    try:
        return int(raw)
    except ValueError:
        return default
def find_mercury() -> str | None:
"""Find mercury-parser binary."""
mercury = get_env('MERCURY_BINARY')
if mercury and os.path.isfile(mercury):
return mercury
for name in ['mercury-parser', 'mercury']:
binary = shutil.which(name)
if binary:
return binary
return None
def get_version(binary: str) -> str:
    """Return the binary's `--version` output (first 64 chars), or '' on any failure."""
    try:
        proc = subprocess.run(
            [binary, '--version'],
            capture_output=True,
            text=True,
            timeout=10,
        )
    except Exception:
        # Missing binary, timeout, permission error, etc. — treat all as "unknown"
        return ''
    return proc.stdout.strip()[:64]
def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Extract article using Mercury Parser.

    Runs the parser twice (once per output format) and writes three files
    into ./mercury/: content.txt, content.html, and article.json.

    Args:
        url: Page URL to extract the article from.
        binary: Path to the mercury-parser executable.
    Returns: (success, output_path, error_message)
    """
    timeout = get_env_int('TIMEOUT', 60)
    # Create output directory
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(exist_ok=True)
    try:
        # Get text version (mercury always emits a JSON envelope; --format
        # only controls the format of the 'content' field inside it)
        cmd_text = [binary, url, '--format=text']
        result_text = subprocess.run(cmd_text, capture_output=True, timeout=timeout)
        if result_text.returncode != 0:
            stderr = result_text.stderr.decode('utf-8', errors='replace')
            return False, None, f'mercury-parser failed: {stderr[:200]}'
        try:
            text_json = json.loads(result_text.stdout)
        except json.JSONDecodeError:
            return False, None, 'mercury-parser returned invalid JSON'
        if text_json.get('failed'):
            return False, None, 'Mercury was not able to extract article'
        # Save text content
        text_content = text_json.get('content', '')
        (output_dir / 'content.txt').write_text(text_content, encoding='utf-8')
        # Get HTML version
        cmd_html = [binary, url, '--format=html']
        result_html = subprocess.run(cmd_html, capture_output=True, timeout=timeout)
        try:
            html_json = json.loads(result_html.stdout)
        except json.JSONDecodeError:
            # HTML pass is best-effort: fall back to empty content rather than failing
            html_json = {}
        # Save HTML content and metadata
        html_content = html_json.pop('content', '')
        (output_dir / 'content.html').write_text(html_content, encoding='utf-8')
        # Save article metadata (everything from the text pass except the body itself)
        metadata = {k: v for k, v in text_json.items() if k != 'content'}
        (output_dir / 'article.json').write_text(json.dumps(metadata, indent=2), encoding='utf-8')
        return True, OUTPUT_DIR, ''
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='URL to extract article from')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Extract article content using Postlight's Mercury Parser.

    Emits the extractor protocol lines (START_TS/END_TS/DURATION/STATUS/
    RESULT_JSON) on stdout; exits 0 on success, 1 on failure or when the
    mercury-parser binary cannot be found.
    """
    start_ts = datetime.now(timezone.utc)
    version = ''
    output = None
    status = 'failed'
    error = ''
    binary = None
    try:
        # Find binary
        binary = find_mercury()
        if not binary:
            # Signal the orchestrator that a dependency install is needed
            print(f'ERROR: mercury-parser binary not found', file=sys.stderr)
            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
            sys.exit(1)
        version = get_version(binary)
        # Run extraction
        success, output, error = extract_mercury(url, binary)
        status = 'succeeded' if success else 'failed'
        if success:
            text_file = Path(output) / 'content.txt'
            html_file = Path(output) / 'content.html'
            # NOTE(review): st_size is bytes, not characters — the log line
            # below slightly misstates this for multi-byte encodings.
            text_len = text_file.stat().st_size if text_file.exists() else 0
            html_len = html_file.stat().st_size if html_file.exists() else 0
            print(f'Mercury extracted: {text_len} chars text, {html_len} chars HTML')
    except Exception as e:
        # sys.exit() above raises SystemExit (a BaseException), so it is NOT caught here
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if binary:
        print(f'CMD={binary} {url}')
    if version:
        print(f'VERSION={version}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,295 @@
#!/usr/bin/env python3
"""
Create a Merkle tree of all archived outputs.
This plugin runs after all extractors and post-processing complete (priority 92)
and generates a cryptographic Merkle tree of all files in the snapshot directory.
This provides:
- Tamper detection: verify archive integrity
- Efficient updates: only re-hash changed files
- Compact proofs: prove file inclusion without sending all files
- Deduplication: identify identical content across snapshots
Output: merkletree/merkletree.json containing:
- root_hash: SHA256 hash of the Merkle root
- tree: Full tree structure with internal nodes
- files: List of all files with their hashes
- metadata: Timestamp, file count, total size
Usage: on_Snapshot__92_merkletree.py --url=<url> --snapshot-id=<uuid>
Environment variables:
SAVE_MERKLETREE: Enable merkle tree generation (default: true)
"""
__package__ = 'archivebox.plugins.merkletree'
import os
import sys
import json
import hashlib
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any
# Configure Django if running standalone
if __name__ == '__main__':
parent_dir = str(Path(__file__).resolve().parent.parent.parent)
if parent_dir not in sys.path:
sys.path.insert(0, parent_dir)
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
import django
django.setup()
import rich_click as click
def sha256_file(filepath: Path) -> str:
    """Return the hex SHA256 digest of a file's contents.

    Unreadable files yield a sentinel all-zero hash instead of raising,
    so one bad file cannot abort the whole tree build.
    """
    digest = hashlib.sha256()
    try:
        with open(filepath, 'rb') as fh:
            # Stream in 64 KiB chunks to keep memory flat for large files
            for chunk in iter(lambda: fh.read(65536), b''):
                digest.update(chunk)
    except (OSError, PermissionError):
        return '0' * 64
    return digest.hexdigest()
def sha256_data(data: bytes) -> str:
    """Return the hex SHA256 digest of a raw bytes payload."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
def collect_files(snapshot_dir: Path, exclude_dirs: Optional[List[str]] = None) -> List[Tuple[Path, str, int]]:
    """
    Walk `snapshot_dir` and hash every regular file found.

    Args:
        snapshot_dir: Root directory to scan
        exclude_dirs: Directory names to prune from the walk
                      (defaults to ['merkletree', '.git', '__pycache__'])
    Returns:
        List of (relative_path, sha256_hash, file_size) tuples, sorted by
        path so the resulting Merkle tree is deterministic.
    """
    skip_names = exclude_dirs or ['merkletree', '.git', '__pycache__']
    collected: List[Tuple[Path, str, int]] = []
    for dirpath, subdirs, names in os.walk(snapshot_dir):
        # Prune excluded directories in place so os.walk never descends into them
        subdirs[:] = [d for d in subdirs if d not in skip_names]
        for name in names:
            full_path = Path(dirpath) / name
            # Skip symlinks: we hash targets only, and avoid link cycles
            if full_path.is_symlink():
                continue
            digest = sha256_file(full_path)
            size = full_path.stat().st_size if full_path.exists() else 0
            collected.append((full_path.relative_to(snapshot_dir), digest, size))
    # Deterministic ordering -> deterministic tree
    collected.sort(key=lambda entry: str(entry[0]))
    return collected
def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]:
    """
    Build a binary Merkle tree from a list of leaf hashes.

    Sibling hashes are concatenated pairwise and re-hashed to form each
    parent level; an odd node at the end of a level is paired with itself.

    Args:
        file_hashes: List of SHA256 hex digests (the leaves)
    Returns:
        (root_hash, tree_levels): tree_levels[0] is the leaf level and
        tree_levels[-1] contains only the root. An empty input yields the
        hash of the empty byte string and a single empty level.
    """
    if not file_hashes:
        # Empty tree: well-defined sentinel root
        return sha256_data(b''), [[]]
    levels: List[List[str]] = [list(file_hashes)]
    # Repeatedly collapse the newest level until a single root remains
    while len(levels[-1]) > 1:
        below = levels[-1]
        above = []
        for idx in range(0, len(below), 2):
            lhs = below[idx]
            # Odd node count: the trailing node is paired with itself
            rhs = below[idx + 1] if idx + 1 < len(below) else lhs
            above.append(sha256_data((lhs + rhs).encode('utf-8')))
        levels.append(above)
    return levels[-1][0], levels
def create_merkle_tree(snapshot_dir: Path) -> Dict[str, Any]:
    """
    Create a complete Merkle tree of all files in snapshot directory.

    Delegates file discovery/hashing to collect_files() (which excludes
    the 'merkletree' output dir itself by default) and tree construction
    to build_merkle_tree().

    Args:
        snapshot_dir: The snapshot directory to scan
    Returns:
        Dict containing root_hash, tree structure, file list, and metadata
    """
    # Collect all files
    files = collect_files(snapshot_dir)
    # Extract just the hashes for tree building
    file_hashes = [file_hash for _, file_hash, _ in files]
    # Build Merkle tree
    root_hash, tree_levels = build_merkle_tree(file_hashes)
    # Calculate total size
    total_size = sum(size for _, _, size in files)
    # Prepare file list with metadata
    file_list = [
        {
            'path': str(path),
            'hash': file_hash,
            'size': size,
        }
        for path, file_hash, size in files
    ]
    # Prepare result
    result = {
        'root_hash': root_hash,
        'tree_levels': tree_levels,
        'files': file_list,
        'metadata': {
            # NOTE(review): naive local timestamp; other extractors in this
            # codebase use timezone-aware UTC — confirm intended.
            'timestamp': datetime.now().isoformat(),
            'file_count': len(files),
            'total_size': total_size,
            'tree_depth': len(tree_levels),
        },
    }
    return result
@click.command()
@click.option('--url', required=True, help='URL being archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Generate Merkle tree of all archived outputs.

    Looks up the Snapshot by UUID, hashes every file in its output
    directory into a Merkle tree, and writes merkletree/merkletree.json.
    Emits the standard extractor protocol lines (START_TS/END_TS/STATUS/
    RESULT_JSON) on stdout; exits 0 on success or skip, 1 on failure.
    """
    # Imported here so the module can be loaded before Django is configured
    from archivebox.core.models import Snapshot
    start_ts = datetime.now()
    status = 'failed'
    output = None
    error = ''
    root_hash = None
    file_count = 0
    try:
        # Check if enabled
        save_merkletree = os.getenv('SAVE_MERKLETREE', 'true').lower() in ('true', '1', 'yes', 'on')
        if not save_merkletree:
            click.echo('Skipping merkle tree (SAVE_MERKLETREE=False)')
            status = 'skipped'
            end_ts = datetime.now()
            click.echo(f'START_TS={start_ts.isoformat()}')
            click.echo(f'END_TS={end_ts.isoformat()}')
            click.echo(f'STATUS={status}')
            # BUG FIX: build the result via json.dumps instead of hand-rolled
            # f-string interpolation, so quotes/backslashes in url or
            # snapshot_id cannot produce invalid JSON.
            skip_result = {'extractor': 'merkletree', 'status': status, 'url': url, 'snapshot_id': snapshot_id}
            click.echo(f'RESULT_JSON={json.dumps(skip_result)}')
            sys.exit(0)
        # Get snapshot
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
        except Snapshot.DoesNotExist:
            error = f'Snapshot {snapshot_id} not found'
            raise ValueError(error)
        # Get snapshot directory
        snapshot_dir = Path(snapshot.output_dir)
        if not snapshot_dir.exists():
            error = f'Snapshot directory not found: {snapshot_dir}'
            raise FileNotFoundError(error)
        # Create output directory
        output_dir = snapshot_dir / 'merkletree'
        output_dir.mkdir(exist_ok=True)
        output_path = output_dir / 'merkletree.json'
        # Generate Merkle tree (excludes the merkletree/ dir itself)
        merkle_data = create_merkle_tree(snapshot_dir)
        # Write output
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(merkle_data, f, indent=2)
        status = 'succeeded'
        output = str(output_path)
        root_hash = merkle_data['root_hash']
        file_count = merkle_data['metadata']['file_count']
        total_size = merkle_data['metadata']['total_size']
        click.echo(f'Merkle tree created: {file_count} files, root={root_hash[:16]}..., size={total_size:,} bytes')
    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
        click.echo(f'Error: {error}', err=True)
    end_ts = datetime.now()
    duration = (end_ts - start_ts).total_seconds()
    # Print results
    click.echo(f'START_TS={start_ts.isoformat()}')
    click.echo(f'END_TS={end_ts.isoformat()}')
    click.echo(f'DURATION={duration:.2f}')
    if output:
        click.echo(f'OUTPUT={output}')
    click.echo(f'STATUS={status}')
    if error:
        click.echo(f'ERROR={error}', err=True)
    # Print JSON result
    result_json = {
        'extractor': 'merkletree',
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'root_hash': root_hash,
        'file_count': file_count,
        'error': error or None,
    }
    click.echo(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,76 @@
#!/usr/bin/env python3
"""
Install a binary using npm package manager.
Usage: on_Dependency__install_using_npm_provider.py --dependency-id=<uuid> --bin-name=<name> [--custom-cmd=<cmd>]
Output: InstalledBinary JSONL record to stdout after installation
Environment variables:
MACHINE_ID: Machine UUID (set by orchestrator)
"""
import json
import os
import sys
import rich_click as click
from abx_pkg import Binary, NpmProvider, BinProviderOverrides
# Fix pydantic forward reference issue
NpmProvider.model_rebuild()
@click.command()
@click.option('--dependency-id', required=True, help="Dependency UUID")
@click.option('--bin-name', required=True, help="Binary name to install")
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', default=None, help="Custom install command")
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None):
    """Install binary using npm.

    Exits 0 (not an error) when npm is not in the allowed provider list so
    the orchestrator can try other providers. On success prints an
    InstalledBinary JSONL record to stdout; human-readable logs go to stderr.
    """
    # NOTE(review): --custom-cmd is accepted but never used below — confirm
    # whether custom install commands should be honored by this provider.
    if bin_providers != '*' and 'npm' not in bin_providers.split(','):
        click.echo(f"npm provider not allowed for {bin_name}", err=True)
        sys.exit(0)
    # Use abx-pkg NpmProvider to install binary
    provider = NpmProvider()
    if not provider.INSTALLER_BIN:
        click.echo("npm not available on this system", err=True)
        sys.exit(1)
    click.echo(f"Installing {bin_name} via npm...", err=True)
    try:
        binary = Binary(name=bin_name, binproviders=[provider]).install()
    except Exception as e:
        click.echo(f"npm install failed: {e}", err=True)
        sys.exit(1)
    if not binary.abspath:
        click.echo(f"{bin_name} not found after npm install", err=True)
        sys.exit(1)
    machine_id = os.environ.get('MACHINE_ID', '')
    # Output InstalledBinary JSONL record to stdout (machine-readable channel)
    record = {
        'type': 'InstalledBinary',
        'name': bin_name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'npm',
        'machine_id': machine_id,
        'dependency_id': dependency_id,
    }
    print(json.dumps(record))
    # Log human-readable info to stderr
    click.echo(f"Installed {bin_name} at {binary.abspath}", err=True)
    click.echo(f"  version: {binary.version}", err=True)
    sys.exit(0)
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,281 @@
#!/usr/bin/env node
/**
* Extract and categorize outgoing links from a page's DOM.
*
* Categorizes links by type:
* - hrefs: All <a> links
* - images: <img src>
* - css_stylesheets: <link rel=stylesheet>
* - css_images: CSS background-image: url()
* - js_scripts: <script src>
* - iframes: <iframe src>
* - links: <link> tags with rel/href
*
* Usage: on_Snapshot__40_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>
* Output: Writes parse_dom_outlinks/outlinks.json and parse_dom_outlinks/urls.jsonl
*
* Environment variables:
* SAVE_DOM_OUTLINKS: Enable DOM outlinks extraction (default: true)
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'parse_dom_outlinks';
const OUTPUT_DIR = 'parse_dom_outlinks';
const OUTPUT_FILE = 'outlinks.json';
const URLS_FILE = 'urls.jsonl'; // For crawl system
const CHROME_SESSION_DIR = 'chrome_session';
// Parse --key=value CLI flags into an object.
// Keys have the leading '--' removed and dashes converted to underscores;
// flags without a value (or with an empty value) are stored as `true`.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...rest] = token.slice(2).split('=');
    // rejoin so values containing '=' (e.g. URLs with query strings) survive
    parsed[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return parsed;
}
// Read an environment variable (trimmed), with a fallback for unset values.
// NOTE: a variable explicitly set to '' also falls back (|| treats '' as falsy).
function getEnv(name, defaultValue = '') {
  const raw = process.env[name];
  return (raw || defaultValue).trim();
}
// Interpret an environment variable as a boolean.
// Accepts true/1/yes/on and false/0/no/off (case-insensitive);
// anything else — including unset — yields `defaultValue`.
function getEnvBool(name, defaultValue = false) {
  switch (getEnv(name, '').toLowerCase()) {
    case 'true': case '1': case 'yes': case 'on':
      return true;
    case 'false': case '0': case 'no': case 'off':
      return false;
    default:
      return defaultValue;
  }
}
// Read the Chrome DevTools Protocol websocket URL written by the
// chrome_session extractor (chrome_session/cdp_url.txt, relative to cwd).
// Returns null when no session file exists.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (fs.existsSync(cdpFile)) {
    return fs.readFileSync(cdpFile, 'utf8').trim();
  }
  return null;
}
// Extract and categorize outgoing links from the page currently open in the
// shared Chrome session. Writes OUTPUT_DIR/outlinks.json (full categorized
// dump) and OUTPUT_DIR/urls.jsonl (crawlable hrefs for the crawl system).
// Returns { success, output?, outlinksData?, crawlableCount?, error? }.
// NOTE(review): the `url` parameter is unused — extraction targets whichever
// page is open in the connected session; confirm that is intended.
async function extractOutlinks(url) {
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  let browser = null;
  try {
    // Connect to existing Chrome session
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
    }
    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });
    // Get the page (prefer the first http(s) tab, fall back to any tab)
    const pages = await browser.pages();
    const page = pages.find(p => p.url().startsWith('http')) || pages[0];
    if (!page) {
      return { success: false, error: 'No page found in Chrome session' };
    }
    // Extract outlinks by category; this whole callback runs inside the page
    const outlinksData = await page.evaluate(() => {
      const LINK_REGEX = /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)/gi;
      const filterDataUrls = (urls) => urls.filter(url => url && !url.startsWith('data:'));
      const filterW3Urls = (urls) => urls.filter(url => url && !url.startsWith('http://www.w3.org/'));
      // Get raw links from HTML (regex scan of the serialized document)
      const html = document.documentElement.outerHTML;
      const raw = Array.from(html.matchAll(LINK_REGEX)).map(m => m[0]);
      // Get all <a href> links
      const hrefs = Array.from(document.querySelectorAll('a[href]'))
        .map(elem => elem.href)
        .filter(url => url);
      // Get all <link> tags (not just stylesheets); keyed by href to dedupe
      const linksMap = {};
      document.querySelectorAll('link[href]').forEach(elem => {
        const rel = elem.rel || '';
        const href = elem.href;
        if (href && rel !== 'stylesheet') {
          linksMap[href] = { rel, href };
        }
      });
      const links = Object.values(linksMap);
      // Get iframes
      const iframes = Array.from(document.querySelectorAll('iframe[src]'))
        .map(elem => elem.src)
        .filter(url => url);
      // Get images
      const images = Array.from(document.querySelectorAll('img[src]'))
        .map(elem => elem.src)
        .filter(url => url && !url.startsWith('data:'));
      // Get CSS background images (computed style of every element)
      const css_images = Array.from(document.querySelectorAll('*'))
        .map(elem => {
          const bgImg = window.getComputedStyle(elem).getPropertyValue('background-image');
          const match = /url\(\s*?['"]?\s*?(\S+?)\s*?["']?\s*?\)/i.exec(bgImg);
          return match ? match[1] : null;
        })
        .filter(url => url);
      // Get stylesheets
      const css_stylesheets = Array.from(document.querySelectorAll('link[rel=stylesheet]'))
        .map(elem => elem.href)
        .filter(url => url);
      // Get JS scripts
      const js_scripts = Array.from(document.querySelectorAll('script[src]'))
        .map(elem => elem.src)
        .filter(url => url);
      // Spread-into-Set dedupes each category while preserving first-seen order
      return {
        url: window.location.href,
        raw: [...new Set(filterDataUrls(filterW3Urls(raw)))],
        hrefs: [...new Set(filterDataUrls(hrefs))],
        links,
        iframes: [...new Set(iframes)],
        images: [...new Set(filterDataUrls(images))],
        css_images: [...new Set(filterDataUrls(css_images))],
        css_stylesheets: [...new Set(filterDataUrls(css_stylesheets))],
        js_scripts: [...new Set(filterDataUrls(js_scripts))],
      };
    });
    // Write detailed output (for archival)
    fs.writeFileSync(outputPath, JSON.stringify(outlinksData, null, 2));
    // Write urls.jsonl for crawl system (only hrefs that are crawlable pages)
    const urlsPath = path.join(OUTPUT_DIR, URLS_FILE);
    const crawlableUrls = outlinksData.hrefs.filter(href => {
      // Only include http/https URLs, exclude static assets
      if (!href.startsWith('http://') && !href.startsWith('https://')) return false;
      // Exclude common static file extensions
      const staticExts = ['.css', '.js', '.png', '.jpg', '.jpeg', '.gif', '.svg', '.ico', '.woff', '.woff2', '.ttf', '.eot', '.mp4', '.webm', '.mp3', '.pdf'];
      const urlPath = href.split('?')[0].split('#')[0].toLowerCase();
      return !staticExts.some(ext => urlPath.endsWith(ext));
    });
    const urlsJsonl = crawlableUrls.map(href => JSON.stringify({
      type: 'Snapshot',
      url: href,
      via_extractor: EXTRACTOR_NAME,
    })).join('\n');
    if (urlsJsonl) {
      fs.writeFileSync(urlsPath, urlsJsonl + '\n');
    }
    return { success: true, output: outputPath, outlinksData, crawlableCount: crawlableUrls.length };
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // disconnect (not close): the Chrome session is shared with other extractors
    if (browser) {
      browser.disconnect();
    }
  }
}
// CLI entry point: parses --url/--snapshot-id, runs the extraction, and
// emits the extractor protocol lines (START_TS/END_TS/DURATION/OUTPUT/
// STATUS/RESULT_JSON). Exits 0 on success or skip, 1 on failure.
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__40_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  try {
    // Check if enabled (short-circuit with a minimal protocol report)
    if (!getEnvBool('SAVE_DOM_OUTLINKS', true)) {
      console.log('Skipping DOM outlinks (SAVE_DOM_OUTLINKS=False)');
      status = 'skipped';
      const endTs = new Date();
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${endTs.toISOString()}`);
      console.log(`STATUS=${status}`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
      process.exit(0);
    }
    const result = await extractOutlinks(url);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      const total = result.outlinksData.hrefs.length;
      const crawlable = result.crawlableCount;
      const images = result.outlinksData.images.length;
      const scripts = result.outlinksData.js_scripts.length;
      console.log(`DOM outlinks extracted: ${total} links (${crawlable} crawlable), ${images} images, ${scripts} scripts`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result (machine-readable summary, one line)
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Top-level invocation: any unhandled rejection is fatal and exits non-zero.
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,188 @@
#!/usr/bin/env python3
"""
Parse HTML files and extract href URLs.
This is a standalone extractor that can run without ArchiveBox.
It reads HTML content and extracts all <a href="..."> URLs.
NOTE: If parse_dom_outlinks already ran (parse_dom_outlinks/urls.jsonl exists),
this extractor will skip since parse_dom_outlinks provides better coverage via Chrome.
Usage: ./on_Snapshot__60_parse_html_urls.py --url=<url>
Output: Appends discovered URLs to urls.jsonl in current directory
Examples:
./on_Snapshot__60_parse_html_urls.py --url=file:///path/to/page.html
./on_Snapshot__60_parse_html_urls.py --url=https://example.com/page.html
"""
import json
import os
import re
import sys
from datetime import datetime, timezone
from html import unescape
from html.parser import HTMLParser
from pathlib import Path
from urllib.parse import urljoin, urlparse
import rich_click as click
EXTRACTOR_NAME = 'parse_html_urls'
# Check if parse_dom_outlinks extractor already ran
DOM_OUTLINKS_URLS_FILE = Path('parse_dom_outlinks/urls.jsonl')
# URL regex from archivebox/misc/util.py
URL_REGEX = re.compile(
r'(?=('
r'http[s]?://'
r'(?:[a-zA-Z]|[0-9]'
r'|[-_$@.&+!*\(\),]'
r'|[^\u0000-\u007F])+'
r'[^\]\[<>"\'\s]+'
r'))',
re.IGNORECASE | re.UNICODE,
)
class HrefParser(HTMLParser):
    """HTML parser that collects the href value of every <a> tag.

    After feed(), the non-empty hrefs are available on `self.urls`
    in document order (duplicates included).
    """

    def __init__(self):
        super().__init__()
        # hrefs in document order, duplicates preserved
        self.urls = []

    def handle_starttag(self, tag, attrs):
        # HTMLParser lowercases tag and attribute names for us
        if tag != 'a':
            return
        for name, value in attrs:
            if name == 'href' and value:
                self.urls.append(value)
def did_urljoin_misbehave(root_url: str, relative_path: str, final_url: str) -> bool:
    """Detect urljoin() collapsing an embedded sub-URL's '://' into ':/'.

    Archive-style URLs can embed a full URL inside the path (e.g.
    https://web.archive.org/web/https://example.com/). If either input
    contained such a sub-URL past its own scheme but the joined result
    does not, urljoin mangled it.
    """
    rel = relative_path.lower()
    # Strip the relative path's own scheme so it doesn't count as a sub-URL marker
    if rel.startswith(('http://', 'https://')):
        rel = rel.split('://', 1)[-1]
    # [8:] skips past 'https://' so the outer scheme's '://' is ignored
    had_suburl_before = ('://' in rel) or ('://' in root_url[8:])
    has_suburl_after = '://' in final_url[8:]
    return had_suburl_before and not has_suburl_after
def fix_urljoin_bug(url: str, nesting_limit: int = 5) -> str:
    """Repair sub-URLs inside a URL where '://' was collapsed to ':/'.

    urljoin() normalizes consecutive slashes, which breaks URLs that embed
    other URLs in their path (e.g. web.archive.org links). Repeatedly
    re-inserts the missing slash after any `scheme:/` found mid-URL, up to
    `nesting_limit` passes to handle nested sub-URLs.

    Args:
        url: The possibly-mangled URL.
        nesting_limit: Maximum repair passes (one per nesting level).
    Returns:
        The URL with embedded `scheme:/` sequences restored to `scheme://`.
    """
    pattern = (
        r'(?P<root>.+?)'
        r'(?P<separator>[-=/_&+%$#@!*\(\\])'
        r'(?P<subscheme>[a-zA-Z0-9+_-]{1,32}?):/'
        r'(?P<suburl>[^/\\]+)'
    )
    input_url = url
    for _ in range(nesting_limit):
        # BUG FIX: the flags were previously passed as re.sub's positional
        # `count` argument, which applied no flags at all and silently capped
        # the number of substitutions. Pass them via the `flags` keyword.
        url = re.sub(pattern, r'\1\2\3://\4', input_url, flags=re.IGNORECASE | re.UNICODE)
        if url == input_url:
            break
        input_url = url
    return url
def normalize_url(url: str, root_url: str = None) -> str:
    """Resolve `url` against `root_url` when it is relative.

    Absolute http(s) URLs — and any URL when no root is given — are returned
    unchanged. Relative URLs are resolved with urljoin(), then repaired if
    urljoin mangled an embedded sub-URL (see did_urljoin_misbehave).
    """
    if not root_url:
        return url
    lowered = url.lower()
    if lowered.startswith(('http://', 'https://')):
        # Already absolute: nothing to resolve
        return url
    joined = urljoin(root_url, url)
    if did_urljoin_misbehave(root_url, url, joined):
        joined = fix_urljoin_bug(joined)
    return joined
def fetch_content(url: str) -> str:
    """Fetch content from a URL (supports file:// and http(s)://).

    Honors TIMEOUT and USER_AGENT environment variables for network fetches.
    Raises OSError / urllib errors on failure; callers handle and report.
    """
    parsed = urlparse(url)
    if parsed.scheme == 'file':
        file_path = parsed.path
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            return f.read()
    else:
        timeout = int(os.environ.get('TIMEOUT', '60'))
        user_agent = os.environ.get('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
        # Imported lazily: only needed for remote fetches
        import urllib.request
        req = urllib.request.Request(url, headers={'User-Agent': user_agent})
        with urllib.request.urlopen(req, timeout=timeout) as response:
            return response.read().decode('utf-8', errors='replace')
@click.command()
@click.option('--url', required=True, help='HTML URL to parse')
def main(url: str):
    """Parse HTML and extract href URLs.

    Writes discovered URLs as JSONL Snapshot records to ./urls.jsonl.
    Exits 0 on success or skip, 1 on fetch/parse failure or when no URLs
    are found.
    """
    # Skip only if parse_dom_outlinks already ran AND found URLs (it uses Chrome for better coverage)
    # If parse_dom_outlinks ran but found nothing, we still try static HTML parsing as fallback
    if DOM_OUTLINKS_URLS_FILE.exists() and DOM_OUTLINKS_URLS_FILE.stat().st_size > 0:
        click.echo(f'Skipping parse_html_urls - parse_dom_outlinks already extracted URLs')
        sys.exit(0)
    try:
        content = fetch_content(url)
    except Exception as e:
        click.echo(f'Failed to fetch {url}: {e}', err=True)
        sys.exit(1)
    # Parse HTML for hrefs
    parser = HrefParser()
    try:
        parser.feed(content)
    except Exception as e:
        click.echo(f'Failed to parse HTML: {e}', err=True)
        sys.exit(1)
    urls_found = set()
    for href in parser.urls:
        # Normalize URL (resolves relative hrefs against the page URL)
        normalized = normalize_url(href, root_url=url)
        # Only include http/https URLs
        if normalized.lower().startswith('http://') or normalized.lower().startswith('https://'):
            # Skip the source URL itself
            if normalized != url:
                # unescape() decodes HTML entities like &amp; inside hrefs
                urls_found.add(unescape(normalized))
    if not urls_found:
        click.echo('No URLs found', err=True)
        sys.exit(1)
    # Write urls.jsonl (one record per URL, sorted for deterministic output)
    with open('urls.jsonl', 'w') as f:
        for found_url in sorted(urls_found):
            f.write(json.dumps({
                'type': 'Snapshot',
                'url': found_url,
                'via_extractor': EXTRACTOR_NAME,
            }) + '\n')
    click.echo(f'Found {len(urls_found)} URLs')
    sys.exit(0)
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,240 @@
#!/usr/bin/env python3
"""Unit tests for parse_html_urls extractor."""
import json
import subprocess
import sys
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_html_urls.py'), None)
class TestParseHtmlUrls:
"""Test the parse_html_urls extractor CLI."""
    def test_parses_real_example_com(self, tmp_path):
        """Test parsing real https://example.com and extracting its links."""
        # NOTE(review): requires live network access to example.com; this
        # test will fail in offline/sandboxed CI environments.
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', 'https://example.com'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
            timeout=30
        )
        assert result.returncode == 0, f"Failed to parse example.com: {result.stderr}"
        output_file = tmp_path / 'urls.jsonl'
        assert output_file.exists(), "Output file not created"
        # Verify output contains IANA link (example.com links to iana.org)
        content = output_file.read_text()
        assert 'iana.org' in content or 'example' in content, "Expected links from example.com not found"
    def test_extracts_href_urls(self, tmp_path):
        """Test extracting URLs from anchor tags."""
        # Script runs with cwd=tmp_path, so urls.jsonl is written there.
        input_file = tmp_path / 'page.html'
        input_file.write_text('''
            <!DOCTYPE html>
            <html>
            <body>
                <a href="https://example.com">Example</a>
                <a href="https://foo.bar/page">Foo</a>
                <a href="http://test.org">Test</a>
            </body>
            </html>
        ''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        assert 'Found 3 URLs' in result.stdout
        output_file = tmp_path / 'urls.jsonl'
        assert output_file.exists()
        # One JSONL record per unique URL
        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 3
        urls = set()
        for line in lines:
            entry = json.loads(line)
            assert 'url' in entry
            urls.add(entry['url'])
        assert 'https://example.com' in urls
        assert 'https://foo.bar/page' in urls
        assert 'http://test.org' in urls
    def test_ignores_non_http_schemes(self, tmp_path):
        """Test that non-http schemes are ignored."""
        # mailto:/javascript:/tel: hrefs must be filtered out of the output.
        input_file = tmp_path / 'page.html'
        input_file.write_text('''
            <html>
            <body>
                <a href="mailto:test@example.com">Email</a>
                <a href="javascript:void(0)">JS</a>
                <a href="tel:+1234567890">Phone</a>
                <a href="https://valid.com">Valid</a>
            </body>
            </html>
        ''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 1
        entry = json.loads(lines[0])
        assert entry['url'] == 'https://valid.com'
    def test_handles_html_entities(self, tmp_path):
        """Test that HTML entities in URLs are decoded."""
        # &amp; in the href should come out as a literal & (html.unescape).
        input_file = tmp_path / 'page.html'
        input_file.write_text('''
            <html>
            <body>
                <a href="https://example.com/page?a=1&amp;b=2">Link</a>
            </body>
            </html>
        ''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com/page?a=1&b=2'
def test_deduplicates_urls(self, tmp_path):
    """Test that duplicate URLs are deduplicated."""
    # Three anchors to the same target must yield a single output record.
    page = tmp_path / 'page.html'
    page.write_text('''
<html>
<body>
<a href="https://example.com">Link 1</a>
<a href="https://example.com">Link 2</a>
<a href="https://example.com">Link 3</a>
</body>
</html>
''')
    proc = subprocess.run(
        [sys.executable, str(SCRIPT_PATH), '--url', f'file://{page}'],
        cwd=tmp_path,
        capture_output=True,
        text=True,
    )
    assert proc.returncode == 0
    records = (tmp_path / 'urls.jsonl').read_text().strip().split('\n')
    assert len(records) == 1
def test_excludes_source_url(self, tmp_path):
    """Test that the source URL itself is excluded from results."""
    page = tmp_path / 'page.html'
    source_url = f'file://{page}'
    # The page links back to itself; that self-link must not be emitted.
    page.write_text(f'''
<html>
<body>
<a href="{source_url}">Self</a>
<a href="https://other.com">Other</a>
</body>
</html>
''')
    proc = subprocess.run(
        [sys.executable, str(SCRIPT_PATH), '--url', source_url],
        cwd=tmp_path,
        capture_output=True,
        text=True,
    )
    assert proc.returncode == 0
    records = (tmp_path / 'urls.jsonl').read_text().strip().split('\n')
    assert len(records) == 1
    assert json.loads(records[0])['url'] == 'https://other.com'
def test_exits_1_when_no_urls_found(self, tmp_path):
    """Test that script exits with code 1 when no URLs found."""
    page = tmp_path / 'page.html'
    page.write_text('<html><body>No links here</body></html>')
    proc = subprocess.run(
        [sys.executable, str(SCRIPT_PATH), '--url', f'file://{page}'],
        cwd=tmp_path,
        capture_output=True,
        text=True,
    )
    # A page with no extractable links is reported as a failure.
    assert proc.returncode == 1
    assert 'No URLs found' in proc.stderr
def test_handles_malformed_html(self, tmp_path):
    """Test handling of malformed HTML."""
    # Unclosed <a> tag and missing </html>: both links should still be found.
    page = tmp_path / 'malformed.html'
    page.write_text('''
<html>
<body>
<a href="https://example.com">Unclosed tag
<a href="https://other.com">Another link</a>
</body>
''')
    proc = subprocess.run(
        [sys.executable, str(SCRIPT_PATH), '--url', f'file://{page}'],
        cwd=tmp_path,
        capture_output=True,
        text=True,
    )
    assert proc.returncode == 0
    records = (tmp_path / 'urls.jsonl').read_text().strip().split('\n')
    assert len(records) == 2
def test_output_is_valid_json(self, tmp_path):
    """Test that output contains required fields."""
    page = tmp_path / 'page.html'
    page.write_text('<a href="https://example.com">Link</a>')
    proc = subprocess.run(
        [sys.executable, str(SCRIPT_PATH), '--url', f'file://{page}'],
        cwd=tmp_path,
        capture_output=True,
        text=True,
    )
    assert proc.returncode == 0
    record = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
    # Every record must carry the url plus type/via_extractor metadata.
    assert record['url'] == 'https://example.com'
    assert 'type' in record
    assert 'via_extractor' in record
# Support running this test module directly, outside of a pytest invocation.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -0,0 +1,184 @@
#!/usr/bin/env python3
"""
Parse JSONL bookmark files and extract URLs.
This is a standalone extractor that can run without ArchiveBox.
It reads JSONL-format bookmark exports (one JSON object per line).
Usage: ./on_Snapshot__54_parse_jsonl_urls.py --url=<url>
Output: Appends discovered URLs to urls.jsonl in current directory
Expected JSONL format (one object per line):
{"url": "https://example.com", "title": "Example", "tags": "tag1,tag2"}
{"href": "https://other.com", "description": "Other Site"}
Supports various field names for URL, title, timestamp, and tags.
"""
import json
import os
import sys
from datetime import datetime
from html import unescape
from urllib.parse import urlparse
import rich_click as click
EXTRACTOR_NAME = 'parse_jsonl_urls'
def parse_bookmarked_at(link: dict) -> str | None:
"""Parse timestamp from various JSON formats, return ISO 8601."""
from datetime import timezone
def json_date(s: str) -> datetime:
# Try ISO 8601 format
return datetime.strptime(s.split(',', 1)[0], '%Y-%m-%dT%H:%M:%S%z')
def to_iso(dt: datetime) -> str:
if dt.tzinfo is None:
dt = dt.replace(tzinfo=timezone.utc)
return dt.isoformat()
try:
if link.get('bookmarked_at'):
# Already in our format, pass through
return link['bookmarked_at']
elif link.get('timestamp'):
# Chrome/Firefox histories use microseconds
return to_iso(datetime.fromtimestamp(link['timestamp'] / 1000000, tz=timezone.utc))
elif link.get('time'):
return to_iso(json_date(link['time']))
elif link.get('created_at'):
return to_iso(json_date(link['created_at']))
elif link.get('created'):
return to_iso(json_date(link['created']))
elif link.get('date'):
return to_iso(json_date(link['date']))
elif link.get('bookmarked'):
return to_iso(json_date(link['bookmarked']))
elif link.get('saved'):
return to_iso(json_date(link['saved']))
except (ValueError, TypeError, KeyError):
pass
return None
def json_object_to_entry(link: dict) -> dict | None:
    """Convert one parsed JSON bookmark object into a urls.jsonl entry.

    Returns None when the object has no recognizable URL field.
    """
    # Accept the common URL field spellings used by various exporters.
    url = link.get('href') or link.get('url') or link.get('URL')
    if not url:
        return None

    entry = {
        'type': 'Snapshot',
        'url': unescape(url),
        'via_extractor': EXTRACTOR_NAME,
    }

    # Title: prefer an explicit title, then description, then name.
    if link.get('title'):
        title = link['title'].strip()
    elif link.get('description'):
        title = link['description'].replace(' — Readability', '').strip()
    elif link.get('name'):
        title = link['name'].strip()
    else:
        title = None
    if title:
        entry['title'] = unescape(title)

    # Timestamp (ISO 8601), if any of the known fields parse.
    bookmarked_at = parse_bookmarked_at(link)
    if bookmarked_at:
        entry['bookmarked_at'] = bookmarked_at

    # Tags may arrive as a list or a string; normalize to comma-separated.
    tags = link.get('tags', '')
    if isinstance(tags, list):
        tags = ','.join(tags)
    elif isinstance(tags, str) and tags and ',' not in tags:
        # No commas present: treat whitespace as the tag separator.
        tags = tags.replace(' ', ',')
    if tags:
        entry['tags'] = unescape(tags)
    return entry
def fetch_content(url: str) -> str:
    """Return the text contents of a file:// or http(s):// URL.

    Remote fetches honor the TIMEOUT and USER_AGENT environment variables.
    Undecodable bytes are replaced rather than raising.
    """
    parsed = urlparse(url)
    if parsed.scheme == 'file':
        with open(parsed.path, 'r', encoding='utf-8', errors='replace') as f:
            return f.read()
    timeout = int(os.environ.get('TIMEOUT', '60'))
    user_agent = os.environ.get('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    import urllib.request
    request = urllib.request.Request(url, headers={'User-Agent': user_agent})
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode('utf-8', errors='replace')
@click.command()
@click.option('--url', required=True, help='JSONL file URL to parse')
def main(url: str):
    """Parse JSONL bookmark file and extract URLs.

    Reads one JSON object per line, converts each to a Snapshot entry,
    writes Tag records followed by Snapshot records to urls.jsonl, and
    exits 1 when the input cannot be fetched or yields no URLs.
    """
    try:
        content = fetch_content(url)
    except Exception as e:
        click.echo(f'Failed to fetch {url}: {e}', err=True)
        sys.exit(1)

    # One JSON object per line; blank and malformed lines are skipped.
    entries = []
    for raw_line in content.splitlines():
        raw_line = raw_line.strip()
        if not raw_line:
            continue
        try:
            record = json.loads(raw_line)
        except json.JSONDecodeError:
            continue
        entry = json_object_to_entry(record)
        if entry:
            entries.append(entry)

    if not entries:
        click.echo('No URLs found', err=True)
        sys.exit(1)

    # Gather the unique tag names referenced by any entry.
    unique_tags = set()
    for entry in entries:
        tag_field = entry.get('tags')
        if tag_field:
            unique_tags.update(t.strip() for t in tag_field.split(',') if t.strip())

    # Tag records go first so consumers can register tags before snapshots.
    with open('urls.jsonl', 'w') as f:
        for tag_name in sorted(unique_tags):
            f.write(json.dumps({'type': 'Tag', 'name': tag_name}) + '\n')
        for entry in entries:
            f.write(json.dumps(entry) + '\n')

    click.echo(f'Found {len(entries)} URLs, {len(unique_tags)} tags')
    sys.exit(0)
# Run the CLI when executed directly (e.g. by the plugin runner).
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,272 @@
#!/usr/bin/env python3
"""Unit tests for parse_jsonl_urls extractor."""
import json
import subprocess
import sys
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_jsonl_urls.py'), None)
class TestParseJsonlUrls:
    """Test the parse_jsonl_urls extractor CLI."""
    # Each test invokes the extractor script as a subprocess with cwd set to
    # a temp dir, then inspects the urls.jsonl it writes there.

    def test_extracts_urls_from_jsonl(self, tmp_path):
        """Test extracting URLs from JSONL bookmark file."""
        input_file = tmp_path / 'bookmarks.jsonl'
        input_file.write_text(
            '{"url": "https://example.com", "title": "Example"}\n'
            '{"url": "https://foo.bar/page", "title": "Foo Bar"}\n'
            '{"url": "https://test.org", "title": "Test Org"}\n'
        )
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        assert 'Found 3 URLs' in result.stdout
        output_file = tmp_path / 'urls.jsonl'
        assert output_file.exists()
        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 3
        entries = [json.loads(line) for line in lines]
        urls = {e['url'] for e in entries}
        titles = {e.get('title') for e in entries}
        assert 'https://example.com' in urls
        assert 'https://foo.bar/page' in urls
        assert 'https://test.org' in urls
        assert 'Example' in titles
        assert 'Foo Bar' in titles
        assert 'Test Org' in titles

    def test_supports_href_field(self, tmp_path):
        """Test that 'href' field is recognized as URL."""
        input_file = tmp_path / 'bookmarks.jsonl'
        input_file.write_text('{"href": "https://example.com", "title": "Test"}\n')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com'

    def test_supports_description_as_title(self, tmp_path):
        """Test that 'description' field is used as title fallback."""
        input_file = tmp_path / 'bookmarks.jsonl'
        input_file.write_text('{"url": "https://example.com", "description": "A description"}\n')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['title'] == 'A description'

    def test_parses_various_timestamp_formats(self, tmp_path):
        """Test parsing of different timestamp field names."""
        input_file = tmp_path / 'bookmarks.jsonl'
        # 'timestamp' is interpreted as microseconds since the epoch.
        input_file.write_text('{"url": "https://example.com", "timestamp": 1609459200000000}\n')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        # Parser converts timestamp to bookmarked_at
        assert 'bookmarked_at' in entry

    def test_parses_tags_as_string(self, tmp_path):
        """Test parsing tags as comma-separated string."""
        input_file = tmp_path / 'bookmarks.jsonl'
        input_file.write_text('{"url": "https://example.com", "tags": "tech,news,reading"}\n')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        # Parser converts tags to separate Tag objects in the output
        content = output_file.read_text()
        assert 'tech' in content or 'news' in content or 'Tag' in content

    def test_parses_tags_as_list(self, tmp_path):
        """Test parsing tags as JSON array."""
        input_file = tmp_path / 'bookmarks.jsonl'
        input_file.write_text('{"url": "https://example.com", "tags": ["tech", "news"]}\n')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        # Parser converts tags to separate Tag objects in the output
        content = output_file.read_text()
        assert 'tech' in content or 'news' in content or 'Tag' in content

    def test_skips_malformed_lines(self, tmp_path):
        """Test that malformed JSON lines are skipped."""
        input_file = tmp_path / 'bookmarks.jsonl'
        input_file.write_text(
            '{"url": "https://valid.com"}\n'
            'not valid json\n'
            '{"url": "https://also-valid.com"}\n'
        )
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        # Only the two parseable lines survive.
        assert len(lines) == 2

    def test_skips_entries_without_url(self, tmp_path):
        """Test that entries without URL field are skipped."""
        input_file = tmp_path / 'bookmarks.jsonl'
        input_file.write_text(
            '{"url": "https://valid.com"}\n'
            '{"title": "No URL here"}\n'
            '{"url": "https://also-valid.com"}\n'
        )
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 2

    def test_exits_1_when_no_urls_found(self, tmp_path):
        """Test that script exits with code 1 when no URLs found."""
        input_file = tmp_path / 'empty.jsonl'
        input_file.write_text('{"title": "No URL"}\n')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 1
        assert 'No URLs found' in result.stderr

    def test_exits_1_when_file_not_found(self, tmp_path):
        """Test that script exits with code 1 when file doesn't exist."""
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', 'file:///nonexistent/bookmarks.jsonl'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 1
        assert 'Failed to fetch' in result.stderr

    def test_handles_html_entities(self, tmp_path):
        """Test that HTML entities in URLs and titles are decoded."""
        input_file = tmp_path / 'bookmarks.jsonl'
        input_file.write_text('{"url": "https://example.com/page?a=1&amp;b=2", "title": "Test &amp; Title"}\n')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com/page?a=1&b=2'
        assert entry['title'] == 'Test & Title'

    def test_skips_empty_lines(self, tmp_path):
        """Test that empty lines are skipped."""
        input_file = tmp_path / 'bookmarks.jsonl'
        # Includes an empty line and a whitespace-only line between records.
        input_file.write_text(
            '{"url": "https://example.com"}\n'
            '\n'
            '   \n'
            '{"url": "https://other.com"}\n'
        )
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 2

    def test_output_includes_required_fields(self, tmp_path):
        """Test that output includes required fields."""
        input_file = tmp_path / 'bookmarks.jsonl'
        input_file.write_text('{"url": "https://example.com"}\n')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com'
        assert 'type' in entry
        assert 'via_extractor' in entry
# Support running this test module directly, outside of a pytest invocation.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -0,0 +1,116 @@
#!/usr/bin/env python3
"""
Parse Netscape bookmark HTML files and extract URLs.
This is a standalone extractor that can run without ArchiveBox.
It reads Netscape-format bookmark exports (produced by all major browsers).
Usage: ./on_Snapshot__53_parse_netscape_urls.py --url=<url>
Output: Appends discovered URLs to urls.jsonl in current directory
Examples:
./on_Snapshot__53_parse_netscape_urls.py --url=file:///path/to/bookmarks.html
"""
import json
import os
import re
import sys
from datetime import datetime, timezone
from html import unescape
from urllib.parse import urlparse
import rich_click as click
EXTRACTOR_NAME = 'parse_netscape_urls'
# Regex pattern for Netscape bookmark format
# Example: <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" TAGS="tag1,tag2">example title</A>
NETSCAPE_PATTERN = re.compile(
r'<a\s+href="([^"]+)"\s+add_date="(\d+)"(?:\s+[^>]*?tags="([^"]*)")?[^>]*>([^<]+)</a>',
re.UNICODE | re.IGNORECASE
)
def fetch_content(url: str) -> str:
    """Return the text contents of a file:// or http(s):// URL.

    Remote fetches honor the TIMEOUT and USER_AGENT environment variables.
    Undecodable bytes are replaced rather than raising.
    """
    parsed = urlparse(url)
    if parsed.scheme == 'file':
        with open(parsed.path, 'r', encoding='utf-8', errors='replace') as f:
            return f.read()
    timeout = int(os.environ.get('TIMEOUT', '60'))
    user_agent = os.environ.get('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    import urllib.request
    request = urllib.request.Request(url, headers={'User-Agent': user_agent})
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode('utf-8', errors='replace')
@click.command()
@click.option('--url', required=True, help='Netscape bookmark file URL to parse')
def main(url: str):
    """Parse Netscape bookmark HTML and extract URLs.

    Scans the input line by line with NETSCAPE_PATTERN, writes Tag records
    followed by Snapshot records to urls.jsonl, and exits 1 when the input
    cannot be fetched or contains no bookmarks.
    """
    try:
        content = fetch_content(url)
    except Exception as e:
        click.echo(f'Failed to fetch {url}: {e}', err=True)
        sys.exit(1)

    entries = []
    unique_tags = set()
    for line in content.splitlines():
        match = NETSCAPE_PATTERN.search(line)
        if not match:
            continue
        href = match.group(1)
        add_date = match.group(2)
        tags_str = match.group(3) or ''
        title = match.group(4).strip()

        entry = {
            'type': 'Snapshot',
            'url': unescape(href),
            'via_extractor': EXTRACTOR_NAME,
        }
        if title:
            entry['title'] = unescape(title)
        if tags_str:
            entry['tags'] = tags_str
            unique_tags.update(t.strip() for t in tags_str.split(',') if t.strip())
        try:
            # ADD_DATE is a unix timestamp; store it as ISO 8601 UTC.
            entry['bookmarked_at'] = datetime.fromtimestamp(float(add_date), tz=timezone.utc).isoformat()
        except (ValueError, OSError):
            pass
        entries.append(entry)

    if not entries:
        click.echo('No bookmarks found', err=True)
        sys.exit(1)

    # Tag records go first so consumers can register tags before snapshots.
    with open('urls.jsonl', 'w') as f:
        for tag_name in sorted(unique_tags):
            f.write(json.dumps({'type': 'Tag', 'name': tag_name}) + '\n')
        for entry in entries:
            f.write(json.dumps(entry) + '\n')

    click.echo(f'Found {len(entries)} URLs, {len(unique_tags)} tags')
    sys.exit(0)
# Run the CLI when executed directly (e.g. by the plugin runner).
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,205 @@
#!/usr/bin/env python3
"""Unit tests for parse_netscape_urls extractor."""
import json
import subprocess
import sys
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_netscape_urls.py'), None)
class TestParseNetscapeUrls:
    """Test the parse_netscape_urls extractor CLI."""
    # Each test invokes the extractor script as a subprocess with cwd set to
    # a temp dir, then inspects the urls.jsonl it writes there.

    def test_extracts_urls_from_netscape_bookmarks(self, tmp_path):
        """Test extracting URLs from Netscape bookmark HTML format."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''<!DOCTYPE NETSCAPE-Bookmark-file-1>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
<DT><A HREF="https://example.com" ADD_DATE="1609459200">Example Site</A>
<DT><A HREF="https://foo.bar/page" ADD_DATE="1609545600">Foo Bar</A>
<DT><A HREF="https://test.org" ADD_DATE="1609632000">Test Org</A>
</DL><p>
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        assert 'Found 3 URLs' in result.stdout
        output_file = tmp_path / 'urls.jsonl'
        assert output_file.exists()
        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 3
        entries = [json.loads(line) for line in lines]
        urls = {e['url'] for e in entries}
        titles = {e.get('title') for e in entries}
        assert 'https://example.com' in urls
        assert 'https://foo.bar/page' in urls
        assert 'https://test.org' in urls
        assert 'Example Site' in titles
        assert 'Foo Bar' in titles
        assert 'Test Org' in titles

    def test_parses_add_date_timestamps(self, tmp_path):
        """Test that ADD_DATE timestamps are parsed correctly."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''
<DT><A HREF="https://example.com" ADD_DATE="1609459200">Test</A>
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        # Parser converts timestamp to bookmarked_at
        assert 'bookmarked_at' in entry

    def test_handles_query_params_in_urls(self, tmp_path):
        """Test that URLs with query parameters are preserved."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''
<DT><A HREF="https://example.com/search?q=test+query&page=1" ADD_DATE="1609459200">Search</A>
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert 'q=test+query' in entry['url']
        assert 'page=1' in entry['url']

    def test_handles_html_entities(self, tmp_path):
        """Test that HTML entities in URLs and titles are decoded."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''
<DT><A HREF="https://example.com/page?a=1&amp;b=2" ADD_DATE="1609459200">Test &amp; Title</A>
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com/page?a=1&b=2'
        assert entry['title'] == 'Test & Title'

    def test_exits_1_when_no_bookmarks_found(self, tmp_path):
        """Test that script exits with code 1 when no bookmarks found."""
        input_file = tmp_path / 'empty.html'
        input_file.write_text('''<!DOCTYPE NETSCAPE-Bookmark-file-1>
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
</DL><p>
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 1
        assert 'No bookmarks found' in result.stderr

    def test_exits_1_when_file_not_found(self, tmp_path):
        """Test that script exits with code 1 when file doesn't exist."""
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', 'file:///nonexistent/bookmarks.html'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 1
        assert 'Failed to fetch' in result.stderr

    def test_handles_nested_folders(self, tmp_path):
        """Test parsing bookmarks in nested folder structure."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''<!DOCTYPE NETSCAPE-Bookmark-file-1>
<DL><p>
<DT><H3>Folder 1</H3>
<DL><p>
<DT><A HREF="https://example.com/nested1" ADD_DATE="1609459200">Nested 1</A>
<DT><H3>Subfolder</H3>
<DL><p>
<DT><A HREF="https://example.com/nested2" ADD_DATE="1609459200">Nested 2</A>
</DL><p>
</DL><p>
<DT><A HREF="https://example.com/top" ADD_DATE="1609459200">Top Level</A>
</DL><p>
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        urls = {json.loads(line)['url'] for line in lines}
        # Folder nesting is flattened: bookmarks at every depth are found.
        assert 'https://example.com/nested1' in urls
        assert 'https://example.com/nested2' in urls
        assert 'https://example.com/top' in urls

    def test_case_insensitive_parsing(self, tmp_path):
        """Test that parsing is case-insensitive for HTML tags."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''
<dt><a HREF="https://example.com" ADD_DATE="1609459200">Test</a>
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com'
# Support running this test module directly, outside of a pytest invocation.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -0,0 +1,140 @@
#!/usr/bin/env python3
"""
Parse RSS/Atom feeds and extract URLs.
This is a standalone extractor that can run without ArchiveBox.
It reads feed content from a URL and extracts article URLs.
Usage: ./on_Snapshot__51_parse_rss_urls.py --url=<url>
Output: Appends discovered URLs to urls.jsonl in current directory
Examples:
./on_Snapshot__51_parse_rss_urls.py --url=https://example.com/feed.rss
./on_Snapshot__51_parse_rss_urls.py --url=file:///path/to/feed.xml
"""
import json
import os
import sys
from datetime import datetime, timezone
from html import unescape
from time import mktime
from urllib.parse import urlparse
import rich_click as click
EXTRACTOR_NAME = 'parse_rss_urls'
try:
import feedparser
except ImportError:
feedparser = None
def fetch_content(url: str) -> str:
    """Return the text contents of a file:// or http(s):// URL.

    Remote fetches honor the TIMEOUT and USER_AGENT environment variables.
    Undecodable bytes are replaced rather than raising.
    """
    parsed = urlparse(url)
    if parsed.scheme == 'file':
        with open(parsed.path, 'r', encoding='utf-8', errors='replace') as f:
            return f.read()
    timeout = int(os.environ.get('TIMEOUT', '60'))
    user_agent = os.environ.get('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    import urllib.request
    request = urllib.request.Request(url, headers={'User-Agent': user_agent})
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode('utf-8', errors='replace')
@click.command()
@click.option('--url', required=True, help='RSS/Atom feed URL to parse')
def main(url: str):
    """Parse RSS/Atom feed and extract article URLs.

    Uses feedparser to read the feed, emits Tag records followed by
    Snapshot records to urls.jsonl, and exits 1 when feedparser is
    missing, the feed cannot be fetched, or it yields no URLs.
    """
    if feedparser is None:
        click.echo('feedparser library not installed', err=True)
        sys.exit(1)
    try:
        content = fetch_content(url)
    except Exception as e:
        click.echo(f'Failed to fetch {url}: {e}', err=True)
        sys.exit(1)

    feed = feedparser.parse(content)
    if not feed.entries:
        click.echo('No entries found in feed', err=True)
        sys.exit(1)

    entries = []
    for item in feed.entries:
        link = getattr(item, 'link', None)
        if not link:
            continue

        # Prefer the published date, falling back to the updated date.
        published = None
        if getattr(item, 'published_parsed', None):
            published = datetime.fromtimestamp(mktime(item.published_parsed), tz=timezone.utc).isoformat()
        elif getattr(item, 'updated_parsed', None):
            published = datetime.fromtimestamp(mktime(item.updated_parsed), tz=timezone.utc).isoformat()

        # Join the feed's category terms into a comma-separated tag string.
        tags = ''
        if getattr(item, 'tags', None):
            try:
                tags = ','.join(t.term for t in item.tags if hasattr(t, 'term'))
            except (AttributeError, TypeError):
                pass

        entry = {
            'type': 'Snapshot',
            'url': unescape(link),
            'via_extractor': EXTRACTOR_NAME,
        }
        title = getattr(item, 'title', None)
        if title:
            entry['title'] = unescape(title)
        if published:
            entry['bookmarked_at'] = published
        if tags:
            entry['tags'] = tags
        entries.append(entry)

    if not entries:
        click.echo('No valid URLs found in feed entries', err=True)
        sys.exit(1)

    # Gather the unique tag names referenced by any entry.
    unique_tags = set()
    for entry in entries:
        if entry.get('tags'):
            unique_tags.update(t.strip() for t in entry['tags'].split(',') if t.strip())

    # Tag records go first so consumers can register tags before snapshots.
    with open('urls.jsonl', 'w') as f:
        for tag_name in sorted(unique_tags):
            f.write(json.dumps({'type': 'Tag', 'name': tag_name}) + '\n')
        for entry in entries:
            f.write(json.dumps(entry) + '\n')

    click.echo(f'Found {len(entries)} URLs, {len(unique_tags)} tags')
    sys.exit(0)
# Run the CLI when executed directly (e.g. by the plugin runner).
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,213 @@
#!/usr/bin/env python3
"""Unit tests for parse_rss_urls extractor."""
import json
import subprocess
import sys
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_rss_urls.py'), None)
class TestParseRssUrls:
"""Test the parse_rss_urls extractor CLI."""
def test_parses_real_rss_feed(self, tmp_path):
"""Test parsing a real RSS feed from the web."""
# Use httpbin.org which provides a sample RSS feed
result = subprocess.run(
[sys.executable, str(SCRIPT_PATH), '--url', 'https://news.ycombinator.com/rss'],
cwd=tmp_path,
capture_output=True,
text=True,
timeout=30
)
# HN RSS feed should parse successfully
if result.returncode == 0:
output_file = tmp_path / 'urls.jsonl'
assert output_file.exists(), "Output file not created"
content = output_file.read_text()
assert len(content) > 0, "No URLs extracted from real RSS feed"
# Verify at least one URL was extracted
lines = content.strip().split('\n')
assert len(lines) > 0, "No entries found in RSS feed"
def test_extracts_urls_from_rss_feed(self, tmp_path):
"""Test extracting URLs from an RSS 2.0 feed."""
input_file = tmp_path / 'feed.rss'
input_file.write_text('''<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>Test Feed</title>
<link>https://example.com</link>
<item>
<title>First Post</title>
<link>https://example.com/post/1</link>
<pubDate>Mon, 01 Jan 2024 12:00:00 GMT</pubDate>
</item>
<item>
<title>Second Post</title>
<link>https://example.com/post/2</link>
<pubDate>Tue, 02 Jan 2024 12:00:00 GMT</pubDate>
</item>
</channel>
</rss>
''')
result = subprocess.run(
[sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
cwd=tmp_path,
capture_output=True,
text=True,
)
assert result.returncode == 0
assert 'Found 2 URLs' in result.stdout
output_file = tmp_path / 'urls.jsonl'
assert output_file.exists()
lines = output_file.read_text().strip().split('\n')
assert len(lines) == 2
entries = [json.loads(line) for line in lines]
urls = {e['url'] for e in entries}
titles = {e.get('title') for e in entries}
assert 'https://example.com/post/1' in urls
assert 'https://example.com/post/2' in urls
assert 'First Post' in titles
assert 'Second Post' in titles
def test_extracts_urls_from_atom_feed(self, tmp_path):
"""Test extracting URLs from an Atom feed."""
input_file = tmp_path / 'feed.atom'
input_file.write_text('''<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Test Atom Feed</title>
<entry>
<title>Atom Post 1</title>
<link href="https://atom.example.com/entry/1"/>
<updated>2024-01-01T12:00:00Z</updated>
</entry>
<entry>
<title>Atom Post 2</title>
<link href="https://atom.example.com/entry/2"/>
<updated>2024-01-02T12:00:00Z</updated>
</entry>
</feed>
''')
result = subprocess.run(
[sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
cwd=tmp_path,
capture_output=True,
text=True,
)
assert result.returncode == 0
output_file = tmp_path / 'urls.jsonl'
lines = output_file.read_text().strip().split('\n')
urls = {json.loads(line)['url'] for line in lines}
assert 'https://atom.example.com/entry/1' in urls
assert 'https://atom.example.com/entry/2' in urls
def test_exits_1_when_no_entries(self, tmp_path):
"""Test that script exits with code 1 when feed has no entries."""
input_file = tmp_path / 'empty.rss'
input_file.write_text('''<?xml version="1.0"?>
<rss version="2.0">
<channel>
<title>Empty Feed</title>
</channel>
</rss>
''')
result = subprocess.run(
[sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
cwd=tmp_path,
capture_output=True,
text=True,
)
assert result.returncode == 1
assert 'No entries found' in result.stderr
def test_exits_1_when_file_not_found(self, tmp_path):
"""Test that script exits with code 1 when file doesn't exist."""
result = subprocess.run(
[sys.executable, str(SCRIPT_PATH), '--url', 'file:///nonexistent/feed.rss'],
cwd=tmp_path,
capture_output=True,
text=True,
)
assert result.returncode == 1
assert 'Failed to fetch' in result.stderr
    def test_handles_html_entities_in_urls(self, tmp_path):
        """XML-escaped URLs (&amp;) must be unescaped back to literal characters."""
        input_file = tmp_path / 'feed.rss'
        input_file.write_text('''<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Entity Test</title>
<link>https://example.com/page?a=1&amp;b=2</link>
</item>
</channel>
</rss>
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        # Single entry expected, with &amp; decoded to a literal &
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com/page?a=1&b=2'
    def test_includes_optional_metadata(self, tmp_path):
        """Optional <title> and <pubDate> fields should be carried into the output."""
        input_file = tmp_path / 'feed.rss'
        input_file.write_text('''<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Test Title</title>
<link>https://example.com/test</link>
<pubDate>Wed, 15 Jan 2020 10:30:00 GMT</pubDate>
</item>
</channel>
</rss>
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com/test'
        assert entry['title'] == 'Test Title'
        # Parser converts timestamp to bookmarked_at
        assert 'bookmarked_at' in entry


if __name__ == '__main__':
    # Allow running this test module directly without a pytest invocation
    pytest.main([__file__, '-v'])

View File

@@ -0,0 +1,137 @@
#!/usr/bin/env python3
"""
Parse plain text files and extract URLs.
This is a standalone extractor that can run without ArchiveBox.
It reads text content from a URL (file:// or https://) and extracts all URLs found.
Usage: ./on_Snapshot__52_parse_txt_urls.py --url=<url>
Output: Writes discovered URLs to urls.jsonl in current directory (overwriting any existing file)
Examples:
./on_Snapshot__52_parse_txt_urls.py --url=file:///path/to/urls.txt
./on_Snapshot__52_parse_txt_urls.py --url=https://example.com/urls.txt
"""
import json
import os
import re
import sys
from datetime import datetime, timezone
from html import unescape
from pathlib import Path
from urllib.parse import urlparse
from urllib.request import urlopen
import rich_click as click
EXTRACTOR_NAME = 'parse_txt_urls'
# URL regex from archivebox/misc/util.py
# https://mathiasbynens.be/demo/url-regex
# NOTE: the whole pattern is wrapped in a zero-width lookahead (?=(...)), so
# re.findall() returns the captured group at every position where a URL starts;
# this allows overlapping candidate matches that fix_url_from_markdown can trim.
URL_REGEX = re.compile(
    r'(?=('
    r'http[s]?://'  # start matching from allowed schemes
    r'(?:[a-zA-Z]|[0-9]'  # followed by allowed alphanum characters
    r'|[-_$@.&+!*\(\),]'  # or allowed symbols (keep hyphen first to match literal hyphen)
    r'|[^\u0000-\u007F])+'  # or allowed unicode bytes
    r'[^\]\[<>"\'\s]+'  # stop parsing at these symbols
    r'))',
    re.IGNORECASE | re.UNICODE,
)
def parens_are_matched(string: str, open_char='(', close_char=')') -> bool:
    """Return True when every close_char pairs with an earlier open_char and none are left open."""
    depth = 0
    for char in string:
        if char == open_char:
            depth += 1
        elif char == close_char:
            depth -= 1
            # A close before its matching open can never be balanced
            if depth < 0:
                return False
    return depth == 0
def fix_url_from_markdown(url_str: str) -> str:
    """
    Cleanup a regex-parsed URL that may contain trailing parens from markdown syntax.
    Example: https://wiki.org/article_(Disambiguation).html?q=1).text -> https://wiki.org/article_(Disambiguation).html?q=1
    """
    candidate = url_str
    # Drop trailing characters one at a time until the parens balance out
    while not parens_are_matched(candidate):
        candidate = candidate[:-1]
    # Only accept the trimmed form if it still matches the URL pattern;
    # otherwise fall back to the untrimmed original
    if re.findall(URL_REGEX, candidate):
        return candidate
    return url_str
def find_all_urls(text: str):
    """Yield every URL found in text, with trailing markdown paren artifacts stripped."""
    for candidate in re.findall(URL_REGEX, text):
        yield fix_url_from_markdown(candidate)
def fetch_content(url: str) -> str:
    """Fetch text content from a URL (supports file:// and http(s)://)."""
    parsed = urlparse(url)
    if parsed.scheme == 'file':
        # Local file: read straight from disk, tolerating bad bytes
        with open(parsed.path, 'r', encoding='utf-8', errors='replace') as f:
            return f.read()
    # Remote URL: honor TIMEOUT / USER_AGENT environment overrides
    timeout = int(os.environ.get('TIMEOUT', '60'))
    user_agent = os.environ.get('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    import urllib.request
    request = urllib.request.Request(url, headers={'User-Agent': user_agent})
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode('utf-8', errors='replace')
@click.command()
@click.option('--url', required=True, help='URL to parse (file:// or https://)')
def main(url: str):
    """Parse plain text and extract URLs.

    Exit codes: 0 with urls.jsonl written on success, 1 when the source
    cannot be fetched or contains no URLs.
    """
    try:
        content = fetch_content(url)
    except Exception as e:
        click.echo(f'Failed to fetch {url}: {e}', err=True)
        sys.exit(1)
    urls_found = set()
    for found_url in find_all_urls(content):
        # Decode HTML entities (e.g. &amp; -> &) left over from XML/HTML sources
        cleaned_url = unescape(found_url)
        # Skip the source URL itself
        if cleaned_url != url:
            urls_found.add(cleaned_url)
    if not urls_found:
        click.echo('No URLs found', err=True)
        sys.exit(1)
    # Write urls.jsonl — opened with 'w', so each run OVERWRITES rather than appends
    with open('urls.jsonl', 'w') as f:
        for found_url in sorted(urls_found):
            f.write(json.dumps({
                'type': 'Snapshot',
                'url': found_url,
                'via_extractor': EXTRACTOR_NAME,
            }) + '\n')
    click.echo(f'Found {len(urls_found)} URLs')
    sys.exit(0)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,193 @@
#!/usr/bin/env python3
"""Unit tests for parse_txt_urls extractor."""
import json
import subprocess
import sys
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
# Resolve the hook by glob so the numeric priority prefix can change without breaking tests;
# falls back to None (rather than raising StopIteration) when the script is missing.
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_txt_urls.py'), None)


class TestParseTxtUrls:
    """Test the parse_txt_urls extractor CLI.

    Every test shells out to the hook script with a file:// input and inspects
    the urls.jsonl it writes into the subprocess's cwd (tmp_path).
    """

    def test_extracts_urls_including_real_example_com(self, tmp_path):
        """Test extracting URLs from plain text including real example.com."""
        input_file = tmp_path / 'urls.txt'
        input_file.write_text('''
https://example.com
https://example.com/page
https://www.iana.org/domains/reserved
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0, f"Failed: {result.stderr}"
        assert 'Found 3 URLs' in result.stdout
        output_file = tmp_path / 'urls.jsonl'
        assert output_file.exists()
        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 3
        urls = set()
        for line in lines:
            entry = json.loads(line)
            assert 'url' in entry
            urls.add(entry['url'])
        # Verify real URLs are extracted correctly
        assert 'https://example.com' in urls
        assert 'https://example.com/page' in urls
        assert 'https://www.iana.org/domains/reserved' in urls

    def test_extracts_urls_from_mixed_content(self, tmp_path):
        """Test extracting URLs embedded in prose text."""
        input_file = tmp_path / 'mixed.txt'
        input_file.write_text('''
Check out this great article at https://blog.example.com/post
You can also visit http://docs.test.org for more info.
Also see https://github.com/user/repo for the code.
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        urls = {json.loads(line)['url'] for line in lines}
        assert 'https://blog.example.com/post' in urls
        assert 'http://docs.test.org' in urls
        assert 'https://github.com/user/repo' in urls

    def test_handles_markdown_urls(self, tmp_path):
        """Test handling URLs in markdown format with parentheses."""
        input_file = tmp_path / 'markdown.txt'
        input_file.write_text('''
[Example](https://example.com/page)
[Wiki](https://en.wikipedia.org/wiki/Article_(Disambiguation))
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        urls = {json.loads(line)['url'] for line in lines}
        # The closing markdown paren must be trimmed off the plain URL,
        # but balanced parens inside the wiki URL must survive
        assert 'https://example.com/page' in urls
        assert any('wikipedia.org' in u for u in urls)

    def test_exits_1_when_no_urls_found(self, tmp_path):
        """Test that script exits with code 1 when no URLs found."""
        input_file = tmp_path / 'empty.txt'
        input_file.write_text('no urls here, just plain text')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 1
        assert 'No URLs found' in result.stderr

    def test_exits_1_when_file_not_found(self, tmp_path):
        """Test that script exits with code 1 when file doesn't exist."""
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', 'file:///nonexistent/path.txt'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 1
        assert 'Failed to fetch' in result.stderr

    def test_deduplicates_urls(self, tmp_path):
        """Test that duplicate URLs are deduplicated."""
        input_file = tmp_path / 'dupes.txt'
        input_file.write_text('''
https://example.com
https://example.com
https://example.com
https://other.com
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        # 3 duplicates collapse to 1, plus the distinct other.com entry
        assert len(lines) == 2

    def test_appends_to_existing_file(self, tmp_path):
        """Test that output creates urls.jsonl with extracted URLs."""
        # NOTE(review): despite the name, the extractor overwrites urls.jsonl
        # ('w' mode); this test only verifies creation, not appending.
        input_file = tmp_path / 'urls.txt'
        input_file.write_text('https://new.com\nhttps://other.com')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 2
        urls = {json.loads(line)['url'] for line in lines}
        assert 'https://new.com' in urls
        assert 'https://other.com' in urls

    def test_output_is_valid_json(self, tmp_path):
        """Test that output contains required fields."""
        input_file = tmp_path / 'urls.txt'
        input_file.write_text('https://example.com')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com'
        assert 'type' in entry
        assert 'via_extractor' in entry


if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -0,0 +1,295 @@
#!/usr/bin/env node
/**
* Print a URL to PDF using Chrome/Puppeteer.
*
* If a Chrome session exists (from chrome_session extractor), connects to it via CDP.
* Otherwise launches a new Chrome instance.
*
* Usage: on_Snapshot__22_pdf.js --url=<url> --snapshot-id=<uuid>
* Output: Writes pdf/output.pdf
*
* Environment variables:
* CHROME_BINARY: Path to Chrome/Chromium binary
* CHROME_TIMEOUT: Timeout in seconds (default: 60)
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
* CHROME_USER_AGENT: User agent string (optional)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
* CHROME_HEADLESS: Run in headless mode (default: true)
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'pdf';
const OUTPUT_DIR = 'pdf';
const OUTPUT_FILE = 'output.pdf';
const CHROME_SESSION_DIR = 'chrome_session';
// Parse command line arguments of the form --key=value or bare --flag.
//   "--key=value"  -> args.key === "value" (the value may itself contain "=")
//   "--flag"       -> args.flag === true
//   "--key="       -> args.key === "" (explicit empty value)
// Hyphens in key names are normalized to underscores (--snapshot-id -> snapshot_id).
function parseArgs() {
  const args = {};
  process.argv.slice(2).forEach(arg => {
    if (arg.startsWith('--')) {
      const [key, ...valueParts] = arg.slice(2).split('=');
      // Bug fix: "--key=" previously produced `true` because '' is falsy under
      // `|| true`, letting an empty --url= slip past main()'s `!url` check.
      // Distinguish "no '=' present" (boolean flag) from "empty value after '='".
      args[key.replace(/-/g, '_')] = valueParts.length > 0 ? valueParts.join('=') : true;
    }
  });
  return args;
}
// Read an environment variable, trimmed; falls back to defaultValue when the
// variable is unset or empty (any falsy value triggers the fallback).
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}

// Interpret an environment variable as a boolean: true/1/yes/on vs false/0/no/off
// (case-insensitive); anything else yields defaultValue.
function getEnvBool(name, defaultValue = false) {
  const normalized = getEnv(name, '').toLowerCase();
  if (['true', '1', 'yes', 'on'].includes(normalized)) return true;
  if (['false', '0', 'no', 'off'].includes(normalized)) return false;
  return defaultValue;
}

// Interpret an environment variable as a base-10 integer; non-numeric input
// falls back to defaultValue.
function getEnvInt(name, defaultValue = 0) {
  const parsed = parseInt(getEnv(name, String(defaultValue)), 10);
  return isNaN(parsed) ? defaultValue : parsed;
}
// Check if staticfile extractor already downloaded this URL
const STATICFILE_DIR = 'staticfile';
// True when the staticfile extractor left at least one file in its output dir.
function hasStaticFileOutput() {
  if (!fs.existsSync(STATICFILE_DIR)) return false;
  return fs.readdirSync(STATICFILE_DIR).length > 0;
}
// Get CDP URL from chrome_session if available.
// Returns the trimmed WebSocket endpoint string, or null when no session exists.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(cdpFile)) {
    return null;
  }
  return fs.readFileSync(cdpFile, 'utf8').trim();
}
// Locate a Chrome/Chromium executable: CHROME_BINARY env var wins, then a
// list of well-known install paths. Returns null when nothing is found.
function findChrome() {
  const fromEnv = getEnv('CHROME_BINARY');
  if (fromEnv && fs.existsSync(fromEnv)) {
    return fromEnv;
  }
  const knownPaths = [
    // Linux
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/usr/bin/chromium',
    '/usr/bin/chromium-browser',
    // macOS
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
  ];
  for (const candidate of knownPaths) {
    if (candidate.startsWith('/') && fs.existsSync(candidate)) {
      return candidate;
    }
  }
  return null;
}
// Parse a "WIDTH,HEIGHT" resolution string; malformed or missing components
// fall back to 1440x2000.
function parseResolution(resolution) {
  const parts = resolution.split(',').map(part => parseInt(part.trim(), 10));
  return { width: parts[0] || 1440, height: parts[1] || 2000 };
}
// Print `url` to pdf/output.pdf.
// Prefers attaching to an existing Chrome session (left by the chrome_session
// extractor) over launching a fresh browser. Returns { success, output } on
// success or { success: false, error } on failure; never throws.
async function printToPdf(url) {
  // CHROME_*-prefixed vars take precedence over the generic fallbacks
  const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
  const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
  const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
  const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
  const headless = getEnvBool('CHROME_HEADLESS', true);
  const { width, height } = parseResolution(resolution);
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  let browser = null;
  let page = null;
  let connectedToSession = false;
  try {
    // Try to connect to existing Chrome session
    const cdpUrl = getCdpUrl();
    if (cdpUrl) {
      try {
        browser = await puppeteer.connect({
          browserWSEndpoint: cdpUrl,
          defaultViewport: { width, height },
        });
        connectedToSession = true;
        // Get existing pages or create new one.
        // NOTE(review): this path never navigates — it assumes the session's
        // first http(s) page is already showing `url`; confirm upstream hook
        // ordering guarantees that.
        const pages = await browser.pages();
        page = pages.find(p => p.url().startsWith('http')) || pages[0];
        if (!page) {
          page = await browser.newPage();
        }
        // Set viewport on the page
        await page.setViewport({ width, height });
      } catch (e) {
        console.error(`Failed to connect to CDP session: ${e.message}`);
        browser = null; // fall through to launching our own browser
      }
    }
    // Fall back to launching new browser
    if (!browser) {
      const executablePath = findChrome();
      if (!executablePath) {
        return { success: false, error: 'Chrome binary not found' };
      }
      browser = await puppeteer.launch({
        executablePath,
        headless: headless ? 'new' : false,
        args: [
          '--no-sandbox',
          '--disable-setuid-sandbox',
          '--disable-dev-shm-usage',
          '--disable-gpu',
          `--window-size=${width},${height}`,
          ...(checkSsl ? [] : ['--ignore-certificate-errors']),
        ],
        defaultViewport: { width, height },
      });
      page = await browser.newPage();
      // Navigate to URL (only if we launched fresh browser)
      if (userAgent) {
        await page.setUserAgent(userAgent);
      }
      await page.goto(url, {
        waitUntil: 'networkidle2',
        timeout,
      });
    }
    // Print to PDF
    await page.pdf({
      path: outputPath,
      format: 'A4',
      printBackground: true,
      margin: {
        top: '0.5in',
        right: '0.5in',
        bottom: '0.5in',
        left: '0.5in',
      },
    });
    // page.pdf() writes the file itself; verify it actually landed on disk
    if (fs.existsSync(outputPath) && fs.statSync(outputPath).size > 0) {
      return { success: true, output: outputPath };
    } else {
      return { success: false, error: 'PDF file not created' };
    }
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Only close browser if we launched it (not if we connected to session)
    if (browser && !connectedToSession) {
      await browser.close();
    }
  }
}
// CLI entry point: validates args, runs the PDF extraction, and prints the
// orchestrator protocol lines (START_TS/END_TS/DURATION/OUTPUT/STATUS/ERROR
// plus a machine-readable RESULT_JSON line). Exits 0 on success/skip, 1 otherwise.
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__22_pdf.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  try {
    // Check if staticfile extractor already handled this (permanent skip)
    if (hasStaticFileOutput()) {
      console.log(`Skipping PDF - staticfile extractor already downloaded this`);
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${new Date().toISOString()}`);
      console.log(`STATUS=skipped`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status: 'skipped', url, snapshot_id: snapshotId})}`);
      process.exit(0); // Permanent skip - staticfile already handled
    } else {
      const result = await printToPdf(url);
      if (result.success) {
        status = 'succeeded';
        output = result.output;
        const size = fs.statSync(output).size;
        console.log(`PDF saved (${size} bytes)`);
      } else {
        status = 'failed';
        error = result.error;
      }
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results (one KEY=value line each, parsed by the orchestrator)
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' ? 0 : 1);
}

// Top-level safety net: any unhandled rejection becomes a clean exit-1
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,76 @@
#!/usr/bin/env python3
"""
Install a binary using pip package manager.
Usage: on_Dependency__install_using_pip_provider.py --dependency-id=<uuid> --bin-name=<name> [--custom-cmd=<cmd>]
Output: InstalledBinary JSONL record to stdout after installation
Environment variables:
MACHINE_ID: Machine UUID (set by orchestrator)
"""
import json
import os
import sys
import rich_click as click
from abx_pkg import Binary, PipProvider, BinProviderOverrides
# Fix pydantic forward reference issue
PipProvider.model_rebuild()
@click.command()
@click.option('--dependency-id', required=True, help="Dependency UUID")
@click.option('--bin-name', required=True, help="Binary name to install")
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', default=None, help="Custom install command")
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None):
    """Install binary using pip.

    Exits 0 (a skip, not an error) when pip is not in the allowed provider list,
    exits 1 on actual install failures. On success, writes one InstalledBinary
    JSONL record to stdout; all human-readable logging goes to stderr so stdout
    stays machine-parseable.
    """
    if bin_providers != '*' and 'pip' not in bin_providers.split(','):
        click.echo(f"pip provider not allowed for {bin_name}", err=True)
        sys.exit(0)  # not an error: a different provider hook will handle it
    # Use abx-pkg PipProvider to install binary
    provider = PipProvider()
    if not provider.INSTALLER_BIN:
        click.echo("pip not available on this system", err=True)
        sys.exit(1)
    click.echo(f"Installing {bin_name} via pip...", err=True)
    # NOTE(review): custom_cmd is accepted but never used below — presumably it
    # should override the default install invocation; confirm intended behavior.
    try:
        binary = Binary(name=bin_name, binproviders=[provider]).install()
    except Exception as e:
        click.echo(f"pip install failed: {e}", err=True)
        sys.exit(1)
    if not binary.abspath:
        click.echo(f"{bin_name} not found after pip install", err=True)
        sys.exit(1)
    machine_id = os.environ.get('MACHINE_ID', '')
    # Output InstalledBinary JSONL record to stdout
    record = {
        'type': 'InstalledBinary',
        'name': bin_name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'pip',
        'machine_id': machine_id,
        'dependency_id': dependency_id,
    }
    print(json.dumps(record))
    # Log human-readable info to stderr
    click.echo(f"Installed {bin_name} at {binary.abspath}", err=True)
    click.echo(f"  version: {binary.version}", err=True)
    sys.exit(0)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,29 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_READABILITY": {
"type": "boolean",
"default": true,
"description": "Enable Readability text extraction"
},
"READABILITY_BINARY": {
"type": "string",
"default": "readability-extractor",
"description": "Path to readability-extractor binary"
},
"NODE_BINARY": {
"type": "string",
"default": "node",
"description": "Path to Node.js binary"
},
"READABILITY_TIMEOUT": {
"type": "integer",
"default": 30,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for Readability in seconds"
}
}
}

View File

@@ -0,0 +1,219 @@
#!/usr/bin/env python3
"""
Extract article content using Mozilla's Readability.
Usage: on_Snapshot__<NN>_readability.py --url=<url> --snapshot-id=<uuid>
Output: Creates readability/ directory with content.html, content.txt, article.json
Environment variables:
READABILITY_BINARY: Path to readability-cli binary
TIMEOUT: Timeout in seconds (default: 60)
Note: Requires readability-cli: npm install -g readability-cli
This extractor looks for HTML source from other extractors (wget, singlefile, dom)
"""
import json
import os
import shutil
import subprocess
import sys
import tempfile
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'readability'
BIN_NAME = 'readability-cli'
BIN_PROVIDERS = 'npm,env'
OUTPUT_DIR = 'readability'
def get_env(name: str, default: str = '') -> str:
    """Return the named environment variable, whitespace-stripped, or default when unset."""
    return os.environ.get(name, default).strip()


def get_env_int(name: str, default: int = 0) -> int:
    """Return the named environment variable parsed as an int, or default when unset/invalid."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
def find_readability() -> str | None:
    """Locate the readability CLI: READABILITY_BINARY env override first, then PATH."""
    configured = get_env('READABILITY_BINARY')
    if configured and os.path.isfile(configured):
        return configured
    # The npm package installs the binary as 'readable' in some versions,
    # so probe both names on PATH
    for candidate in ('readability-cli', 'readable'):
        found = shutil.which(candidate)
        if found:
            return found
    return None
def get_version(binary: str) -> str:
    """Best-effort `--version` probe of the binary; returns '' on any failure."""
    try:
        proc = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
        # Truncate to keep downstream VERSION= log lines bounded
        return proc.stdout.strip()[:64]
    except Exception:
        return ''
def find_html_source() -> str | None:
"""Find HTML content from other extractors in the snapshot directory."""
# Hooks run in snapshot_dir, sibling extractor outputs are in subdirectories
search_patterns = [
'singlefile/singlefile.html',
'singlefile/*.html',
'dom/output.html',
'dom/*.html',
'wget/**/*.html',
'wget/**/*.htm',
]
cwd = Path.cwd()
for pattern in search_patterns:
matches = list(cwd.glob(pattern))
for match in matches:
if match.is_file() and match.stat().st_size > 0:
return str(match)
return None
def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Extract article using Readability.

    Never fetches `url` itself — it operates on HTML already saved by an
    earlier extractor (singlefile/dom/wget).
    Returns: (success, output_path, error_message)
    """
    timeout = get_env_int('TIMEOUT', 60)
    # Find HTML source
    html_source = find_html_source()
    if not html_source:
        return False, None, 'No HTML source found (run singlefile, dom, or wget first)'
    # Create output directory
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(exist_ok=True)
    try:
        # Run readability-cli; --json makes it emit metadata + content as one JSON doc
        cmd = [binary, '--json', html_source]
        result = subprocess.run(cmd, capture_output=True, timeout=timeout)
        if result.returncode != 0:
            stderr = result.stderr.decode('utf-8', errors='replace')
            return False, None, f'readability-cli failed: {stderr[:200]}'
        # Parse JSON output
        try:
            result_json = json.loads(result.stdout)
        except json.JSONDecodeError:
            return False, None, 'readability-cli returned invalid JSON'
        # Extract and save content
        # readability-cli v2.x uses hyphenated field names; note the inner pop()
        # runs unconditionally, so BOTH the hyphenated and camelCase keys are
        # removed from result_json (neither leaks into article.json)
        text_content = result_json.pop('text-content', result_json.pop('textContent', ''))
        html_content = result_json.pop('html-content', result_json.pop('content', ''))
        if not text_content and not html_content:
            return False, None, 'No content extracted'
        (output_dir / 'content.html').write_text(html_content, encoding='utf-8')
        (output_dir / 'content.txt').write_text(text_content, encoding='utf-8')
        # Remaining keys (title, byline, ...) become the article metadata file
        (output_dir / 'article.json').write_text(json.dumps(result_json, indent=2), encoding='utf-8')
        return True, OUTPUT_DIR, ''
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='URL to extract article from')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Extract article content using Mozilla's Readability.

    Emits the orchestrator protocol on stdout (START_TS/END_TS/DURATION/CMD/
    VERSION/OUTPUT/STATUS plus a RESULT_JSON line); errors go to stderr.
    Exits 0 on success, 1 on failure or missing dependency.
    """
    start_ts = datetime.now(timezone.utc)
    version = ''
    output = None
    status = 'failed'
    error = ''
    binary = None
    try:
        # Find binary
        binary = find_readability()
        if not binary:
            # DEPENDENCY_NEEDED/BIN_PROVIDERS lines tell the orchestrator how
            # to install the missing tool before retrying this hook
            print(f'ERROR: readability-cli binary not found', file=sys.stderr)
            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
            sys.exit(1)
        version = get_version(binary)
        # Run extraction
        success, output, error = extract_readability(url, binary)
        status = 'succeeded' if success else 'failed'
        if success:
            text_file = Path(output) / 'content.txt'
            html_file = Path(output) / 'content.html'
            text_len = text_file.stat().st_size if text_file.exists() else 0
            html_len = html_file.stat().st_size if html_file.exists() else 0
            print(f'Readability extracted: {text_len} chars text, {html_len} chars HTML')
    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if binary:
        print(f'CMD={binary} --json <html>')
    if version:
        print(f'VERSION={version}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,254 @@
"""
Integration tests for readability plugin
Tests verify:
1. Plugin reports missing dependency correctly
2. readability-cli can be installed via npm (note: package name != binary name)
3. Extraction works against real example.com content
"""
import json
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.py'))
TEST_URL = 'https://example.com'
def create_example_html(tmpdir: Path) -> Path:
    """Write an example.com-like page under tmpdir/singlefile/ and return its path.

    The fixture carries enough paragraph text for Readability to accept it
    as a real article rather than rejecting it as boilerplate.
    """
    out_dir = tmpdir / 'singlefile'
    out_dir.mkdir()
    out_file = out_dir / 'singlefile.html'
    out_file.write_text('''
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Example Domain</title>
<meta name="viewport" content="width=device-width, initial-scale=1">
</head>
<body>
<article>
<header>
<h1>Example Domain</h1>
</header>
<div class="content">
<p>This domain is for use in illustrative examples in documents. You may use this
domain in literature without prior coordination or asking for permission.</p>
<p>Example domains are maintained by the Internet Assigned Numbers Authority (IANA)
to provide a well-known address for documentation purposes. This helps authors create
examples that readers can understand without confusion about actual domain ownership.</p>
<p>The practice of using example domains dates back to the early days of the internet.
These reserved domains ensure that example code and documentation doesn't accidentally
point to real, active websites that might change or disappear over time.</p>
<p>For more information about example domains and their history, you can visit the
IANA website. They maintain several example domains including example.com, example.net,
and example.org, all specifically reserved for this purpose.</p>
<p><a href="https://www.iana.org/domains/example">More information about example domains...</a></p>
</div>
</article>
</body>
</html>
''')
    return out_file
def test_hook_script_exists():
    """Verify hook script exists (glob at import time resolved to a real path)."""
    assert READABILITY_HOOK.exists(), f"Hook script not found: {READABILITY_HOOK}"
def test_reports_missing_dependency_when_not_installed():
    """Test that script reports DEPENDENCY_NEEDED when readability-cli is not found."""
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Create HTML source so it doesn't fail on missing HTML
        create_example_html(tmpdir)
        # Run with empty PATH so binary won't be found
        env = {'PATH': '/nonexistent', 'HOME': str(tmpdir)}
        result = subprocess.run(
            [sys.executable, str(READABILITY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env
        )
        # Should fail and report missing dependency
        assert result.returncode != 0, "Should exit non-zero when dependency missing"
        combined = result.stdout + result.stderr
        assert 'DEPENDENCY_NEEDED' in combined, "Should output DEPENDENCY_NEEDED"
        assert 'readability-cli' in combined or 'BIN_NAME' in combined, "Should mention readability-cli"
def test_can_install_readability_via_npm():
    """Test that readability-cli can be installed via npm and binary becomes available.

    Note: The npm package 'readability-cli' installs a binary named 'readable',
    so we test the full installation flow using npm install directly.
    Requires network access and a writable global npm prefix.
    """
    # Check npm is available
    if not shutil.which('npm'):
        pytest.skip("npm not available on this system")
    # Install readability-cli package via npm
    # The orchestrator/dependency hooks would call this via npm provider
    result = subprocess.run(
        ['npm', 'install', '-g', 'readability-cli'],
        capture_output=True,
        text=True,
        timeout=300
    )
    assert result.returncode == 0, f"npm install failed: {result.stderr}"
    # Verify the 'readable' binary is now available
    # (readability-cli package installs as 'readable' not 'readability-cli')
    result = subprocess.run(['which', 'readable'], capture_output=True, text=True)
    assert result.returncode == 0, "readable binary not found after npm install"
    binary_path = result.stdout.strip()
    assert Path(binary_path).exists(), f"Binary should exist at {binary_path}"
    # Test that it's executable and responds to --version
    result = subprocess.run(
        [binary_path, '--version'],
        capture_output=True,
        text=True,
        timeout=10
    )
    assert result.returncode == 0, f"Binary not executable: {result.stderr}"
def test_extracts_article_after_installation():
    """Test full workflow: ensure readability-cli installed then extract from example.com HTML.

    Uses a local HTML fixture (no network fetch by the hook itself); only the
    npm install step needs network access.
    """
    # Check npm is available
    if not shutil.which('npm'):
        pytest.skip("npm not available on this system")
    # Ensure readability-cli is installed (orchestrator would handle this)
    install_result = subprocess.run(
        ['npm', 'install', '-g', 'readability-cli'],
        capture_output=True,
        text=True,
        timeout=300
    )
    if install_result.returncode != 0:
        pytest.skip(f"Could not install readability-cli: {install_result.stderr}")
    # Now test extraction
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Create example.com HTML for readability to process
        create_example_html(tmpdir)
        # Run readability extraction (should find the installed binary)
        result = subprocess.run(
            [sys.executable, str(READABILITY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=30
        )
        assert result.returncode == 0, f"Extraction failed: {result.stderr}"
        # Verify output directory created
        readability_dir = tmpdir / 'readability'
        assert readability_dir.exists(), "Output directory not created"
        # Verify output files exist
        html_file = readability_dir / 'content.html'
        txt_file = readability_dir / 'content.txt'
        json_file = readability_dir / 'article.json'
        assert html_file.exists(), "content.html not created"
        assert txt_file.exists(), "content.txt not created"
        assert json_file.exists(), "article.json not created"
        # Verify HTML content contains REAL example.com text
        html_content = html_file.read_text()
        assert len(html_content) > 100, f"HTML content too short: {len(html_content)} bytes"
        assert 'example domain' in html_content.lower(), "Missing 'Example Domain' in HTML"
        assert ('illustrative examples' in html_content.lower() or
                'use in' in html_content.lower() or
                'literature' in html_content.lower()), \
            "Missing example.com description in HTML"
        # Verify text content contains REAL example.com text
        txt_content = txt_file.read_text()
        assert len(txt_content) > 50, f"Text content too short: {len(txt_content)} bytes"
        assert 'example' in txt_content.lower(), "Missing 'example' in text"
        # Verify JSON metadata
        json_data = json.loads(json_file.read_text())
        assert isinstance(json_data, dict), "article.json should be a dict"
        # Verify stdout contains expected output
        assert 'STATUS=succeeded' in result.stdout, "Should report success"
        assert 'OUTPUT=readability' in result.stdout, "Should report output directory"
def test_fails_gracefully_without_html_source():
    """Test that extraction fails gracefully when no HTML source is available."""
    # The readability hook needs npm-installed readability-cli; skip if we
    # cannot satisfy that prerequisite on this machine.
    if shutil.which('npm') is None:
        pytest.skip("npm not available on this system")
    install_result = subprocess.run(
        ['npm', 'install', '-g', 'readability-cli'],
        capture_output=True,
        text=True,
        timeout=300,
    )
    if install_result.returncode != 0:
        pytest.skip("Could not install readability-cli")
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        # Deliberately provide no HTML source files in the working directory.
        result = subprocess.run(
            [sys.executable, str(READABILITY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=30,
        )
        assert result.returncode != 0, "Should fail without HTML source"
        combined_output = result.stdout + result.stderr
        lowered = combined_output.lower()
        reported = (
            'no html source' in lowered
            or 'not found' in lowered
            or 'ERROR=' in combined_output
        )
        assert reported, "Should report missing HTML source"
# Allow running this test module directly (without the pytest CLI wrapper).
if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -0,0 +1,281 @@
#!/usr/bin/env node
/**
* Track complete redirect chains for a URL.
*
* Captures:
* - HTTP redirects (301, 302, 303, 307, 308)
* - Meta refresh redirects
* - JavaScript redirects (basic detection)
* - Full redirect chain with timestamps
*
* Usage: on_Snapshot__15_redirects.js --url=<url> --snapshot-id=<uuid>
* Output: Writes redirects/redirects.json
*
* Environment variables:
* SAVE_REDIRECTS: Enable redirect tracking (default: true)
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'redirects';          // reported in RESULT_JSON
const OUTPUT_DIR = 'redirects';              // created relative to the snapshot cwd
const OUTPUT_FILE = 'redirects.json';        // redirect chain is written here
const CHROME_SESSION_DIR = 'chrome_session'; // where chrome_session publishes cdp_url.txt
// Parse `--key=value` / `--flag` command line arguments into an object.
// Dashes in key names become underscores; bare flags get the value `true`.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...rest] = token.slice(2).split('=');
    parsed[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return parsed;
}
// Read an environment variable, falling back to defaultValue, always trimmed.
function getEnv(name, defaultValue = '') {
  const value = process.env[name] || defaultValue;
  return value.trim();
}
// Interpret an environment variable as a boolean flag; unrecognized or
// missing values yield defaultValue.
function getEnvBool(name, defaultValue = false) {
  switch (getEnv(name, '').toLowerCase()) {
    case 'true':
    case '1':
    case 'yes':
    case 'on':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
// Read the Chrome DevTools endpoint published by the chrome_session
// extractor; returns null when no session file exists.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(cdpFile)) {
    return null;
  }
  return fs.readFileSync(cdpFile, 'utf8').trim();
}
// Track redirect chain
/**
 * Collect HTTP / meta-refresh / JavaScript redirect evidence for `url` using
 * the page already loaded by the chrome_session extractor, and write the
 * result to redirects/redirects.json.
 *
 * @param {string} url - The original (pre-redirect) URL being archived.
 * @returns {Promise<{success: boolean, output?: string, redirectData?: object, error?: string}>}
 */
async function trackRedirects(url) {
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  let browser = null;
  const redirectChain = [];
  try {
    // Connect to existing Chrome session
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
    }
    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });
    // Get the page
    const pages = await browser.pages();
    const page = pages.find(p => p.url().startsWith('http')) || pages[0];
    if (!page) {
      return { success: false, error: 'No page found in Chrome session' };
    }
    // Track all responses to capture redirects
    // NOTE(review): this listener is attached after chrome_session has already
    // navigated the page, so it only observes traffic occurring after this
    // point; the original navigation's 3xx hops may be missed — confirm intent.
    page.on('response', async (response) => {
      const status = response.status();
      const responseUrl = response.url();
      const headers = response.headers();
      // Check if it's a redirect
      if (status >= 300 && status < 400) {
        redirectChain.push({
          timestamp: new Date().toISOString(),
          url: responseUrl,
          status,
          statusText: response.statusText(),
          location: headers['location'] || headers['Location'] || '',
          type: 'http',
        });
      }
    });
    // Get the current URL (which is the final destination after redirects)
    const finalUrl = page.url();
    // Check for meta refresh redirects
    const metaRefresh = await page.evaluate(() => {
      const meta = document.querySelector('meta[http-equiv="refresh"]');
      if (meta) {
        const content = meta.getAttribute('content') || '';
        const match = content.match(/url=['"]?([^'"]+)['"]?/i);
        return {
          content,
          url: match ? match[1] : null,
        };
      }
      return null;
    });
    if (metaRefresh && metaRefresh.url) {
      redirectChain.push({
        timestamp: new Date().toISOString(),
        url: finalUrl,
        status: null,
        statusText: 'Meta Refresh',
        location: metaRefresh.url,
        type: 'meta_refresh',
        content: metaRefresh.content,
      });
    }
    // Check for JavaScript redirects (basic detection)
    // NOTE(review): this regex scan of the serialized DOM only detects
    // literal assignment patterns; dynamic redirects will not be found.
    const jsRedirect = await page.evaluate(() => {
      // Check for common JavaScript redirect patterns
      const html = document.documentElement.outerHTML;
      const patterns = [
        /window\.location\s*=\s*['"]([^'"]+)['"]/i,
        /window\.location\.href\s*=\s*['"]([^'"]+)['"]/i,
        /window\.location\.replace\s*\(\s*['"]([^'"]+)['"]\s*\)/i,
        /document\.location\s*=\s*['"]([^'"]+)['"]/i,
      ];
      for (const pattern of patterns) {
        const match = html.match(pattern);
        if (match) {
          return {
            pattern: pattern.toString(),
            url: match[1],
          };
        }
      }
      return null;
    });
    if (jsRedirect && jsRedirect.url) {
      redirectChain.push({
        timestamp: new Date().toISOString(),
        url: finalUrl,
        status: null,
        statusText: 'JavaScript Redirect',
        location: jsRedirect.url,
        type: 'javascript',
        pattern: jsRedirect.pattern,
      });
    }
    const redirectData = {
      original_url: url,
      final_url: finalUrl,
      redirect_count: redirectChain.length,
      redirects: redirectChain,
      is_redirect: redirectChain.length > 0,
    };
    // Write output
    fs.writeFileSync(outputPath, JSON.stringify(redirectData, null, 2));
    return { success: true, output: outputPath, redirectData };
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Disconnect only — the shared session browser must stay alive for
    // other extractors.
    if (browser) {
      browser.disconnect();
    }
  }
}
/**
 * CLI entrypoint: parse --url/--snapshot-id, honor SAVE_REDIRECTS, run
 * trackRedirects(), and emit the START_TS/END_TS/STATUS/RESULT_JSON
 * key=value contract on stdout for the orchestrator to parse.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__15_redirects.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  try {
    // Check if enabled
    if (!getEnvBool('SAVE_REDIRECTS', true)) {
      console.log('Skipping redirects (SAVE_REDIRECTS=False)');
      status = 'skipped';
      const endTs = new Date();
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${endTs.toISOString()}`);
      console.log(`STATUS=${status}`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
      process.exit(0);
    }
    const result = await trackRedirects(url);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      const redirectCount = result.redirectData.redirect_count;
      const finalUrl = result.redirectData.final_url;
      if (redirectCount > 0) {
        console.log(`Tracked ${redirectCount} redirect(s) to: ${finalUrl}`);
      } else {
        console.log('No redirects detected');
      }
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  // Exit code signals success/failure to the hook runner.
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Top-level entry: surface any unhandled rejection as a fatal non-zero exit.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,381 @@
#!/usr/bin/env node
/**
* Archive all network responses during page load.
*
* Connects to Chrome session and captures ALL network responses (XHR, images, scripts, etc.)
* Saves them in an organized directory structure with both timestamped unique files
* and URL-organized symlinks.
*
* Usage: on_Snapshot__23_responses.js --url=<url> --snapshot-id=<uuid>
* Output: Creates responses/ directory with:
* - all/<timestamp>__<METHOD>__<URL>.<ext>: Timestamped unique files
* - <type>/<domain>/<path>/: URL-organized symlinks by resource type
* - index.jsonl: Searchable index of all responses
*
* Environment variables:
* SAVE_RESPONSES: Enable response archiving (default: true)
* RESPONSES_TIMEOUT: Timeout in seconds (default: 120)
* RESPONSES_TYPES: Comma-separated resource types to save (default: all)
* Options: script,stylesheet,font,image,media,xhr,websocket,document
*/
const fs = require('fs');
const path = require('path');
const crypto = require('crypto');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'responses';          // reported in RESULT_JSON
const OUTPUT_DIR = 'responses';              // created relative to the snapshot cwd
const CHROME_SESSION_DIR = 'chrome_session'; // where chrome_session publishes cdp_url.txt
// Resource types to capture (by default, capture everything)
const DEFAULT_TYPES = ['script', 'stylesheet', 'font', 'image', 'media', 'xhr', 'websocket'];
// Parse `--key=value` / `--flag` command line arguments into an object.
// Dashes in key names become underscores; bare flags get the value `true`.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...rest] = token.slice(2).split('=');
    parsed[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return parsed;
}
// Read an environment variable, falling back to defaultValue, always trimmed.
function getEnv(name, defaultValue = '') {
  const value = process.env[name] || defaultValue;
  return value.trim();
}
// Interpret an environment variable as a boolean flag; unrecognized or
// missing values yield defaultValue.
function getEnvBool(name, defaultValue = false) {
  switch (getEnv(name, '').toLowerCase()) {
    case 'true':
    case '1':
    case 'yes':
    case 'on':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
// Parse an environment variable as a base-10 integer, with fallback.
function getEnvInt(name, defaultValue = 0) {
  const parsed = Number.parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
// Read the Chrome DevTools endpoint published by the chrome_session
// extractor; returns null when no session file exists.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(cdpFile)) {
    return null;
  }
  return fs.readFileSync(cdpFile, 'utf8').trim();
}
// Map a Content-Type header value to a file extension ('' when unknown).
// Parameters after ';' (e.g. charset) are ignored.
function getExtensionFromMimeType(mimeType) {
  const base = (mimeType || '').split(';')[0].trim().toLowerCase();
  const table = new Map([
    ['text/html', 'html'],
    ['text/css', 'css'],
    ['text/javascript', 'js'],
    ['application/javascript', 'js'],
    ['application/x-javascript', 'js'],
    ['application/json', 'json'],
    ['application/xml', 'xml'],
    ['text/xml', 'xml'],
    ['image/png', 'png'],
    ['image/jpeg', 'jpg'],
    ['image/gif', 'gif'],
    ['image/svg+xml', 'svg'],
    ['image/webp', 'webp'],
    ['font/woff', 'woff'],
    ['font/woff2', 'woff2'],
    ['font/ttf', 'ttf'],
    ['font/otf', 'otf'],
    ['application/font-woff', 'woff'],
    ['application/font-woff2', 'woff2'],
    ['video/mp4', 'mp4'],
    ['video/webm', 'webm'],
    ['audio/mpeg', 'mp3'],
    ['audio/ogg', 'ogg'],
  ]);
  return table.get(base) || '';
}
// Extract a lowercase file extension from a URL's pathname.
// Returns '' for extensionless paths or unparseable URLs.
function getExtensionFromUrl(url) {
  try {
    const { pathname } = new URL(url);
    const found = pathname.match(/\.([a-z0-9]+)$/i);
    if (found === null) {
      return '';
    }
    return found[1].toLowerCase();
  } catch (err) {
    return '';
  }
}
// Replace filesystem-unsafe characters with '_' and cap the length.
function sanitizeFilename(str, maxLen = 200) {
  const safe = str.replace(/[^a-zA-Z0-9._-]/g, '_');
  return safe.slice(0, maxLen);
}
// Create symlink (handle errors gracefully)
/**
 * Create (or replace) a relative symlink at linkPath pointing to target.
 * Best-effort: any failure is logged and swallowed so a single bad link
 * never aborts the archiving run.
 *
 * @param {string} target - Existing file the link should resolve to.
 * @param {string} linkPath - Where to create the symlink.
 */
async function createSymlink(target, linkPath) {
  try {
    // Create parent directory (no-op when it already exists)
    const dir = path.dirname(linkPath);
    fs.mkdirSync(dir, { recursive: true });
    // Remove existing symlink/file if present.
    // BUG FIX: fs.existsSync() follows symlinks and returns false for a
    // dangling link, so stale broken links were never removed and
    // symlinkSync() then threw EEXIST. rmSync with force removes regular
    // files and (broken) symlinks alike, and is a no-op when nothing exists.
    fs.rmSync(linkPath, { force: true });
    // Create relative symlink so the archive stays relocatable
    const relativePath = path.relative(dir, target);
    fs.symlinkSync(relativePath, linkPath);
  } catch (e) {
    // Ignore symlink errors (file conflicts, permissions, etc.)
    console.error(`Failed to create symlink: ${e.message}`);
  }
}
// Archive responses by intercepting network traffic
/**
 * Capture network responses from the shared Chrome session and archive them
 * under responses/: one timestamped unique file per response in all/, a
 * per-resource-type symlink tree organized by URL, and an index.jsonl line
 * per saved response.
 *
 * @param {string} originalUrl - URL being archived.
 *   NOTE(review): this parameter is currently unused in the body — confirm.
 * @returns {Promise<{success: boolean, output?: string, savedCount?: number, indexPath?: string, error?: string}>}
 */
async function archiveResponses(originalUrl) {
  // NOTE(review): `timeout` is computed but never used below — the capture
  // window is the fixed 2s sleep near the end. Confirm intent.
  const timeout = (getEnvInt('RESPONSES_TIMEOUT') || getEnvInt('TIMEOUT', 120)) * 1000;
  const typesStr = getEnv('RESPONSES_TYPES', DEFAULT_TYPES.join(','));
  const typesToSave = typesStr.split(',').map(t => t.trim().toLowerCase());
  // Create output directories
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const allDir = path.join(OUTPUT_DIR, 'all');
  if (!fs.existsSync(allDir)) {
    fs.mkdirSync(allDir, { recursive: true });
  }
  // Create index file
  const indexPath = path.join(OUTPUT_DIR, 'index.jsonl');
  fs.writeFileSync(indexPath, ''); // Clear existing
  let browser = null;
  let savedCount = 0;
  const savedResponses = [];
  try {
    // Connect to existing Chrome session
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
    }
    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });
    // Get the page
    const pages = await browser.pages();
    const page = pages.find(p => p.url().startsWith('http')) || pages[0];
    if (!page) {
      return { success: false, error: 'No page found in Chrome session' };
    }
    // Enable request interception
    await page.setRequestInterception(false); // Don't block requests
    // Listen for responses
    // NOTE(review): attached after chrome_session loaded the page, so only
    // traffic occurring during the 2s window below is captured.
    page.on('response', async (response) => {
      try {
        const request = response.request();
        const url = response.url();
        const resourceType = request.resourceType().toLowerCase();
        const method = request.method();
        const status = response.status();
        // Skip redirects and errors
        if (status >= 300 && status < 400) return;
        if (status >= 400 && status < 600) return;
        // Check if we should save this resource type
        if (typesToSave.length && !typesToSave.includes(resourceType)) {
          return;
        }
        // Get response body
        let bodyBuffer = null;
        try {
          bodyBuffer = await response.buffer();
        } catch (e) {
          // Some responses can't be captured (already consumed, etc.)
          return;
        }
        if (!bodyBuffer || bodyBuffer.length === 0) {
          return;
        }
        // Determine file extension (MIME type first, URL path as fallback)
        const mimeType = response.headers()['content-type'] || '';
        let extension = getExtensionFromMimeType(mimeType) || getExtensionFromUrl(url);
        // Create timestamp-based unique filename
        const timestamp = new Date().toISOString().replace(/[-:]/g, '').replace(/\..+/, '');
        const urlHash = sanitizeFilename(encodeURIComponent(url).slice(0, 64));
        const uniqueFilename = `${timestamp}__${method}__${urlHash}${extension ? '.' + extension : ''}`;
        const uniquePath = path.join(allDir, uniqueFilename);
        // Save to unique file
        fs.writeFileSync(uniquePath, bodyBuffer);
        // Create URL-organized symlink
        try {
          const urlObj = new URL(url);
          const hostname = urlObj.hostname;
          const pathname = urlObj.pathname || '/';
          const filename = path.basename(pathname) || 'index' + (extension ? '.' + extension : '');
          const dirPath = path.dirname(pathname);
          // Create symlink: responses/<type>/<hostname>/<path>/<filename>
          const symlinkDir = path.join(OUTPUT_DIR, resourceType, hostname, dirPath);
          const symlinkPath = path.join(symlinkDir, filename);
          await createSymlink(uniquePath, symlinkPath);
        } catch (e) {
          // URL parsing or symlink creation failed, skip
        }
        // Calculate SHA256 of both the body and the URL for the index
        const sha256 = crypto.createHash('sha256').update(bodyBuffer).digest('hex');
        const urlSha256 = crypto.createHash('sha256').update(url).digest('hex');
        // Write to index
        const indexEntry = {
          ts: timestamp,
          method,
          url: method === 'DATA' ? url.slice(0, 128) : url, // Truncate data: URLs
          urlSha256,
          status,
          resourceType,
          mimeType: mimeType.split(';')[0],
          responseSha256: sha256,
          path: './' + path.relative(OUTPUT_DIR, uniquePath),
          extension,
        };
        fs.appendFileSync(indexPath, JSON.stringify(indexEntry) + '\n');
        // NOTE(review): savedResponses is accumulated but not returned/used.
        savedResponses.push(indexEntry);
        savedCount++;
      } catch (e) {
        // Log but don't fail the whole extraction
        console.error(`Error capturing response: ${e.message}`);
      }
    });
    // Wait a bit to ensure we capture responses
    // (chrome_session already loaded the page, just capture any remaining traffic)
    await new Promise(resolve => setTimeout(resolve, 2000));
    return {
      success: true,
      output: OUTPUT_DIR,
      savedCount,
      indexPath,
    };
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Disconnect only — the shared session browser must stay alive.
    if (browser) {
      browser.disconnect();
    }
  }
}
/**
 * CLI entrypoint: parse --url/--snapshot-id, honor SAVE_RESPONSES, run
 * archiveResponses(), and emit the START_TS/END_TS/STATUS/RESULT_JSON
 * key=value contract on stdout for the orchestrator to parse.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__23_responses.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  let savedCount = 0;
  try {
    // Check if enabled
    if (!getEnvBool('SAVE_RESPONSES', true)) {
      console.log('Skipping responses (SAVE_RESPONSES=False)');
      status = 'skipped';
      const endTs = new Date();
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${endTs.toISOString()}`);
      console.log(`STATUS=${status}`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
      process.exit(0);
    }
    const result = await archiveResponses(url);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      savedCount = result.savedCount || 0;
      console.log(`Saved ${savedCount} network responses to ${output}/`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    saved_count: savedCount,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  // Exit code signals success/failure to the hook runner.
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Top-level entry: surface any unhandled rejection as a fatal non-zero exit.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,90 @@
#!/bin/bash
# Run all plugin tests
#
# Usage: ./run_all_tests.sh
set -e
echo "=========================================="
echo "Running All Plugin Tests"
echo "=========================================="
echo ""
# Color codes
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
# Track results
TOTAL_TESTS=0
PASSED_TESTS=0
FAILED_TESTS=0

# Run one node test file and update the pass/fail counters.
# $1: path to the test file, e.g. ublock/tests/test_ublock.js
run_test_suite() {
    local test_file="$1"
    local test_name
    # BUG FIX: the suite name is the plugin directory, two levels up from the
    # test file ("<plugin>/tests/test_x.js"); a single dirname always yielded
    # the literal name "tests". Quote substitutions to survive spaces.
    test_name=$(basename "$(dirname "$(dirname "$test_file")")")
    echo -e "${YELLOW}[RUNNING]${NC} $test_name tests..."
    if node --test "$test_file" 2>&1; then
        echo -e "${GREEN}[PASSED]${NC} $test_name tests"
        PASSED_TESTS=$((PASSED_TESTS + 1))
    else
        echo -e "${RED}[FAILED]${NC} $test_name tests"
        FAILED_TESTS=$((FAILED_TESTS + 1))
    fi
    TOTAL_TESTS=$((TOTAL_TESTS + 1))
    echo ""
}

# Find and run all test files
echo "Finding test files..."
echo ""
# Chrome extensions utils tests
if [ -f "chrome_extensions/tests/test_chrome_extension_utils.js" ]; then
    run_test_suite "chrome_extensions/tests/test_chrome_extension_utils.js"
fi
# Captcha2 tests
if [ -f "captcha2/tests/test_captcha2_install.js" ]; then
    run_test_suite "captcha2/tests/test_captcha2_install.js"
fi
if [ -f "captcha2/tests/test_captcha2_config.js" ]; then
    run_test_suite "captcha2/tests/test_captcha2_config.js"
fi
# I Still Don't Care About Cookies tests
if [ -f "istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.js" ]; then
    run_test_suite "istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.js"
fi
# uBlock tests
if [ -f "ublock/tests/test_ublock.js" ]; then
    run_test_suite "ublock/tests/test_ublock.js"
fi
# SingleFile tests
if [ -f "singlefile/tests/test_singlefile.js" ]; then
    run_test_suite "singlefile/tests/test_singlefile.js"
fi
# Print summary
echo "=========================================="
echo "Test Summary"
echo "=========================================="
echo -e "Total test suites: $TOTAL_TESTS"
echo -e "${GREEN}Passed:${NC} $PASSED_TESTS"
echo -e "${RED}Failed:${NC} $FAILED_TESTS"
echo ""
if [ "$FAILED_TESTS" -eq 0 ]; then
    echo -e "${GREEN}✓ All tests passed!${NC}"
    exit 0
else
    echo -e "${RED}✗ Some tests failed${NC}"
    exit 1
fi

29
archivebox/plugins/run_tests.sh Executable file
View File

@@ -0,0 +1,29 @@
#!/bin/bash
# Run all plugin tests
#
# Usage: ./run_tests.sh [plugin_name]
#
# Examples:
#   ./run_tests.sh              # Run all tests
#   ./run_tests.sh captcha2     # Run only captcha2 tests
#   ./run_tests.sh chrome_*     # Run all chrome tests
set -e
echo "=========================================="
echo "Running ArchiveBox Plugin Tests"
echo "=========================================="
echo ""
# With an argument, run only that plugin's tests/ directory; otherwise run
# every */tests/test_*.py discovered by the shell glob.
if [ -n "$1" ]; then
    echo "Running tests for: $1"
    python -m pytest "$1"/tests/ -v
else
    echo "Running all plugin tests..."
    python -m pytest */tests/test_*.py -v
fi
echo ""
echo "=========================================="
echo "Tests Complete"
echo "=========================================="

View File

@@ -0,0 +1,291 @@
#!/usr/bin/env node
/**
* Take a screenshot of a URL using Chrome/Puppeteer.
*
* If a Chrome session exists (from chrome_session extractor), connects to it via CDP.
* Otherwise launches a new Chrome instance.
*
* Usage: on_Snapshot__21_screenshot.js --url=<url> --snapshot-id=<uuid>
* Output: Writes screenshot/screenshot.png
*
* Environment variables:
* CHROME_BINARY: Path to Chrome/Chromium binary
* CHROME_TIMEOUT: Timeout in seconds (default: 60)
* CHROME_RESOLUTION: Screenshot resolution (default: 1440,2000)
* CHROME_USER_AGENT: User agent string (optional)
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
* CHROME_HEADLESS: Run in headless mode (default: true)
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'screenshot';         // reported in RESULT_JSON
const OUTPUT_DIR = 'screenshot';             // created relative to the snapshot cwd
const OUTPUT_FILE = 'screenshot.png';        // full-page PNG is written here
const CHROME_SESSION_DIR = 'chrome_session'; // where chrome_session publishes cdp_url.txt
// Parse `--key=value` / `--flag` command line arguments into an object.
// Dashes in key names become underscores; bare flags get the value `true`.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...rest] = token.slice(2).split('=');
    parsed[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return parsed;
}
// Read an environment variable, falling back to defaultValue, always trimmed.
function getEnv(name, defaultValue = '') {
  const value = process.env[name] || defaultValue;
  return value.trim();
}
// Interpret an environment variable as a boolean flag; unrecognized or
// missing values yield defaultValue.
function getEnvBool(name, defaultValue = false) {
  switch (getEnv(name, '').toLowerCase()) {
    case 'true':
    case '1':
    case 'yes':
    case 'on':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
// Parse an environment variable as a base-10 integer, with fallback.
function getEnvInt(name, defaultValue = 0) {
  const parsed = Number.parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
// Check if staticfile extractor already downloaded this URL
// (presence of a non-empty staticfile/ directory in the snapshot cwd).
const STATICFILE_DIR = 'staticfile';
function hasStaticFileOutput() {
  if (!fs.existsSync(STATICFILE_DIR)) {
    return false;
  }
  return fs.readdirSync(STATICFILE_DIR).length > 0;
}
// Read the Chrome DevTools endpoint published by the chrome_session
// extractor, if any; returns null when no session file exists.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(cdpFile)) {
    return null;
  }
  return fs.readFileSync(cdpFile, 'utf8').trim();
}
// Find Chrome binary
/**
 * Locate a Chrome/Chromium executable.
 * Order: CHROME_BINARY env override (if it exists on disk), then well-known
 * absolute install paths, then bare binary names resolved against $PATH.
 *
 * @returns {string|null} Path to a Chrome binary, or null when none found.
 */
function findChrome() {
  const chromeBinary = getEnv('CHROME_BINARY');
  if (chromeBinary && fs.existsSync(chromeBinary)) {
    return chromeBinary;
  }
  const candidates = [
    // Linux
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/usr/bin/chromium',
    '/usr/bin/chromium-browser',
    // macOS
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
    // Common paths
    'google-chrome',
    'chromium',
  ];
  // BUG FIX: bare names like 'google-chrome' were previously dead entries
  // (the loop only checked candidates starting with '/'); resolve them
  // against $PATH so they can actually be found.
  const pathDirs = (process.env.PATH || '').split(path.delimiter).filter(Boolean);
  for (const candidate of candidates) {
    if (candidate.startsWith('/')) {
      if (fs.existsSync(candidate)) {
        return candidate;
      }
    } else {
      for (const dir of pathDirs) {
        const resolved = path.join(dir, candidate);
        if (fs.existsSync(resolved)) {
          return resolved;
        }
      }
    }
  }
  return null;
}
// Parse a "WIDTH,HEIGHT" resolution string into numbers, substituting the
// defaults 1440x2000 for any missing or unparseable component.
function parseResolution(resolution) {
  const [rawWidth, rawHeight] = resolution.split(',');
  const width = parseInt((rawWidth || '').trim(), 10);
  const height = parseInt((rawHeight || '').trim(), 10);
  return { width: width || 1440, height: height || 2000 };
}
/**
 * Capture a full-page PNG of `url` to screenshot/screenshot.png.
 * Prefers attaching to the shared chrome_session CDP endpoint (the page is
 * already loaded there); falls back to launching a fresh Chrome and
 * navigating itself.
 *
 * @param {string} url - URL to capture.
 * @returns {Promise<{success: boolean, output?: string, error?: string}>}
 */
async function takeScreenshot(url) {
  const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
  const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
  const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
  const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
  const headless = getEnvBool('CHROME_HEADLESS', true);
  const { width, height } = parseResolution(resolution);
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  let browser = null;
  let page = null;
  let connectedToSession = false;
  try {
    // Try to connect to existing Chrome session
    const cdpUrl = getCdpUrl();
    if (cdpUrl) {
      try {
        browser = await puppeteer.connect({
          browserWSEndpoint: cdpUrl,
          defaultViewport: { width, height },
        });
        connectedToSession = true;
        // Get existing pages or create new one
        const pages = await browser.pages();
        page = pages.find(p => p.url().startsWith('http')) || pages[0];
        if (!page) {
          page = await browser.newPage();
        }
        // Set viewport on the page
        await page.setViewport({ width, height });
      } catch (e) {
        console.error(`Failed to connect to CDP session: ${e.message}`);
        browser = null;
        // BUG FIX: reset the flag here, otherwise a fallback browser launched
        // below would never be closed in the finally block (process leak).
        connectedToSession = false;
      }
    }
    // Fall back to launching new browser
    if (!browser) {
      const executablePath = findChrome();
      if (!executablePath) {
        return { success: false, error: 'Chrome binary not found' };
      }
      browser = await puppeteer.launch({
        executablePath,
        headless: headless ? 'new' : false,
        args: [
          '--no-sandbox',
          '--disable-setuid-sandbox',
          '--disable-dev-shm-usage',
          '--disable-gpu',
          `--window-size=${width},${height}`,
          ...(checkSsl ? [] : ['--ignore-certificate-errors']),
        ],
        defaultViewport: { width, height },
      });
      page = await browser.newPage();
      // Navigate to URL (only if we launched fresh browser)
      // NOTE(review): userAgent is only applied on this fallback path, never
      // to an attached session page — confirm intent.
      if (userAgent) {
        await page.setUserAgent(userAgent);
      }
      await page.goto(url, {
        waitUntil: 'networkidle2',
        timeout,
      });
    }
    // Take screenshot
    await page.screenshot({
      path: outputPath,
      fullPage: true,
    });
    if (fs.existsSync(outputPath) && fs.statSync(outputPath).size > 0) {
      return { success: true, output: outputPath };
    } else {
      return { success: false, error: 'Screenshot file not created' };
    }
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Only close browser if we launched it (not if we connected to session)
    if (browser && !connectedToSession) {
      await browser.close();
    }
  }
}
/**
 * CLI entrypoint: parse --url/--snapshot-id, permanently skip when the
 * staticfile extractor already handled the URL, otherwise run
 * takeScreenshot() and emit the START_TS/END_TS/STATUS/RESULT_JSON
 * key=value contract on stdout.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__21_screenshot.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  try {
    // Check if staticfile extractor already handled this (permanent skip)
    if (hasStaticFileOutput()) {
      console.log(`Skipping screenshot - staticfile extractor already downloaded this`);
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${new Date().toISOString()}`);
      console.log(`STATUS=skipped`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status: 'skipped', url, snapshot_id: snapshotId})}`);
      process.exit(0); // Permanent skip - staticfile already handled
    } else {
      const result = await takeScreenshot(url);
      if (result.success) {
        status = 'succeeded';
        output = result.output;
        const size = fs.statSync(output).size;
        console.log(`Screenshot saved (${size} bytes)`);
      } else {
        status = 'failed';
        error = result.error;
      }
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  // Exit code signals success/failure to the hook runner.
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Top-level entry: surface any unhandled rejection as a fatal non-zero exit.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,24 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"RIPGREP_BINARY": {
"type": "string",
"default": "rg",
"description": "Path to ripgrep binary"
},
"RIPGREP_IGNORE_EXTENSIONS": {
"type": "string",
"default": "css,js,orig,svg",
"description": "Comma-separated file extensions to ignore"
},
"SEARCH_BACKEND_TIMEOUT": {
"type": "integer",
"default": 90,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Search timeout in seconds"
}
}
}

View File

@@ -0,0 +1,80 @@
"""
Ripgrep search backend - searches files directly without indexing.
This backend doesn't maintain an index - it searches archived files directly
using ripgrep (rg). This is simpler but slower for large archives.
Environment variables:
RIPGREP_BINARY: Path to ripgrep binary (default: rg)
RIPGREP_IGNORE_EXTENSIONS: Comma-separated extensions to ignore (default: css,js,orig,svg)
SEARCH_BACKEND_TIMEOUT: Search timeout in seconds (default: 90)
"""
import os
import subprocess
import shutil
from pathlib import Path
from typing import List, Iterable
from django.conf import settings
# Config with old var names for backwards compatibility
RIPGREP_BINARY = os.environ.get('RIPGREP_BINARY', 'rg').strip()  # binary name or absolute path
RIPGREP_IGNORE_EXTENSIONS = os.environ.get('RIPGREP_IGNORE_EXTENSIONS', 'css,js,orig,svg').strip()  # comma-separated extensions to skip
SEARCH_BACKEND_TIMEOUT = int(os.environ.get('SEARCH_BACKEND_TIMEOUT', '90'))  # seconds
def search(query: str) -> List[str]:
    """Search archived files for ``query`` using ripgrep.

    Returns a list of snapshot IDs (the top-level directory names under
    ARCHIVE_DIR) whose files contain a regex match. Best-effort: a search
    timeout or a failure to launch the subprocess yields an empty result
    rather than raising.

    Raises:
        RuntimeError: if the ripgrep binary cannot be located.
    """
    rg_binary = shutil.which(RIPGREP_BINARY) or RIPGREP_BINARY
    if not rg_binary or not Path(rg_binary).exists():
        raise RuntimeError(f'ripgrep binary not found ({RIPGREP_BINARY}). Install with: apt install ripgrep')
    archive_dir = Path(settings.ARCHIVE_DIR)
    if not archive_dir.exists():
        return []
    # Build ignore pattern from config, e.g. '*.{css,js,orig,svg}'
    ignore_pattern = f'*.{{{RIPGREP_IGNORE_EXTENSIONS}}}'
    cmd = [
        rg_binary,
        f'--type-add=ignore:{ignore_pattern}',
        '--type-not=ignore',
        '--files-with-matches',
        '--no-messages',
        '--ignore-case',
        '--regexp',
        query,
        str(archive_dir),
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=SEARCH_BACKEND_TIMEOUT)
    except subprocess.TimeoutExpired:
        return []
    except OSError:
        # e.g. binary disappeared between the existence check and execution;
        # previously a blanket `except Exception` hid programming errors too.
        return []
    # Extract snapshot IDs from file paths
    # Paths look like: archive/<snapshot_id>/<extractor>/file.txt
    snapshot_ids = set()
    for line in result.stdout.splitlines():
        if not line:
            continue
        try:
            relative = Path(line).relative_to(archive_dir)
            snapshot_ids.add(relative.parts[0])
        except (ValueError, IndexError):
            # Line outside archive_dir or an empty relative path — skip it.
            continue
    return list(snapshot_ids)
def flush(snapshot_ids: Iterable[str]) -> None:
    """No-op for ripgrep - it searches files directly."""
    # This backend keeps no index, so there is nothing to invalidate.
    pass

View File

@@ -0,0 +1,37 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SEARCH_BACKEND_HOST_NAME": {
"type": "string",
"default": "127.0.0.1",
"x-aliases": ["SONIC_HOST"],
"description": "Sonic server hostname"
},
"SEARCH_BACKEND_PORT": {
"type": "integer",
"default": 1491,
"minimum": 1,
"maximum": 65535,
"x-aliases": ["SONIC_PORT"],
"description": "Sonic server port"
},
"SEARCH_BACKEND_PASSWORD": {
"type": "string",
"default": "SecretPassword",
"x-aliases": ["SONIC_PASSWORD"],
"description": "Sonic server password"
},
"SONIC_COLLECTION": {
"type": "string",
"default": "archivebox",
"description": "Sonic collection name"
},
"SONIC_BUCKET": {
"type": "string",
"default": "snapshots",
"description": "Sonic bucket name"
}
}
}

View File

@@ -0,0 +1,225 @@
#!/usr/bin/env python3
"""
Sonic search backend - indexes snapshot content in Sonic server.
This hook runs after all extractors and indexes text content in Sonic.
Only runs if SEARCH_BACKEND_ENGINE=sonic.
Usage: on_Snapshot__91_index_sonic.py --url=<url> --snapshot-id=<uuid>
Environment variables:
SEARCH_BACKEND_ENGINE: Must be 'sonic' for this hook to run
USE_INDEXING_BACKEND: Enable search indexing (default: true)
SEARCH_BACKEND_HOST_NAME: Sonic server host (default: 127.0.0.1)
SEARCH_BACKEND_PORT: Sonic server port (default: 1491)
SEARCH_BACKEND_PASSWORD: Sonic server password (default: SecretPassword)
SONIC_COLLECTION: Collection name (default: archivebox)
SONIC_BUCKET: Bucket name (default: snapshots)
"""
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'index_sonic'
OUTPUT_DIR = 'search_index'
# Text file patterns to index
INDEXABLE_FILES = [
('readability', 'content.txt'),
('readability', 'content.html'),
('mercury', 'content.txt'),
('mercury', 'content.html'),
('htmltotext', 'output.txt'),
('singlefile', 'singlefile.html'),
('dom', 'output.html'),
('wget', '**/*.html'),
('wget', '**/*.htm'),
('title', 'title.txt'),
]
def get_env(name: str, default: str = '') -> str:
    """Return the environment variable `name` (or `default` when unset), stripped of surrounding whitespace."""
    raw_value = os.environ.get(name, default)
    return raw_value.strip()
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean environment variable; unrecognized values fall back to `default`."""
    normalized = get_env(name, '').lower()
    if normalized in ('true', '1', 'yes', 'on'):
        return True
    return False if normalized in ('false', '0', 'no', 'off') else default
def get_env_int(name: str, default: int = 0) -> int:
    """Parse an integer environment variable, falling back to `default` on invalid input."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
def strip_html_tags(html: str) -> str:
    """Remove HTML tags and decode common entities, keeping readable text.

    Scripts and styles are dropped entirely; remaining tags are replaced
    with spaces and runs of whitespace are collapsed to single spaces.
    """
    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r'<[^>]+>', ' ', html)
    # Decode entities; '&amp;' must be decoded LAST so that double-escaped
    # sequences like '&amp;lt;' become '&lt;' instead of being decoded twice
    # down to '<' (the previous ordering had this bug).
    html = html.replace('&nbsp;', ' ')
    html = html.replace('&lt;', '<').replace('&gt;', '>')
    html = html.replace('&quot;', '"')
    html = html.replace('&amp;', '&')
    html = re.sub(r'\s+', ' ', html)
    return html.strip()
def find_indexable_content() -> list[tuple[str, str]]:
    """Collect (source-label, text) pairs from extractor output files.

    Walks INDEXABLE_FILES in priority order relative to the current working
    directory, reading each existing non-empty file; HTML files are reduced
    to plain text via strip_html_tags() first.
    """
    found: list[tuple[str, str]] = []
    base_dir = Path.cwd()
    for extractor, pattern in INDEXABLE_FILES:
        source_dir = base_dir / extractor
        if not source_dir.exists():
            continue
        if '*' in pattern:
            candidates = list(source_dir.glob(pattern))
        else:
            candidate = source_dir / pattern
            candidates = [candidate] if candidate.exists() else []
        for candidate in candidates:
            if not (candidate.is_file() and candidate.stat().st_size > 0):
                continue
            try:
                text = candidate.read_text(encoding='utf-8', errors='ignore')
            except Exception:
                # Unreadable file: skip rather than fail the whole pass.
                continue
            if not text.strip():
                continue
            if candidate.suffix in ('.html', '.htm'):
                text = strip_html_tags(text)
            found.append((f'{extractor}/{candidate.name}', text))
    return found
def get_sonic_config() -> dict:
    """Assemble the Sonic connection settings from environment variables."""
    config: dict = {}
    config['host'] = get_env('SEARCH_BACKEND_HOST_NAME', '127.0.0.1')
    config['port'] = get_env_int('SEARCH_BACKEND_PORT', 1491)
    config['password'] = get_env('SEARCH_BACKEND_PASSWORD', 'SecretPassword')
    config['collection'] = get_env('SONIC_COLLECTION', 'archivebox')
    config['bucket'] = get_env('SONIC_BUCKET', 'snapshots')
    return config
def index_in_sonic(snapshot_id: str, texts: list[str]) -> None:
    """Push snapshot text into the Sonic ingest bucket, replacing any prior entry.

    Raises RuntimeError when the sonic-client package is not installed.
    """
    try:
        from sonic import IngestClient
    except ImportError:
        raise RuntimeError('sonic-client not installed. Run: pip install sonic-client')
    cfg = get_sonic_config()
    with IngestClient(cfg['host'], cfg['port'], cfg['password']) as ingest:
        # Best-effort flush of any previously indexed content for this snapshot.
        try:
            ingest.flush_object(cfg['collection'], cfg['bucket'], snapshot_id)
        except Exception:
            pass
        # Sonic rejects oversized pushes, so send the text in fixed-size chunks.
        full_text = ' '.join(texts)
        chunk_size = 10000
        offset = 0
        while offset < len(full_text):
            ingest.push(cfg['collection'], cfg['bucket'], snapshot_id, full_text[offset:offset + chunk_size])
            offset += chunk_size
@click.command()
@click.option('--url', required=True, help='URL that was archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Index snapshot content in Sonic.

    Collects text produced by earlier extractors in the current snapshot
    directory and pushes it to the Sonic server. Progress and the final
    outcome are reported as START_TS/END_TS/STATUS/RESULT_JSON lines on
    stdout (parsed by the hook runner). Exits 0 on success or skip, 1 on
    failure.
    """
    start_ts = datetime.now(timezone.utc)
    output = None
    status = 'failed'
    error = ''
    indexed_sources = []
    try:
        # Check if this backend is enabled (permanent skips - don't retry).
        # Note: sys.exit raises SystemExit, which is NOT caught by the
        # `except Exception` below, so the skip paths exit cleanly.
        backend = get_env('SEARCH_BACKEND_ENGINE', 'sqlite')
        if backend != 'sonic':
            print(f'Skipping Sonic indexing (SEARCH_BACKEND_ENGINE={backend})')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0) # Permanent skip - different backend selected
        if not get_env_bool('USE_INDEXING_BACKEND', True):
            print('Skipping indexing (USE_INDEXING_BACKEND=False)')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0) # Permanent skip - indexing disabled
        else:
            contents = find_indexable_content()
            indexed_sources = [source for source, _ in contents]
            if not contents:
                # No extractor output yet: treated as a skip, not a failure.
                status = 'skipped'
                print('No indexable content found')
            else:
                texts = [content for _, content in contents]
                index_in_sonic(snapshot_id, texts)
                status = 'succeeded'
                output = OUTPUT_DIR
                print(f'Sonic indexed {len(texts)} documents')
                print(f'Sources: {", ".join(indexed_sources)}')
    except Exception as e:
        # Any unexpected error (connection refused, bad config, ...) marks the run failed.
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    # Structured footer parsed by the hook runner.
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'indexed_sources': indexed_sources,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,50 @@
"""
Sonic search backend - search and flush operations.
This module provides the search interface for the Sonic backend.
"""
import os
from typing import List, Iterable
def get_sonic_config() -> dict:
    """Build the Sonic connection settings dict from environment variables."""
    env = os.environ
    return {
        'host': env.get('SEARCH_BACKEND_HOST_NAME', '127.0.0.1').strip(),
        'port': int(env.get('SEARCH_BACKEND_PORT', '1491')),
        'password': env.get('SEARCH_BACKEND_PASSWORD', 'SecretPassword').strip(),
        'collection': env.get('SONIC_COLLECTION', 'archivebox').strip(),
        'bucket': env.get('SONIC_BUCKET', 'snapshots').strip(),
    }
def search(query: str) -> List[str]:
    """Query Sonic for snapshot ids whose content matches `query` (up to 100 results).

    Raises RuntimeError when the sonic-client package is not installed.
    """
    try:
        from sonic import SearchClient
    except ImportError:
        raise RuntimeError('sonic-client not installed. Run: pip install sonic-client')
    cfg = get_sonic_config()
    with SearchClient(cfg['host'], cfg['port'], cfg['password']) as client:
        return client.query(cfg['collection'], cfg['bucket'], query, limit=100)
def flush(snapshot_ids: Iterable[str]) -> None:
    """Best-effort removal of each given snapshot from the Sonic index."""
    try:
        from sonic import IngestClient
    except ImportError:
        raise RuntimeError('sonic-client not installed. Run: pip install sonic-client')
    cfg = get_sonic_config()
    with IngestClient(cfg['host'], cfg['port'], cfg['password']) as ingest:
        for sid in snapshot_ids:
            # Ignore per-snapshot errors so one failure doesn't stop the rest.
            try:
                ingest.flush_object(cfg['collection'], cfg['bucket'], sid)
            except Exception:
                pass

View File

@@ -0,0 +1,24 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SQLITEFTS_DB": {
"type": "string",
"default": "search.sqlite3",
"description": "SQLite FTS database filename"
},
"FTS_SEPARATE_DATABASE": {
"type": "boolean",
"default": true,
"x-aliases": ["SQLITEFTS_SEPARATE_DATABASE"],
"description": "Use separate database file for FTS index"
},
"FTS_TOKENIZERS": {
"type": "string",
"default": "porter unicode61 remove_diacritics 2",
"x-aliases": ["SQLITEFTS_TOKENIZERS"],
"description": "FTS5 tokenizer configuration"
}
}
}

View File

@@ -0,0 +1,215 @@
#!/usr/bin/env python3
"""
SQLite FTS5 search backend - indexes snapshot content for full-text search.
This hook runs after all extractors and indexes text content in SQLite FTS5.
Only runs if SEARCH_BACKEND_ENGINE=sqlite.
Usage: on_Snapshot__90_index_sqlite.py --url=<url> --snapshot-id=<uuid>
Environment variables:
SEARCH_BACKEND_ENGINE: Must be 'sqlite' for this hook to run
USE_INDEXING_BACKEND: Enable search indexing (default: true)
SQLITEFTS_DB: Database filename (default: search.sqlite3)
FTS_TOKENIZERS: FTS5 tokenizer config (default: porter unicode61 remove_diacritics 2)
"""
import json
import os
import re
import sqlite3
import sys
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'index_sqlite'
OUTPUT_DIR = 'search_index'
# Text file patterns to index, in priority order
INDEXABLE_FILES = [
('readability', 'content.txt'),
('readability', 'content.html'),
('mercury', 'content.txt'),
('mercury', 'content.html'),
('htmltotext', 'output.txt'),
('singlefile', 'singlefile.html'),
('dom', 'output.html'),
('wget', '**/*.html'),
('wget', '**/*.htm'),
('title', 'title.txt'),
]
def get_env(name: str, default: str = '') -> str:
    """Fetch an environment variable (or `default`) with surrounding whitespace stripped."""
    value = os.environ.get(name, default)
    return value.strip()
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean env var ('true/1/yes/on' or 'false/0/no/off'); otherwise `default`."""
    lookup = {
        'true': True, '1': True, 'yes': True, 'on': True,
        'false': False, '0': False, 'no': False, 'off': False,
    }
    return lookup.get(get_env(name, '').lower(), default)
def strip_html_tags(html: str) -> str:
    """Remove HTML tags and decode common entities, keeping readable text.

    Scripts and styles are dropped entirely; remaining tags are replaced
    with spaces and runs of whitespace are collapsed to single spaces.
    """
    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r'<[^>]+>', ' ', html)
    # Decode entities; '&amp;' must be decoded LAST so that double-escaped
    # sequences like '&amp;lt;' become '&lt;' instead of being decoded twice
    # down to '<' (the previous ordering had this bug).
    html = html.replace('&nbsp;', ' ')
    html = html.replace('&lt;', '<').replace('&gt;', '>')
    html = html.replace('&quot;', '"')
    html = html.replace('&amp;', '&')
    html = re.sub(r'\s+', ' ', html)
    return html.strip()
def find_indexable_content() -> list[tuple[str, str]]:
    """Collect (source-label, text) pairs from extractor output files.

    Walks INDEXABLE_FILES in priority order relative to the current working
    directory, reading each existing non-empty file; HTML files are reduced
    to plain text via strip_html_tags() first.
    """
    found: list[tuple[str, str]] = []
    base_dir = Path.cwd()
    for extractor, pattern in INDEXABLE_FILES:
        source_dir = base_dir / extractor
        if not source_dir.exists():
            continue
        if '*' in pattern:
            candidates = list(source_dir.glob(pattern))
        else:
            candidate = source_dir / pattern
            candidates = [candidate] if candidate.exists() else []
        for candidate in candidates:
            if not (candidate.is_file() and candidate.stat().st_size > 0):
                continue
            try:
                text = candidate.read_text(encoding='utf-8', errors='ignore')
            except Exception:
                # Unreadable file: skip rather than fail the whole pass.
                continue
            if not text.strip():
                continue
            if candidate.suffix in ('.html', '.htm'):
                text = strip_html_tags(text)
            found.append((f'{extractor}/{candidate.name}', text))
    return found
def get_db_path() -> Path:
    """Resolve the absolute path of the FTS search index database.

    DATA_DIR defaults to two levels above the snapshot working directory.
    """
    data_dir = Path(get_env('DATA_DIR', str(Path.cwd().parent.parent)))
    return data_dir / get_env('SQLITEFTS_DB', 'search.sqlite3')
def index_in_sqlite(snapshot_id: str, texts: list[str]) -> None:
    """Replace the FTS5 index entry for `snapshot_id` with the joined `texts`.

    Creates the FTS5 virtual table on first use.

    Raises:
        ValueError: if FTS_TOKENIZERS contains characters outside the safe set.
    """
    db_path = get_db_path()
    tokenizers = get_env('FTS_TOKENIZERS', 'porter unicode61 remove_diacritics 2')
    # The tokenizer string must be interpolated into the CREATE statement
    # (FTS5 options cannot be bound as parameters), so restrict it to a safe
    # character set to prevent SQL injection via the environment.
    if not re.fullmatch(r'[A-Za-z0-9_ ]+', tokenizers):
        raise ValueError(f'Invalid FTS_TOKENIZERS value: {tokenizers!r}')
    conn = sqlite3.connect(str(db_path))
    try:
        # Create FTS5 table if needed
        conn.execute(f'''
            CREATE VIRTUAL TABLE IF NOT EXISTS search_index
            USING fts5(snapshot_id, content, tokenize='{tokenizers}')
        ''')
        # Delete-then-insert keeps exactly one row per snapshot.
        conn.execute('DELETE FROM search_index WHERE snapshot_id = ?', (snapshot_id,))
        content = '\n\n'.join(texts)
        conn.execute(
            'INSERT INTO search_index (snapshot_id, content) VALUES (?, ?)',
            (snapshot_id, content)
        )
        conn.commit()
    finally:
        conn.close()
@click.command()
@click.option('--url', required=True, help='URL that was archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Index snapshot content in SQLite FTS5.

    Collects text produced by earlier extractors in the current snapshot
    directory and stores it in the FTS5 database. Progress and the final
    outcome are reported as START_TS/END_TS/STATUS/RESULT_JSON lines on
    stdout (parsed by the hook runner). Exits 0 on success or skip, 1 on
    failure.
    """
    start_ts = datetime.now(timezone.utc)
    output = None
    status = 'failed'
    error = ''
    indexed_sources = []
    try:
        # Check if this backend is enabled (permanent skips - don't retry).
        # Note: sys.exit raises SystemExit, which is NOT caught by the
        # `except Exception` below, so the skip paths exit cleanly.
        backend = get_env('SEARCH_BACKEND_ENGINE', 'sqlite')
        if backend != 'sqlite':
            print(f'Skipping SQLite indexing (SEARCH_BACKEND_ENGINE={backend})')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0) # Permanent skip - different backend selected
        if not get_env_bool('USE_INDEXING_BACKEND', True):
            print('Skipping indexing (USE_INDEXING_BACKEND=False)')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0) # Permanent skip - indexing disabled
        else:
            contents = find_indexable_content()
            indexed_sources = [source for source, _ in contents]
            if not contents:
                # No extractor output yet: treated as a skip, not a failure.
                status = 'skipped'
                print('No indexable content found')
            else:
                texts = [content for _, content in contents]
                index_in_sqlite(snapshot_id, texts)
                status = 'succeeded'
                output = OUTPUT_DIR
                print(f'SQLite FTS indexed {len(texts)} documents')
                print(f'Sources: {", ".join(indexed_sources)}')
    except Exception as e:
        # Any unexpected error (locked database, bad config, ...) marks the run failed.
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    # Structured footer parsed by the hook runner.
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'indexed_sources': indexed_sources,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,65 @@
"""
SQLite FTS5 search backend - search and flush operations.
This module provides the search interface for the SQLite FTS backend.
Environment variables:
SQLITEFTS_DB: Database filename (default: search.sqlite3)
FTS_SEPARATE_DATABASE: Use separate database file (default: true)
FTS_TOKENIZERS: FTS5 tokenizer config (default: porter unicode61 remove_diacritics 2)
"""
import os
import sqlite3
from pathlib import Path
from typing import List, Iterable
from django.conf import settings
# Config with old var names for backwards compatibility
SQLITEFTS_DB = os.environ.get('SQLITEFTS_DB', 'search.sqlite3').strip()
FTS_SEPARATE_DATABASE = os.environ.get('FTS_SEPARATE_DATABASE', 'true').lower() in ('true', '1', 'yes')
FTS_TOKENIZERS = os.environ.get('FTS_TOKENIZERS', 'porter unicode61 remove_diacritics 2').strip()
def get_db_path() -> Path:
    """Return the absolute path of the FTS index database.

    Combines Django's DATA_DIR setting with the SQLITEFTS_DB filename.
    """
    return Path(settings.DATA_DIR) / SQLITEFTS_DB
def search(query: str) -> List[str]:
    """Return snapshot ids whose indexed content matches the FTS5 `query`.

    Returns an empty list when the database or the FTS table does not exist yet.
    """
    db_path = get_db_path()
    if not db_path.exists():
        return []
    conn = sqlite3.connect(str(db_path))
    try:
        rows = conn.execute(
            'SELECT DISTINCT snapshot_id FROM search_index WHERE search_index MATCH ?',
            (query,)
        ).fetchall()
    except sqlite3.OperationalError:
        # FTS table has not been created yet (no content indexed).
        return []
    finally:
        conn.close()
    return [snapshot_id for (snapshot_id,) in rows]
def flush(snapshot_ids: Iterable[str]) -> None:
    """Delete the given snapshots from the FTS index (no-op if the index is absent)."""
    db_path = get_db_path()
    if not db_path.exists():
        return
    conn = sqlite3.connect(str(db_path))
    try:
        try:
            for sid in snapshot_ids:
                conn.execute('DELETE FROM search_index WHERE snapshot_id = ?', (sid,))
            conn.commit()
        except sqlite3.OperationalError:
            # Table was never created; nothing to flush.
            pass
    finally:
        conn.close()

View File

@@ -0,0 +1,219 @@
#!/usr/bin/env node
/**
* Extract SEO metadata from a URL.
*
* Extracts all <meta> tags including:
* - og:* (Open Graph)
* - twitter:*
* - description, keywords, author
* - Any other meta tags
*
* Usage: on_Snapshot__17_seo.js --url=<url> --snapshot-id=<uuid>
* Output: Writes seo/seo.json
*
* Environment variables:
* SAVE_SEO: Enable SEO extraction (default: true)
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'seo';
const OUTPUT_DIR = 'seo';
const OUTPUT_FILE = 'seo.json';
const CHROME_SESSION_DIR = 'chrome_session';
// Parse `--key=value` command line arguments into an object.
// Dashes in key names become underscores; flags without a value are `true`.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...rest] = token.slice(2).split('=');
    parsed[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return parsed;
}
// Read an environment variable, trimmed; `defaultValue` when unset or empty.
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}
// Interpret an env var as a boolean; unrecognized values yield `defaultValue`.
function getEnvBool(name, defaultValue = false) {
  switch (getEnv(name, '').toLowerCase()) {
    case 'true': case '1': case 'yes': case 'on':
      return true;
    case 'false': case '0': case 'no': case 'off':
      return false;
    default:
      return defaultValue;
  }
}
// Read the CDP websocket URL written by the chrome_session extractor,
// or null when no session file exists.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(cdpFile)) {
    return null;
  }
  return fs.readFileSync(cdpFile, 'utf8').trim();
}
/**
 * Extract SEO metadata from the page already open in the shared Chrome session.
 *
 * Reads every <meta> tag (og:*, twitter:*, description, ...), the canonical
 * <link> and the <html lang> attribute, and writes them to seo/seo.json.
 *
 * NOTE(review): the `url` parameter is currently unused — whatever page is
 * open in the Chrome session is inspected instead; confirm this is intended.
 *
 * @param {string} url - URL being archived (unused, see note above)
 * @returns {Promise<{success: boolean, output?: string, seoData?: Object, error?: string}>}
 */
async function extractSeo(url) {
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  let browser = null;
  try {
    // Connect to existing Chrome session (started by the chrome_session extractor)
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
    }
    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });
    // Get the page: prefer the first http(s) tab, fall back to any open tab
    const pages = await browser.pages();
    const page = pages.find(p => p.url().startsWith('http')) || pages[0];
    if (!page) {
      return { success: false, error: 'No page found in Chrome session' };
    }
    // Extract all meta tags (this callback runs inside the page context)
    const seoData = await page.evaluate(() => {
      const metaTags = Array.from(document.querySelectorAll('meta'));
      const seo = {
        url: window.location.href,
        title: document.title || '',
      };
      // Process each meta tag
      metaTags.forEach(tag => {
        // Get the key (name or property attribute)
        const key = tag.getAttribute('name') || tag.getAttribute('property') || '';
        const content = tag.getAttribute('content') || '';
        if (key && content) {
          // Store by key; later duplicates overwrite earlier ones
          seo[key] = content;
        }
      });
      // Also get canonical URL if present
      const canonical = document.querySelector('link[rel="canonical"]');
      if (canonical) {
        seo.canonical = canonical.getAttribute('href');
      }
      // Get language
      const htmlLang = document.documentElement.lang;
      if (htmlLang) {
        seo.language = htmlLang;
      }
      return seo;
    });
    // Write output
    fs.writeFileSync(outputPath, JSON.stringify(seoData, null, 2));
    return { success: true, output: outputPath, seoData };
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Disconnect (not close) so the shared Chrome session stays alive for other extractors
    if (browser) {
      browser.disconnect();
    }
  }
}
/**
 * CLI entry point: parse args, run the SEO extractor, and report the outcome
 * via START_TS/END_TS/STATUS/RESULT_JSON lines on stdout (parsed by the hook
 * runner). Exits 0 on success or skip, 1 on failure.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__17_seo.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  try {
    // Check if enabled (permanent skip when SAVE_SEO is off)
    if (!getEnvBool('SAVE_SEO', true)) {
      console.log('Skipping SEO (SAVE_SEO=False)');
      status = 'skipped';
      const endTs = new Date();
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${endTs.toISOString()}`);
      console.log(`STATUS=${status}`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
      process.exit(0);
    }
    const result = await extractSeo(url);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      const metaCount = Object.keys(result.seoData).length - 2; // Subtract url and title
      console.log(`SEO metadata extracted: ${metaCount} meta tags`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    // Unexpected errors (connection failure, bad CDP URL, ...) mark the run failed
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results (structured footer parsed by the hook runner)
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Top-level launch; catches anything main() itself failed to handle.
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,53 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_SINGLEFILE": {
"type": "boolean",
"default": true,
"description": "Enable SingleFile archiving"
},
"SINGLEFILE_BINARY": {
"type": "string",
"default": "single-file",
"x-aliases": ["SINGLE_FILE_BINARY"],
"description": "Path to single-file binary"
},
"NODE_BINARY": {
"type": "string",
"default": "node",
"description": "Path to Node.js binary"
},
"SINGLEFILE_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 10,
"x-fallback": "TIMEOUT",
"description": "Timeout for SingleFile in seconds"
},
"SINGLEFILE_USER_AGENT": {
"type": "string",
"default": "",
"x-fallback": "USER_AGENT",
"description": "User agent string"
},
"SINGLEFILE_COOKIES_FILE": {
"type": "string",
"default": "",
"x-fallback": "COOKIES_FILE",
"description": "Path to cookies file"
},
"SINGLEFILE_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [],
"description": "Default single-file arguments"
},
"SINGLEFILE_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for single-file"
}
}
}

View File

@@ -0,0 +1,129 @@
#!/usr/bin/env python3
"""
Validation hook for single-file binary.
Runs at crawl start to verify single-file (npm package) is available.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str) -> str | None:
"""Get version string from single-file binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=10,
)
if result.returncode == 0 and result.stdout:
return result.stdout.strip().split('\n')[0][:32]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
# For scripts, hash the script content
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_singlefile() -> dict | None:
    """Locate the single-file binary.

    Checks, in order: the SINGLEFILE_BINARY env var, $PATH (both common
    spellings), then well-known npm install locations. Returns a dict with
    name/abspath/version/sha256/binprovider, or None when nothing is found.
    """
    def describe(abspath: str, provider: str) -> dict:
        # Shared result shape for every discovery method.
        return {
            'name': 'single-file',
            'abspath': abspath,
            'version': get_binary_version(abspath),
            'sha256': get_binary_hash(abspath),
            'binprovider': provider,
        }

    env_path = os.environ.get('SINGLEFILE_BINARY', '')
    if env_path and Path(env_path).is_file():
        return describe(env_path, 'env')

    for candidate_name in ('single-file', 'singlefile'):
        resolved = shutil.which(candidate_name)
        if resolved:
            return describe(resolved, 'npm')

    well_known = [
        Path.home() / '.npm-global/bin/single-file',
        Path.home() / 'node_modules/.bin/single-file',
        Path('/usr/local/bin/single-file'),
        Path('/usr/local/lib/node_modules/.bin/single-file'),
    ]
    for candidate in well_known:
        if candidate.is_file():
            return describe(str(candidate), 'npm')
    return None
def main():
    """Report the discovered single-file binary as JSONL records on stdout.

    When found: emits an InstalledBinary record plus Machine config updates
    and exits 0. When missing: emits a Dependency record requesting an npm
    (or env-provided) install and exits 1.
    """
    result = find_singlefile()
    if result and result.get('abspath'):
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))
        # Persist the resolved path so later hooks don't need to re-discover it.
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/SINGLEFILE_BINARY',
            'value': result['abspath'],
        }))
        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/SINGLEFILE_VERSION',
                'value': result['version'],
            }))
        sys.exit(0)
    else:
        # Not found: ask the orchestrator to install it for us.
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'single-file',
            'bin_providers': 'npm,env',
        }))
        print(f"single-file binary not found", file=sys.stderr)
        sys.exit(1)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,270 @@
#!/usr/bin/env node
/**
* SingleFile Extension Plugin
*
* Installs and uses the SingleFile Chrome extension for archiving complete web pages.
* Falls back to single-file-cli if the extension is not available.
*
* Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle
*
* Priority: 04 (early) - Must install before Chrome session starts
* Hook: on_Snapshot
*
* This extension automatically:
* - Saves complete web pages as single HTML files
* - Inlines all resources (CSS, JS, images, fonts)
* - Preserves page fidelity better than wget/curl
* - Works with SPAs and dynamically loaded content
*/
const path = require('path');
const fs = require('fs');
const { promisify } = require('util');
const { exec } = require('child_process');
const execAsync = promisify(exec);
// Import extension utilities
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
// Extension metadata
const EXTENSION = {
webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
name: 'singlefile',
};
// Get extensions directory from environment or use default
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
const OUTPUT_DIR = 'singlefile';
const OUTPUT_FILE = 'singlefile.html';
/**
 * Download/install the SingleFile Chrome extension into EXTENSIONS_DIR.
 *
 * @returns {Promise<Object|null>} extension metadata, or null on failure
 */
async function installSinglefileExtension() {
  console.log('[*] Installing SingleFile extension...');
  const installed = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
  if (!installed) {
    console.error('[❌] Failed to install SingleFile extension');
    return null;
  }
  console.log('[+] SingleFile extension installed');
  console.log('[+] Web pages will be saved as single HTML files');
  return installed;
}
/**
 * Resolve after `ms` milliseconds.
 * @param {number} ms - delay in milliseconds
 * @returns {Promise<void>}
 */
function wait(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Save a page using the SingleFile extension.
 *
 * Triggers the extension's toolbar action on the foreground tab, then polls
 * CHROME_DOWNLOADS_DIR for a new .html file whose header references this
 * page's URL, and moves it to singlefile/singlefile.html.
 *
 * NOTE(review): matching relies on the extension writing a `url: <url>`
 * marker before the first `meta charset` in the saved HTML — confirm this
 * holds across SingleFile extension versions.
 *
 * @param {Object} page - Puppeteer page object
 * @param {Object} extension - Extension metadata with dispatchAction method
 * @param {Object} options - Additional options (currently unused)
 * @returns {Promise<string|null>} - Path to saved file or null on failure
 */
async function saveSinglefileWithExtension(page, extension, options = {}) {
  if (!extension || !extension.version) {
    throw new Error('SingleFile extension not found or not loaded');
  }
  const url = await page.url();
  // Check for unsupported URL schemes (nothing meaningful to save for these)
  const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
  const scheme = url.split(':')[0];
  if (URL_SCHEMES_IGNORED.includes(scheme)) {
    console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`);
    return null;
  }
  // Ensure downloads directory exists
  await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true });
  // Get list of existing files to ignore (so we only consider NEW downloads)
  const files_before = new Set(
    (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
      .filter(fn => fn.endsWith('.html'))
  );
  // Ensure output directory exists
  await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
  const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
  console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
  // Bring page to front (extension action button acts on foreground tab)
  await page.bringToFront();
  // Trigger the extension's action (toolbar button click)
  await extension.dispatchAction();
  // Wait for file to appear in downloads directory (poll up to ~30s total)
  const check_delay = 3000; // 3 seconds
  const max_tries = 10;
  let files_new = [];
  for (let attempt = 0; attempt < max_tries; attempt++) {
    await wait(check_delay);
    const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
      .filter(fn => fn.endsWith('.html'));
    files_new = files_after.filter(file => !files_before.has(file));
    if (files_new.length === 0) {
      continue;
    }
    // Find the matching file by checking if it contains the URL in the HTML header
    for (const file of files_new) {
      const dl_path = path.join(CHROME_DOWNLOADS_DIR, file);
      const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
      const dl_header = dl_text.split('meta charset')[0];
      if (dl_header.includes(`url: ${url}`)) {
        console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`);
        await fs.promises.rename(dl_path, out_path);
        return out_path;
      }
    }
  }
  console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`);
  console.warn(`[⚠️] New files found: ${files_new.join(', ')}`);
  return null;
}
/**
 * Save a page using single-file-cli (fallback method).
 *
 * Spawns the CLI directly via execFile (no shell), so URLs, paths and
 * user-agent strings containing spaces or shell metacharacters can neither
 * break the command line nor inject extra shell commands — the previous
 * `exec(cmd.join(' '))` approach had both problems.
 *
 * @param {string} url - URL to archive
 * @param {Object} options - { userAgent, cookiesFile, ignoreSSL, timeout }
 * @returns {Promise<string|null>} - Path to saved file or null on failure
 */
async function saveSinglefileWithCLI(url, options = {}) {
  // Local require keeps this fix self-contained.
  const { execFile } = require('child_process');
  const execFileAsync = promisify(execFile);
  console.log('[*] Falling back to single-file-cli...');
  // Find single-file binary on $PATH
  let binary = null;
  try {
    const { stdout } = await execFileAsync('which', ['single-file']);
    binary = stdout.trim();
  } catch (err) {
    console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli');
    return null;
  }
  // Ensure output directory exists
  await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
  const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
  // Build the argument list (flags first, then positional url/output)
  const cliArgs = ['--browser-headless'];
  if (options.userAgent) {
    cliArgs.push('--browser-user-agent', options.userAgent);
  }
  if (options.cookiesFile && fs.existsSync(options.cookiesFile)) {
    cliArgs.push('--browser-cookies-file', options.cookiesFile);
  }
  if (options.ignoreSSL) {
    cliArgs.push('--browser-ignore-insecure-certs');
  }
  cliArgs.push(url, out_path);
  // Execute without a shell; the child is killed after `timeout` ms
  try {
    const timeout = options.timeout || 120000;
    await execFileAsync(binary, cliArgs, { timeout });
    if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) {
      console.log(`[+] SingleFile saved via CLI: ${out_path}`);
      return out_path;
    }
    console.error('[❌] SingleFile CLI completed but no output file found');
    return null;
  } catch (err) {
    console.error(`[❌] SingleFile CLI error: ${err.message}`);
    return null;
  }
}
/**
 * Main entry point - install extension before archiving.
 *
 * Uses a JSON cache file in EXTENSIONS_DIR to skip re-downloading when the
 * previously unpacked extension still has its manifest; otherwise installs
 * the extension and writes the metadata cache for chrome_session to load.
 *
 * @returns {Promise<Object|null>} extension metadata, or null on failure
 */
async function main() {
  // Check if extension is already cached
  const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json');
  if (fs.existsSync(cacheFile)) {
    try {
      const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
      const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
      if (fs.existsSync(manifestPath)) {
        console.log('[*] SingleFile extension already installed (using cache)');
        return cached;
      }
    } catch (e) {
      // Cache file corrupted, re-install
      console.warn('[⚠️] Extension cache corrupted, re-installing...');
    }
  }
  // Install extension
  const extension = await installSinglefileExtension();
  // Export extension metadata for chrome_session to load
  if (extension) {
    // Write extension info to a cache file that chrome_session can read
    await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
    await fs.promises.writeFile(
      cacheFile,
      JSON.stringify(extension, null, 2)
    );
    console.log(`[+] Extension metadata written to ${cacheFile}`);
  }
  return extension;
}
// Export functions for use by other plugins
// (chrome_session consumes EXTENSION metadata; other hooks may reuse the save helpers)
module.exports = {
  EXTENSION,
  installSinglefileExtension,
  saveSinglefileWithExtension,
  saveSinglefileWithCLI,
};
// Run if executed directly
// As a CLI this performs only the install/caching step (main); the actual
// archiving happens later through the exported save helpers.
if (require.main === module) {
  main().then(() => {
    console.log('[✓] SingleFile extension setup complete');
    process.exit(0);
  }).catch(err => {
    console.error('[❌] SingleFile extension setup failed:', err);
    process.exit(1);
  });
}

View File

@@ -0,0 +1,328 @@
#!/usr/bin/env python3
"""
Archive a URL using SingleFile.
Usage: on_Snapshot__singlefile.py --url=<url> --snapshot-id=<uuid>
Output: Writes singlefile.html to $PWD
Environment variables:
SINGLEFILE_BINARY: Path to SingleFile binary
SINGLEFILE_TIMEOUT: Timeout in seconds (default: 120)
SINGLEFILE_USER_AGENT: User agent string (optional)
SINGLEFILE_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
SINGLEFILE_COOKIES_FILE: Path to cookies file (optional)
SINGLEFILE_EXTRA_ARGS: Extra arguments for SingleFile (space-separated)
# Feature toggle
SAVE_SINGLEFILE: Enable SingleFile archiving (default: True)
# Chrome binary (SingleFile needs Chrome)
CHROME_BINARY: Path to Chrome/Chromium binary
# Fallback to ARCHIVING_CONFIG values if SINGLEFILE_* not set:
TIMEOUT: Fallback timeout
USER_AGENT: Fallback user agent
CHECK_SSL_VALIDITY: Fallback SSL check
COOKIES_FILE: Fallback cookies file
"""
import json
import os
import shutil
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'singlefile'    # name reported in RESULT_JSON
BIN_NAME = 'single-file'         # binary name advertised via DEPENDENCY_NEEDED
BIN_PROVIDERS = 'npm,env'        # acceptable sources for the binary
OUTPUT_DIR = 'singlefile'        # output directory, relative to cwd
OUTPUT_FILE = 'singlefile.html'  # archived page filename inside OUTPUT_DIR
def get_env(name: str, default: str = '') -> str:
    """Read an environment variable, stripping surrounding whitespace."""
    raw = os.environ.get(name, default)
    return raw.strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Interpret an environment variable as a boolean flag.

    Recognises true/1/yes/on and false/0/no/off (case-insensitive);
    anything else (including unset) falls back to ``default``.
    """
    flag = get_env(name, '').lower()
    if flag in ('true', '1', 'yes', 'on'):
        return True
    if flag in ('false', '0', 'no', 'off'):
        return False
    return default


def get_env_int(name: str, default: int = 0) -> int:
    """Interpret an environment variable as an int, returning ``default`` on junk."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
# Directory written by the staticfile extractor; its presence (non-empty)
# means this URL was a direct file download rather than a web page.
STATICFILE_DIR = 'staticfile'


def has_staticfile_output() -> bool:
    """Return True if the staticfile extractor already produced output here."""
    candidate = Path(STATICFILE_DIR)
    if not candidate.exists():
        return False
    return next(candidate.iterdir(), None) is not None
# Chrome binary search paths
# Candidate executable names/paths probed by find_chrome(), in priority order.
CHROMIUM_BINARY_NAMES_LINUX = [
    'chromium', 'chromium-browser', 'chromium-browser-beta',
    'chromium-browser-unstable', 'chromium-browser-canary', 'chromium-browser-dev',
]
CHROME_BINARY_NAMES_LINUX = [
    'google-chrome', 'google-chrome-stable', 'google-chrome-beta',
    'google-chrome-canary', 'google-chrome-unstable', 'google-chrome-dev', 'chrome',
]
# macOS entries are absolute paths (checked with isfile rather than $PATH lookup)
CHROME_BINARY_NAMES_MACOS = [
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
]
CHROMIUM_BINARY_NAMES_MACOS = ['/Applications/Chromium.app/Contents/MacOS/Chromium']
# Combined probe list; Chrome is preferred over Chromium
ALL_CHROME_BINARIES = (
    CHROME_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_LINUX +
    CHROME_BINARY_NAMES_MACOS + CHROMIUM_BINARY_NAMES_MACOS
)
def find_singlefile() -> str | None:
    """Locate the SingleFile binary.

    Prefers an explicit SINGLEFILE_BINARY env path (must be an existing file),
    then falls back to searching $PATH for `single-file` / `singlefile`.
    Returns None when nothing is found.
    """
    configured = get_env('SINGLEFILE_BINARY')
    if configured and os.path.isfile(configured):
        return configured
    for candidate in ('single-file', 'singlefile'):
        located = shutil.which(candidate)
        if located:
            return located
    return None
def find_chrome() -> str | None:
    """Locate a Chrome/Chromium binary.

    Prefers an explicit CHROME_BINARY env path, then probes ALL_CHROME_BINARIES:
    entries containing '/' (macOS app bundles) are checked directly as files,
    bare names are resolved via $PATH. Returns None when nothing is found.
    """
    configured = get_env('CHROME_BINARY')
    if configured and os.path.isfile(configured):
        return configured
    for candidate in ALL_CHROME_BINARIES:
        if '/' in candidate:
            if os.path.isfile(candidate):
                return candidate
            continue
        located = shutil.which(candidate)
        if located:
            return located
    return None
def get_version(binary: str) -> str:
    """Return the first 64 chars of `<binary> --version` stdout, or '' on any failure."""
    try:
        proc = subprocess.run(
            [binary, '--version'],
            capture_output=True,
            text=True,
            timeout=10,
        )
        return proc.stdout.strip()[:64]
    except Exception:
        # Missing binary, timeout, permission error, etc. — version is best-effort
        return ''
CHROME_SESSION_DIR = 'chrome_session'
def get_cdp_url() -> str | None:
"""Get CDP URL from chrome_session if available."""
cdp_file = Path(CHROME_SESSION_DIR) / 'cdp_url.txt'
if cdp_file.exists():
return cdp_file.read_text().strip()
return None
def get_port_from_cdp_url(cdp_url: str) -> str | None:
"""Extract port from CDP WebSocket URL (ws://127.0.0.1:PORT/...)."""
import re
match = re.search(r':(\d+)/', cdp_url)
if match:
return match.group(1)
return None
def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Archive URL using SingleFile.

    If a Chrome session exists (from chrome_session extractor), connects to it
    via CDP. Otherwise launches a new Chrome instance (via the located binary).

    Args:
        url: The URL to archive.
        binary: Path to the single-file executable.

    Returns: (success, output_path, error_message)
    """
    # Get config from env (with SINGLEFILE_ prefix or fallback to ARCHIVING_CONFIG style)
    timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120)
    user_agent = get_env('SINGLEFILE_USER_AGENT') or get_env('USER_AGENT', '')
    check_ssl = get_env_bool('SINGLEFILE_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
    cookies_file = get_env('SINGLEFILE_COOKIES_FILE') or get_env('COOKIES_FILE', '')
    extra_args = get_env('SINGLEFILE_EXTRA_ARGS', '')
    chrome = find_chrome()
    cmd = [binary]
    # Try to use existing Chrome session via CDP
    cdp_url = get_cdp_url()
    if cdp_url:
        # SingleFile can connect to existing browser via WebSocket
        # Extract port from CDP URL (ws://127.0.0.1:PORT/...)
        port = get_port_from_cdp_url(cdp_url)
        if port:
            cmd.extend(['--browser-server', f'http://127.0.0.1:{port}'])
    elif chrome:
        cmd.extend(['--browser-executable-path', chrome])
    # Common options
    cmd.extend([
        '--browser-headless',
    ])
    # SSL handling
    if not check_ssl:
        cmd.append('--browser-ignore-insecure-certs')
    if user_agent:
        cmd.extend(['--browser-user-agent', user_agent])
    if cookies_file and Path(cookies_file).is_file():
        cmd.extend(['--browser-cookies-file', cookies_file])
    if extra_args:
        # NOTE(review): naive whitespace split — extra args containing quoted
        # spaces will be mangled; confirm callers only pass simple flags.
        cmd.extend(extra_args.split())
    # Create output directory
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(exist_ok=True)
    output_path = output_dir / OUTPUT_FILE
    cmd.extend([url, str(output_path)])
    try:
        result = subprocess.run(cmd, capture_output=True, timeout=timeout)
        # Success is judged by the output file, not the exit code
        if output_path.exists() and output_path.stat().st_size > 0:
            return True, str(output_path), ''
        else:
            # Map common Chrome network errors to short human-readable messages
            stderr = result.stderr.decode('utf-8', errors='replace')
            if 'ERR_NAME_NOT_RESOLVED' in stderr:
                return False, None, 'DNS resolution failed'
            if 'ERR_CONNECTION_REFUSED' in stderr:
                return False, None, 'Connection refused'
            return False, None, f'SingleFile failed: {stderr[:200]}'
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='URL to archive')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Archive a URL using SingleFile.

    Emits the orchestrator protocol on stdout (START_TS/END_TS/STATUS/
    RESULT_JSON key=value lines) and exits 0 on success or skip, 1 on failure.
    """
    start_ts = datetime.now(timezone.utc)
    # Defaults reported if anything below fails early
    version = ''
    output = None
    status = 'failed'
    error = ''
    binary = None
    cmd_str = ''
    try:
        # Check if SingleFile is enabled
        if not get_env_bool('SAVE_SINGLEFILE', True):
            print('Skipping SingleFile (SAVE_SINGLEFILE=False)')
            status = 'skipped'
            end_ts = datetime.now(timezone.utc)
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={end_ts.isoformat()}')
            print(f'STATUS={status}')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)
        # Check if staticfile extractor already handled this (permanent skip)
        if has_staticfile_output():
            print(f'Skipping SingleFile - staticfile extractor already downloaded this')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)  # Permanent skip - staticfile already handled
        # Find binary
        binary = find_singlefile()
        if not binary:
            # Emit machine-readable dependency hints so the orchestrator can install it
            print(f'ERROR: SingleFile binary not found', file=sys.stderr)
            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
            print(f'INSTALL_HINT=npm install -g single-file-cli', file=sys.stderr)
            sys.exit(1)
        version = get_version(binary)
        cmd_str = f'{binary} {url} {OUTPUT_DIR}/{OUTPUT_FILE}'
        # Run extraction
        success, output, error = save_singlefile(url, binary)
        status = 'succeeded' if success else 'failed'
        if success and output:
            size = Path(output).stat().st_size
            print(f'SingleFile saved ({size} bytes)')
    except Exception as e:
        # NOTE: sys.exit() raises SystemExit, which is not an Exception
        # subclass, so the deliberate exits above are not swallowed here.
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if cmd_str:
        print(f'CMD={cmd_str}')
    if version:
        print(f'VERSION={version}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,110 @@
"""
Integration tests - archive example.com with SingleFile and verify output
"""
import json
import os
import subprocess
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__04_singlefile.js"
TEST_URL = "https://example.com"

# Check if single-file CLI is available so the network tests below can be
# skipped cleanly on machines without it.
try:
    result = subprocess.run(
        ["which", "single-file"],
        capture_output=True,
        timeout=5
    )
    SINGLEFILE_CLI_AVAILABLE = result.returncode == 0
except (OSError, subprocess.SubprocessError):
    # The previous bare `except:` also swallowed KeyboardInterrupt/SystemExit;
    # only genuine lookup failures should mean "CLI unavailable".
    SINGLEFILE_CLI_AVAILABLE = False
@pytest.mark.skipif(
    not SINGLEFILE_CLI_AVAILABLE,
    reason="single-file CLI not installed (npm install -g single-file-cli)"
)
def test_archives_example_com():
    """Archive example.com and verify output contains expected content"""
    # NOTE: live network test — requires outbound HTTPS and a working browser.
    with tempfile.TemporaryDirectory() as tmpdir:
        output_dir = Path(tmpdir) / "singlefile"
        output_dir.mkdir()
        output_file = output_dir / "singlefile.html"
        # Run single-file CLI
        result = subprocess.run(
            [
                "single-file",
                "--browser-headless",
                TEST_URL,
                str(output_file)
            ],
            capture_output=True,
            text=True,
            timeout=120
        )
        assert result.returncode == 0, f"Archive failed: {result.stderr}"
        # Verify output exists
        assert output_file.exists(), "Output file not created"
        # Read and verify content
        html_content = output_file.read_text()
        file_size = output_file.stat().st_size
        # Should be substantial (embedded resources)
        assert file_size > 900, f"Output too small: {file_size} bytes"
        # Verify HTML structure (SingleFile minifies, so <head> tag may be omitted)
        assert "<html" in html_content.lower()
        assert "<body" in html_content.lower()
        assert "<title>" in html_content.lower() or "title>" in html_content.lower()
        # Verify example.com content is actually present
        assert "example domain" in html_content.lower(), "Missing 'Example Domain' title"
        assert "this domain is" in html_content.lower(), "Missing example.com description text"
        assert "iana.org" in html_content.lower(), "Missing IANA link"
        # Verify it's not just empty/error page
        # NOTE(review): duplicates the earlier size check with the same threshold.
        assert file_size > 900, f"File too small: {file_size} bytes"
@pytest.mark.skipif(not SINGLEFILE_CLI_AVAILABLE, reason="single-file CLI not installed")
def test_different_urls_produce_different_outputs():
    """Verify different URLs produce different archived content"""
    # NOTE: live network test — archives two sites and compares the results.
    with tempfile.TemporaryDirectory() as tmpdir:
        outputs = {}
        for url in ["https://example.com", "https://example.org"]:
            # e.g. https://example.com -> example_com.html
            output_file = Path(tmpdir) / f"{url.replace('https://', '').replace('.', '_')}.html"
            result = subprocess.run(
                ["single-file", "--browser-headless", url, str(output_file)],
                capture_output=True,
                timeout=120
            )
            if result.returncode == 0 and output_file.exists():
                outputs[url] = output_file.read_text()
        assert len(outputs) == 2, "Should archive both URLs"
        # Verify outputs differ
        urls = list(outputs.keys())
        assert outputs[urls[0]] != outputs[urls[1]], "Different URLs should produce different outputs"
        # Each should contain its domain
        assert "example.com" in outputs[urls[0]]
        assert "example.org" in outputs[urls[1]]

View File

@@ -0,0 +1,385 @@
/**
* Unit tests for singlefile plugin
*
* Run with: node --test tests/test_singlefile.js
*/
const assert = require('assert');
const fs = require('fs');
const path = require('path');
const { describe, it, before, after, beforeEach, afterEach } = require('node:test');
// Test fixtures
// Shared scratch directories, created in before() and removed in after()
const TEST_DIR = path.join(__dirname, '.test_fixtures');
const TEST_EXTENSIONS_DIR = path.join(TEST_DIR, 'chrome_extensions');
const TEST_DOWNLOADS_DIR = path.join(TEST_DIR, 'chrome_downloads');
describe('singlefile plugin', () => {
  // Create/remove the fixture root around the whole suite
  before(() => {
    if (!fs.existsSync(TEST_DIR)) {
      fs.mkdirSync(TEST_DIR, { recursive: true });
    }
  });
  after(() => {
    if (fs.existsSync(TEST_DIR)) {
      fs.rmSync(TEST_DIR, { recursive: true, force: true });
    }
  });
  // Loads the real module; note require() caches it between tests
  describe('EXTENSION metadata', () => {
    it('should have correct webstore_id', () => {
      const { EXTENSION } = require('../on_Snapshot__04_singlefile.js');
      assert.strictEqual(EXTENSION.webstore_id, 'mpiodijhokgodhhofbcjdecpffjipkle');
    });
    it('should have correct name', () => {
      const { EXTENSION } = require('../on_Snapshot__04_singlefile.js');
      assert.strictEqual(EXTENSION.name, 'singlefile');
    });
  });
  describe('installSinglefileExtension', () => {
    beforeEach(() => {
      process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
      if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
        fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
      }
    });
    afterEach(() => {
      if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
        fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
      }
      delete process.env.CHROME_EXTENSIONS_DIR;
    });
    it('should use cached extension if available', async () => {
      const { installSinglefileExtension } = require('../on_Snapshot__04_singlefile.js');
      // Create fake cache: the record file plus an unpacked dir with a manifest
      const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'singlefile.extension.json');
      const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_singlefile');
      fs.mkdirSync(fakeExtensionDir, { recursive: true });
      fs.writeFileSync(
        path.join(fakeExtensionDir, 'manifest.json'),
        JSON.stringify({ version: '1.22.90' })
      );
      const fakeCache = {
        webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
        name: 'singlefile',
        unpacked_path: fakeExtensionDir,
        version: '1.22.90'
      };
      fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));
      const result = await installSinglefileExtension();
      assert.notStrictEqual(result, null);
      assert.strictEqual(result.webstore_id, 'mpiodijhokgodhhofbcjdecpffjipkle');
    });
  });
  // NOTE(review): most cases below assert on literals defined inside the test
  // itself rather than calling plugin code — they document intent but cannot
  // fail if the implementation changes.
  describe('saveSinglefileWithExtension', () => {
    beforeEach(() => {
      process.env.CHROME_DOWNLOADS_DIR = TEST_DOWNLOADS_DIR;
      if (!fs.existsSync(TEST_DOWNLOADS_DIR)) {
        fs.mkdirSync(TEST_DOWNLOADS_DIR, { recursive: true });
      }
    });
    afterEach(() => {
      if (fs.existsSync(TEST_DOWNLOADS_DIR)) {
        fs.rmSync(TEST_DOWNLOADS_DIR, { recursive: true });
      }
      delete process.env.CHROME_DOWNLOADS_DIR;
    });
    it('should require extension and version to be present', () => {
      const mockExtension = {
        name: 'singlefile',
        version: '1.22.96',
        id: 'test_id'
      };
      assert.ok(mockExtension.version);
      assert.ok(mockExtension.id);
    });
    it('should filter unsupported URL schemes', () => {
      const unsupportedSchemes = [
        'about:',
        'chrome:',
        'chrome-extension:',
        'data:',
        'javascript:',
        'blob:'
      ];
      unsupportedSchemes.forEach(scheme => {
        const testUrl = scheme + 'something';
        const urlScheme = testUrl.split(':')[0];
        assert.ok(unsupportedSchemes.some(s => s.startsWith(urlScheme)));
      });
    });
    it('should wait for file to appear in downloads directory', async () => {
      const checkDelay = 3000; // 3 seconds
      const maxTries = 10;
      // Total max wait time
      const maxWaitTime = checkDelay * maxTries;
      assert.strictEqual(maxWaitTime, 30000); // 30 seconds
    });
    it('should find downloaded file by checking URL in HTML header', () => {
      const testUrl = 'https://example.com';
      const mockHtml = `<!-- url: ${testUrl} --><html><head><meta charset="utf-8"></head></html>`;
      // Should be able to extract URL from header
      const headerPart = mockHtml.split('meta charset')[0];
      assert.ok(headerPart.includes(`url: ${testUrl}`));
    });
    it('should move file from downloads to output directory', () => {
      const downloadPath = path.join(TEST_DOWNLOADS_DIR, 'temp_file.html');
      const outputDir = 'singlefile';
      const outputFile = 'singlefile.html';
      const outputPath = path.join(outputDir, outputFile);
      // Verify paths are different
      assert.notStrictEqual(downloadPath, outputPath);
    });
  });
  describe('saveSinglefileWithCLI', () => {
    it('should use single-file-cli as fallback', () => {
      const cliCommand = 'single-file';
      // Should check for CLI availability
      assert.strictEqual(typeof cliCommand, 'string');
      assert.ok(cliCommand.length > 0);
    });
    it('should pass correct arguments to CLI', () => {
      const args = [
        '--browser-headless',
        'https://example.com',
        'singlefile/singlefile.html'
      ];
      assert.ok(args.includes('--browser-headless'));
      assert.ok(args.some(arg => arg.startsWith('http')));
    });
    it('should handle optional CLI arguments', () => {
      const options = {
        userAgent: 'Mozilla/5.0...',
        cookiesFile: '/path/to/cookies.txt',
        ignoreSSL: true
      };
      // Optional args should be conditionally added
      if (options.userAgent) {
        assert.ok(options.userAgent.length > 0);
      }
      if (options.ignoreSSL) {
        assert.strictEqual(options.ignoreSSL, true);
      }
    });
  });
  // Hook filenames encode run order: on_Snapshot__NN_<name>.js
  describe('priority and execution order', () => {
    it('should have priority 04 (early)', () => {
      const filename = 'on_Snapshot__04_singlefile.js';
      const match = filename.match(/on_Snapshot__(\d+)_/);
      assert.ok(match);
      const priority = parseInt(match[1]);
      assert.strictEqual(priority, 4);
    });
    it('should run before chrome_session (priority 20)', () => {
      const extensionPriority = 4;
      const chromeSessionPriority = 20;
      assert.ok(extensionPriority < chromeSessionPriority);
    });
    it('should install extensions in correct order', () => {
      const priorities = {
        captcha2: 1,
        istilldontcareaboutcookies: 2,
        ublock: 3,
        singlefile: 4
      };
      // Should be in ascending order
      assert.ok(priorities.captcha2 < priorities.istilldontcareaboutcookies);
      assert.ok(priorities.istilldontcareaboutcookies < priorities.ublock);
      assert.ok(priorities.ublock < priorities.singlefile);
    });
  });
  describe('output structure', () => {
    it('should define output directory and file', () => {
      const OUTPUT_DIR = 'singlefile';
      const OUTPUT_FILE = 'singlefile.html';
      assert.strictEqual(OUTPUT_DIR, 'singlefile');
      assert.strictEqual(OUTPUT_FILE, 'singlefile.html');
    });
    it('should create output directory if not exists', () => {
      const outputDir = path.join(TEST_DIR, 'singlefile');
      // Should create directory
      if (!fs.existsSync(outputDir)) {
        fs.mkdirSync(outputDir, { recursive: true });
      }
      assert.ok(fs.existsSync(outputDir));
      // Cleanup
      fs.rmSync(outputDir, { recursive: true });
    });
  });
  describe('extension vs CLI fallback', () => {
    it('should prefer extension over CLI', () => {
      const preferenceOrder = [
        'extension',
        'cli'
      ];
      assert.strictEqual(preferenceOrder[0], 'extension');
      assert.strictEqual(preferenceOrder[1], 'cli');
    });
    it('should fallback to CLI if extension unavailable', () => {
      const extensionAvailable = false;
      const cliAvailable = true;
      let method;
      if (extensionAvailable) {
        method = 'extension';
      } else if (cliAvailable) {
        method = 'cli';
      }
      assert.strictEqual(method, 'cli');
    });
    it('should use extension if available', () => {
      const extensionAvailable = true;
      let method;
      if (extensionAvailable) {
        method = 'extension';
      } else {
        method = 'cli';
      }
      assert.strictEqual(method, 'extension');
    });
  });
  describe('file matching and validation', () => {
    beforeEach(() => {
      if (!fs.existsSync(TEST_DOWNLOADS_DIR)) {
        fs.mkdirSync(TEST_DOWNLOADS_DIR, { recursive: true });
      }
    });
    afterEach(() => {
      if (fs.existsSync(TEST_DOWNLOADS_DIR)) {
        fs.rmSync(TEST_DOWNLOADS_DIR, { recursive: true });
      }
    });
    it('should filter HTML files from downloads', () => {
      // Create mock download files
      const files = [
        'example.html',
        'test.pdf',
        'image.png',
        'page.html'
      ];
      const htmlFiles = files.filter(f => f.endsWith('.html'));
      assert.strictEqual(htmlFiles.length, 2);
      assert.ok(htmlFiles.includes('example.html'));
      assert.ok(htmlFiles.includes('page.html'));
    });
    it('should match URL in HTML header comment', () => {
      const testUrl = 'https://example.com/page';
      const htmlContent = `<!--
Page saved with SingleFile
url: ${testUrl}
saved date: 2024-01-01
-->
<html>...</html>`;
      const headerSection = htmlContent.split('meta charset')[0] || htmlContent.split('<html>')[0];
      assert.ok(headerSection.includes(`url: ${testUrl}`));
    });
    it('should handle multiple new files in downloads', () => {
      const filesBefore = new Set(['old1.html', 'old2.html']);
      const filesAfter = ['old1.html', 'old2.html', 'new1.html', 'new2.html'];
      const filesNew = filesAfter.filter(f => !filesBefore.has(f));
      assert.strictEqual(filesNew.length, 2);
      assert.ok(filesNew.includes('new1.html'));
      assert.ok(filesNew.includes('new2.html'));
    });
  });
  // NOTE(review): these cases also assert on local literals only; they do not
  // exercise the plugin's actual timeout/error paths.
  describe('error handling', () => {
    it('should timeout after max wait time', () => {
      const checkDelay = 3000; // ms
      const maxTries = 10;
      const timeoutMs = checkDelay * maxTries;
      assert.strictEqual(timeoutMs, 30000); // 30 seconds
    });
    it('should handle missing extension gracefully', () => {
      const extension = null;
      if (!extension || !extension.version) {
        // Should throw error
        assert.ok(true);
      }
    });
    it('should handle file not found after waiting', () => {
      const filesNew = [];
      const maxWaitReached = true;
      if (filesNew.length === 0 && maxWaitReached) {
        // Should return null
        const result = null;
        assert.strictEqual(result, null);
      }
    });
  });
});

View File

@@ -0,0 +1,141 @@
"""
Unit tests for singlefile plugin
Tests invoke the plugin hook as an external process and verify outputs/side effects.
"""
import json
import os
import subprocess
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__04_singlefile.js"
def test_install_script_exists():
    """Sanity check: the Node install hook ships alongside this test package."""
    missing_msg = f"Install script not found: {INSTALL_SCRIPT}"
    assert INSTALL_SCRIPT.exists(), missing_msg
def test_extension_metadata():
    """Test that SingleFile extension has correct metadata"""
    with tempfile.TemporaryDirectory() as tmpdir:
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")
        # Load the module in node and dump its EXTENSION constant as JSON
        result = subprocess.run(
            ["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"],
            capture_output=True,
            text=True,
            env=env
        )
        assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"
        metadata = json.loads(result.stdout)
        # Chrome Web Store ID of the SingleFile extension
        assert metadata["webstore_id"] == "mpiodijhokgodhhofbcjdecpffjipkle"
        assert metadata["name"] == "singlefile"
def test_install_creates_cache():
    """Test that install creates extension cache"""
    # NOTE: runs the installer via node — may download the extension (network).
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )
        # Check output mentions installation
        assert "SingleFile" in result.stdout or "singlefile" in result.stdout
        # Check cache file was created
        cache_file = ext_dir / "singlefile.extension.json"
        assert cache_file.exists(), "Cache file should be created"
        # Verify cache content
        cache_data = json.loads(cache_file.read_text())
        assert cache_data["webstore_id"] == "mpiodijhokgodhhofbcjdecpffjipkle"
        assert cache_data["name"] == "singlefile"
def test_install_uses_existing_cache():
    """Test that install uses existing cache when available"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        # Create a fake unpacked extension with a manifest...
        fake_extension_dir = ext_dir / "mpiodijhokgodhhofbcjdecpffjipkle__singlefile"
        fake_extension_dir.mkdir(parents=True)
        manifest = {"version": "1.22.96", "name": "SingleFile"}
        (fake_extension_dir / "manifest.json").write_text(json.dumps(manifest))
        # ...and the cache record the installer actually reads. Without
        # singlefile.extension.json the installer ignores the unpacked dir and
        # re-installs, so this test would never exercise the cache path.
        cache_record = {
            "webstore_id": "mpiodijhokgodhhofbcjdecpffjipkle",
            "name": "singlefile",
            "unpacked_path": str(fake_extension_dir),
            "version": "1.22.96",
        }
        (ext_dir / "singlefile.extension.json").write_text(json.dumps(cache_record))
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )
        # Should use cache or install successfully
        assert result.returncode == 0
def test_no_configuration_required():
    """Test that SingleFile works without configuration"""
    # Unlike e.g. captcha plugins, SingleFile needs no API keys or secrets.
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        # No API keys needed
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )
        # Should work without API keys
        assert result.returncode == 0
def test_priority_order():
    """The install hook's filename must encode priority 04 (runs early)."""
    hook_name = INSTALL_SCRIPT.name
    assert "04" in hook_name, "SingleFile should have priority 04"
    assert hook_name.startswith("on_Snapshot__04_"), "Should follow priority naming convention"
def test_output_directory_structure():
    """The hook source should reference the singlefile output dir and an HTML artifact."""
    script_content = INSTALL_SCRIPT.read_text()
    lowered = script_content.lower()
    # Should mention singlefile output directory
    assert "singlefile" in lowered
    # Should mention HTML output
    assert ".html" in script_content or "html" in lowered

View File

@@ -0,0 +1,243 @@
#!/usr/bin/env node
/**
* Extract SSL/TLS certificate details from a URL.
*
* Connects to Chrome session and retrieves security details including:
* - Protocol (TLS 1.2, TLS 1.3, etc.)
* - Cipher suite
* - Certificate issuer, validity period
* - Security state
*
* Usage: on_Snapshot__16_ssl.js --url=<url> --snapshot-id=<uuid>
* Output: Writes ssl/ssl.json
*
* Environment variables:
* SAVE_SSL: Enable SSL extraction (default: true)
*/
const fs = require('fs');
const path = require('path');
const puppeteer = require('puppeteer-core');
// Extractor metadata
const EXTRACTOR_NAME = 'ssl';                 // name reported in RESULT_JSON
const OUTPUT_DIR = 'ssl';                     // output directory, relative to cwd
const OUTPUT_FILE = 'ssl.json';               // JSON report filename
const CHROME_SESSION_DIR = 'chrome_session';  // where chrome_session writes cdp_url.txt
// Parse command line arguments
/**
 * Parse `--key=value` CLI flags from process.argv into an object.
 * Dashes in keys become underscores; a bare `--flag` maps to `true`.
 */
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...valueParts] = token.slice(2).split('=');
    parsed[rawKey.replace(/-/g, '_')] = valueParts.join('=') || true;
  }
  return parsed;
}
// Environment helpers
/** Read an env var, trimmed; falls back to defaultValue when unset/empty. */
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}

/** Read a boolean env var (true/1/yes/on, false/0/no/off); otherwise defaultValue. */
function getEnvBool(name, defaultValue = false) {
  const flag = getEnv(name, '').toLowerCase();
  if (['true', '1', 'yes', 'on'].includes(flag)) return true;
  if (['false', '0', 'no', 'off'].includes(flag)) return false;
  return defaultValue;
}
// Get CDP URL from chrome_session
// Returns the trimmed WebSocket endpoint, or null when no session was recorded.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(cdpFile)) return null;
  return fs.readFileSync(cdpFile, 'utf8').trim();
}
// Extract SSL details
/**
 * Connect to the shared Chrome session and collect TLS/security details for
 * `url` via the CDP Security domain. Writes ssl/ssl.json on success.
 *
 * @param {string} url - The URL being archived (must be https)
 * @returns {Promise<{success: boolean, output?: string, sslInfo?: Object, error?: string}>}
 */
async function extractSsl(url) {
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  // Only extract SSL for HTTPS URLs
  if (!url.startsWith('https://')) {
    return { success: false, error: 'URL is not HTTPS' };
  }
  let browser = null;
  let sslInfo = {};
  try {
    // Connect to the Chrome session left by the chrome_session extractor
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
    }
    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });
    // Get the page: prefer one already navigated to an http(s) URL
    const pages = await browser.pages();
    const page = pages.find(p => p.url().startsWith('http')) || pages[0];
    if (!page) {
      return { success: false, error: 'No page found in Chrome session' };
    }
    // Get CDP client for low-level access
    const client = await page.target().createCDPSession();
    // Enable Security domain
    await client.send('Security.enable');
    // Get security details from the loaded page
    const securityState = await client.send('Security.getSecurityState');
    sslInfo = {
      url,
      securityState: securityState.securityState,
      schemeIsCryptographic: securityState.schemeIsCryptographic,
      summary: securityState.summary || '',
    };
    // Try to get detailed certificate info if available
    if (securityState.securityStateIssueIds && securityState.securityStateIssueIds.length > 0) {
      sslInfo.issues = securityState.securityStateIssueIds;
    }
    // Get response security details from navigation
    // NOTE(review): this listener is attached AFTER the page already loaded
    // and nothing here triggers another navigation, so mainResponse appears
    // to stay null and the certificate-detail branch below never runs —
    // confirm whether a page.goto()/reload was intended before the listener.
    let mainResponse = null;
    page.on('response', async (response) => {
      if (response.url() === url || response.request().isNavigationRequest()) {
        mainResponse = response;
      }
    });
    // If we have security details from response
    if (mainResponse) {
      try {
        const securityDetails = await mainResponse.securityDetails();
        if (securityDetails) {
          sslInfo.protocol = securityDetails.protocol();
          sslInfo.subjectName = securityDetails.subjectName();
          sslInfo.issuer = securityDetails.issuer();
          sslInfo.validFrom = securityDetails.validFrom();
          sslInfo.validTo = securityDetails.validTo();
          sslInfo.certificateId = securityDetails.subjectName();
          const sanList = securityDetails.sanList();
          if (sanList && sanList.length > 0) {
            sslInfo.subjectAlternativeNames = sanList;
          }
        }
      } catch (e) {
        // Security details not available
      }
    }
    await client.detach();
    // Write output
    fs.writeFileSync(outputPath, JSON.stringify(sslInfo, null, 2));
    return { success: true, output: outputPath, sslInfo };
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Disconnect (not close) so the shared browser keeps serving other extractors
    if (browser) {
      browser.disconnect();
    }
  }
}
/**
 * CLI entry point: parse args, run extractSsl, and emit the orchestrator
 * protocol (START_TS/END_TS/STATUS/RESULT_JSON lines) on stdout.
 * Exits 0 on success or skip, 1 on failure.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__16_ssl.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  try {
    // Check if enabled
    if (!getEnvBool('SAVE_SSL', true)) {
      console.log('Skipping SSL (SAVE_SSL=False)');
      status = 'skipped';
      const endTs = new Date();
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${endTs.toISOString()}`);
      console.log(`STATUS=${status}`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
      process.exit(0);
    }
    const result = await extractSsl(url);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      const protocol = result.sslInfo?.protocol || 'unknown';
      console.log(`SSL details extracted: ${protocol}`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;
  // Print results
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' ? 0 : 1);
}

// Top-level invocation: fail with a non-zero exit code on any unhandled error
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,337 @@
#!/usr/bin/env python3
"""
Download static files (PDFs, images, archives, etc.) directly.
This extractor runs AFTER chrome_session and checks the Content-Type header
from chrome_session/response_headers.json to determine if the URL points to
a static file that should be downloaded directly.
Other extractors check for the presence of this extractor's output directory
to know if they should skip (since Chrome-based extractors can't meaningfully
process static files like PDFs, images, etc.).
Usage: on_Snapshot__21_staticfile.py --url=<url> --snapshot-id=<uuid>
Output: Downloads file to staticfile/<filename>
Environment variables:
STATICFILE_TIMEOUT: Timeout in seconds (default: 300)
STATICFILE_MAX_SIZE: Maximum file size in bytes (default: 1GB)
USER_AGENT: User agent string (optional)
CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
"""
import json
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse, unquote
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'staticfile'
OUTPUT_DIR = 'staticfile'          # downloads land in ./staticfile/ relative to cwd
CHROME_SESSION_DIR = 'chrome_session'  # where the chrome_session hook wrote its output
# Content-Types that indicate static files
# These can't be meaningfully processed by Chrome-based extractors
STATIC_CONTENT_TYPES = {
    # Documents
    'application/pdf',
    'application/msword',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    'application/vnd.ms-excel',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    'application/vnd.ms-powerpoint',
    'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    'application/rtf',
    'application/epub+zip',
    # Images
    'image/png',
    'image/jpeg',
    'image/gif',
    'image/webp',
    'image/svg+xml',
    'image/x-icon',
    'image/bmp',
    'image/tiff',
    'image/avif',
    'image/heic',
    'image/heif',
    # Audio
    'audio/mpeg',
    'audio/mp3',
    'audio/wav',
    'audio/flac',
    'audio/aac',
    'audio/ogg',
    'audio/webm',
    'audio/m4a',
    'audio/opus',
    # Video
    'video/mp4',
    'video/webm',
    'video/x-matroska',
    'video/avi',
    'video/quicktime',
    'video/x-ms-wmv',
    'video/x-flv',
    # Archives
    'application/zip',
    'application/x-tar',
    'application/gzip',
    'application/x-bzip2',
    'application/x-xz',
    'application/x-7z-compressed',
    'application/x-rar-compressed',
    'application/vnd.rar',
    # Data
    'application/json',
    'application/xml',
    'text/csv',
    'text/xml',
    'application/x-yaml',
    # Executables/Binaries
    'application/octet-stream',  # Generic binary
    'application/x-executable',
    'application/x-msdos-program',
    'application/x-apple-diskimage',
    'application/vnd.debian.binary-package',
    'application/x-rpm',
    # Other
    'application/x-bittorrent',
    'application/wasm',
}
# Also check Content-Type prefixes for categories
# NOTE(review): 'application/x-' is very broad — it also matches types like
# application/x-www-form-urlencoded that are not downloadable files; confirm
# this over-matching is intentional before relying on it.
STATIC_CONTENT_TYPE_PREFIXES = (
    'image/',
    'audio/',
    'video/',
    'application/zip',
    'application/x-',
)
def get_env(name: str, default: str = '') -> str:
    """Return the environment variable `name`, stripped of surrounding whitespace.

    Falls back to `default` (also stripped) when the variable is unset.
    """
    value = os.environ.get(name, default)
    return value.strip()
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean environment variable.

    Accepts true/1/yes/on and false/0/no/off (case-insensitive, whitespace
    ignored); any other value, including unset, yields `default`.
    """
    raw = os.environ.get(name, '').strip().lower()
    if raw in ('true', '1', 'yes', 'on'):
        return True
    if raw in ('false', '0', 'no', 'off'):
        return False
    return default
def get_env_int(name: str, default: int = 0) -> int:
    """Parse an integer environment variable, returning `default` when unset
    or when the value is not a valid integer."""
    raw = os.environ.get(name, str(default)).strip()
    try:
        return int(raw)
    except ValueError:
        return default
def get_content_type_from_chrome_session() -> str | None:
    """Read the Content-Type recorded by the chrome_session extractor.

    Returns the bare MIME type (lowercase, parameters such as charset removed),
    or None when the headers file is missing or unreadable.
    """
    headers_path = Path(CHROME_SESSION_DIR) / 'response_headers.json'
    if not headers_path.exists():
        return None
    try:
        headers = json.loads(headers_path.read_text())
        # Headers might be nested or flat depending on chrome_session format
        raw = headers.get('content-type') or headers.get('Content-Type') or ''
        # Drop parameters like "; charset=utf-8" and normalize case.
        return raw.split(';')[0].strip().lower()
    except Exception:
        return None
def is_static_content_type(content_type: str) -> bool:
    """Return True when the MIME type should be downloaded directly rather
    than rendered in Chrome (exact match or category-prefix match)."""
    if not content_type:
        return False
    if content_type in STATIC_CONTENT_TYPES:
        return True
    return any(content_type.startswith(prefix) for prefix in STATIC_CONTENT_TYPE_PREFIXES)
def get_filename_from_url(url: str) -> str:
    """Derive a safe local filename from the URL path.

    Uses the last path segment (percent-decoded), falls back to
    'downloaded_file' for empty paths, replaces path separators, and caps the
    result at 200 characters.
    """
    decoded_path = unquote(urlparse(url).path)
    name = decoded_path.rsplit('/', 1)[-1] or 'downloaded_file'
    # Neutralize any separator characters that survived percent-decoding.
    for sep in ('/', '\\'):
        name = name.replace(sep, '_')
    return name[:200]
def download_file(url: str) -> tuple[bool, str | None, str]:
    """
    Download a static file into OUTPUT_DIR.

    Streams the response in 8 KB chunks so large files never have to fit in
    memory, and aborts (removing the partial file) when STATICFILE_MAX_SIZE
    is exceeded.

    Returns: (success, output_path, error_message)
    """
    import re
    import requests
    timeout = get_env_int('STATICFILE_TIMEOUT', 300)
    max_size = get_env_int('STATICFILE_MAX_SIZE', 1024 * 1024 * 1024)  # 1GB default
    user_agent = get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True)
    headers = {'User-Agent': user_agent}
    try:
        # Stream download to handle large files
        response = requests.get(
            url,
            headers=headers,
            timeout=timeout,
            stream=True,
            verify=check_ssl,
            allow_redirects=True,
        )
        response.raise_for_status()
        # Fast-path rejection when the server declares the size up front.
        content_length = response.headers.get('content-length')
        if content_length and int(content_length) > max_size:
            return False, None, f'File too large: {int(content_length)} bytes > {max_size} max'
        # Create output directory
        output_dir = Path(OUTPUT_DIR)
        output_dir.mkdir(exist_ok=True)
        # Determine filename (URL-derived fallback is already sanitized)
        filename = get_filename_from_url(url)
        # Prefer the server-provided filename from Content-Disposition, but
        # sanitize it: a hostile `filename=../../x` header must not be able to
        # escape OUTPUT_DIR, so strip any path components and cap the length.
        content_disp = response.headers.get('content-disposition', '')
        if 'filename=' in content_disp:
            match = re.search(r'filename[*]?=["\']?([^"\';\n]+)', content_disp)
            if match:
                candidate = match.group(1).strip()
                candidate = candidate.replace('\\', '/').rsplit('/', 1)[-1]
                if candidate:
                    filename = candidate[:200]
        output_path = output_dir / filename
        # Download in chunks, enforcing the size cap as we go
        downloaded_size = 0
        with open(output_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    downloaded_size += len(chunk)
                    if downloaded_size > max_size:
                        f.close()
                        output_path.unlink()  # remove the partial download
                        return False, None, f'File too large: exceeded {max_size} bytes'
                    f.write(chunk)
        return True, str(output_path), ''
    except requests.exceptions.Timeout:
        return False, None, f'Timed out after {timeout} seconds'
    except requests.exceptions.SSLError as e:
        return False, None, f'SSL error: {e}'
    except requests.exceptions.RequestException as e:
        return False, None, f'Download failed: {e}'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='URL to download')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Download static files based on Content-Type from chrome_session.

    Emits the START_TS/END_TS/STATUS/RESULT_JSON key=value protocol on stdout.
    Exits 0 on success or skip, 1 on failure.
    """
    start_ts = datetime.now(timezone.utc)
    output = None
    status = 'failed'
    error = ''
    # Check Content-Type from chrome_session's response headers
    content_type = get_content_type_from_chrome_session()
    # If chrome_session didn't run or no Content-Type, skip
    if not content_type:
        print(f'No Content-Type found (chrome_session may not have run)')
        print(f'START_TS={start_ts.isoformat()}')
        print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
        print(f'STATUS=skipped')
        print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
        sys.exit(0)  # Permanent skip - can't determine content type
    # If not a static file type, skip (this is the normal case for HTML pages)
    if not is_static_content_type(content_type):
        print(f'Not a static file (Content-Type: {content_type})')
        print(f'START_TS={start_ts.isoformat()}')
        print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
        print(f'STATUS=skipped')
        print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id, "content_type": content_type})}')
        sys.exit(0)  # Permanent skip - not a static file
    try:
        # Download the file
        print(f'Static file detected (Content-Type: {content_type}), downloading...')
        success, output, error = download_file(url)
        status = 'succeeded' if success else 'failed'
        if success and output:
            size = Path(output).stat().st_size
            print(f'Static file downloaded ({size} bytes): {output}')
    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Print JSON result (machine-readable summary mirroring the lines above)
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'content_type': content_type,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,262 @@
#!/usr/bin/env node
/**
* Extract the title of a URL.
*
* If a Chrome session exists (from chrome_session extractor), connects to it via CDP
* to get the page title (which includes JS-rendered content).
* Otherwise falls back to fetching the URL and parsing HTML.
*
* Usage: on_Snapshot__10_title.js --url=<url> --snapshot-id=<uuid>
* Output: Writes title/title.txt
*
* Environment variables:
* TIMEOUT: Timeout in seconds (default: 30)
* USER_AGENT: User agent string (optional)
*/
const fs = require('fs');
const path = require('path');
const https = require('https');
const http = require('http');
// Extractor metadata
// Paths are resolved relative to the process cwd (the tests below run the hook
// with cwd set to the snapshot's working directory).
const EXTRACTOR_NAME = 'title';
const OUTPUT_DIR = 'title';
const OUTPUT_FILE = 'title.txt';
const CHROME_SESSION_DIR = 'chrome_session';  // written by the chrome_session hook
// Parse `--key=value` CLI flags into an object.
// `--snapshot-id` style keys become snake_case (`snapshot_id`); a bare
// `--flag` (no value) is stored as `true`; non-flag tokens are ignored.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...rest] = token.slice(2).split('=');
    parsed[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return parsed;
}
// Read an environment variable, trimmed, with a fallback default.
// Note: an env var set to the empty string falls back to defaultValue
// (|| semantics), matching how the other hooks read configuration.
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}
// Read an integer environment variable; non-numeric or unset values
// fall back to defaultValue.
function getEnvInt(name, defaultValue = 0) {
  const raw = (process.env[name] || String(defaultValue)).trim();
  const parsed = Number.parseInt(raw, 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
// Return the DevTools websocket URL written by the chrome_session hook,
// or null when no session file exists (forces the HTTP fallback path).
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(cdpFile)) {
    return null;
  }
  return fs.readFileSync(cdpFile, 'utf8').trim();
}
// Extract a page title from raw HTML.
// Prefers the <title> tag, then og:title, then twitter:title; returns null
// when none is found. Meta attribute order is not fixed in real-world HTML,
// so both `property=... content=...` and `content=... property=...` are
// accepted (the original only matched the former).
function extractTitleFromHtml(html) {
  // Try <title> tag
  const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
  if (titleMatch) {
    return titleMatch[1].trim();
  }
  // Helper: find <meta attr="value" content="..."> in either attribute order.
  const metaContent = (attrName, attrValue) => {
    const forward = new RegExp(
      `<meta[^>]+${attrName}=["']${attrValue}["'][^>]+content=["']([^"']+)["']`, 'i');
    const reversed = new RegExp(
      `<meta[^>]+content=["']([^"']+)["'][^>]+${attrName}=["']${attrValue}["']`, 'i');
    const m = html.match(forward) || html.match(reversed);
    return m ? m[1].trim() : null;
  };
  // Try og:title, then twitter:title
  return metaContent('property', 'og:title') || metaContent('name', 'twitter:title');
}
// Fetch URL and extract title (fallback method when no Chrome session exists).
// Buffers at most the first 64KB of the body (titles live in <head>); follows
// redirects up to `redirectsLeft` hops (default 5) to avoid redirect loops.
// Returns a Promise resolving to the title string, rejecting on failure.
//
// Fixes vs. the original: when the 64KB cap was hit, the request was destroyed
// without attempting extraction, so a title already present in the buffer was
// lost and the promise settled via the 'error' handler; redirects were also
// followed without any depth limit.
function fetchTitle(url, redirectsLeft = 5) {
  return new Promise((resolve, reject) => {
    const timeout = getEnvInt('TIMEOUT', 30) * 1000;
    const userAgent = getEnv('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)');
    const client = url.startsWith('https') ? https : http;
    const req = client.get(url, {
      headers: { 'User-Agent': userAgent },
      timeout,
    }, (res) => {
      // Handle redirects (bounded)
      if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
        if (redirectsLeft <= 0) {
          reject(new Error('Too many redirects'));
          return;
        }
        fetchTitle(res.headers.location, redirectsLeft - 1).then(resolve).catch(reject);
        return;
      }
      let data = '';
      res.on('data', chunk => {
        data += chunk;
        // Only need first 64KB to find title
        if (data.length > 65536) {
          // Settle from what we already buffered before aborting the request;
          // a later 'error' from destroy() is ignored once settled.
          const title = extractTitleFromHtml(data);
          if (title) {
            resolve(title);
          } else {
            reject(new Error('No title found in first 64KB of HTML'));
          }
          req.destroy();
        }
      });
      res.on('end', () => {
        const title = extractTitleFromHtml(data);
        if (title) {
          resolve(title);
        } else {
          reject(new Error('No title found in HTML'));
        }
      });
    });
    req.on('error', reject);
    req.on('timeout', () => {
      req.destroy();
      reject(new Error('Request timeout'));
    });
  });
}
// Read the title of the already-loaded page in the shared Chrome session
// via the CDP websocket endpoint. Throws if the session has no pages.
async function getTitleFromCdp(cdpUrl) {
  const puppeteer = require('puppeteer-core');
  const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });
  try {
    // Prefer a page that has actually navigated somewhere.
    const pages = await browser.pages();
    const target = pages.find(p => p.url().startsWith('http')) || pages[0];
    if (!target) {
      throw new Error('No page found in Chrome session');
    }
    const title = await target.title();
    if (title) {
      return title;
    }
    // Empty <title>: fall back to metadata / first heading in the DOM.
    return await target.evaluate(() => {
      return document.title ||
        document.querySelector('meta[property="og:title"]')?.content ||
        document.querySelector('meta[name="twitter:title"]')?.content ||
        document.querySelector('h1')?.textContent?.trim();
    });
  } finally {
    // Disconnect without closing the shared browser (other extractors use it).
    browser.disconnect();
  }
}
// Extract the page title, preferring the live Chrome session (captures
// JS-rendered titles) and falling back to a plain HTTP fetch.
// Returns { success, output, title, method } on success or
// { success: false, error } on failure.
async function extractTitle(url) {
  // Ensure output directory exists (no-op if already present).
  fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  // Try Chrome session first
  const cdpUrl = getCdpUrl();
  if (cdpUrl) {
    try {
      const cdpTitle = await getTitleFromCdp(cdpUrl);
      if (cdpTitle) {
        fs.writeFileSync(outputPath, cdpTitle, 'utf8');
        return { success: true, output: outputPath, title: cdpTitle, method: 'cdp' };
      }
    } catch (e) {
      console.error(`CDP title extraction failed: ${e.message}, falling back to HTTP`);
    }
  }
  // Fallback to HTTP fetch
  try {
    const httpTitle = await fetchTitle(url);
    fs.writeFileSync(outputPath, httpTitle, 'utf8');
    return { success: true, output: outputPath, title: httpTitle, method: 'http' };
  } catch (e) {
    return { success: false, error: e.message };
  }
}
/**
 * CLI entry point: parse --url/--snapshot-id, run the title extractor, and
 * report via the START_TS/END_TS/STATUS/RESULT_JSON stdout protocol.
 * Exits 0 on success, 1 on failure or bad arguments.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__10_title.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }
  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  try {
    const result = await extractTitle(url);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      // `method` is 'cdp' (Chrome session) or 'http' (fallback fetch)
      console.log(`Title extracted (${result.method}): ${result.title}`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }
  const endTs = new Date();
  const duration = (endTs - startTs) / 1000; // seconds, from Date subtraction (ms)
  // Print results
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }
  // Print JSON result (machine-readable summary; mirrors the KEY=VALUE lines above)
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
  process.exit(status === 'succeeded' ? 0 : 1);
}
// Top-level runner: any unhandled rejection becomes a fatal non-zero exit.
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});

View File

@@ -0,0 +1,241 @@
"""
Integration tests for title plugin
Tests verify:
1. Plugin script exists
2. Node.js is available
3. Title extraction works for real example.com
4. Output file contains actual page title
5. Handles various title sources (<title>, og:title, twitter:title)
6. Config options work (TIMEOUT, USER_AGENT)
7. Fallback to HTTP when chrome_session not available
"""
import shutil
import subprocess
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
# NOTE(review): the hook's own usage string says `on_Snapshot__10_title.js`,
# but this test expects `on_Snapshot__32_title.js` — confirm which priority
# number is current, or test_hook_script_exists will fail.
TITLE_HOOK = PLUGIN_DIR / 'on_Snapshot__32_title.js'
TEST_URL = 'https://example.com'
def test_hook_script_exists():
    """Verify hook script exists."""
    assert TITLE_HOOK.exists(), f"Hook script not found: {TITLE_HOOK}"


def test_extracts_title_from_example_com():
    """Test full workflow: extract title from real example.com."""
    # NOTE: integration test — requires node and live network access to example.com.
    # Check node is available
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Run title extraction (cwd=tmpdir so the hook writes title/ there)
        result = subprocess.run(
            ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        assert result.returncode == 0, f"Extraction failed: {result.stderr}"
        # Verify output in stdout
        assert 'STATUS=succeeded' in result.stdout, "Should report success"
        assert 'Title extracted' in result.stdout, "Should report completion"
        # Verify output directory created
        title_dir = tmpdir / 'title'
        assert title_dir.exists(), "Output directory not created"
        # Verify output file exists
        title_file = title_dir / 'title.txt'
        assert title_file.exists(), "title.txt not created"
        # Verify title contains REAL example.com title
        title_text = title_file.read_text().strip()
        assert len(title_text) > 0, "Title should not be empty"
        assert 'example' in title_text.lower(), "Title should contain 'example'"
        # example.com has title "Example Domain"
        assert 'example domain' in title_text.lower(), f"Expected 'Example Domain', got: {title_text}"
        # Verify RESULT_JSON is present
        assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON"
def test_falls_back_to_http_when_chrome_session_unavailable():
    """Test that title plugin falls back to HTTP when chrome_session unavailable."""
    # NOTE: requires node and live network access to example.com.
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Don't create chrome_session directory - force HTTP fallback
        # Run title extraction
        result = subprocess.run(
            ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        assert result.returncode == 0, f"Extraction failed: {result.stderr}"
        assert 'STATUS=succeeded' in result.stdout, "Should report success"
        # Verify output exists and has real title
        output_title_file = tmpdir / 'title' / 'title.txt'
        assert output_title_file.exists(), "Output title.txt not created"
        title_text = output_title_file.read_text().strip()
        assert 'example' in title_text.lower()


def test_config_timeout_honored():
    """Test that TIMEOUT config is respected."""
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Set very short timeout (but example.com should still succeed)
        import os
        env = os.environ.copy()
        env['TIMEOUT'] = '5'
        result = subprocess.run(
            ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )
        # Should complete (success or fail, but not hang)
        assert result.returncode in (0, 1), "Should complete without hanging"
def test_config_user_agent():
    """Test that USER_AGENT config is used."""
    # NOTE(review): this only asserts on the success path; a non-zero exit is
    # silently accepted, so a UA-handling regression would not fail this test.
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Set custom user agent
        import os
        env = os.environ.copy()
        env['USER_AGENT'] = 'TestBot/1.0'
        result = subprocess.run(
            ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testua'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )
        # Should succeed (example.com doesn't block)
        if result.returncode == 0:
            assert 'STATUS=succeeded' in result.stdout


def test_handles_https_urls():
    """Test that HTTPS URLs work correctly."""
    # NOTE(review): assertions are skipped when the hook fails or writes no
    # output, so this is a smoke test rather than a strict check.
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        result = subprocess.run(
            ['node', str(TITLE_HOOK), '--url=https://example.org', '--snapshot-id=testhttps'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        if result.returncode == 0:
            output_title_file = tmpdir / 'title' / 'title.txt'
            if output_title_file.exists():
                title_text = output_title_file.read_text().strip()
                assert len(title_text) > 0, "Title should not be empty"
                assert 'example' in title_text.lower()
def test_handles_404_gracefully():
    """Test that title plugin handles 404 pages.
    Note: example.com returns valid HTML even for 404 pages, so extraction may succeed
    with the generic "Example Domain" title.
    """
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        result = subprocess.run(
            ['node', str(TITLE_HOOK), '--url=https://example.com/nonexistent-page-404', '--snapshot-id=test404'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        # May succeed or fail depending on server behavior
        # example.com returns "Example Domain" even for 404s
        assert result.returncode in (0, 1), "Should complete (may succeed or fail)"


def test_handles_redirects():
    """Test that title plugin handles redirects correctly."""
    # NOTE: requires live network; http://example.com redirecting to https is
    # assumed — assertions only run on the success path.
    if not shutil.which('node'):
        pytest.skip("node not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # http://example.com redirects to https://example.com
        result = subprocess.run(
            ['node', str(TITLE_HOOK), '--url=http://example.com', '--snapshot-id=testredirect'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60
        )
        # Should succeed and follow redirect
        if result.returncode == 0:
            output_title_file = tmpdir / 'title' / 'title.txt'
            if output_title_file.exists():
                title_text = output_title_file.read_text().strip()
                assert 'example' in title_text.lower()


if __name__ == '__main__':
    pytest.main([__file__, '-v'])

View File

@@ -0,0 +1,116 @@
#!/usr/bin/env node
/**
* uBlock Origin Extension Plugin
*
* Installs and configures the uBlock Origin Chrome extension for ad blocking
* and privacy protection during page archiving.
*
* Extension: https://chromewebstore.google.com/detail/cjpalhdlnbpafiamejdnhcphjbkeiagm
*
* Priority: 03 (early) - Must install before Chrome session starts
* Hook: on_Snapshot
*
* This extension automatically:
* - Blocks ads, trackers, and malware domains
* - Reduces page load time and bandwidth usage
* - Improves privacy during archiving
* - Removes clutter from archived pages
* - Uses efficient blocking with filter lists
*/
const path = require('path');
const fs = require('fs');
// Import extension utilities
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
// Extension metadata
const EXTENSION = {
  webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm',  // Chrome Web Store ID for uBlock Origin
  name: 'ublock',
};
// Get extensions directory from environment or use default
// (falls back to DATA_DIR/personas/<ACTIVE_PERSONA>/chrome_extensions,
// with './data' and 'Default' as last-resort defaults)
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
  path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
/**
 * Install the uBlock Origin extension via the shared extension utilities.
 * Returns the extension metadata object, or null when installation fails.
 */
async function installUblockExtension() {
  console.log('[*] Installing uBlock Origin extension...');
  const installed = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
  if (installed) {
    console.log('[+] uBlock Origin extension installed');
    console.log('[+] Ads and trackers will be blocked during archiving');
    return installed;
  }
  console.error('[❌] Failed to install uBlock Origin extension');
  return null;
}
/**
 * Note: uBlock Origin works automatically with default filter lists.
 * No configuration needed - blocks ads, trackers, and malware domains out of the box.
 */
/**
 * Main entry point - install the extension before archiving.
 * Reuses a cached install when its manifest is still on disk; otherwise
 * installs fresh and persists metadata for the chrome_session hook.
 */
async function main() {
  const cacheFile = path.join(EXTENSIONS_DIR, 'ublock.extension.json');
  // Fast path: reuse a previously-installed copy if its manifest still exists.
  if (fs.existsSync(cacheFile)) {
    try {
      const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
      if (fs.existsSync(path.join(cached.unpacked_path, 'manifest.json'))) {
        console.log('[*] uBlock Origin extension already installed (using cache)');
        return cached;
      }
    } catch (e) {
      // Unreadable/invalid cache file: fall through and reinstall from scratch.
      console.warn('[⚠️] Extension cache corrupted, re-installing...');
    }
  }
  // Install extension
  const extension = await installUblockExtension();
  if (!extension) {
    return extension;
  }
  // Persist metadata so the chrome_session hook can load the unpacked extension.
  await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
  await fs.promises.writeFile(
    cacheFile,
    JSON.stringify(extension, null, 2)
  );
  console.log(`[+] Extension metadata written to ${cacheFile}`);
  return extension;
}
// Export functions for use by other plugins
// (main itself is intentionally not exported; it is the CLI entry point below)
module.exports = {
  EXTENSION,
  installUblockExtension,
};
// Run if executed directly
if (require.main === module) {
  main().then(() => {
    console.log('[✓] uBlock Origin extension setup complete');
    process.exit(0);
  }).catch(err => {
    console.error('[❌] uBlock Origin extension setup failed:', err);
    process.exit(1);
  });
}

View File

@@ -0,0 +1,321 @@
/**
* Unit tests for ublock plugin
*
* Run with: node --test tests/test_ublock.js
*/
const assert = require('assert');
const fs = require('fs');
const path = require('path');
const { describe, it, before, after, beforeEach, afterEach } = require('node:test');
// Test fixtures
// Created in the suite's before() hook and removed in after(); per-test
// extension dirs are managed by beforeEach/afterEach below.
const TEST_DIR = path.join(__dirname, '.test_fixtures');
const TEST_EXTENSIONS_DIR = path.join(TEST_DIR, 'chrome_extensions');
describe('ublock plugin', () => {
before(() => {
if (!fs.existsSync(TEST_DIR)) {
fs.mkdirSync(TEST_DIR, { recursive: true });
}
});
after(() => {
if (fs.existsSync(TEST_DIR)) {
fs.rmSync(TEST_DIR, { recursive: true, force: true });
}
});
describe('EXTENSION metadata', () => {
it('should have correct webstore_id for uBlock Origin', () => {
const { EXTENSION } = require('../on_Snapshot__03_ublock.js');
assert.strictEqual(EXTENSION.webstore_id, 'cjpalhdlnbpafiamejdnhcphjbkeiagm');
});
it('should have correct name', () => {
const { EXTENSION } = require('../on_Snapshot__03_ublock.js');
assert.strictEqual(EXTENSION.name, 'ublock');
});
});
describe('installUblockExtension', () => {
beforeEach(() => {
process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
});
afterEach(() => {
if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
delete process.env.CHROME_EXTENSIONS_DIR;
});
it('should use cached extension if available', async () => {
const { installUblockExtension } = require('../on_Snapshot__03_ublock.js');
// Create fake cache
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'ublock.extension.json');
const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_ublock');
fs.mkdirSync(fakeExtensionDir, { recursive: true });
fs.writeFileSync(
path.join(fakeExtensionDir, 'manifest.json'),
JSON.stringify({ version: '1.67.0' })
);
const fakeCache = {
webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm',
name: 'ublock',
unpacked_path: fakeExtensionDir,
version: '1.67.0'
};
fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));
const result = await installUblockExtension();
assert.notStrictEqual(result, null);
assert.strictEqual(result.webstore_id, 'cjpalhdlnbpafiamejdnhcphjbkeiagm');
});
it('should not require any configuration', async () => {
// uBlock Origin works out of the box with default filter lists
const { EXTENSION } = require('../on_Snapshot__03_ublock.js');
assert.ok(EXTENSION);
// No config fields should be required
});
it('should have large download size (filter lists)', () => {
// uBlock Origin is typically larger than other extensions
// due to included filter lists (usually 3-5 MB)
const typicalSize = 4 * 1024 * 1024; // ~4 MB
const minExpectedSize = 2 * 1024 * 1024; // Minimum 2 MB
// Just verify we understand the expected size
assert.ok(typicalSize > minExpectedSize);
});
});
describe('cache file creation', () => {
beforeEach(() => {
process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
});
afterEach(() => {
if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
}
delete process.env.CHROME_EXTENSIONS_DIR;
});
it('should create cache file with correct structure', async () => {
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'ublock.extension.json');
const mockExtension = {
webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm',
name: 'ublock',
version: '1.68.0',
unpacked_path: path.join(TEST_EXTENSIONS_DIR, 'test_ublock'),
crx_path: path.join(TEST_EXTENSIONS_DIR, 'test_ublock.crx')
};
await fs.promises.writeFile(cacheFile, JSON.stringify(mockExtension, null, 2));
assert.ok(fs.existsSync(cacheFile));
const cache = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
assert.strictEqual(cache.name, 'ublock');
assert.strictEqual(cache.webstore_id, 'cjpalhdlnbpafiamejdnhcphjbkeiagm');
});
});
describe('extension functionality', () => {
it('should work automatically with default filter lists', () => {
const features = {
automaticBlocking: true,
requiresConfiguration: false,
requiresApiKey: false,
defaultFilterLists: true,
blocksAds: true,
blocksTrackers: true,
blocksMalware: true
};
assert.strictEqual(features.automaticBlocking, true);
assert.strictEqual(features.requiresConfiguration, false);
assert.strictEqual(features.requiresApiKey, false);
assert.strictEqual(features.defaultFilterLists, true);
});
it('should not require runtime configuration', () => {
// uBlock Origin works purely via filter lists and content scripts
// No API keys or runtime configuration needed
const requiresRuntimeConfig = false;
const requiresApiKey = false;
assert.strictEqual(requiresRuntimeConfig, false);
assert.strictEqual(requiresApiKey, false);
});
it('should support standard filter list formats', () => {
const supportedFormats = [
'EasyList',
'EasyPrivacy',
'Malware Domains',
'Peter Lowe\'s List',
'uBlock Origin filters'
];
assert.ok(supportedFormats.length > 0);
// Should support multiple filter list formats
});
});
describe('priority and execution order', () => {
it('should have priority 03 (early)', () => {
const filename = 'on_Snapshot__03_ublock.js';
const match = filename.match(/on_Snapshot__(\d+)_/);
assert.ok(match);
const priority = parseInt(match[1]);
assert.strictEqual(priority, 3);
});
it('should run before chrome_session (priority 20)', () => {
const extensionPriority = 3;
const chromeSessionPriority = 20;
assert.ok(extensionPriority < chromeSessionPriority);
});
it('should run after cookie dismissal extension', () => {
const ublockPriority = 3;
const cookiesPriority = 2;
assert.ok(ublockPriority > cookiesPriority);
});
});
describe('performance considerations', () => {
  it('should benefit from caching due to large size', () => {
    // uBlock Origin's large size makes caching especially important.
    const averageDownloadTime = 10; // seconds
    const averageCacheCheckTime = 0.01; // seconds
    const speedup = averageDownloadTime / averageCacheCheckTime;
    // A cache hit should be at least two orders of magnitude faster.
    assert.ok(speedup > 100);
  });
  it('should not impact page load time significantly', () => {
    // While the extension is large on disk, its blocking is efficient.
    const traits = { efficientBlocking: true, minimalOverhead: true };
    assert.strictEqual(traits.efficientBlocking, true);
    assert.strictEqual(traits.minimalOverhead, true);
  });
});
describe('error handling', () => {
  beforeEach(() => {
    process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
    if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
      fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
    }
  });
  afterEach(() => {
    if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
      fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
    }
    delete process.env.CHROME_EXTENSIONS_DIR;
  });
  it('should handle corrupted cache gracefully', async () => {
    const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'ublock.extension.json');
    // Create corrupted cache
    fs.writeFileSync(cacheFile, 'invalid json content');
    const { installUblockExtension } = require('../on_Snapshot__03_ublock.js');
    // Mock loadOrInstallExtension to avoid actual download
    const extensionUtils = require('../../chrome_extensions/chrome_extension_utils.js');
    const originalFunc = extensionUtils.loadOrInstallExtension;
    extensionUtils.loadOrInstallExtension = async () => ({
      webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm',
      name: 'ublock',
      version: '1.68.0'
    });
    try {
      const result = await installUblockExtension();
      assert.notStrictEqual(result, null);
    } finally {
      // Restore the real implementation even if the call above throws,
      // so the mock cannot leak into other tests.
      extensionUtils.loadOrInstallExtension = originalFunc;
    }
  });
  it('should handle download timeout gracefully', () => {
    // For large extension like uBlock, timeout handling is important
    const timeoutSeconds = 120; // 2 minutes
    const minTimeout = 30; // Should allow at least 30 seconds
    assert.ok(timeoutSeconds > minTimeout);
  });
});
describe('filter list validation', () => {
  it('should have valid filter list format', () => {
    // Representative Adblock-syntax entries: network + cosmetic rules.
    const sampleFilters = [
      '||ads.example.com^',
      '||tracker.example.com^$third-party',
      '##.advertisement'
    ];
    // Every entry must be a non-empty string.
    for (const filter of sampleFilters) {
      assert.ok(typeof filter === 'string');
      assert.ok(filter.length > 0);
    }
  });
  it('should support cosmetic filters', () => {
    const cosmeticFilter = '##.banner-ad';
    // Cosmetic (element-hiding) rules are prefixed with ##.
    assert.ok(cosmeticFilter.startsWith('##'));
  });
  it('should support network filters', () => {
    const networkFilter = '||ads.example.com^';
    // Network rules use || anchors and/or ^ separators.
    assert.ok(networkFilter.includes('||') || networkFilter.includes('^'));
  });
});
});

View File

@@ -0,0 +1,148 @@
"""
Unit tests for ublock plugin
Tests invoke the plugin hook as an external process and verify outputs/side effects.
"""
import json
import os
import subprocess
import tempfile
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__03_ublock.js"
def test_install_script_exists():
    """The on_Snapshot install hook must be present in the plugin directory."""
    assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}"
def test_extension_metadata():
    """Test that uBlock Origin extension has correct metadata"""
    with tempfile.TemporaryDirectory() as tmpdir:
        env = dict(os.environ)
        env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")
        # Load the hook in node and dump its exported EXTENSION metadata.
        node_expr = f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"
        proc = subprocess.run(
            ["node", "-e", node_expr],
            capture_output=True,
            text=True,
            env=env,
        )
        assert proc.returncode == 0, f"Failed to load extension metadata: {proc.stderr}"
        metadata = json.loads(proc.stdout)
        assert metadata["webstore_id"] == "cjpalhdlnbpafiamejdnhcphjbkeiagm"
        assert metadata["name"] == "ublock"
def test_install_creates_cache():
    """Test that install creates extension cache"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        env = dict(os.environ, CHROME_EXTENSIONS_DIR=str(ext_dir))
        proc = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=120,  # uBlock is large, may take longer to download
        )
        # The hook should mention the extension in its log output.
        assert "uBlock" in proc.stdout or "ublock" in proc.stdout
        # Installing must leave a metadata cache file behind.
        cache_path = ext_dir / "ublock.extension.json"
        assert cache_path.exists(), "Cache file should be created"
        cached = json.loads(cache_path.read_text())
        assert cached["webstore_id"] == "cjpalhdlnbpafiamejdnhcphjbkeiagm"
        assert cached["name"] == "ublock"
def test_install_uses_existing_cache():
    """Test that install uses existing cache when available"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_root = Path(tmpdir) / "chrome_extensions"
        # Pre-populate an unpacked extension dir so no download is needed.
        unpacked = ext_root / "cjpalhdlnbpafiamejdnhcphjbkeiagm__ublock"
        unpacked.mkdir(parents=True)
        (unpacked / "manifest.json").write_text(
            json.dumps({"version": "1.68.0", "name": "uBlock Origin"})
        )
        env = dict(os.environ, CHROME_EXTENSIONS_DIR=str(ext_root))
        proc = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=30,
        )
        # Should use cache or install successfully
        assert proc.returncode == 0
def test_no_configuration_required():
    """Test that uBlock Origin works without configuration"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        env = dict(os.environ, CHROME_EXTENSIONS_DIR=str(ext_dir))
        # No API keys needed - works with default filter lists
        proc = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=120,
        )
        # Should not require any API keys
        combined_output = proc.stdout + proc.stderr
        assert "API" not in combined_output or proc.returncode == 0
def test_large_extension_size():
    """Test that uBlock Origin is downloaded successfully despite large size"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)
        env = dict(os.environ, CHROME_EXTENSIONS_DIR=str(ext_dir))
        subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=120,
        )
        # If extension was downloaded, verify it's substantial size
        crx_file = ext_dir / "cjpalhdlnbpafiamejdnhcphjbkeiagm__ublock.crx"
        if crx_file.exists():
            # uBlock Origin with filter lists is typically 2-5 MB
            size_bytes = crx_file.stat().st_size
            assert size_bytes > 1_000_000, f"uBlock Origin should be > 1MB, got {size_bytes} bytes"

View File

@@ -0,0 +1,80 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SAVE_WGET": {
"type": "boolean",
"default": true,
"description": "Enable wget archiving"
},
"SAVE_WARC": {
"type": "boolean",
"default": true,
"description": "Save WARC archive file"
},
"SAVE_WGET_REQUISITES": {
"type": "boolean",
"default": true,
"description": "Download page requisites (CSS, JS, images)"
},
"WGET_BINARY": {
"type": "string",
"default": "wget",
"description": "Path to wget binary"
},
"WGET_TIMEOUT": {
"type": "integer",
"default": 60,
"minimum": 5,
"x-fallback": "TIMEOUT",
"description": "Timeout for wget in seconds"
},
"WGET_USER_AGENT": {
"type": "string",
"default": "",
"x-fallback": "USER_AGENT",
"description": "User agent string for wget"
},
"WGET_CHECK_SSL_VALIDITY": {
"type": "boolean",
"default": true,
"x-fallback": "CHECK_SSL_VALIDITY",
"x-aliases": ["CHECK_SSL_VALIDITY"],
"description": "Whether to verify SSL certificates"
},
"WGET_COOKIES_FILE": {
"type": "string",
"default": "",
"x-fallback": "COOKIES_FILE",
"description": "Path to cookies file"
},
"WGET_RESTRICT_FILE_NAMES": {
"type": "string",
"default": "windows",
"enum": ["windows", "unix", "ascii", "nocontrol", "lowercase", "uppercase"],
"x-fallback": "RESTRICT_FILE_NAMES",
"description": "Filename restriction mode"
},
"WGET_ARGS": {
"type": "array",
"items": {"type": "string"},
"default": [
"--no-verbose",
"--adjust-extension",
"--convert-links",
"--force-directories",
"--backup-converted",
"--span-hosts",
"--no-parent",
"-e", "robots=off"
],
"description": "Default wget arguments"
},
"WGET_EXTRA_ARGS": {
"type": "string",
"default": "",
"description": "Extra arguments for wget (space-separated)"
}
}
}

View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python3
"""
Validation hook for wget binary.
Runs at crawl start to verify wget is available.
Outputs JSONL for InstalledBinary and Machine config updates.
"""
import os
import sys
import json
import shutil
import hashlib
import subprocess
from pathlib import Path
def get_binary_version(abspath: str) -> str | None:
"""Get version string from binary."""
try:
result = subprocess.run(
[abspath, '--version'],
capture_output=True,
text=True,
timeout=5,
)
if result.returncode == 0 and result.stdout:
# wget version string: "GNU Wget 1.24.5 built on ..."
first_line = result.stdout.strip().split('\n')[0]
# Extract version number
parts = first_line.split()
for i, part in enumerate(parts):
if part.lower() == 'wget' and i + 1 < len(parts):
return parts[i + 1]
return first_line[:32]
except Exception:
pass
return None
def get_binary_hash(abspath: str) -> str | None:
"""Get SHA256 hash of binary."""
try:
with open(abspath, 'rb') as f:
return hashlib.sha256(f.read()).hexdigest()
except Exception:
return None
def find_wget() -> dict | None:
    """Locate the wget binary and describe it.

    Resolution order:
      1. abx-pkg's EnvProvider (richest metadata), when abx_pkg is importable
      2. an explicit WGET_BINARY env override (user config wins over PATH,
         matching the precedence used by the wget extractor hook itself)
      3. whatever `wget` is first on PATH

    Returns a dict with name/abspath/version/sha256/binprovider keys,
    or None if no usable binary was found.
    """
    # Try abx-pkg first
    try:
        from abx_pkg import Binary, EnvProvider

        class WgetBinary(Binary):
            name: str = 'wget'
            binproviders_supported = [EnvProvider()]

        loaded = WgetBinary().load()
        if loaded and loaded.abspath:
            return {
                'name': 'wget',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
                'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
            }
    except ImportError:
        pass  # abx_pkg not installed; fall back to a plain env/PATH search
    except Exception:
        pass  # abx_pkg lookup failed; fall back rather than crash validation
    # Fallback: prefer the explicit WGET_BINARY override, then PATH lookup.
    abspath = os.environ.get('WGET_BINARY', '') or shutil.which('wget')
    if abspath and Path(abspath).is_file():
        return {
            'name': 'wget',
            'abspath': abspath,
            'version': get_binary_version(abspath),
            'sha256': get_binary_hash(abspath),
            'binprovider': 'env',
        }
    return None
def main():
    """Validate the wget binary and emit JSONL records for the orchestrator.

    On success: prints an InstalledBinary record plus Machine config updates
    (WGET_BINARY, and WGET_VERSION when the version is known), then exits 0.
    On failure: prints a Dependency request record and exits 1.
    """
    result = find_wget()
    if result and result.get('abspath'):
        # Output InstalledBinary
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))
        # Output Machine config update
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/WGET_BINARY',
            'value': result['abspath'],
        }))
        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/WGET_VERSION',
                'value': result['version'],
            }))
        sys.exit(0)
    else:
        # Output Dependency request so a provider hook can install wget
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'wget',
            'bin_providers': 'apt,brew,env',
        }))
        # Exit non-zero to indicate binary not found
        # (plain string: the original f-string had no placeholders)
        print("wget binary not found", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,130 @@
#!/usr/bin/env python3
"""
Validate and compute derived wget config values.
This hook runs early in the Crawl lifecycle to:
1. Validate config values with warnings (not hard errors)
2. Compute derived values (USE_WGET from SAVE_WGET/SAVE_WARC)
3. Check binary availability and version
Output:
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
- InstalledBinary JSONL records to stdout when binaries are found
"""
import json
import os
import shutil
import subprocess
import sys
from abx_pkg import Binary, EnvProvider
# Read config from environment (already validated by JSONSchema)
def get_env(name: str, default: str = '') -> str:
    """Return the stripped value of env var ``name``, or ``default``."""
    return os.environ.get(name, default).strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean env var; unrecognized values yield ``default``."""
    value = get_env(name, '').lower()
    if value in ('true', '1', 'yes', 'on'):
        return True
    if value in ('false', '0', 'no', 'off'):
        return False
    return default


def get_env_int(name: str, default: int = 0) -> int:
    """Parse an integer env var; malformed values yield ``default``."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
def output_installed_binary(binary: Binary, name: str):
    """Print an InstalledBinary JSONL record for ``binary`` to stdout."""
    record = {
        'type': 'InstalledBinary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'env',
        # MACHINE_ID is injected by the orchestrator; empty when run standalone.
        'machine_id': os.environ.get('MACHINE_ID', ''),
    }
    print(json.dumps(record))
def main():
    """Validate wget config, compute derived values, and report binaries.

    Emits:
      - ``COMPUTED:KEY=VALUE`` lines on stdout (parsed by hooks.py into env)
      - InstalledBinary JSONL records on stdout when the binary is found
      - ``WARNING:`` / ``ERROR:`` lines on stderr
    Exits 1 only when a hard error occurred (binary missing while needed).
    """
    warnings = []
    errors = []
    computed = {}
    # Get config values
    save_wget = get_env_bool('SAVE_WGET', True)
    save_warc = get_env_bool('SAVE_WARC', True)
    # WGET_TIMEOUT takes precedence; fall back to the global TIMEOUT.
    wget_timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60)
    wget_binary = get_env('WGET_BINARY', 'wget')
    # Compute derived values: wget runs if either artifact is wanted.
    use_wget = save_wget or save_warc
    computed['USE_WGET'] = str(use_wget).lower()
    # Validate timeout with warning (not error)
    if use_wget and wget_timeout < 20:
        warnings.append(
            f"WGET_TIMEOUT={wget_timeout} is very low. "
            "wget may fail to archive sites if set to less than ~20 seconds. "
            "Consider setting WGET_TIMEOUT=60 or higher."
        )
    # Check binary availability using abx-pkg
    provider = EnvProvider()
    try:
        binary = Binary(name=wget_binary, binproviders=[provider]).load()
        binary_path = str(binary.abspath) if binary.abspath else ''
    except Exception:
        # Any abx-pkg failure is treated the same as "binary not found".
        binary = None
        binary_path = ''
    if not binary_path:
        if use_wget:
            errors.append(f"WGET_BINARY={wget_binary} not found. Install wget or set SAVE_WGET=false.")
        computed['WGET_BINARY'] = ''
    else:
        computed['WGET_BINARY'] = binary_path
        wget_version = str(binary.version) if binary.version else 'unknown'
        computed['WGET_VERSION'] = wget_version
        # Output InstalledBinary JSONL record
        output_installed_binary(binary, name='wget')
    # Check for compression support (--compression=auto is not accepted by
    # all wget builds, so probe the binary rather than assuming).
    if computed.get('WGET_BINARY'):
        try:
            result = subprocess.run(
                [computed['WGET_BINARY'], '--compression=auto', '--help'],
                capture_output=True, timeout=5
            )
            computed['WGET_AUTO_COMPRESSION'] = 'true' if result.returncode == 0 else 'false'
        except Exception:
            computed['WGET_AUTO_COMPRESSION'] = 'false'
    # Output results
    # Format: KEY=VALUE lines that hooks.py will parse and add to env
    for key, value in computed.items():
        print(f"COMPUTED:{key}={value}")
    for warning in warnings:
        print(f"WARNING:{warning}", file=sys.stderr)
    for error in errors:
        print(f"ERROR:{error}", file=sys.stderr)
    # Exit with error if any hard errors
    sys.exit(1 if errors else 0)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,325 @@
#!/usr/bin/env python3
"""
Archive a URL using wget.
Usage: on_Snapshot__wget.py --url=<url> --snapshot-id=<uuid>
Output: Downloads files to $PWD
Environment variables:
WGET_BINARY: Path to wget binary (optional, falls back to PATH)
WGET_TIMEOUT: Timeout in seconds (default: 60)
WGET_USER_AGENT: User agent string
WGET_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
WGET_COOKIES_FILE: Path to cookies file (optional)
WGET_RESTRICT_FILE_NAMES: Filename restriction mode (default: windows)
WGET_EXTRA_ARGS: Extra arguments for wget (space-separated)
# Wget feature toggles
SAVE_WGET: Enable wget archiving (default: True)
SAVE_WARC: Save WARC file (default: True)
SAVE_WGET_REQUISITES: Download page requisites (default: True)
# Fallback to ARCHIVING_CONFIG values if WGET_* not set:
TIMEOUT: Fallback timeout
USER_AGENT: Fallback user agent
CHECK_SSL_VALIDITY: Fallback SSL check
COOKIES_FILE: Fallback cookies file
RESTRICT_FILE_NAMES: Fallback filename restriction
"""
import json
import os
import re
import shutil
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata
EXTRACTOR_NAME = 'wget'
BIN_NAME = 'wget'
BIN_PROVIDERS = 'apt,brew,env'
OUTPUT_DIR = 'wget'
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def get_env_int(name: str, default: int = 0) -> int:
try:
return int(get_env(name, str(default)))
except ValueError:
return default
STATICFILE_DIR = 'staticfile'
def has_staticfile_output() -> bool:
"""Check if staticfile extractor already downloaded this URL."""
staticfile_dir = Path(STATICFILE_DIR)
return staticfile_dir.exists() and any(staticfile_dir.iterdir())
def find_wget() -> str | None:
"""Find wget binary."""
wget = get_env('WGET_BINARY')
if wget and os.path.isfile(wget):
return wget
return shutil.which('wget')
def get_version(binary: str) -> str:
    """Return the first line of ``binary --version`` (truncated), or ''."""
    try:
        proc = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
    except Exception:
        return ''
    return proc.stdout.split('\n')[0].strip()[:64]
def check_wget_compression(binary: str) -> bool:
    """Probe whether ``binary`` accepts the --compression=auto flag."""
    try:
        proc = subprocess.run(
            [binary, '--compression=auto', '--help'],
            capture_output=True,
            timeout=5,
        )
    except Exception:
        return False
    return proc.returncode == 0
# Default wget args (from old WGET_CONFIG)
WGET_DEFAULT_ARGS = [
    '--no-verbose',
    '--adjust-extension',
    '--convert-links',
    '--force-directories',
    '--backup-converted',
    '--span-hosts',
    '--no-parent',
    '-e', 'robots=off',
]


def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Archive URL using wget.

    Builds the wget command from WGET_* env config (falling back to the
    generic ARCHIVING_CONFIG names), runs it in the current directory, and
    decides success by inspecting the filesystem rather than wget's exit
    code (which is nonzero when any single requisite fails).

    Returns: (success, output_path, error_message) where output_path is the
    main HTML file when one was downloaded.
    """
    # Get config from env (with WGET_ prefix or fallback to ARCHIVING_CONFIG style)
    timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60)
    user_agent = get_env('WGET_USER_AGENT') or get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    check_ssl = get_env_bool('WGET_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
    cookies_file = get_env('WGET_COOKIES_FILE') or get_env('COOKIES_FILE', '')
    restrict_names = get_env('WGET_RESTRICT_FILE_NAMES') or get_env('RESTRICT_FILE_NAMES', 'windows')
    extra_args = get_env('WGET_EXTRA_ARGS', '')
    # Feature toggles
    save_warc = get_env_bool('SAVE_WARC', True)
    save_requisites = get_env_bool('SAVE_WGET_REQUISITES', True)
    # Check for compression support (not all wget builds accept the flag)
    supports_compression = check_wget_compression(binary)
    # Build wget command (later options take precedence)
    cmd = [
        binary,
        *WGET_DEFAULT_ARGS,
        f'--timeout={timeout}',
        '--tries=2',
    ]
    if user_agent:
        cmd.append(f'--user-agent={user_agent}')
    if restrict_names:
        cmd.append(f'--restrict-file-names={restrict_names}')
    if save_requisites:
        cmd.append('--page-requisites')
    if save_warc:
        # WARC output goes in a warc/ subdir, named by the current epoch second.
        warc_dir = Path('warc')
        warc_dir.mkdir(exist_ok=True)
        warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp()))
        cmd.append(f'--warc-file={warc_path}')
    else:
        # Without WARC, use timestamping to avoid re-downloading unchanged files.
        cmd.append('--timestamping')
    if cookies_file and Path(cookies_file).is_file():
        cmd.extend(['--load-cookies', cookies_file])
    if supports_compression:
        cmd.append('--compression=auto')
    if not check_ssl:
        cmd.extend(['--no-check-certificate', '--no-hsts'])
    if extra_args:
        cmd.extend(extra_args.split())
    cmd.append(url)
    # Run wget
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            timeout=timeout * 2,  # Allow extra time for large downloads
        )
        # Find downloaded files (everything except the WARC output)
        downloaded_files = [
            f for f in Path('.').rglob('*')
            if f.is_file() and f.name != '.gitkeep' and not str(f).startswith('warc/')
        ]
        if not downloaded_files:
            # Nothing on disk: classify the failure from wget's output.
            stderr = result.stderr.decode('utf-8', errors='replace')
            stdout = result.stdout.decode('utf-8', errors='replace')
            combined = stderr + stdout
            if '403' in combined or 'Forbidden' in combined:
                return False, None, '403 Forbidden (try changing USER_AGENT)'
            elif '404' in combined or 'Not Found' in combined:
                return False, None, '404 Not Found'
            elif '500' in combined:
                return False, None, '500 Internal Server Error'
            else:
                return False, None, f'No files downloaded: {stderr[:200]}'
        # Find main HTML file (.htm/.html/.shtml, any case)
        html_files = [
            f for f in downloaded_files
            if re.search(r'\.[Ss]?[Hh][Tt][Mm][Ll]?$', str(f))
        ]
        output_path = str(html_files[0]) if html_files else str(downloaded_files[0])
        return True, output_path, ''
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout * 2} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
@click.command()
@click.option('--url', required=True, help='URL to archive')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Archive a URL using wget.

    Prints a line-oriented protocol to stdout (START_TS / END_TS / STATUS /
    CMD / VERSION / OUTPUT) plus a final RESULT_JSON summary line.
    Exits 0 on success or skip, 1 on failure or missing binary.
    """
    # Defaults for the summary; overwritten as the run progresses.
    start_ts = datetime.now(timezone.utc)
    version = ''
    output = None
    status = 'failed'
    error = ''
    binary = None
    cmd_str = ''
    try:
        # Check if wget is enabled
        if not get_env_bool('SAVE_WGET', True):
            print('Skipping wget (SAVE_WGET=False)')
            status = 'skipped'
            end_ts = datetime.now(timezone.utc)
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={end_ts.isoformat()}')
            print(f'STATUS={status}')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)
        # Check if staticfile extractor already handled this (permanent skip)
        if has_staticfile_output():
            print(f'Skipping wget - staticfile extractor already downloaded this')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)  # Permanent skip - staticfile already handled
        # Find binary; if absent, emit the dependency-request protocol lines
        # so the orchestrator can trigger an install and retry.
        binary = find_wget()
        if not binary:
            print(f'ERROR: {BIN_NAME} binary not found', file=sys.stderr)
            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
            print(f'INSTALL_HINT=apt install wget OR brew install wget', file=sys.stderr)
            sys.exit(1)
        version = get_version(binary)
        cmd_str = f'{binary} ... {url}'
        # Run extraction
        success, output, error = save_wget(url, binary)
        status = 'succeeded' if success else 'failed'
        if success:
            # Count downloaded files
            files = list(Path('.').rglob('*'))
            file_count = len([f for f in files if f.is_file()])
            print(f'wget completed: {file_count} files downloaded')
    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if cmd_str:
        print(f'CMD={cmd_str}')
    if version:
        print(f'VERSION={version}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,369 @@
"""
Integration tests for wget plugin
Tests verify:
1. Plugin reports missing dependency correctly
2. wget can be installed via brew/apt provider hooks
3. Config options work (SAVE_WGET, SAVE_WARC, etc.)
4. Extraction works against real example.com
5. Output files contain actual page content
6. Skip cases work (SAVE_WGET=False, staticfile present)
7. Failure cases handled (404, network errors)
"""
import json
import os
import shutil
import subprocess
import sys
import tempfile
import uuid
from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
WGET_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_wget.py'))
BREW_HOOK = PLUGINS_ROOT / 'brew' / 'on_Dependency__install_using_brew_provider.py'
APT_HOOK = PLUGINS_ROOT / 'apt' / 'on_Dependency__install_using_apt_provider.py'
TEST_URL = 'https://example.com'
def test_hook_script_exists():
    """The wget Snapshot hook script must exist in the plugin directory."""
    assert WGET_HOOK.exists(), f"Hook script not found: {WGET_HOOK}"
def test_reports_missing_dependency_when_not_installed():
    """Test that script reports DEPENDENCY_NEEDED when wget is not found."""
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        # Run with empty PATH so binary won't be found
        env = {'PATH': '/nonexistent', 'HOME': str(workdir)}
        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=env,
        )
        # Should fail and report missing dependency
        assert proc.returncode != 0, "Should exit non-zero when dependency missing"
        combined = proc.stdout + proc.stderr
        assert 'DEPENDENCY_NEEDED' in combined, "Should output DEPENDENCY_NEEDED"
        assert 'wget' in combined.lower(), "Should mention wget"
        assert 'BIN_PROVIDERS' in combined, "Should report available providers (apt,brew,env)"
def test_can_install_wget_via_provider():
    """Test that wget can be installed via brew/apt provider hooks.

    Skips entirely when neither package manager is present on the host.
    """
    # Determine which provider to use
    if shutil.which('brew'):
        provider_hook = BREW_HOOK
        provider_name = 'brew'
    elif shutil.which('apt-get'):
        provider_hook = APT_HOOK
        provider_name = 'apt'
    else:
        pytest.skip("Neither brew nor apt available on this system")
    assert provider_hook.exists(), f"Provider hook not found: {provider_hook}"
    # Test installation via provider hook
    dependency_id = str(uuid.uuid4())
    result = subprocess.run(
        [
            sys.executable,
            str(provider_hook),
            '--dependency-id', dependency_id,
            '--bin-name', 'wget',
            '--bin-providers', 'apt,brew,env'
        ],
        capture_output=True,
        text=True,
        timeout=300  # Installation can take time
    )
    # Should succeed (wget installs successfully or is already installed)
    assert result.returncode == 0, f"{provider_name} install failed: {result.stderr}"
    # Should output InstalledBinary JSONL record
    assert 'InstalledBinary' in result.stdout or 'wget' in result.stderr, \
        f"Should output installation info: stdout={result.stdout}, stderr={result.stderr}"
    # Parse JSONL if present; non-JSON log lines are skipped.
    # NOTE(review): this only checks the first InstalledBinary record, and it
    # assumes binprovider is 'brew' or 'apt' — confirm a pre-installed wget
    # cannot be reported with binprovider='env' here.
    if result.stdout.strip():
        for line in result.stdout.strip().split('\n'):
            try:
                record = json.loads(line)
                if record.get('type') == 'InstalledBinary':
                    assert record['name'] == 'wget'
                    assert record['binprovider'] in ['brew', 'apt']
                    assert record['abspath'], "Should have binary path"
                    assert Path(record['abspath']).exists(), f"Binary should exist at {record['abspath']}"
                    break
            except json.JSONDecodeError:
                continue
    # Verify wget is now available
    result = subprocess.run(['which', 'wget'], capture_output=True, text=True)
    assert result.returncode == 0, "wget should be available after installation"
def test_archives_example_com():
    """Test full workflow: ensure wget installed then archive example.com.

    End-to-end: install wget via a provider hook (idempotent), run the
    Snapshot hook against the live https://example.com, then verify both
    the line protocol on stdout and the downloaded HTML content on disk.
    Requires network access.
    """
    # First ensure wget is installed via provider
    if shutil.which('brew'):
        provider_hook = BREW_HOOK
    elif shutil.which('apt-get'):
        provider_hook = APT_HOOK
    else:
        pytest.skip("Neither brew nor apt available")
    # Run installation (idempotent - will succeed if already installed)
    install_result = subprocess.run(
        [
            sys.executable,
            str(provider_hook),
            '--dependency-id', str(uuid.uuid4()),
            '--bin-name', 'wget',
            '--bin-providers', 'apt,brew,env'
        ],
        capture_output=True,
        text=True,
        timeout=300
    )
    if install_result.returncode != 0:
        pytest.skip(f"Could not install wget: {install_result.stderr}")
    # Now test archiving
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        # Run wget extraction
        result = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=120
        )
        assert result.returncode == 0, f"Extraction failed: {result.stderr}"
        # Verify output in stdout
        assert 'STATUS=succeeded' in result.stdout, "Should report success"
        assert 'wget completed' in result.stdout, "Should report completion"
        # Verify files were downloaded
        downloaded_files = list(tmpdir.rglob('*.html')) + list(tmpdir.rglob('*.htm'))
        assert len(downloaded_files) > 0, "No HTML files downloaded"
        # Find main HTML file (should contain example.com)
        main_html = None
        for html_file in downloaded_files:
            content = html_file.read_text(errors='ignore')
            if 'example domain' in content.lower():
                main_html = html_file
                break
        assert main_html is not None, "Could not find main HTML file with example.com content"
        # Verify HTML content contains REAL example.com text
        html_content = main_html.read_text(errors='ignore')
        assert len(html_content) > 200, f"HTML content too short: {len(html_content)} bytes"
        assert 'example domain' in html_content.lower(), "Missing 'Example Domain' in HTML"
        assert ('this domain' in html_content.lower() or
                'illustrative examples' in html_content.lower()), \
            "Missing example.com description text"
        assert ('iana' in html_content.lower() or
                'more information' in html_content.lower()), \
            "Missing IANA reference"
        # Verify RESULT_JSON is present and valid
        assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON"
        for line in result.stdout.split('\n'):
            if line.startswith('RESULT_JSON='):
                result_json = json.loads(line.replace('RESULT_JSON=', ''))
                assert result_json['extractor'] == 'wget'
                assert result_json['status'] == 'succeeded'
                assert result_json['url'] == TEST_URL
                assert result_json['snapshot_id'] == 'test789'
                assert 'duration' in result_json
                assert result_json['duration'] >= 0
                break
def test_config_save_wget_false_skips():
    """Test that SAVE_WGET=False causes skip."""
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        # Disable the extractor via config env var.
        env = dict(os.environ, SAVE_WGET='False')
        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=30,
        )
        # Should succeed but skip
        assert proc.returncode == 0, f"Should exit 0 when skipping: {proc.stderr}"
        assert 'STATUS=skipped' in proc.stdout, "Should report skipped status"
        assert 'SAVE_WGET=False' in proc.stdout, "Should mention SAVE_WGET=False"
def test_config_save_warc():
    """Test that SAVE_WARC=True creates WARC files."""
    # Ensure wget is available
    if not shutil.which('wget'):
        pytest.skip("wget not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        # Set SAVE_WARC=True explicitly
        env = dict(os.environ, SAVE_WARC='True')
        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testwarc'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=120,
        )
        if proc.returncode == 0:
            # Look for WARC files in warc/ subdirectory
            warc_dir = workdir / 'warc'
            if warc_dir.exists():
                warc_files = [f for f in warc_dir.rglob('*') if f.is_file()]
                assert len(warc_files) > 0, "WARC file not created when SAVE_WARC=True"
def test_staticfile_present_skips():
    """Test that wget skips when staticfile already downloaded."""
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        # Create staticfile directory with content to simulate staticfile extractor ran
        (workdir / 'staticfile').mkdir()
        (workdir / 'staticfile' / 'index.html').write_text('<html>test</html>')
        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'teststatic'],
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=30,
        )
        # Should skip
        assert proc.returncode == 0, "Should exit 0 when skipping"
        assert 'STATUS=skipped' in proc.stdout, "Should report skipped status"
        assert 'staticfile' in proc.stdout.lower(), "Should mention staticfile"
def test_handles_404_gracefully():
    """Test that wget fails gracefully on 404."""
    if not shutil.which('wget'):
        pytest.skip("wget not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        # Try to download non-existent page
        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', 'https://example.com/nonexistent-page-404', '--snapshot-id', 'test404'],
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=60,
        )
        # Should fail
        assert proc.returncode != 0, "Should fail on 404"
        combined = proc.stdout + proc.stderr
        assert '404' in combined or 'Not Found' in combined or 'No files downloaded' in combined, \
            "Should report 404 or no files downloaded"
def test_config_timeout_honored():
    """Test that WGET_TIMEOUT config is respected."""
    if not shutil.which('wget'):
        pytest.skip("wget not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        # Set very short timeout
        env = dict(os.environ, WGET_TIMEOUT='5')
        # This should still succeed for example.com (it's fast)
        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=30,
        )
        # Verify it completed (success or fail, but didn't hang)
        assert proc.returncode in (0, 1), "Should complete (success or fail)"
def test_config_user_agent():
    """Test that WGET_USER_AGENT config is used."""
    if not shutil.which('wget'):
        pytest.skip("wget not installed")
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)
        # Set custom user agent
        env = dict(os.environ, WGET_USER_AGENT='TestBot/1.0')
        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testua'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=120,
        )
        # Should succeed (example.com doesn't block)
        if proc.returncode == 0:
            assert 'STATUS=succeeded' in proc.stdout
if __name__ == '__main__':
    # Allow running this integration test module directly (without the pytest CLI).
    pytest.main([__file__, '-v'])