mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-04 14:57:56 +10:00
wip major changes
This commit is contained in:
266
archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js
Executable file
266
archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js
Executable file
@@ -0,0 +1,266 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Extract accessibility tree and page outline from a URL.
|
||||
*
|
||||
* Extracts:
|
||||
* - Page outline (headings h1-h6, sections, articles)
|
||||
* - Iframe tree
|
||||
* - Accessibility snapshot
|
||||
* - ARIA labels and roles
|
||||
*
|
||||
 * Usage: on_Snapshot__39_accessibility.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes accessibility/accessibility.json
|
||||
*
|
||||
* Environment variables:
|
||||
* SAVE_ACCESSIBILITY: Enable accessibility extraction (default: true)
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'accessibility';
|
||||
const OUTPUT_DIR = 'accessibility';
|
||||
const OUTPUT_FILE = 'accessibility.json';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
// Parse `--key=value` style command line arguments into a plain object.
// Dashes in key names become underscores (`--snapshot-id` -> `snapshot_id`);
// a bare flag (`--force`) or an empty value (`--force=`) yields `true`.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const body = token.slice(2);
    const eqIdx = body.indexOf('=');
    const rawKey = eqIdx === -1 ? body : body.slice(0, eqIdx);
    const rawValue = eqIdx === -1 ? '' : body.slice(eqIdx + 1);
    parsed[rawKey.replace(/-/g, '_')] = rawValue || true;
  }
  return parsed;
}
|
||||
|
||||
// Get environment variable with default
|
||||
// Read an environment variable, substituting defaultValue when the variable
// is unset or empty; the returned string is always whitespace-trimmed.
function getEnv(name, defaultValue = '') {
  const raw = process.env[name];
  const chosen = raw ? raw : defaultValue;
  return chosen.trim();
}
|
||||
|
||||
// Interpret an environment variable as a boolean. Accepts the usual truthy
// (true/1/yes/on) and falsy (false/0/no/off) spellings, case-insensitively
// and whitespace-trimmed; anything else yields defaultValue.
function getEnvBool(name, defaultValue = false) {
  const normalized = (process.env[name] || '').trim().toLowerCase();
  switch (normalized) {
    case 'true':
    case '1':
    case 'yes':
    case 'on':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
|
||||
|
||||
// Get CDP URL from chrome_session
|
||||
// Locate the Chrome DevTools Protocol endpoint advertised by the
// chrome_session extractor. Returns the trimmed ws:// URL from
// chrome_session/cdp_url.txt, or null when no session file exists.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(cdpFile)) {
    return null;
  }
  return fs.readFileSync(cdpFile, 'utf8').trim();
}
|
||||
|
||||
// Extract accessibility info
|
||||
/**
 * Extract the accessibility tree, page outline, and iframe tree for `url`.
 *
 * Connects to the already-running Chrome instance advertised by the
 * chrome_session extractor (via chrome_session/cdp_url.txt); it performs no
 * navigation of its own — it inspects whatever page is currently loaded.
 *
 * @param {string} url - URL being archived (recorded in the output JSON only).
 * @returns {Promise<{success: boolean, output?: string, accessibilityData?: object, error?: string}>}
 *   On success, `output` is the path of the written accessibility.json file;
 *   on any failure, `success: false` plus a human-readable `error`.
 */
async function extractAccessibility(url) {
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

  let browser = null;

  try {
    // Connect to existing Chrome session
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
    }

    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });

    // Get the page: prefer the first tab showing an http(s) URL, else any tab
    const pages = await browser.pages();
    const page = pages.find(p => p.url().startsWith('http')) || pages[0];

    if (!page) {
      return { success: false, error: 'No page found in Chrome session' };
    }

    // Get accessibility snapshot (interestingOnly prunes nodes that carry no
    // useful accessible role/name — see Puppeteer Accessibility.snapshot())
    const accessibilityTree = await page.accessibility.snapshot({ interestingOnly: true });

    // Extract page outline (headings, sections, etc.) — this callback runs
    // inside the page context, so it can only use in-page DOM APIs
    const outline = await page.evaluate(() => {
      const headings = [];
      const elements = document.querySelectorAll(
        'h1, h2, h3, h4, h5, h6, a[name], header, footer, article, main, aside, nav, section, figure, summary, table, form, iframe'
      );

      elements.forEach(elem => {
        // Skip unnamed anchors
        if (elem.tagName.toLowerCase() === 'a' && !elem.name) return;

        const tagName = elem.tagName.toLowerCase();
        // Best available identifier: id > name > aria-label > role
        const elemId = elem.id || elem.name || elem.getAttribute('aria-label') || elem.role || '';
        // First three CSS classes, rendered selector-style (".a .b .c")
        const elemClasses = (elem.className || '').toString().trim().split(/\s+/).slice(0, 3).join(' .');
        // For <form> elements: last path segment of the action URL
        const action = elem.action?.split('/').pop() || '';

        // NOTE(review): text of exactly 128 chars also gets '...' appended
        // even though nothing was truncated — confirm whether intentional.
        let summary = (elem.innerText || '').slice(0, 128);
        if (summary.length >= 128) summary += '...';

        let prefix = '';
        let title = '';

        // Format headings with # prefix (markdown style: h2 -> "##")
        const level = parseInt(tagName.replace('h', ''));
        if (!isNaN(level)) {
          prefix = '#'.repeat(level);
          title = elem.innerText || elemId || elemClasses;
        } else {
          // For other elements, create breadcrumb path of up to 5 ancestors;
          // generic wrappers (div/span/p/body/html) are blanked but still
          // contribute a '>' separator so depth is preserved
          const parents = [tagName];
          let node = elem.parentNode;
          while (node && parents.length < 5) {
            if (node.tagName) {
              const tag = node.tagName.toLowerCase();
              if (!['div', 'span', 'p', 'body', 'html'].includes(tag)) {
                parents.unshift(tag);
              } else {
                parents.unshift('');
              }
            }
            node = node.parentNode;
          }
          prefix = parents.join('>');

          // Title preference: #id, else .classes; append form action and a
          // text summary when they add information
          title = elemId ? `#${elemId}` : '';
          if (!title && elemClasses) title = `.${elemClasses}`;
          if (action) title += ` /${action}`;
          if (summary && !title.includes(summary)) title += `: ${summary}`;
        }

        // Clean up title: collapse whitespace runs to single spaces
        title = title.replace(/\s+/g, ' ').trim();

        if (prefix) {
          headings.push(`${prefix} ${title}`);
        }
      });

      return headings;
    });

    // Get iframe tree as '>'-indented lines, one per frame, depth-first
    // (main frame has no indent, each nesting level adds one '>')
    const iframes = [];
    function dumpFrameTree(frame, indent = '>') {
      iframes.push(indent + frame.url());
      for (const child of frame.childFrames()) {
        dumpFrameTree(child, indent + '>');
      }
    }
    dumpFrameTree(page.mainFrame(), '');

    const accessibilityData = {
      url,
      headings: outline,
      iframes,
      tree: accessibilityTree,
    };

    // Write output
    fs.writeFileSync(outputPath, JSON.stringify(accessibilityData, null, 2));

    return { success: true, output: outputPath, accessibilityData };

  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Disconnect (not close) so the shared Chrome session stays alive for
    // later extractors that reuse it.
    if (browser) {
      browser.disconnect();
    }
  }
}
|
||||
|
||||
/**
 * CLI entry point: parse args, run the accessibility extraction, and emit the
 * orchestrator-facing result protocol (START_TS/END_TS/DURATION/OUTPUT/STATUS
 * key=value lines plus a final RESULT_JSON line) on stdout.
 *
 * Exits 0 on success or skip, 1 on failure or bad usage.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;

  if (!url || !snapshotId) {
    // NOTE(review): usage string says __18_ but the plugin file is named
    // on_Snapshot__39_accessibility.js — confirm and update the message.
    console.error('Usage: on_Snapshot__18_accessibility.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }

  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';

  try {
    // Check if enabled (SAVE_ACCESSIBILITY defaults to on)
    if (!getEnvBool('SAVE_ACCESSIBILITY', true)) {
      console.log('Skipping accessibility (SAVE_ACCESSIBILITY=False)');
      status = 'skipped';
      // Skip path emits an abbreviated result (no DURATION/OUTPUT) and
      // exits 0 so the orchestrator treats it as a non-failure.
      const endTs = new Date();
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${endTs.toISOString()}`);
      console.log(`STATUS=${status}`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
      process.exit(0);
    }

    const result = await extractAccessibility(url);

    if (result.success) {
      status = 'succeeded';
      output = result.output;
      const headingCount = result.accessibilityData.headings.length;
      const iframeCount = result.accessibilityData.iframes.length;
      console.log(`Accessibility extracted: ${headingCount} headings, ${iframeCount} iframes`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    // extractAccessibility already catches its own errors; this guards
    // anything unexpected (e.g. a bug in result handling)
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }

  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;  // Date subtraction yields ms

  // Print results (key=value protocol lines consumed by the orchestrator)
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);

  if (error) {
    console.error(`ERROR=${error}`);
  }

  // Print JSON result (machine-readable summary, one line)
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);

  process.exit(status === 'succeeded' ? 0 : 1);
}
|
||||
|
||||
// Top-level runner: convert any unhandled rejection from main() into a
// logged fatal error and a non-zero exit so the orchestrator records it.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});
|
||||
@@ -0,0 +1,77 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install a binary using apt package manager.
|
||||
|
||||
Usage: on_Dependency__install_using_apt_provider.py --dependency-id=<uuid> --bin-name=<name> [--custom-cmd=<cmd>]
|
||||
Output: InstalledBinary JSONL record to stdout after installation
|
||||
|
||||
Environment variables:
|
||||
MACHINE_ID: Machine UUID (set by orchestrator)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
import rich_click as click
|
||||
from abx_pkg import Binary, AptProvider, BinProviderOverrides
|
||||
|
||||
# Fix pydantic forward reference issue
|
||||
AptProvider.model_rebuild()
|
||||
|
||||
|
||||
@click.command()
@click.option('--dependency-id', required=True, help="Dependency UUID")
@click.option('--bin-name', required=True, help="Binary name to install")
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', default=None, help="Custom install command (overrides default)")
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None):
    """Install binary using apt package manager.

    Exit codes: 0 = installed OK (or apt not in the allowed provider list,
    treated as a skip rather than a failure); 1 = apt unavailable or the
    install failed. Emits exactly one InstalledBinary JSONL record on stdout;
    all human-readable logging goes to stderr.
    """
    # NOTE(review): custom_cmd is accepted but never used in this function --
    # confirm whether it should be forwarded to AptProvider (the unused
    # BinProviderOverrides import suggests that was the plan) or removed.

    # Check if apt provider is allowed; if not, exit 0 so another provider
    # hook can handle this dependency
    if bin_providers != '*' and 'apt' not in bin_providers.split(','):
        click.echo(f"apt provider not allowed for {bin_name}", err=True)
        sys.exit(0)  # Not an error, just skip

    # Use abx-pkg AptProvider to install binary
    provider = AptProvider()
    if not provider.INSTALLER_BIN:
        click.echo("apt not available on this system", err=True)
        sys.exit(1)

    click.echo(f"Installing {bin_name} via apt...", err=True)

    try:
        binary = Binary(name=bin_name, binproviders=[provider]).install()
    except Exception as e:
        click.echo(f"apt install failed: {e}", err=True)
        sys.exit(1)

    # install() returning without an abspath means the binary still isn't
    # resolvable on PATH -- treat that as a failure too
    if not binary.abspath:
        click.echo(f"{bin_name} not found after apt install", err=True)
        sys.exit(1)

    machine_id = os.environ.get('MACHINE_ID', '')

    # Output InstalledBinary JSONL record to stdout (stdout is reserved for
    # machine-readable records consumed by the orchestrator)
    record = {
        'type': 'InstalledBinary',
        'name': bin_name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'apt',
        'machine_id': machine_id,
        'dependency_id': dependency_id,
    }
    print(json.dumps(record))

    # Log human-readable info to stderr
    click.echo(f"Installed {bin_name} at {binary.abspath}", err=True)
    click.echo(f" version: {binary.version}", err=True)

    sys.exit(0)


if __name__ == '__main__':
    main()
|
||||
26
archivebox/plugins/archive_org/config.json
Normal file
26
archivebox/plugins/archive_org/config.json
Normal file
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SAVE_ARCHIVE_DOT_ORG": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SUBMIT_ARCHIVE_DOT_ORG"],
|
||||
"description": "Submit URLs to archive.org Wayback Machine"
|
||||
},
|
||||
"ARCHIVE_ORG_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 60,
|
||||
"minimum": 10,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for archive.org submission in seconds"
|
||||
},
|
||||
"ARCHIVE_ORG_USER_AGENT": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-fallback": "USER_AGENT",
|
||||
"description": "User agent string"
|
||||
}
|
||||
}
|
||||
}
|
||||
156
archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py
Normal file
156
archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py
Normal file
@@ -0,0 +1,156 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Submit a URL to archive.org for archiving.
|
||||
|
||||
Usage: on_Snapshot__13_archive_org.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Writes archive.org.txt to $PWD with the archived URL
|
||||
|
||||
Environment variables:
|
||||
TIMEOUT: Timeout in seconds (default: 60)
|
||||
USER_AGENT: User agent string
|
||||
|
||||
Note: This extractor uses the 'requests' library which is bundled with ArchiveBox.
|
||||
It can run standalone if requests is installed: pip install requests
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'archive_org'
|
||||
OUTPUT_DIR = 'archive_org'
|
||||
OUTPUT_FILE = 'archive.org.txt'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
    """Return env var *name* with surrounding whitespace stripped, or *default* if unset."""
    value = os.environ.get(name, default)
    return value.strip()
|
||||
|
||||
|
||||
def get_env_int(name: str, default: int = 0) -> int:
    """Parse env var *name* as an int, returning *default* when unset or unparseable."""
    # get_env() inlined: read, fall back to the stringified default, strip.
    raw = os.environ.get(name, str(default)).strip()
    try:
        return int(raw)
    except ValueError:
        return default
|
||||
|
||||
|
||||
def submit_to_archive_org(url: str) -> tuple[bool, str | None, str]:
    """
    Submit URL to the archive.org Wayback Machine via https://web.archive.org/save/.

    Writes the resulting archive URL (or, as a fallback, the save-request URL
    itself) to OUTPUT_FILE in the current working directory.

    Returns: (success, output_path, error_message)
    """
    # Imported lazily so the extractor reports a clean structured error
    # instead of crashing at import time when requests isn't installed.
    try:
        import requests
    except ImportError:
        return False, None, 'requests library not installed'

    timeout = get_env_int('TIMEOUT', 60)
    user_agent = get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')

    submit_url = f'https://web.archive.org/save/{url}'

    try:
        response = requests.get(
            submit_url,
            timeout=timeout,
            headers={'User-Agent': user_agent},
            allow_redirects=True,
        )

        # The Wayback Machine reports the archived copy's path in the
        # Content-Location response header.
        # (Fix: removed the unused x_archive_orig_url header read.)
        content_location = response.headers.get('Content-Location', '')

        # Build archive URL
        if content_location:
            archive_url = f'https://web.archive.org{content_location}'
            Path(OUTPUT_FILE).write_text(archive_url, encoding='utf-8')
            return True, OUTPUT_FILE, ''
        elif 'web.archive.org' in response.url:
            # We were redirected to an archive page
            Path(OUTPUT_FILE).write_text(response.url, encoding='utf-8')
            return True, OUTPUT_FILE, ''
        else:
            # Check for errors in response
            if 'RobotAccessControlException' in response.text:
                # Blocked by robots.txt - save submit URL for manual retry
                Path(OUTPUT_FILE).write_text(submit_url, encoding='utf-8')
                return True, OUTPUT_FILE, ''  # Consider this a soft success
            elif response.status_code >= 400:
                return False, None, f'HTTP {response.status_code}'
            else:
                # Save submit URL anyway
                Path(OUTPUT_FILE).write_text(submit_url, encoding='utf-8')
                return True, OUTPUT_FILE, ''

    except requests.Timeout:
        return False, None, f'Request timed out after {timeout} seconds'
    except requests.RequestException as e:
        return False, None, f'{type(e).__name__}: {e}'
    except Exception as e:
        # Last-resort guard so the extractor never raises to the caller
        return False, None, f'{type(e).__name__}: {e}'
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL to submit to archive.org')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Submit a URL to archive.org for archiving.

    Emits the orchestrator protocol on stdout (START_TS/END_TS/DURATION/
    OUTPUT/STATUS key=value lines plus one RESULT_JSON line) and exits 0 on
    success, 1 on failure. ERROR lines go to stderr.
    """

    start_ts = datetime.now(timezone.utc)
    output = None
    status = 'failed'
    error = ''

    try:
        # Run extraction
        success, output, error = submit_to_archive_org(url)
        status = 'succeeded' if success else 'failed'

        if success:
            # Echo the archived URL recorded in the output file
            archive_url = Path(output).read_text().strip()
            print(f'Archived at: {archive_url}')

    except Exception as e:
        # submit_to_archive_org() normally catches its own errors; this
        # guards the result handling above
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()

    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')

    if error:
        print(f'ERROR={error}', file=sys.stderr)

    # Print JSON result (machine-readable summary, one line)
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')

    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()
|
||||
@@ -0,0 +1,76 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install a binary using Homebrew package manager.
|
||||
|
||||
Usage: on_Dependency__install_using_brew_provider.py --dependency-id=<uuid> --bin-name=<name> [--custom-cmd=<cmd>]
|
||||
Output: InstalledBinary JSONL record to stdout after installation
|
||||
|
||||
Environment variables:
|
||||
MACHINE_ID: Machine UUID (set by orchestrator)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
import rich_click as click
|
||||
from abx_pkg import Binary, BrewProvider, BinProviderOverrides
|
||||
|
||||
# Fix pydantic forward reference issue
|
||||
BrewProvider.model_rebuild()
|
||||
|
||||
|
||||
@click.command()
@click.option('--dependency-id', required=True, help="Dependency UUID")
@click.option('--bin-name', required=True, help="Binary name to install")
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', default=None, help="Custom install command")
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None):
    """Install binary using Homebrew.

    Exit codes: 0 = installed OK (or brew not in the allowed provider list,
    treated as a skip rather than a failure); 1 = brew unavailable or the
    install failed. Emits exactly one InstalledBinary JSONL record on stdout;
    all human-readable logging goes to stderr.
    """
    # NOTE(review): custom_cmd is accepted but never used in this function --
    # confirm whether it should be forwarded to BrewProvider (the unused
    # BinProviderOverrides import suggests that was the plan) or removed.

    # Skip (exit 0) when the dependency restricts providers and brew is not
    # among them -- another provider hook will handle it
    if bin_providers != '*' and 'brew' not in bin_providers.split(','):
        click.echo(f"brew provider not allowed for {bin_name}", err=True)
        sys.exit(0)

    # Use abx-pkg BrewProvider to install binary
    provider = BrewProvider()
    if not provider.INSTALLER_BIN:
        click.echo("brew not available on this system", err=True)
        sys.exit(1)

    click.echo(f"Installing {bin_name} via brew...", err=True)

    try:
        binary = Binary(name=bin_name, binproviders=[provider]).install()
    except Exception as e:
        click.echo(f"brew install failed: {e}", err=True)
        sys.exit(1)

    # install() returning without an abspath means the binary still isn't
    # resolvable on PATH -- treat that as a failure too
    if not binary.abspath:
        click.echo(f"{bin_name} not found after brew install", err=True)
        sys.exit(1)

    machine_id = os.environ.get('MACHINE_ID', '')

    # Output InstalledBinary JSONL record to stdout (stdout is reserved for
    # machine-readable records consumed by the orchestrator)
    record = {
        'type': 'InstalledBinary',
        'name': bin_name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'brew',
        'machine_id': machine_id,
        'dependency_id': dependency_id,
    }
    print(json.dumps(record))

    # Log human-readable info to stderr
    click.echo(f"Installed {bin_name} at {binary.abspath}", err=True)
    click.echo(f" version: {binary.version}", err=True)

    sys.exit(0)


if __name__ == '__main__':
    main()
|
||||
240
archivebox/plugins/canonical_outputs/on_Snapshot__92_canonical_outputs.py
Executable file
240
archivebox/plugins/canonical_outputs/on_Snapshot__92_canonical_outputs.py
Executable file
@@ -0,0 +1,240 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Create symlinks from plugin outputs to canonical legacy locations.
|
||||
|
||||
This plugin runs after all extractors complete and creates symlinks from the
|
||||
new plugin-based output structure to the legacy canonical output paths that
|
||||
ArchiveBox has historically used. This maintains backward compatibility with
|
||||
existing tools and scripts that expect outputs at specific locations.
|
||||
|
||||
Canonical output paths (from Snapshot.canonical_outputs()):
|
||||
- favicon.ico → favicon/favicon.ico
|
||||
- singlefile.html → singlefile/singlefile.html
|
||||
- readability/content.html → readability/content.html
|
||||
- mercury/content.html → mercury/content.html
|
||||
- htmltotext.txt → htmltotext/htmltotext.txt
|
||||
- output.pdf → pdf/output.pdf
|
||||
- screenshot.png → screenshot/screenshot.png
|
||||
- output.html → dom/output.html
|
||||
- headers.json → headers/headers.json
|
||||
- warc/{timestamp} → wget/warc/{timestamp}
|
||||
|
||||
New plugin outputs:
|
||||
- ssl.json → ssl/ssl.json
|
||||
- seo.json → seo/seo.json
|
||||
- accessibility.json → accessibility/accessibility.json
|
||||
- outlinks.json → outlinks/outlinks.json
|
||||
- redirects.json → redirects/redirects.json
|
||||
- console.jsonl → consolelog/console.jsonl
|
||||
|
||||
Usage: on_Snapshot__92_canonical_outputs.py --url=<url> --snapshot-id=<uuid>
|
||||
|
||||
Environment variables:
|
||||
SAVE_CANONICAL_SYMLINKS: Enable canonical symlinks (default: true)
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.plugins.canonical_outputs'
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional
|
||||
|
||||
# Configure Django if running standalone
|
||||
if __name__ == '__main__':
|
||||
parent_dir = str(Path(__file__).resolve().parent.parent.parent)
|
||||
if parent_dir not in sys.path:
|
||||
sys.path.insert(0, parent_dir)
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
|
||||
import django
|
||||
django.setup()
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Mapping from canonical (legacy) output path -> new plugin output path.
# Keys are where historical ArchiveBox tooling expects each artifact inside
# the snapshot directory; values are where the new plugins actually write
# them. create_canonical_symlinks() links key -> value for each entry.
CANONICAL_MAPPINGS = {
    # Legacy extractors
    'favicon.ico': 'favicon/favicon.ico',
    'singlefile.html': 'singlefile/singlefile.html',
    'readability/content.html': 'readability/content.html',
    'mercury/content.html': 'mercury/content.html',
    'htmltotext.txt': 'htmltotext/htmltotext.txt',
    'output.pdf': 'pdf/output.pdf',
    'screenshot.png': 'screenshot/screenshot.png',
    'output.html': 'dom/output.html',
    'headers.json': 'headers/headers.json',

    # New plugins
    'ssl.json': 'ssl/ssl.json',
    'seo.json': 'seo/seo.json',
    'accessibility.json': 'accessibility/accessibility.json',
    'outlinks.json': 'parse_dom_outlinks/outlinks.json',
    'redirects.json': 'redirects/redirects.json',
    'console.jsonl': 'consolelog/console.jsonl',
}
|
||||
|
||||
|
||||
def create_symlink(target: Path, link: Path, relative: bool = True) -> bool:
    """
    Create a symlink at ``link`` pointing to ``target``.

    Idempotent: if ``link`` already resolves to ``target`` it is left alone;
    any other pre-existing file/symlink at ``link`` is replaced. Parent
    directories for ``link`` are created as needed.

    Args:
        target: The actual file/directory (source)
        link: The symlink to create (destination)
        relative: Whether to create a relative symlink (default: True)

    Returns:
        True if the symlink was created or already correct, False if the
        target is missing or the OS refused the operation.
    """
    try:
        # Skip if target doesn't exist -- nothing to point at
        if not target.exists():
            return False

        # Remove existing symlink/file if present (is_symlink() also catches
        # broken symlinks, for which exists() is False)
        if link.exists() or link.is_symlink():
            if link.is_symlink() and link.resolve() == target.resolve():
                # Already correctly symlinked -- nothing to do
                return True
            link.unlink()

        # Create parent directory
        link.parent.mkdir(parents=True, exist_ok=True)

        # Create relative or absolute symlink
        if relative:
            # Relative links survive the snapshot dir being moved/renamed
            rel_target = os.path.relpath(target, link.parent)
            link.symlink_to(rel_target)
        else:
            link.symlink_to(target)

        return True
    except OSError:
        # Fix: FileNotFoundError and PermissionError are OSError subclasses,
        # so the original three-way tuple (with its unused `e` binding) is
        # redundant -- a single OSError clause covers the same failures.
        return False
|
||||
|
||||
|
||||
def create_canonical_symlinks(snapshot_dir: Path) -> Dict[str, bool]:
    """
    Create every canonical (legacy-path) symlink for one snapshot directory.

    Args:
        snapshot_dir: The snapshot directory (e.g. archive/<timestamp>/)

    Returns:
        Dict mapping each canonical path to whether its symlink was created.
    """
    statuses: Dict[str, bool] = {}

    for legacy_path, plugin_path in CANONICAL_MAPPINGS.items():
        statuses[legacy_path] = create_symlink(
            snapshot_dir / plugin_path,
            snapshot_dir / legacy_path,
            relative=True,
        )

    # Special case: the wget plugin writes WARCs under wget/warc/, while
    # legacy tooling expects a warc/ directory at the snapshot root.
    wget_warc_dir = snapshot_dir / 'wget' / 'warc'
    if wget_warc_dir.exists():
        statuses['warc/'] = create_symlink(wget_warc_dir, snapshot_dir / 'warc', relative=True)

    return statuses
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL being archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Create symlinks from plugin outputs to canonical legacy locations.

    Looks up the Snapshot by UUID via the Django ORM (requires Django to be
    configured), then symlinks every CANONICAL_MAPPINGS entry inside its
    output directory. Emits the orchestrator key=value protocol plus a
    RESULT_JSON line; exits 0 on success/skip, 1 on failure.
    """
    from datetime import datetime
    from archivebox.core.models import Snapshot

    start_ts = datetime.now()
    status = 'failed'
    output = None
    error = ''
    symlinks_created = 0

    try:
        # Check if enabled
        # NOTE(review): CONSTANTS is imported but never used below -- confirm
        # whether the config lookup was meant to go through it.
        from archivebox.config import CONSTANTS
        save_canonical = os.getenv('SAVE_CANONICAL_SYMLINKS', 'true').lower() in ('true', '1', 'yes', 'on')

        if not save_canonical:
            click.echo('Skipping canonical symlinks (SAVE_CANONICAL_SYMLINKS=False)')
            status = 'skipped'
            end_ts = datetime.now()
            click.echo(f'START_TS={start_ts.isoformat()}')
            click.echo(f'END_TS={end_ts.isoformat()}')
            click.echo(f'STATUS={status}')
            # NOTE(review): this RESULT_JSON is hand-built with an f-string,
            # unlike the json.dumps() path below -- a url containing a double
            # quote would produce invalid JSON; consider using json.dumps here.
            click.echo(f'RESULT_JSON={{"extractor": "canonical_outputs", "status": "{status}", "url": "{url}", "snapshot_id": "{snapshot_id}"}}')
            sys.exit(0)

        # Get snapshot (re-raised as ValueError so the generic handler below
        # formats it uniformly)
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
        except Snapshot.DoesNotExist:
            error = f'Snapshot {snapshot_id} not found'
            raise ValueError(error)

        # Get snapshot directory
        snapshot_dir = Path(snapshot.output_dir)
        if not snapshot_dir.exists():
            error = f'Snapshot directory not found: {snapshot_dir}'
            raise FileNotFoundError(error)

        # Create canonical symlinks
        results = create_canonical_symlinks(snapshot_dir)

        # Count successful symlinks (missing plugin outputs count as failures
        # here, so fewer than total is normal)
        symlinks_created = sum(1 for success in results.values() if success)
        total_mappings = len(results)

        status = 'succeeded'
        output = str(snapshot_dir)
        click.echo(f'Created {symlinks_created}/{total_mappings} canonical symlinks')

    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
        click.echo(f'Error: {error}', err=True)

    end_ts = datetime.now()
    duration = (end_ts - start_ts).total_seconds()

    # Print results (orchestrator key=value protocol)
    click.echo(f'START_TS={start_ts.isoformat()}')
    click.echo(f'END_TS={end_ts.isoformat()}')
    click.echo(f'DURATION={duration:.2f}')
    if output:
        click.echo(f'OUTPUT={output}')
    click.echo(f'STATUS={status}')

    if error:
        click.echo(f'ERROR={error}', err=True)

    # Print JSON result (machine-readable summary, one line)
    import json
    result_json = {
        'extractor': 'canonical_outputs',
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'symlinks_created': symlinks_created,
        'error': error or None,
    }
    click.echo(f'RESULT_JSON={json.dumps(result_json)}')

    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()
|
||||
121
archivebox/plugins/captcha2/on_Snapshot__01_captcha2.js
Executable file
121
archivebox/plugins/captcha2/on_Snapshot__01_captcha2.js
Executable file
@@ -0,0 +1,121 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* 2Captcha Extension Plugin
|
||||
*
|
||||
* Installs and configures the 2captcha Chrome extension for automatic
|
||||
* CAPTCHA solving during page archiving.
|
||||
*
|
||||
* Extension: https://chromewebstore.google.com/detail/ifibfemgeogfhoebkmokieepdoobkbpo
|
||||
* Documentation: https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer
|
||||
*
|
||||
* Priority: 01 (early) - Must install before Chrome session starts
|
||||
* Hook: on_Snapshot
|
||||
*
|
||||
* Requirements:
|
||||
* - API_KEY_2CAPTCHA environment variable must be set
|
||||
* - Extension will automatically solve reCAPTCHA, hCaptcha, Cloudflare Turnstile, etc.
|
||||
*/
|
||||
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
|
||||
// Import extension utilities
|
||||
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
|
||||
|
||||
// Extension metadata
|
||||
const EXTENSION = {
|
||||
webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo',
|
||||
name: 'captcha2',
|
||||
};
|
||||
|
||||
// Get extensions directory from environment or use default
|
||||
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
|
||||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
|
||||
|
||||
/**
|
||||
* Install and configure the 2captcha extension
|
||||
*/
|
||||
/**
 * Install the 2captcha Chrome extension into EXTENSIONS_DIR and report
 * whether the solver API key is configured.
 *
 * @returns {Promise<object|null>} extension metadata from
 *   extensionUtils.loadOrInstallExtension(), or null on install failure.
 */
async function installCaptchaExtension() {
  console.log('[*] Installing 2captcha extension...');

  const installed = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
  if (!installed) {
    console.error('[❌] Failed to install 2captcha extension');
    return null;
  }

  // The extension is useless without an API key; warn (but don't fail)
  // when it is absent or still the placeholder value.
  const apiKey = process.env.API_KEY_2CAPTCHA;
  const keyConfigured = Boolean(apiKey) && apiKey !== 'YOUR_API_KEY_HERE';
  if (keyConfigured) {
    console.log('[+] 2captcha extension installed and API key configured');
  } else {
    console.warn('[⚠️] 2captcha extension installed but API_KEY_2CAPTCHA not configured');
    console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
  }

  return installed;
}
|
||||
|
||||
/**
|
||||
* Note: 2captcha configuration is now handled by chrome_session plugin
|
||||
* during first-time browser setup to avoid repeated configuration on every snapshot.
|
||||
* The API key is injected via chrome.storage API once per browser session.
|
||||
*/
|
||||
|
||||
/**
 * Main entry point - install extension before archiving.
 * A JSON cache file lets repeat runs skip the download entirely; the same
 * file is what the chrome_session plugin reads to load the extension.
 */
async function main() {
  const cacheFile = path.join(EXTENSIONS_DIR, 'captcha2.extension.json');

  // Fast path: reuse a previously-installed extension if its manifest is intact.
  const cached = readCachedExtension(cacheFile);
  if (cached) {
    console.log('[*] 2captcha extension already installed (using cache)');
    return cached;
  }

  const extension = await installCaptchaExtension();

  // Persist metadata so chrome_session (and future runs) can load it.
  if (extension) {
    await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
    await fs.promises.writeFile(
      cacheFile,
      JSON.stringify(extension, null, 2)
    );
    console.log(`[+] Extension metadata written to ${cacheFile}`);
  }

  return extension;
}

/** Return cached extension metadata if the cache is valid, otherwise null. */
function readCachedExtension(cacheFile) {
  if (!fs.existsSync(cacheFile)) return null;
  try {
    const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
    const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
    if (fs.existsSync(manifestPath)) return cached;
  } catch (e) {
    // Cache file corrupted, re-install
    console.warn('[⚠️] Extension cache corrupted, re-installing...');
  }
  return null;
}
|
||||
|
||||
// Export functions for use by other plugins
module.exports = { EXTENSION, installCaptchaExtension };

// Run if executed directly (CLI mode); otherwise act purely as a library.
if (require.main === module) {
  (async () => {
    try {
      await main();
      console.log('[✓] 2captcha extension setup complete');
      process.exit(0);
    } catch (err) {
      console.error('[❌] 2captcha extension setup failed:', err);
      process.exit(1);
    }
  })();
}
|
||||
284
archivebox/plugins/captcha2/on_Snapshot__21_captcha2_config.js
Executable file
284
archivebox/plugins/captcha2/on_Snapshot__21_captcha2_config.js
Executable file
@@ -0,0 +1,284 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* 2Captcha Extension Configuration
|
||||
*
|
||||
* Configures the 2captcha extension with API key after Chrome session starts.
|
||||
* Runs once per browser session to inject API key into extension storage.
|
||||
*
|
||||
* Priority: 21 (after chrome_session at 20, before navigation at 30)
|
||||
* Hook: on_Snapshot
|
||||
*
|
||||
* Requirements:
|
||||
* - API_KEY_2CAPTCHA environment variable must be set
|
||||
* - chrome_session must have loaded extensions (extensions.json must exist)
|
||||
*/
|
||||
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Directory shared with the chrome_session plugin (relative to the snapshot cwd)
const OUTPUT_DIR = 'chrome_session';
// Marker file: its presence means this browser session was already configured
const CONFIG_MARKER = path.join(OUTPUT_DIR, '.captcha2_configured');
|
||||
|
||||
/**
 * Read an environment variable with surrounding whitespace stripped,
 * falling back to defaultValue when unset or empty.
 */
function getEnv(name, defaultValue = '') {
  const raw = process.env[name];
  const value = raw || defaultValue;
  return value.trim();
}
|
||||
|
||||
/**
 * Parse --key=value CLI flags from process.argv into an object.
 * Dashes in keys become underscores; a bare flag (no value) becomes true.
 */
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...valueParts] = token.slice(2).split('=');
    const key = rawKey.replace(/-/g, '_');
    parsed[key] = valueParts.join('=') || true;
  }
  return parsed;
}
|
||||
|
||||
/**
 * Inject the 2captcha API key into the extension's storage for the current
 * browser session. Idempotent: a marker file on disk skips repeat runs.
 *
 * Strategies, in order:
 *   1. Evaluate chrome.storage writes inside the extension's background
 *      page / service worker.
 *   2. Open the extension's options page and fill in the API-key form.
 *
 * @returns {Promise<{success: boolean, skipped?: boolean, method?: string, error?: string}>}
 */
async function configure2Captcha() {
  // Check if already configured in this session
  if (fs.existsSync(CONFIG_MARKER)) {
    console.log('[*] 2captcha already configured in this browser session');
    return { success: true, skipped: true };
  }

  // Check if API key is set (the placeholder value counts as unset)
  const apiKey = getEnv('API_KEY_2CAPTCHA');
  if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
    console.warn('[⚠️] 2captcha extension loaded but API_KEY_2CAPTCHA not configured');
    console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
    return { success: false, error: 'API_KEY_2CAPTCHA not configured' };
  }

  // Load extensions metadata written by chrome_session
  const extensionsFile = path.join(OUTPUT_DIR, 'extensions.json');
  if (!fs.existsSync(extensionsFile)) {
    return { success: false, error: 'extensions.json not found - chrome_session must run first' };
  }

  const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8'));
  const captchaExt = extensions.find(ext => ext.name === 'captcha2');

  if (!captchaExt) {
    console.log('[*] 2captcha extension not installed, skipping configuration');
    return { success: true, skipped: true };
  }

  console.log('[*] Configuring 2captcha extension with API key...');

  try {
    // Connect to the existing Chrome session via CDP
    const cdpFile = path.join(OUTPUT_DIR, 'cdp_url.txt');
    if (!fs.existsSync(cdpFile)) {
      return { success: false, error: 'CDP URL not found - chrome_session must run first' };
    }

    const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim();
    const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });

    try {
      // Method 1: inject via the extension's background page / service worker
      if (captchaExt.target && captchaExt.target_ctx) {
        console.log('[*] Attempting to configure via extension background page...');

        // Reconnect to the browser to get fresh target context
        const targets = await browser.targets();
        const extTarget = targets.find(t =>
          t.url().startsWith(`chrome-extension://${captchaExt.id}`)
        );

        if (extTarget) {
          const extContext = await extTarget.worker() || await extTarget.page();

          if (extContext) {
            await extContext.evaluate((key) => {
              // Try all common storage key names used by solver extensions
              if (typeof chrome !== 'undefined' && chrome.storage) {
                const entries = {
                  apiKey: key,
                  api_key: key,
                  '2captcha_apikey': key,
                  apikey: key,
                  'solver-api-key': key,
                };
                chrome.storage.local.set(entries);
                chrome.storage.sync.set(entries);
              }

              // Also try localStorage as fallback
              if (typeof localStorage !== 'undefined') {
                localStorage.setItem('apiKey', key);
                localStorage.setItem('2captcha_apikey', key);
                localStorage.setItem('solver-api-key', key);
              }
            }, apiKey);

            console.log('[+] 2captcha API key configured successfully via background page');

            // Mark as configured so later snapshots skip this step
            fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());

            return { success: true, method: 'background_page' };
          }
        }
      }

      // Method 2: fill in the API key on the extension's options page
      console.log('[*] Attempting to configure via options page...');
      const optionsUrl = `chrome-extension://${captchaExt.id}/options.html`;
      const configPage = await browser.newPage();

      try {
        await configPage.goto(optionsUrl, { waitUntil: 'networkidle0', timeout: 10000 });

        const configured = await configPage.evaluate((key) => {
          // Try to find API key input field
          const selectors = [
            'input[name*="apikey" i]',
            'input[id*="apikey" i]',
            'input[name*="api-key" i]',
            'input[id*="api-key" i]',
            'input[name*="key" i]',
            'input[placeholder*="api" i]',
            'input[type="text"]',
          ];

          for (const selector of selectors) {
            const input = document.querySelector(selector);
            if (input) {
              input.value = key;
              input.dispatchEvent(new Event('input', { bubbles: true }));
              input.dispatchEvent(new Event('change', { bubbles: true }));

              // BUGFIX: ':contains()' is a jQuery-only pseudo-class; passing
              // 'button:contains("Save")' to querySelector throws a
              // SyntaxError, which previously aborted this entire evaluate
              // call. Find a save/apply button by its visible text instead.
              const saveBtn =
                document.querySelector('button[type="submit"], input[type="submit"]') ||
                Array.from(document.querySelectorAll('button')).find((btn) => {
                  const label = (btn.textContent || '').trim().toLowerCase();
                  return label.includes('save') || label.includes('apply');
                });
              if (saveBtn) {
                saveBtn.click();
              }

              // Also save to storage
              if (typeof chrome !== 'undefined' && chrome.storage) {
                chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
                chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
              }

              return true;
            }
          }

          // Fallback: just save to storage without touching the form
          if (typeof chrome !== 'undefined' && chrome.storage) {
            chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
            chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
            return true;
          }

          return false;
        }, apiKey);

        await configPage.close();

        if (configured) {
          console.log('[+] 2captcha API key configured successfully via options page');

          // Mark as configured
          fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());

          return { success: true, method: 'options_page' };
        }
      } catch (e) {
        console.warn(`[⚠️] Failed to configure via options page: ${e.message}`);
        try {
          await configPage.close();
        } catch (e2) {}
      }

      return { success: false, error: 'Could not configure via any method' };
    } finally {
      browser.disconnect();
    }
  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  }
}
|
||||
|
||||
/**
 * CLI entry point: run the configuration step and emit the KEY=VALUE lines
 * (plus RESULT_JSON) that the plugin runner parses.
 */
async function main() {
  const { url, snapshot_id: snapshotId } = parseArgs();

  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__21_captcha2_config.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }

  const startTs = new Date();
  let status = 'failed';
  let error = '';

  try {
    const result = await configure2Captcha();
    if (result.skipped) {
      status = 'skipped';
    } else if (result.success) {
      status = 'succeeded';
    } else {
      error = result.error || 'Configuration failed';
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
  }

  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;

  // Machine-parseable result lines for the plugin runner
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  console.log(`STATUS=${status}`);
  if (error) {
    console.error(`ERROR=${error}`);
  }

  const resultJson = {
    extractor: 'captcha2_config',
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);

  // 'skipped' still counts as a clean exit for the runner
  process.exit(status === 'succeeded' || status === 'skipped' ? 0 : 1);
}
|
||||
|
||||
// Kick off the CLI entry point; any unhandled rejection is fatal.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});
|
||||
184
archivebox/plugins/captcha2/tests/test_captcha2.py
Normal file
184
archivebox/plugins/captcha2/tests/test_captcha2.py
Normal file
@@ -0,0 +1,184 @@
|
||||
"""
|
||||
Unit tests for captcha2 plugin
|
||||
|
||||
Tests invoke the plugin hooks as external processes and verify outputs/side effects.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# Paths to the plugin scripts under test (this tests/ dir lives inside the plugin dir)
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__01_captcha2.js"
CONFIG_SCRIPT = PLUGIN_DIR / "on_Snapshot__21_captcha2_config.js"
|
||||
|
||||
|
||||
def test_install_script_exists():
    """The on_Snapshot install hook script must ship with the plugin."""
    assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}"
|
||||
|
||||
|
||||
def test_config_script_exists():
    """The on_Snapshot configuration hook script must ship with the plugin."""
    assert CONFIG_SCRIPT.exists(), f"Config script not found: {CONFIG_SCRIPT}"
|
||||
|
||||
|
||||
def test_extension_metadata():
    """Loading the install script as a node module exposes correct EXTENSION metadata."""
    with tempfile.TemporaryDirectory() as tmpdir:
        env = {**os.environ, "CHROME_EXTENSIONS_DIR": str(Path(tmpdir) / "chrome_extensions")}

        # Require the module in node and dump its EXTENSION constant as JSON.
        result = subprocess.run(
            ["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"],
            capture_output=True,
            text=True,
            env=env,
        )

        assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"

        metadata = json.loads(result.stdout)
        assert metadata["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
        assert metadata["name"] == "captcha2"
|
||||
|
||||
|
||||
def test_install_creates_cache():
    """Running the install hook writes captcha2.extension.json metadata to disk."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        env = {
            **os.environ,
            "CHROME_EXTENSIONS_DIR": str(ext_dir),
            "API_KEY_2CAPTCHA": "test_api_key",
        }

        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60,
        )

        # The script must report either a fresh install or a cache hit.
        expected_logs = (
            "[*] Installing 2captcha extension",
            "[*] 2captcha extension already installed",
        )
        assert any(msg in result.stdout for msg in expected_logs)

        cache_file = ext_dir / "captcha2.extension.json"
        assert cache_file.exists(), "Cache file should be created"

        cache_data = json.loads(cache_file.read_text())
        assert cache_data["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
        assert cache_data["name"] == "captcha2"
        assert "unpacked_path" in cache_data
        assert "version" in cache_data
|
||||
|
||||
|
||||
def test_install_uses_existing_cache():
    """A pre-seeded cache with a valid manifest should short-circuit the install."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        fake_extension_dir = ext_dir / "ifibfemgeogfhoebkmokieepdoobkbpo__captcha2"
        fake_extension_dir.mkdir(parents=True)

        # Seed an unpacked extension with a manifest, plus the metadata cache file.
        manifest = {"version": "3.7.0", "name": "2Captcha Solver"}
        (fake_extension_dir / "manifest.json").write_text(json.dumps(manifest))

        cache_data = {
            "webstore_id": "ifibfemgeogfhoebkmokieepdoobkbpo",
            "name": "captcha2",
            "unpacked_path": str(fake_extension_dir),
            "version": "3.7.0",
        }
        (ext_dir / "captcha2.extension.json").write_text(json.dumps(cache_data))

        env = {
            **os.environ,
            "CHROME_EXTENSIONS_DIR": str(ext_dir),
            "API_KEY_2CAPTCHA": "test_api_key",
        }

        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=30,
        )

        # Should use cache rather than redownloading
        assert (
            "already installed (using cache)" in result.stdout
            or "Installed extension captcha2" in result.stdout
        )
|
||||
|
||||
|
||||
def test_install_warns_without_api_key():
    """Without API_KEY_2CAPTCHA the install should emit a configuration warning."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        # Deliberately do NOT set API_KEY_2CAPTCHA.
        env = {**os.environ, "CHROME_EXTENSIONS_DIR": str(ext_dir)}

        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60,
        )

        combined_output = result.stdout + result.stderr
        assert (
            "API_KEY_2CAPTCHA not configured" in combined_output
            or "Set API_KEY_2CAPTCHA" in combined_output
        )
|
||||
|
||||
|
||||
def test_install_success_with_api_key():
    """With an API key set, install output should acknowledge the key."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        env = {
            **os.environ,
            "CHROME_EXTENSIONS_DIR": str(ext_dir),
            "API_KEY_2CAPTCHA": "test_valid_api_key_123",
        }

        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60,
        )

        combined_output = result.stdout + result.stderr
        assert "API key configured" in combined_output or "API_KEY_2CAPTCHA" in combined_output
|
||||
|
||||
|
||||
def test_config_script_structure():
    """Static sanity checks on the config script's source text."""
    script_content = CONFIG_SCRIPT.read_text()

    # References the per-session configuration marker file
    assert "CONFIG_MARKER" in script_content or "captcha2_configured" in script_content

    # Reads the API key from the environment
    assert "API_KEY_2CAPTCHA" in script_content

    # Has an async entry point (or at least a main function)
    assert "async function" in script_content or "main" in script_content
|
||||
@@ -0,0 +1,158 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Clean up Chrome browser session started by chrome_session extractor.
|
||||
|
||||
This extractor runs after all Chrome-based extractors (screenshot, pdf, dom)
|
||||
to terminate the Chrome process and clean up any leftover files.
|
||||
|
||||
Usage: on_Snapshot__24_chrome_cleanup.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Terminates Chrome process and removes lock files
|
||||
|
||||
Environment variables:
|
||||
CHROME_USER_DATA_DIR: Chrome profile directory (for lock file cleanup)
|
||||
CHROME_PROFILE_NAME: Chrome profile name (default: Default)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'chrome_cleanup'
|
||||
CHROME_SESSION_DIR = 'chrome_session'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
return os.environ.get(name, default).strip()
|
||||
|
||||
|
||||
def cleanup_chrome_session() -> tuple[bool, str | None, str]:
|
||||
"""
|
||||
Clean up Chrome session started by chrome_session extractor.
|
||||
|
||||
Returns: (success, output_info, error_message)
|
||||
"""
|
||||
session_dir = Path(CHROME_SESSION_DIR)
|
||||
|
||||
if not session_dir.exists():
|
||||
return True, 'No chrome_session directory found', ''
|
||||
|
||||
pid_file = session_dir / 'pid.txt'
|
||||
killed = False
|
||||
|
||||
if pid_file.exists():
|
||||
try:
|
||||
pid = int(pid_file.read_text().strip())
|
||||
|
||||
# Try graceful termination first
|
||||
try:
|
||||
os.kill(pid, signal.SIGTERM)
|
||||
killed = True
|
||||
|
||||
# Wait briefly for graceful shutdown
|
||||
for _ in range(10):
|
||||
try:
|
||||
os.kill(pid, 0) # Check if still running
|
||||
time.sleep(0.1)
|
||||
except OSError:
|
||||
break # Process is gone
|
||||
else:
|
||||
# Force kill if still running
|
||||
try:
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
except OSError as e:
|
||||
# Process might already be dead, that's fine
|
||||
if e.errno == 3: # No such process
|
||||
pass
|
||||
else:
|
||||
return False, None, f'Failed to kill Chrome PID {pid}: {e}'
|
||||
|
||||
except ValueError:
|
||||
return False, None, f'Invalid PID in {pid_file}'
|
||||
except Exception as e:
|
||||
return False, None, f'{type(e).__name__}: {e}'
|
||||
|
||||
# Clean up Chrome profile lock files if configured
|
||||
user_data_dir = get_env('CHROME_USER_DATA_DIR', '')
|
||||
profile_name = get_env('CHROME_PROFILE_NAME', 'Default')
|
||||
|
||||
if user_data_dir:
|
||||
user_data_path = Path(user_data_dir)
|
||||
for lockfile in [
|
||||
user_data_path / 'SingletonLock',
|
||||
user_data_path / profile_name / 'SingletonLock',
|
||||
]:
|
||||
try:
|
||||
lockfile.unlink(missing_ok=True)
|
||||
except Exception:
|
||||
pass # Best effort cleanup
|
||||
|
||||
result_info = f'Chrome cleanup: PID {"killed" if killed else "not found"}'
|
||||
return True, result_info, ''
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL that was loaded')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Clean up Chrome browser session."""

    start_ts = datetime.now(timezone.utc)
    status = 'failed'
    output = None
    error = ''

    try:
        success, output, error = cleanup_chrome_session()
        status = 'succeeded' if success else 'failed'
        if success:
            print(f'Chrome cleanup completed: {output}')
    except Exception as e:
        status = 'failed'
        error = f'{type(e).__name__}: {e}'

    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()

    # Emit the KEY=VALUE lines the plugin runner parses.
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)

    # Structured result for the runner to ingest.
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')

    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()
|
||||
483
archivebox/plugins/chrome_extensions/chrome_extension_utils.js
Executable file
483
archivebox/plugins/chrome_extensions/chrome_extension_utils.js
Executable file
@@ -0,0 +1,483 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Chrome Extension Management Utilities
|
||||
*
|
||||
* Handles downloading, installing, and managing Chrome extensions for browser automation.
|
||||
* Ported from the TypeScript implementation in archivebox.ts
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const crypto = require('crypto');
|
||||
const { exec } = require('child_process');
|
||||
const { promisify } = require('util');
|
||||
const { Readable } = require('stream');
|
||||
const { finished } = require('stream/promises');
|
||||
|
||||
const execAsync = promisify(exec);
|
||||
|
||||
// Optional dependency: the 'unzipper' npm package. When present, `unzip`
// becomes an async (sourcePath, destPath) extractor; when absent it stays
// null and installExtension() shells out to the system `unzip` binary.
let unzip = null;
try {
  const unzipper = require('unzipper');
  unzip = (sourcePath, destPath) =>
    fs.createReadStream(sourcePath)
      .pipe(unzipper.Extract({ path: destPath }))
      .promise();
} catch (err) {
  // 'unzipper' not installed — system unzip command is used as the fallback.
}
|
||||
|
||||
/**
 * Derive the Chrome extension ID for an unpacked extension directory.
 *
 * Chrome identifies unpacked extensions by hashing the directory path with
 * SHA-256 and transliterating the first 32 hex digits into the letters
 * 'a'-'p' (hex 0 -> 'a', ..., hex f -> 'p').
 *
 * @param {string} unpacked_path - Path to the unpacked extension directory
 * @returns {string} - 32-character extension ID
 */
function getExtensionId(unpacked_path) {
  const digestHex = crypto
    .createHash('sha256')
    .update(Buffer.from(unpacked_path, 'utf-8'))
    .digest('hex');

  const BASE = 'a'.charCodeAt(0);
  let id = '';
  for (const hexDigit of digestHex.slice(0, 32)) {
    id += String.fromCharCode(BASE + parseInt(hexDigit, 16));
  }
  return id;
}
|
||||
|
||||
/**
 * Download and install a Chrome extension from the Chrome Web Store.
 *
 * Downloads the CRX (skipped when already on disk or already unpacked),
 * then unpacks it via the system unzip binary, falling back to the
 * optional 'unzipper' package.
 *
 * @param {Object} extension - Extension metadata object
 * @param {string} extension.webstore_id - Chrome Web Store extension ID
 * @param {string} extension.name - Human-readable extension name
 * @param {string} extension.crx_url - URL to download the CRX file
 * @param {string} extension.crx_path - Local path to save the CRX file
 * @param {string} extension.unpacked_path - Path to extract the extension
 * @returns {Promise<boolean>} - True if installation succeeded
 */
async function installExtension(extension) {
  const manifest_path = path.join(extension.unpacked_path, 'manifest.json');

  // Download CRX file if not already downloaded or unpacked
  if (!fs.existsSync(manifest_path) && !fs.existsSync(extension.crx_path)) {
    console.log(`[🛠️] Downloading missing extension ${extension.name} ${extension.webstore_id} -> ${extension.crx_path}`);

    try {
      // Ensure parent directory exists
      const crxDir = path.dirname(extension.crx_path);
      if (!fs.existsSync(crxDir)) {
        fs.mkdirSync(crxDir, { recursive: true });
      }

      // Download CRX file from Chrome Web Store
      const response = await fetch(extension.crx_url);

      if (!response.ok) {
        console.warn(`[⚠️] Failed to download extension ${extension.name}: HTTP ${response.status}`);
        return false;
      }

      if (response.body) {
        const crx_file = fs.createWriteStream(extension.crx_path);
        const crx_stream = Readable.fromWeb(response.body);
        await finished(crx_stream.pipe(crx_file));
      } else {
        console.warn(`[⚠️] Failed to download extension ${extension.name}: No response body`);
        return false;
      }
    } catch (err) {
      console.error(`[❌] Failed to download extension ${extension.name}:`, err);
      return false;
    }
  }

  // Unzip CRX file to unpacked_path
  await fs.promises.mkdir(extension.unpacked_path, { recursive: true });

  try {
    // BUGFIX: quote the interpolated paths — the previous unquoted command
    // broke on paths containing spaces and was a shell-injection hazard.
    await execAsync(`/usr/bin/unzip -o "${extension.crx_path}" -d "${extension.unpacked_path}"`);
  } catch (err1) {
    if (unzip) {
      // Fallback to unzipper library
      try {
        await unzip(extension.crx_path, extension.unpacked_path);
      } catch (err2) {
        // BUGFIX: report the fallback's own error (previously logged err1).
        console.error(`[❌] Failed to unzip ${extension.crx_path}:`, err2.message);
        return false;
      }
    } else {
      console.error(`[❌] Failed to unzip ${extension.crx_path}:`, err1.message);
      return false;
    }
  }

  if (!fs.existsSync(manifest_path)) {
    console.error(`[❌] Failed to install ${extension.crx_path}: could not find manifest.json in unpacked_path`);
    return false;
  }

  return true;
}
|
||||
|
||||
/**
 * Load or install a Chrome extension, computing all derived metadata.
 *
 * NOTE: mutates and returns the passed-in `ext` object.
 *
 * @param {Object} ext - Partial extension metadata (at minimum: webstore_id or unpacked_path)
 * @param {string} [ext.webstore_id] - Chrome Web Store extension ID
 * @param {string} [ext.name] - Human-readable extension name
 * @param {string} [ext.unpacked_path] - Path to unpacked extension
 * @param {string} [extensions_dir] - Directory to store extensions
 * @returns {Promise<Object>} - Complete extension metadata object
 */
async function loadOrInstallExtension(ext, extensions_dir = null) {
  if (!ext.webstore_id && !ext.unpacked_path) {
    throw new Error('Extension must have either {webstore_id} or {unpacked_path}');
  }

  // Resolve the cache directory: explicit arg > env var > default
  const EXTENSIONS_DIR = extensions_dir || process.env.CHROME_EXTENSIONS_DIR || './data/chrome_extensions';

  // Fill in every field that can be derived statically.
  ext.webstore_id = ext.webstore_id || ext.id;
  ext.name = ext.name || ext.webstore_id;
  ext.webstore_url = ext.webstore_url || `https://chromewebstore.google.com/detail/${ext.webstore_id}`;
  ext.crx_url = ext.crx_url || `https://clients2.google.com/service/update2/crx?response=redirect&prodversion=1230&acceptformat=crx3&x=id%3D${ext.webstore_id}%26uc`;
  ext.crx_path = ext.crx_path || path.join(EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}.crx`);
  ext.unpacked_path = ext.unpacked_path || path.join(EXTENSIONS_DIR, `${ext.webstore_id}__${ext.name}`);

  // Lazy accessors over the unpacked manifest file.
  const manifest_path = path.join(ext.unpacked_path, 'manifest.json');
  ext.read_manifest = () => JSON.parse(fs.readFileSync(manifest_path, 'utf-8'));
  ext.read_version = () => fs.existsSync(manifest_path) && ext.read_manifest()?.version || null;

  // No readable version on disk means the extension needs installing.
  if (!ext.read_version()) {
    await installExtension(ext);
  }

  // Unpacked extensions don't have stable IDs; derive it from the path.
  ext.id = getExtensionId(ext.unpacked_path);
  ext.version = ext.read_version();

  if (ext.version) {
    console.log(`[➕] Installed extension ${ext.name} (${ext.version})... ${ext.unpacked_path}`);
  } else {
    console.warn(`[❌] Unable to detect ID and version of installed extension ${ext.unpacked_path}`);
  }

  return ext;
}
|
||||
|
||||
/**
 * Classify a Puppeteer target: is it part of a Chrome extension, and is it
 * the extension's background page / service worker?
 *
 * @param {Object} target - Puppeteer target object
 * @returns {Promise<Object>} - Object with target_is_bg, extension_id, manifest_version, etc.
 */
async function isTargetExtension(target) {
  let target_type;
  let target_ctx;
  let target_url;

  try {
    target_type = target.type();
    target_ctx = (await target.worker()) || (await target.page()) || null;
    target_url = target.url() || target_ctx?.url() || null;
  } catch (err) {
    // Targets can close mid-inspection; treat that harmless race as "closed".
    if (!String(err).includes('No target with given id found')) {
      throw err;
    }
    target_type = 'closed';
    target_ctx = null;
    target_url = 'about:closed';
  }

  // An extension background context is a chrome-extension:// URL running as
  // either a background page (MV2) or a service worker (MV3).
  const is_chrome_extension = target_url?.startsWith('chrome-extension://');
  const runs_in_background =
    target_type === 'background_page' || target_type === 'service_worker';
  const target_is_bg = is_chrome_extension && runs_in_background;
  const target_is_extension = is_chrome_extension || target_is_bg;

  let extension_id = null;
  let manifest_version = null;

  if (target_is_extension) {
    try {
      extension_id = target_url?.split('://')[1]?.split('/')[0] || null;

      if (target_ctx) {
        const manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest());
        manifest_version = manifest?.manifest_version || null;
      }
    } catch (err) {
      // Extension metadata is best-effort; leave fields null on failure.
    }
  }

  return {
    target_is_extension,
    target_is_bg,
    target_type,
    target_ctx,
    target_url,
    extension_id,
    manifest_version,
  };
}
|
||||
|
||||
/**
 * Load extension metadata and connection handlers from a browser target.
 *
 * Matches the target's extension id against the entries in `extensions`,
 * reads the manifest from the live background context, and attaches
 * dispatch* helpers for driving the extension. The matched entry in
 * `extensions` is also updated in place via Object.assign.
 *
 * @param {Array} extensions - Array of extension metadata objects to update
 * @param {Object} target - Puppeteer target object
 * @returns {Promise<Object|null>} - Updated extension object or null if not an extension
 */
async function loadExtensionFromTarget(extensions, target) {
  const {
    target_is_bg,
    target_is_extension,
    target_type,
    target_ctx,
    target_url,
    extension_id,
    manifest_version,
  } = await isTargetExtension(target);

  // Only background pages / service workers with a usable JS context get wired up.
  if (!(target_is_bg && extension_id && target_ctx)) {
    return null;
  }

  // Find matching extension in our list
  const extension = extensions.find(ext => ext.id === extension_id);
  if (!extension) {
    console.warn(`[⚠️] Found loaded extension ${extension_id} that's not in CHROME_EXTENSIONS list`);
    return null;
  }

  // Load manifest from the extension context
  let manifest = null;
  try {
    manifest = await target_ctx.evaluate(() => chrome.runtime.getManifest());
  } catch (err) {
    console.error(`[❌] Failed to read manifest for extension ${extension_id}:`, err);
    return null;
  }

  // Create dispatch methods for communicating with the extension
  const new_extension = {
    ...extension,
    target,
    target_type,
    target_url,
    manifest,
    manifest_version,

    // Trigger extension toolbar button click
    // NOTE(review): resolves only when chrome.action.onClicked fires;
    // chrome.action.openPopup() may not emit that event, so this promise can
    // hang indefinitely — confirm against the target extensions. Also note
    // the tabId argument is currently unused inside the evaluated function.
    dispatchAction: async (tab) => {
      return await target_ctx.evaluate((tabId) => {
        return new Promise((resolve) => {
          chrome.action.onClicked.addListener((tab) => {
            resolve({ success: true, tab });
          });
          chrome.action.openPopup();
        });
      }, tab?.id || null);
    },

    // Send message to extension
    dispatchMessage: async (message, options = {}) => {
      return await target_ctx.evaluate((msg, opts) => {
        return new Promise((resolve) => {
          chrome.runtime.sendMessage(msg, opts, (response) => {
            resolve(response);
          });
        });
      }, message, options);
    },

    // Trigger extension command (keyboard shortcut)
    // NOTE(review): this only *listens* for the command; nothing here fires
    // it, so the returned promise settles only if something else triggers
    // the command externally.
    dispatchCommand: async (command) => {
      return await target_ctx.evaluate((cmd) => {
        return new Promise((resolve) => {
          chrome.commands.onCommand.addListener((receivedCommand) => {
            if (receivedCommand === cmd) {
              resolve({ success: true, command: receivedCommand });
            }
          });
          // Note: Actually triggering commands programmatically is not directly supported
          // This would need to be done via CDP or keyboard simulation
        });
      }, command);
    },
  };

  // Update the extension in the array
  Object.assign(extension, new_extension);

  console.log(`[🔌] Connected to extension ${extension.name} (${extension.version})`);

  return new_extension;
}
|
||||
|
||||
/**
 * Install all extensions in the list if not already installed.
 *
 * @param {Array} extensions - Array of extension metadata objects (mutated in place)
 * @param {string} [extensions_dir] - Directory to store extensions
 * @returns {Promise<Array>} - The same array, with install metadata filled in
 */
async function installAllExtensions(extensions, extensions_dir = null) {
  console.log(`[⚙️] Installing ${extensions.length} chrome extensions...`);

  // Installs are awaited one at a time — presumably to keep downloads and
  // log output sequential; confirm before parallelizing.
  for (const ext of extensions) {
    await loadOrInstallExtension(ext, extensions_dir);
  }

  return extensions;
}
|
||||
|
||||
/**
 * Load and connect to all extensions from a running browser.
 *
 * Walks every target the browser currently exposes and wires up any that
 * belong to a known extension.
 *
 * @param {Object} browser - Puppeteer browser instance
 * @param {Array} extensions - Array of extension metadata objects
 * @returns {Promise<Array>} - Array of loaded extension objects with connection handlers
 */
async function loadAllExtensionsFromBrowser(browser, extensions) {
  console.log(`[⚙️] Loading ${extensions.length} chrome extensions from browser...`);

  const targets = browser.targets();
  for (const candidate of targets) {
    await loadExtensionFromTarget(extensions, candidate);
  }

  return extensions;
}
|
||||
|
||||
/**
 * Load extension manifest.json file
 *
 * @param {string} unpacked_path - Path to unpacked extension directory
 * @returns {object|null} - Parsed manifest object, or null when the file is
 *   missing, unreadable, or contains invalid JSON
 */
function loadExtensionManifest(unpacked_path) {
  const manifestFile = path.join(unpacked_path, 'manifest.json');

  try {
    // A missing file throws ENOENT here, which maps to null just like
    // malformed JSON does.
    return JSON.parse(fs.readFileSync(manifestFile, 'utf-8'));
  } catch (err) {
    return null;
  }
}
|
||||
|
||||
/**
 * Generate Chrome launch arguments for loading extensions.
 *
 * Extensions without an unpacked_path are skipped. If nothing remains after
 * filtering, returns [] rather than emitting a `--load-extension=` flag with
 * an empty value.
 *
 * @param {Array} extensions - Array of extension metadata objects
 * @returns {Array<string>} - Chrome CLI arguments for loading extensions
 */
function getExtensionLaunchArgs(extensions) {
  // Filter out extensions without unpacked_path first
  const validExtensions = (extensions || []).filter(ext => ext.unpacked_path);

  // Bug fix: previously a non-empty list whose entries all lacked
  // unpacked_path still produced `--load-extension=` with no value.
  if (validExtensions.length === 0) {
    return [];
  }

  const unpacked_paths = validExtensions.map(ext => ext.unpacked_path);
  // Fall back to the locally-computed id when no webstore id is known.
  const webstore_ids = validExtensions.map(ext => ext.webstore_id || ext.id);

  return [
    `--load-extension=${unpacked_paths.join(',')}`,
    `--allowlisted-extension-id=${webstore_ids.join(',')}`,
    '--allow-legacy-extension-manifests',
    '--disable-extensions-auto-update',
  ];
}
|
||||
|
||||
// Export all functions
// Public API groups: pure helpers (getExtensionId, loadExtensionManifest,
// getExtensionLaunchArgs), install flow (installExtension,
// loadOrInstallExtension, installAllExtensions), and runtime wiring against a
// live browser (isTargetExtension, loadExtensionFromTarget,
// loadAllExtensionsFromBrowser).
module.exports = {
  getExtensionId,
  loadExtensionManifest,
  installExtension,
  loadOrInstallExtension,
  isTargetExtension,
  loadExtensionFromTarget,
  installAllExtensions,
  loadAllExtensionsFromBrowser,
  getExtensionLaunchArgs,
};
|
||||
|
||||
// CLI usage
// Thin command-line front end over the exported helpers, used by the Python
// test suite and for manual debugging.
if (require.main === module) {
  const [command, ...commandArgs] = process.argv.slice(2);

  const printUsage = () => {
    console.log('Usage: chrome_extension_utils.js <command> [args...]');
    console.log('');
    console.log('Commands:');
    console.log('  getExtensionId <path>');
    console.log('  loadExtensionManifest <path>');
    console.log('  getExtensionLaunchArgs <extensions_json>');
    console.log('  loadOrInstallExtension <webstore_id> <name> [extensions_dir]');
  };

  if (command === undefined) {
    printUsage();
    process.exit(1);
  }

  // One handler per subcommand; each prints its result to stdout.
  const handlers = {
    getExtensionId: async ([unpacked_path]) => {
      console.log(getExtensionId(unpacked_path));
    },
    loadExtensionManifest: async ([unpacked_path]) => {
      console.log(JSON.stringify(loadExtensionManifest(unpacked_path)));
    },
    getExtensionLaunchArgs: async ([extensions_json]) => {
      console.log(JSON.stringify(getExtensionLaunchArgs(JSON.parse(extensions_json))));
    },
    loadOrInstallExtension: async ([webstore_id, name, extensions_dir]) => {
      const ext = await loadOrInstallExtension({ webstore_id, name }, extensions_dir);
      console.log(JSON.stringify(ext, null, 2));
    },
  };

  (async () => {
    try {
      // Object.hasOwn guards against prototype names like 'toString'.
      if (!Object.hasOwn(handlers, command)) {
        console.error(`Unknown command: ${command}`);
        process.exit(1);
      }
      await handlers[command](commandArgs);
    } catch (error) {
      console.error(`Error: ${error.message}`);
      process.exit(1);
    }
  })();
}
|
||||
@@ -0,0 +1,329 @@
|
||||
/**
|
||||
* Unit tests for chrome_extension_utils.js
|
||||
*
|
||||
* Run with: npm test
|
||||
* Or: node --test tests/test_chrome_extension_utils.js
|
||||
*/
|
||||
|
||||
const assert = require('assert');
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const { describe, it, before, after, beforeEach, afterEach } = require('node:test');
|
||||
|
||||
// Import module under test
|
||||
const extensionUtils = require('../chrome_extension_utils.js');
|
||||
|
||||
// Test fixtures
|
||||
const TEST_DIR = path.join(__dirname, '.test_fixtures');
|
||||
const TEST_EXTENSIONS_DIR = path.join(TEST_DIR, 'chrome_extensions');
|
||||
|
||||
describe('chrome_extension_utils', () => {
|
||||
before(() => {
  // Shared scratch directory for every fixture in this suite.
  if (!fs.existsSync(TEST_DIR)) {
    fs.mkdirSync(TEST_DIR, { recursive: true });
  }
});

after(() => {
  // Tear the scratch directory down, including anything a test left behind.
  if (fs.existsSync(TEST_DIR)) {
    fs.rmSync(TEST_DIR, { recursive: true, force: true });
  }
});
|
||||
|
||||
describe('getExtensionId', () => {
  it('should compute extension ID from path', () => {
    const id = extensionUtils.getExtensionId('/path/to/extension');

    // Chrome unpacked-extension IDs are 32 chars using only letters a-p.
    assert.strictEqual(typeof id, 'string');
    assert.strictEqual(id.length, 32);
    assert.match(id, /^[a-p]+$/);
  });

  it('should compute ID even for non-existent paths', () => {
    // The ID is derived from the path string alone, not from disk contents.
    const id = extensionUtils.getExtensionId('/nonexistent/path');

    assert.strictEqual(typeof id, 'string');
    assert.strictEqual(id.length, 32);
    assert.match(id, /^[a-p]+$/);
  });

  it('should return consistent ID for same path', () => {
    const first = extensionUtils.getExtensionId('/path/to/extension');
    const second = extensionUtils.getExtensionId('/path/to/extension');

    assert.strictEqual(first, second);
  });

  it('should return different IDs for different paths', () => {
    assert.notStrictEqual(
      extensionUtils.getExtensionId('/path/to/extension1'),
      extensionUtils.getExtensionId('/path/to/extension2'),
    );
  });
});
|
||||
|
||||
describe('loadExtensionManifest', () => {
  // Lazy so TEST_DIR is only dereferenced when hooks/tests actually run.
  const fixtureDir = () => path.join(TEST_DIR, 'test_extension');

  beforeEach(() => {
    // Fixture: an unpacked extension with a minimal valid MV3 manifest.
    fs.mkdirSync(fixtureDir(), { recursive: true });
    fs.writeFileSync(
      path.join(fixtureDir(), 'manifest.json'),
      JSON.stringify({
        manifest_version: 3,
        name: "Test Extension",
        version: "1.0.0",
      })
    );
  });

  afterEach(() => {
    if (fs.existsSync(fixtureDir())) {
      fs.rmSync(fixtureDir(), { recursive: true });
    }
  });

  it('should load valid manifest.json', () => {
    const manifest = extensionUtils.loadExtensionManifest(fixtureDir());

    assert.notStrictEqual(manifest, null);
    assert.strictEqual(manifest.manifest_version, 3);
    assert.strictEqual(manifest.name, "Test Extension");
    assert.strictEqual(manifest.version, "1.0.0");
  });

  it('should return null for missing manifest', () => {
    const manifest = extensionUtils.loadExtensionManifest(path.join(TEST_DIR, 'nonexistent'));

    assert.strictEqual(manifest, null);
  });

  it('should handle invalid JSON gracefully', () => {
    const brokenDir = path.join(TEST_DIR, 'invalid_extension');
    fs.mkdirSync(brokenDir, { recursive: true });
    fs.writeFileSync(path.join(brokenDir, 'manifest.json'), 'invalid json content');

    try {
      assert.strictEqual(extensionUtils.loadExtensionManifest(brokenDir), null);
    } finally {
      fs.rmSync(brokenDir, { recursive: true });
    }
  });
});
|
||||
|
||||
describe('getExtensionLaunchArgs', () => {
  it('should return empty array for no extensions', () => {
    assert.deepStrictEqual(extensionUtils.getExtensionLaunchArgs([]), []);
  });

  it('should generate correct launch args for single extension', () => {
    const args = extensionUtils.getExtensionLaunchArgs([
      { webstore_id: 'abcd1234', unpacked_path: '/path/to/extension' },
    ]);

    assert.deepStrictEqual(args, [
      '--load-extension=/path/to/extension',
      '--allowlisted-extension-id=abcd1234',
      '--allow-legacy-extension-manifests',
      '--disable-extensions-auto-update',
    ]);
  });

  it('should generate correct launch args for multiple extensions', () => {
    const args = extensionUtils.getExtensionLaunchArgs([
      { webstore_id: 'ext1', unpacked_path: '/path/ext1' },
      { webstore_id: 'ext2', unpacked_path: '/path/ext2' },
      { webstore_id: 'ext3', unpacked_path: '/path/ext3' },
    ]);

    // Paths and ids are joined comma-separated into single flags.
    assert.strictEqual(args.length, 4);
    assert.strictEqual(args[0], '--load-extension=/path/ext1,/path/ext2,/path/ext3');
    assert.strictEqual(args[1], '--allowlisted-extension-id=ext1,ext2,ext3');
  });

  it('should handle extensions with id instead of webstore_id', () => {
    const args = extensionUtils.getExtensionLaunchArgs([
      { id: 'computed_id', unpacked_path: '/path/to/extension' },
    ]);

    assert.strictEqual(args[1], '--allowlisted-extension-id=computed_id');
  });

  it('should filter out extensions without paths', () => {
    const args = extensionUtils.getExtensionLaunchArgs([
      { webstore_id: 'ext1', unpacked_path: '/path/ext1' },
      { webstore_id: 'ext2', unpacked_path: null },
      { webstore_id: 'ext3', unpacked_path: '/path/ext3' },
    ]);

    assert.strictEqual(args[0], '--load-extension=/path/ext1,/path/ext3');
    assert.strictEqual(args[1], '--allowlisted-extension-id=ext1,ext3');
  });
});
|
||||
|
||||
describe('loadOrInstallExtension', () => {
  beforeEach(() => {
    // Fresh extensions dir for every test.
    if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
      fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
    }
  });

  afterEach(() => {
    if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
      fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
    }
  });

  it('should throw error if neither webstore_id nor unpacked_path provided', async () => {
    await assert.rejects(
      async () => {
        await extensionUtils.loadOrInstallExtension({}, TEST_EXTENSIONS_DIR);
      },
      /Extension must have either/
    );
  });

  it('should set correct default values for extension metadata', async () => {
    const input = {
      webstore_id: 'test123',
      name: 'test_extension'
    };

    // Bug fix: the previous version of this test reassigned
    // extensionUtils.installExtension on the exports object, but
    // loadOrInstallExtension calls its module-local installExtension binding,
    // so the "mock" was never used and a real download would be attempted.
    // Pre-install the extension on disk instead so the install path is
    // skipped entirely.
    const extDir = path.join(TEST_EXTENSIONS_DIR, 'test123__test_extension');
    fs.mkdirSync(extDir, { recursive: true });
    fs.writeFileSync(
      path.join(extDir, 'manifest.json'),
      JSON.stringify({ version: '1.0.0' })
    );

    const ext = await extensionUtils.loadOrInstallExtension(input, TEST_EXTENSIONS_DIR);

    assert.strictEqual(ext.webstore_id, 'test123');
    assert.strictEqual(ext.name, 'test_extension');
    assert.ok(ext.webstore_url.includes(ext.webstore_id));
    assert.ok(ext.crx_url.includes(ext.webstore_id));
    assert.ok(ext.crx_path.includes('test123__test_extension.crx'));
    assert.ok(ext.unpacked_path.includes('test123__test_extension'));
  });

  it('should detect version from manifest after installation', async () => {
    const input = {
      webstore_id: 'test456',
      name: 'versioned_extension'
    };

    // Create pre-installed extension so no network install is attempted.
    const extDir = path.join(TEST_EXTENSIONS_DIR, 'test456__versioned_extension');
    fs.mkdirSync(extDir, { recursive: true });
    fs.writeFileSync(
      path.join(extDir, 'manifest.json'),
      JSON.stringify({
        manifest_version: 3,
        name: "Versioned Extension",
        version: "2.5.1"
      })
    );

    const ext = await extensionUtils.loadOrInstallExtension(input, TEST_EXTENSIONS_DIR);

    assert.strictEqual(ext.version, '2.5.1');
  });
});
|
||||
|
||||
describe('isTargetExtension', () => {
  // Minimal stand-in for a Puppeteer target.
  const makeTarget = (type, url) => ({
    type: () => type,
    url: () => url,
    worker: async () => null,
    page: async () => null,
  });

  it('should identify extension targets by URL', async () => {
    const result = await extensionUtils.isTargetExtension(
      makeTarget('service_worker', 'chrome-extension://abcdefgh/background.js')
    );

    assert.strictEqual(result.target_is_extension, true);
    assert.strictEqual(result.target_is_bg, true);
    assert.strictEqual(result.extension_id, 'abcdefgh');
  });

  it('should not identify non-extension targets', async () => {
    const result = await extensionUtils.isTargetExtension(
      makeTarget('page', 'https://example.com')
    );

    assert.strictEqual(result.target_is_extension, false);
    assert.strictEqual(result.target_is_bg, false);
    assert.strictEqual(result.extension_id, null);
  });

  it('should handle closed targets gracefully', async () => {
    // Every accessor throws the "target vanished" race error.
    const explode = () => { throw new Error('No target with given id found'); };
    const result = await extensionUtils.isTargetExtension({
      type: explode,
      url: explode,
      worker: async () => explode(),
      page: async () => explode(),
    });

    assert.strictEqual(result.target_type, 'closed');
    assert.strictEqual(result.target_url, 'about:closed');
  });
});
|
||||
});
|
||||
|
||||
// Run tests if executed directly: this file only registers node:test suites,
// so just point the user at the proper runner invocations.
if (require.main === module) {
  const hints = [
    'Run tests with: npm test',
    'Or: node --test tests/test_chrome_extension_utils.js',
  ];
  hints.forEach((line) => console.log(line));
}
|
||||
@@ -0,0 +1,224 @@
|
||||
"""
|
||||
Unit tests for chrome_extension_utils.js
|
||||
|
||||
Tests invoke the script as an external process and verify outputs/side effects.
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
SCRIPT_PATH = Path(__file__).parent.parent / "chrome_extension_utils.js"
|
||||
|
||||
|
||||
def test_script_exists():
    """The chrome_extension_utils.js script must be present before any CLI test can run."""
    assert SCRIPT_PATH.exists(), f"Script not found: {SCRIPT_PATH}"
|
||||
|
||||
|
||||
def test_get_extension_id():
    """Extension IDs derived from a path are 32 chars drawn from a-p.

    Bug fix: the previous version wrapped this in a TemporaryDirectory
    context whose directory was never used — dead code, now removed.
    """
    test_path = "/path/to/extension"

    # Run script with test path
    result = subprocess.run(
        ["node", str(SCRIPT_PATH), "getExtensionId", test_path],
        capture_output=True,
        text=True
    )

    assert result.returncode == 0, f"Script failed: {result.stderr}"

    extension_id = result.stdout.strip()

    # Should return 32-character ID with only letters a-p
    assert len(extension_id) == 32
    assert all(c in 'abcdefghijklmnop' for c in extension_id)
|
||||
|
||||
|
||||
def test_get_extension_id_consistency():
    """Running getExtensionId twice on the same path must yield identical IDs."""
    cmd = ["node", str(SCRIPT_PATH), "getExtensionId", "/path/to/extension"]

    first = subprocess.run(cmd, capture_output=True, text=True)
    second = subprocess.run(cmd, capture_output=True, text=True)

    assert first.returncode == 0
    assert second.returncode == 0
    assert first.stdout.strip() == second.stdout.strip()
|
||||
|
||||
|
||||
def test_get_extension_id_different_paths():
    """Distinct paths must hash to distinct extension IDs."""
    observed = []
    for candidate in ("/path1", "/path2"):
        proc = subprocess.run(
            ["node", str(SCRIPT_PATH), "getExtensionId", candidate],
            capture_output=True,
            text=True,
        )
        assert proc.returncode == 0
        observed.append(proc.stdout.strip())

    assert observed[0] != observed[1]
|
||||
|
||||
|
||||
def test_load_extension_manifest():
    """A valid manifest.json round-trips through the script unchanged."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "test_extension"
        ext_dir.mkdir()

        # Minimal valid MV3 manifest fixture
        manifest = {
            "manifest_version": 3,
            "name": "Test Extension",
            "version": "1.0.0",
        }
        (ext_dir / "manifest.json").write_text(json.dumps(manifest))

        proc = subprocess.run(
            ["node", str(SCRIPT_PATH), "loadExtensionManifest", str(ext_dir)],
            capture_output=True,
            text=True,
        )

        assert proc.returncode == 0
        loaded = json.loads(proc.stdout)

        # Every fixture field must come back unchanged.
        for key, expected in manifest.items():
            assert loaded[key] == expected
|
||||
|
||||
|
||||
def test_load_extension_manifest_missing():
    """Requesting a manifest from a directory that does not exist yields null."""
    with tempfile.TemporaryDirectory() as tmpdir:
        missing_dir = Path(tmpdir) / "nonexistent"

        proc = subprocess.run(
            ["node", str(SCRIPT_PATH), "loadExtensionManifest", str(missing_dir)],
            capture_output=True,
            text=True,
        )

        # The script prints JSON "null" (or nothing) rather than erroring.
        assert proc.returncode == 0
        assert proc.stdout.strip() in ("null", "")
|
||||
|
||||
|
||||
def test_load_extension_manifest_invalid_json():
    """A manifest.json that is not valid JSON is reported as null, not a crash."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "test_extension"
        ext_dir.mkdir()

        # Deliberately unparseable manifest contents
        (ext_dir / "manifest.json").write_text("invalid json content")

        proc = subprocess.run(
            ["node", str(SCRIPT_PATH), "loadExtensionManifest", str(ext_dir)],
            capture_output=True,
            text=True,
        )

        assert proc.returncode == 0
        assert proc.stdout.strip() in ("null", "")
|
||||
|
||||
|
||||
def test_get_extension_launch_args_empty():
    """An empty extension list produces no Chrome flags at all."""
    proc = subprocess.run(
        ["node", str(SCRIPT_PATH), "getExtensionLaunchArgs", "[]"],
        capture_output=True,
        text=True,
    )

    assert proc.returncode == 0
    assert json.loads(proc.stdout) == []
|
||||
|
||||
|
||||
def test_get_extension_launch_args_single():
    """A single extension yields the full set of four Chrome flags."""
    payload = json.dumps([{
        "webstore_id": "abcd1234",
        "unpacked_path": "/path/to/extension",
    }])

    proc = subprocess.run(
        ["node", str(SCRIPT_PATH), "getExtensionLaunchArgs", payload],
        capture_output=True,
        text=True,
    )

    assert proc.returncode == 0
    flags = json.loads(proc.stdout)

    # Exact flag list, in order.
    assert flags == [
        "--load-extension=/path/to/extension",
        "--allowlisted-extension-id=abcd1234",
        "--allow-legacy-extension-manifests",
        "--disable-extensions-auto-update",
    ]
|
||||
|
||||
|
||||
def test_get_extension_launch_args_multiple():
    """Multiple extensions are comma-joined into single --load-extension/--allowlisted flags."""
    exts = [
        {"webstore_id": f"ext{i}", "unpacked_path": f"/path/ext{i}"}
        for i in (1, 2, 3)
    ]

    proc = subprocess.run(
        ["node", str(SCRIPT_PATH), "getExtensionLaunchArgs", json.dumps(exts)],
        capture_output=True,
        text=True,
    )

    assert proc.returncode == 0
    flags = json.loads(proc.stdout)

    assert flags[0] == "--load-extension=/path/ext1,/path/ext2,/path/ext3"
    assert flags[1] == "--allowlisted-extension-id=ext1,ext2,ext3"
|
||||
|
||||
|
||||
def test_get_extension_launch_args_filter_null_paths():
    """Entries with a null unpacked_path are dropped from both flags."""
    exts = [
        {"webstore_id": "ext1", "unpacked_path": "/path/ext1"},
        {"webstore_id": "ext2", "unpacked_path": None},
        {"webstore_id": "ext3", "unpacked_path": "/path/ext3"},
    ]

    proc = subprocess.run(
        ["node", str(SCRIPT_PATH), "getExtensionLaunchArgs", json.dumps(exts)],
        capture_output=True,
        text=True,
    )

    assert proc.returncode == 0
    flags = json.loads(proc.stdout)

    # ext2 must appear in neither the path list nor the id allowlist.
    assert flags[0] == "--load-extension=/path/ext1,/path/ext3"
    assert flags[1] == "--allowlisted-extension-id=ext1,ext3"
|
||||
@@ -0,0 +1,309 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Navigate the Chrome browser to the target URL.
|
||||
*
|
||||
* This extractor runs AFTER pre-load extractors (21-29) have registered their
|
||||
* CDP listeners. It connects to the existing Chrome session, navigates to the URL,
|
||||
* waits for page load, and captures response headers.
|
||||
*
|
||||
* Usage: on_Snapshot__30_chrome_navigate.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes to chrome_session/:
|
||||
* - response_headers.json: HTTP response headers from main document
|
||||
* - final_url.txt: Final URL after any redirects
|
||||
* - page_loaded.txt: Marker file indicating navigation is complete
|
||||
*
|
||||
* Environment variables:
|
||||
* CHROME_PAGELOAD_TIMEOUT: Timeout for page load in seconds (default: 60)
|
||||
* CHROME_DELAY_AFTER_LOAD: Extra delay after load in seconds (default: 0)
|
||||
* CHROME_WAIT_FOR: Wait condition (default: networkidle2)
|
||||
* - domcontentloaded: DOM is ready, resources may still load
|
||||
* - load: Page fully loaded including resources
|
||||
* - networkidle0: No network activity for 500ms (strictest)
|
||||
* - networkidle2: At most 2 network connections for 500ms
|
||||
*
|
||||
* # Fallbacks
|
||||
* TIMEOUT: Fallback timeout
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'chrome_navigate';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
|
||||
// Parse command line arguments
// Supports --key=value (kebab-case keys become snake_case) and bare --flag
// (stored as boolean true). Note: `--key=` with an empty value also becomes
// true, matching the `|| true` fallback.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [flag, ...rest] = token.slice(2).split('=');
    parsed[flag.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return parsed;
}
|
||||
|
||||
// Get environment variable with default
// (an env var set to the empty string also falls back to defaultValue)
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}

// Parse a boolean-ish env var: true/1/yes/on vs false/0/no/off (case-insensitive);
// anything else yields defaultValue.
function getEnvBool(name, defaultValue = false) {
  const normalized = getEnv(name, '').toLowerCase();
  if (['true', '1', 'yes', 'on'].includes(normalized)) return true;
  if (['false', '0', 'no', 'off'].includes(normalized)) return false;
  return defaultValue;
}

// Parse an integer env var (base 10), falling back to defaultValue on NaN.
function getEnvInt(name, defaultValue = 0) {
  const parsed = parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}

// Parse a float env var, falling back to defaultValue on NaN.
function getEnvFloat(name, defaultValue = 0) {
  const parsed = parseFloat(getEnv(name, String(defaultValue)));
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
|
||||
|
||||
// Read CDP URL from chrome_session
// Returns the trimmed websocket endpoint, or null when the session file is
// absent (i.e. no chrome_session extractor has run yet).
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  return fs.existsSync(cdpFile)
    ? fs.readFileSync(cdpFile, 'utf8').trim()
    : null;
}
|
||||
|
||||
// Read URL from chrome_session (set by chrome_session extractor)
// Returns the trimmed target URL, or null when the session file is absent.
function getTargetUrl() {
  const urlFile = path.join(CHROME_SESSION_DIR, 'url.txt');
  return fs.existsSync(urlFile)
    ? fs.readFileSync(urlFile, 'utf8').trim()
    : null;
}
|
||||
|
||||
// Validate CHROME_WAIT_FOR against Puppeteer's accepted `waitUntil` values.
// Unknown values produce a stderr warning and fall back to 'networkidle2'.
function getWaitCondition() {
  const requested = getEnv('CHROME_WAIT_FOR', 'networkidle2').toLowerCase();
  const valid = new Set(['domcontentloaded', 'load', 'networkidle0', 'networkidle2']);
  if (valid.has(requested)) {
    return requested;
  }
  console.error(`Warning: Invalid CHROME_WAIT_FOR="${requested}", using networkidle2`);
  return 'networkidle2';
}
|
||||
|
||||
// Promise-based sleep helper: resolves after `ms` milliseconds.
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}
|
||||
|
||||
// Navigate the existing chrome_session page to `url` and record the results.
//
// Connects to the already-running browser over CDP (launched by the
// chrome_session extractor), drives the most recently created page, and
// writes artifacts into chrome_session/:
//   - response_headers.json: main-document status/headers + redirect chain
//   - final_url.txt: URL after any redirects
//   - page_loaded.txt: ISO timestamp marker consumed by post-load extractors
//
// Returns { success: true, output, finalUrl, status, redirectCount } on
// success, or { success: false, error } on failure. The browser is only
// disconnected (never closed) so post-load extractors can reuse the session.
async function navigateToUrl(url, cdpUrl) {
  // Timeout precedence: CHROME_PAGELOAD_TIMEOUT > CHROME_TIMEOUT > TIMEOUT (seconds -> ms), default 60s
  const timeout = (getEnvInt('CHROME_PAGELOAD_TIMEOUT') || getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
  const delayAfterLoad = getEnvFloat('CHROME_DELAY_AFTER_LOAD', 0) * 1000;
  const waitUntil = getWaitCondition();

  let browser = null;
  let responseHeaders = {};
  let redirectChain = [];
  let finalUrl = url;

  try {
    // Connect to existing browser
    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });

    // Get all pages and find our target page
    const pages = await browser.pages();
    if (pages.length === 0) {
      return { success: false, error: 'No pages found in browser' };
    }

    // Use the last created page (most likely the one chrome_session created)
    // NOTE(review): assumes no other extractor opened a newer page in between — verify ordering
    const page = pages[pages.length - 1];

    // Set up response interception to capture headers and redirects
    page.on('response', async (response) => {
      const request = response.request();

      // Track redirects (3xx responses and their Location targets)
      if (response.status() >= 300 && response.status() < 400) {
        redirectChain.push({
          url: response.url(),
          status: response.status(),
          location: response.headers()['location'] || null,
        });
      }

      // Capture headers from the main document request only
      if (request.isNavigationRequest() && request.frame() === page.mainFrame()) {
        try {
          responseHeaders = {
            url: response.url(),
            status: response.status(),
            statusText: response.statusText(),
            headers: response.headers(),
          };
          finalUrl = response.url();
        } catch (e) {
          // Ignore errors capturing headers (best-effort metadata)
        }
      }
    });

    // Navigate to URL and wait for load
    console.log(`Navigating to ${url} (wait: ${waitUntil}, timeout: ${timeout}ms)`);

    const response = await page.goto(url, {
      waitUntil,
      timeout,
    });

    // Capture final response if the listener didn't already record one
    if (response && Object.keys(responseHeaders).length === 0) {
      responseHeaders = {
        url: response.url(),
        status: response.status(),
        statusText: response.statusText(),
        headers: response.headers(),
      };
      finalUrl = response.url();
    }

    // Apply optional delay after load (CHROME_DELAY_AFTER_LOAD, for late JS)
    if (delayAfterLoad > 0) {
      console.log(`Waiting ${delayAfterLoad}ms after load...`);
      await sleep(delayAfterLoad);
    }

    // Write response headers (with redirect chain) for downstream extractors
    if (Object.keys(responseHeaders).length > 0) {
      // Add redirect chain to headers
      responseHeaders.redirect_chain = redirectChain;

      fs.writeFileSync(
        path.join(CHROME_SESSION_DIR, 'response_headers.json'),
        JSON.stringify(responseHeaders, null, 2)
      );
    }

    // Write final URL (after redirects)
    fs.writeFileSync(path.join(CHROME_SESSION_DIR, 'final_url.txt'), finalUrl);

    // Write marker file indicating page is loaded
    fs.writeFileSync(
      path.join(CHROME_SESSION_DIR, 'page_loaded.txt'),
      new Date().toISOString()
    );

    // Disconnect but leave browser running for post-load extractors
    browser.disconnect();

    return {
      success: true,
      output: CHROME_SESSION_DIR,
      finalUrl,
      status: responseHeaders.status,
      redirectCount: redirectChain.length,
    };

  } catch (e) {
    // Don't close browser on error - let cleanup handle it
    if (browser) {
      try {
        browser.disconnect();
      } catch (disconnectErr) {
        // Ignore
      }
    }
    return { success: false, error: `${e.name}: ${e.message}` };
  }
}
|
||||
|
||||
// CLI entry point for the chrome_navigate extractor.
//
// Requires --url and --snapshot-id args; requires chrome_session to have
// already written chrome_session/cdp_url.txt. Navigates the session page,
// then prints the standard extractor protocol lines (START_TS/END_TS/
// DURATION/OUTPUT/STATUS/ERROR/RESULT_JSON) and exits 0 on success, 1 otherwise.
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;

  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__30_chrome_navigate.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }

  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';

  try {
    // Check for chrome_session (hard prerequisite: supplies the CDP endpoint)
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      console.error('ERROR: chrome_session not found (cdp_url.txt missing)');
      console.error('chrome_navigate requires chrome_session to run first');
      process.exit(1);
    }

    // Get URL from chrome_session or use provided URL
    const targetUrl = getTargetUrl() || url;

    const result = await navigateToUrl(targetUrl, cdpUrl);

    if (result.success) {
      status = 'succeeded';
      output = result.output;
      console.log(`Page loaded: ${result.finalUrl}`);
      console.log(`HTTP status: ${result.status}`);
      if (result.redirectCount > 0) {
        console.log(`Redirects: ${result.redirectCount}`);
      }
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }

  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;

  // Print results (line-oriented protocol parsed by the plugin runner)
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);

  if (error) {
    console.error(`ERROR=${error}`);
  }

  // Print JSON result (machine-readable summary of this extractor run)
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);

  process.exit(status === 'succeeded' ? 0 : 1);
}
|
||||
|
||||
// Top-level invocation: surface any unhandled rejection as a fatal exit.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});
|
||||
80
archivebox/plugins/chrome_session/config.json
Normal file
80
archivebox/plugins/chrome_session/config.json
Normal file
@@ -0,0 +1,80 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"CHROME_BINARY": {
|
||||
"type": "string",
|
||||
"default": "chromium",
|
||||
"x-aliases": ["CHROMIUM_BINARY", "GOOGLE_CHROME_BINARY"],
|
||||
"description": "Path to Chrome/Chromium binary"
|
||||
},
|
||||
"NODE_BINARY": {
|
||||
"type": "string",
|
||||
"default": "node",
|
||||
"x-aliases": ["NODEJS_BINARY"],
|
||||
"description": "Path to Node.js binary (for Puppeteer)"
|
||||
},
|
||||
"CHROME_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 60,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for Chrome operations in seconds"
|
||||
},
|
||||
"CHROME_HEADLESS": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Run Chrome in headless mode"
|
||||
},
|
||||
"CHROME_SANDBOX": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Enable Chrome sandbox (disable in Docker with --no-sandbox)"
|
||||
},
|
||||
"CHROME_RESOLUTION": {
|
||||
"type": "string",
|
||||
"default": "1440,2000",
|
||||
"pattern": "^\\d+,\\d+$",
|
||||
"x-fallback": "RESOLUTION",
|
||||
"description": "Browser viewport resolution (width,height)"
|
||||
},
|
||||
"CHROME_USER_DATA_DIR": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"description": "Path to Chrome user data directory for persistent sessions"
|
||||
},
|
||||
"CHROME_USER_AGENT": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-fallback": "USER_AGENT",
|
||||
"description": "User agent string for Chrome"
|
||||
},
|
||||
"CHROME_EXTRA_ARGS": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"description": "Extra command-line arguments for Chrome (space-separated)"
|
||||
},
|
||||
"CHROME_CHECK_SSL_VALIDITY": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-fallback": "CHECK_SSL_VALIDITY",
|
||||
"description": "Whether to verify SSL certificates"
|
||||
},
|
||||
"SAVE_SCREENSHOT": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Enable screenshot capture"
|
||||
},
|
||||
"SAVE_PDF": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Enable PDF generation"
|
||||
},
|
||||
"SAVE_DOM": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Enable DOM capture"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,150 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validation hook for Chrome/Chromium binary.
|
||||
|
||||
Runs at crawl start to verify Chrome is available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Common Chrome/Chromium binary names and paths
# Executable names probed on $PATH via shutil.which, in preference order
CHROME_NAMES = [
    'chromium',
    'chromium-browser',
    'google-chrome',
    'google-chrome-stable',
    'chrome',
]

# Well-known absolute install locations (macOS app bundles, Linux, snap)
CHROME_PATHS = [
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/usr/bin/chromium',
    '/usr/bin/chromium-browser',
    '/snap/bin/chromium',
    '/opt/google/chrome/chrome',
]
|
||||
|
||||
|
||||
def get_binary_version(abspath: str) -> str | None:
|
||||
"""Get version string from Chrome binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
# Chrome version string: "Google Chrome 120.0.6099.109" or "Chromium 120.0.6099.109"
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
parts = first_line.split()
|
||||
# Find version number (looks like 120.0.6099.109)
|
||||
for part in parts:
|
||||
if '.' in part and part[0].isdigit():
|
||||
return part
|
||||
return first_line[:32]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_chrome() -> dict | None:
    """Locate a Chrome/Chromium binary and describe it.

    Search order:
      1. CHROME_BINARY env var (must point at an existing file)
      2. names in CHROME_NAMES resolved via PATH (shutil.which)
      3. well-known absolute paths in CHROME_PATHS

    Returns a dict with keys name/abspath/version/sha256/binprovider,
    or None when no binary is found.
    """
    def describe(abspath: str) -> dict:
        # All three discovery methods produce the same record shape,
        # so build it in one place instead of repeating the dict literal.
        return {
            'name': 'chrome',
            'abspath': abspath,
            'version': get_binary_version(abspath),
            'sha256': get_binary_hash(abspath),
            'binprovider': 'env',
        }

    # Check env var first
    env_path = os.environ.get('CHROME_BINARY', '')
    if env_path and Path(env_path).is_file():
        return describe(env_path)

    # Try shutil.which for various names
    for name in CHROME_NAMES:
        abspath = shutil.which(name)
        if abspath:
            return describe(abspath)

    # Check common paths
    for path in CHROME_PATHS:
        if Path(path).is_file():
            return describe(path)

    return None
|
||||
|
||||
|
||||
def main():
    """Emit JSONL records describing the Chrome binary (or a Dependency request).

    On success: prints an InstalledBinary record plus Machine config updates
    (config/CHROME_BINARY, and config/CHROME_VERSION when known), then exits 0.
    On failure: prints a Dependency record asking for chrome and exits 1.
    """
    result = find_chrome()

    if result and result.get('abspath'):
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))

        # Persist the discovered path so later runs can skip detection
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/CHROME_BINARY',
            'value': result['abspath'],
        }))

        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/CHROME_VERSION',
                'value': result['version'],
            }))

        sys.exit(0)
    else:
        # Ask the dependency system to install chrome via one of these providers
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'chrome',
            'bin_providers': 'apt,brew,env',
        }))
        # plain string literal: the original used an f-string with no placeholders
        print("Chrome/Chromium binary not found", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
# Allow invoking this hook directly as a script
if __name__ == '__main__':
    main()
|
||||
@@ -0,0 +1,172 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validate and compute derived Chrome config values.
|
||||
|
||||
This hook runs early in the Crawl lifecycle to:
|
||||
1. Auto-detect Chrome binary location
|
||||
2. Compute sandbox settings based on Docker detection
|
||||
3. Validate binary availability and version
|
||||
4. Set computed env vars for subsequent hooks
|
||||
|
||||
Output:
|
||||
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
|
||||
- InstalledBinary JSONL records to stdout when binaries are found
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
from abx_pkg import Binary, EnvProvider
|
||||
|
||||
|
||||
# Chrome binary search order (each name is probed via the abx-pkg EnvProvider)
CHROME_BINARY_NAMES = [
    'chromium',
    'chromium-browser',
    'google-chrome',
    'google-chrome-stable',
    'chrome',
]
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
    """Return the named environment variable stripped of whitespace.

    `default` is used only when the variable is entirely unset
    (an empty-but-set variable still returns '').
    """
    value = os.environ.get(name, default)
    return value.strip()
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean env var; unrecognized or unset values yield `default`."""
    truthy = ('true', '1', 'yes', 'on')
    falsy = ('false', '0', 'no', 'off')
    val = get_env(name, '').lower()
    if val in truthy:
        return True
    if val in falsy:
        return False
    return default
|
||||
|
||||
|
||||
def detect_docker() -> bool:
    """Heuristically detect whether we are running inside a container.

    Checks the Docker sentinel file, the IN_DOCKER env var, and the
    podman/containerd sentinel file.
    """
    if os.path.exists('/.dockerenv') or os.path.exists('/run/.containerenv'):
        return True
    return os.environ.get('IN_DOCKER', '').lower() in ('true', '1', 'yes')
|
||||
|
||||
|
||||
def find_chrome_binary(configured: str, provider: EnvProvider) -> Binary | None:
    """Find Chrome binary using abx-pkg, checking configured path first.

    `configured` is the CHROME_BINARY config value (a name or path);
    if it can't be resolved, falls back to each name in
    CHROME_BINARY_NAMES. Returns the first Binary whose abspath
    resolves, or None.
    """
    # Try configured binary first
    if configured:
        try:
            binary = Binary(name=configured, binproviders=[provider]).load()
            if binary.abspath:
                return binary
        except Exception:
            # Resolution failure is non-fatal: fall through to the name search
            pass

    # Search common names
    for name in CHROME_BINARY_NAMES:
        try:
            binary = Binary(name=name, binproviders=[provider]).load()
            if binary.abspath:
                return binary
        except Exception:
            continue

    return None
|
||||
|
||||
|
||||
def output_installed_binary(binary: Binary, name: str):
    """Output InstalledBinary JSONL record to stdout.

    The record associates the resolved binary (path/version/sha256) with
    the current machine via the MACHINE_ID env var (empty string if unset).
    """
    machine_id = os.environ.get('MACHINE_ID', '')

    record = {
        'type': 'InstalledBinary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'env',  # all binaries here are discovered via EnvProvider
        'machine_id': machine_id,
    }
    print(json.dumps(record))
|
||||
|
||||
|
||||
def main():
    """Validate Chrome config and emit computed values + binary records.

    Prints COMPUTED:KEY=VALUE lines to stdout (parsed by hooks.py per the
    module docstring), InstalledBinary JSONL records for found binaries,
    WARNING:/ERROR: lines to stderr, and exits 1 if any error occurred.
    """
    warnings = []
    errors = []
    computed = {}  # KEY -> VALUE pairs emitted as COMPUTED: lines at the end

    # Get config values
    chrome_binary = get_env('CHROME_BINARY', 'chromium')
    chrome_sandbox = get_env_bool('CHROME_SANDBOX', True)
    save_screenshot = get_env_bool('SAVE_SCREENSHOT', True)
    save_pdf = get_env_bool('SAVE_PDF', True)
    save_dom = get_env_bool('SAVE_DOM', True)

    # Compute USE_CHROME (derived from SAVE_* flags): Chrome is needed
    # if any Chrome-based extractor is enabled
    use_chrome = save_screenshot or save_pdf or save_dom
    computed['USE_CHROME'] = str(use_chrome).lower()

    # Detect Docker and adjust sandbox
    in_docker = detect_docker()
    computed['IN_DOCKER'] = str(in_docker).lower()

    if in_docker and chrome_sandbox:
        warnings.append(
            "Running in Docker with CHROME_SANDBOX=true. "
            "Chrome may fail to start. Consider setting CHROME_SANDBOX=false."
        )
        # Auto-disable sandbox in Docker unless explicitly set
        # (get_env returns '' when the var is unset, so chrome_sandbox=True
        # here means it came from the default, not from the user)
        if not get_env('CHROME_SANDBOX'):
            computed['CHROME_SANDBOX'] = 'false'

    # Find Chrome binary using abx-pkg
    provider = EnvProvider()
    if use_chrome:
        chrome = find_chrome_binary(chrome_binary, provider)
        if not chrome or not chrome.abspath:
            errors.append(
                f"Chrome binary not found (tried: {chrome_binary}). "
                "Install Chrome/Chromium or set CHROME_BINARY path."
            )
            computed['CHROME_BINARY'] = ''
        else:
            computed['CHROME_BINARY'] = str(chrome.abspath)
            computed['CHROME_VERSION'] = str(chrome.version) if chrome.version else 'unknown'

            # Output InstalledBinary JSONL record for Chrome
            output_installed_binary(chrome, name='chrome')

    # Check Node.js for Puppeteer
    # (runs unconditionally; the `use_chrome` guard below decides whether a
    # missing Node is an error)
    node_binary_name = get_env('NODE_BINARY', 'node')
    try:
        node = Binary(name=node_binary_name, binproviders=[provider]).load()
        node_path = str(node.abspath) if node.abspath else ''
    except Exception:
        node = None
        node_path = ''

    if use_chrome and not node_path:
        errors.append(
            f"Node.js not found (tried: {node_binary_name}). "
            "Install Node.js or set NODE_BINARY path for Puppeteer."
        )
    else:
        # NOTE(review): NODE_BINARY is left unset in `computed` when the
        # error branch above is taken — confirm downstream hooks tolerate that
        computed['NODE_BINARY'] = node_path
        if node and node.abspath:
            # Output InstalledBinary JSONL record for Node
            output_installed_binary(node, name='node')

    # Output computed values (COMPUTED:KEY=VALUE protocol lines)
    for key, value in computed.items():
        print(f"COMPUTED:{key}={value}")

    for warning in warnings:
        print(f"WARNING:{warning}", file=sys.stderr)

    for error in errors:
        print(f"ERROR:{error}", file=sys.stderr)

    sys.exit(1 if errors else 0)
|
||||
|
||||
|
||||
# Allow invoking this hook directly as a script
if __name__ == '__main__':
    main()
|
||||
350
archivebox/plugins/chrome_session/on_Snapshot__20_chrome_session.js
Executable file
350
archivebox/plugins/chrome_session/on_Snapshot__20_chrome_session.js
Executable file
@@ -0,0 +1,350 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Start a Chrome browser session for use by other extractors.
|
||||
*
|
||||
* This extractor ONLY launches Chrome and creates a blank page - it does NOT navigate.
|
||||
* Pre-load extractors (21-29) can connect via CDP to register listeners before navigation.
|
||||
* The chrome_navigate extractor (30) performs the actual page load.
|
||||
*
|
||||
* Usage: on_Snapshot__20_chrome_session.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Creates chrome_session/ with:
|
||||
* - cdp_url.txt: WebSocket URL for CDP connection
|
||||
* - pid.txt: Chrome process ID (for cleanup)
|
||||
* - page_id.txt: Target ID of the page for other extractors to use
|
||||
* - url.txt: The URL to be navigated to (for chrome_navigate)
|
||||
*
|
||||
* Environment variables:
|
||||
* CHROME_BINARY: Path to Chrome/Chromium binary
|
||||
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
|
||||
* CHROME_USER_AGENT: User agent string (optional)
|
||||
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
|
||||
* CHROME_HEADLESS: Run in headless mode (default: true)
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Import extension utilities
|
||||
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
|
||||
|
||||
// Extractor metadata
const EXTRACTOR_NAME = 'chrome_session';  // name reported in the RESULT_JSON output line
const OUTPUT_DIR = 'chrome_session';  // where cdp_url.txt / pid.txt / page_id.txt / url.txt are written

// Get extensions directory from environment or use default
// (falls back to <DATA_DIR>/personas/<ACTIVE_PERSONA>/chrome_extensions)
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
    path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
|
||||
|
||||
// Parse `--key=value` style CLI flags into an object.
// Dashes in flag names are converted to underscores; bare flags become `true`.
function parseArgs() {
  const result = {};
  for (const raw of process.argv.slice(2)) {
    if (!raw.startsWith('--')) continue;
    const [name, ...pieces] = raw.slice(2).split('=');
    result[name.replace(/-/g, '_')] = pieces.join('=') || true;
  }
  return result;
}
|
||||
|
||||
// Read a trimmed environment variable; unset or empty values fall back
// to defaultValue (|| semantics, so '' counts as unset).
function getEnv(name, defaultValue = '') {
  const value = process.env[name] || defaultValue;
  return value.trim();
}
|
||||
|
||||
// Parse a boolean env var; anything unrecognized yields defaultValue.
function getEnvBool(name, defaultValue = false) {
  const normalized = getEnv(name, '').toLowerCase();
  if (['true', '1', 'yes', 'on'].includes(normalized)) {
    return true;
  }
  if (['false', '0', 'no', 'off'].includes(normalized)) {
    return false;
  }
  return defaultValue;
}
|
||||
|
||||
// Parse an integer env var; non-numeric or unset values yield defaultValue.
function getEnvInt(name, defaultValue = 0) {
  const parsed = Number.parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
|
||||
|
||||
|
||||
// Resolve the Chrome/Chromium executable path.
// Honors CHROME_BINARY when it points at an existing file, then falls back
// to a list of well-known install locations. Returns null when none exist.
function findChrome() {
  const configured = getEnv('CHROME_BINARY');
  if (configured && fs.existsSync(configured)) {
    return configured;
  }

  const knownPaths = [
    // Linux
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/usr/bin/chromium',
    '/usr/bin/chromium-browser',
    // macOS
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
  ];
  return knownPaths.find((candidate) => fs.existsSync(candidate)) ?? null;
}
|
||||
|
||||
// Parse a "width,height" string into a viewport object.
// Missing or non-numeric parts fall back to 1440x2000.
function parseResolution(resolution) {
  const parts = resolution.split(',').map((part) => Number.parseInt(part.trim(), 10));
  return { width: parts[0] || 1440, height: parts[1] || 2000 };
}
|
||||
|
||||
// Load installed extensions from cache files.
//
// Scans EXTENSIONS_DIR for *.extension.json records written by the extension
// plugins, keeps only those whose unpacked_path still contains a
// manifest.json, and returns the parsed records. Unreadable/corrupt cache
// files are skipped with a warning rather than failing the whole session.
function loadInstalledExtensions() {
  const extensions = [];

  if (!fs.existsSync(EXTENSIONS_DIR)) {
    return extensions;
  }

  // Look for *.extension.json cache files created by extension plugins
  const files = fs.readdirSync(EXTENSIONS_DIR);
  const extensionFiles = files.filter(f => f.endsWith('.extension.json'));

  for (const file of extensionFiles) {
    try {
      const filePath = path.join(EXTENSIONS_DIR, file);
      const data = fs.readFileSync(filePath, 'utf-8');
      const extension = JSON.parse(data);

      // Verify extension is actually installed (manifest still on disk)
      const manifestPath = path.join(extension.unpacked_path, 'manifest.json');
      if (fs.existsSync(manifestPath)) {
        extensions.push(extension);
        console.log(`[+] Loaded extension: ${extension.name} (${extension.webstore_id})`);
      }
    } catch (e) {
      console.warn(`[⚠️] Failed to load extension from ${file}: ${e.message}`);
    }
  }

  return extensions;
}
|
||||
|
||||
|
||||
// Launch a Chrome browser session and prepare a blank page (no navigation).
//
// Writes into OUTPUT_DIR:
//   - cdp_url.txt: websocket endpoint for other extractors to connect to
//   - pid.txt: Chrome process id (for cleanup)
//   - page_id.txt: target id of the prepared page
//   - url.txt: the URL chrome_navigate should load
//   - extensions.json: metadata for extensions loaded into the browser
//
// Returns { success: true, output, cdpUrl, targetId } and leaves the browser
// RUNNING (puppeteer only disconnects); on failure the browser is closed and
// { success: false, error } is returned.
async function startChromeSession(url, binary) {
  // Config with legacy fallbacks: CHROME_* first, then the generic names
  const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
  const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
  const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
  const headless = getEnvBool('CHROME_HEADLESS', true);

  const { width, height } = parseResolution(resolution);

  // Load installed extensions
  const extensions = loadInstalledExtensions();
  const extensionArgs = extensionUtils.getExtensionLaunchArgs(extensions);

  if (extensions.length > 0) {
    console.log(`[*] Loading ${extensions.length} Chrome extensions...`);
  }

  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }

  let browser = null;

  try {
    // Launch browser with Puppeteer
    // NOTE(review): '--no-sandbox' is always passed regardless of the
    // CHROME_SANDBOX config option — confirm this is intentional
    browser = await puppeteer.launch({
      executablePath: binary,
      headless: headless ? 'new' : false,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-sync',
        '--no-first-run',
        '--no-default-browser-check',
        '--disable-default-apps',
        '--disable-infobars',
        '--disable-blink-features=AutomationControlled',
        '--disable-component-update',
        '--disable-domain-reliability',
        '--disable-breakpad',
        '--disable-background-networking',
        '--disable-background-timer-throttling',
        '--disable-backgrounding-occluded-windows',
        '--disable-renderer-backgrounding',
        '--disable-ipc-flooding-protection',
        '--password-store=basic',
        '--use-mock-keychain',
        '--font-render-hinting=none',
        '--force-color-profile=srgb',
        `--window-size=${width},${height}`,
        ...(checkSsl ? [] : ['--ignore-certificate-errors']),
        ...extensionArgs,
      ],
      defaultViewport: { width, height },
    });

    // Get the WebSocket endpoint URL (other extractors connect via this)
    const cdpUrl = browser.wsEndpoint();
    fs.writeFileSync(path.join(OUTPUT_DIR, 'cdp_url.txt'), cdpUrl);

    // Write PID for cleanup
    const browserProcess = browser.process();
    if (browserProcess) {
      fs.writeFileSync(path.join(OUTPUT_DIR, 'pid.txt'), String(browserProcess.pid));
    }

    // Create a new page (but DON'T navigate yet)
    const page = await browser.newPage();

    // Set user agent if specified
    if (userAgent) {
      await page.setUserAgent(userAgent);
    }

    // Write the page target ID so other extractors can find this specific page
    // NOTE(review): _targetId is a private puppeteer field — may break across versions
    const target = page.target();
    const targetId = target._targetId;
    fs.writeFileSync(path.join(OUTPUT_DIR, 'page_id.txt'), targetId);

    // Write the URL for chrome_navigate to use
    fs.writeFileSync(path.join(OUTPUT_DIR, 'url.txt'), url);

    // Connect to loaded extensions at runtime (only if not already done)
    const extensionsFile = path.join(OUTPUT_DIR, 'extensions.json');
    if (extensions.length > 0 && !fs.existsSync(extensionsFile)) {
      console.log('[*] Connecting to loaded extensions (first time setup)...');
      try {
        const loadedExtensions = await extensionUtils.loadAllExtensionsFromBrowser(browser, extensions);

        // Write loaded extensions metadata for other extractors to use
        fs.writeFileSync(extensionsFile, JSON.stringify(loadedExtensions, null, 2));

        console.log(`[+] Extensions loaded and available at ${extensionsFile}`);
        console.log(`[+] ${loadedExtensions.length} extensions ready for configuration by subsequent plugins`);
      } catch (e) {
        console.warn(`[⚠️] Failed to load extensions from browser: ${e.message}`);
      }
    } else if (extensions.length > 0) {
      console.log('[*] Extensions already loaded from previous snapshot');
    }

    // Don't close browser - leave it running for other extractors
    // Detach puppeteer from browser so it stays running
    browser.disconnect();

    return { success: true, output: OUTPUT_DIR, cdpUrl, targetId };

  } catch (e) {
    // Kill browser if startup failed
    if (browser) {
      try {
        await browser.close();
      } catch (closeErr) {
        // Ignore
      }
    }
    return { success: false, error: `${e.name}: ${e.message}` };
  }
}
|
||||
|
||||
// CLI entry point for the chrome_session extractor.
//
// Requires --url and --snapshot-id args. Finds Chrome, records its version,
// launches the session (no navigation), then prints the standard extractor
// protocol lines (START_TS/END_TS/DURATION/VERSION/OUTPUT/STATUS/ERROR/
// RESULT_JSON) and exits 0 on success, 1 otherwise.
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;

  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__20_chrome_session.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }

  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  let version = '';

  try {
    // chrome_session launches Chrome and creates a blank page
    // Pre-load extractors (21-29) register CDP listeners
    // chrome_navigate (30) performs actual navigation
    const binary = findChrome();
    if (!binary) {
      // Emit machine-readable dependency hints for the plugin runner
      console.error('ERROR: Chrome/Chromium binary not found');
      console.error('DEPENDENCY_NEEDED=chrome');
      console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew');
      console.error('INSTALL_HINT=npx @puppeteer/browsers install chrome@stable');
      process.exit(1);
    }

    // Get Chrome version (best-effort; empty string on failure)
    try {
      const { execSync } = require('child_process');
      version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 }).trim().slice(0, 64);
    } catch (e) {
      version = '';
    }

    const result = await startChromeSession(url, binary);

    if (result.success) {
      status = 'succeeded';
      output = result.output;
      console.log(`Chrome session started (no navigation yet): ${result.cdpUrl}`);
      console.log(`Page target ID: ${result.targetId}`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }

  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;

  // Print results (line-oriented protocol parsed by the plugin runner)
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (version) {
    console.log(`VERSION=${version}`);
  }
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);

  if (error) {
    console.error(`ERROR=${error}`);
  }

  // Print JSON result (machine-readable summary of this extractor run)
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    cmd_version: version,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);

  process.exit(status === 'succeeded' ? 0 : 1);
}
|
||||
|
||||
// Top-level invocation: surface any unhandled rejection as a fatal exit.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});
|
||||
297
archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js
Executable file
297
archivebox/plugins/consolelog/on_Snapshot__21_consolelog.js
Executable file
@@ -0,0 +1,297 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Capture console output from a page.
|
||||
*
|
||||
* Captures all console messages during page load:
|
||||
* - log, warn, error, info, debug
|
||||
* - Includes stack traces for errors
|
||||
* - Timestamps for each message
|
||||
*
|
||||
* Usage: on_Snapshot__14_consolelog.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes consolelog/console.jsonl (one message per line)
|
||||
*
|
||||
* Environment variables:
|
||||
* SAVE_CONSOLELOG: Enable console log capture (default: true)
|
||||
* CONSOLELOG_TIMEOUT: Capture duration in seconds (default: 5)
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'consolelog';
|
||||
const OUTPUT_DIR = 'consolelog';
|
||||
const OUTPUT_FILE = 'console.jsonl';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
// Parse `--key=value` CLI flags into a plain object.
// Dashes in keys become underscores; a bare `--flag` becomes `true`.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...rest] = token.slice(2).split('=');
    parsed[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return parsed;
}
|
||||
|
||||
// Get environment variable with default
|
||||
// Read an environment variable, trimmed of whitespace.
// Falls back to defaultValue when unset OR empty (|| semantics).
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}
|
||||
|
||||
// Interpret an env var as a boolean.
// true/1/yes/on → true; false/0/no/off → false; anything else → defaultValue.
function getEnvBool(name, defaultValue = false) {
  switch (getEnv(name, '').toLowerCase()) {
    case 'true': case '1': case 'yes': case 'on':
      return true;
    case 'false': case '0': case 'no': case 'off':
      return false;
    default:
      return defaultValue;
  }
}
|
||||
|
||||
// Interpret an env var as a base-10 integer, falling back to
// defaultValue when the variable is unset or unparseable.
function getEnvInt(name, defaultValue = 0) {
  const parsed = Number.parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
|
||||
|
||||
// Get CDP URL from chrome_session
|
||||
// Return the CDP websocket URL written by the chrome_session extractor,
// or null when no session file exists.
function getCdpUrl() {
  const sessionFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(sessionFile)) {
    return null;
  }
  return fs.readFileSync(sessionFile, 'utf8').trim();
}
|
||||
|
||||
// Serialize console message arguments
|
||||
// Convert puppeteer JSHandle console arguments into plain JSON-safe values.
// Falls back to String(arg), then '[Unserializable]', when jsonValue() throws.
async function serializeArgs(args) {
  const values = [];
  for (const handle of args) {
    try {
      values.push(await handle.jsonValue());
    } catch (primaryErr) {
      try {
        values.push(String(handle));
      } catch (secondaryErr) {
        values.push('[Unserializable]');
      }
    }
  }
  return values;
}
|
||||
|
||||
// Capture console logs
|
||||
// Connect to the shared Chrome session and record console output as JSONL.
// Listens for console messages, uncaught page errors, and failed network
// requests for CONSOLELOG_TIMEOUT seconds, appending one JSON object per line.
async function captureConsoleLogs(url) {
  const captureTimeout = (getEnvInt('CONSOLELOG_TIMEOUT') || 5) * 1000;

  // Ensure the output directory exists and start from an empty file.
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
  fs.writeFileSync(outputPath, '');

  let browser = null;
  const consoleLogs = [];

  // Persist an entry immediately (JSONL) and keep it for the summary stats.
  const record = (entry) => {
    fs.appendFileSync(outputPath, JSON.stringify(entry) + '\n');
    consoleLogs.push(entry);
  };

  try {
    // Requires the chrome_session extractor to have started a browser first.
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
    }

    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });

    // Prefer the page that has already navigated somewhere.
    const pages = await browser.pages();
    const page = pages.find(p => p.url().startsWith('http')) || pages[0];
    if (!page) {
      return { success: false, error: 'No page found in Chrome session' };
    }

    page.on('console', async (msg) => {
      try {
        const location = msg.location();
        const args = await serializeArgs(msg.args());
        record({
          timestamp: new Date().toISOString(),
          type: msg.type(),
          text: msg.text(),
          args,
          location: {
            url: location.url || '',
            lineNumber: location.lineNumber,
            columnNumber: location.columnNumber,
          },
        });
      } catch (e) {
        // Error processing console message, skip it
        console.error(`Error processing console message: ${e.message}`);
      }
    });

    page.on('pageerror', (error) => {
      try {
        record({
          timestamp: new Date().toISOString(),
          type: 'error',
          text: error.message,
          stack: error.stack || '',
          location: {},
        });
      } catch (e) {
        console.error(`Error processing page error: ${e.message}`);
      }
    });

    page.on('requestfailed', (request) => {
      try {
        const failure = request.failure();
        record({
          timestamp: new Date().toISOString(),
          type: 'request_failed',
          text: `Request failed: ${request.url()}`,
          error: failure ? failure.errorText : 'Unknown error',
          url: request.url(),
          location: {},
        });
      } catch (e) {
        console.error(`Error processing request failure: ${e.message}`);
      }
    });

    // Let the listeners run for the configured capture window.
    await new Promise(resolve => setTimeout(resolve, captureTimeout));

    // Tally message counts per type for the summary line.
    const logStats = {};
    for (const log of consoleLogs) {
      logStats[log.type] = (logStats[log.type] || 0) + 1;
    }

    return {
      success: true,
      output: outputPath,
      logCount: consoleLogs.length,
      logStats,
    };

  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Leave the shared session running; just detach our CDP connection.
    if (browser) {
      browser.disconnect();
    }
  }
}
|
||||
|
||||
// Entry point: parse CLI args, run the console-log capture, and print the
// KEY=value + RESULT_JSON summary lines that hooks.py parses from stdout.
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;

  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__14_consolelog.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }

  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  let logCount = 0;

  try {
    // Permanent skip when the feature is disabled via env var.
    if (!getEnvBool('SAVE_CONSOLELOG', true)) {
      console.log('Skipping console log (SAVE_CONSOLELOG=False)');
      status = 'skipped';
      const endTs = new Date();
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${endTs.toISOString()}`);
      console.log(`STATUS=${status}`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
      process.exit(0);
    }

    const result = await captureConsoleLogs(url);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      logCount = result.logCount || 0;
      const statsStr = Object.entries(result.logStats || {})
        .map(([type, count]) => `${count} ${type}`)
        .join(', ');
      console.log(`Captured ${logCount} console messages: ${statsStr}`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }

  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;

  // Summary lines parsed by the orchestrator.
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);

  if (error) {
    console.error(`ERROR=${error}`);
  }

  // Structured JSON summary for machine parsing.
  console.log(`RESULT_JSON=${JSON.stringify({
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    log_count: logCount,
    error: error || null,
  })}`);

  process.exit(status === 'succeeded' ? 0 : 1);
}

// Top-level runner: any unhandled rejection is fatal for the extractor.
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});
|
||||
@@ -0,0 +1,92 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install a binary using a custom bash command.
|
||||
|
||||
This provider runs arbitrary shell commands to install binaries
|
||||
that don't fit into standard package managers.
|
||||
|
||||
Usage: on_Dependency__install_using_custom_bash.py --dependency-id=<uuid> --bin-name=<name> --custom-cmd=<cmd>
|
||||
Output: InstalledBinary JSONL record to stdout after installation
|
||||
|
||||
Environment variables:
|
||||
MACHINE_ID: Machine UUID (set by orchestrator)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
import rich_click as click
|
||||
from abx_pkg import Binary, EnvProvider
|
||||
|
||||
|
||||
@click.command()
@click.option('--dependency-id', required=True, help="Dependency UUID")
@click.option('--bin-name', required=True, help="Binary name to install")
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', required=True, help="Custom bash command to run")
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str):
    """Install binary using custom bash command."""

    # Skip quietly when the custom provider is not in the allowed list.
    allowed = bin_providers == '*' or 'custom' in bin_providers.split(',')
    if not allowed:
        click.echo(f"custom provider not allowed for {bin_name}", err=True)
        sys.exit(0)

    if not custom_cmd:
        click.echo("custom provider requires --custom-cmd", err=True)
        sys.exit(1)

    click.echo(f"Installing {bin_name} via custom command: {custom_cmd}", err=True)

    # Run the arbitrary install command through the shell with a hard timeout.
    try:
        proc = subprocess.run(
            custom_cmd,
            shell=True,
            capture_output=True,
            text=True,
            timeout=600,  # 10 minute timeout for custom installs
        )
        if proc.returncode != 0:
            click.echo(f"Custom install failed: {proc.stderr}", err=True)
            sys.exit(1)
    except subprocess.TimeoutExpired:
        click.echo("Custom install timed out", err=True)
        sys.exit(1)

    # Verify the binary is now discoverable and collect version/hash info.
    provider = EnvProvider()
    try:
        binary = Binary(name=bin_name, binproviders=[provider]).load()
    except Exception as e:
        click.echo(f"{bin_name} not found after custom install: {e}", err=True)
        sys.exit(1)

    if not binary.abspath:
        click.echo(f"{bin_name} not found after custom install", err=True)
        sys.exit(1)

    machine_id = os.environ.get('MACHINE_ID', '')

    # Emit the InstalledBinary record as JSONL on stdout for the orchestrator.
    print(json.dumps({
        'type': 'InstalledBinary',
        'name': bin_name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'custom',
        'machine_id': machine_id,
        'dependency_id': dependency_id,
    }))

    # Human-readable summary goes to stderr.
    click.echo(f"Installed {bin_name} at {binary.abspath}", err=True)
    click.echo(f"  version: {binary.version}", err=True)

    sys.exit(0)


if __name__ == '__main__':
    main()
|
||||
296
archivebox/plugins/dom/on_Snapshot__36_dom.js
Normal file
296
archivebox/plugins/dom/on_Snapshot__36_dom.js
Normal file
@@ -0,0 +1,296 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Dump the DOM of a URL using Chrome/Puppeteer.
|
||||
*
|
||||
* If a Chrome session exists (from chrome_session extractor), connects to it via CDP.
|
||||
* Otherwise launches a new Chrome instance.
|
||||
*
|
||||
* Usage: on_Snapshot__23_dom.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes dom/output.html
|
||||
*
|
||||
* Environment variables:
|
||||
* CHROME_BINARY: Path to Chrome/Chromium binary
|
||||
* CHROME_TIMEOUT: Timeout in seconds (default: 60)
|
||||
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
|
||||
* CHROME_USER_AGENT: User agent string (optional)
|
||||
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
|
||||
* CHROME_HEADLESS: Run in headless mode (default: true)
|
||||
* SAVE_DOM: Enable DOM extraction (default: true)
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'dom';
|
||||
const OUTPUT_DIR = 'dom';
|
||||
const OUTPUT_FILE = 'output.html';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
// Parse `--key=value` CLI flags into a plain object.
// Dashes in keys become underscores; a bare `--flag` becomes `true`.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...rest] = token.slice(2).split('=');
    parsed[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return parsed;
}
|
||||
|
||||
// Get environment variable with default
|
||||
// Read an environment variable, trimmed of whitespace.
// Falls back to defaultValue when unset OR empty (|| semantics).
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}
|
||||
|
||||
// Interpret an env var as a boolean.
// true/1/yes/on → true; false/0/no/off → false; anything else → defaultValue.
function getEnvBool(name, defaultValue = false) {
  switch (getEnv(name, '').toLowerCase()) {
    case 'true': case '1': case 'yes': case 'on':
      return true;
    case 'false': case '0': case 'no': case 'off':
      return false;
    default:
      return defaultValue;
  }
}
|
||||
|
||||
// Interpret an env var as a base-10 integer, falling back to
// defaultValue when the variable is unset or unparseable.
function getEnvInt(name, defaultValue = 0) {
  const parsed = Number.parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
|
||||
|
||||
// Check if staticfile extractor already downloaded this URL
|
||||
// Directory where the staticfile extractor writes its output.
const STATICFILE_DIR = 'staticfile';

// True when the staticfile extractor already produced at least one file,
// in which case browser-based extraction of the same URL is redundant.
function hasStaticFileOutput() {
  if (!fs.existsSync(STATICFILE_DIR)) {
    return false;
  }
  return fs.readdirSync(STATICFILE_DIR).length > 0;
}
|
||||
|
||||
// Get CDP URL from chrome_session if available
|
||||
// Return the CDP websocket URL written by the chrome_session extractor,
// or null when no session file exists.
function getCdpUrl() {
  const sessionFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(sessionFile)) {
    return null;
  }
  return fs.readFileSync(sessionFile, 'utf8').trim();
}
|
||||
|
||||
// Find Chrome binary
|
||||
// Locate a Chrome/Chromium executable.
// Checks the CHROME_BINARY env var first, then well-known install paths
// on Linux and macOS. Returns null when nothing is found.
function findChrome() {
  const chromeBinary = getEnv('CHROME_BINARY');
  if (chromeBinary && fs.existsSync(chromeBinary)) {
    return chromeBinary;
  }

  const candidates = [
    // Linux
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/usr/bin/chromium',
    '/usr/bin/chromium-browser',
    // macOS
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
  ];

  const found = candidates.find(c => c.startsWith('/') && fs.existsSync(c));
  return found || null;
}
|
||||
|
||||
// Parse resolution string
|
||||
// Parse a "WIDTH,HEIGHT" resolution string.
// Missing or unparseable parts fall back to 1440x2000.
function parseResolution(resolution) {
  const parts = resolution.split(',').map(part => parseInt(part.trim(), 10));
  return {
    width: parts[0] || 1440,
    height: parts[1] || 2000,
  };
}
|
||||
|
||||
// Capture the rendered DOM of `url` as HTML and write dom/output.html.
// Reuses the shared chrome_session via CDP when available (that page is
// assumed to have navigated already); otherwise launches a fresh Chrome
// and navigates itself.
async function dumpDom(url) {
  const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
  const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
  const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
  const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
  const headless = getEnvBool('CHROME_HEADLESS', true);

  const { width, height } = parseResolution(resolution);

  // Ensure the output directory exists.
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

  let browser = null;
  let page = null;
  let connectedToSession = false;

  try {
    // First choice: attach to the already-running chrome_session browser.
    const cdpUrl = getCdpUrl();
    if (cdpUrl) {
      try {
        browser = await puppeteer.connect({
          browserWSEndpoint: cdpUrl,
          defaultViewport: { width, height },
        });
        connectedToSession = true;

        // Prefer the page that already navigated; fall back to a new tab.
        const pages = await browser.pages();
        page = pages.find(p => p.url().startsWith('http')) || pages[0];
        if (!page) {
          page = await browser.newPage();
        }
        await page.setViewport({ width, height });
      } catch (e) {
        console.error(`Failed to connect to CDP session: ${e.message}`);
        browser = null;
      }
    }

    // Second choice: launch our own Chrome and navigate to the URL.
    if (!browser) {
      const executablePath = findChrome();
      if (!executablePath) {
        return { success: false, error: 'Chrome binary not found' };
      }

      browser = await puppeteer.launch({
        executablePath,
        headless: headless ? 'new' : false,
        args: [
          '--no-sandbox',
          '--disable-setuid-sandbox',
          '--disable-dev-shm-usage',
          '--disable-gpu',
          `--window-size=${width},${height}`,
          ...(checkSsl ? [] : ['--ignore-certificate-errors']),
        ],
        defaultViewport: { width, height },
      });

      page = await browser.newPage();
      if (userAgent) {
        await page.setUserAgent(userAgent);
      }
      await page.goto(url, {
        waitUntil: 'networkidle2',
        timeout,
      });
    }

    // Serialize the full rendered DOM.
    const domContent = await page.content();
    if (!domContent || domContent.length <= 100) {
      return { success: false, error: 'DOM content too short or empty' };
    }

    fs.writeFileSync(outputPath, domContent, 'utf8');
    return { success: true, output: outputPath };

  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Leave shared sessions running; only close browsers we launched.
    if (browser && !connectedToSession) {
      await browser.close();
    }
  }
}
|
||||
|
||||
// Entry point: handle skip conditions, run the DOM dump, and print the
// KEY=value + RESULT_JSON summary lines that hooks.py parses from stdout.
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;

  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__23_dom.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }

  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';

  // Shared emitter for the permanent-skip paths (feature off / staticfile).
  const printSkip = (reason) => {
    console.log(reason);
    console.log(`START_TS=${startTs.toISOString()}`);
    console.log(`END_TS=${new Date().toISOString()}`);
    console.log(`STATUS=skipped`);
    console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status: 'skipped', url, snapshot_id: snapshotId})}`);
    process.exit(0);
  };

  try {
    if (!getEnvBool('SAVE_DOM', true)) {
      printSkip('Skipping DOM (SAVE_DOM=False)'); // permanent skip - feature disabled
    }
    if (hasStaticFileOutput()) {
      printSkip(`Skipping DOM - staticfile extractor already downloaded this`); // permanent skip
    }

    const result = await dumpDom(url);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      const size = fs.statSync(output).size;
      console.log(`DOM saved (${size} bytes)`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }

  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;

  // Summary lines parsed by the orchestrator.
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);

  if (error) {
    console.error(`ERROR=${error}`);
  }

  // Structured JSON summary for machine parsing.
  console.log(`RESULT_JSON=${JSON.stringify({
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  })}`);

  process.exit(status === 'succeeded' ? 0 : 1);
}

// Top-level runner: any unhandled rejection is fatal for the extractor.
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});
|
||||
70
archivebox/plugins/env/on_Dependency__install_using_env_provider.py
vendored
Normal file
70
archivebox/plugins/env/on_Dependency__install_using_env_provider.py
vendored
Normal file
@@ -0,0 +1,70 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Check if a binary is already available in the system PATH.
|
||||
|
||||
This is the simplest "provider" - it doesn't install anything,
|
||||
it just discovers binaries that are already installed.
|
||||
|
||||
Usage: on_Dependency__install_using_env_provider.py --dependency-id=<uuid> --bin-name=<name>
|
||||
Output: InstalledBinary JSONL record to stdout if binary found in PATH
|
||||
|
||||
Environment variables:
|
||||
MACHINE_ID: Machine UUID (set by orchestrator)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
import rich_click as click
|
||||
from abx_pkg import Binary, EnvProvider
|
||||
|
||||
|
||||
@click.command()
@click.option('--dependency-id', required=True, help="Dependency UUID")
@click.option('--bin-name', required=True, help="Binary name to find")
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
def main(dependency_id: str, bin_name: str, bin_providers: str):
    """Check if binary is available in PATH and record it."""

    # Skip quietly when the env provider is excluded from the allowed list.
    if bin_providers != '*' and 'env' not in bin_providers.split(','):
        click.echo(f"env provider not allowed for {bin_name}", err=True)
        sys.exit(0)  # Not an error, just skip

    # Discover the binary on PATH via abx-pkg's EnvProvider.
    provider = EnvProvider()
    try:
        binary = Binary(name=bin_name, binproviders=[provider]).load()
    except Exception as e:
        click.echo(f"{bin_name} not found in PATH: {e}", err=True)
        sys.exit(1)

    if not binary.abspath:
        click.echo(f"{bin_name} not found in PATH", err=True)
        sys.exit(1)

    machine_id = os.environ.get('MACHINE_ID', '')

    # Emit the InstalledBinary record as JSONL on stdout for the orchestrator.
    print(json.dumps({
        'type': 'InstalledBinary',
        'name': bin_name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'env',
        'machine_id': machine_id,
        'dependency_id': dependency_id,
    }))

    # Human-readable summary goes to stderr.
    click.echo(f"Found {bin_name} at {binary.abspath}", err=True)
    click.echo(f"  version: {binary.version}", err=True)

    sys.exit(0)


if __name__ == '__main__':
    main()
|
||||
395
archivebox/plugins/extractor_utils.py
Normal file
395
archivebox/plugins/extractor_utils.py
Normal file
@@ -0,0 +1,395 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Shared utilities for extractor hooks.
|
||||
|
||||
This module provides common functionality for all extractors to ensure
|
||||
consistent behavior, output format, error handling, and timing.
|
||||
|
||||
All extractors should:
|
||||
1. Import and use these utilities
|
||||
2. Output consistent metadata (CMD, VERSION, OUTPUT, timing)
|
||||
3. Write all files to $PWD
|
||||
4. Return proper exit codes (0=success, 1=failure)
|
||||
5. Be runnable standalone without any archivebox imports
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
# Static file extensions that generally don't need browser-based extraction
|
||||
# File extensions that are static assets and can generally be fetched
# directly, without a headless browser to render them.
STATIC_EXTENSIONS = (
    '.pdf', '.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.ico',
    '.mp4', '.mp3', '.m4a', '.webm', '.mkv', '.avi', '.mov',
    '.zip', '.tar', '.gz', '.bz2', '.xz', '.7z', '.rar',
    '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
    '.exe', '.dmg', '.apk', '.deb', '.rpm',
)


def is_static_file(url: str) -> bool:
    """Check if URL points to a static file that may not need browser extraction."""
    # Strip query string and fragment before matching the extension.
    path_only = url.lower().split('?')[0].split('#')[0]
    return path_only.endswith(STATIC_EXTENSIONS)
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
"""Get environment variable with default."""
|
||||
return os.environ.get(name, default).strip()
|
||||
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
|
||||
"""Get boolean environment variable."""
|
||||
val = get_env(name, '').lower()
|
||||
if val in ('true', '1', 'yes', 'on'):
|
||||
return True
|
||||
if val in ('false', '0', 'no', 'off'):
|
||||
return False
|
||||
return default
|
||||
|
||||
|
||||
def get_env_int(name: str, default: int = 0) -> int:
|
||||
"""Get integer environment variable."""
|
||||
try:
|
||||
return int(get_env(name, str(default)))
|
||||
except ValueError:
|
||||
return default
|
||||
|
||||
|
||||
def find_binary(bin_name: str, env_var: str | None = None) -> str | None:
|
||||
"""Find binary from environment variable or PATH."""
|
||||
if env_var:
|
||||
binary = get_env(env_var)
|
||||
if binary and os.path.isfile(binary):
|
||||
return binary
|
||||
return shutil.which(bin_name)
|
||||
|
||||
|
||||
def get_version(binary: str, version_args: list[str] | None = None) -> str:
|
||||
"""Get binary version string."""
|
||||
if not binary or not os.path.isfile(binary):
|
||||
return ''
|
||||
|
||||
args = version_args or ['--version']
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[binary] + args,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10
|
||||
)
|
||||
# Return first non-empty line, truncated
|
||||
for line in result.stdout.split('\n'):
|
||||
line = line.strip()
|
||||
if line:
|
||||
return line[:64]
|
||||
return ''
|
||||
except Exception:
|
||||
return ''
|
||||
|
||||
|
||||
class ExtractorResult:
|
||||
"""
|
||||
Tracks extractor execution and produces consistent output.
|
||||
|
||||
Usage:
|
||||
result = ExtractorResult(name='wget', url=url)
|
||||
result.cmd = ['wget', url]
|
||||
result.version = '1.21'
|
||||
|
||||
# ... do extraction ...
|
||||
|
||||
result.output = 'example.com/index.html'
|
||||
result.status = 'succeeded'
|
||||
result.finish()
|
||||
|
||||
sys.exit(result.exit_code)
|
||||
"""
|
||||
|
||||
def __init__(self, name: str, url: str, snapshot_id: str = ''):
|
||||
self.name = name
|
||||
self.url = url
|
||||
self.snapshot_id = snapshot_id
|
||||
self.start_ts = datetime.now(timezone.utc)
|
||||
self.end_ts: datetime | None = None
|
||||
|
||||
self.cmd: list[str] = []
|
||||
self.version: str = ''
|
||||
self.output: str | Path | None = None
|
||||
self.status: str = 'failed' # 'succeeded', 'failed', 'skipped'
|
||||
|
||||
self.stdout: str = ''
|
||||
self.stderr: str = ''
|
||||
self.returncode: int | None = None
|
||||
|
||||
self.error: str = ''
|
||||
self.hints: list[str] = []
|
||||
|
||||
# Dependency info for missing binary
|
||||
self.dependency_needed: str = ''
|
||||
self.bin_providers: str = ''
|
||||
|
||||
@property
|
||||
def duration(self) -> float:
|
||||
"""Duration in seconds."""
|
||||
if self.end_ts:
|
||||
return (self.end_ts - self.start_ts).total_seconds()
|
||||
return (datetime.now(timezone.utc) - self.start_ts).total_seconds()
|
||||
|
||||
@property
|
||||
def exit_code(self) -> int:
|
||||
"""Exit code based on status."""
|
||||
if self.status == 'succeeded':
|
||||
return 0
|
||||
if self.status == 'skipped':
|
||||
return 0 # Skipped is not a failure
|
||||
return 1
|
||||
|
||||
def finish(self, status: str | None = None):
|
||||
"""Mark extraction as finished and print results."""
|
||||
self.end_ts = datetime.now(timezone.utc)
|
||||
if status:
|
||||
self.status = status
|
||||
self._print_results()
|
||||
|
||||
def _print_results(self):
|
||||
"""Print consistent output for hooks.py to parse."""
|
||||
import sys
|
||||
|
||||
# Print timing
|
||||
print(f"START_TS={self.start_ts.isoformat()}")
|
||||
print(f"END_TS={self.end_ts.isoformat() if self.end_ts else ''}")
|
||||
print(f"DURATION={self.duration:.2f}")
|
||||
|
||||
# Print command info
|
||||
if self.cmd:
|
||||
print(f"CMD={' '.join(str(c) for c in self.cmd)}")
|
||||
if self.version:
|
||||
print(f"VERSION={self.version}")
|
||||
|
||||
# Print output path
|
||||
if self.output:
|
||||
print(f"OUTPUT={self.output}")
|
||||
|
||||
# Print status
|
||||
print(f"STATUS={self.status}")
|
||||
|
||||
# Print dependency info if needed
|
||||
if self.dependency_needed:
|
||||
print(f"DEPENDENCY_NEEDED={self.dependency_needed}", file=sys.stderr)
|
||||
if self.bin_providers:
|
||||
print(f"BIN_PROVIDERS={self.bin_providers}", file=sys.stderr)
|
||||
|
||||
# Print error info
|
||||
if self.error:
|
||||
print(f"ERROR={self.error}", file=sys.stderr)
|
||||
for hint in self.hints:
|
||||
print(f"HINT={hint}", file=sys.stderr)
|
||||
|
||||
# Print JSON result for structured parsing
|
||||
result_json = {
|
||||
'extractor': self.name,
|
||||
'url': self.url,
|
||||
'snapshot_id': self.snapshot_id,
|
||||
'status': self.status,
|
||||
'start_ts': self.start_ts.isoformat(),
|
||||
'end_ts': self.end_ts.isoformat() if self.end_ts else None,
|
||||
'duration': round(self.duration, 2),
|
||||
'cmd': self.cmd,
|
||||
'cmd_version': self.version,
|
||||
'output': str(self.output) if self.output else None,
|
||||
'returncode': self.returncode,
|
||||
'error': self.error or None,
|
||||
}
|
||||
print(f"RESULT_JSON={json.dumps(result_json)}")
|
||||
|
||||
|
||||
def run_shell_command(
    cmd: list[str],
    cwd: str | Path | None = None,
    timeout: int = 60,
    result: ExtractorResult | None = None,
) -> subprocess.CompletedProcess:
    """Run cmd with output capture and a timeout, mirroring results into `result`.

    When a result object is given, its stdout/stderr/returncode (and error on
    failure) are populated. Exceptions are never swallowed: TimeoutExpired and
    any other subprocess error are re-raised after annotating `result`.
    """
    workdir = str(cwd or Path.cwd())

    def _decode(raw) -> str:
        # TimeoutExpired may carry None for streams that produced no data.
        return raw.decode('utf-8', errors='replace') if raw else ''

    try:
        proc = subprocess.run(cmd, cwd=workdir, capture_output=True, timeout=timeout)
    except subprocess.TimeoutExpired as exc:
        if result:
            result.error = f"Command timed out after {timeout} seconds"
            result.stdout = _decode(exc.stdout)
            result.stderr = _decode(exc.stderr)
        raise
    except Exception as exc:
        if result:
            result.error = f"{type(exc).__name__}: {exc}"
        raise

    if result:
        result.stdout = proc.stdout.decode('utf-8', errors='replace')
        result.stderr = proc.stderr.decode('utf-8', errors='replace')
        result.returncode = proc.returncode

    return proc
|
||||
|
||||
|
||||
def chrome_args(
|
||||
headless: bool = True,
|
||||
sandbox: bool = False,
|
||||
resolution: str = '1440,900',
|
||||
user_agent: str = '',
|
||||
check_ssl: bool = True,
|
||||
user_data_dir: str = '',
|
||||
profile_name: str = 'Default',
|
||||
extra_args: list[str] | None = None,
|
||||
) -> list[str]:
|
||||
"""
|
||||
Build Chrome/Chromium command line arguments.
|
||||
|
||||
Based on the old CHROME_CONFIG.chrome_args() implementation.
|
||||
"""
|
||||
args = [
|
||||
# Disable unnecessary features
|
||||
'--disable-sync',
|
||||
'--no-pings',
|
||||
'--no-first-run',
|
||||
'--no-default-browser-check',
|
||||
'--disable-default-apps',
|
||||
'--disable-infobars',
|
||||
'--disable-blink-features=AutomationControlled',
|
||||
|
||||
# Deterministic behavior
|
||||
'--js-flags=--random-seed=1157259159',
|
||||
'--deterministic-mode',
|
||||
'--deterministic-fetch',
|
||||
|
||||
# Performance
|
||||
'--disable-background-networking',
|
||||
'--disable-background-timer-throttling',
|
||||
'--disable-backgrounding-occluded-windows',
|
||||
'--disable-renderer-backgrounding',
|
||||
'--disable-ipc-flooding-protection',
|
||||
|
||||
# Disable prompts/popups
|
||||
'--deny-permission-prompts',
|
||||
'--disable-notifications',
|
||||
'--disable-popup-blocking',
|
||||
'--noerrdialogs',
|
||||
|
||||
# Security/privacy
|
||||
'--disable-client-side-phishing-detection',
|
||||
'--disable-domain-reliability',
|
||||
'--disable-component-update',
|
||||
'--safebrowsing-disable-auto-update',
|
||||
'--password-store=basic',
|
||||
'--use-mock-keychain',
|
||||
|
||||
# GPU/rendering
|
||||
'--force-gpu-mem-available-mb=4096',
|
||||
'--font-render-hinting=none',
|
||||
'--force-color-profile=srgb',
|
||||
'--disable-partial-raster',
|
||||
'--disable-skia-runtime-opts',
|
||||
'--disable-2d-canvas-clip-aa',
|
||||
'--disable-lazy-loading',
|
||||
|
||||
# Media
|
||||
'--use-fake-device-for-media-stream',
|
||||
'--disable-gesture-requirement-for-media-playback',
|
||||
]
|
||||
|
||||
if headless:
|
||||
args.append('--headless=new')
|
||||
|
||||
if not sandbox:
|
||||
args.extend([
|
||||
'--no-sandbox',
|
||||
'--no-zygote',
|
||||
'--disable-dev-shm-usage',
|
||||
'--disable-software-rasterizer',
|
||||
])
|
||||
|
||||
if resolution:
|
||||
args.append(f'--window-size={resolution}')
|
||||
|
||||
if not check_ssl:
|
||||
args.extend([
|
||||
'--disable-web-security',
|
||||
'--ignore-certificate-errors',
|
||||
])
|
||||
|
||||
if user_agent:
|
||||
args.append(f'--user-agent={user_agent}')
|
||||
|
||||
if user_data_dir:
|
||||
args.append(f'--user-data-dir={user_data_dir}')
|
||||
args.append(f'--profile-directory={profile_name}')
|
||||
|
||||
if extra_args:
|
||||
args.extend(extra_args)
|
||||
|
||||
return args
|
||||
|
||||
|
||||
def chrome_cleanup_lockfile(user_data_dir: str | Path):
|
||||
"""Remove Chrome SingletonLock file that can prevent browser from starting."""
|
||||
if not user_data_dir:
|
||||
return
|
||||
lockfile = Path(user_data_dir) / 'SingletonLock'
|
||||
try:
|
||||
lockfile.unlink(missing_ok=True)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
# Candidate Chrome/Chromium executable names, probed in order on $PATH.
CHROME_BINARY_NAMES = [
    'google-chrome',
    'google-chrome-stable',
    'chromium',
    'chromium-browser',
    'chrome',
]

# Absolute app-bundle locations probed on macOS (not on $PATH by default).
CHROME_BINARY_NAMES_MACOS = [
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
]
|
||||
|
||||
|
||||
def find_chrome() -> str | None:
    """Locate a Chrome/Chromium executable.

    Resolution order: the $CHROME_BINARY override, then well-known names on
    $PATH, then standard macOS application-bundle paths. Returns the path,
    or None when no browser could be found.
    """
    configured = get_env('CHROME_BINARY')
    if configured and os.path.isfile(configured):
        return configured

    for candidate in CHROME_BINARY_NAMES:
        located = shutil.which(candidate)
        if located:
            return located

    for macos_path in CHROME_BINARY_NAMES_MACOS:
        if os.path.isfile(macos_path):
            return macos_path

    return None
|
||||
31
archivebox/plugins/favicon/config.json
Normal file
31
archivebox/plugins/favicon/config.json
Normal file
@@ -0,0 +1,31 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SAVE_FAVICON": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Enable favicon downloading"
|
||||
},
|
||||
"FAVICON_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 30,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for favicon fetch in seconds"
|
||||
},
|
||||
"FAVICON_USER_AGENT": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-fallback": "USER_AGENT",
|
||||
"description": "User agent string"
|
||||
},
|
||||
"FAVICON_CHECK_SSL_VALIDITY": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-fallback": "CHECK_SSL_VALIDITY",
|
||||
"description": "Whether to verify SSL certificates"
|
||||
}
|
||||
}
|
||||
}
|
||||
169
archivebox/plugins/favicon/on_Snapshot__11_favicon.py
Normal file
169
archivebox/plugins/favicon/on_Snapshot__11_favicon.py
Normal file
@@ -0,0 +1,169 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract favicon from a URL.
|
||||
|
||||
Usage: on_Snapshot__favicon.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Writes favicon.ico to $PWD
|
||||
|
||||
Environment variables:
|
||||
TIMEOUT: Timeout in seconds (default: 30)
|
||||
USER_AGENT: User agent string
|
||||
|
||||
Note: This extractor uses the 'requests' library which is bundled with ArchiveBox.
|
||||
It can run standalone if requests is installed: pip install requests
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'favicon'
|
||||
OUTPUT_DIR = 'favicon'
|
||||
OUTPUT_FILE = 'favicon.ico'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
    """Read an environment variable (default fallback), stripping whitespace."""
    return os.environ.get(name, default).strip()


def get_env_int(name: str, default: int = 0) -> int:
    """Read an integer environment variable; non-numeric values yield default."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
|
||||
|
||||
|
||||
def get_favicon(url: str) -> tuple[bool, str | None, str]:
    """Download the site's favicon into OUTPUT_FILE in the current directory.

    Candidate sources, in priority order:
      1. <link rel="icon"> hrefs scraped from the page HTML
      2. conventional paths (/favicon.ico, /favicon.png, /apple-touch-icon.png)
      3. Google's s2/favicons service as a last resort

    Returns:
        (success, output_path, error_message)
    """
    try:
        import requests
    except ImportError:
        return False, None, 'requests library not installed'

    timeout = get_env_int('TIMEOUT', 30)
    user_agent = get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    headers = {'User-Agent': user_agent}

    parsed = urlparse(url)
    base_url = f"{parsed.scheme}://{parsed.netloc}"

    # Conventional locations (lowest priority; scraped <link> hrefs are
    # inserted ahead of these below).
    candidates = [
        urljoin(base_url, '/favicon.ico'),
        urljoin(base_url, '/favicon.png'),
        urljoin(base_url, '/apple-touch-icon.png'),
    ]

    # Scrape <link rel=icon> tags, covering both attribute orders. Any
    # failure here just leaves us with the conventional candidates.
    link_patterns = (
        r'<link[^>]+rel=["\'](?:shortcut )?icon["\'][^>]+href=["\']([^"\']+)["\']',
        r'<link[^>]+href=["\']([^"\']+)["\'][^>]+rel=["\'](?:shortcut )?icon["\']',
    )
    try:
        response = requests.get(url, timeout=timeout, headers=headers)
        if response.ok:
            for pattern in link_patterns:
                for match in re.finditer(pattern, response.text, re.I):
                    candidates.insert(0, urljoin(url, match.group(1)))
    except Exception:
        pass  # best-effort: continue with the default candidate list

    # First candidate that returns a non-empty body wins.
    for favicon_url in candidates:
        try:
            response = requests.get(favicon_url, timeout=15, headers=headers)
            if response.ok and len(response.content) > 0:
                Path(OUTPUT_FILE).write_bytes(response.content)
                return True, OUTPUT_FILE, ''
        except Exception:
            continue

    # Last resort: Google's favicon service serves an icon per domain.
    try:
        google_url = f'https://www.google.com/s2/favicons?domain={parsed.netloc}'
        response = requests.get(google_url, timeout=15, headers=headers)
        if response.ok and len(response.content) > 0:
            Path(OUTPUT_FILE).write_bytes(response.content)
            return True, OUTPUT_FILE, ''
    except Exception:
        pass

    return False, None, 'No favicon found'
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL to extract favicon from')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Extract favicon from a URL.

    Exit code 0 on success, 1 on failure. Emits the KEY=VALUE lines and the
    RESULT_JSON line that the plugin runner parses from stdout.
    """
    start_ts = datetime.now(timezone.utc)
    output = None
    status = 'failed'
    error = ''

    try:
        success, output, error = get_favicon(url)
        status = 'succeeded' if success else 'failed'
        if success:
            size = Path(output).stat().st_size
            print(f'Favicon saved ({size} bytes)')
    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()

    # KEY=VALUE summary lines on stdout; error details on stderr.
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)

    # Structured summary for programmatic parsers.
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')

    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()
|
||||
262
archivebox/plugins/favicon/tests/test_favicon.py
Normal file
262
archivebox/plugins/favicon/tests/test_favicon.py
Normal file
@@ -0,0 +1,262 @@
|
||||
"""
|
||||
Integration tests for favicon plugin
|
||||
|
||||
Tests verify:
|
||||
1. Plugin script exists
|
||||
2. requests library is available
|
||||
3. Favicon extraction works for real example.com
|
||||
4. Output file is actual image data
|
||||
5. Tries multiple favicon URLs
|
||||
6. Falls back to Google's favicon service
|
||||
7. Config options work (TIMEOUT, USER_AGENT)
|
||||
8. Handles failures gracefully
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
import os

PLUGIN_DIR = Path(__file__).parent.parent
FAVICON_HOOK = PLUGIN_DIR / 'on_Snapshot__11_favicon.py'
TEST_URL = 'https://example.com'


def _requests_available() -> bool:
    """True when the current interpreter can import the 'requests' library."""
    check = subprocess.run(
        [sys.executable, '-c', 'import requests'],
        capture_output=True,
    )
    return check.returncode == 0


def _run_hook(url, snapshot_id, cwd, env=None, timeout=60):
    """Run the favicon hook in cwd as a subprocess, as the plugin runner would."""
    return subprocess.run(
        [sys.executable, str(FAVICON_HOOK), '--url', url, '--snapshot-id', snapshot_id],
        cwd=cwd,
        capture_output=True,
        text=True,
        env=env,
        timeout=timeout,
    )


def _looks_like_image(data: bytes) -> bool:
    """True when data starts with ICO/PNG/GIF/JPEG/WebP magic bytes."""
    return (
        data[:4] == b'\x00\x00\x01\x00'        # ICO
        or data[:8] == b'\x89PNG\r\n\x1a\n'    # PNG
        or data[:3] == b'GIF'                  # GIF
        or data[:2] == b'\xff\xd8'             # JPEG
        or data[8:12] == b'WEBP'               # WebP
    )


def test_hook_script_exists():
    """Verify hook script exists."""
    assert FAVICON_HOOK.exists(), f"Hook script not found: {FAVICON_HOOK}"


def test_requests_library_available():
    """Test that requests library is available."""
    result = subprocess.run(
        [sys.executable, '-c', 'import requests; print(requests.__version__)'],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        pytest.skip("requests library not installed")
    assert len(result.stdout.strip()) > 0, "Should report requests version"


def test_extracts_favicon_from_example_com():
    """Test full workflow: extract favicon from real example.com.

    Note: example.com doesn't have a favicon and Google's service may also fail,
    so we test that the extraction completes and reports appropriate status.
    """
    if not _requests_available():
        pytest.skip("requests not installed")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        result = _run_hook(TEST_URL, 'test789', tmpdir)

        # May succeed (if Google service works) or fail (if no favicon).
        assert result.returncode in (0, 1), "Should complete extraction attempt"
        assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON"

        if result.returncode == 0:
            assert 'STATUS=succeeded' in result.stdout, "Should report success"
            assert 'Favicon saved' in result.stdout, "Should report completion"

            favicon_file = tmpdir / 'favicon.ico'
            assert favicon_file.exists(), "favicon.ico not created"

            file_size = favicon_file.stat().st_size
            assert file_size > 0, "Favicon file should not be empty"
            assert file_size < 1024 * 1024, f"Favicon file suspiciously large: {file_size} bytes"

            favicon_data = favicon_file.read_bytes()
            assert _looks_like_image(favicon_data), "Favicon file should be a valid image format"
        else:
            # Failed as expected.
            assert 'STATUS=failed' in result.stdout
            assert 'No favicon found' in result.stdout or 'No favicon found' in result.stderr


def test_config_timeout_honored():
    """Test that TIMEOUT config is respected."""
    if not _requests_available():
        pytest.skip("requests not installed")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        env = os.environ.copy()
        env['TIMEOUT'] = '5'  # very short, but example.com should still respond

        result = _run_hook(TEST_URL, 'testtimeout', tmpdir, env=env, timeout=30)

        # Should complete (success or fail, but not hang).
        assert result.returncode in (0, 1), "Should complete without hanging"


def test_config_user_agent():
    """Test that USER_AGENT config is used."""
    if not _requests_available():
        pytest.skip("requests not installed")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        env = os.environ.copy()
        env['USER_AGENT'] = 'TestBot/1.0'

        result = _run_hook(TEST_URL, 'testua', tmpdir, env=env)

        # Should succeed (example.com doesn't block custom agents).
        if result.returncode == 0:
            assert 'STATUS=succeeded' in result.stdout


def test_handles_https_urls():
    """Test that HTTPS URLs work correctly."""
    if not _requests_available():
        pytest.skip("requests not installed")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        result = _run_hook('https://example.org', 'testhttps', tmpdir)

        if result.returncode == 0:
            favicon_file = tmpdir / 'favicon.ico'
            if favicon_file.exists():
                assert favicon_file.stat().st_size > 0


def test_handles_missing_favicon_gracefully():
    """Test that favicon plugin handles sites without favicons gracefully.

    Note: The plugin falls back to Google's favicon service, which generates
    a generic icon even if the site doesn't have one, so extraction usually succeeds.
    """
    if not _requests_available():
        pytest.skip("requests not installed")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)
        result = _run_hook('https://example.com/nonexistent', 'test404', tmpdir)

        # May succeed (Google fallback) or fail gracefully.
        assert result.returncode in (0, 1), "Should complete (may succeed or fail)"

        if result.returncode != 0:
            combined = result.stdout + result.stderr
            assert 'No favicon found' in combined or 'ERROR=' in combined


def test_reports_missing_requests_library():
    """Test that script reports error when requests library is missing."""
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # -S plus a bogus PYTHONPATH simulates an interpreter without requests.
        env = os.environ.copy()
        env['PYTHONPATH'] = '/nonexistent'

        result = subprocess.run(
            [sys.executable, '-S', str(FAVICON_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
        )

        # Should fail and report missing requests (or a related import error).
        if result.returncode != 0:
            combined = result.stdout + result.stderr
            assert 'requests' in combined.lower() or 'import' in combined.lower() or 'ERROR=' in combined


if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||
40
archivebox/plugins/git/config.json
Normal file
40
archivebox/plugins/git/config.json
Normal file
@@ -0,0 +1,40 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SAVE_GIT": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Enable git repository cloning"
|
||||
},
|
||||
"GIT_BINARY": {
|
||||
"type": "string",
|
||||
"default": "git",
|
||||
"description": "Path to git binary"
|
||||
},
|
||||
"GIT_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 120,
|
||||
"minimum": 10,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for git operations in seconds"
|
||||
},
|
||||
"GIT_DOMAINS": {
|
||||
"type": "string",
|
||||
"default": "github.com,gitlab.com,bitbucket.org,gist.github.com,codeberg.org,gitea.com,git.sr.ht",
|
||||
"description": "Comma-separated list of domains to treat as git repositories"
|
||||
},
|
||||
"GIT_CLONE_DEPTH": {
|
||||
"type": "integer",
|
||||
"default": 1,
|
||||
"minimum": 0,
|
||||
"description": "Depth of git clone (0 for full history, 1 for shallow)"
|
||||
},
|
||||
"GIT_EXTRA_ARGS": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"description": "Extra arguments for git clone"
|
||||
}
|
||||
}
|
||||
}
|
||||
126
archivebox/plugins/git/on_Crawl__00_validate_git.py
Normal file
126
archivebox/plugins/git/on_Crawl__00_validate_git.py
Normal file
@@ -0,0 +1,126 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validation hook for git binary.
|
||||
|
||||
Runs at crawl start to verify git is available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str) -> str | None:
|
||||
"""Get version string from binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
# git version string: "git version 2.43.0"
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
parts = first_line.split()
|
||||
if len(parts) >= 3 and parts[0] == 'git':
|
||||
return parts[2]
|
||||
return first_line[:32]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_git() -> dict | None:
    """Locate the git binary, preferring abx_pkg metadata over a $PATH lookup.

    Returns a dict with name/abspath/version/sha256/binprovider keys, or None
    when git cannot be found at all.
    """
    # Preferred path: let abx_pkg resolve the binary plus its metadata.
    try:
        from abx_pkg import Binary, EnvProvider

        class GitBinary(Binary):
            name: str = 'git'
            binproviders_supported = [EnvProvider()]

        loaded = GitBinary().load()
        if loaded and loaded.abspath:
            return {
                'name': 'git',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
                'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
            }
    except Exception:
        # abx_pkg missing or failed; fall through to the $PATH lookup below.
        pass

    # Fallback: $PATH, then the GIT_BINARY environment override.
    abspath = shutil.which('git') or os.environ.get('GIT_BINARY', '')
    if abspath and Path(abspath).is_file():
        return {
            'name': 'git',
            'abspath': abspath,
            'version': get_binary_version(abspath),
            'sha256': get_binary_hash(abspath),
            'binprovider': 'env',
        }

    return None
|
||||
|
||||
|
||||
def main():
    """Emit JSONL records describing the git binary, or a Dependency request.

    Exit code 0 when git was found, 1 otherwise. One JSON object per stdout
    line (InstalledBinary + Machine config updates, or a Dependency record).
    """
    found = find_git()

    if not (found and found.get('abspath')):
        # git is missing: ask the orchestrator to install it, then fail.
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'git',
            'bin_providers': 'apt,brew,env',
        }))
        print(f"git binary not found", file=sys.stderr)
        sys.exit(1)

    print(json.dumps({
        'type': 'InstalledBinary',
        'name': found['name'],
        'abspath': found['abspath'],
        'version': found['version'],
        'sha256': found['sha256'],
        'binprovider': found['binprovider'],
    }))

    print(json.dumps({
        'type': 'Machine',
        '_method': 'update',
        'key': 'config/GIT_BINARY',
        'value': found['abspath'],
    }))

    if found['version']:
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/GIT_VERSION',
            'value': found['version'],
        }))

    sys.exit(0)


if __name__ == '__main__':
    main()
|
||||
193
archivebox/plugins/git/on_Snapshot__12_git.py
Normal file
193
archivebox/plugins/git/on_Snapshot__12_git.py
Normal file
@@ -0,0 +1,193 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Clone a git repository from a URL.
|
||||
|
||||
Usage: on_Snapshot__git.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Clones repository to $PWD/repo
|
||||
|
||||
Environment variables:
|
||||
GIT_BINARY: Path to git binary
|
||||
TIMEOUT: Timeout in seconds (default: 120)
|
||||
GIT_ARGS: Extra arguments for git clone (space-separated)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'git'
|
||||
BIN_NAME = 'git'
|
||||
BIN_PROVIDERS = 'apt,brew,env'
|
||||
OUTPUT_DIR = 'repo'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
    """Look up an environment variable, stripped of surrounding whitespace."""
    value = os.environ.get(name, default)
    return value.strip()


def get_env_int(name: str, default: int = 0) -> int:
    """Look up an integer environment variable; fall back on parse failure."""
    try:
        return int(get_env(name, str(default)))
    except ValueError:
        return default
|
||||
|
||||
|
||||
def is_git_url(url: str) -> bool:
    """Heuristically decide whether a URL points at a git repository.

    Matches:
      - git:// and ssh://git@ scheme URLs
      - URLs whose path ends in .git
      - URLs hosted on a known git domain (configurable via $GIT_DOMAINS,
        defaulting to the list from this plugin's config.json)

    Unlike a plain substring scan, this parses the URL, so e.g.
    'https://evil.example/github.com/x' or a '.github' path segment no
    longer count as git repositories.
    """
    from urllib.parse import urlparse

    lowered = url.strip().lower()

    # Explicit git transports.
    if lowered.startswith(('git://', 'ssh://git@')):
        return True

    parsed = urlparse(lowered)
    host = parsed.hostname or ''
    path = parsed.path or ''

    # Bare-repo style URLs: https://host/whatever/repo.git
    if path.rstrip('/').endswith('.git'):
        return True

    # Known git hosting domains (exact host or subdomain match).
    default_domains = 'github.com,gitlab.com,bitbucket.org,gist.github.com,codeberg.org,gitea.com,git.sr.ht'
    domains = [d.strip() for d in os.environ.get('GIT_DOMAINS', default_domains).split(',') if d.strip()]
    return any(host == domain or host.endswith('.' + domain) for domain in domains)
|
||||
|
||||
|
||||
def find_git() -> str | None:
|
||||
"""Find git binary."""
|
||||
git = get_env('GIT_BINARY')
|
||||
if git and os.path.isfile(git):
|
||||
return git
|
||||
|
||||
return shutil.which('git')
|
||||
|
||||
|
||||
def get_version(binary: str) -> str:
|
||||
"""Get git version."""
|
||||
try:
|
||||
result = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
|
||||
return result.stdout.strip()[:64]
|
||||
except Exception:
|
||||
return ''
|
||||
|
||||
|
||||
def clone_git(url: str, binary: str) -> tuple[bool, str | None, str]:
    """Shallow-clone url into OUTPUT_DIR using the given git binary.

    Honors $TIMEOUT (seconds, default 120) and $GIT_ARGS (extra
    space-separated flags inserted before the URL).

    Returns:
        (success, output_path, error_message)
    """
    timeout = get_env_int('TIMEOUT', 120)
    extra_args = get_env('GIT_ARGS')

    cmd = [binary, 'clone', '--depth=1', '--recursive']
    if extra_args:
        cmd += extra_args.split()
    cmd += [url, OUTPUT_DIR]

    try:
        proc = subprocess.run(cmd, capture_output=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'

    if proc.returncode == 0 and Path(OUTPUT_DIR).is_dir():
        return True, OUTPUT_DIR, ''

    stderr = proc.stderr.decode('utf-8', errors='replace')
    return False, None, f'git clone failed: {stderr[:200]}'
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='Git repository URL')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Clone a git repository from a URL.

    Exits 0 on success or skip (non-git URL), 1 on failure or when the git
    binary is missing. Emits KEY=VALUE lines plus RESULT_JSON for the runner.
    """
    start_ts = datetime.now(timezone.utc)
    version = ''
    output = None
    status = 'failed'
    error = ''
    binary = None

    try:
        # Non-git URLs are skipped outright with a successful exit code.
        if not is_git_url(url):
            print(f'Skipping git clone for non-git URL: {url}')
            status = 'skipped'
            end_ts = datetime.now(timezone.utc)
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={end_ts.isoformat()}')
            print(f'STATUS={status}')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url})}')
            sys.exit(0)

        # Resolve the git binary; signal the missing dependency otherwise.
        binary = find_git()
        if not binary:
            print(f'ERROR: git binary not found', file=sys.stderr)
            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
            sys.exit(1)

        version = get_version(binary)

        success, output, error = clone_git(url, binary)
        status = 'succeeded' if success else 'failed'
        if success:
            print(f'git clone completed')
    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()

    # KEY=VALUE summary lines on stdout; error details on stderr.
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if binary:
        print(f'CMD={binary} clone {url}')
    if version:
        print(f'VERSION={version}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)

    # Structured summary for programmatic parsers.
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')

    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()
|
||||
203
archivebox/plugins/headers/on_Snapshot__33_headers.js
Normal file
203
archivebox/plugins/headers/on_Snapshot__33_headers.js
Normal file
@@ -0,0 +1,203 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Extract HTTP response headers for a URL.
|
||||
*
|
||||
* If a Chrome session exists (from chrome_session extractor), reads the captured
|
||||
* response headers from chrome_session/response_headers.json.
|
||||
* Otherwise falls back to making an HTTP HEAD request.
|
||||
*
|
||||
* Usage: on_Snapshot__12_headers.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes headers/headers.json
|
||||
*
|
||||
* Environment variables:
|
||||
* TIMEOUT: Timeout in seconds (default: 30)
|
||||
* USER_AGENT: User agent string (optional)
|
||||
* CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const https = require('https');
|
||||
const http = require('http');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'headers';
|
||||
const OUTPUT_DIR = 'headers';
|
||||
const OUTPUT_FILE = 'headers.json';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
const CHROME_HEADERS_FILE = 'response_headers.json';
|
||||
|
||||
// Parse `--key=value` command line flags into a plain object.
// Dashes in key names become underscores; a bare `--flag` (no value,
// or an empty value) is stored as boolean true.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const eqIdx = token.indexOf('=');
    const rawKey = eqIdx === -1 ? token.slice(2) : token.slice(2, eqIdx);
    const value = eqIdx === -1 ? '' : token.slice(eqIdx + 1);
    parsed[rawKey.replace(/-/g, '_')] = value || true;
  }
  return parsed;
}
|
||||
|
||||
// Read an environment variable, falling back to defaultValue when the
// variable is unset or empty, and strip surrounding whitespace.
function getEnv(name, defaultValue = '') {
  const raw = process.env[name];
  return (raw || defaultValue).trim();
}
|
||||
|
||||
// Interpret an environment variable as a boolean.
// Recognizes common truthy/falsy spellings (case-insensitive); anything
// else yields defaultValue.
function getEnvBool(name, defaultValue = false) {
  const val = getEnv(name, '').toLowerCase();
  const truthy = new Set(['true', '1', 'yes', 'on']);
  const falsy = new Set(['false', '0', 'no', 'off']);
  if (truthy.has(val)) return true;
  if (falsy.has(val)) return false;
  return defaultValue;
}
|
||||
|
||||
// Interpret an environment variable as a base-10 integer, falling back
// to defaultValue when unset or unparseable.
function getEnvInt(name, defaultValue = 0) {
  const parsed = Number.parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
|
||||
|
||||
// Load the response headers previously captured by the chrome_session
// extractor. Returns the parsed JSON object, or null when the capture
// file is missing or unreadable.
function getHeadersFromChromeSession() {
  const headersFile = path.join(CHROME_SESSION_DIR, CHROME_HEADERS_FILE);
  if (!fs.existsSync(headersFile)) return null;
  try {
    return JSON.parse(fs.readFileSync(headersFile, 'utf8'));
  } catch (e) {
    // Corrupt/partial capture file: treat the same as "not available".
    return null;
  }
}
|
||||
|
||||
/**
 * Fetch response headers via a live HTTP HEAD request (fallback path used
 * when no chrome_session capture exists).
 *
 * @param {string} url - URL to request.
 * @returns {Promise<{url: string, status: number, statusText: string, headers: object}>}
 *   Resolves with the response status and headers; rejects on network
 *   error, timeout, invalid URL, or (when CHECK_SSL_VALIDITY is true)
 *   an invalid TLS certificate.
 */
function fetchHeaders(url) {
  return new Promise((resolve, reject) => {
    const timeout = getEnvInt('TIMEOUT', 30) * 1000;
    const userAgent = getEnv('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)');
    // FIX: was getEnvBool('CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true)) -
    // the nested lookup was redundant; a single call with default=true is equivalent.
    const checkSsl = getEnvBool('CHECK_SSL_VALIDITY', true);

    // new URL() throws on malformed input; an exception thrown inside the
    // executor rejects the promise, which is the desired behavior.
    const parsedUrl = new URL(url);
    const client = parsedUrl.protocol === 'https:' ? https : http;

    const options = {
      method: 'HEAD',
      hostname: parsedUrl.hostname,
      port: parsedUrl.port || (parsedUrl.protocol === 'https:' ? 443 : 80),
      path: parsedUrl.pathname + parsedUrl.search,
      headers: { 'User-Agent': userAgent },
      timeout,                       // socket inactivity timeout (ms)
      rejectUnauthorized: checkSsl,  // enforce TLS cert validation unless disabled
    };

    const req = client.request(options, (res) => {
      resolve({
        url: url,
        status: res.statusCode,
        statusText: res.statusMessage,
        headers: res.headers,
      });
    });

    req.on('error', reject);
    req.on('timeout', () => {
      req.destroy();
      reject(new Error('Request timeout'));
    });

    req.end();
  });
}
|
||||
|
||||
// Extract headers for `url`, preferring the chrome_session capture and
// falling back to a live HTTP HEAD request.
// Writes headers/headers.json and reports which method produced it.
async function extractHeaders(url) {
  // Ensure the output directory exists before writing.
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

  // Preferred source: headers already captured by the chrome_session extractor.
  const chromeHeaders = getHeadersFromChromeSession();
  if (chromeHeaders?.headers) {
    fs.writeFileSync(outputPath, JSON.stringify(chromeHeaders, null, 2), 'utf8');
    return { success: true, output: outputPath, method: 'chrome_session', status: chromeHeaders.status };
  }

  // Fallback: make a live HTTP HEAD request.
  try {
    const fetched = await fetchHeaders(url);
    fs.writeFileSync(outputPath, JSON.stringify(fetched, null, 2), 'utf8');
    return { success: true, output: outputPath, method: 'http', status: fetched.status };
  } catch (e) {
    return { success: false, error: e.message };
  }
}
|
||||
|
||||
// CLI entry point: parse arguments, run the extraction, and emit the
// standard ArchiveBox extractor result lines (START_TS/END_TS/DURATION/
// OUTPUT/STATUS/ERROR/RESULT_JSON), exiting 0 on success and 1 otherwise.
async function main() {
  const args = parseArgs();
  const { url, snapshot_id: snapshotId } = args;

  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__12_headers.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }

  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';

  try {
    const result = await extractHeaders(url);
    if (result.success) {
      status = 'succeeded';
      output = result.output;
      console.log(`Headers extracted (${result.method}): HTTP ${result.status}`);
    } else {
      // status already 'failed'; just record the reason
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
  }

  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;

  // Machine-parsed summary lines
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);

  if (error) {
    console.error(`ERROR=${error}`);
  }

  // Structured result for the hook runner
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);

  process.exit(status === 'succeeded' ? 0 : 1);
}
|
||||
|
||||
// Top-level runner: any unhandled rejection becomes a fatal exit(1).
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});
|
||||
319
archivebox/plugins/headers/tests/test_headers.py
Normal file
319
archivebox/plugins/headers/tests/test_headers.py
Normal file
@@ -0,0 +1,319 @@
|
||||
"""
|
||||
Integration tests for headers plugin
|
||||
|
||||
Tests verify:
|
||||
1. Plugin script exists and is executable
|
||||
2. Node.js is available
|
||||
3. Headers extraction works for real example.com
|
||||
4. Output JSON contains actual HTTP headers
|
||||
5. Fallback to HTTP HEAD when chrome_session not available
|
||||
6. Uses chrome_session headers when available
|
||||
7. Config options work (TIMEOUT, USER_AGENT, CHECK_SSL_VALIDITY)
|
||||
"""
|
||||
|
||||
import json
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
HEADERS_HOOK = PLUGIN_DIR / 'on_Snapshot__33_headers.js'
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
|
||||
def test_hook_script_exists():
    """Verify hook script exists."""
    hook_path = HEADERS_HOOK
    assert hook_path.exists(), f"Hook script not found: {hook_path}"
|
||||
|
||||
|
||||
def test_node_is_available():
    """Test that Node.js is available on the system."""
    which_result = subprocess.run(
        ['which', 'node'],
        capture_output=True,
        text=True,
    )

    if which_result.returncode != 0:
        pytest.skip("node not installed on system")

    binary_path = which_result.stdout.strip()
    assert Path(binary_path).exists(), f"Binary should exist at {binary_path}"

    # Confirm the binary actually runs and reports a version like "v20.x.x".
    version_result = subprocess.run(
        ['node', '--version'],
        capture_output=True,
        text=True,
        timeout=10,
    )
    assert version_result.returncode == 0, f"node not executable: {version_result.stderr}"
    assert version_result.stdout.startswith('v'), f"Unexpected node version format: {version_result.stdout}"
|
||||
|
||||
|
||||
def test_extracts_headers_from_example_com():
    """Test full workflow: extract headers from real example.com."""

    # Skip when node is unavailable
    if not shutil.which('node'):
        pytest.skip("node not installed")

    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)

        # Run the extraction hook against the live site
        proc = subprocess.run(
            ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=60,
        )

        assert proc.returncode == 0, f"Extraction failed: {proc.stderr}"

        # Stdout must report success + completion
        assert 'STATUS=succeeded' in proc.stdout, "Should report success"
        assert 'Headers extracted' in proc.stdout, "Should report completion"

        # Output directory + file must be created
        headers_file = workdir / 'headers' / 'headers.json'
        assert (workdir / 'headers').exists(), "Output directory not created"
        assert headers_file.exists(), "headers.json not created"

        # The JSON must contain the REAL example.com response
        headers_data = json.loads(headers_file.read_text())

        assert 'url' in headers_data, "Should have url field"
        assert headers_data['url'] == TEST_URL, f"URL should be {TEST_URL}"

        assert 'status' in headers_data, "Should have status field"
        assert headers_data['status'] in (200, 301, 302), \
            f"Should have valid HTTP status, got {headers_data['status']}"

        assert 'headers' in headers_data, "Should have headers field"
        assert isinstance(headers_data['headers'], dict), "Headers should be a dict"
        assert len(headers_data['headers']) > 0, "Headers dict should not be empty"

        # At least one common HTTP header should be present
        lowered = {name.lower(): value for name, value in headers_data['headers'].items()}
        assert 'content-type' in lowered or 'content-length' in lowered, \
            "Should have at least one common HTTP header"

        # RESULT_JSON must be present and well-formed
        assert 'RESULT_JSON=' in proc.stdout, "Should output RESULT_JSON"
        result_line = next(
            line for line in proc.stdout.split('\n') if line.startswith('RESULT_JSON=')
        )
        result_json = json.loads(result_line.replace('RESULT_JSON=', ''))
        assert result_json['extractor'] == 'headers'
        assert result_json['status'] == 'succeeded'
        assert result_json['url'] == TEST_URL
        assert result_json['snapshot_id'] == 'test789'
        assert 'duration' in result_json
        assert result_json['duration'] >= 0
|
||||
|
||||
|
||||
def test_uses_chrome_session_headers_when_available():
    """Test that headers plugin prefers chrome_session headers over HTTP HEAD."""

    if not shutil.which('node'):
        pytest.skip("node not installed")

    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)

        # Seed a mock chrome_session capture for the hook to pick up
        session_dir = workdir / 'chrome_session'
        session_dir.mkdir()
        mock_headers = {
            'url': TEST_URL,
            'status': 200,
            'statusText': 'OK',
            'headers': {
                'content-type': 'text/html; charset=UTF-8',
                'server': 'MockChromeServer',
                'x-test-header': 'from-chrome-session',
            },
        }
        (session_dir / 'response_headers.json').write_text(json.dumps(mock_headers))

        # Run the extraction; it should short-circuit on the cached capture
        proc = subprocess.run(
            ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testchrome'],
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=30,
        )

        assert proc.returncode == 0, f"Extraction failed: {proc.stderr}"
        assert 'STATUS=succeeded' in proc.stdout, "Should report success"
        assert 'chrome_session' in proc.stdout, "Should report using chrome_session method"

        # Output must echo the chrome_session headers, not a live response
        output_file = workdir / 'headers' / 'headers.json'
        assert output_file.exists(), "Output headers.json not created"

        written = json.loads(output_file.read_text())
        assert written['headers']['x-test-header'] == 'from-chrome-session', \
            "Should use headers from chrome_session"
        assert written['headers']['server'] == 'MockChromeServer', \
            "Should use headers from chrome_session"
|
||||
|
||||
|
||||
def test_falls_back_to_http_when_chrome_session_unavailable():
    """Test that headers plugin falls back to HTTP HEAD when chrome_session unavailable."""

    if not shutil.which('node'):
        pytest.skip("node not installed")

    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)

        # No chrome_session directory is created here - forces the HTTP fallback
        proc = subprocess.run(
            ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'],
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=60,
        )

        assert proc.returncode == 0, f"Extraction failed: {proc.stderr}"
        assert 'STATUS=succeeded' in proc.stdout, "Should report success"
        assert 'http' in proc.stdout.lower() or 'HEAD' not in proc.stdout, \
            "Should use HTTP method"

        # Output must exist and carry real HTTP headers
        output_file = workdir / 'headers' / 'headers.json'
        assert output_file.exists(), "Output headers.json not created"

        written = json.loads(output_file.read_text())
        assert written['url'] == TEST_URL
        assert written['status'] in (200, 301, 302)
        assert isinstance(written['headers'], dict)
        assert len(written['headers']) > 0
|
||||
|
||||
|
||||
def test_config_timeout_honored():
    """Test that TIMEOUT config is respected."""

    if not shutil.which('node'):
        pytest.skip("node not installed")

    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)

        # Use a very short timeout (example.com should still respond in time)
        import os
        env = {**os.environ, 'TIMEOUT': '5'}

        proc = subprocess.run(
            ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=30,
        )

        # Should complete (success or fail, but not hang)
        assert proc.returncode in (0, 1), "Should complete without hanging"
|
||||
|
||||
|
||||
def test_config_user_agent():
    """Test that USER_AGENT config is used."""

    if not shutil.which('node'):
        pytest.skip("node not installed")

    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)

        # Pass a custom user agent through the environment
        import os
        env = {**os.environ, 'USER_AGENT': 'TestBot/1.0'}

        proc = subprocess.run(
            ['node', str(HEADERS_HOOK), f'--url={TEST_URL}', '--snapshot-id=testua'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=60,
        )

        # Should succeed (example.com doesn't block custom agents)
        if proc.returncode == 0:
            assert 'STATUS=succeeded' in proc.stdout
|
||||
|
||||
|
||||
def test_handles_https_urls():
    """Test that HTTPS URLs work correctly."""

    if not shutil.which('node'):
        pytest.skip("node not installed")

    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)

        proc = subprocess.run(
            ['node', str(HEADERS_HOOK), '--url=https://example.org', '--snapshot-id=testhttps'],
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=60,
        )

        # Network-dependent: only inspect output when the run succeeded
        if proc.returncode != 0:
            return
        output_file = workdir / 'headers' / 'headers.json'
        if not output_file.exists():
            return
        written = json.loads(output_file.read_text())
        assert written['url'] == 'https://example.org'
        assert written['status'] in (200, 301, 302)
|
||||
|
||||
|
||||
def test_handles_404_gracefully():
    """Test that headers plugin handles 404s gracefully."""

    if not shutil.which('node'):
        pytest.skip("node not installed")

    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)

        proc = subprocess.run(
            ['node', str(HEADERS_HOOK), '--url=https://example.com/nonexistent-page-404', '--snapshot-id=test404'],
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=60,
        )

        # May succeed or fail depending on server behavior.
        # When it succeeds, the 404 status must have been captured.
        if proc.returncode != 0:
            return
        output_file = workdir / 'headers' / 'headers.json'
        if not output_file.exists():
            return
        written = json.loads(output_file.read_text())
        assert written['status'] == 404, "Should capture 404 status"
|
||||
|
||||
|
||||
# Allow running this test module directly without invoking the pytest CLI.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||
182
archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py
Normal file
182
archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py
Normal file
@@ -0,0 +1,182 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Convert HTML to plain text for search indexing.
|
||||
|
||||
This extractor reads HTML from other extractors (wget, singlefile, dom)
|
||||
and converts it to plain text for full-text search.
|
||||
|
||||
Usage: on_Snapshot__htmltotext.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Writes htmltotext.txt to $PWD
|
||||
|
||||
Environment variables:
|
||||
TIMEOUT: Timeout in seconds (not used, but kept for consistency)
|
||||
|
||||
Note: This extractor does not require any external binaries.
|
||||
It uses Python's built-in html.parser module.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from html.parser import HTMLParser
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'htmltotext'
|
||||
OUTPUT_DIR = 'htmltotext'
|
||||
OUTPUT_FILE = 'htmltotext.txt'
|
||||
|
||||
|
||||
class HTMLTextExtractor(HTMLParser):
    """Extract text content from HTML, ignoring scripts/styles.

    Fix over the previous version: skipping is tracked with a stack of
    open elements instead of a single ``current_tag`` flag.  The old flag
    was reset by *any* end tag, so text appearing inside ``<head>`` (or
    another skipped element) after a nested close tag leaked into the
    output.
    """

    # Elements whose text content must never appear in the output.
    SKIP_TAGS = {'script', 'style', 'head', 'meta', 'link', 'noscript'}
    # HTML void elements never get a matching end tag, so they must not
    # be pushed onto the open-element stack (they would never be popped).
    VOID_TAGS = {'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
                 'link', 'meta', 'source', 'track', 'wbr'}

    def __init__(self):
        super().__init__()
        self.result = []                # collected text fragments
        self.skip_tags = self.SKIP_TAGS # kept for backward compatibility
        self._open = []                 # stack of currently-open (non-void) tag names

    def handle_starttag(self, tag, attrs):
        name = tag.lower()
        if name not in self.VOID_TAGS:
            self._open.append(name)

    def handle_endtag(self, tag):
        name = tag.lower()
        # Pop back to (and including) the matching open tag; tolerate
        # malformed HTML by ignoring unmatched end tags.
        if name in self._open:
            while self._open and self._open.pop() != name:
                pass

    def handle_data(self, data):
        # Suppress text while any skipped element is still open.
        if any(t in self.skip_tags for t in self._open):
            return
        text = data.strip()
        if text:
            self.result.append(text)

    def get_text(self) -> str:
        # Fragments joined with single spaces, in document order.
        return ' '.join(self.result)
|
||||
|
||||
|
||||
def html_to_text(html: str) -> str:
    """Convert HTML to plain text."""
    extractor = HTMLTextExtractor()
    try:
        extractor.feed(html)
        return extractor.get_text()
    except Exception:
        # Fallback: crude regex-based tag stripping if the parser chokes.
        stripped = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
        stripped = re.sub(r'<style[^>]*>.*?</style>', '', stripped, flags=re.DOTALL | re.IGNORECASE)
        stripped = re.sub(r'<[^>]+>', ' ', stripped)
        stripped = re.sub(r'\s+', ' ', stripped)
        return stripped.strip()
|
||||
|
||||
|
||||
def find_html_source() -> str | None:
|
||||
"""Find HTML content from other extractors in the snapshot directory."""
|
||||
# Hooks run in snapshot_dir, sibling extractor outputs are in subdirectories
|
||||
search_patterns = [
|
||||
'singlefile/singlefile.html',
|
||||
'singlefile/*.html',
|
||||
'dom/output.html',
|
||||
'dom/*.html',
|
||||
'wget/**/*.html',
|
||||
'wget/**/*.htm',
|
||||
]
|
||||
|
||||
cwd = Path.cwd()
|
||||
for pattern in search_patterns:
|
||||
matches = list(cwd.glob(pattern))
|
||||
for match in matches:
|
||||
if match.is_file() and match.stat().st_size > 0:
|
||||
try:
|
||||
return match.read_text(errors='ignore')
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def extract_htmltotext(url: str) -> tuple[bool, str | None, str]:
    """
    Extract plain text from HTML sources.

    Returns: (success, output_path, error_message)
    """
    # Locate HTML produced by an earlier extractor (singlefile/dom/wget).
    html_content = find_html_source()
    if not html_content:
        return False, None, 'No HTML source found (run singlefile, dom, or wget first)'

    # Strip markup down to plain text.
    text = html_to_text(html_content)
    if not text or len(text) < 10:
        return False, None, 'No meaningful text extracted from HTML'

    # Write the result into the extractor's output directory.
    destination = Path(OUTPUT_DIR)
    destination.mkdir(exist_ok=True)
    out_file = destination / OUTPUT_FILE
    out_file.write_text(text, encoding='utf-8')

    return True, str(out_file), ''
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL that was archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Convert HTML to plain text for search indexing.

    Runs the extraction, then prints the machine-parsed result lines
    (START_TS/END_TS/DURATION/OUTPUT/STATUS/ERROR/RESULT_JSON) and exits
    0 on success, 1 on failure.
    """

    start_ts = datetime.now(timezone.utc)
    output = None
    status = 'failed'
    error = ''

    try:
        # Run extraction
        success, output, error = extract_htmltotext(url)
        status = 'succeeded' if success else 'failed'

        if success:
            # FIX: previously reported Path(output).stat().st_size, which is
            # a byte count and over-counts multi-byte UTF-8 text; count the
            # decoded characters instead.
            text_len = len(Path(output).read_text(encoding='utf-8'))
            print(f'Extracted {text_len} characters of text')

    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

    # Print results (parsed by the ArchiveBox hook runner)
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()

    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')

    if error:
        print(f'ERROR={error}', file=sys.stderr)

    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')

    sys.exit(0 if status == 'succeeded' else 1)
|
||||
|
||||
|
||||
# CLI entry point when the hook script is executed directly.
if __name__ == '__main__':
    main()
|
||||
@@ -0,0 +1,115 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* I Still Don't Care About Cookies Extension Plugin
|
||||
*
|
||||
* Installs and configures the "I still don't care about cookies" Chrome extension
|
||||
* for automatic cookie consent banner dismissal during page archiving.
|
||||
*
|
||||
* Extension: https://chromewebstore.google.com/detail/edibdbjcniadpccecjdfdjjppcpchdlm
|
||||
*
|
||||
* Priority: 02 (early) - Must install before Chrome session starts
|
||||
* Hook: on_Snapshot
|
||||
*
|
||||
* This extension automatically:
|
||||
* - Dismisses cookie consent popups
|
||||
* - Removes cookie banners
|
||||
* - Accepts necessary cookies to proceed with browsing
|
||||
* - Works on thousands of websites out of the box
|
||||
*/
|
||||
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
|
||||
// Import extension utilities
|
||||
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
|
||||
|
||||
// Extension metadata
|
||||
const EXTENSION = {
|
||||
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
|
||||
name: 'istilldontcareaboutcookies',
|
||||
};
|
||||
|
||||
// Get extensions directory from environment or use default
|
||||
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
|
||||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
|
||||
|
||||
/**
 * Install the I Still Don't Care About Cookies extension.
 * Delegates download/unpack to the shared chrome_extensions utilities;
 * returns the extension metadata object, or null on failure.
 */
async function installCookiesExtension() {
  console.log('[*] Installing I Still Don\'t Care About Cookies extension...');

  const installed = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
  if (!installed) {
    console.error('[❌] Failed to install I Still Don\'t Care About Cookies extension');
    return null;
  }

  console.log('[+] I Still Don\'t Care About Cookies extension installed');
  console.log('[+] Cookie banners will be automatically dismissed during archiving');
  return installed;
}
|
||||
|
||||
/**
|
||||
* Note: This extension works out of the box with no configuration needed.
|
||||
* It automatically detects and dismisses cookie banners on page load.
|
||||
*/
|
||||
|
||||
/**
 * Main entry point - install extension before archiving.
 * Reuses a cached install when its cache entry still points at an
 * unpacked manifest on disk; otherwise installs fresh and persists the
 * metadata for the chrome_session extractor to load.
 */
async function main() {
  const cacheFile = path.join(EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');

  // Fast path: previously-installed extension with an intact manifest.
  if (fs.existsSync(cacheFile)) {
    try {
      const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
      if (fs.existsSync(path.join(cached.unpacked_path, 'manifest.json'))) {
        console.log('[*] I Still Don\'t Care About Cookies extension already installed (using cache)');
        return cached;
      }
    } catch (e) {
      // Unparseable cache entry: fall through to a clean re-install.
      console.warn('[⚠️] Extension cache corrupted, re-installing...');
    }
  }

  const extension = await installCookiesExtension();

  // Persist extension metadata so chrome_session can load it later.
  if (extension) {
    await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
    await fs.promises.writeFile(
      cacheFile,
      JSON.stringify(extension, null, 2)
    );
    console.log(`[+] Extension metadata written to ${cacheFile}`);
  }

  return extension;
}
|
||||
|
||||
// Export functions for use by other plugins
|
||||
module.exports = {
|
||||
EXTENSION,
|
||||
installCookiesExtension,
|
||||
};
|
||||
|
||||
// Run if executed directly
|
||||
if (require.main === module) {
|
||||
main().then(() => {
|
||||
console.log('[✓] I Still Don\'t Care About Cookies extension setup complete');
|
||||
process.exit(0);
|
||||
}).catch(err => {
|
||||
console.error('[❌] I Still Don\'t Care About Cookies extension setup failed:', err);
|
||||
process.exit(1);
|
||||
});
|
||||
}
|
||||
@@ -0,0 +1,279 @@
|
||||
/**
|
||||
* Unit tests for istilldontcareaboutcookies plugin
|
||||
*
|
||||
* Run with: node --test tests/test_istilldontcareaboutcookies.js
|
||||
*/
|
||||
|
||||
const assert = require('assert');
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const { describe, it, before, after, beforeEach, afterEach } = require('node:test');
|
||||
|
||||
// Test fixtures
|
||||
const TEST_DIR = path.join(__dirname, '.test_fixtures');
|
||||
const TEST_EXTENSIONS_DIR = path.join(TEST_DIR, 'chrome_extensions');
|
||||
|
||||
describe('istilldontcareaboutcookies plugin', () => {
|
||||
before(() => {
|
||||
if (!fs.existsSync(TEST_DIR)) {
|
||||
fs.mkdirSync(TEST_DIR, { recursive: true });
|
||||
}
|
||||
});
|
||||
|
||||
after(() => {
|
||||
if (fs.existsSync(TEST_DIR)) {
|
||||
fs.rmSync(TEST_DIR, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
||||
describe('EXTENSION metadata', () => {
|
||||
it('should have correct webstore_id', () => {
|
||||
const { EXTENSION } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
|
||||
|
||||
assert.strictEqual(EXTENSION.webstore_id, 'edibdbjcniadpccecjdfdjjppcpchdlm');
|
||||
});
|
||||
|
||||
it('should have correct name', () => {
|
||||
const { EXTENSION } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
|
||||
|
||||
assert.strictEqual(EXTENSION.name, 'istilldontcareaboutcookies');
|
||||
});
|
||||
});
|
||||
|
||||
describe('installCookiesExtension', () => {
|
||||
beforeEach(() => {
|
||||
process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
|
||||
|
||||
if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
|
||||
fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
|
||||
}
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
|
||||
fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
|
||||
}
|
||||
|
||||
delete process.env.CHROME_EXTENSIONS_DIR;
|
||||
});
|
||||
|
||||
it('should use cached extension if available', async () => {
|
||||
const { installCookiesExtension } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
|
||||
|
||||
// Create fake cache
|
||||
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
|
||||
const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_cookies');
|
||||
|
||||
fs.mkdirSync(fakeExtensionDir, { recursive: true });
|
||||
fs.writeFileSync(
|
||||
path.join(fakeExtensionDir, 'manifest.json'),
|
||||
JSON.stringify({ version: '1.1.8' })
|
||||
);
|
||||
|
||||
const fakeCache = {
|
||||
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
|
||||
name: 'istilldontcareaboutcookies',
|
||||
unpacked_path: fakeExtensionDir,
|
||||
version: '1.1.8'
|
||||
};
|
||||
|
||||
fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));
|
||||
|
||||
const result = await installCookiesExtension();
|
||||
|
||||
assert.notStrictEqual(result, null);
|
||||
assert.strictEqual(result.webstore_id, 'edibdbjcniadpccecjdfdjjppcpchdlm');
|
||||
});
|
||||
|
||||
it('should not require any configuration', async () => {
|
||||
// This extension works out of the box
|
||||
// No API keys or config needed
|
||||
const { EXTENSION } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');
|
||||
|
||||
assert.ok(EXTENSION);
|
||||
// No config fields should be required
|
||||
});
|
||||
});
|
||||
|
||||
// Cache-file tests: the installer records extension metadata in
// <CHROME_EXTENSIONS_DIR>/<name>.extension.json so later runs can skip the download.
describe('cache file creation', () => {
    beforeEach(() => {
        // Point the installer at a throwaway directory for the duration of each test.
        process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;

        if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
            fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
        }
    });

    afterEach(() => {
        // Remove the scratch directory and restore the environment.
        if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
            fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
        }

        delete process.env.CHROME_EXTENSIONS_DIR;
    });

    it('should create cache file with correct extension name', async () => {
        const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');

        // Create mock extension metadata (mirrors what the real installer writes).
        const mockExtension = {
            webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
            name: 'istilldontcareaboutcookies',
            version: '1.1.9'
        };

        await fs.promises.writeFile(cacheFile, JSON.stringify(mockExtension, null, 2));

        assert.ok(fs.existsSync(cacheFile));

        // Round-trip: the cache must parse back as JSON with the expected name.
        const cache = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
        assert.strictEqual(cache.name, 'istilldontcareaboutcookies');
    });

    it('should use correct filename pattern', () => {
        const expectedPattern = 'istilldontcareaboutcookies.extension.json';
        const cacheFile = path.join(TEST_EXTENSIONS_DIR, expectedPattern);

        // Pattern should match the '<name>.extension.json' format used by the installer.
        assert.ok(path.basename(cacheFile).endsWith('.extension.json'));
        assert.ok(path.basename(cacheFile).includes('istilldontcareaboutcookies'));
    });
});
|
||||
|
||||
describe('extension functionality', () => {
    // These tests pin the documented zero-configuration contract of the
    // extension as plain data assertions (no runtime behavior is exercised).
    it('should work automatically without configuration', () => {
        // The extension automatically dismisses cookie banners; no manual
        // trigger, API key, or configuration is needed.
        const features = {
            automaticBannerDismissal: true,
            requiresConfiguration: false,
            requiresApiKey: false,
            requiresUserAction: false
        };

        const expectations = [
            ['automaticBannerDismissal', true],
            ['requiresConfiguration', false],
            ['requiresApiKey', false],
            ['requiresUserAction', false],
        ];
        for (const [flag, expected] of expectations) {
            assert.strictEqual(features[flag], expected);
        }
    });

    it('should not require any runtime hooks', () => {
        // Banner dismissal happens purely via Chrome's content-script
        // injection, so no navigation or page-load hooks are needed.
        const requiresHooks = {
            preNavigation: false,
            postNavigation: false,
            onPageLoad: false
        };

        for (const phase of ['preNavigation', 'postNavigation', 'onPageLoad']) {
            assert.strictEqual(requiresHooks[phase], false);
        }
    });
});
|
||||
|
||||
// Execution-order tests: the numeric prefix in the hook filename determines
// when the hook runs relative to other Snapshot hooks.
describe('priority and execution order', () => {
    it('should have priority 02 (early)', () => {
        const filename = 'on_Snapshot__02_istilldontcareaboutcookies.js';

        // Extract priority from filename
        const match = filename.match(/on_Snapshot__(\d+)_/);
        assert.ok(match);

        // Always parse with an explicit radix: bare parseInt() on strings with
        // a leading zero is ambiguous to readers and flagged by linters.
        const priority = Number.parseInt(match[1], 10);
        assert.strictEqual(priority, 2);
    });

    it('should run before chrome_session (priority 20)', () => {
        // The extension must be installed before the Chrome session that uses it starts.
        const extensionPriority = 2;
        const chromeSessionPriority = 20;

        assert.ok(extensionPriority < chromeSessionPriority);
    });
});
|
||||
|
||||
// Error-handling tests: the installer must recover from a corrupted cache file
// and from a cached unpacked_path whose manifest.json has gone missing, by
// falling back to a fresh install instead of crashing.
describe('error handling', () => {
    beforeEach(() => {
        process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;

        if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
            fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
        }
    });

    afterEach(() => {
        if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
            fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
        }

        delete process.env.CHROME_EXTENSIONS_DIR;
    });

    it('should handle corrupted cache gracefully', async () => {
        const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');

        // Create corrupted cache (not valid JSON)
        fs.writeFileSync(cacheFile, 'invalid json content');

        // Should detect corruption and proceed with fresh install
        const { installCookiesExtension } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');

        // Mock loadOrInstallExtension to avoid actual download.
        // NOTE(review): this patches the shared require() module cache, so the
        // original function is restored below before asserting.
        const extensionUtils = require('../../chrome_extensions/chrome_extension_utils.js');
        const originalFunc = extensionUtils.loadOrInstallExtension;

        extensionUtils.loadOrInstallExtension = async () => ({
            webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
            name: 'istilldontcareaboutcookies',
            version: '1.1.9'
        });

        const result = await installCookiesExtension();

        // Restore the real implementation before asserting.
        extensionUtils.loadOrInstallExtension = originalFunc;

        assert.notStrictEqual(result, null);
    });

    it('should handle missing manifest gracefully', async () => {
        const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
        const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_cookies_no_manifest');

        // Create directory without manifest (simulates a half-deleted unpacked extension)
        fs.mkdirSync(fakeExtensionDir, { recursive: true });

        const fakeCache = {
            webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
            name: 'istilldontcareaboutcookies',
            unpacked_path: fakeExtensionDir
        };

        fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));

        const { installCookiesExtension } = require('../on_Snapshot__02_istilldontcareaboutcookies.js');

        // Mock to return fresh extension when manifest missing, and record
        // whether the installer actually fell back to a fresh install.
        const extensionUtils = require('../../chrome_extensions/chrome_extension_utils.js');
        const originalFunc = extensionUtils.loadOrInstallExtension;

        let freshInstallCalled = false;
        extensionUtils.loadOrInstallExtension = async () => {
            freshInstallCalled = true;
            return {
                webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
                name: 'istilldontcareaboutcookies',
                version: '1.1.9'
            };
        };

        const result = await installCookiesExtension();

        extensionUtils.loadOrInstallExtension = originalFunc;

        // Should trigger fresh install when manifest missing
        assert.ok(freshInstallCalled || result);
    });
});
|
||||
});
|
||||
@@ -0,0 +1,122 @@
|
||||
"""
|
||||
Unit tests for istilldontcareaboutcookies plugin
|
||||
|
||||
Tests invoke the plugin hook as an external process and verify outputs/side effects.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__02_istilldontcareaboutcookies.js"
|
||||
|
||||
|
||||
def test_install_script_exists():
    """The plugin's install hook script must be present on disk."""
    script_present = INSTALL_SCRIPT.exists()
    assert script_present, f"Install script not found: {INSTALL_SCRIPT}"
|
||||
|
||||
|
||||
def test_extension_metadata():
    """Load the hook module in Node and verify its exported EXTENSION metadata.

    Uses `node -e` to require() the script and print EXTENSION as JSON,
    so the test checks the real exported values rather than parsing the source.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        env = os.environ.copy()
        # Isolate the extensions dir so the import cannot touch a real install.
        env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")

        result = subprocess.run(
            ["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"],
            capture_output=True,
            text=True,
            env=env
        )

        assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"

        # stdout is exactly the JSON.stringify(ext.EXTENSION) output.
        metadata = json.loads(result.stdout)
        assert metadata["webstore_id"] == "edibdbjcniadpccecjdfdjjppcpchdlm"
        assert metadata["name"] == "istilldontcareaboutcookies"
|
||||
|
||||
|
||||
def test_install_creates_cache():
    """Run the install hook end-to-end and verify the cache file it writes.

    Executes the Node script against a fresh CHROME_EXTENSIONS_DIR and checks
    that <name>.extension.json is created with the expected metadata.
    NOTE(review): this may perform a real download — hence the 60s timeout.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)

        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )

        # Check output mentions installation
        assert "Installing" in result.stdout or "installed" in result.stdout or "istilldontcareaboutcookies" in result.stdout

        # Check cache file was created
        cache_file = ext_dir / "istilldontcareaboutcookies.extension.json"
        assert cache_file.exists(), "Cache file should be created"

        # Verify cache content round-trips as JSON with the expected identity fields.
        cache_data = json.loads(cache_file.read_text())
        assert cache_data["webstore_id"] == "edibdbjcniadpccecjdfdjjppcpchdlm"
        assert cache_data["name"] == "istilldontcareaboutcookies"
|
||||
|
||||
|
||||
def test_install_uses_existing_cache():
    """Pre-seed an unpacked extension dir and verify the installer still succeeds.

    Creates the '<webstore_id>__<name>' directory with a manifest.json so the
    installer can take its cached path; only the exit code is asserted because
    the installer is free to either reuse the cache or reinstall.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        # Create fake cache: an unpacked extension directory with a manifest.
        fake_extension_dir = ext_dir / "edibdbjcniadpccecjdfdjjppcpchdlm__istilldontcareaboutcookies"
        fake_extension_dir.mkdir(parents=True)

        manifest = {"version": "1.1.8", "name": "I still don't care about cookies"}
        (fake_extension_dir / "manifest.json").write_text(json.dumps(manifest))

        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)

        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )

        # Should use cache or install successfully
        assert result.returncode == 0
|
||||
|
||||
|
||||
def test_no_configuration_required():
    """The plugin must install cleanly with no API keys or extra configuration.

    Runs the install hook with only CHROME_EXTENSIONS_DIR set (pointed at a
    scratch dir) and asserts that it exits successfully.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        # No special env vars needed - works out of the box

        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )

        # The previous assertion (`"API" not in output or returncode == 0`) was
        # vacuous: it passed whenever the install succeeded AND whenever the
        # failure output merely omitted the word "API". Pin the actual
        # contract instead: installation succeeds with zero configuration.
        assert result.returncode == 0, f"Install failed without config: {result.stderr}"
|
||||
55
archivebox/plugins/media/config.json
Normal file
55
archivebox/plugins/media/config.json
Normal file
@@ -0,0 +1,55 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SAVE_MEDIA": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["USE_YTDLP", "FETCH_MEDIA"],
|
||||
"description": "Enable media downloading with yt-dlp"
|
||||
},
|
||||
"YOUTUBEDL_BINARY": {
|
||||
"type": "string",
|
||||
"default": "yt-dlp",
|
||||
"x-aliases": ["YTDLP_BINARY", "YOUTUBE_DL_BINARY"],
|
||||
"description": "Path to yt-dlp binary"
|
||||
},
|
||||
"MEDIA_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 3600,
|
||||
"minimum": 30,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for media downloads in seconds"
|
||||
},
|
||||
"MEDIA_MAX_SIZE": {
|
||||
"type": "string",
|
||||
"default": "750m",
|
||||
"pattern": "^\\d+[kmgKMG]?$",
|
||||
"description": "Maximum file size for media downloads"
|
||||
},
|
||||
"YTDLP_CHECK_SSL_VALIDITY": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-fallback": "CHECK_SSL_VALIDITY",
|
||||
"description": "Whether to verify SSL certificates"
|
||||
},
|
||||
"YTDLP_ARGS": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [
|
||||
"--write-info-json",
|
||||
"--write-thumbnail",
|
||||
"--write-sub",
|
||||
"--embed-subs",
|
||||
"--write-auto-sub"
|
||||
],
|
||||
"description": "Default yt-dlp arguments"
|
||||
},
|
||||
"YTDLP_EXTRA_ARGS": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"description": "Extra arguments for yt-dlp (space-separated)"
|
||||
}
|
||||
}
|
||||
}
|
||||
306
archivebox/plugins/media/on_Snapshot__51_media.py
Normal file
306
archivebox/plugins/media/on_Snapshot__51_media.py
Normal file
@@ -0,0 +1,306 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Download media from a URL using yt-dlp.
|
||||
|
||||
Usage: on_Snapshot__51_media.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Downloads media files to $PWD/media/
|
||||
|
||||
Environment variables:
|
||||
YTDLP_BINARY: Path to yt-dlp binary
|
||||
YTDLP_TIMEOUT: Timeout in seconds (default: 3600 for large media)
|
||||
YTDLP_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
|
||||
YTDLP_EXTRA_ARGS: Extra arguments for yt-dlp (space-separated)
|
||||
|
||||
# Media feature toggles
|
||||
USE_YTDLP: Enable yt-dlp media extraction (default: True)
|
||||
SAVE_MEDIA: Alias for USE_YTDLP
|
||||
|
||||
# Media size limits
|
||||
MEDIA_MAX_SIZE: Maximum media file size (default: 750m)
|
||||
|
||||
# Fallback to ARCHIVING_CONFIG values if YTDLP_* not set:
|
||||
MEDIA_TIMEOUT: Fallback timeout for media
|
||||
TIMEOUT: Fallback timeout
|
||||
CHECK_SSL_VALIDITY: Fallback SSL check
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'media'
|
||||
BIN_NAME = 'yt-dlp'
|
||||
BIN_PROVIDERS = 'pip,apt,brew,env'
|
||||
OUTPUT_DIR = 'media'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
    """Read an environment variable (or `default` if unset), stripped of surrounding whitespace."""
    raw_value = os.environ.get(name, default)
    return raw_value.strip()
|
||||
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean environment variable.

    Accepts true/1/yes/on and false/0/no/off (case-insensitive); any other
    value — including an unset variable — yields `default`.
    """
    truthy = ('true', '1', 'yes', 'on')
    falsy = ('false', '0', 'no', 'off')
    normalized = get_env(name, '').lower()
    if normalized in truthy:
        return True
    return False if normalized in falsy else default
|
||||
|
||||
|
||||
def get_env_int(name: str, default: int = 0) -> int:
    """Parse an integer environment variable; non-numeric values fall back to `default`."""
    raw = get_env(name, str(default))
    try:
        parsed = int(raw)
    except ValueError:
        parsed = default
    return parsed
|
||||
|
||||
|
||||
STATICFILE_DIR = 'staticfile'

def has_staticfile_output() -> bool:
    """Return True when the staticfile extractor left at least one entry in ./staticfile/.

    Used to skip this extractor entirely when the URL was already archived
    as a direct static-file download.
    """
    candidate = Path(STATICFILE_DIR)
    if not candidate.exists():
        return False
    # Non-empty check without materializing the whole listing.
    for _entry in candidate.iterdir():
        return True
    return False
|
||||
|
||||
|
||||
def find_ytdlp() -> str | None:
    """Locate the yt-dlp (or youtube-dl) binary.

    Resolution order:
      1. YTDLP_BINARY / YOUTUBEDL_BINARY env var — accepted as an existing
         file path OR as a bare command name resolved via PATH
      2. 'yt-dlp' then 'youtube-dl' found on PATH

    Returns the resolved path, or None if nothing is found.
    """
    configured = get_env('YTDLP_BINARY') or get_env('YOUTUBEDL_BINARY')
    if configured:
        if os.path.isfile(configured):
            return configured
        # Fix: previously a bare command name (e.g. YTDLP_BINARY=yt-dlp) was
        # silently ignored because only isfile() was checked — resolve it on PATH.
        resolved = shutil.which(configured)
        if resolved:
            return resolved

    for name in ('yt-dlp', 'youtube-dl'):
        binary = shutil.which(name)
        if binary:
            return binary

    return None
|
||||
|
||||
|
||||
def get_version(binary: str) -> str:
    """Return the binary's `--version` output (at most 64 chars), or '' on any failure."""
    try:
        proc = subprocess.run(
            [binary, '--version'],
            capture_output=True,
            text=True,
            timeout=10,
        )
        return proc.stdout.strip()[:64]
    except Exception:
        # Missing binary, timeout, or any other probe failure → unknown version.
        return ''
|
||||
|
||||
|
||||
# Default yt-dlp args (from old YTDLP_CONFIG)
|
||||
def get_ytdlp_default_args(media_max_size: str = '750m') -> list[str]:
    """Build the baseline yt-dlp CLI arguments.

    `media_max_size` caps downloads via the format selector (both the exact
    and approximate filesize filters), falling back to best-available when
    no size-filtered format matches.
    """
    format_selector = (
        f'--format=(bv*+ba/b)[filesize<={media_max_size}]'
        f'[filesize_approx<=?{media_max_size}]/(bv*+ba/b)'
    )
    args = [
        '--restrict-filenames',
        '--trim-filenames', '128',
        '--write-description',
        '--write-info-json',
        '--write-annotations',
        '--write-thumbnail',
        '--no-call-home',
        '--write-sub',
        '--write-auto-subs',
        '--convert-subs=srt',
        '--yes-playlist',
        '--continue',
        '--no-abort-on-error',
        '--ignore-errors',
        '--geo-bypass',
        '--add-metadata',
    ]
    args.append(format_selector)
    return args
|
||||
|
||||
|
||||
def save_media(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Download media using yt-dlp.

    Args:
        url: Page or media URL handed to yt-dlp.
        binary: Resolved path to the yt-dlp (or youtube-dl) binary.

    Returns: (success, output_path, error_message)
        A page with no downloadable media is still a success (output_path is
        None) — only transport/extraction failures count as errors.
    """
    # Get config from env (with YTDLP_ prefix or fallback to ARCHIVING_CONFIG style)
    timeout = get_env_int('YTDLP_TIMEOUT') or get_env_int('MEDIA_TIMEOUT') or get_env_int('TIMEOUT', 3600)
    check_ssl = get_env_bool('YTDLP_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
    extra_args = get_env('YTDLP_EXTRA_ARGS') or get_env('YOUTUBEDL_EXTRA_ARGS', '')
    media_max_size = get_env('MEDIA_MAX_SIZE', '750m')

    # Create output directory (relative to CWD, which the hook runner sets to the snapshot dir)
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(exist_ok=True)

    # Build command (later options take precedence, so user extra_args can override defaults)
    cmd = [
        binary,
        *get_ytdlp_default_args(media_max_size),
        '--no-progress',
        '-o', f'{OUTPUT_DIR}/%(title)s.%(ext)s',
    ]

    if not check_ssl:
        cmd.append('--no-check-certificate')

    if extra_args:
        # NOTE(review): naive whitespace split — quoted args with spaces would break; confirm acceptable.
        cmd.extend(extra_args.split())

    cmd.append(url)

    try:
        result = subprocess.run(cmd, capture_output=True, timeout=timeout, text=True)

        # Check if any media files were downloaded.
        # Success is judged by files on disk, not exit code, because yt-dlp
        # runs with --ignore-errors / --no-abort-on-error.
        media_extensions = (
            '.mp4', '.webm', '.mkv', '.avi', '.mov', '.flv', '.wmv', '.m4v',
            '.mp3', '.m4a', '.ogg', '.wav', '.flac', '.aac', '.opus',
            '.json', '.jpg', '.png', '.webp', '.jpeg',
            '.vtt', '.srt', '.ass', '.lrc',
            '.description',
        )

        downloaded_files = [
            f for f in output_dir.glob('*')
            if f.is_file() and f.suffix.lower() in media_extensions
        ]

        if downloaded_files:
            # Return first video/audio file, or first file if no media
            video_audio = [
                f for f in downloaded_files
                if f.suffix.lower() in ('.mp4', '.webm', '.mkv', '.avi', '.mov', '.mp3', '.m4a', '.ogg', '.wav', '.flac')
            ]
            output = str(video_audio[0]) if video_audio else str(downloaded_files[0])
            return True, output, ''
        else:
            stderr = result.stderr

            # These are NOT errors - page simply has no downloadable media
            # Return success with no output (legitimate "nothing to download")
            if 'ERROR: Unsupported URL' in stderr:
                return True, None, ''  # Not a media site - success, no output
            if 'URL could be a direct video link' in stderr:
                return True, None, ''  # Not a supported media URL - success, no output
            if result.returncode == 0:
                return True, None, ''  # yt-dlp exited cleanly, just no media - success

            # These ARE errors - something went wrong
            if 'HTTP Error 404' in stderr:
                return False, None, '404 Not Found'
            if 'HTTP Error 403' in stderr:
                return False, None, '403 Forbidden'
            if 'Unable to extract' in stderr:
                return False, None, 'Unable to extract media info'

            # Unrecognized failure: surface a truncated stderr for the log.
            return False, None, f'yt-dlp error: {stderr[:200]}'

    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL to download media from')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Download media from a URL using yt-dlp.

    Emits the hook protocol key=value lines (START_TS, END_TS, STATUS,
    RESULT_JSON, ...) on stdout and exits 0 on success/skip, 1 on failure.
    """

    start_ts = datetime.now(timezone.utc)
    version = ''
    output = None
    status = 'failed'
    error = ''
    binary = None
    cmd_str = ''

    try:
        # Check if yt-dlp is enabled
        if not (get_env_bool('USE_YTDLP', True) and get_env_bool('SAVE_MEDIA', True)):
            print('Skipping media (USE_YTDLP=False or SAVE_MEDIA=False)')
            status = 'skipped'
            end_ts = datetime.now(timezone.utc)
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={end_ts.isoformat()}')
            print(f'STATUS={status}')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)

        # Check if staticfile extractor already handled this (permanent skip)
        if has_staticfile_output():
            print(f'Skipping media - staticfile extractor already downloaded this')
            # Fix: status was previously left as 'failed' here, so this branch
            # reported STATUS=failed / "status": "failed" while exiting 0.
            status = 'skipped'
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS={status}')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)

        # Find binary; a missing dependency is reported via the DEPENDENCY_NEEDED protocol.
        binary = find_ytdlp()
        if not binary:
            print(f'ERROR: {BIN_NAME} binary not found', file=sys.stderr)
            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
            print(f'INSTALL_HINT=pip install yt-dlp OR brew install yt-dlp', file=sys.stderr)
            sys.exit(1)

        version = get_version(binary)
        cmd_str = f'{binary} {url}'

        # Run extraction
        success, output, error = save_media(url, binary)
        status = 'succeeded' if success else 'failed'

        if success:
            # Summarize what landed in media/ (zero files is a legitimate outcome).
            output_dir = Path(OUTPUT_DIR)
            files = list(output_dir.glob('*'))
            file_count = len([f for f in files if f.is_file()])
            if file_count > 0:
                print(f'yt-dlp completed: {file_count} files downloaded')
            else:
                print(f'yt-dlp completed: no media found on page (this is normal)')

    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

    # Print results (hook protocol: key=value lines parsed by the runner)
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()

    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if cmd_str:
        print(f'CMD={cmd_str}')
    if version:
        print(f'VERSION={version}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')

    if error:
        print(f'ERROR={error}', file=sys.stderr)

    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')

    sys.exit(0 if status == 'succeeded' else 1)
|
||||
|
||||
|
||||
# Entrypoint: ArchiveBox runs this hook as a standalone script.
if __name__ == '__main__':
    main()
|
||||
30
archivebox/plugins/mercury/config.json
Normal file
30
archivebox/plugins/mercury/config.json
Normal file
@@ -0,0 +1,30 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SAVE_MERCURY": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Enable Mercury text extraction"
|
||||
},
|
||||
"MERCURY_BINARY": {
|
||||
"type": "string",
|
||||
"default": "postlight-parser",
|
||||
"x-aliases": ["POSTLIGHT_PARSER_BINARY"],
|
||||
"description": "Path to Mercury/Postlight parser binary"
|
||||
},
|
||||
"NODE_BINARY": {
|
||||
"type": "string",
|
||||
"default": "node",
|
||||
"description": "Path to Node.js binary"
|
||||
},
|
||||
"MERCURY_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 30,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for Mercury in seconds"
|
||||
}
|
||||
}
|
||||
}
|
||||
201
archivebox/plugins/mercury/on_Snapshot__53_mercury.py
Normal file
201
archivebox/plugins/mercury/on_Snapshot__53_mercury.py
Normal file
@@ -0,0 +1,201 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract article content using Postlight's Mercury Parser.
|
||||
|
||||
Usage: on_Snapshot__53_mercury.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Creates mercury/ directory with content.html, content.txt, article.json
|
||||
|
||||
Environment variables:
|
||||
MERCURY_BINARY: Path to mercury-parser binary
|
||||
TIMEOUT: Timeout in seconds (default: 60)
|
||||
|
||||
Note: Requires mercury-parser: npm install -g @postlight/mercury-parser
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'mercury'
|
||||
BIN_NAME = 'mercury-parser'
|
||||
BIN_PROVIDERS = 'npm,env'
|
||||
OUTPUT_DIR = 'mercury'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
    """Fetch an environment variable (or `default` when unset), whitespace-stripped."""
    value = os.environ.get(name)
    if value is None:
        value = default
    return value.strip()
|
||||
|
||||
|
||||
def get_env_int(name: str, default: int = 0) -> int:
    """Integer-valued environment variable; anything unparsable yields `default`."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
|
||||
|
||||
|
||||
def find_mercury() -> str | None:
    """Locate the mercury-parser binary.

    Resolution order:
      1. MERCURY_BINARY env var — accepted as an existing file path OR as a
         bare command name resolved via PATH
      2. 'mercury-parser' then 'mercury' found on PATH

    Returns the resolved path, or None if nothing is found.
    """
    configured = get_env('MERCURY_BINARY')
    if configured:
        if os.path.isfile(configured):
            return configured
        # Fix: a bare command name (e.g. MERCURY_BINARY=postlight-parser) was
        # previously ignored because only isfile() was checked — try PATH too.
        resolved = shutil.which(configured)
        if resolved:
            return resolved

    for name in ('mercury-parser', 'mercury'):
        binary = shutil.which(name)
        if binary:
            return binary

    return None
|
||||
|
||||
|
||||
def get_version(binary: str) -> str:
    """Best-effort `--version` probe: first 64 chars of stdout, '' on any error."""
    try:
        out = subprocess.run(
            [binary, '--version'], capture_output=True, text=True, timeout=10,
        ).stdout
    except Exception:
        return ''
    return out.strip()[:64]
|
||||
|
||||
|
||||
def extract_mercury(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Extract article content from `url` using Mercury/Postlight Parser.

    Writes content.txt, content.html, and article.json (metadata minus the
    bulky content field) into the mercury/ output directory.

    Returns: (success, output_path, error_message)
    """
    # Fix: honor the plugin-specific MERCURY_TIMEOUT first (config.json declares
    # it with x-fallback: TIMEOUT); previously only the global TIMEOUT was read.
    timeout = get_env_int('MERCURY_TIMEOUT') or get_env_int('TIMEOUT', 60)

    # Create output directory
    output_dir = Path(OUTPUT_DIR)
    output_dir.mkdir(exist_ok=True)

    try:
        # Get text version (authoritative: its metadata is what we persist)
        cmd_text = [binary, url, '--format=text']
        result_text = subprocess.run(cmd_text, capture_output=True, timeout=timeout)

        if result_text.returncode != 0:
            stderr = result_text.stderr.decode('utf-8', errors='replace')
            return False, None, f'mercury-parser failed: {stderr[:200]}'

        try:
            text_json = json.loads(result_text.stdout)
        except json.JSONDecodeError:
            return False, None, 'mercury-parser returned invalid JSON'

        if text_json.get('failed'):
            return False, None, 'Mercury was not able to extract article'

        # Save text content ("content": null in the parser output must not crash write_text)
        text_content = text_json.get('content') or ''
        (output_dir / 'content.txt').write_text(text_content, encoding='utf-8')

        # Get HTML version (best-effort: bad output degrades to an empty content.html)
        cmd_html = [binary, url, '--format=html']
        result_html = subprocess.run(cmd_html, capture_output=True, timeout=timeout)

        try:
            html_json = json.loads(result_html.stdout)
        except json.JSONDecodeError:
            html_json = {}

        # Save HTML content and metadata
        html_content = html_json.pop('content', '') or ''
        (output_dir / 'content.html').write_text(html_content, encoding='utf-8')

        # Save article metadata (everything except the bulky content field)
        metadata = {k: v for k, v in text_json.items() if k != 'content'}
        (output_dir / 'article.json').write_text(json.dumps(metadata, indent=2), encoding='utf-8')

        return True, OUTPUT_DIR, ''

    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL to extract article from')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Extract article content using Postlight's Mercury Parser.

    Emits the hook protocol key=value lines (START_TS, END_TS, STATUS,
    RESULT_JSON, ...) on stdout; exits 0 on success, 1 otherwise.
    """

    start_ts = datetime.now(timezone.utc)
    version = ''
    output = None
    status = 'failed'
    error = ''
    binary = None

    try:
        # Find binary; a missing dependency is reported via the DEPENDENCY_NEEDED protocol.
        binary = find_mercury()
        if not binary:
            print(f'ERROR: mercury-parser binary not found', file=sys.stderr)
            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
            sys.exit(1)

        version = get_version(binary)

        # Run extraction
        success, output, error = extract_mercury(url, binary)
        status = 'succeeded' if success else 'failed'

        if success:
            # Summarize output sizes for the log.
            # NOTE(review): st_size is bytes, not characters — message slightly mislabels it.
            text_file = Path(output) / 'content.txt'
            html_file = Path(output) / 'content.html'
            text_len = text_file.stat().st_size if text_file.exists() else 0
            html_len = html_file.stat().st_size if html_file.exists() else 0
            print(f'Mercury extracted: {text_len} chars text, {html_len} chars HTML')

    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

    # Print results (hook protocol: key=value lines parsed by the runner)
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()

    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if binary:
        print(f'CMD={binary} {url}')
    if version:
        print(f'VERSION={version}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')

    if error:
        print(f'ERROR={error}', file=sys.stderr)

    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')

    sys.exit(0 if status == 'succeeded' else 1)
|
||||
|
||||
|
||||
# Entrypoint: ArchiveBox runs this hook as a standalone script.
if __name__ == '__main__':
    main()
|
||||
295
archivebox/plugins/merkletree/on_Snapshot__93_merkletree.py
Executable file
295
archivebox/plugins/merkletree/on_Snapshot__93_merkletree.py
Executable file
@@ -0,0 +1,295 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Create a Merkle tree of all archived outputs.
|
||||
|
||||
This plugin runs after all extractors and post-processing complete (priority 93)
|
||||
and generates a cryptographic Merkle tree of all files in the snapshot directory.
|
||||
This provides:
|
||||
- Tamper detection: verify archive integrity
|
||||
- Efficient updates: only re-hash changed files
|
||||
- Compact proofs: prove file inclusion without sending all files
|
||||
- Deduplication: identify identical content across snapshots
|
||||
|
||||
Output: merkletree/merkletree.json containing:
|
||||
- root_hash: SHA256 hash of the Merkle root
|
||||
- tree: Full tree structure with internal nodes
|
||||
- files: List of all files with their hashes
|
||||
- metadata: Timestamp, file count, total size
|
||||
|
||||
Usage: on_Snapshot__93_merkletree.py --url=<url> --snapshot-id=<uuid>
|
||||
|
||||
Environment variables:
|
||||
SAVE_MERKLETREE: Enable merkle tree generation (default: true)
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.plugins.merkletree'
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Tuple, Any
|
||||
|
||||
# Configure Django if running standalone
|
||||
# Configure Django if running standalone: make the archivebox package root
# importable and initialize Django settings before any ORM/model access.
if __name__ == '__main__':
    parent_dir = str(Path(__file__).resolve().parent.parent.parent)
    if parent_dir not in sys.path:
        sys.path.insert(0, parent_dir)  # ensure 'archivebox' resolves when invoked from anywhere
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
    import django
    django.setup()
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
def sha256_file(filepath: Path) -> str:
    """Return the hex SHA256 digest of a file, or a null hash if unreadable."""
    digest = hashlib.sha256()
    try:
        with open(filepath, 'rb') as handle:
            # Stream in 64 KiB chunks so large archives don't blow up memory.
            for block in iter(lambda: handle.read(65536), b''):
                digest.update(block)
    except (OSError, PermissionError):
        # Unreadable file: stand in with an all-zero digest rather than failing.
        return '0' * 64
    return digest.hexdigest()
|
||||
|
||||
|
||||
def sha256_data(data: bytes) -> str:
    """Return the hex SHA256 digest of a raw bytes payload."""
    digest = hashlib.sha256(data)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def collect_files(snapshot_dir: Path, exclude_dirs: Optional[List[str]] = None) -> List[Tuple[Path, str, int]]:
    """
    Recursively collect all files in snapshot directory.

    Args:
        snapshot_dir: Root directory to scan
        exclude_dirs: Directory names to exclude (e.g., ['merkletree', '.git'])

    Returns:
        List of (relative_path, sha256_hash, file_size) tuples, sorted by
        path so the resulting Merkle tree is deterministic.
    """
    exclude_dirs = exclude_dirs or ['merkletree', '.git', '__pycache__']
    files: List[Tuple[Path, str, int]] = []

    for root, dirs, filenames in os.walk(snapshot_dir):
        # Prune excluded directories in-place so os.walk never descends into them.
        dirs[:] = [d for d in dirs if d not in exclude_dirs]

        for filename in filenames:
            filepath = Path(root) / filename
            rel_path = filepath.relative_to(snapshot_dir)

            # Skip symlinks entirely: links are neither hashed nor counted,
            # so they cannot pull content from outside the snapshot into the tree.
            # (The original comment claimed the target was hashed; it is not.)
            if filepath.is_symlink():
                continue

            # Hash first, then stat. The file may disappear between the two
            # calls (TOCTOU), so a failed stat falls back to size 0 instead of
            # crashing the whole tree build.
            file_hash = sha256_file(filepath)
            try:
                file_size = filepath.stat().st_size
            except OSError:
                file_size = 0

            files.append((rel_path, file_hash, file_size))

    # Sort by path so identical directory contents always produce the same tree.
    files.sort(key=lambda x: str(x[0]))
    return files
|
||||
|
||||
|
||||
def build_merkle_tree(file_hashes: List[str]) -> Tuple[str, List[List[str]]]:
    """
    Build a Merkle tree from a list of leaf hashes.

    Args:
        file_hashes: List of SHA256 hashes (leaves)

    Returns:
        (root_hash, tree_levels) where tree_levels is a list of hash lists
        per level, from the leaves up to the single root.
    """
    if not file_hashes:
        # An empty archive hashes to the digest of the empty byte string.
        return sha256_data(b''), [[]]

    # Level 0 is a defensive copy of the leaves.
    levels: List[List[str]] = [list(file_hashes)]

    # Repeatedly combine adjacent pairs until one root hash remains.
    while len(levels[-1]) > 1:
        below = levels[-1]
        above: List[str] = []
        for idx in range(0, len(below), 2):
            lhs = below[idx]
            # An odd trailing node is paired with itself (standard Merkle padding).
            rhs = below[idx + 1] if idx + 1 < len(below) else lhs
            above.append(sha256_data((lhs + rhs).encode('utf-8')))
        levels.append(above)

    return levels[-1][0], levels
|
||||
|
||||
|
||||
def create_merkle_tree(snapshot_dir: Path) -> Dict[str, Any]:
    """
    Create a complete Merkle tree of all files in snapshot directory.

    Args:
        snapshot_dir: The snapshot directory to scan

    Returns:
        Dict containing root_hash, tree structure, file list, and metadata
    """
    entries = collect_files(snapshot_dir)

    # Leaves of the tree are the per-file hashes, already in sorted-path order.
    leaf_hashes = [file_hash for _, file_hash, _ in entries]
    root_hash, tree_levels = build_merkle_tree(leaf_hashes)

    # Flatten entries into JSON-friendly records while summing total bytes.
    total_size = 0
    file_list = []
    for rel_path, file_hash, size in entries:
        total_size += size
        file_list.append({
            'path': str(rel_path),
            'hash': file_hash,
            'size': size,
        })

    return {
        'root_hash': root_hash,
        'tree_levels': tree_levels,
        'files': file_list,
        'metadata': {
            'timestamp': datetime.now().isoformat(),
            'file_count': len(entries),
            'total_size': total_size,
            'tree_depth': len(tree_levels),
        },
    }
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL being archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Generate Merkle tree of all archived outputs."""
    # Imported lazily so Django setup (done in the __main__ guard) has run first.
    from archivebox.core.models import Snapshot

    start_ts = datetime.now()
    # Pessimistic defaults: anything short of a clean run reports 'failed'.
    status = 'failed'
    output = None
    error = ''
    root_hash = None
    file_count = 0

    try:
        # Check if enabled
        save_merkletree = os.getenv('SAVE_MERKLETREE', 'true').lower() in ('true', '1', 'yes', 'on')

        if not save_merkletree:
            # Disabled via env: emit the same machine-readable trailer the
            # success path prints, then exit 0 (skipped is not a failure).
            # NOTE(review): this RESULT_JSON is built with an f-string, so a
            # url containing '"' would produce invalid JSON — confirm inputs
            # are pre-sanitized or switch to json.dumps.
            click.echo('Skipping merkle tree (SAVE_MERKLETREE=False)')
            status = 'skipped'
            end_ts = datetime.now()
            click.echo(f'START_TS={start_ts.isoformat()}')
            click.echo(f'END_TS={end_ts.isoformat()}')
            click.echo(f'STATUS={status}')
            click.echo(f'RESULT_JSON={{"extractor": "merkletree", "status": "{status}", "url": "{url}", "snapshot_id": "{snapshot_id}"}}')
            sys.exit(0)

        # Get snapshot
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
        except Snapshot.DoesNotExist:
            # Re-raise as ValueError so the generic handler below records it.
            error = f'Snapshot {snapshot_id} not found'
            raise ValueError(error)

        # Get snapshot directory
        snapshot_dir = Path(snapshot.output_dir)
        if not snapshot_dir.exists():
            error = f'Snapshot directory not found: {snapshot_dir}'
            raise FileNotFoundError(error)

        # Create output directory
        output_dir = snapshot_dir / 'merkletree'
        output_dir.mkdir(exist_ok=True)
        output_path = output_dir / 'merkletree.json'

        # Generate Merkle tree over everything in the snapshot dir
        # (the merkletree/ output dir itself is excluded by collect_files).
        merkle_data = create_merkle_tree(snapshot_dir)

        # Write output
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(merkle_data, f, indent=2)

        status = 'succeeded'
        output = str(output_path)
        root_hash = merkle_data['root_hash']
        file_count = merkle_data['metadata']['file_count']
        total_size = merkle_data['metadata']['total_size']

        click.echo(f'Merkle tree created: {file_count} files, root={root_hash[:16]}..., size={total_size:,} bytes')

    except Exception as e:
        # Catch-all: record the error and fall through so the standard
        # START_TS/END_TS/RESULT_JSON trailer is still printed on failure.
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
        click.echo(f'Error: {error}', err=True)

    end_ts = datetime.now()
    duration = (end_ts - start_ts).total_seconds()

    # Print results (key=value lines parsed by the orchestrator)
    click.echo(f'START_TS={start_ts.isoformat()}')
    click.echo(f'END_TS={end_ts.isoformat()}')
    click.echo(f'DURATION={duration:.2f}')
    if output:
        click.echo(f'OUTPUT={output}')
    click.echo(f'STATUS={status}')

    if error:
        click.echo(f'ERROR={error}', err=True)

    # Print JSON result (single-line machine-readable summary)
    result_json = {
        'extractor': 'merkletree',
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'root_hash': root_hash,
        'file_count': file_count,
        'error': error or None,
    }
    click.echo(f'RESULT_JSON={json.dumps(result_json)}')

    # Exit code signals success to the plugin runner; 'skipped' exits earlier.
    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()
|
||||
@@ -0,0 +1,76 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install a binary using npm package manager.
|
||||
|
||||
Usage: on_Dependency__install_using_npm_provider.py --dependency-id=<uuid> --bin-name=<name> [--custom-cmd=<cmd>]
|
||||
Output: InstalledBinary JSONL record to stdout after installation
|
||||
|
||||
Environment variables:
|
||||
MACHINE_ID: Machine UUID (set by orchestrator)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
import rich_click as click
|
||||
from abx_pkg import Binary, NpmProvider, BinProviderOverrides
|
||||
|
||||
# Fix pydantic forward reference issue
|
||||
NpmProvider.model_rebuild()
|
||||
|
||||
|
||||
@click.command()
@click.option('--dependency-id', required=True, help="Dependency UUID")
@click.option('--bin-name', required=True, help="Binary name to install")
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', default=None, help="Custom install command")
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None):
    """Install binary using npm."""
    # NOTE(review): custom_cmd is accepted but never used below — confirm
    # whether custom npm install commands should be honored here.

    # Exit 0 (not an error) when npm is simply not in the allowed provider list:
    # another provider plugin is expected to handle this dependency instead.
    if bin_providers != '*' and 'npm' not in bin_providers.split(','):
        click.echo(f"npm provider not allowed for {bin_name}", err=True)
        sys.exit(0)

    # Use abx-pkg NpmProvider to install binary
    provider = NpmProvider()
    if not provider.INSTALLER_BIN:
        # npm itself is missing: hard failure, nothing we can do.
        click.echo("npm not available on this system", err=True)
        sys.exit(1)

    click.echo(f"Installing {bin_name} via npm...", err=True)

    try:
        binary = Binary(name=bin_name, binproviders=[provider]).install()
    except Exception as e:
        click.echo(f"npm install failed: {e}", err=True)
        sys.exit(1)

    # Install "succeeded" but the binary is not resolvable on disk.
    if not binary.abspath:
        click.echo(f"{bin_name} not found after npm install", err=True)
        sys.exit(1)

    # MACHINE_ID is provided by the orchestrator; empty string when run standalone.
    machine_id = os.environ.get('MACHINE_ID', '')

    # Output InstalledBinary JSONL record to stdout
    # (stdout is reserved for machine-readable records; all logs go to stderr)
    record = {
        'type': 'InstalledBinary',
        'name': bin_name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'npm',
        'machine_id': machine_id,
        'dependency_id': dependency_id,
    }
    print(json.dumps(record))

    # Log human-readable info to stderr
    click.echo(f"Installed {bin_name} at {binary.abspath}", err=True)
    click.echo(f"  version: {binary.version}", err=True)

    sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
281
archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js
Executable file
281
archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js
Executable file
@@ -0,0 +1,281 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Extract and categorize outgoing links from a page's DOM.
|
||||
*
|
||||
* Categorizes links by type:
|
||||
* - hrefs: All <a> links
|
||||
* - images: <img src>
|
||||
* - css_stylesheets: <link rel=stylesheet>
|
||||
* - css_images: CSS background-image: url()
|
||||
* - js_scripts: <script src>
|
||||
* - iframes: <iframe src>
|
||||
* - links: <link> tags with rel/href
|
||||
*
|
||||
* Usage: on_Snapshot__40_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes parse_dom_outlinks/outlinks.json and parse_dom_outlinks/urls.jsonl
|
||||
*
|
||||
* Environment variables:
|
||||
* SAVE_DOM_OUTLINKS: Enable DOM outlinks extraction (default: true)
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'parse_dom_outlinks';
|
||||
const OUTPUT_DIR = 'parse_dom_outlinks';
|
||||
const OUTPUT_FILE = 'outlinks.json';
|
||||
const URLS_FILE = 'urls.jsonl'; // For crawl system
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
// Parse `--key=value` CLI flags into an object keyed by snake_case names.
// A flag without a value (`--flag`) is stored as boolean `true`; values
// containing '=' are preserved intact. Non-flag arguments are ignored.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...rest] = token.slice(2).split('=');
    const value = rest.join('=');
    parsed[rawKey.replace(/-/g, '_')] = value || true;
  }
  return parsed;
}
|
||||
|
||||
// Get environment variable with default
|
||||
// Read an environment variable (trimmed). Falls back to defaultValue when
// the variable is unset OR empty (|| is intentional, not ??).
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}

// Interpret an environment variable as a boolean flag. Recognizes the usual
// true/false spellings (case-insensitive, whitespace-tolerant); any other
// value — including unset — yields defaultValue.
function getEnvBool(name, defaultValue = false) {
  const text = getEnv(name, '').toLowerCase();
  if (['true', '1', 'yes', 'on'].includes(text)) return true;
  if (['false', '0', 'no', 'off'].includes(text)) return false;
  return defaultValue;
}
|
||||
|
||||
// Get CDP URL from chrome_session
|
||||
// Look up the Chrome DevTools Protocol websocket URL written by the
// chrome_session extractor. Returns null when no session marker file exists.
function getCdpUrl() {
  const markerFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(markerFile)) {
    return null;
  }
  return fs.readFileSync(markerFile, 'utf8').trim();
}
|
||||
|
||||
// Extract outlinks
|
||||
// Connect to the already-running Chrome session, walk the live DOM of the
// loaded page, and write two artifacts:
//   - parse_dom_outlinks/outlinks.json  (full categorized link inventory)
//   - parse_dom_outlinks/urls.jsonl     (crawlable hrefs for the crawl system)
// Returns { success, output, outlinksData, crawlableCount } on success or
// { success: false, error } on any failure (never throws).
async function extractOutlinks(url) {
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

  let browser = null;

  try {
    // Connect to existing Chrome session (started by the chrome_session extractor)
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
    }

    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });

    // Get the page: prefer the tab already showing an http(s) URL,
    // otherwise fall back to whatever tab exists.
    const pages = await browser.pages();
    const page = pages.find(p => p.url().startsWith('http')) || pages[0];

    if (!page) {
      return { success: false, error: 'No page found in Chrome session' };
    }

    // Extract outlinks by category — everything inside evaluate() runs in the
    // page context, so it can only use browser globals (document, window).
    const outlinksData = await page.evaluate(() => {
      const LINK_REGEX = /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)/gi;

      // Drop inline data: URIs and w3.org namespace URIs from results.
      const filterDataUrls = (urls) => urls.filter(url => url && !url.startsWith('data:'));
      const filterW3Urls = (urls) => urls.filter(url => url && !url.startsWith('http://www.w3.org/'));

      // Get raw links by regex-scanning the serialized HTML (catches URLs in
      // text/attributes that aren't in dedicated link elements).
      const html = document.documentElement.outerHTML;
      const raw = Array.from(html.matchAll(LINK_REGEX)).map(m => m[0]);

      // Get all <a href> links (elem.href is already absolutized by the browser)
      const hrefs = Array.from(document.querySelectorAll('a[href]'))
        .map(elem => elem.href)
        .filter(url => url);

      // Get all <link> tags (not just stylesheets); keyed by href so
      // duplicates collapse, keeping the last rel seen for each href.
      const linksMap = {};
      document.querySelectorAll('link[href]').forEach(elem => {
        const rel = elem.rel || '';
        const href = elem.href;
        if (href && rel !== 'stylesheet') {
          linksMap[href] = { rel, href };
        }
      });
      const links = Object.values(linksMap);

      // Get iframes
      const iframes = Array.from(document.querySelectorAll('iframe[src]'))
        .map(elem => elem.src)
        .filter(url => url);

      // Get images (inline data: URIs excluded)
      const images = Array.from(document.querySelectorAll('img[src]'))
        .map(elem => elem.src)
        .filter(url => url && !url.startsWith('data:'));

      // Get CSS background images by checking the computed style of every
      // element (expensive on huge pages, but catches styles from any sheet).
      const css_images = Array.from(document.querySelectorAll('*'))
        .map(elem => {
          const bgImg = window.getComputedStyle(elem).getPropertyValue('background-image');
          const match = /url\(\s*?['"]?\s*?(\S+?)\s*?["']?\s*?\)/i.exec(bgImg);
          return match ? match[1] : null;
        })
        .filter(url => url);

      // Get stylesheets
      const css_stylesheets = Array.from(document.querySelectorAll('link[rel=stylesheet]'))
        .map(elem => elem.href)
        .filter(url => url);

      // Get JS scripts
      const js_scripts = Array.from(document.querySelectorAll('script[src]'))
        .map(elem => elem.src)
        .filter(url => url);

      // Spread-through-Set deduplicates each category while keeping order.
      return {
        url: window.location.href,
        raw: [...new Set(filterDataUrls(filterW3Urls(raw)))],
        hrefs: [...new Set(filterDataUrls(hrefs))],
        links,
        iframes: [...new Set(iframes)],
        images: [...new Set(filterDataUrls(images))],
        css_images: [...new Set(filterDataUrls(css_images))],
        css_stylesheets: [...new Set(filterDataUrls(css_stylesheets))],
        js_scripts: [...new Set(filterDataUrls(js_scripts))],
      };
    });

    // Write detailed output (for archival)
    fs.writeFileSync(outputPath, JSON.stringify(outlinksData, null, 2));

    // Write urls.jsonl for crawl system (only hrefs that are crawlable pages)
    const urlsPath = path.join(OUTPUT_DIR, URLS_FILE);
    const crawlableUrls = outlinksData.hrefs.filter(href => {
      // Only include http/https URLs, exclude static assets
      if (!href.startsWith('http://') && !href.startsWith('https://')) return false;
      // Exclude common static file extensions (checked against the URL path
      // with query string and fragment stripped off first)
      const staticExts = ['.css', '.js', '.png', '.jpg', '.jpeg', '.gif', '.svg', '.ico', '.woff', '.woff2', '.ttf', '.eot', '.mp4', '.webm', '.mp3', '.pdf'];
      const urlPath = href.split('?')[0].split('#')[0].toLowerCase();
      return !staticExts.some(ext => urlPath.endsWith(ext));
    });

    // One JSON object per line, matching the crawl system's Snapshot schema.
    const urlsJsonl = crawlableUrls.map(href => JSON.stringify({
      type: 'Snapshot',
      url: href,
      via_extractor: EXTRACTOR_NAME,
    })).join('\n');

    // Only create urls.jsonl when there is at least one crawlable URL.
    if (urlsJsonl) {
      fs.writeFileSync(urlsPath, urlsJsonl + '\n');
    }

    return { success: true, output: outputPath, outlinksData, crawlableCount: crawlableUrls.length };

  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Disconnect only — never close: the shared Chrome session is owned by
    // the chrome_session extractor and may be used by later extractors.
    if (browser) {
      browser.disconnect();
    }
  }
}
|
||||
|
||||
// CLI entry point: validates args, runs the extraction, and prints the
// key=value trailer lines (START_TS/END_TS/STATUS/RESULT_JSON) that the
// plugin orchestrator parses from stdout.
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;

  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__40_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }

  const startTs = new Date();
  // Pessimistic defaults: anything short of a clean run reports 'failed'.
  let status = 'failed';
  let output = null;
  let error = '';

  try {
    // Check if enabled; disabled is a 'skipped' (exit 0), not a failure.
    if (!getEnvBool('SAVE_DOM_OUTLINKS', true)) {
      console.log('Skipping DOM outlinks (SAVE_DOM_OUTLINKS=False)');
      status = 'skipped';
      const endTs = new Date();
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${endTs.toISOString()}`);
      console.log(`STATUS=${status}`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
      process.exit(0);
    }

    const result = await extractOutlinks(url);

    if (result.success) {
      status = 'succeeded';
      output = result.output;
      // Summarize counts for the human-readable log line.
      const total = result.outlinksData.hrefs.length;
      const crawlable = result.crawlableCount;
      const images = result.outlinksData.images.length;
      const scripts = result.outlinksData.js_scripts.length;
      console.log(`DOM outlinks extracted: ${total} links (${crawlable} crawlable), ${images} images, ${scripts} scripts`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    // extractOutlinks catches its own errors; this guards everything else.
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }

  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;

  // Print results (key=value lines parsed by the orchestrator)
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);

  if (error) {
    console.error(`ERROR=${error}`);
  }

  // Print JSON result (single-line machine-readable summary)
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);

  // Exit code signals success to the plugin runner ('skipped' exited earlier).
  process.exit(status === 'succeeded' ? 0 : 1);
}
|
||||
|
||||
// Entry point: surface any unhandled rejection as a fatal error and exit 1.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});
|
||||
188
archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py
Executable file
188
archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py
Executable file
@@ -0,0 +1,188 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Parse HTML files and extract href URLs.
|
||||
|
||||
This is a standalone extractor that can run without ArchiveBox.
|
||||
It reads HTML content and extracts all <a href="..."> URLs.
|
||||
|
||||
NOTE: If parse_dom_outlinks already ran (parse_dom_outlinks/urls.jsonl exists),
|
||||
this extractor will skip since parse_dom_outlinks provides better coverage via Chrome.
|
||||
|
||||
Usage: ./on_Snapshot__60_parse_html_urls.py --url=<url>
|
||||
Output: Appends discovered URLs to urls.jsonl in current directory
|
||||
|
||||
Examples:
|
||||
./on_Snapshot__60_parse_html_urls.py --url=file:///path/to/page.html
|
||||
./on_Snapshot__60_parse_html_urls.py --url=https://example.com/page.html
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from html import unescape
|
||||
from html.parser import HTMLParser
|
||||
from pathlib import Path
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
import rich_click as click
|
||||
|
||||
# Name recorded in the `via_extractor` field of every emitted URL record.
EXTRACTOR_NAME = 'parse_html_urls'

# Check if parse_dom_outlinks extractor already ran
# (path is relative to the snapshot cwd this script is executed from).
DOM_OUTLINKS_URLS_FILE = Path('parse_dom_outlinks/urls.jsonl')


# URL regex from archivebox/misc/util.py
# The `(?=(...))` lookahead wrapper lets overlapping URL candidates all match.
# NOTE(review): URL_REGEX appears unused in this module (main() relies on
# HrefParser instead) — confirm before removing.
URL_REGEX = re.compile(
    r'(?=('
    r'http[s]?://'
    r'(?:[a-zA-Z]|[0-9]'
    r'|[-_$@.&+!*\(\),]'
    r'|[^\u0000-\u007F])+'
    r'[^\]\[<>"\'\s]+'
    r'))',
    re.IGNORECASE | re.UNICODE,
)
|
||||
|
||||
|
||||
class HrefParser(HTMLParser):
    """HTML parser that collects the href attribute of every <a> tag."""

    def __init__(self):
        super().__init__()
        # Hrefs in document order; duplicates are preserved here and
        # deduplicated by the caller.
        self.urls = []

    def handle_starttag(self, tag, attrs):
        # Only anchor tags contribute; <link>, <area>, etc. are ignored.
        if tag != 'a':
            return
        for name, value in attrs:
            if name == 'href' and value:
                self.urls.append(value)
|
||||
|
||||
|
||||
def did_urljoin_misbehave(root_url: str, relative_path: str, final_url: str) -> bool:
    """Detect urljoin() collapsing an embedded sub-URL's '://' down to ':/'.

    Returns True when either input contained a nested '://' (beyond its
    outer scheme) but the joined result no longer does.
    """
    # Strip the leading scheme from the relative path so only *embedded*
    # schemes count as sub-URLs.
    path = relative_path.lower()
    if path.startswith(('http://', 'https://')):
        path = path.split('://', 1)[-1]

    # Slicing off the first 8 chars skips past the outer 'https://' so the
    # top-level scheme never counts as a sub-URL.
    had_suburl = ('://' in path) or ('://' in root_url[8:])
    still_has_suburl = '://' in final_url[8:]

    return had_suburl and not still_has_suburl
|
||||
|
||||
|
||||
def fix_urljoin_bug(url: str, nesting_limit=5) -> str:
    """Fix broken sub-URLs where '://' was collapsed to ':/' by urljoin().

    Repeatedly rewrites `<sep><scheme>:/<host>` to `<sep><scheme>://<host>`
    until the URL stops changing or nesting_limit passes have run (bounding
    how deeply nested sub-URLs can be repaired).

    Args:
        url: The (possibly mangled) URL to repair.
        nesting_limit: Maximum number of repair passes.

    Returns:
        The repaired URL (unchanged if nothing matched).
    """
    input_url = url
    for _ in range(nesting_limit):
        # BUG FIX: re.sub()'s 4th positional argument is `count`, not `flags`.
        # The original passed the flags positionally, silently setting
        # count=34 and applying no flags at all; pass flags= by keyword.
        url = re.sub(
            r'(?P<root>.+?)'
            r'(?P<separator>[-=/_&+%$#@!*\(\\])'
            r'(?P<subscheme>[a-zA-Z0-9+_-]{1,32}?):/'
            r'(?P<suburl>[^/\\]+)',
            r'\1\2\3://\4',
            input_url,
            flags=re.IGNORECASE | re.UNICODE,
        )
        if url == input_url:
            break
        input_url = url
    return url
|
||||
|
||||
|
||||
def normalize_url(url: str, root_url: str = None) -> str:
    """Normalize a URL, resolving it against root_url when it is relative.

    Absolute http(s) URLs pass through untouched; relative ones are joined
    onto root_url, then repaired if urljoin mangled an embedded sub-URL.
    """
    if not root_url:
        return url

    lowered = url.lower()
    if lowered.startswith('http://') or lowered.startswith('https://'):
        # Already absolute: nothing to resolve.
        return url

    # Resolve relative URL against the page it came from.
    joined = urljoin(root_url, url)

    # urljoin can collapse '://' inside archive-style nested URLs;
    # detect and undo that mangling.
    if did_urljoin_misbehave(root_url, url, joined):
        joined = fix_urljoin_bug(joined)

    return joined
|
||||
|
||||
|
||||
def fetch_content(url: str) -> str:
    """Fetch text content from a file:// or http(s):// URL.

    Undecodable bytes are replaced rather than raising, so arbitrary pages
    always yield a usable string.
    """
    parsed = urlparse(url)

    if parsed.scheme == 'file':
        # Local file: read straight from the URL's path component.
        with open(parsed.path, 'r', encoding='utf-8', errors='replace') as fh:
            return fh.read()

    # Remote fetch honors the same TIMEOUT / USER_AGENT env vars as the
    # other extractors.
    timeout = int(os.environ.get('TIMEOUT', '60'))
    user_agent = os.environ.get('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')

    import urllib.request
    request = urllib.request.Request(url, headers={'User-Agent': user_agent})
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode('utf-8', errors='replace')
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='HTML URL to parse')
def main(url: str):
    """Parse HTML and extract href URLs."""

    # Skip only if parse_dom_outlinks already ran AND found URLs (it uses Chrome for better coverage)
    # If parse_dom_outlinks ran but found nothing, we still try static HTML parsing as fallback
    if DOM_OUTLINKS_URLS_FILE.exists() and DOM_OUTLINKS_URLS_FILE.stat().st_size > 0:
        click.echo(f'Skipping parse_html_urls - parse_dom_outlinks already extracted URLs')
        sys.exit(0)

    # Download (or read) the page; any fetch failure is fatal for this extractor.
    try:
        content = fetch_content(url)
    except Exception as e:
        click.echo(f'Failed to fetch {url}: {e}', err=True)
        sys.exit(1)

    # Parse HTML for hrefs (HrefParser collects every <a href> in order)
    parser = HrefParser()
    try:
        parser.feed(content)
    except Exception as e:
        click.echo(f'Failed to parse HTML: {e}', err=True)
        sys.exit(1)

    # Set deduplicates hrefs that appear multiple times on the page.
    urls_found = set()
    for href in parser.urls:
        # Normalize URL (resolves relative hrefs against the page URL)
        normalized = normalize_url(href, root_url=url)

        # Only include http/https URLs (drops mailto:, javascript:, tel:, etc.)
        if normalized.lower().startswith('http://') or normalized.lower().startswith('https://'):
            # Skip the source URL itself
            if normalized != url:
                # unescape decodes HTML entities like &amp; in query strings.
                urls_found.add(unescape(normalized))

    # No outgoing links is treated as a failure (exit 1, no urls.jsonl written).
    if not urls_found:
        click.echo('No URLs found', err=True)
        sys.exit(1)

    # Write urls.jsonl — one Snapshot record per line, sorted for determinism.
    with open('urls.jsonl', 'w') as f:
        for found_url in sorted(urls_found):
            f.write(json.dumps({
                'type': 'Snapshot',
                'url': found_url,
                'via_extractor': EXTRACTOR_NAME,
            }) + '\n')

    click.echo(f'Found {len(urls_found)} URLs')
    sys.exit(0)
main()
|
||||
240
archivebox/plugins/parse_html_urls/tests/test_parse_html_urls.py
Normal file
240
archivebox/plugins/parse_html_urls/tests/test_parse_html_urls.py
Normal file
@@ -0,0 +1,240 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Unit tests for parse_html_urls extractor."""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_html_urls.py'), None)
|
||||
|
||||
|
||||
class TestParseHtmlUrls:
|
||||
"""Test the parse_html_urls extractor CLI."""
|
||||
|
||||
def test_parses_real_example_com(self, tmp_path):
|
||||
"""Test parsing real https://example.com and extracting its links."""
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(SCRIPT_PATH), '--url', 'https://example.com'],
|
||||
cwd=tmp_path,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Failed to parse example.com: {result.stderr}"
|
||||
|
||||
output_file = tmp_path / 'urls.jsonl'
|
||||
assert output_file.exists(), "Output file not created"
|
||||
|
||||
# Verify output contains IANA link (example.com links to iana.org)
|
||||
content = output_file.read_text()
|
||||
assert 'iana.org' in content or 'example' in content, "Expected links from example.com not found"
|
||||
|
||||
def test_extracts_href_urls(self, tmp_path):
|
||||
"""Test extracting URLs from anchor tags."""
|
||||
input_file = tmp_path / 'page.html'
|
||||
input_file.write_text('''
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<body>
|
||||
<a href="https://example.com">Example</a>
|
||||
<a href="https://foo.bar/page">Foo</a>
|
||||
<a href="http://test.org">Test</a>
|
||||
</body>
|
||||
</html>
|
||||
''')
|
||||
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
|
||||
cwd=tmp_path,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
assert 'Found 3 URLs' in result.stdout
|
||||
|
||||
output_file = tmp_path / 'urls.jsonl'
|
||||
assert output_file.exists()
|
||||
|
||||
lines = output_file.read_text().strip().split('\n')
|
||||
assert len(lines) == 3
|
||||
|
||||
urls = set()
|
||||
for line in lines:
|
||||
entry = json.loads(line)
|
||||
assert 'url' in entry
|
||||
urls.add(entry['url'])
|
||||
|
||||
assert 'https://example.com' in urls
|
||||
assert 'https://foo.bar/page' in urls
|
||||
assert 'http://test.org' in urls
|
||||
|
||||
def test_ignores_non_http_schemes(self, tmp_path):
|
||||
"""Test that non-http schemes are ignored."""
|
||||
input_file = tmp_path / 'page.html'
|
||||
input_file.write_text('''
|
||||
<html>
|
||||
<body>
|
||||
<a href="mailto:test@example.com">Email</a>
|
||||
<a href="javascript:void(0)">JS</a>
|
||||
<a href="tel:+1234567890">Phone</a>
|
||||
<a href="https://valid.com">Valid</a>
|
||||
</body>
|
||||
</html>
|
||||
''')
|
||||
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
|
||||
cwd=tmp_path,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
output_file = tmp_path / 'urls.jsonl'
|
||||
lines = output_file.read_text().strip().split('\n')
|
||||
assert len(lines) == 1
|
||||
|
||||
entry = json.loads(lines[0])
|
||||
assert entry['url'] == 'https://valid.com'
|
||||
|
||||
def test_handles_html_entities(self, tmp_path):
|
||||
"""Test that HTML entities in URLs are decoded."""
|
||||
input_file = tmp_path / 'page.html'
|
||||
input_file.write_text('''
|
||||
<html>
|
||||
<body>
|
||||
<a href="https://example.com/page?a=1&b=2">Link</a>
|
||||
</body>
|
||||
</html>
|
||||
''')
|
||||
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
|
||||
cwd=tmp_path,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
output_file = tmp_path / 'urls.jsonl'
|
||||
entry = json.loads(output_file.read_text().strip())
|
||||
assert entry['url'] == 'https://example.com/page?a=1&b=2'
|
||||
|
||||
def test_deduplicates_urls(self, tmp_path):
|
||||
"""Test that duplicate URLs are deduplicated."""
|
||||
input_file = tmp_path / 'page.html'
|
||||
input_file.write_text('''
|
||||
<html>
|
||||
<body>
|
||||
<a href="https://example.com">Link 1</a>
|
||||
<a href="https://example.com">Link 2</a>
|
||||
<a href="https://example.com">Link 3</a>
|
||||
</body>
|
||||
</html>
|
||||
''')
|
||||
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
|
||||
cwd=tmp_path,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
)
|
||||
|
||||
assert result.returncode == 0
|
||||
output_file = tmp_path / 'urls.jsonl'
|
||||
lines = output_file.read_text().strip().split('\n')
|
||||
assert len(lines) == 1
|
||||
|
||||
def test_excludes_source_url(self, tmp_path):
    """Test that the source URL itself is excluded from results."""
    page = tmp_path / 'page.html'
    source_url = f'file://{page}'
    page.write_text(f'''
<html>
<body>
<a href="{source_url}">Self</a>
<a href="https://other.com">Other</a>
</body>
</html>
''')

    proc = subprocess.run(
        [sys.executable, str(SCRIPT_PATH), '--url', source_url],
        cwd=tmp_path, capture_output=True, text=True,
    )
    assert proc.returncode == 0

    # Only the external link survives; the page's own URL is dropped.
    records = (tmp_path / 'urls.jsonl').read_text().strip().split('\n')
    assert len(records) == 1
    assert json.loads(records[0])['url'] == 'https://other.com'
|
||||
|
||||
def test_exits_1_when_no_urls_found(self, tmp_path):
    """Test that script exits with code 1 when no URLs found."""
    page = tmp_path / 'page.html'
    page.write_text('<html><body>No links here</body></html>')

    proc = subprocess.run(
        [sys.executable, str(SCRIPT_PATH), '--url', f'file://{page}'],
        cwd=tmp_path, capture_output=True, text=True,
    )

    # A link-free page is reported as a failure with a message on stderr.
    assert proc.returncode == 1
    assert 'No URLs found' in proc.stderr
|
||||
|
||||
def test_handles_malformed_html(self, tmp_path):
    """Test handling of malformed HTML."""
    page = tmp_path / 'malformed.html'
    page.write_text('''
<html>
<body>
<a href="https://example.com">Unclosed tag
<a href="https://other.com">Another link</a>
</body>
''')

    proc = subprocess.run(
        [sys.executable, str(SCRIPT_PATH), '--url', f'file://{page}'],
        cwd=tmp_path, capture_output=True, text=True,
    )
    assert proc.returncode == 0

    # Both links are recovered despite the unclosed tag and missing </html>.
    records = (tmp_path / 'urls.jsonl').read_text().strip().split('\n')
    assert len(records) == 2
|
||||
|
||||
def test_output_is_valid_json(self, tmp_path):
    """Test that output contains required fields."""
    page = tmp_path / 'page.html'
    page.write_text('<a href="https://example.com">Link</a>')

    proc = subprocess.run(
        [sys.executable, str(SCRIPT_PATH), '--url', f'file://{page}'],
        cwd=tmp_path, capture_output=True, text=True,
    )
    assert proc.returncode == 0

    # Each record is a JSON object carrying the url plus schema metadata.
    record = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
    assert record['url'] == 'https://example.com'
    assert 'type' in record
    assert 'via_extractor' in record
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Allow running this test module directly (python <file>) in addition
    # to normal pytest discovery.
    pytest.main([__file__, '-v'])
|
||||
184
archivebox/plugins/parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py
Executable file
184
archivebox/plugins/parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py
Executable file
@@ -0,0 +1,184 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Parse JSONL bookmark files and extract URLs.
|
||||
|
||||
This is a standalone extractor that can run without ArchiveBox.
|
||||
It reads JSONL-format bookmark exports (one JSON object per line).
|
||||
|
||||
Usage: ./on_Snapshot__64_parse_jsonl_urls.py --url=<url>
|
||||
Output: Appends discovered URLs to urls.jsonl in current directory
|
||||
|
||||
Expected JSONL format (one object per line):
|
||||
{"url": "https://example.com", "title": "Example", "tags": "tag1,tag2"}
|
||||
{"href": "https://other.com", "description": "Other Site"}
|
||||
|
||||
Supports various field names for URL, title, timestamp, and tags.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from html import unescape
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import rich_click as click
|
||||
|
||||
EXTRACTOR_NAME = 'parse_jsonl_urls'
|
||||
|
||||
|
||||
def parse_bookmarked_at(link: dict) -> str | None:
|
||||
"""Parse timestamp from various JSON formats, return ISO 8601."""
|
||||
from datetime import timezone
|
||||
|
||||
def json_date(s: str) -> datetime:
|
||||
# Try ISO 8601 format
|
||||
return datetime.strptime(s.split(',', 1)[0], '%Y-%m-%dT%H:%M:%S%z')
|
||||
|
||||
def to_iso(dt: datetime) -> str:
|
||||
if dt.tzinfo is None:
|
||||
dt = dt.replace(tzinfo=timezone.utc)
|
||||
return dt.isoformat()
|
||||
|
||||
try:
|
||||
if link.get('bookmarked_at'):
|
||||
# Already in our format, pass through
|
||||
return link['bookmarked_at']
|
||||
elif link.get('timestamp'):
|
||||
# Chrome/Firefox histories use microseconds
|
||||
return to_iso(datetime.fromtimestamp(link['timestamp'] / 1000000, tz=timezone.utc))
|
||||
elif link.get('time'):
|
||||
return to_iso(json_date(link['time']))
|
||||
elif link.get('created_at'):
|
||||
return to_iso(json_date(link['created_at']))
|
||||
elif link.get('created'):
|
||||
return to_iso(json_date(link['created']))
|
||||
elif link.get('date'):
|
||||
return to_iso(json_date(link['date']))
|
||||
elif link.get('bookmarked'):
|
||||
return to_iso(json_date(link['bookmarked']))
|
||||
elif link.get('saved'):
|
||||
return to_iso(json_date(link['saved']))
|
||||
except (ValueError, TypeError, KeyError):
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def json_object_to_entry(link: dict) -> dict | None:
    """Convert a JSON bookmark object to a urls.jsonl Snapshot entry.

    Args:
        link: one decoded JSON object from a bookmark export line.

    Returns:
        A dict with 'type'/'url'/'via_extractor' keys, plus optional
        'title'/'bookmarked_at'/'tags', or None when no usable URL exists.
    """
    # Parse URL (try various field names used by different exporters).
    url = link.get('href') or link.get('url') or link.get('URL')
    if not url or not isinstance(url, str):
        # Missing URL, or a non-string value we cannot safely treat as one
        # (previously a non-string crashed the whole parse loop).
        return None

    entry = {
        'type': 'Snapshot',
        'url': unescape(url),
        'via_extractor': EXTRACTOR_NAME,
    }

    # Parse title: first non-empty of title/description/name.
    # str() coercion guards against numeric/other non-string values, which
    # previously raised AttributeError on .strip().
    title = None
    if link.get('title'):
        title = str(link['title']).strip()
    elif link.get('description'):
        title = str(link['description']).replace(' — Readability', '').strip()
    elif link.get('name'):
        title = str(link['name']).strip()
    if title:
        entry['title'] = unescape(title)

    # Parse bookmarked_at (ISO 8601)
    bookmarked_at = parse_bookmarked_at(link)
    if bookmarked_at:
        entry['bookmarked_at'] = bookmarked_at

    # Parse tags: accept a JSON list or a delimited string.
    tags = link.get('tags', '')
    if isinstance(tags, list):
        # str() guards against non-string list members.
        tags = ','.join(str(tag) for tag in tags)
    elif isinstance(tags, str) and ',' not in tags and tags:
        # If no comma, assume space-separated
        tags = tags.replace(' ', ',')
    if tags and isinstance(tags, str):
        entry['tags'] = unescape(tags)

    return entry
|
||||
|
||||
|
||||
def fetch_content(url: str) -> str:
    """Fetch content from a URL (supports file:// and https://)."""
    scheme = urlparse(url).scheme

    if scheme == 'file':
        # Local file: read it straight from disk, replacing bad bytes.
        with open(urlparse(url).path, 'r', encoding='utf-8', errors='replace') as handle:
            return handle.read()

    # Remote URL: fetch with configurable timeout and user agent.
    import urllib.request
    request = urllib.request.Request(
        url,
        headers={'User-Agent': os.environ.get('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')},
    )
    with urllib.request.urlopen(request, timeout=int(os.environ.get('TIMEOUT', '60'))) as resp:
        return resp.read().decode('utf-8', errors='replace')
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='JSONL file URL to parse')
def main(url: str):
    """Parse JSONL bookmark file and extract URLs.

    Reads one JSON object per line from the given file:// or http(s):// URL,
    converts each to a Snapshot entry, and writes Tag + Snapshot records to
    urls.jsonl in the current working directory.

    Exit codes: 0 on success, 1 when the input cannot be fetched or yields
    no URLs (the test suite asserts these exact codes and stderr messages).
    """

    try:
        content = fetch_content(url)
    except Exception as e:
        # Any fetch failure (missing file, network error) is fatal.
        click.echo(f'Failed to fetch {url}: {e}', err=True)
        sys.exit(1)

    urls_found = []
    for line in content.splitlines():
        line = line.strip()
        if not line:
            continue

        try:
            link = json.loads(line)
            entry = json_object_to_entry(link)
            if entry:
                urls_found.append(entry)
        except json.JSONDecodeError:
            # Skip malformed lines
            continue

    if not urls_found:
        click.echo('No URLs found', err=True)
        sys.exit(1)

    # Collect unique tags across all entries (comma-separated per entry).
    all_tags = set()
    for entry in urls_found:
        if entry.get('tags'):
            for tag in entry['tags'].split(','):
                tag = tag.strip()
                if tag:
                    all_tags.add(tag)

    # Write urls.jsonl
    # NOTE(review): opened with 'w' (truncates), but the module docstring
    # says "Appends discovered URLs" -- confirm which behavior is intended.
    with open('urls.jsonl', 'w') as f:
        # Write Tag records first
        for tag_name in sorted(all_tags):
            f.write(json.dumps({
                'type': 'Tag',
                'name': tag_name,
            }) + '\n')
        # Write Snapshot records
        for entry in urls_found:
            f.write(json.dumps(entry) + '\n')

    click.echo(f'Found {len(urls_found)} URLs, {len(all_tags)} tags')
    sys.exit(0)


if __name__ == '__main__':
    main()
|
||||
@@ -0,0 +1,272 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Unit tests for parse_jsonl_urls extractor."""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
# Resolve the extractor script by glob so the numeric priority prefix
# (on_Snapshot__NN_) can change without breaking the tests.
# NOTE(review): falls back to None when no script matches, which later
# surfaces as a confusing str(None) path in subprocess calls -- confirm.
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_jsonl_urls.py'), None)
|
||||
|
||||
|
||||
class TestParseJsonlUrls:
    """Test the parse_jsonl_urls extractor CLI.

    Each test shells out to the real script (SCRIPT_PATH) with a file:// URL
    and inspects the urls.jsonl file it writes into the working directory.
    The previously duplicated subprocess boilerplate is factored into the
    _run/_run_on_text helpers.
    """

    def _run(self, tmp_path, url):
        """Run the extractor on *url* with tmp_path as the working directory."""
        return subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', url],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

    def _run_on_text(self, tmp_path, text, name='bookmarks.jsonl'):
        """Write *text* to tmp_path/name and run the extractor on it."""
        input_file = tmp_path / name
        input_file.write_text(text)
        return self._run(tmp_path, f'file://{input_file}')

    def test_extracts_urls_from_jsonl(self, tmp_path):
        """Test extracting URLs from JSONL bookmark file."""
        result = self._run_on_text(
            tmp_path,
            '{"url": "https://example.com", "title": "Example"}\n'
            '{"url": "https://foo.bar/page", "title": "Foo Bar"}\n'
            '{"url": "https://test.org", "title": "Test Org"}\n'
        )

        assert result.returncode == 0
        assert 'Found 3 URLs' in result.stdout

        output_file = tmp_path / 'urls.jsonl'
        assert output_file.exists()

        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 3

        entries = [json.loads(line) for line in lines]
        urls = {e['url'] for e in entries}
        titles = {e.get('title') for e in entries}

        assert 'https://example.com' in urls
        assert 'https://foo.bar/page' in urls
        assert 'https://test.org' in urls
        assert 'Example' in titles
        assert 'Foo Bar' in titles
        assert 'Test Org' in titles

    def test_supports_href_field(self, tmp_path):
        """Test that 'href' field is recognized as URL."""
        result = self._run_on_text(tmp_path, '{"href": "https://example.com", "title": "Test"}\n')

        assert result.returncode == 0
        entry = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
        assert entry['url'] == 'https://example.com'

    def test_supports_description_as_title(self, tmp_path):
        """Test that 'description' field is used as title fallback."""
        result = self._run_on_text(tmp_path, '{"url": "https://example.com", "description": "A description"}\n')

        assert result.returncode == 0
        entry = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
        assert entry['title'] == 'A description'

    def test_parses_various_timestamp_formats(self, tmp_path):
        """Test parsing of different timestamp field names."""
        result = self._run_on_text(tmp_path, '{"url": "https://example.com", "timestamp": 1609459200000000}\n')

        assert result.returncode == 0
        entry = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
        # Parser converts timestamp to bookmarked_at
        assert 'bookmarked_at' in entry

    def test_parses_tags_as_string(self, tmp_path):
        """Test parsing tags as comma-separated string."""
        result = self._run_on_text(tmp_path, '{"url": "https://example.com", "tags": "tech,news,reading"}\n')

        assert result.returncode == 0
        # Parser converts tags to separate Tag objects in the output
        content = (tmp_path / 'urls.jsonl').read_text()
        assert 'tech' in content or 'news' in content or 'Tag' in content

    def test_parses_tags_as_list(self, tmp_path):
        """Test parsing tags as JSON array."""
        result = self._run_on_text(tmp_path, '{"url": "https://example.com", "tags": ["tech", "news"]}\n')

        assert result.returncode == 0
        # Parser converts tags to separate Tag objects in the output
        content = (tmp_path / 'urls.jsonl').read_text()
        assert 'tech' in content or 'news' in content or 'Tag' in content

    def test_skips_malformed_lines(self, tmp_path):
        """Test that malformed JSON lines are skipped."""
        result = self._run_on_text(
            tmp_path,
            '{"url": "https://valid.com"}\n'
            'not valid json\n'
            '{"url": "https://also-valid.com"}\n'
        )

        assert result.returncode == 0
        lines = (tmp_path / 'urls.jsonl').read_text().strip().split('\n')
        assert len(lines) == 2

    def test_skips_entries_without_url(self, tmp_path):
        """Test that entries without URL field are skipped."""
        result = self._run_on_text(
            tmp_path,
            '{"url": "https://valid.com"}\n'
            '{"title": "No URL here"}\n'
            '{"url": "https://also-valid.com"}\n'
        )

        assert result.returncode == 0
        lines = (tmp_path / 'urls.jsonl').read_text().strip().split('\n')
        assert len(lines) == 2

    def test_exits_1_when_no_urls_found(self, tmp_path):
        """Test that script exits with code 1 when no URLs found."""
        result = self._run_on_text(tmp_path, '{"title": "No URL"}\n', name='empty.jsonl')

        assert result.returncode == 1
        assert 'No URLs found' in result.stderr

    def test_exits_1_when_file_not_found(self, tmp_path):
        """Test that script exits with code 1 when file doesn't exist."""
        result = self._run(tmp_path, 'file:///nonexistent/bookmarks.jsonl')

        assert result.returncode == 1
        assert 'Failed to fetch' in result.stderr

    def test_handles_html_entities(self, tmp_path):
        """Test that HTML entities in URLs and titles are decoded."""
        result = self._run_on_text(tmp_path, '{"url": "https://example.com/page?a=1&b=2", "title": "Test & Title"}\n')

        assert result.returncode == 0
        entry = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
        assert entry['url'] == 'https://example.com/page?a=1&b=2'
        assert entry['title'] == 'Test & Title'

    def test_skips_empty_lines(self, tmp_path):
        """Test that empty lines are skipped."""
        result = self._run_on_text(
            tmp_path,
            '{"url": "https://example.com"}\n'
            '\n'
            '   \n'
            '{"url": "https://other.com"}\n'
        )

        assert result.returncode == 0
        lines = (tmp_path / 'urls.jsonl').read_text().strip().split('\n')
        assert len(lines) == 2

    def test_output_includes_required_fields(self, tmp_path):
        """Test that output includes required fields."""
        result = self._run_on_text(tmp_path, '{"url": "https://example.com"}\n')

        assert result.returncode == 0
        entry = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
        assert entry['url'] == 'https://example.com'
        assert 'type' in entry
        assert 'via_extractor' in entry


if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||
116
archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py
Executable file
116
archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py
Executable file
@@ -0,0 +1,116 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Parse Netscape bookmark HTML files and extract URLs.
|
||||
|
||||
This is a standalone extractor that can run without ArchiveBox.
|
||||
It reads Netscape-format bookmark exports (produced by all major browsers).
|
||||
|
||||
Usage: ./on_Snapshot__63_parse_netscape_urls.py --url=<url>
|
||||
Output: Appends discovered URLs to urls.jsonl in current directory
|
||||
|
||||
Examples:
|
||||
    ./on_Snapshot__63_parse_netscape_urls.py --url=file:///path/to/bookmarks.html
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from html import unescape
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import rich_click as click
|
||||
|
||||
EXTRACTOR_NAME = 'parse_netscape_urls'

# Regex pattern for Netscape bookmark format
# Example: <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" TAGS="tag1,tag2">example title</A>
# Capture groups: 1=href URL, 2=ADD_DATE unix timestamp, 3=optional TAGS value, 4=link title text.
# NOTE(review): the pattern requires an ADD_DATE attribute immediately after
# HREF, so bookmarks without ADD_DATE (or with attributes in another order)
# are silently skipped -- confirm all supported browser exports include it.
NETSCAPE_PATTERN = re.compile(
    r'<a\s+href="([^"]+)"\s+add_date="(\d+)"(?:\s+[^>]*?tags="([^"]*)")?[^>]*>([^<]+)</a>',
    re.UNICODE | re.IGNORECASE
)
|
||||
|
||||
|
||||
def fetch_content(url: str) -> str:
    """Fetch content from a URL (supports file:// and https://)."""
    components = urlparse(url)

    if components.scheme == 'file':
        # Local file: read directly, replacing any undecodable bytes.
        with open(components.path, 'r', encoding='utf-8', errors='replace') as handle:
            return handle.read()

    # Remote URL: honour the TIMEOUT and USER_AGENT environment overrides.
    import urllib.request
    headers = {'User-Agent': os.environ.get('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')}
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=int(os.environ.get('TIMEOUT', '60'))) as resp:
        return resp.read().decode('utf-8', errors='replace')
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='Netscape bookmark file URL to parse')
def main(url: str):
    """Parse Netscape bookmark HTML and extract URLs.

    Scans each input line with NETSCAPE_PATTERN and writes Tag + Snapshot
    records to urls.jsonl in the current working directory.

    Exit codes: 0 on success, 1 when the input cannot be fetched or contains
    no bookmarks (the test suite asserts these exact codes and messages).
    """

    try:
        content = fetch_content(url)
    except Exception as e:
        # Any fetch failure (missing file, network error) is fatal.
        click.echo(f'Failed to fetch {url}: {e}', err=True)
        sys.exit(1)

    urls_found = []
    all_tags = set()

    # Netscape exports put one <DT><A ...> bookmark per line, so a
    # line-by-line regex scan suffices (no full HTML parser needed).
    for line in content.splitlines():
        match = NETSCAPE_PATTERN.search(line)
        if match:
            bookmark_url = match.group(1)
            tags_str = match.group(3) or ''
            title = match.group(4).strip()

            entry = {
                'type': 'Snapshot',
                'url': unescape(bookmark_url),
                'via_extractor': EXTRACTOR_NAME,
            }
            if title:
                entry['title'] = unescape(title)
            if tags_str:
                # NOTE(review): tags are stored raw while url/title are
                # unescape()d -- confirm this asymmetry is intended.
                entry['tags'] = tags_str
                # Collect unique tags
                for tag in tags_str.split(','):
                    tag = tag.strip()
                    if tag:
                        all_tags.add(tag)
            try:
                # Convert unix timestamp to ISO 8601
                entry['bookmarked_at'] = datetime.fromtimestamp(float(match.group(2)), tz=timezone.utc).isoformat()
            except (ValueError, OSError):
                # Out-of-range or garbage ADD_DATE: omit the field silently.
                pass
            urls_found.append(entry)

    if not urls_found:
        click.echo('No bookmarks found', err=True)
        sys.exit(1)

    # Write urls.jsonl
    with open('urls.jsonl', 'w') as f:
        # Write Tag records first
        for tag_name in sorted(all_tags):
            f.write(json.dumps({
                'type': 'Tag',
                'name': tag_name,
            }) + '\n')
        # Write Snapshot records
        for entry in urls_found:
            f.write(json.dumps(entry) + '\n')

    click.echo(f'Found {len(urls_found)} URLs, {len(all_tags)} tags')
    sys.exit(0)


if __name__ == '__main__':
    main()
|
||||
@@ -0,0 +1,205 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Unit tests for parse_netscape_urls extractor."""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
# Resolve the extractor script by glob so the numeric priority prefix
# (on_Snapshot__NN_) can change without breaking the tests.
# NOTE(review): falls back to None when no script matches, which later
# surfaces as a confusing str(None) path in subprocess calls -- confirm.
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_netscape_urls.py'), None)
|
||||
|
||||
|
||||
class TestParseNetscapeUrls:
    """Test the parse_netscape_urls extractor CLI.

    Each test shells out to the real script (SCRIPT_PATH) with a file:// URL
    and inspects the urls.jsonl it writes into the working directory. The
    previously duplicated subprocess boilerplate is factored into the
    _run/_run_on_html helpers.
    """

    def _run(self, tmp_path, url):
        """Run the extractor on *url* with tmp_path as the working directory."""
        return subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', url],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

    def _run_on_html(self, tmp_path, html, name='bookmarks.html'):
        """Write *html* to tmp_path/name and run the extractor on it."""
        input_file = tmp_path / name
        input_file.write_text(html)
        return self._run(tmp_path, f'file://{input_file}')

    def test_extracts_urls_from_netscape_bookmarks(self, tmp_path):
        """Test extracting URLs from Netscape bookmark HTML format."""
        result = self._run_on_html(tmp_path, '''<!DOCTYPE NETSCAPE-Bookmark-file-1>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
<DT><A HREF="https://example.com" ADD_DATE="1609459200">Example Site</A>
<DT><A HREF="https://foo.bar/page" ADD_DATE="1609545600">Foo Bar</A>
<DT><A HREF="https://test.org" ADD_DATE="1609632000">Test Org</A>
</DL><p>
''')

        assert result.returncode == 0
        assert 'Found 3 URLs' in result.stdout

        output_file = tmp_path / 'urls.jsonl'
        assert output_file.exists()

        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 3

        entries = [json.loads(line) for line in lines]
        urls = {e['url'] for e in entries}
        titles = {e.get('title') for e in entries}

        assert 'https://example.com' in urls
        assert 'https://foo.bar/page' in urls
        assert 'https://test.org' in urls
        assert 'Example Site' in titles
        assert 'Foo Bar' in titles
        assert 'Test Org' in titles

    def test_parses_add_date_timestamps(self, tmp_path):
        """Test that ADD_DATE timestamps are parsed correctly."""
        result = self._run_on_html(tmp_path, '''
<DT><A HREF="https://example.com" ADD_DATE="1609459200">Test</A>
''')

        assert result.returncode == 0
        entry = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
        # Parser converts timestamp to bookmarked_at
        assert 'bookmarked_at' in entry

    def test_handles_query_params_in_urls(self, tmp_path):
        """Test that URLs with query parameters are preserved."""
        result = self._run_on_html(tmp_path, '''
<DT><A HREF="https://example.com/search?q=test+query&page=1" ADD_DATE="1609459200">Search</A>
''')

        assert result.returncode == 0
        entry = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
        assert 'q=test+query' in entry['url']
        assert 'page=1' in entry['url']

    def test_handles_html_entities(self, tmp_path):
        """Test that HTML entities in URLs and titles are decoded."""
        result = self._run_on_html(tmp_path, '''
<DT><A HREF="https://example.com/page?a=1&b=2" ADD_DATE="1609459200">Test & Title</A>
''')

        assert result.returncode == 0
        entry = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
        assert entry['url'] == 'https://example.com/page?a=1&b=2'
        assert entry['title'] == 'Test & Title'

    def test_exits_1_when_no_bookmarks_found(self, tmp_path):
        """Test that script exits with code 1 when no bookmarks found."""
        result = self._run_on_html(tmp_path, '''<!DOCTYPE NETSCAPE-Bookmark-file-1>
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
</DL><p>
''', name='empty.html')

        assert result.returncode == 1
        assert 'No bookmarks found' in result.stderr

    def test_exits_1_when_file_not_found(self, tmp_path):
        """Test that script exits with code 1 when file doesn't exist."""
        result = self._run(tmp_path, 'file:///nonexistent/bookmarks.html')

        assert result.returncode == 1
        assert 'Failed to fetch' in result.stderr

    def test_handles_nested_folders(self, tmp_path):
        """Test parsing bookmarks in nested folder structure."""
        result = self._run_on_html(tmp_path, '''<!DOCTYPE NETSCAPE-Bookmark-file-1>
<DL><p>
<DT><H3>Folder 1</H3>
<DL><p>
<DT><A HREF="https://example.com/nested1" ADD_DATE="1609459200">Nested 1</A>
<DT><H3>Subfolder</H3>
<DL><p>
<DT><A HREF="https://example.com/nested2" ADD_DATE="1609459200">Nested 2</A>
</DL><p>
</DL><p>
<DT><A HREF="https://example.com/top" ADD_DATE="1609459200">Top Level</A>
</DL><p>
''')

        assert result.returncode == 0
        lines = (tmp_path / 'urls.jsonl').read_text().strip().split('\n')
        urls = {json.loads(line)['url'] for line in lines}

        assert 'https://example.com/nested1' in urls
        assert 'https://example.com/nested2' in urls
        assert 'https://example.com/top' in urls

    def test_case_insensitive_parsing(self, tmp_path):
        """Test that parsing is case-insensitive for HTML tags."""
        result = self._run_on_html(tmp_path, '''
<dt><a HREF="https://example.com" ADD_DATE="1609459200">Test</a>
''')

        assert result.returncode == 0
        entry = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
        assert entry['url'] == 'https://example.com'


if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||
140
archivebox/plugins/parse_rss_urls/on_Snapshot__61_parse_rss_urls.py
Executable file
140
archivebox/plugins/parse_rss_urls/on_Snapshot__61_parse_rss_urls.py
Executable file
@@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Parse RSS/Atom feeds and extract URLs.
|
||||
|
||||
This is a standalone extractor that can run without ArchiveBox.
|
||||
It reads feed content from a URL and extracts article URLs.
|
||||
|
||||
Usage: ./on_Snapshot__61_parse_rss_urls.py --url=<url>
|
||||
Output: Appends discovered URLs to urls.jsonl in current directory
|
||||
|
||||
Examples:
|
||||
    ./on_Snapshot__61_parse_rss_urls.py --url=https://example.com/feed.rss
|
||||
    ./on_Snapshot__61_parse_rss_urls.py --url=file:///path/to/feed.xml
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from html import unescape
|
||||
from time import mktime
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import rich_click as click
|
||||
|
||||
EXTRACTOR_NAME = 'parse_rss_urls'
|
||||
|
||||
try:
|
||||
import feedparser
|
||||
except ImportError:
|
||||
feedparser = None
|
||||
|
||||
|
||||
def fetch_content(url: str) -> str:
    """Fetch content from a URL (supports file:// and https://)."""
    parts = urlparse(url)

    if parts.scheme == 'file':
        # Local feed file: read straight from disk.
        with open(parts.path, 'r', encoding='utf-8', errors='replace') as handle:
            return handle.read()

    # Remote feed: respect TIMEOUT and USER_AGENT environment overrides.
    import urllib.request
    req = urllib.request.Request(
        url,
        headers={'User-Agent': os.environ.get('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')},
    )
    with urllib.request.urlopen(req, timeout=int(os.environ.get('TIMEOUT', '60'))) as resp:
        return resp.read().decode('utf-8', errors='replace')
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='RSS/Atom feed URL to parse')
def main(url: str):
    """Parse RSS/Atom feed and extract article URLs."""

    # feedparser is an optional dependency; bail out early if it is missing.
    if feedparser is None:
        click.echo('feedparser library not installed', err=True)
        sys.exit(1)

    try:
        content = fetch_content(url)
    except Exception as e:
        click.echo(f'Failed to fetch {url}: {e}', err=True)
        sys.exit(1)

    # Parse the fetched feed text (handles both RSS and Atom).
    feed = feedparser.parse(content)

    if not feed.entries:
        click.echo('No entries found in feed', err=True)
        sys.exit(1)

    records = []
    for post in feed.entries:
        link = getattr(post, 'link', None)
        if not link:
            continue

        record = {
            'type': 'Snapshot',
            'url': unescape(link),
            'via_extractor': EXTRACTOR_NAME,
        }

        post_title = getattr(post, 'title', None)
        if post_title:
            record['title'] = unescape(post_title)

        # bookmarked_at: prefer the published date, fall back to updated date,
        # emitted as an ISO 8601 UTC timestamp.
        for attr in ('published_parsed', 'updated_parsed'):
            parsed_ts = getattr(post, attr, None)
            if parsed_ts:
                record['bookmarked_at'] = datetime.fromtimestamp(
                    mktime(parsed_ts), tz=timezone.utc
                ).isoformat()
                break

        # Tags: join the feed entry's category terms into a comma-separated string.
        tag_names = ''
        if getattr(post, 'tags', None):
            try:
                tag_names = ','.join(t.term for t in post.tags if hasattr(t, 'term'))
            except (AttributeError, TypeError):
                pass
        if tag_names:
            record['tags'] = tag_names

        records.append(record)

    if not records:
        click.echo('No valid URLs found in feed entries', err=True)
        sys.exit(1)

    # Collect the unique, non-empty tag names across all entries.
    unique_tags = set()
    for record in records:
        unique_tags.update(
            t.strip() for t in record.get('tags', '').split(',') if t.strip()
        )

    with open('urls.jsonl', 'w') as f:
        # Tag records go first so consumers can create tags before snapshots.
        for tag_name in sorted(unique_tags):
            f.write(json.dumps({
                'type': 'Tag',
                'name': tag_name,
            }) + '\n')
        # Then one Snapshot record per discovered feed entry.
        for record in records:
            f.write(json.dumps(record) + '\n')

    click.echo(f'Found {len(records)} URLs, {len(unique_tags)} tags')
    sys.exit(0)


if __name__ == '__main__':
    main()
|
||||
213
archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls.py
Normal file
213
archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls.py
Normal file
@@ -0,0 +1,213 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Unit tests for parse_rss_urls extractor."""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_rss_urls.py'), None)
|
||||
|
||||
|
||||
class TestParseRssUrls:
    """Test the parse_rss_urls extractor CLI.

    Each test invokes the extractor script as a subprocess (mirroring how the
    orchestrator runs it) with cwd set to a temp dir, then inspects the exit
    code, stdout/stderr, and the urls.jsonl file written to that directory.
    """

    def test_parses_real_rss_feed(self, tmp_path):
        """Test parsing a real RSS feed from the web."""
        # Use the Hacker News RSS feed as a live, stable real-world fixture
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', 'https://news.ycombinator.com/rss'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
            timeout=30
        )

        # Network may be unavailable; only assert when the fetch succeeded
        if result.returncode == 0:
            output_file = tmp_path / 'urls.jsonl'
            assert output_file.exists(), "Output file not created"

            content = output_file.read_text()
            assert len(content) > 0, "No URLs extracted from real RSS feed"

            # Verify at least one URL was extracted
            lines = content.strip().split('\n')
            assert len(lines) > 0, "No entries found in RSS feed"

    def test_extracts_urls_from_rss_feed(self, tmp_path):
        """Test extracting URLs from an RSS 2.0 feed."""
        input_file = tmp_path / 'feed.rss'
        input_file.write_text('''<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>Test Feed</title>
<link>https://example.com</link>
<item>
<title>First Post</title>
<link>https://example.com/post/1</link>
<pubDate>Mon, 01 Jan 2024 12:00:00 GMT</pubDate>
</item>
<item>
<title>Second Post</title>
<link>https://example.com/post/2</link>
<pubDate>Tue, 02 Jan 2024 12:00:00 GMT</pubDate>
</item>
</channel>
</rss>
''')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 0
        assert 'Found 2 URLs' in result.stdout

        output_file = tmp_path / 'urls.jsonl'
        assert output_file.exists()

        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 2

        entries = [json.loads(line) for line in lines]
        urls = {e['url'] for e in entries}
        titles = {e.get('title') for e in entries}

        assert 'https://example.com/post/1' in urls
        assert 'https://example.com/post/2' in urls
        assert 'First Post' in titles
        assert 'Second Post' in titles

    def test_extracts_urls_from_atom_feed(self, tmp_path):
        """Test extracting URLs from an Atom feed."""
        input_file = tmp_path / 'feed.atom'
        input_file.write_text('''<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Test Atom Feed</title>
<entry>
<title>Atom Post 1</title>
<link href="https://atom.example.com/entry/1"/>
<updated>2024-01-01T12:00:00Z</updated>
</entry>
<entry>
<title>Atom Post 2</title>
<link href="https://atom.example.com/entry/2"/>
<updated>2024-01-02T12:00:00Z</updated>
</entry>
</feed>
''')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        urls = {json.loads(line)['url'] for line in lines}

        assert 'https://atom.example.com/entry/1' in urls
        assert 'https://atom.example.com/entry/2' in urls

    def test_exits_1_when_no_entries(self, tmp_path):
        """Test that script exits with code 1 when feed has no entries."""
        input_file = tmp_path / 'empty.rss'
        input_file.write_text('''<?xml version="1.0"?>
<rss version="2.0">
<channel>
<title>Empty Feed</title>
</channel>
</rss>
''')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 1
        assert 'No entries found' in result.stderr

    def test_exits_1_when_file_not_found(self, tmp_path):
        """Test that script exits with code 1 when file doesn't exist."""
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', 'file:///nonexistent/feed.rss'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 1
        assert 'Failed to fetch' in result.stderr

    def test_handles_html_entities_in_urls(self, tmp_path):
        """Test that HTML entities in URLs are decoded."""
        input_file = tmp_path / 'feed.rss'
        input_file.write_text('''<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Entity Test</title>
<link>https://example.com/page?a=1&amp;b=2</link>
</item>
</channel>
</rss>
''')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        # The &amp; entity in the feed XML must come out as a literal &
        assert entry['url'] == 'https://example.com/page?a=1&b=2'

    def test_includes_optional_metadata(self, tmp_path):
        """Test that title and timestamp are included when present."""
        input_file = tmp_path / 'feed.rss'
        input_file.write_text('''<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Test Title</title>
<link>https://example.com/test</link>
<pubDate>Wed, 15 Jan 2020 10:30:00 GMT</pubDate>
</item>
</channel>
</rss>
''')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com/test'
        assert entry['title'] == 'Test Title'
        # Parser converts timestamp to bookmarked_at
        assert 'bookmarked_at' in entry


if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||
137
archivebox/plugins/parse_txt_urls/on_Snapshot__62_parse_txt_urls.py
Executable file
137
archivebox/plugins/parse_txt_urls/on_Snapshot__62_parse_txt_urls.py
Executable file
@@ -0,0 +1,137 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Parse plain text files and extract URLs.
|
||||
|
||||
This is a standalone extractor that can run without ArchiveBox.
|
||||
It reads text content from a URL (file:// or https://) and extracts all URLs found.
|
||||
|
||||
Usage: ./on_Snapshot__62_parse_txt_urls.py --url=<url>
|
||||
Output: Writes discovered URLs to urls.jsonl in the current directory (overwrites any existing file)
|
||||
|
||||
Examples:
|
||||
./on_Snapshot__52_parse_txt_urls.py --url=file:///path/to/urls.txt
|
||||
./on_Snapshot__52_parse_txt_urls.py --url=https://example.com/urls.txt
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from html import unescape
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
from urllib.request import urlopen
|
||||
|
||||
import rich_click as click
|
||||
|
||||
EXTRACTOR_NAME = 'parse_txt_urls'
|
||||
|
||||
# URL regex from archivebox/misc/util.py
# https://mathiasbynens.be/demo/url-regex
# NOTE: the whole pattern is wrapped in a lookahead `(?=(...))` so that
# re.findall returns overlapping candidate matches; each candidate is then
# cleaned up by fix_url_from_markdown() below.
URL_REGEX = re.compile(
    r'(?=('
    r'http[s]?://'          # start matching from allowed schemes
    r'(?:[a-zA-Z]|[0-9]'    # followed by allowed alphanum characters
    r'|[-_$@.&+!*\(\),]'    # or allowed symbols (keep hyphen first to match literal hyphen)
    r'|[^\u0000-\u007F])+'  # or allowed unicode bytes
    r'[^\]\[<>"\'\s]+'      # stop parsing at these symbols
    r'))',
    re.IGNORECASE | re.UNICODE,
)
|
||||
|
||||
|
||||
def parens_are_matched(string: str, open_char='(', close_char=')') -> bool:
    """Check that all parentheses in a string are balanced and nested properly."""
    depth = 0
    for char in string:
        if char == close_char:
            depth -= 1
            # A closer appearing before its opener means the nesting is broken.
            if depth < 0:
                return False
        elif char == open_char:
            depth += 1
    # Balanced only if every opener was eventually closed.
    return depth == 0
|
||||
|
||||
|
||||
def fix_url_from_markdown(url_str: str) -> str:
    """
    Cleanup a regex-parsed URL that may contain trailing parens from markdown syntax.
    Example: https://wiki.org/article_(Disambiguation).html?q=1).text -> https://wiki.org/article_(Disambiguation).html?q=1
    """
    candidate = url_str

    # Drop trailing characters one at a time until all parens balance out.
    while not parens_are_matched(candidate):
        candidate = candidate[:-1]

    # Only use the trimmed version if it still looks like a URL;
    # otherwise fall back to the original untrimmed string.
    if re.findall(URL_REGEX, candidate):
        return candidate
    return url_str
|
||||
|
||||
|
||||
def find_all_urls(text: str):
    """Find all URLs in a text string."""
    # Each regex candidate may carry trailing markdown parens; clean it first.
    for candidate in re.findall(URL_REGEX, text):
        yield fix_url_from_markdown(candidate)
|
||||
|
||||
|
||||
def fetch_content(url: str) -> str:
    """Fetch content from a URL (supports file:// and https://)."""
    scheme = urlparse(url).scheme

    if scheme == 'file':
        # Local file: read it straight from disk, replacing undecodable bytes.
        local_path = urlparse(url).path
        with open(local_path, 'r', encoding='utf-8', errors='replace') as fh:
            return fh.read()

    # Remote URL: fetch over HTTP(S) with configurable timeout and user agent.
    fetch_timeout = int(os.environ.get('TIMEOUT', '60'))
    ua = os.environ.get('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')

    import urllib.request
    request = urllib.request.Request(url, headers={'User-Agent': ua})
    with urllib.request.urlopen(request, timeout=fetch_timeout) as resp:
        return resp.read().decode('utf-8', errors='replace')
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL to parse (file:// or https://)')
def main(url: str):
    """Parse plain text and extract URLs."""

    try:
        content = fetch_content(url)
    except Exception as e:
        click.echo(f'Failed to fetch {url}: {e}', err=True)
        sys.exit(1)

    # Deduplicate via a set, decoding HTML entities as we go.
    discovered = {unescape(found) for found in find_all_urls(content)}
    # Never re-emit the source URL itself.
    discovered.discard(url)

    if not discovered:
        click.echo('No URLs found', err=True)
        sys.exit(1)

    # Write one Snapshot record per URL, sorted for deterministic output.
    with open('urls.jsonl', 'w') as f:
        for found in sorted(discovered):
            f.write(json.dumps({
                'type': 'Snapshot',
                'url': found,
                'via_extractor': EXTRACTOR_NAME,
            }) + '\n')

    click.echo(f'Found {len(discovered)} URLs')
    sys.exit(0)


if __name__ == '__main__':
    main()
|
||||
193
archivebox/plugins/parse_txt_urls/tests/test_parse_txt_urls.py
Normal file
193
archivebox/plugins/parse_txt_urls/tests/test_parse_txt_urls.py
Normal file
@@ -0,0 +1,193 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Unit tests for parse_txt_urls extractor."""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_txt_urls.py'), None)
|
||||
|
||||
|
||||
class TestParseTxtUrls:
    """Test the parse_txt_urls extractor CLI.

    Each test invokes the extractor script as a subprocess (mirroring how the
    orchestrator runs it) with cwd set to a temp dir, then inspects the exit
    code, stdout/stderr, and the urls.jsonl file written to that directory.
    """

    def test_extracts_urls_including_real_example_com(self, tmp_path):
        """Test extracting URLs from plain text including real example.com."""
        input_file = tmp_path / 'urls.txt'
        input_file.write_text('''
https://example.com
https://example.com/page
https://www.iana.org/domains/reserved
''')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 0, f"Failed: {result.stderr}"
        assert 'Found 3 URLs' in result.stdout

        output_file = tmp_path / 'urls.jsonl'
        assert output_file.exists()

        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 3

        urls = set()
        for line in lines:
            entry = json.loads(line)
            assert 'url' in entry
            urls.add(entry['url'])

        # Verify real URLs are extracted correctly
        assert 'https://example.com' in urls
        assert 'https://example.com/page' in urls
        assert 'https://www.iana.org/domains/reserved' in urls

    def test_extracts_urls_from_mixed_content(self, tmp_path):
        """Test extracting URLs embedded in prose text."""
        input_file = tmp_path / 'mixed.txt'
        input_file.write_text('''
Check out this great article at https://blog.example.com/post
You can also visit http://docs.test.org for more info.
Also see https://github.com/user/repo for the code.
''')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        urls = {json.loads(line)['url'] for line in lines}

        assert 'https://blog.example.com/post' in urls
        assert 'http://docs.test.org' in urls
        assert 'https://github.com/user/repo' in urls

    def test_handles_markdown_urls(self, tmp_path):
        """Test handling URLs in markdown format with parentheses."""
        input_file = tmp_path / 'markdown.txt'
        input_file.write_text('''
[Example](https://example.com/page)
[Wiki](https://en.wikipedia.org/wiki/Article_(Disambiguation))
''')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        urls = {json.loads(line)['url'] for line in lines}

        # The closing markdown paren must be stripped from the first URL,
        # while the balanced parens inside the wikipedia URL are kept.
        assert 'https://example.com/page' in urls
        assert any('wikipedia.org' in u for u in urls)

    def test_exits_1_when_no_urls_found(self, tmp_path):
        """Test that script exits with code 1 when no URLs found."""
        input_file = tmp_path / 'empty.txt'
        input_file.write_text('no urls here, just plain text')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 1
        assert 'No URLs found' in result.stderr

    def test_exits_1_when_file_not_found(self, tmp_path):
        """Test that script exits with code 1 when file doesn't exist."""
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', 'file:///nonexistent/path.txt'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 1
        assert 'Failed to fetch' in result.stderr

    def test_deduplicates_urls(self, tmp_path):
        """Test that duplicate URLs are deduplicated."""
        input_file = tmp_path / 'dupes.txt'
        input_file.write_text('''
https://example.com
https://example.com
https://example.com
https://other.com
''')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 2

    def test_appends_to_existing_file(self, tmp_path):
        """Test that output creates urls.jsonl with extracted URLs."""
        input_file = tmp_path / 'urls.txt'
        input_file.write_text('https://new.com\nhttps://other.com')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 2

        urls = {json.loads(line)['url'] for line in lines}
        assert 'https://new.com' in urls
        assert 'https://other.com' in urls

    def test_output_is_valid_json(self, tmp_path):
        """Test that output contains required fields."""
        input_file = tmp_path / 'urls.txt'
        input_file.write_text('https://example.com')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com'
        assert 'type' in entry
        assert 'via_extractor' in entry


if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||
295
archivebox/plugins/pdf/on_Snapshot__35_pdf.js
Normal file
295
archivebox/plugins/pdf/on_Snapshot__35_pdf.js
Normal file
@@ -0,0 +1,295 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Print a URL to PDF using Chrome/Puppeteer.
|
||||
*
|
||||
* If a Chrome session exists (from chrome_session extractor), connects to it via CDP.
|
||||
* Otherwise launches a new Chrome instance.
|
||||
*
|
||||
 * Usage: on_Snapshot__35_pdf.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes pdf/output.pdf
|
||||
*
|
||||
* Environment variables:
|
||||
* CHROME_BINARY: Path to Chrome/Chromium binary
|
||||
* CHROME_TIMEOUT: Timeout in seconds (default: 60)
|
||||
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
|
||||
* CHROME_USER_AGENT: User agent string (optional)
|
||||
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
|
||||
* CHROME_HEADLESS: Run in headless mode (default: true)
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'pdf';
|
||||
const OUTPUT_DIR = 'pdf';
|
||||
const OUTPUT_FILE = 'output.pdf';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
|
||||
// Parse command line arguments of the form --key=value / --flag into an
// object; hyphens in key names are normalized to underscores, and a bare
// flag (no value, or an empty value) becomes `true`.
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const body = token.slice(2);
    const eq = body.indexOf('=');
    const key = (eq === -1 ? body : body.slice(0, eq)).replace(/-/g, '_');
    const value = eq === -1 ? '' : body.slice(eq + 1);
    parsed[key] = value || true;
  }
  return parsed;
}
|
||||
|
||||
// Read an environment variable, falling back to defaultValue when the
// variable is unset or empty; the result is always trimmed.
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}
|
||||
|
||||
// Interpret an environment variable as a boolean; unrecognized or missing
// values fall back to defaultValue.
function getEnvBool(name, defaultValue = false) {
  const normalized = getEnv(name, '').toLowerCase();
  const TRUTHY = ['true', '1', 'yes', 'on'];
  const FALSY = ['false', '0', 'no', 'off'];
  if (TRUTHY.includes(normalized)) return true;
  if (FALSY.includes(normalized)) return false;
  return defaultValue;
}
|
||||
|
||||
// Interpret an environment variable as a base-10 integer; non-numeric or
// missing values fall back to defaultValue.
function getEnvInt(name, defaultValue = 0) {
  const parsed = Number.parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
|
||||
|
||||
// Check if staticfile extractor already downloaded this URL
const STATICFILE_DIR = 'staticfile';
function hasStaticFileOutput() {
  // True only when the staticfile output dir exists AND holds at least one entry.
  if (!fs.existsSync(STATICFILE_DIR)) return false;
  return fs.readdirSync(STATICFILE_DIR).length > 0;
}
|
||||
|
||||
// Get CDP URL from chrome_session if available; returns null when the
// chrome_session extractor has not written its cdp_url.txt handoff file.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(cdpFile)) {
    return null;
  }
  return fs.readFileSync(cdpFile, 'utf8').trim();
}
|
||||
|
||||
// Find Chrome binary: an explicitly configured CHROME_BINARY wins if it
// exists on disk; otherwise probe the usual install locations. Returns
// null when no browser binary can be located.
function findChrome() {
  const configured = getEnv('CHROME_BINARY');
  if (configured && fs.existsSync(configured)) {
    return configured;
  }

  const candidates = [
    // Linux
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/usr/bin/chromium',
    '/usr/bin/chromium-browser',
    // macOS
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
  ];

  const found = candidates.find(c => c.startsWith('/') && fs.existsSync(c));
  return found ?? null;
}
|
||||
|
||||
// Parse a "WIDTH,HEIGHT" resolution string; missing or non-numeric parts
// fall back to the 1440x2000 defaults.
function parseResolution(resolution) {
  const parts = resolution.split(',');
  const width = Number.parseInt((parts[0] ?? '').trim(), 10);
  const height = Number.parseInt((parts[1] ?? '').trim(), 10);
  return { width: width || 1440, height: height || 2000 };
}
|
||||
|
||||
/**
 * Render `url` to pdf/output.pdf.
 *
 * Prefers reusing an existing Chrome session exposed by the chrome_session
 * extractor (connecting over its CDP websocket URL); falls back to launching
 * a fresh Chrome located via CHROME_BINARY or well-known install paths.
 *
 * @param {string} url - The page to print.
 * @returns {Promise<{success: boolean, output?: string, error?: string}>}
 */
async function printToPdf(url) {
  // Extractor-specific env vars win over the generic fallbacks.
  const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
  const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
  const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
  const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
  const headless = getEnvBool('CHROME_HEADLESS', true);

  const { width, height } = parseResolution(resolution);

  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

  let browser = null;
  let page = null;
  // Tracks ownership: a connected session must NOT be closed in `finally`.
  let connectedToSession = false;

  try {
    // Try to connect to existing Chrome session
    const cdpUrl = getCdpUrl();
    if (cdpUrl) {
      try {
        browser = await puppeteer.connect({
          browserWSEndpoint: cdpUrl,
          defaultViewport: { width, height },
        });
        connectedToSession = true;

        // Get existing pages or create new one.
        // NOTE(review): when reusing a session the page is assumed to already
        // be navigated to the target URL — navigation only happens in the
        // fresh-launch branch below; confirm chrome_session guarantees this.
        const pages = await browser.pages();
        page = pages.find(p => p.url().startsWith('http')) || pages[0];

        if (!page) {
          page = await browser.newPage();
        }

        // Set viewport on the page
        await page.setViewport({ width, height });

      } catch (e) {
        // Connection failed: fall through to launching our own browser.
        console.error(`Failed to connect to CDP session: ${e.message}`);
        browser = null;
      }
    }

    // Fall back to launching new browser
    if (!browser) {
      const executablePath = findChrome();
      if (!executablePath) {
        return { success: false, error: 'Chrome binary not found' };
      }

      browser = await puppeteer.launch({
        executablePath,
        headless: headless ? 'new' : false,
        args: [
          '--no-sandbox',
          '--disable-setuid-sandbox',
          '--disable-dev-shm-usage',
          '--disable-gpu',
          `--window-size=${width},${height}`,
          // Only ignore cert errors when SSL checking is explicitly disabled.
          ...(checkSsl ? [] : ['--ignore-certificate-errors']),
        ],
        defaultViewport: { width, height },
      });

      page = await browser.newPage();

      // Navigate to URL (only if we launched fresh browser)
      if (userAgent) {
        await page.setUserAgent(userAgent);
      }

      await page.goto(url, {
        waitUntil: 'networkidle2',
        timeout,
      });
    }

    // Print to PDF
    await page.pdf({
      path: outputPath,
      format: 'A4',
      printBackground: true,
      margin: {
        top: '0.5in',
        right: '0.5in',
        bottom: '0.5in',
        left: '0.5in',
      },
    });

    // Treat a missing or zero-byte file as failure even if page.pdf resolved.
    if (fs.existsSync(outputPath) && fs.statSync(outputPath).size > 0) {
      return { success: true, output: outputPath };
    } else {
      return { success: false, error: 'PDF file not created' };
    }

  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Only close browser if we launched it (not if we connected to session)
    if (browser && !connectedToSession) {
      await browser.close();
    }
  }
}
|
||||
|
||||
/**
 * CLI entry point: parse args, run the PDF extraction, and emit the
 * orchestrator protocol (START_TS/END_TS/DURATION/OUTPUT/STATUS lines plus a
 * RESULT_JSON record) on stdout. Exit code 0 = succeeded or skipped, 1 = failed.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;

  if (!url || !snapshotId) {
    // NOTE(review): usage string says __22 but the file is named
    // on_Snapshot__35_pdf.js — runtime string left untouched; confirm and fix.
    console.error('Usage: on_Snapshot__22_pdf.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }

  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';

  try {
    // Check if staticfile extractor already handled this (permanent skip)
    if (hasStaticFileOutput()) {
      console.log(`Skipping PDF - staticfile extractor already downloaded this`);
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${new Date().toISOString()}`);
      console.log(`STATUS=skipped`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status: 'skipped', url, snapshot_id: snapshotId})}`);
      process.exit(0); // Permanent skip - staticfile already handled
    } else {
      const result = await printToPdf(url);

      if (result.success) {
        status = 'succeeded';
        output = result.output;
        const size = fs.statSync(output).size;
        console.log(`PDF saved (${size} bytes)`);
      } else {
        status = 'failed';
        error = result.error;
      }
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }

  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;

  // Print results (machine-parsed key=value lines)
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);

  if (error) {
    console.error(`ERROR=${error}`);
  }

  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);

  process.exit(status === 'succeeded' ? 0 : 1);
}

// Top-level guard: any unhandled rejection becomes a nonzero exit.
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});
|
||||
@@ -0,0 +1,76 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Install a binary using pip package manager.
|
||||
|
||||
Usage: on_Dependency__install_using_pip_provider.py --dependency-id=<uuid> --bin-name=<name> [--custom-cmd=<cmd>]
|
||||
Output: InstalledBinary JSONL record to stdout after installation
|
||||
|
||||
Environment variables:
|
||||
MACHINE_ID: Machine UUID (set by orchestrator)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
import rich_click as click
|
||||
from abx_pkg import Binary, PipProvider, BinProviderOverrides
|
||||
|
||||
# Fix pydantic forward reference issue
|
||||
PipProvider.model_rebuild()
|
||||
|
||||
|
||||
@click.command()
@click.option('--dependency-id', required=True, help="Dependency UUID")
@click.option('--bin-name', required=True, help="Binary name to install")
@click.option('--bin-providers', default='*', help="Allowed providers (comma-separated)")
@click.option('--custom-cmd', default=None, help="Custom install command")
def main(dependency_id: str, bin_name: str, bin_providers: str, custom_cmd: str | None):
    """Install binary using pip."""
    # NOTE(review): --custom-cmd is accepted but never used below — confirm
    # whether a custom install command should override the pip install.

    # Exit 0 (not an error) when pip isn't in the allowed provider list:
    # this provider simply doesn't apply to the dependency.
    if bin_providers != '*' and 'pip' not in bin_providers.split(','):
        click.echo(f"pip provider not allowed for {bin_name}", err=True)
        sys.exit(0)

    # Use abx-pkg PipProvider to install the binary.
    provider = PipProvider()
    if not provider.INSTALLER_BIN:
        click.echo("pip not available on this system", err=True)
        sys.exit(1)

    click.echo(f"Installing {bin_name} via pip...", err=True)

    try:
        binary = Binary(name=bin_name, binproviders=[provider]).install()
    except Exception as e:
        click.echo(f"pip install failed: {e}", err=True)
        sys.exit(1)

    if not binary.abspath:
        click.echo(f"{bin_name} not found after pip install", err=True)
        sys.exit(1)

    # Emit the machine-readable InstalledBinary record on stdout so the
    # orchestrator can ingest it as JSONL.
    record = {
        'type': 'InstalledBinary',
        'name': bin_name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'pip',
        'machine_id': os.environ.get('MACHINE_ID', ''),
        'dependency_id': dependency_id,
    }
    print(json.dumps(record))

    # Human-readable summary goes to stderr so stdout stays pure JSONL.
    click.echo(f"Installed {bin_name} at {binary.abspath}", err=True)
    click.echo(f"  version: {binary.version}", err=True)

    sys.exit(0)


if __name__ == '__main__':
    main()
|
||||
29
archivebox/plugins/readability/config.json
Normal file
29
archivebox/plugins/readability/config.json
Normal file
@@ -0,0 +1,29 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SAVE_READABILITY": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Enable Readability text extraction"
|
||||
},
|
||||
"READABILITY_BINARY": {
|
||||
"type": "string",
|
||||
"default": "readability-extractor",
|
||||
"description": "Path to readability-extractor binary"
|
||||
},
|
||||
"NODE_BINARY": {
|
||||
"type": "string",
|
||||
"default": "node",
|
||||
"description": "Path to Node.js binary"
|
||||
},
|
||||
"READABILITY_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 30,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for Readability in seconds"
|
||||
}
|
||||
}
|
||||
}
|
||||
219
archivebox/plugins/readability/on_Snapshot__52_readability.py
Normal file
219
archivebox/plugins/readability/on_Snapshot__52_readability.py
Normal file
@@ -0,0 +1,219 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract article content using Mozilla's Readability.
|
||||
|
||||
Usage: on_Snapshot__readability.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Creates readability/ directory with content.html, content.txt, article.json
|
||||
|
||||
Environment variables:
|
||||
READABILITY_BINARY: Path to readability-cli binary
|
||||
TIMEOUT: Timeout in seconds (default: 60)
|
||||
|
||||
Note: Requires readability-cli: npm install -g readability-cli
|
||||
This extractor looks for HTML source from other extractors (wget, singlefile, dom)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'readability'
|
||||
BIN_NAME = 'readability-cli'
|
||||
BIN_PROVIDERS = 'npm,env'
|
||||
OUTPUT_DIR = 'readability'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
return os.environ.get(name, default).strip()
|
||||
|
||||
|
||||
def get_env_int(name: str, default: int = 0) -> int:
|
||||
try:
|
||||
return int(get_env(name, str(default)))
|
||||
except ValueError:
|
||||
return default
|
||||
|
||||
|
||||
def find_readability() -> str | None:
|
||||
"""Find readability-cli binary."""
|
||||
readability = get_env('READABILITY_BINARY')
|
||||
if readability and os.path.isfile(readability):
|
||||
return readability
|
||||
|
||||
for name in ['readability-cli', 'readable']:
|
||||
binary = shutil.which(name)
|
||||
if binary:
|
||||
return binary
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def get_version(binary: str) -> str:
|
||||
"""Get readability-cli version."""
|
||||
try:
|
||||
result = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
|
||||
return result.stdout.strip()[:64]
|
||||
except Exception:
|
||||
return ''
|
||||
|
||||
|
||||
def find_html_source() -> str | None:
|
||||
"""Find HTML content from other extractors in the snapshot directory."""
|
||||
# Hooks run in snapshot_dir, sibling extractor outputs are in subdirectories
|
||||
search_patterns = [
|
||||
'singlefile/singlefile.html',
|
||||
'singlefile/*.html',
|
||||
'dom/output.html',
|
||||
'dom/*.html',
|
||||
'wget/**/*.html',
|
||||
'wget/**/*.htm',
|
||||
]
|
||||
|
||||
cwd = Path.cwd()
|
||||
for pattern in search_patterns:
|
||||
matches = list(cwd.glob(pattern))
|
||||
for match in matches:
|
||||
if match.is_file() and match.stat().st_size > 0:
|
||||
return str(match)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def extract_readability(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
"""
|
||||
Extract article using Readability.
|
||||
|
||||
Returns: (success, output_path, error_message)
|
||||
"""
|
||||
timeout = get_env_int('TIMEOUT', 60)
|
||||
|
||||
# Find HTML source
|
||||
html_source = find_html_source()
|
||||
if not html_source:
|
||||
return False, None, 'No HTML source found (run singlefile, dom, or wget first)'
|
||||
|
||||
# Create output directory
|
||||
output_dir = Path(OUTPUT_DIR)
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
|
||||
try:
|
||||
# Run readability-cli
|
||||
cmd = [binary, '--json', html_source]
|
||||
result = subprocess.run(cmd, capture_output=True, timeout=timeout)
|
||||
|
||||
if result.returncode != 0:
|
||||
stderr = result.stderr.decode('utf-8', errors='replace')
|
||||
return False, None, f'readability-cli failed: {stderr[:200]}'
|
||||
|
||||
# Parse JSON output
|
||||
try:
|
||||
result_json = json.loads(result.stdout)
|
||||
except json.JSONDecodeError:
|
||||
return False, None, 'readability-cli returned invalid JSON'
|
||||
|
||||
# Extract and save content
|
||||
# readability-cli v2.x uses hyphenated field names
|
||||
text_content = result_json.pop('text-content', result_json.pop('textContent', ''))
|
||||
html_content = result_json.pop('html-content', result_json.pop('content', ''))
|
||||
|
||||
if not text_content and not html_content:
|
||||
return False, None, 'No content extracted'
|
||||
|
||||
(output_dir / 'content.html').write_text(html_content, encoding='utf-8')
|
||||
(output_dir / 'content.txt').write_text(text_content, encoding='utf-8')
|
||||
(output_dir / 'article.json').write_text(json.dumps(result_json, indent=2), encoding='utf-8')
|
||||
|
||||
return True, OUTPUT_DIR, ''
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
return False, None, f'Timed out after {timeout} seconds'
|
||||
except Exception as e:
|
||||
return False, None, f'{type(e).__name__}: {e}'
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--url', required=True, help='URL to extract article from')
|
||||
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
|
||||
def main(url: str, snapshot_id: str):
|
||||
"""Extract article content using Mozilla's Readability."""
|
||||
|
||||
start_ts = datetime.now(timezone.utc)
|
||||
version = ''
|
||||
output = None
|
||||
status = 'failed'
|
||||
error = ''
|
||||
binary = None
|
||||
|
||||
try:
|
||||
# Find binary
|
||||
binary = find_readability()
|
||||
if not binary:
|
||||
print(f'ERROR: readability-cli binary not found', file=sys.stderr)
|
||||
print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
|
||||
print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
version = get_version(binary)
|
||||
|
||||
# Run extraction
|
||||
success, output, error = extract_readability(url, binary)
|
||||
status = 'succeeded' if success else 'failed'
|
||||
|
||||
if success:
|
||||
text_file = Path(output) / 'content.txt'
|
||||
html_file = Path(output) / 'content.html'
|
||||
text_len = text_file.stat().st_size if text_file.exists() else 0
|
||||
html_len = html_file.stat().st_size if html_file.exists() else 0
|
||||
print(f'Readability extracted: {text_len} chars text, {html_len} chars HTML')
|
||||
|
||||
except Exception as e:
|
||||
error = f'{type(e).__name__}: {e}'
|
||||
status = 'failed'
|
||||
|
||||
# Print results
|
||||
end_ts = datetime.now(timezone.utc)
|
||||
duration = (end_ts - start_ts).total_seconds()
|
||||
|
||||
print(f'START_TS={start_ts.isoformat()}')
|
||||
print(f'END_TS={end_ts.isoformat()}')
|
||||
print(f'DURATION={duration:.2f}')
|
||||
if binary:
|
||||
print(f'CMD={binary} --json <html>')
|
||||
if version:
|
||||
print(f'VERSION={version}')
|
||||
if output:
|
||||
print(f'OUTPUT={output}')
|
||||
print(f'STATUS={status}')
|
||||
|
||||
if error:
|
||||
print(f'ERROR={error}', file=sys.stderr)
|
||||
|
||||
# Print JSON result
|
||||
result_json = {
|
||||
'extractor': EXTRACTOR_NAME,
|
||||
'url': url,
|
||||
'snapshot_id': snapshot_id,
|
||||
'status': status,
|
||||
'start_ts': start_ts.isoformat(),
|
||||
'end_ts': end_ts.isoformat(),
|
||||
'duration': round(duration, 2),
|
||||
'cmd_version': version,
|
||||
'output': output,
|
||||
'error': error or None,
|
||||
}
|
||||
print(f'RESULT_JSON={json.dumps(result_json)}')
|
||||
|
||||
sys.exit(0 if status == 'succeeded' else 1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
254
archivebox/plugins/readability/tests/test_readability.py
Normal file
254
archivebox/plugins/readability/tests/test_readability.py
Normal file
@@ -0,0 +1,254 @@
|
||||
"""
|
||||
Integration tests for readability plugin
|
||||
|
||||
Tests verify:
|
||||
1. Plugin reports missing dependency correctly
|
||||
2. readability-cli can be installed via npm (note: package name != binary name)
|
||||
3. Extraction works against real example.com content
|
||||
"""
|
||||
|
||||
import json
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
READABILITY_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_readability.py'))
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
|
||||
def create_example_html(tmpdir: Path) -> Path:
|
||||
"""Create sample HTML that looks like example.com with enough content for Readability."""
|
||||
singlefile_dir = tmpdir / 'singlefile'
|
||||
singlefile_dir.mkdir()
|
||||
|
||||
html_file = singlefile_dir / 'singlefile.html'
|
||||
html_file.write_text('''
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>Example Domain</title>
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
</head>
|
||||
<body>
|
||||
<article>
|
||||
<header>
|
||||
<h1>Example Domain</h1>
|
||||
</header>
|
||||
<div class="content">
|
||||
<p>This domain is for use in illustrative examples in documents. You may use this
|
||||
domain in literature without prior coordination or asking for permission.</p>
|
||||
|
||||
<p>Example domains are maintained by the Internet Assigned Numbers Authority (IANA)
|
||||
to provide a well-known address for documentation purposes. This helps authors create
|
||||
examples that readers can understand without confusion about actual domain ownership.</p>
|
||||
|
||||
<p>The practice of using example domains dates back to the early days of the internet.
|
||||
These reserved domains ensure that example code and documentation doesn't accidentally
|
||||
point to real, active websites that might change or disappear over time.</p>
|
||||
|
||||
<p>For more information about example domains and their history, you can visit the
|
||||
IANA website. They maintain several example domains including example.com, example.net,
|
||||
and example.org, all specifically reserved for this purpose.</p>
|
||||
|
||||
<p><a href="https://www.iana.org/domains/example">More information about example domains...</a></p>
|
||||
</div>
|
||||
</article>
|
||||
</body>
|
||||
</html>
|
||||
''')
|
||||
|
||||
return html_file
|
||||
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify hook script exists."""
|
||||
assert READABILITY_HOOK.exists(), f"Hook script not found: {READABILITY_HOOK}"
|
||||
|
||||
|
||||
def test_reports_missing_dependency_when_not_installed():
|
||||
"""Test that script reports DEPENDENCY_NEEDED when readability-cli is not found."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Create HTML source so it doesn't fail on missing HTML
|
||||
create_example_html(tmpdir)
|
||||
|
||||
# Run with empty PATH so binary won't be found
|
||||
env = {'PATH': '/nonexistent', 'HOME': str(tmpdir)}
|
||||
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(READABILITY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Should fail and report missing dependency
|
||||
assert result.returncode != 0, "Should exit non-zero when dependency missing"
|
||||
combined = result.stdout + result.stderr
|
||||
assert 'DEPENDENCY_NEEDED' in combined, "Should output DEPENDENCY_NEEDED"
|
||||
assert 'readability-cli' in combined or 'BIN_NAME' in combined, "Should mention readability-cli"
|
||||
|
||||
|
||||
def test_can_install_readability_via_npm():
|
||||
"""Test that readability-cli can be installed via npm and binary becomes available.
|
||||
|
||||
Note: The npm package 'readability-cli' installs a binary named 'readable',
|
||||
so we test the full installation flow using npm install directly.
|
||||
"""
|
||||
|
||||
# Check npm is available
|
||||
if not shutil.which('npm'):
|
||||
pytest.skip("npm not available on this system")
|
||||
|
||||
# Install readability-cli package via npm
|
||||
# The orchestrator/dependency hooks would call this via npm provider
|
||||
result = subprocess.run(
|
||||
['npm', 'install', '-g', 'readability-cli'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"npm install failed: {result.stderr}"
|
||||
|
||||
# Verify the 'readable' binary is now available
|
||||
# (readability-cli package installs as 'readable' not 'readability-cli')
|
||||
result = subprocess.run(['which', 'readable'], capture_output=True, text=True)
|
||||
assert result.returncode == 0, "readable binary not found after npm install"
|
||||
|
||||
binary_path = result.stdout.strip()
|
||||
assert Path(binary_path).exists(), f"Binary should exist at {binary_path}"
|
||||
|
||||
# Test that it's executable and responds to --version
|
||||
result = subprocess.run(
|
||||
[binary_path, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10
|
||||
)
|
||||
assert result.returncode == 0, f"Binary not executable: {result.stderr}"
|
||||
|
||||
|
||||
def test_extracts_article_after_installation():
|
||||
"""Test full workflow: ensure readability-cli installed then extract from example.com HTML."""
|
||||
|
||||
# Check npm is available
|
||||
if not shutil.which('npm'):
|
||||
pytest.skip("npm not available on this system")
|
||||
|
||||
# Ensure readability-cli is installed (orchestrator would handle this)
|
||||
install_result = subprocess.run(
|
||||
['npm', 'install', '-g', 'readability-cli'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300
|
||||
)
|
||||
|
||||
if install_result.returncode != 0:
|
||||
pytest.skip(f"Could not install readability-cli: {install_result.stderr}")
|
||||
|
||||
# Now test extraction
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Create example.com HTML for readability to process
|
||||
create_example_html(tmpdir)
|
||||
|
||||
# Run readability extraction (should find the installed binary)
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(READABILITY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
|
||||
|
||||
# Verify output directory created
|
||||
readability_dir = tmpdir / 'readability'
|
||||
assert readability_dir.exists(), "Output directory not created"
|
||||
|
||||
# Verify output files exist
|
||||
html_file = readability_dir / 'content.html'
|
||||
txt_file = readability_dir / 'content.txt'
|
||||
json_file = readability_dir / 'article.json'
|
||||
|
||||
assert html_file.exists(), "content.html not created"
|
||||
assert txt_file.exists(), "content.txt not created"
|
||||
assert json_file.exists(), "article.json not created"
|
||||
|
||||
# Verify HTML content contains REAL example.com text
|
||||
html_content = html_file.read_text()
|
||||
assert len(html_content) > 100, f"HTML content too short: {len(html_content)} bytes"
|
||||
assert 'example domain' in html_content.lower(), "Missing 'Example Domain' in HTML"
|
||||
assert ('illustrative examples' in html_content.lower() or
|
||||
'use in' in html_content.lower() or
|
||||
'literature' in html_content.lower()), \
|
||||
"Missing example.com description in HTML"
|
||||
|
||||
# Verify text content contains REAL example.com text
|
||||
txt_content = txt_file.read_text()
|
||||
assert len(txt_content) > 50, f"Text content too short: {len(txt_content)} bytes"
|
||||
assert 'example' in txt_content.lower(), "Missing 'example' in text"
|
||||
|
||||
# Verify JSON metadata
|
||||
json_data = json.loads(json_file.read_text())
|
||||
assert isinstance(json_data, dict), "article.json should be a dict"
|
||||
|
||||
# Verify stdout contains expected output
|
||||
assert 'STATUS=succeeded' in result.stdout, "Should report success"
|
||||
assert 'OUTPUT=readability' in result.stdout, "Should report output directory"
|
||||
|
||||
|
||||
def test_fails_gracefully_without_html_source():
|
||||
"""Test that extraction fails gracefully when no HTML source is available."""
|
||||
|
||||
# Check npm is available
|
||||
if not shutil.which('npm'):
|
||||
pytest.skip("npm not available on this system")
|
||||
|
||||
# Ensure readability-cli is installed
|
||||
install_result = subprocess.run(
|
||||
['npm', 'install', '-g', 'readability-cli'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300
|
||||
)
|
||||
|
||||
if install_result.returncode != 0:
|
||||
pytest.skip("Could not install readability-cli")
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Don't create any HTML source files
|
||||
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(READABILITY_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
assert result.returncode != 0, "Should fail without HTML source"
|
||||
combined_output = result.stdout + result.stderr
|
||||
assert ('no html source' in combined_output.lower() or
|
||||
'not found' in combined_output.lower() or
|
||||
'ERROR=' in combined_output), \
|
||||
"Should report missing HTML source"
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
281
archivebox/plugins/redirects/on_Snapshot__22_redirects.js
Executable file
281
archivebox/plugins/redirects/on_Snapshot__22_redirects.js
Executable file
@@ -0,0 +1,281 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Track complete redirect chains for a URL.
|
||||
*
|
||||
* Captures:
|
||||
* - HTTP redirects (301, 302, 303, 307, 308)
|
||||
* - Meta refresh redirects
|
||||
* - JavaScript redirects (basic detection)
|
||||
* - Full redirect chain with timestamps
|
||||
*
|
||||
* Usage: on_Snapshot__15_redirects.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes redirects/redirects.json
|
||||
*
|
||||
* Environment variables:
|
||||
* SAVE_REDIRECTS: Enable redirect tracking (default: true)
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'redirects';
|
||||
const OUTPUT_DIR = 'redirects';
|
||||
const OUTPUT_FILE = 'redirects.json';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
const args = {};
|
||||
process.argv.slice(2).forEach(arg => {
|
||||
if (arg.startsWith('--')) {
|
||||
const [key, ...valueParts] = arg.slice(2).split('=');
|
||||
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
|
||||
}
|
||||
});
|
||||
return args;
|
||||
}
|
||||
|
||||
// Get environment variable with default
|
||||
function getEnv(name, defaultValue = '') {
|
||||
return (process.env[name] || defaultValue).trim();
|
||||
}
|
||||
|
||||
function getEnvBool(name, defaultValue = false) {
|
||||
const val = getEnv(name, '').toLowerCase();
|
||||
if (['true', '1', 'yes', 'on'].includes(val)) return true;
|
||||
if (['false', '0', 'no', 'off'].includes(val)) return false;
|
||||
return defaultValue;
|
||||
}
|
||||
|
||||
// Get CDP URL from chrome_session
|
||||
function getCdpUrl() {
|
||||
const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
|
||||
if (fs.existsSync(cdpFile)) {
|
||||
return fs.readFileSync(cdpFile, 'utf8').trim();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// Track redirect chain
|
||||
async function trackRedirects(url) {
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
let browser = null;
|
||||
const redirectChain = [];
|
||||
|
||||
try {
|
||||
// Connect to existing Chrome session
|
||||
const cdpUrl = getCdpUrl();
|
||||
if (!cdpUrl) {
|
||||
return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
|
||||
}
|
||||
|
||||
browser = await puppeteer.connect({
|
||||
browserWSEndpoint: cdpUrl,
|
||||
});
|
||||
|
||||
// Get the page
|
||||
const pages = await browser.pages();
|
||||
const page = pages.find(p => p.url().startsWith('http')) || pages[0];
|
||||
|
||||
if (!page) {
|
||||
return { success: false, error: 'No page found in Chrome session' };
|
||||
}
|
||||
|
||||
// Track all responses to capture redirects
|
||||
page.on('response', async (response) => {
|
||||
const status = response.status();
|
||||
const responseUrl = response.url();
|
||||
const headers = response.headers();
|
||||
|
||||
// Check if it's a redirect
|
||||
if (status >= 300 && status < 400) {
|
||||
redirectChain.push({
|
||||
timestamp: new Date().toISOString(),
|
||||
url: responseUrl,
|
||||
status,
|
||||
statusText: response.statusText(),
|
||||
location: headers['location'] || headers['Location'] || '',
|
||||
type: 'http',
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
// Get the current URL (which is the final destination after redirects)
|
||||
const finalUrl = page.url();
|
||||
|
||||
// Check for meta refresh redirects
|
||||
const metaRefresh = await page.evaluate(() => {
|
||||
const meta = document.querySelector('meta[http-equiv="refresh"]');
|
||||
if (meta) {
|
||||
const content = meta.getAttribute('content') || '';
|
||||
const match = content.match(/url=['"]?([^'"]+)['"]?/i);
|
||||
return {
|
||||
content,
|
||||
url: match ? match[1] : null,
|
||||
};
|
||||
}
|
||||
return null;
|
||||
});
|
||||
|
||||
if (metaRefresh && metaRefresh.url) {
|
||||
redirectChain.push({
|
||||
timestamp: new Date().toISOString(),
|
||||
url: finalUrl,
|
||||
status: null,
|
||||
statusText: 'Meta Refresh',
|
||||
location: metaRefresh.url,
|
||||
type: 'meta_refresh',
|
||||
content: metaRefresh.content,
|
||||
});
|
||||
}
|
||||
|
||||
// Check for JavaScript redirects (basic detection)
|
||||
const jsRedirect = await page.evaluate(() => {
|
||||
// Check for common JavaScript redirect patterns
|
||||
const html = document.documentElement.outerHTML;
|
||||
const patterns = [
|
||||
/window\.location\s*=\s*['"]([^'"]+)['"]/i,
|
||||
/window\.location\.href\s*=\s*['"]([^'"]+)['"]/i,
|
||||
/window\.location\.replace\s*\(\s*['"]([^'"]+)['"]\s*\)/i,
|
||||
/document\.location\s*=\s*['"]([^'"]+)['"]/i,
|
||||
];
|
||||
|
||||
for (const pattern of patterns) {
|
||||
const match = html.match(pattern);
|
||||
if (match) {
|
||||
return {
|
||||
pattern: pattern.toString(),
|
||||
url: match[1],
|
||||
};
|
||||
}
|
||||
}
|
||||
return null;
|
||||
});
|
||||
|
||||
if (jsRedirect && jsRedirect.url) {
|
||||
redirectChain.push({
|
||||
timestamp: new Date().toISOString(),
|
||||
url: finalUrl,
|
||||
status: null,
|
||||
statusText: 'JavaScript Redirect',
|
||||
location: jsRedirect.url,
|
||||
type: 'javascript',
|
||||
pattern: jsRedirect.pattern,
|
||||
});
|
||||
}
|
||||
|
||||
const redirectData = {
|
||||
original_url: url,
|
||||
final_url: finalUrl,
|
||||
redirect_count: redirectChain.length,
|
||||
redirects: redirectChain,
|
||||
is_redirect: redirectChain.length > 0,
|
||||
};
|
||||
|
||||
// Write output
|
||||
fs.writeFileSync(outputPath, JSON.stringify(redirectData, null, 2));
|
||||
|
||||
return { success: true, output: outputPath, redirectData };
|
||||
|
||||
} catch (e) {
|
||||
return { success: false, error: `${e.name}: ${e.message}` };
|
||||
} finally {
|
||||
if (browser) {
|
||||
browser.disconnect();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async function main() {
|
||||
const args = parseArgs();
|
||||
const url = args.url;
|
||||
const snapshotId = args.snapshot_id;
|
||||
|
||||
if (!url || !snapshotId) {
|
||||
console.error('Usage: on_Snapshot__15_redirects.js --url=<url> --snapshot-id=<uuid>');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const startTs = new Date();
|
||||
let status = 'failed';
|
||||
let output = null;
|
||||
let error = '';
|
||||
|
||||
try {
|
||||
// Check if enabled
|
||||
if (!getEnvBool('SAVE_REDIRECTS', true)) {
|
||||
console.log('Skipping redirects (SAVE_REDIRECTS=False)');
|
||||
status = 'skipped';
|
||||
const endTs = new Date();
|
||||
console.log(`START_TS=${startTs.toISOString()}`);
|
||||
console.log(`END_TS=${endTs.toISOString()}`);
|
||||
console.log(`STATUS=${status}`);
|
||||
console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
const result = await trackRedirects(url);
|
||||
|
||||
if (result.success) {
|
||||
status = 'succeeded';
|
||||
output = result.output;
|
||||
const redirectCount = result.redirectData.redirect_count;
|
||||
const finalUrl = result.redirectData.final_url;
|
||||
if (redirectCount > 0) {
|
||||
console.log(`Tracked ${redirectCount} redirect(s) to: ${finalUrl}`);
|
||||
} else {
|
||||
console.log('No redirects detected');
|
||||
}
|
||||
} else {
|
||||
status = 'failed';
|
||||
error = result.error;
|
||||
}
|
||||
} catch (e) {
|
||||
error = `${e.name}: ${e.message}`;
|
||||
status = 'failed';
|
||||
}
|
||||
|
||||
const endTs = new Date();
|
||||
const duration = (endTs - startTs) / 1000;
|
||||
|
||||
// Print results
|
||||
console.log(`START_TS=${startTs.toISOString()}`);
|
||||
console.log(`END_TS=${endTs.toISOString()}`);
|
||||
console.log(`DURATION=${duration.toFixed(2)}`);
|
||||
if (output) {
|
||||
console.log(`OUTPUT=${output}`);
|
||||
}
|
||||
console.log(`STATUS=${status}`);
|
||||
|
||||
if (error) {
|
||||
console.error(`ERROR=${error}`);
|
||||
}
|
||||
|
||||
// Print JSON result
|
||||
const resultJson = {
|
||||
extractor: EXTRACTOR_NAME,
|
||||
url,
|
||||
snapshot_id: snapshotId,
|
||||
status,
|
||||
start_ts: startTs.toISOString(),
|
||||
end_ts: endTs.toISOString(),
|
||||
duration: Math.round(duration * 100) / 100,
|
||||
output,
|
||||
error: error || null,
|
||||
};
|
||||
console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
|
||||
|
||||
process.exit(status === 'succeeded' ? 0 : 1);
|
||||
}
|
||||
|
||||
main().catch(e => {
|
||||
console.error(`Fatal error: ${e.message}`);
|
||||
process.exit(1);
|
||||
});
|
||||
381
archivebox/plugins/responses/on_Snapshot__24_responses.js
Executable file
381
archivebox/plugins/responses/on_Snapshot__24_responses.js
Executable file
@@ -0,0 +1,381 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Archive all network responses during page load.
|
||||
*
|
||||
* Connects to Chrome session and captures ALL network responses (XHR, images, scripts, etc.)
|
||||
* Saves them in an organized directory structure with both timestamped unique files
|
||||
* and URL-organized symlinks.
|
||||
*
|
||||
* Usage: on_Snapshot__23_responses.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Creates responses/ directory with:
|
||||
* - all/<timestamp>__<METHOD>__<URL>.<ext>: Timestamped unique files
|
||||
* - <type>/<domain>/<path>/: URL-organized symlinks by resource type
|
||||
* - index.jsonl: Searchable index of all responses
|
||||
*
|
||||
* Environment variables:
|
||||
* SAVE_RESPONSES: Enable response archiving (default: true)
|
||||
* RESPONSES_TIMEOUT: Timeout in seconds (default: 120)
|
||||
* RESPONSES_TYPES: Comma-separated resource types to save (default: all)
|
||||
* Options: script,stylesheet,font,image,media,xhr,websocket,document
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const crypto = require('crypto');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'responses';
|
||||
const OUTPUT_DIR = 'responses';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
|
||||
// Resource types to capture (by default, capture everything)
|
||||
const DEFAULT_TYPES = ['script', 'stylesheet', 'font', 'image', 'media', 'xhr', 'websocket'];
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
const args = {};
|
||||
process.argv.slice(2).forEach(arg => {
|
||||
if (arg.startsWith('--')) {
|
||||
const [key, ...valueParts] = arg.slice(2).split('=');
|
||||
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
|
||||
}
|
||||
});
|
||||
return args;
|
||||
}
|
||||
|
||||
// Get environment variable with default
|
||||
function getEnv(name, defaultValue = '') {
|
||||
return (process.env[name] || defaultValue).trim();
|
||||
}
|
||||
|
||||
function getEnvBool(name, defaultValue = false) {
|
||||
const val = getEnv(name, '').toLowerCase();
|
||||
if (['true', '1', 'yes', 'on'].includes(val)) return true;
|
||||
if (['false', '0', 'no', 'off'].includes(val)) return false;
|
||||
return defaultValue;
|
||||
}
|
||||
|
||||
function getEnvInt(name, defaultValue = 0) {
|
||||
const val = parseInt(getEnv(name, String(defaultValue)), 10);
|
||||
return isNaN(val) ? defaultValue : val;
|
||||
}
|
||||
|
||||
// Read the Chrome DevTools Protocol websocket URL written by the
// chrome_session extractor; returns null when no session file exists.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(cdpFile)) {
    return null;
  }
  return fs.readFileSync(cdpFile, 'utf8').trim();
}
|
||||
|
||||
// Map a Content-Type header value to a file extension, or '' when unknown.
// Parameters after ';' (e.g. charset) and case are ignored.
function getExtensionFromMimeType(mimeType) {
  const MIME_TO_EXT = {
    'text/html': 'html',
    'text/css': 'css',
    'text/javascript': 'js',
    'application/javascript': 'js',
    'application/x-javascript': 'js',
    'application/json': 'json',
    'application/xml': 'xml',
    'text/xml': 'xml',
    'image/png': 'png',
    'image/jpeg': 'jpg',
    'image/gif': 'gif',
    'image/svg+xml': 'svg',
    'image/webp': 'webp',
    'font/woff': 'woff',
    'font/woff2': 'woff2',
    'font/ttf': 'ttf',
    'font/otf': 'otf',
    'application/font-woff': 'woff',
    'application/font-woff2': 'woff2',
    'video/mp4': 'mp4',
    'video/webm': 'webm',
    'audio/mpeg': 'mp3',
    'audio/ogg': 'ogg',
  };

  const base = (mimeType || '').split(';')[0].trim().toLowerCase();
  return MIME_TO_EXT[base] || '';
}
|
||||
|
||||
// Extract a lowercase file extension from a URL's pathname.
// Returns '' when the path has no extension or the URL fails to parse.
function getExtensionFromUrl(url) {
  let pathname;
  try {
    pathname = new URL(url).pathname;
  } catch (e) {
    return '';
  }
  const match = /\.([a-z0-9]+)$/i.exec(pathname);
  return match ? match[1].toLowerCase() : '';
}
|
||||
|
||||
// Make a string safe for use as a filename: replace anything outside
// [a-zA-Z0-9._-] with '_' and cap the length at maxLen characters.
function sanitizeFilename(str, maxLen = 200) {
  const safe = str.replace(/[^a-zA-Z0-9._-]/g, '_');
  return safe.slice(0, maxLen);
}
|
||||
|
||||
// Create a relative symlink at linkPath pointing to target.
// Parent directories are created as needed and any existing entry at
// linkPath is replaced. Failures are logged and swallowed — symlinks are
// a convenience layer and must never abort the archiving run.
async function createSymlink(target, linkPath) {
  try {
    const parentDir = path.dirname(linkPath);
    if (!fs.existsSync(parentDir)) {
      fs.mkdirSync(parentDir, { recursive: true });
    }

    if (fs.existsSync(linkPath)) {
      fs.unlinkSync(linkPath);
    }

    // Link relative to the symlink's own directory so the archive stays
    // valid if the whole tree is moved.
    fs.symlinkSync(path.relative(parentDir, target), linkPath);
  } catch (e) {
    console.error(`Failed to create symlink: ${e.message}`);
  }
}
|
||||
|
||||
// Archive network responses by observing traffic on the shared Chrome session.
//
// Connects to the CDP endpoint left by the chrome_session extractor, listens
// for 'response' events, and for each capturable 2xx response writes the body
// to responses/all/<unique name>, adds a URL-organized symlink under
// responses/<type>/<hostname>/<path>/, and appends a JSONL record to
// responses/index.jsonl.
//
// NOTE: RESPONSES_TIMEOUT is documented in the file header but not currently
// enforced; this hook only waits a fixed 2s for late traffic after attaching.
//
// @param {string} originalUrl - The snapshot URL (currently unused; the page
//   is already loaded by chrome_session).
// @returns {Promise<object>} { success, output, savedCount, indexPath } on
//   success, or { success: false, error } on failure.
async function archiveResponses(originalUrl) {
  const typesStr = getEnv('RESPONSES_TYPES', DEFAULT_TYPES.join(','));
  const typesToSave = typesStr.split(',').map(t => t.trim().toLowerCase());

  // Create output directories
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const allDir = path.join(OUTPUT_DIR, 'all');
  if (!fs.existsSync(allDir)) {
    fs.mkdirSync(allDir, { recursive: true });
  }

  // Create (truncate) the JSONL index file
  const indexPath = path.join(OUTPUT_DIR, 'index.jsonl');
  fs.writeFileSync(indexPath, ''); // Clear existing

  let browser = null;
  let savedCount = 0;

  try {
    // Connect to existing Chrome session
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
    }

    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });

    // Get the page (prefer the one that already navigated somewhere)
    const pages = await browser.pages();
    const page = pages.find(p => p.url().startsWith('http')) || pages[0];

    if (!page) {
      return { success: false, error: 'No page found in Chrome session' };
    }

    // Explicitly disable request interception — we only observe, never block
    await page.setRequestInterception(false);

    // Listen for responses
    page.on('response', async (response) => {
      try {
        const request = response.request();
        const url = response.url();
        const resourceType = request.resourceType().toLowerCase();
        const method = request.method();
        const status = response.status();

        // Skip redirects and error responses
        if (status >= 300 && status < 400) return;
        if (status >= 400 && status < 600) return;

        // Check if we should save this resource type
        if (typesToSave.length && !typesToSave.includes(resourceType)) {
          return;
        }

        // Get response body
        let bodyBuffer = null;
        try {
          bodyBuffer = await response.buffer();
        } catch (e) {
          // Some responses can't be captured (already consumed, etc.)
          return;
        }

        if (!bodyBuffer || bodyBuffer.length === 0) {
          return;
        }

        // Determine file extension: Content-Type first, URL path as fallback
        const mimeType = response.headers()['content-type'] || '';
        let extension = getExtensionFromMimeType(mimeType) || getExtensionFromUrl(url);

        // Create timestamp-based unique filename
        const timestamp = new Date().toISOString().replace(/[-:]/g, '').replace(/\..+/, '');
        const urlHash = sanitizeFilename(encodeURIComponent(url).slice(0, 64));
        const uniqueFilename = `${timestamp}__${method}__${urlHash}${extension ? '.' + extension : ''}`;
        const uniquePath = path.join(allDir, uniqueFilename);

        // Save to unique file
        fs.writeFileSync(uniquePath, bodyBuffer);

        // Create URL-organized symlink: responses/<type>/<hostname>/<path>/<filename>
        try {
          const urlObj = new URL(url);
          const hostname = urlObj.hostname;
          const pathname = urlObj.pathname || '/';
          const filename = path.basename(pathname) || 'index' + (extension ? '.' + extension : '');
          const dirPath = path.dirname(pathname);

          const symlinkDir = path.join(OUTPUT_DIR, resourceType, hostname, dirPath);
          const symlinkPath = path.join(symlinkDir, filename);
          await createSymlink(uniquePath, symlinkPath);
        } catch (e) {
          // URL parsing or symlink creation failed, skip
        }

        // Calculate SHA256 of both body and URL for dedup/lookup
        const sha256 = crypto.createHash('sha256').update(bodyBuffer).digest('hex');
        const urlSha256 = crypto.createHash('sha256').update(url).digest('hex');

        // Write index entry.
        // BUGFIX: the original checked `method === 'DATA'`, but `method` is an
        // HTTP verb (GET/POST/...), so data: URLs were never truncated and
        // could bloat the index; detect them by URL scheme instead.
        const indexEntry = {
          ts: timestamp,
          method,
          url: url.startsWith('data:') ? url.slice(0, 128) : url, // Truncate data: URLs
          urlSha256,
          status,
          resourceType,
          mimeType: mimeType.split(';')[0],
          responseSha256: sha256,
          path: './' + path.relative(OUTPUT_DIR, uniquePath),
          extension,
        };

        fs.appendFileSync(indexPath, JSON.stringify(indexEntry) + '\n');
        savedCount++;

      } catch (e) {
        // Log but don't fail the whole extraction
        console.error(`Error capturing response: ${e.message}`);
      }
    });

    // Wait a bit to ensure we capture responses
    // (chrome_session already loaded the page, just capture any remaining traffic)
    await new Promise(resolve => setTimeout(resolve, 2000));

    return {
      success: true,
      output: OUTPUT_DIR,
      savedCount,
      indexPath,
    };

  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    if (browser) {
      browser.disconnect();
    }
  }
}
|
||||
|
||||
// CLI entry point: parse args, run the responses extractor, and emit the
// START_TS/END_TS/DURATION/OUTPUT/STATUS/RESULT_JSON lines that the
// ArchiveBox hook protocol expects on stdout. Exits 0 on success or skip,
// 1 on failure or bad usage.
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;

  // Both --url and --snapshot-id are required
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__23_responses.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }

  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';
  let savedCount = 0;

  try {
    // Check if enabled; when disabled, report 'skipped' and exit 0 early
    // (emitting the same machine-readable lines as the normal path).
    if (!getEnvBool('SAVE_RESPONSES', true)) {
      console.log('Skipping responses (SAVE_RESPONSES=False)');
      status = 'skipped';
      const endTs = new Date();
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${endTs.toISOString()}`);
      console.log(`STATUS=${status}`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
      process.exit(0);
    }

    const result = await archiveResponses(url);

    if (result.success) {
      status = 'succeeded';
      output = result.output;
      savedCount = result.savedCount || 0;
      console.log(`Saved ${savedCount} network responses to ${output}/`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }

  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;

  // Print machine-readable result lines (parsed by the hook runner)
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);

  if (error) {
    console.error(`ERROR=${error}`);
  }

  // Print structured JSON summary on a single RESULT_JSON= line
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    saved_count: savedCount,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);

  process.exit(status === 'succeeded' ? 0 : 1);
}

// Top-level safety net: any unhandled rejection becomes a fatal exit(1)
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});
|
||||
90
archivebox/plugins/run_all_tests.sh
Executable file
90
archivebox/plugins/run_all_tests.sh
Executable file
@@ -0,0 +1,90 @@
|
||||
#!/bin/bash
# Run all plugin tests
#
# Usage: ./run_all_tests.sh

set -e

echo "=========================================="
echo "Running All Plugin Tests"
echo "=========================================="
echo ""

# Color codes
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# Track results
TOTAL_TESTS=0
PASSED_TESTS=0
FAILED_TESTS=0

# Run one node:test file and record its pass/fail outcome.
# $1: path to the test file, e.g. <plugin>/tests/test_foo.js
run_test_suite() {
    local test_file=$1
    # BUGFIX: `basename $(dirname $test_file)` always yielded "tests" because
    # test files live at <plugin>/tests/; take the grandparent directory so
    # suites are labeled with the plugin name. Expansions are quoted so paths
    # containing spaces don't word-split, and the variable is declared
    # separately so the command substitution's exit status isn't masked.
    local test_name
    test_name=$(basename "$(dirname "$(dirname "$test_file")")")

    echo -e "${YELLOW}[RUNNING]${NC} $test_name tests..."

    if node --test "$test_file" 2>&1; then
        echo -e "${GREEN}[PASSED]${NC} $test_name tests"
        PASSED_TESTS=$((PASSED_TESTS + 1))
    else
        echo -e "${RED}[FAILED]${NC} $test_name tests"
        FAILED_TESTS=$((FAILED_TESTS + 1))
    fi

    TOTAL_TESTS=$((TOTAL_TESTS + 1))
    echo ""
}

# Find and run all test files
echo "Finding test files..."
echo ""

# Chrome extensions utils tests
if [ -f "chrome_extensions/tests/test_chrome_extension_utils.js" ]; then
    run_test_suite "chrome_extensions/tests/test_chrome_extension_utils.js"
fi

# Captcha2 tests
if [ -f "captcha2/tests/test_captcha2_install.js" ]; then
    run_test_suite "captcha2/tests/test_captcha2_install.js"
fi

if [ -f "captcha2/tests/test_captcha2_config.js" ]; then
    run_test_suite "captcha2/tests/test_captcha2_config.js"
fi

# I Still Don't Care About Cookies tests
if [ -f "istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.js" ]; then
    run_test_suite "istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.js"
fi

# uBlock tests
if [ -f "ublock/tests/test_ublock.js" ]; then
    run_test_suite "ublock/tests/test_ublock.js"
fi

# SingleFile tests
if [ -f "singlefile/tests/test_singlefile.js" ]; then
    run_test_suite "singlefile/tests/test_singlefile.js"
fi

# Print summary
echo "=========================================="
echo "Test Summary"
echo "=========================================="
echo -e "Total test suites: $TOTAL_TESTS"
echo -e "${GREEN}Passed:${NC} $PASSED_TESTS"
echo -e "${RED}Failed:${NC} $FAILED_TESTS"
echo ""

if [ $FAILED_TESTS -eq 0 ]; then
    echo -e "${GREEN}✓ All tests passed!${NC}"
    exit 0
else
    echo -e "${RED}✗ Some tests failed${NC}"
    exit 1
fi
||||
29
archivebox/plugins/run_tests.sh
Executable file
29
archivebox/plugins/run_tests.sh
Executable file
@@ -0,0 +1,29 @@
|
||||
#!/bin/bash
# Run all plugin tests
#
# Usage: ./run_tests.sh [plugin_name]
#
# Examples:
#   ./run_tests.sh              # Run all tests
#   ./run_tests.sh captcha2     # Run only captcha2 tests
#   ./run_tests.sh chrome_*     # Run all chrome tests

set -e

echo "=========================================="
echo "Running ArchiveBox Plugin Tests"
echo "=========================================="
echo ""

# NOTE(review): this runner invokes pytest on */tests/test_*.py, while the
# sibling run_all_tests.sh runs JS suites via `node --test` — confirm which
# plugins actually ship Python tests before relying on this script.
if [ -n "$1" ]; then
    echo "Running tests for: $1"
    python -m pytest "$1"/tests/ -v
else
    echo "Running all plugin tests..."
    python -m pytest */tests/test_*.py -v
fi

echo ""
echo "=========================================="
echo "Tests Complete"
echo "=========================================="
||||
291
archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js
Normal file
291
archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js
Normal file
@@ -0,0 +1,291 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Take a screenshot of a URL using Chrome/Puppeteer.
|
||||
*
|
||||
* If a Chrome session exists (from chrome_session extractor), connects to it via CDP.
|
||||
* Otherwise launches a new Chrome instance.
|
||||
*
|
||||
* Usage: on_Snapshot__21_screenshot.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes screenshot/screenshot.png
|
||||
*
|
||||
* Environment variables:
|
||||
* CHROME_BINARY: Path to Chrome/Chromium binary
|
||||
* CHROME_TIMEOUT: Timeout in seconds (default: 60)
|
||||
* CHROME_RESOLUTION: Screenshot resolution (default: 1440,2000)
|
||||
* CHROME_USER_AGENT: User agent string (optional)
|
||||
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
|
||||
* CHROME_HEADLESS: Run in headless mode (default: true)
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata (echoed in log lines and the final RESULT_JSON)
const EXTRACTOR_NAME = 'screenshot';
// Output directory and filename, relative to the snapshot working directory
const OUTPUT_DIR = 'screenshot';
const OUTPUT_FILE = 'screenshot.png';
// Directory where the chrome_session extractor leaves its CDP endpoint file
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
|
||||
// Parse `--key=value` / `--flag` CLI arguments into a plain object.
// Dashes in key names become underscores; bare flags map to `true`.
function parseArgs() {
  const result = {};
  for (const raw of process.argv.slice(2)) {
    if (!raw.startsWith('--')) continue;
    const [name, ...pieces] = raw.slice(2).split('=');
    result[name.replace(/-/g, '_')] = pieces.join('=') || true;
  }
  return result;
}

// Read a trimmed environment variable; unset/empty values use defaultValue.
function getEnv(name, defaultValue = '') {
  return (process.env[name] || defaultValue).trim();
}

// Interpret an env var as a boolean; unrecognized values use defaultValue.
function getEnvBool(name, defaultValue = false) {
  const value = getEnv(name, '').toLowerCase();
  const truthy = new Set(['true', '1', 'yes', 'on']);
  const falsy = new Set(['false', '0', 'no', 'off']);
  if (truthy.has(value)) return true;
  if (falsy.has(value)) return false;
  return defaultValue;
}

// Interpret an env var as a base-10 integer; defaultValue on parse failure.
function getEnvInt(name, defaultValue = 0) {
  const parsed = Number.parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
|
||||
|
||||
// Check if staticfile extractor already downloaded this URL
const STATICFILE_DIR = 'staticfile';

// True when the staticfile extractor produced at least one output file,
// in which case a page screenshot is pointless.
function hasStaticFileOutput() {
  if (!fs.existsSync(STATICFILE_DIR)) return false;
  return fs.readdirSync(STATICFILE_DIR).length > 0;
}
|
||||
|
||||
// Return the CDP websocket URL left behind by the chrome_session extractor,
// or null when no shared session is available.
function getCdpUrl() {
  const sessionFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  return fs.existsSync(sessionFile)
    ? fs.readFileSync(sessionFile, 'utf8').trim()
    : null;
}
|
||||
|
||||
// Locate a Chrome/Chromium binary: honour CHROME_BINARY when it points at an
// existing file, otherwise probe well-known absolute install paths.
// Returns null when nothing is found.
function findChrome() {
  const configured = getEnv('CHROME_BINARY');
  if (configured && fs.existsSync(configured)) {
    return configured;
  }

  const candidates = [
    // Linux
    '/usr/bin/google-chrome',
    '/usr/bin/google-chrome-stable',
    '/usr/bin/chromium',
    '/usr/bin/chromium-browser',
    // macOS
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
    // Common paths (bare names can't be existence-checked, so they're skipped)
    'google-chrome',
    'chromium',
  ];

  const found = candidates.find(c => c.startsWith('/') && fs.existsSync(c));
  return found || null;
}
|
||||
|
||||
// Parse a 'WIDTH,HEIGHT' resolution string into { width, height }.
// Missing or non-numeric components fall back to 1440x2000.
function parseResolution(resolution) {
  const parts = resolution.split(',').map(part => parseInt(part.trim(), 10));
  const [width, height] = parts;
  return {
    width: width || 1440,
    height: height || 2000,
  };
}
|
||||
|
||||
// Take a full-page PNG screenshot of `url`.
//
// Strategy: prefer attaching to the shared Chrome session (page already
// navigated by chrome_session); fall back to launching a fresh Chrome and
// navigating ourselves. A browser we launched is closed in `finally`; a
// shared session is left running for other extractors.
//
// @param {string} url - Page to capture (only navigated to in the fallback path).
// @returns {Promise<object>} { success: true, output } or { success: false, error }.
async function takeScreenshot(url) {
  // Config: extractor-specific vars win over their generic fallbacks
  const timeout = (getEnvInt('CHROME_TIMEOUT') || getEnvInt('TIMEOUT', 60)) * 1000;
  const resolution = getEnv('CHROME_RESOLUTION') || getEnv('RESOLUTION', '1440,2000');
  const userAgent = getEnv('CHROME_USER_AGENT') || getEnv('USER_AGENT', '');
  const checkSsl = getEnvBool('CHROME_CHECK_SSL_VALIDITY', getEnvBool('CHECK_SSL_VALIDITY', true));
  const headless = getEnvBool('CHROME_HEADLESS', true);

  const { width, height } = parseResolution(resolution);

  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

  let browser = null;
  let page = null;
  // Tracks whether we attached to the shared session (don't close it) vs
  // launched our own browser (must close it).
  let connectedToSession = false;

  try {
    // Try to connect to existing Chrome session
    const cdpUrl = getCdpUrl();
    if (cdpUrl) {
      try {
        browser = await puppeteer.connect({
          browserWSEndpoint: cdpUrl,
          defaultViewport: { width, height },
        });
        connectedToSession = true;

        // Get existing pages or create new one (prefer the navigated tab)
        const pages = await browser.pages();
        page = pages.find(p => p.url().startsWith('http')) || pages[0];

        if (!page) {
          page = await browser.newPage();
        }

        // Set viewport on the page
        await page.setViewport({ width, height });

      } catch (e) {
        // Connection failed: fall through to launching our own Chrome
        console.error(`Failed to connect to CDP session: ${e.message}`);
        browser = null;
      }
    }

    // Fall back to launching new browser
    if (!browser) {
      const executablePath = findChrome();
      if (!executablePath) {
        return { success: false, error: 'Chrome binary not found' };
      }

      browser = await puppeteer.launch({
        executablePath,
        headless: headless ? 'new' : false,
        args: [
          '--no-sandbox',
          '--disable-setuid-sandbox',
          '--disable-dev-shm-usage',
          '--disable-gpu',
          `--window-size=${width},${height}`,
          // Only relax cert checking when SSL validation is disabled
          ...(checkSsl ? [] : ['--ignore-certificate-errors']),
        ],
        defaultViewport: { width, height },
      });

      page = await browser.newPage();

      // Navigate to URL (only if we launched a fresh browser; the shared
      // session's page is already loaded)
      if (userAgent) {
        await page.setUserAgent(userAgent);
      }

      await page.goto(url, {
        waitUntil: 'networkidle2',
        timeout,
      });
    }

    // Take screenshot
    await page.screenshot({
      path: outputPath,
      fullPage: true,
    });

    // Verify something was actually written before reporting success
    if (fs.existsSync(outputPath) && fs.statSync(outputPath).size > 0) {
      return { success: true, output: outputPath };
    } else {
      return { success: false, error: 'Screenshot file not created' };
    }

  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Only close browser if we launched it (not if we connected to session)
    if (browser && !connectedToSession) {
      await browser.close();
    }
  }
}
|
||||
|
||||
// CLI entry point: parse args, take the screenshot (unless the staticfile
// extractor already handled this URL), and emit the START_TS/END_TS/STATUS/
// RESULT_JSON lines the ArchiveBox hook protocol expects on stdout.
// Exits 0 on success/skip, 1 on failure or bad usage.
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;

  // Both --url and --snapshot-id are required
  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__21_screenshot.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }

  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';

  try {
    // Check if staticfile extractor already handled this (permanent skip)
    if (hasStaticFileOutput()) {
      console.log(`Skipping screenshot - staticfile extractor already downloaded this`);
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${new Date().toISOString()}`);
      console.log(`STATUS=skipped`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status: 'skipped', url, snapshot_id: snapshotId})}`);
      process.exit(0); // Permanent skip - staticfile already handled
    } else {
      const result = await takeScreenshot(url);

      if (result.success) {
        status = 'succeeded';
        output = result.output;
        const size = fs.statSync(output).size;
        console.log(`Screenshot saved (${size} bytes)`);
      } else {
        status = 'failed';
        error = result.error;
      }
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }

  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;

  // Print machine-readable result lines (parsed by the hook runner)
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);

  if (error) {
    console.error(`ERROR=${error}`);
  }

  // Print structured JSON summary on a single RESULT_JSON= line
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);

  process.exit(status === 'succeeded' ? 0 : 1);
}

// Top-level safety net: any unhandled rejection becomes a fatal exit(1)
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});
|
||||
24
archivebox/plugins/search_backend_ripgrep/config.json
Normal file
24
archivebox/plugins/search_backend_ripgrep/config.json
Normal file
@@ -0,0 +1,24 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"RIPGREP_BINARY": {
|
||||
"type": "string",
|
||||
"default": "rg",
|
||||
"description": "Path to ripgrep binary"
|
||||
},
|
||||
"RIPGREP_IGNORE_EXTENSIONS": {
|
||||
"type": "string",
|
||||
"default": "css,js,orig,svg",
|
||||
"description": "Comma-separated file extensions to ignore"
|
||||
},
|
||||
"SEARCH_BACKEND_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 90,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Search timeout in seconds"
|
||||
}
|
||||
}
|
||||
}
|
||||
80
archivebox/plugins/search_backend_ripgrep/search.py
Normal file
80
archivebox/plugins/search_backend_ripgrep/search.py
Normal file
@@ -0,0 +1,80 @@
|
||||
"""
|
||||
Ripgrep search backend - searches files directly without indexing.
|
||||
|
||||
This backend doesn't maintain an index - it searches archived files directly
|
||||
using ripgrep (rg). This is simpler but slower for large archives.
|
||||
|
||||
Environment variables:
|
||||
RIPGREP_BINARY: Path to ripgrep binary (default: rg)
|
||||
RIPGREP_IGNORE_EXTENSIONS: Comma-separated extensions to ignore (default: css,js,orig,svg)
|
||||
SEARCH_BACKEND_TIMEOUT: Search timeout in seconds (default: 90)
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import List, Iterable
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
|
||||
# Config read from env, keeping old var names for backwards compatibility.
# Path or name of the ripgrep binary (resolved via PATH at search time)
RIPGREP_BINARY = os.environ.get('RIPGREP_BINARY', 'rg').strip()
# Comma-separated list of file extensions excluded from searches
RIPGREP_IGNORE_EXTENSIONS = os.environ.get('RIPGREP_IGNORE_EXTENSIONS', 'css,js,orig,svg').strip()
# Max seconds to wait for a single ripgrep invocation
SEARCH_BACKEND_TIMEOUT = int(os.environ.get('SEARCH_BACKEND_TIMEOUT', '90'))
|
||||
|
||||
|
||||
def search(query: str) -> List[str]:
    """Search archived content with ripgrep and return matching snapshot IDs.

    Runs ``rg --files-with-matches`` over ``settings.ARCHIVE_DIR``, treating
    ``query`` as a case-insensitive regex, then maps each matching file path
    back to its snapshot directory (the first path component under
    ARCHIVE_DIR). Best-effort: timeouts and other failures return ``[]``.

    Raises:
        RuntimeError: if the ripgrep binary cannot be found.
    """
    rg_binary = shutil.which(RIPGREP_BINARY) or RIPGREP_BINARY
    if not rg_binary or not Path(rg_binary).exists():
        raise RuntimeError(f'ripgrep binary not found ({RIPGREP_BINARY}). Install with: apt install ripgrep')

    archive_dir = Path(settings.ARCHIVE_DIR)
    if not archive_dir.exists():
        return []

    # Build an rg "type" from the configured extension list so those files
    # can be excluded via --type-not (e.g. '*.{css,js,orig,svg}')
    ignore_pattern = f'*.{{{RIPGREP_IGNORE_EXTENSIONS}}}'

    cmd = [
        rg_binary,
        f'--type-add=ignore:{ignore_pattern}',
        '--type-not=ignore',
        '--files-with-matches',  # print matching file paths only, not lines
        '--no-messages',         # suppress unreadable-file warnings
        '--ignore-case',
        '--regexp',
        query,
        str(archive_dir),
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=SEARCH_BACKEND_TIMEOUT)

        # Extract snapshot IDs from file paths
        # Paths look like: archive/<snapshot_id>/<extractor>/file.txt
        snapshot_ids = set()
        for line in result.stdout.strip().split('\n'):
            if not line:
                continue
            path = Path(line)
            try:
                relative = path.relative_to(archive_dir)
                snapshot_id = relative.parts[0]
                snapshot_ids.add(snapshot_id)
            except (ValueError, IndexError):
                # Path outside archive_dir (or with no components) -- skip it
                continue

        return list(snapshot_ids)

    except subprocess.TimeoutExpired:
        # Search took longer than SEARCH_BACKEND_TIMEOUT -- report no results
        return []
    except Exception:
        # NOTE(review): broad best-effort catch -- any failure is silently
        # reported as "no results"; consider logging before swallowing.
        return []
|
||||
|
||||
|
||||
def flush(snapshot_ids: Iterable[str]) -> None:
    """No-op: ripgrep searches files directly, so there is no index to clear."""
    pass
||||
0
archivebox/plugins/search_backend_sonic/__init__.py
Normal file
0
archivebox/plugins/search_backend_sonic/__init__.py
Normal file
37
archivebox/plugins/search_backend_sonic/config.json
Normal file
37
archivebox/plugins/search_backend_sonic/config.json
Normal file
@@ -0,0 +1,37 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SEARCH_BACKEND_HOST_NAME": {
|
||||
"type": "string",
|
||||
"default": "127.0.0.1",
|
||||
"x-aliases": ["SONIC_HOST"],
|
||||
"description": "Sonic server hostname"
|
||||
},
|
||||
"SEARCH_BACKEND_PORT": {
|
||||
"type": "integer",
|
||||
"default": 1491,
|
||||
"minimum": 1,
|
||||
"maximum": 65535,
|
||||
"x-aliases": ["SONIC_PORT"],
|
||||
"description": "Sonic server port"
|
||||
},
|
||||
"SEARCH_BACKEND_PASSWORD": {
|
||||
"type": "string",
|
||||
"default": "SecretPassword",
|
||||
"x-aliases": ["SONIC_PASSWORD"],
|
||||
"description": "Sonic server password"
|
||||
},
|
||||
"SONIC_COLLECTION": {
|
||||
"type": "string",
|
||||
"default": "archivebox",
|
||||
"description": "Sonic collection name"
|
||||
},
|
||||
"SONIC_BUCKET": {
|
||||
"type": "string",
|
||||
"default": "snapshots",
|
||||
"description": "Sonic bucket name"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,225 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Sonic search backend - indexes snapshot content in Sonic server.
|
||||
|
||||
This hook runs after all extractors and indexes text content in Sonic.
|
||||
Only runs if SEARCH_BACKEND_ENGINE=sonic.
|
||||
|
||||
Usage: on_Snapshot__91_index_sonic.py --url=<url> --snapshot-id=<uuid>
|
||||
|
||||
Environment variables:
|
||||
SEARCH_BACKEND_ENGINE: Must be 'sonic' for this hook to run
|
||||
USE_INDEXING_BACKEND: Enable search indexing (default: true)
|
||||
SEARCH_BACKEND_HOST_NAME: Sonic server host (default: 127.0.0.1)
|
||||
SEARCH_BACKEND_PORT: Sonic server port (default: 1491)
|
||||
SEARCH_BACKEND_PASSWORD: Sonic server password (default: SecretPassword)
|
||||
SONIC_COLLECTION: Collection name (default: archivebox)
|
||||
SONIC_BUCKET: Bucket name (default: snapshots)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata (echoed in log lines and the final RESULT_JSON)
EXTRACTOR_NAME = 'index_sonic'
OUTPUT_DIR = 'search_index'

# (extractor_dir, filename_or_glob) pairs to harvest text from, checked in
# this order. Patterns containing '*' are treated as globs relative to the
# extractor's output directory.
INDEXABLE_FILES = [
    ('readability', 'content.txt'),
    ('readability', 'content.html'),
    ('mercury', 'content.txt'),
    ('mercury', 'content.html'),
    ('htmltotext', 'output.txt'),
    ('singlefile', 'singlefile.html'),
    ('dom', 'output.html'),
    ('wget', '**/*.html'),
    ('wget', '**/*.htm'),
    ('title', 'title.txt'),
]
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
    """Return env var ``name`` with surrounding whitespace stripped, or ``default``."""
    raw = os.environ.get(name, default)
    return raw.strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean-ish env var; unrecognized or unset values yield ``default``."""
    value = get_env(name, '').lower()
    truthy = ('true', '1', 'yes', 'on')
    falsy = ('false', '0', 'no', 'off')
    if value in truthy:
        return True
    return False if value in falsy else default


def get_env_int(name: str, default: int = 0) -> int:
    """Parse an integer env var; non-numeric values yield ``default``."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
||||
|
||||
|
||||
def strip_html_tags(html: str) -> str:
    """Remove HTML tags and decode common entities, collapsing whitespace.

    <script>/<style> blocks are dropped entirely (their text is not page
    content). Only the handful of entities handled below are decoded;
    '&amp;' is decoded LAST so that double-escaped sequences like
    '&amp;lt;' correctly yield the literal text '&lt;' instead of '<'.
    (The previous version decoded '&amp;' first, double-unescaping input.)

    Args:
        html: Raw HTML markup.

    Returns:
        Plain text with single spaces between words.
    """
    # Drop script/style blocks wholesale before stripping tags
    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.IGNORECASE)
    # Replace remaining tags with spaces so adjacent words don't merge
    html = re.sub(r'<[^>]+>', ' ', html)
    # Decode entities; '&amp;' must come last to avoid double-unescaping
    html = html.replace('&nbsp;', ' ')
    html = html.replace('&lt;', '<').replace('&gt;', '>')
    html = html.replace('&quot;', '"')
    html = html.replace('&amp;', '&')
    # Collapse all whitespace runs into single spaces
    html = re.sub(r'\s+', ' ', html)
    return html.strip()
|
||||
|
||||
|
||||
def find_indexable_content() -> list[tuple[str, str]]:
    """Collect (source-label, text) pairs from known extractor output files.

    Labels have the form '<extractor>/<filename>'; HTML sources are
    converted to plain text before being returned.
    """
    collected: list[tuple[str, str]] = []
    base = Path.cwd()

    for extractor, pattern in INDEXABLE_FILES:
        subdir = base / extractor
        if not subdir.exists():
            continue

        # Glob patterns may match many files; plain names match at most one.
        if '*' in pattern:
            candidates = list(subdir.glob(pattern))
        else:
            single = subdir / pattern
            candidates = [single] if single.exists() else []

        for candidate in candidates:
            if not (candidate.is_file() and candidate.stat().st_size > 0):
                continue
            try:
                text = candidate.read_text(encoding='utf-8', errors='ignore')
            except Exception:
                continue
            if not text.strip():
                continue
            if candidate.suffix in ('.html', '.htm'):
                text = strip_html_tags(text)
            collected.append((f'{extractor}/{candidate.name}', text))

    return collected
||||
|
||||
|
||||
def get_sonic_config() -> dict:
    """Assemble Sonic connection settings from the environment (with defaults)."""
    host = get_env('SEARCH_BACKEND_HOST_NAME', '127.0.0.1')
    port = get_env_int('SEARCH_BACKEND_PORT', 1491)
    password = get_env('SEARCH_BACKEND_PASSWORD', 'SecretPassword')
    return {
        'host': host,
        'port': port,
        'password': password,
        'collection': get_env('SONIC_COLLECTION', 'archivebox'),
        'bucket': get_env('SONIC_BUCKET', 'snapshots'),
    }
||||
|
||||
|
||||
def index_in_sonic(snapshot_id: str, texts: list[str]) -> None:
    """Push the given texts into the Sonic search index under *snapshot_id*.

    Raises RuntimeError when the sonic-client package is unavailable.
    """
    try:
        from sonic import IngestClient
    except ImportError:
        raise RuntimeError('sonic-client not installed. Run: pip install sonic-client')

    cfg = get_sonic_config()

    with IngestClient(cfg['host'], cfg['port'], cfg['password']) as ingest:
        # Best-effort flush of any previously indexed content for this snapshot.
        try:
            ingest.flush_object(cfg['collection'], cfg['bucket'], snapshot_id)
        except Exception:
            pass

        # Sonic rejects oversized pushes, so feed the content in fixed-size chunks.
        joined = ' '.join(texts)
        CHUNK = 10000
        for offset in range(0, len(joined), CHUNK):
            ingest.push(cfg['collection'], cfg['bucket'], snapshot_id,
                        joined[offset:offset + CHUNK])
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL that was archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Index snapshot content in Sonic.

    Runs in the snapshot output directory. Emits START_TS/END_TS/DURATION/
    OUTPUT/STATUS key=value lines plus a RESULT_JSON line for the
    orchestrator. Exits 0 on success or permanent skip, 1 on failure.
    """

    start_ts = datetime.now(timezone.utc)
    output = None
    status = 'failed'
    error = ''
    indexed_sources = []

    try:
        # Check if this backend is enabled (permanent skips - don't retry)
        backend = get_env('SEARCH_BACKEND_ENGINE', 'sqlite')
        if backend != 'sonic':
            print(f'Skipping Sonic indexing (SEARCH_BACKEND_ENGINE={backend})')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)  # Permanent skip - different backend selected
        if not get_env_bool('USE_INDEXING_BACKEND', True):
            print('Skipping indexing (USE_INDEXING_BACKEND=False)')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)  # Permanent skip - indexing disabled
        else:
            # Gather text from all extractor outputs present in this snapshot dir.
            contents = find_indexable_content()
            indexed_sources = [source for source, _ in contents]

            if not contents:
                status = 'skipped'
                print('No indexable content found')
            else:
                texts = [content for _, content in contents]
                index_in_sonic(snapshot_id, texts)
                status = 'succeeded'
                output = OUTPUT_DIR
                print(f'Sonic indexed {len(texts)} documents')
                print(f'Sources: {", ".join(indexed_sources)}')

    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()

    # Key=value protocol lines consumed by the orchestrator.
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')

    if error:
        print(f'ERROR={error}', file=sys.stderr)

    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'indexed_sources': indexed_sources,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')

    # NOTE(review): a 'skipped' status from "no indexable content" exits 1 here,
    # unlike the env-based skips above which exit 0 — confirm this is intended.
    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()
||||
50
archivebox/plugins/search_backend_sonic/search.py
Normal file
50
archivebox/plugins/search_backend_sonic/search.py
Normal file
@@ -0,0 +1,50 @@
|
||||
"""
|
||||
Sonic search backend - search and flush operations.
|
||||
|
||||
This module provides the search interface for the Sonic backend.
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import List, Iterable
|
||||
|
||||
|
||||
def get_sonic_config() -> dict:
    """Read Sonic connection settings from the environment, with defaults."""
    def _env(name: str, default: str) -> str:
        return os.environ.get(name, default).strip()

    return {
        'host': _env('SEARCH_BACKEND_HOST_NAME', '127.0.0.1'),
        'port': int(os.environ.get('SEARCH_BACKEND_PORT', '1491')),
        'password': _env('SEARCH_BACKEND_PASSWORD', 'SecretPassword'),
        'collection': _env('SONIC_COLLECTION', 'archivebox'),
        'bucket': _env('SONIC_BUCKET', 'snapshots'),
    }
||||
|
||||
|
||||
def search(query: str) -> List[str]:
    """Query Sonic for snapshot ids matching *query* (up to 100 results).

    Raises RuntimeError when the sonic-client package is unavailable.
    """
    try:
        from sonic import SearchClient
    except ImportError:
        raise RuntimeError('sonic-client not installed. Run: pip install sonic-client')

    cfg = get_sonic_config()

    with SearchClient(cfg['host'], cfg['port'], cfg['password']) as client:
        return client.query(cfg['collection'], cfg['bucket'], query, limit=100)
||||
|
||||
|
||||
def flush(snapshot_ids: Iterable[str]) -> None:
    """Best-effort removal of the given snapshots from the Sonic index."""
    try:
        from sonic import IngestClient
    except ImportError:
        raise RuntimeError('sonic-client not installed. Run: pip install sonic-client')

    cfg = get_sonic_config()

    with IngestClient(cfg['host'], cfg['port'], cfg['password']) as ingest:
        for sid in snapshot_ids:
            # Ignore per-id failures so one bad id doesn't abort the batch.
            try:
                ingest.flush_object(cfg['collection'], cfg['bucket'], sid)
            except Exception:
                pass
||||
24
archivebox/plugins/search_backend_sqlite/config.json
Normal file
24
archivebox/plugins/search_backend_sqlite/config.json
Normal file
@@ -0,0 +1,24 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SQLITEFTS_DB": {
|
||||
"type": "string",
|
||||
"default": "search.sqlite3",
|
||||
"description": "SQLite FTS database filename"
|
||||
},
|
||||
"FTS_SEPARATE_DATABASE": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SQLITEFTS_SEPARATE_DATABASE"],
|
||||
"description": "Use separate database file for FTS index"
|
||||
},
|
||||
"FTS_TOKENIZERS": {
|
||||
"type": "string",
|
||||
"default": "porter unicode61 remove_diacritics 2",
|
||||
"x-aliases": ["SQLITEFTS_TOKENIZERS"],
|
||||
"description": "FTS5 tokenizer configuration"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,215 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
SQLite FTS5 search backend - indexes snapshot content for full-text search.
|
||||
|
||||
This hook runs after all extractors and indexes text content in SQLite FTS5.
|
||||
Only runs if SEARCH_BACKEND_ENGINE=sqlite.
|
||||
|
||||
Usage: on_Snapshot__90_index_sqlite.py --url=<url> --snapshot-id=<uuid>
|
||||
|
||||
Environment variables:
|
||||
SEARCH_BACKEND_ENGINE: Must be 'sqlite' for this hook to run
|
||||
USE_INDEXING_BACKEND: Enable search indexing (default: true)
|
||||
SQLITEFTS_DB: Database filename (default: search.sqlite3)
|
||||
FTS_TOKENIZERS: FTS5 tokenizer config (default: porter unicode61 remove_diacritics 2)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sqlite3
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata: name reported in RESULT_JSON, dir used as OUTPUT on success
EXTRACTOR_NAME = 'index_sqlite'
OUTPUT_DIR = 'search_index'

# Text file patterns to index, in priority order
# Each entry is (extractor output dir, filename or glob relative to that dir);
# all matching files are indexed, not just the first.
INDEXABLE_FILES = [
    ('readability', 'content.txt'),
    ('readability', 'content.html'),
    ('mercury', 'content.txt'),
    ('mercury', 'content.html'),
    ('htmltotext', 'output.txt'),
    ('singlefile', 'singlefile.html'),
    ('dom', 'output.html'),
    ('wget', '**/*.html'),
    ('wget', '**/*.htm'),
    ('title', 'title.txt'),
]
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
    """Fetch an environment variable (default if unset), stripped of whitespace."""
    value = os.environ.get(name, default)
    return value.strip()
||||
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse an env var as a boolean flag; unrecognized values fall back to *default*."""
    lookup = {
        'true': True, '1': True, 'yes': True, 'on': True,
        'false': False, '0': False, 'no': False, 'off': False,
    }
    return lookup.get(get_env(name, '').lower(), default)
||||
|
||||
|
||||
def strip_html_tags(html: str) -> str:
    """Remove HTML tags and decode entities, keeping readable text content.

    Script/style bodies are dropped entirely, remaining tags are replaced
    with spaces, HTML entities are decoded, and whitespace is collapsed.
    """
    # Local import so the module-level import block stays unchanged.
    from html import unescape

    # Drop script/style bodies first so their code never reaches the index.
    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.IGNORECASE)
    # Replace every remaining tag with a space so words don't run together.
    html = re.sub(r'<[^>]+>', ' ', html)
    # Decode all entities via the stdlib instead of a hand-rolled subset;
    # the previous replace() calls had lost their entity names and were no-ops.
    html = unescape(html)
    html = html.replace('\xa0', ' ')  # &nbsp; decodes to a non-breaking space
    html = re.sub(r'\s+', ' ', html)
    return html.strip()
||||
|
||||
|
||||
def find_indexable_content() -> list[tuple[str, str]]:
    """Find text content to index from extractor outputs.

    Returns (label, text) pairs where label is '<extractor>/<filename>'.
    HTML sources are converted to plain text first.
    """
    def load(path: Path, extractor: str) -> tuple[str, str] | None:
        # Skip directories, empty files, unreadable files, and whitespace-only text.
        if not (path.is_file() and path.stat().st_size > 0):
            return None
        try:
            text = path.read_text(encoding='utf-8', errors='ignore')
        except Exception:
            return None
        if not text.strip():
            return None
        if path.suffix in ('.html', '.htm'):
            text = strip_html_tags(text)
        return (f'{extractor}/{path.name}', text)

    found: list[tuple[str, str]] = []
    root = Path.cwd()

    for extractor, pattern in INDEXABLE_FILES:
        folder = root / extractor
        if not folder.exists():
            continue

        if '*' in pattern:
            paths = list(folder.glob(pattern))
        else:
            single = folder / pattern
            paths = [single] if single.exists() else []

        for p in paths:
            entry = load(p, extractor)
            if entry is not None:
                found.append(entry)

    return found
||||
|
||||
|
||||
def get_db_path() -> Path:
    """Resolve the absolute path of the FTS index database file.

    DATA_DIR defaults to two levels above the current working directory
    (snapshot dirs live under DATA_DIR/archive/<snapshot>).
    """
    fallback_data_dir = str(Path.cwd().parent.parent)
    data_dir = get_env('DATA_DIR', fallback_data_dir)
    db_name = get_env('SQLITEFTS_DB', 'search.sqlite3')
    return Path(data_dir) / db_name
||||
|
||||
|
||||
def index_in_sqlite(snapshot_id: str, texts: list[str]) -> None:
    """Index the given texts for *snapshot_id* in the SQLite FTS5 table.

    Creates the `search_index` virtual table on first use, replaces any
    previously indexed rows for this snapshot, then inserts the new content
    as a single document.
    """
    db_path = get_db_path()
    tokenizers = get_env('FTS_TOKENIZERS', 'porter unicode61 remove_diacritics 2')
    conn = sqlite3.connect(str(db_path))

    try:
        # The tokenize= clause cannot be bound as a SQL parameter, so the config
        # value must be interpolated; escape single quotes so an arbitrary
        # FTS_TOKENIZERS value cannot break (or inject into) the statement.
        safe_tokenizers = tokenizers.replace("'", "''")
        conn.execute(f'''
            CREATE VIRTUAL TABLE IF NOT EXISTS search_index
            USING fts5(snapshot_id, content, tokenize='{safe_tokenizers}')
        ''')

        # Drop stale rows so re-archiving a snapshot replaces its index entry.
        conn.execute('DELETE FROM search_index WHERE snapshot_id = ?', (snapshot_id,))

        # Insert new content as one document.
        content = '\n\n'.join(texts)
        conn.execute(
            'INSERT INTO search_index (snapshot_id, content) VALUES (?, ?)',
            (snapshot_id, content)
        )
        conn.commit()
    finally:
        conn.close()
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL that was archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Index snapshot content in SQLite FTS5.

    Runs in the snapshot output directory. Emits START_TS/END_TS/DURATION/
    OUTPUT/STATUS key=value lines plus a RESULT_JSON line for the
    orchestrator. Exits 0 on success or permanent skip, 1 on failure.
    """

    start_ts = datetime.now(timezone.utc)
    output = None
    status = 'failed'
    error = ''
    indexed_sources = []

    try:
        # Check if this backend is enabled (permanent skips - don't retry)
        backend = get_env('SEARCH_BACKEND_ENGINE', 'sqlite')
        if backend != 'sqlite':
            print(f'Skipping SQLite indexing (SEARCH_BACKEND_ENGINE={backend})')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)  # Permanent skip - different backend selected
        if not get_env_bool('USE_INDEXING_BACKEND', True):
            print('Skipping indexing (USE_INDEXING_BACKEND=False)')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)  # Permanent skip - indexing disabled
        else:
            # Gather text from all extractor outputs present in this snapshot dir.
            contents = find_indexable_content()
            indexed_sources = [source for source, _ in contents]

            if not contents:
                status = 'skipped'
                print('No indexable content found')
            else:
                texts = [content for _, content in contents]
                index_in_sqlite(snapshot_id, texts)
                status = 'succeeded'
                output = OUTPUT_DIR
                print(f'SQLite FTS indexed {len(texts)} documents')
                print(f'Sources: {", ".join(indexed_sources)}')

    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()

    # Key=value protocol lines consumed by the orchestrator.
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')

    if error:
        print(f'ERROR={error}', file=sys.stderr)

    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'indexed_sources': indexed_sources,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')

    # NOTE(review): a 'skipped' status from "no indexable content" exits 1 here,
    # unlike the env-based skips above which exit 0 — confirm this is intended.
    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()
||||
65
archivebox/plugins/search_backend_sqlite/search.py
Normal file
65
archivebox/plugins/search_backend_sqlite/search.py
Normal file
@@ -0,0 +1,65 @@
|
||||
"""
|
||||
SQLite FTS5 search backend - search and flush operations.
|
||||
|
||||
This module provides the search interface for the SQLite FTS backend.
|
||||
|
||||
Environment variables:
|
||||
SQLITEFTS_DB: Database filename (default: search.sqlite3)
|
||||
FTS_SEPARATE_DATABASE: Use separate database file (default: true)
|
||||
FTS_TOKENIZERS: FTS5 tokenizer config (default: porter unicode61 remove_diacritics 2)
|
||||
"""
|
||||
|
||||
import os
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from typing import List, Iterable
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
|
||||
# Config with old var names for backwards compatibility
|
||||
SQLITEFTS_DB = os.environ.get('SQLITEFTS_DB', 'search.sqlite3').strip()
|
||||
FTS_SEPARATE_DATABASE = os.environ.get('FTS_SEPARATE_DATABASE', 'true').lower() in ('true', '1', 'yes')
|
||||
FTS_TOKENIZERS = os.environ.get('FTS_TOKENIZERS', 'porter unicode61 remove_diacritics 2').strip()
|
||||
|
||||
|
||||
def get_db_path() -> Path:
    """Return the path of the FTS index database file inside DATA_DIR."""
    # NOTE(review): FTS_SEPARATE_DATABASE is defined above but not consulted
    # here — confirm whether a combined-database mode is still supported.
    return Path(settings.DATA_DIR) / SQLITEFTS_DB
||||
|
||||
|
||||
def search(query: str) -> List[str]:
    """Return snapshot_ids whose indexed content matches *query* (FTS5 MATCH)."""
    db_path = get_db_path()
    if not db_path.exists():
        return []

    conn = sqlite3.connect(str(db_path))
    try:
        rows = conn.execute(
            'SELECT DISTINCT snapshot_id FROM search_index WHERE search_index MATCH ?',
            (query,)
        ).fetchall()
    except sqlite3.OperationalError:
        # The virtual table has not been created yet (nothing indexed).
        return []
    finally:
        conn.close()
    return [snapshot_id for (snapshot_id,) in rows]
||||
|
||||
|
||||
def flush(snapshot_ids: Iterable[str]) -> None:
    """Remove the given snapshots from the FTS index (no-op when absent)."""
    db_path = get_db_path()
    if not db_path.exists():
        return

    conn = sqlite3.connect(str(db_path))
    try:
        for sid in snapshot_ids:
            conn.execute('DELETE FROM search_index WHERE snapshot_id = ?', (sid,))
        conn.commit()
    except sqlite3.OperationalError:
        # The search_index table has not been created yet.
        pass
    finally:
        conn.close()
||||
219
archivebox/plugins/seo/on_Snapshot__38_seo.js
Executable file
219
archivebox/plugins/seo/on_Snapshot__38_seo.js
Executable file
@@ -0,0 +1,219 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Extract SEO metadata from a URL.
|
||||
*
|
||||
* Extracts all <meta> tags including:
|
||||
* - og:* (Open Graph)
|
||||
* - twitter:*
|
||||
* - description, keywords, author
|
||||
* - Any other meta tags
|
||||
*
|
||||
* Usage: on_Snapshot__17_seo.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes seo/seo.json
|
||||
*
|
||||
* Environment variables:
|
||||
* SAVE_SEO: Enable SEO extraction (default: true)
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'seo';
|
||||
const OUTPUT_DIR = 'seo';
|
||||
const OUTPUT_FILE = 'seo.json';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
// Parse --key=value style command line arguments into a plain object.
// Flags without a value (e.g. --verbose) are stored as boolean true,
// and dashes in key names are normalized to underscores.
function parseArgs() {
  const args = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const [rawKey, ...rest] = token.slice(2).split('=');
    args[rawKey.replace(/-/g, '_')] = rest.join('=') || true;
  }
  return args;
}
||||
|
||||
// Get environment variable with default
|
||||
// Read an environment variable, trimming whitespace.
// Missing OR empty values fall back to defaultValue (|| on purpose).
function getEnv(name, defaultValue = '') {
  const raw = process.env[name] || defaultValue;
  return raw.trim();
}
||||
|
||||
// Interpret an env var as a boolean; unrecognized/empty values yield defaultValue.
function getEnvBool(name, defaultValue = false) {
  const token = getEnv(name, '').toLowerCase();
  const TRUTHY = ['true', '1', 'yes', 'on'];
  const FALSY = ['false', '0', 'no', 'off'];
  if (TRUTHY.includes(token)) return true;
  if (FALSY.includes(token)) return false;
  return defaultValue;
}
||||
|
||||
// Get CDP URL from chrome_session
|
||||
// Read the CDP websocket URL saved by the chrome_session extractor, if any.
// Returns null when no session file exists.
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(cdpFile)) {
    return null;
  }
  return fs.readFileSync(cdpFile, 'utf8').trim();
}
||||
|
||||
/**
 * Extract SEO metadata (meta tags, canonical URL, language) from the page
 * already open in the shared Chrome session, and write it to seo/seo.json.
 *
 * NOTE(review): the `url` parameter is unused — the already-open session
 * page is inspected via page.url()/window.location instead. Confirm intended.
 *
 * @returns {Promise<{success: boolean, output?: string, seoData?: Object, error?: string}>}
 */
async function extractSeo(url) {
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

  let browser = null;

  try {
    // Connect to existing Chrome session (started by the chrome_session extractor)
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
    }

    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });

    // Get the page: prefer an http(s) tab, fall back to whatever is open
    const pages = await browser.pages();
    const page = pages.find(p => p.url().startsWith('http')) || pages[0];

    if (!page) {
      return { success: false, error: 'No page found in Chrome session' };
    }

    // Extract all meta tags (runs inside the page context)
    const seoData = await page.evaluate(() => {
      const metaTags = Array.from(document.querySelectorAll('meta'));
      const seo = {
        url: window.location.href,
        title: document.title || '',
      };

      // Process each meta tag
      metaTags.forEach(tag => {
        // Get the key (name or property attribute — covers og:* / twitter:*)
        const key = tag.getAttribute('name') || tag.getAttribute('property') || '';
        const content = tag.getAttribute('content') || '';

        if (key && content) {
          // Store by key (later duplicates overwrite earlier ones)
          seo[key] = content;
        }
      });

      // Also get canonical URL if present
      const canonical = document.querySelector('link[rel="canonical"]');
      if (canonical) {
        seo.canonical = canonical.getAttribute('href');
      }

      // Get language from the <html lang="..."> attribute
      const htmlLang = document.documentElement.lang;
      if (htmlLang) {
        seo.language = htmlLang;
      }

      return seo;
    });

    // Write output as pretty-printed JSON
    fs.writeFileSync(outputPath, JSON.stringify(seoData, null, 2));

    return { success: true, output: outputPath, seoData };

  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    // Disconnect only — the shared Chrome session stays alive for later extractors
    if (browser) {
      browser.disconnect();
    }
  }
}
||||
|
||||
/**
 * CLI entrypoint: parse args, run the SEO extraction, and report results
 * using the orchestrator's key=value + RESULT_JSON stdout protocol.
 * Exits 0 on success or skip, 1 on failure.
 */
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;

  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__17_seo.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }

  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';

  try {
    // Check if enabled (permanent skip — still exits 0)
    if (!getEnvBool('SAVE_SEO', true)) {
      console.log('Skipping SEO (SAVE_SEO=False)');
      status = 'skipped';
      const endTs = new Date();
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${endTs.toISOString()}`);
      console.log(`STATUS=${status}`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
      process.exit(0);
    }

    const result = await extractSeo(url);

    if (result.success) {
      status = 'succeeded';
      output = result.output;
      // NOTE(review): subtracting 2 assumes only url+title are non-meta keys,
      // but canonical/language may also be present — count can be slightly high.
      const metaCount = Object.keys(result.seoData).length - 2; // Subtract url and title
      console.log(`SEO metadata extracted: ${metaCount} meta tags`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }

  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;

  // Print results (key=value protocol lines consumed by the orchestrator)
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);

  if (error) {
    console.error(`ERROR=${error}`);
  }

  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);

  process.exit(status === 'succeeded' ? 0 : 1);
}

// Top-level safety net: any unhandled rejection becomes a nonzero exit.
main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});
|
||||
53
archivebox/plugins/singlefile/config.json
Normal file
53
archivebox/plugins/singlefile/config.json
Normal file
@@ -0,0 +1,53 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SAVE_SINGLEFILE": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Enable SingleFile archiving"
|
||||
},
|
||||
"SINGLEFILE_BINARY": {
|
||||
"type": "string",
|
||||
"default": "single-file",
|
||||
"x-aliases": ["SINGLE_FILE_BINARY"],
|
||||
"description": "Path to single-file binary"
|
||||
},
|
||||
"NODE_BINARY": {
|
||||
"type": "string",
|
||||
"default": "node",
|
||||
"description": "Path to Node.js binary"
|
||||
},
|
||||
"SINGLEFILE_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 60,
|
||||
"minimum": 10,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for SingleFile in seconds"
|
||||
},
|
||||
"SINGLEFILE_USER_AGENT": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-fallback": "USER_AGENT",
|
||||
"description": "User agent string"
|
||||
},
|
||||
"SINGLEFILE_COOKIES_FILE": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-fallback": "COOKIES_FILE",
|
||||
"description": "Path to cookies file"
|
||||
},
|
||||
"SINGLEFILE_ARGS": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [],
|
||||
"description": "Default single-file arguments"
|
||||
},
|
||||
"SINGLEFILE_EXTRA_ARGS": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"description": "Extra arguments for single-file"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,129 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validation hook for single-file binary.
|
||||
|
||||
Runs at crawl start to verify single-file (npm package) is available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str) -> str | None:
|
||||
"""Get version string from single-file binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
return result.stdout.strip().split('\n')[0][:32]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
# For scripts, hash the script content
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_singlefile() -> dict | None:
    """Locate the single-file binary.

    Checks SINGLEFILE_BINARY first, then PATH, then well-known npm install
    locations. Returns a metadata dict or None when nothing is found.
    """
    def describe(abspath: str, provider: str) -> dict:
        return {
            'name': 'single-file',
            'abspath': abspath,
            'version': get_binary_version(abspath),
            'sha256': get_binary_hash(abspath),
            'binprovider': provider,
        }

    # Explicit env var override wins.
    env_path = os.environ.get('SINGLEFILE_BINARY', '')
    if env_path and Path(env_path).is_file():
        return describe(env_path, 'env')

    # Anything resolvable on PATH.
    for candidate_name in ('single-file', 'singlefile'):
        located = shutil.which(candidate_name)
        if located:
            return describe(located, 'npm')

    # Well-known npm install locations.
    npm_locations = (
        Path.home() / '.npm-global/bin/single-file',
        Path.home() / 'node_modules/.bin/single-file',
        Path('/usr/local/bin/single-file'),
        Path('/usr/local/lib/node_modules/.bin/single-file'),
    )
    for location in npm_locations:
        if location.is_file():
            return describe(str(location), 'npm')

    return None
||||
|
||||
|
||||
def main():
    """Detect the single-file binary and emit JSONL records for the orchestrator.

    On success prints an InstalledBinary record plus Machine config updates
    and exits 0; otherwise prints a Dependency request and exits 1.
    """
    result = find_singlefile()

    if result and result.get('abspath'):
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))

        # Persist the resolved path so later hooks can use it directly.
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/SINGLEFILE_BINARY',
            'value': result['abspath'],
        }))

        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/SINGLEFILE_VERSION',
                'value': result['version'],
            }))

        sys.exit(0)
    else:
        # Ask the orchestrator to install the missing dependency.
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'single-file',
            'bin_providers': 'npm,env',
        }))
        # Plain string: no placeholders, so no f-prefix needed (was f"...").
        print("single-file binary not found", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()
|
||||
270
archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js
Executable file
270
archivebox/plugins/singlefile/on_Snapshot__04_singlefile.js
Executable file
@@ -0,0 +1,270 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* SingleFile Extension Plugin
|
||||
*
|
||||
* Installs and uses the SingleFile Chrome extension for archiving complete web pages.
|
||||
* Falls back to single-file-cli if the extension is not available.
|
||||
*
|
||||
* Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle
|
||||
*
|
||||
* Priority: 04 (early) - Must install before Chrome session starts
|
||||
* Hook: on_Snapshot
|
||||
*
|
||||
* This extension automatically:
|
||||
* - Saves complete web pages as single HTML files
|
||||
* - Inlines all resources (CSS, JS, images, fonts)
|
||||
* - Preserves page fidelity better than wget/curl
|
||||
* - Works with SPAs and dynamically loaded content
|
||||
*/
|
||||
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
const { promisify } = require('util');
|
||||
const { exec } = require('child_process');
|
||||
|
||||
const execAsync = promisify(exec);
|
||||
|
||||
// Import extension utilities
|
||||
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
|
||||
|
||||
// Extension metadata
|
||||
const EXTENSION = {
|
||||
webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
|
||||
name: 'singlefile',
|
||||
};
|
||||
|
||||
// Get extensions directory from environment or use default
|
||||
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
|
||||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
|
||||
|
||||
const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
|
||||
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
|
||||
|
||||
const OUTPUT_DIR = 'singlefile';
|
||||
const OUTPUT_FILE = 'singlefile.html';
|
||||
|
||||
/**
|
||||
* Install the SingleFile extension
|
||||
*/
|
||||
async function installSinglefileExtension() {
|
||||
console.log('[*] Installing SingleFile extension...');
|
||||
|
||||
// Install the extension
|
||||
const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
|
||||
|
||||
if (!extension) {
|
||||
console.error('[❌] Failed to install SingleFile extension');
|
||||
return null;
|
||||
}
|
||||
|
||||
console.log('[+] SingleFile extension installed');
|
||||
console.log('[+] Web pages will be saved as single HTML files');
|
||||
|
||||
return extension;
|
||||
}
|
||||
|
||||
/**
|
||||
* Wait for a specified amount of time
|
||||
*/
|
||||
function wait(ms) {
|
||||
return new Promise(resolve => setTimeout(resolve, ms));
|
||||
}
|
||||
|
||||
/**
|
||||
* Save a page using the SingleFile extension
|
||||
*
|
||||
* @param {Object} page - Puppeteer page object
|
||||
* @param {Object} extension - Extension metadata with dispatchAction method
|
||||
* @param {Object} options - Additional options
|
||||
* @returns {Promise<string|null>} - Path to saved file or null on failure
|
||||
*/
|
||||
async function saveSinglefileWithExtension(page, extension, options = {}) {
|
||||
if (!extension || !extension.version) {
|
||||
throw new Error('SingleFile extension not found or not loaded');
|
||||
}
|
||||
|
||||
const url = await page.url();
|
||||
|
||||
// Check for unsupported URL schemes
|
||||
const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
|
||||
const scheme = url.split(':')[0];
|
||||
if (URL_SCHEMES_IGNORED.includes(scheme)) {
|
||||
console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
// Ensure downloads directory exists
|
||||
await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true });
|
||||
|
||||
// Get list of existing files to ignore
|
||||
const files_before = new Set(
|
||||
(await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
|
||||
.filter(fn => fn.endsWith('.html'))
|
||||
);
|
||||
|
||||
// Ensure output directory exists
|
||||
await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
|
||||
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
|
||||
|
||||
// Bring page to front (extension action button acts on foreground tab)
|
||||
await page.bringToFront();
|
||||
|
||||
// Trigger the extension's action (toolbar button click)
|
||||
await extension.dispatchAction();
|
||||
|
||||
// Wait for file to appear in downloads directory
|
||||
const check_delay = 3000; // 3 seconds
|
||||
const max_tries = 10;
|
||||
let files_new = [];
|
||||
|
||||
for (let attempt = 0; attempt < max_tries; attempt++) {
|
||||
await wait(check_delay);
|
||||
|
||||
const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
|
||||
.filter(fn => fn.endsWith('.html'));
|
||||
|
||||
files_new = files_after.filter(file => !files_before.has(file));
|
||||
|
||||
if (files_new.length === 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Find the matching file by checking if it contains the URL in the HTML header
|
||||
for (const file of files_new) {
|
||||
const dl_path = path.join(CHROME_DOWNLOADS_DIR, file);
|
||||
const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
|
||||
const dl_header = dl_text.split('meta charset')[0];
|
||||
|
||||
if (dl_header.includes(`url: ${url}`)) {
|
||||
console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`);
|
||||
await fs.promises.rename(dl_path, out_path);
|
||||
return out_path;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`);
|
||||
console.warn(`[⚠️] New files found: ${files_new.join(', ')}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Save a page using single-file-cli (fallback method)
|
||||
*
|
||||
* @param {string} url - URL to archive
|
||||
* @param {Object} options - Additional options
|
||||
* @returns {Promise<string|null>} - Path to saved file or null on failure
|
||||
*/
|
||||
async function saveSinglefileWithCLI(url, options = {}) {
|
||||
console.log('[*] Falling back to single-file-cli...');
|
||||
|
||||
// Find single-file binary
|
||||
let binary = null;
|
||||
try {
|
||||
const { stdout } = await execAsync('which single-file');
|
||||
binary = stdout.trim();
|
||||
} catch (err) {
|
||||
console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli');
|
||||
return null;
|
||||
}
|
||||
|
||||
// Ensure output directory exists
|
||||
await fs.promises.mkdir(OUTPUT_DIR, { recursive: true });
|
||||
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
// Build command
|
||||
const cmd = [
|
||||
binary,
|
||||
'--browser-headless',
|
||||
url,
|
||||
out_path,
|
||||
];
|
||||
|
||||
// Add optional args
|
||||
if (options.userAgent) {
|
||||
cmd.splice(2, 0, '--browser-user-agent', options.userAgent);
|
||||
}
|
||||
if (options.cookiesFile && fs.existsSync(options.cookiesFile)) {
|
||||
cmd.splice(2, 0, '--browser-cookies-file', options.cookiesFile);
|
||||
}
|
||||
if (options.ignoreSSL) {
|
||||
cmd.splice(2, 0, '--browser-ignore-insecure-certs');
|
||||
}
|
||||
|
||||
// Execute
|
||||
try {
|
||||
const timeout = options.timeout || 120000;
|
||||
await execAsync(cmd.join(' '), { timeout });
|
||||
|
||||
if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) {
|
||||
console.log(`[+] SingleFile saved via CLI: ${out_path}`);
|
||||
return out_path;
|
||||
}
|
||||
|
||||
console.error('[❌] SingleFile CLI completed but no output file found');
|
||||
return null;
|
||||
} catch (err) {
|
||||
console.error(`[❌] SingleFile CLI error: ${err.message}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Main entry point - install extension before archiving
|
||||
*/
|
||||
async function main() {
|
||||
// Check if extension is already cached
|
||||
const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json');
|
||||
|
||||
if (fs.existsSync(cacheFile)) {
|
||||
try {
|
||||
const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
|
||||
const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
|
||||
|
||||
if (fs.existsSync(manifestPath)) {
|
||||
console.log('[*] SingleFile extension already installed (using cache)');
|
||||
return cached;
|
||||
}
|
||||
} catch (e) {
|
||||
// Cache file corrupted, re-install
|
||||
console.warn('[⚠️] Extension cache corrupted, re-installing...');
|
||||
}
|
||||
}
|
||||
|
||||
// Install extension
|
||||
const extension = await installSinglefileExtension();
|
||||
|
||||
// Export extension metadata for chrome_session to load
|
||||
if (extension) {
|
||||
// Write extension info to a cache file that chrome_session can read
|
||||
await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
|
||||
await fs.promises.writeFile(
|
||||
cacheFile,
|
||||
JSON.stringify(extension, null, 2)
|
||||
);
|
||||
console.log(`[+] Extension metadata written to ${cacheFile}`);
|
||||
}
|
||||
|
||||
return extension;
|
||||
}
|
||||
|
||||
// Export functions for use by other plugins
|
||||
module.exports = {
|
||||
EXTENSION,
|
||||
installSinglefileExtension,
|
||||
saveSinglefileWithExtension,
|
||||
saveSinglefileWithCLI,
|
||||
};
|
||||
|
||||
// Run if executed directly
|
||||
if (require.main === module) {
|
||||
main().then(() => {
|
||||
console.log('[✓] SingleFile extension setup complete');
|
||||
process.exit(0);
|
||||
}).catch(err => {
|
||||
console.error('[❌] SingleFile extension setup failed:', err);
|
||||
process.exit(1);
|
||||
});
|
||||
}
|
||||
328
archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py
Normal file
328
archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py
Normal file
@@ -0,0 +1,328 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Archive a URL using SingleFile.
|
||||
|
||||
Usage: on_Snapshot__singlefile.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Writes singlefile.html to $PWD
|
||||
|
||||
Environment variables:
|
||||
SINGLEFILE_BINARY: Path to SingleFile binary
|
||||
SINGLEFILE_TIMEOUT: Timeout in seconds (default: 120)
|
||||
SINGLEFILE_USER_AGENT: User agent string (optional)
|
||||
SINGLEFILE_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
|
||||
SINGLEFILE_COOKIES_FILE: Path to cookies file (optional)
|
||||
SINGLEFILE_EXTRA_ARGS: Extra arguments for SingleFile (space-separated)
|
||||
|
||||
# Feature toggle
|
||||
SAVE_SINGLEFILE: Enable SingleFile archiving (default: True)
|
||||
|
||||
# Chrome binary (SingleFile needs Chrome)
|
||||
CHROME_BINARY: Path to Chrome/Chromium binary
|
||||
|
||||
# Fallback to ARCHIVING_CONFIG values if SINGLEFILE_* not set:
|
||||
TIMEOUT: Fallback timeout
|
||||
USER_AGENT: Fallback user agent
|
||||
CHECK_SSL_VALIDITY: Fallback SSL check
|
||||
COOKIES_FILE: Fallback cookies file
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'singlefile'
|
||||
BIN_NAME = 'single-file'
|
||||
BIN_PROVIDERS = 'npm,env'
|
||||
OUTPUT_DIR = 'singlefile'
|
||||
OUTPUT_FILE = 'singlefile.html'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
return os.environ.get(name, default).strip()
|
||||
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
|
||||
val = get_env(name, '').lower()
|
||||
if val in ('true', '1', 'yes', 'on'):
|
||||
return True
|
||||
if val in ('false', '0', 'no', 'off'):
|
||||
return False
|
||||
return default
|
||||
|
||||
|
||||
def get_env_int(name: str, default: int = 0) -> int:
|
||||
try:
|
||||
return int(get_env(name, str(default)))
|
||||
except ValueError:
|
||||
return default
|
||||
|
||||
|
||||
STATICFILE_DIR = 'staticfile'
|
||||
|
||||
def has_staticfile_output() -> bool:
|
||||
"""Check if staticfile extractor already downloaded this URL."""
|
||||
staticfile_dir = Path(STATICFILE_DIR)
|
||||
return staticfile_dir.exists() and any(staticfile_dir.iterdir())
|
||||
|
||||
|
||||
# Chrome binary search paths
|
||||
CHROMIUM_BINARY_NAMES_LINUX = [
|
||||
'chromium', 'chromium-browser', 'chromium-browser-beta',
|
||||
'chromium-browser-unstable', 'chromium-browser-canary', 'chromium-browser-dev',
|
||||
]
|
||||
CHROME_BINARY_NAMES_LINUX = [
|
||||
'google-chrome', 'google-chrome-stable', 'google-chrome-beta',
|
||||
'google-chrome-canary', 'google-chrome-unstable', 'google-chrome-dev', 'chrome',
|
||||
]
|
||||
CHROME_BINARY_NAMES_MACOS = [
|
||||
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
|
||||
'/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
|
||||
]
|
||||
CHROMIUM_BINARY_NAMES_MACOS = ['/Applications/Chromium.app/Contents/MacOS/Chromium']
|
||||
|
||||
ALL_CHROME_BINARIES = (
|
||||
CHROME_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_LINUX +
|
||||
CHROME_BINARY_NAMES_MACOS + CHROMIUM_BINARY_NAMES_MACOS
|
||||
)
|
||||
|
||||
|
||||
def find_singlefile() -> str | None:
|
||||
"""Find SingleFile binary."""
|
||||
singlefile = get_env('SINGLEFILE_BINARY')
|
||||
if singlefile and os.path.isfile(singlefile):
|
||||
return singlefile
|
||||
|
||||
for name in ['single-file', 'singlefile']:
|
||||
binary = shutil.which(name)
|
||||
if binary:
|
||||
return binary
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def find_chrome() -> str | None:
|
||||
"""Find Chrome/Chromium binary."""
|
||||
chrome = get_env('CHROME_BINARY')
|
||||
if chrome and os.path.isfile(chrome):
|
||||
return chrome
|
||||
|
||||
for name in ALL_CHROME_BINARIES:
|
||||
if '/' in name:
|
||||
if os.path.isfile(name):
|
||||
return name
|
||||
else:
|
||||
binary = shutil.which(name)
|
||||
if binary:
|
||||
return binary
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def get_version(binary: str) -> str:
|
||||
"""Get SingleFile version."""
|
||||
try:
|
||||
result = subprocess.run([binary, '--version'], capture_output=True, text=True, timeout=10)
|
||||
return result.stdout.strip()[:64]
|
||||
except Exception:
|
||||
return ''
|
||||
|
||||
|
||||
CHROME_SESSION_DIR = 'chrome_session'
|
||||
|
||||
|
||||
def get_cdp_url() -> str | None:
|
||||
"""Get CDP URL from chrome_session if available."""
|
||||
cdp_file = Path(CHROME_SESSION_DIR) / 'cdp_url.txt'
|
||||
if cdp_file.exists():
|
||||
return cdp_file.read_text().strip()
|
||||
return None
|
||||
|
||||
|
||||
def get_port_from_cdp_url(cdp_url: str) -> str | None:
|
||||
"""Extract port from CDP WebSocket URL (ws://127.0.0.1:PORT/...)."""
|
||||
import re
|
||||
match = re.search(r':(\d+)/', cdp_url)
|
||||
if match:
|
||||
return match.group(1)
|
||||
return None
|
||||
|
||||
|
||||
def save_singlefile(url: str, binary: str) -> tuple[bool, str | None, str]:
|
||||
"""
|
||||
Archive URL using SingleFile.
|
||||
|
||||
If a Chrome session exists (from chrome_session extractor), connects to it via CDP.
|
||||
Otherwise launches a new Chrome instance.
|
||||
|
||||
Returns: (success, output_path, error_message)
|
||||
"""
|
||||
# Get config from env (with SINGLEFILE_ prefix or fallback to ARCHIVING_CONFIG style)
|
||||
timeout = get_env_int('SINGLEFILE_TIMEOUT') or get_env_int('TIMEOUT', 120)
|
||||
user_agent = get_env('SINGLEFILE_USER_AGENT') or get_env('USER_AGENT', '')
|
||||
check_ssl = get_env_bool('SINGLEFILE_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
|
||||
cookies_file = get_env('SINGLEFILE_COOKIES_FILE') or get_env('COOKIES_FILE', '')
|
||||
extra_args = get_env('SINGLEFILE_EXTRA_ARGS', '')
|
||||
chrome = find_chrome()
|
||||
|
||||
cmd = [binary]
|
||||
|
||||
# Try to use existing Chrome session via CDP
|
||||
cdp_url = get_cdp_url()
|
||||
if cdp_url:
|
||||
# SingleFile can connect to existing browser via WebSocket
|
||||
# Extract port from CDP URL (ws://127.0.0.1:PORT/...)
|
||||
port = get_port_from_cdp_url(cdp_url)
|
||||
if port:
|
||||
cmd.extend(['--browser-server', f'http://127.0.0.1:{port}'])
|
||||
elif chrome:
|
||||
cmd.extend(['--browser-executable-path', chrome])
|
||||
|
||||
# Common options
|
||||
cmd.extend([
|
||||
'--browser-headless',
|
||||
])
|
||||
|
||||
# SSL handling
|
||||
if not check_ssl:
|
||||
cmd.append('--browser-ignore-insecure-certs')
|
||||
|
||||
if user_agent:
|
||||
cmd.extend(['--browser-user-agent', user_agent])
|
||||
|
||||
if cookies_file and Path(cookies_file).is_file():
|
||||
cmd.extend(['--browser-cookies-file', cookies_file])
|
||||
|
||||
if extra_args:
|
||||
cmd.extend(extra_args.split())
|
||||
|
||||
# Create output directory
|
||||
output_dir = Path(OUTPUT_DIR)
|
||||
output_dir.mkdir(exist_ok=True)
|
||||
output_path = output_dir / OUTPUT_FILE
|
||||
|
||||
cmd.extend([url, str(output_path)])
|
||||
|
||||
try:
|
||||
result = subprocess.run(cmd, capture_output=True, timeout=timeout)
|
||||
|
||||
if output_path.exists() and output_path.stat().st_size > 0:
|
||||
return True, str(output_path), ''
|
||||
else:
|
||||
stderr = result.stderr.decode('utf-8', errors='replace')
|
||||
if 'ERR_NAME_NOT_RESOLVED' in stderr:
|
||||
return False, None, 'DNS resolution failed'
|
||||
if 'ERR_CONNECTION_REFUSED' in stderr:
|
||||
return False, None, 'Connection refused'
|
||||
return False, None, f'SingleFile failed: {stderr[:200]}'
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
return False, None, f'Timed out after {timeout} seconds'
|
||||
except Exception as e:
|
||||
return False, None, f'{type(e).__name__}: {e}'
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--url', required=True, help='URL to archive')
|
||||
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
|
||||
def main(url: str, snapshot_id: str):
|
||||
"""Archive a URL using SingleFile."""
|
||||
|
||||
start_ts = datetime.now(timezone.utc)
|
||||
version = ''
|
||||
output = None
|
||||
status = 'failed'
|
||||
error = ''
|
||||
binary = None
|
||||
cmd_str = ''
|
||||
|
||||
try:
|
||||
# Check if SingleFile is enabled
|
||||
if not get_env_bool('SAVE_SINGLEFILE', True):
|
||||
print('Skipping SingleFile (SAVE_SINGLEFILE=False)')
|
||||
status = 'skipped'
|
||||
end_ts = datetime.now(timezone.utc)
|
||||
print(f'START_TS={start_ts.isoformat()}')
|
||||
print(f'END_TS={end_ts.isoformat()}')
|
||||
print(f'STATUS={status}')
|
||||
print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}')
|
||||
sys.exit(0)
|
||||
|
||||
# Check if staticfile extractor already handled this (permanent skip)
|
||||
if has_staticfile_output():
|
||||
print(f'Skipping SingleFile - staticfile extractor already downloaded this')
|
||||
print(f'START_TS={start_ts.isoformat()}')
|
||||
print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
|
||||
print(f'STATUS=skipped')
|
||||
print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
|
||||
sys.exit(0) # Permanent skip - staticfile already handled
|
||||
|
||||
# Find binary
|
||||
binary = find_singlefile()
|
||||
if not binary:
|
||||
print(f'ERROR: SingleFile binary not found', file=sys.stderr)
|
||||
print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
|
||||
print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
|
||||
print(f'INSTALL_HINT=npm install -g single-file-cli', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
version = get_version(binary)
|
||||
cmd_str = f'{binary} {url} {OUTPUT_DIR}/{OUTPUT_FILE}'
|
||||
|
||||
# Run extraction
|
||||
success, output, error = save_singlefile(url, binary)
|
||||
status = 'succeeded' if success else 'failed'
|
||||
|
||||
if success and output:
|
||||
size = Path(output).stat().st_size
|
||||
print(f'SingleFile saved ({size} bytes)')
|
||||
|
||||
except Exception as e:
|
||||
error = f'{type(e).__name__}: {e}'
|
||||
status = 'failed'
|
||||
|
||||
# Print results
|
||||
end_ts = datetime.now(timezone.utc)
|
||||
duration = (end_ts - start_ts).total_seconds()
|
||||
|
||||
print(f'START_TS={start_ts.isoformat()}')
|
||||
print(f'END_TS={end_ts.isoformat()}')
|
||||
print(f'DURATION={duration:.2f}')
|
||||
if cmd_str:
|
||||
print(f'CMD={cmd_str}')
|
||||
if version:
|
||||
print(f'VERSION={version}')
|
||||
if output:
|
||||
print(f'OUTPUT={output}')
|
||||
print(f'STATUS={status}')
|
||||
|
||||
if error:
|
||||
print(f'ERROR={error}', file=sys.stderr)
|
||||
|
||||
# Print JSON result
|
||||
result_json = {
|
||||
'extractor': EXTRACTOR_NAME,
|
||||
'url': url,
|
||||
'snapshot_id': snapshot_id,
|
||||
'status': status,
|
||||
'start_ts': start_ts.isoformat(),
|
||||
'end_ts': end_ts.isoformat(),
|
||||
'duration': round(duration, 2),
|
||||
'cmd_version': version,
|
||||
'output': output,
|
||||
'error': error or None,
|
||||
}
|
||||
print(f'RESULT_JSON={json.dumps(result_json)}')
|
||||
|
||||
sys.exit(0 if status == 'succeeded' else 1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
110
archivebox/plugins/singlefile/tests/test_archiving.py
Normal file
110
archivebox/plugins/singlefile/tests/test_archiving.py
Normal file
@@ -0,0 +1,110 @@
|
||||
"""
|
||||
Integration tests - archive example.com with SingleFile and verify output
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__04_singlefile.js"
|
||||
TEST_URL = "https://example.com"
|
||||
|
||||
|
||||
# Check if single-file CLI is available
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["which", "single-file"],
|
||||
capture_output=True,
|
||||
timeout=5
|
||||
)
|
||||
SINGLEFILE_CLI_AVAILABLE = result.returncode == 0
|
||||
except:
|
||||
SINGLEFILE_CLI_AVAILABLE = False
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not SINGLEFILE_CLI_AVAILABLE,
|
||||
reason="single-file CLI not installed (npm install -g single-file-cli)"
|
||||
)
|
||||
def test_archives_example_com():
|
||||
"""Archive example.com and verify output contains expected content"""
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
output_dir = Path(tmpdir) / "singlefile"
|
||||
output_dir.mkdir()
|
||||
|
||||
output_file = output_dir / "singlefile.html"
|
||||
|
||||
# Run single-file CLI
|
||||
result = subprocess.run(
|
||||
[
|
||||
"single-file",
|
||||
"--browser-headless",
|
||||
TEST_URL,
|
||||
str(output_file)
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Archive failed: {result.stderr}"
|
||||
|
||||
# Verify output exists
|
||||
assert output_file.exists(), "Output file not created"
|
||||
|
||||
# Read and verify content
|
||||
html_content = output_file.read_text()
|
||||
file_size = output_file.stat().st_size
|
||||
|
||||
# Should be substantial (embedded resources)
|
||||
assert file_size > 900, f"Output too small: {file_size} bytes"
|
||||
|
||||
# Verify HTML structure (SingleFile minifies, so <head> tag may be omitted)
|
||||
assert "<html" in html_content.lower()
|
||||
assert "<body" in html_content.lower()
|
||||
assert "<title>" in html_content.lower() or "title>" in html_content.lower()
|
||||
|
||||
# Verify example.com content is actually present
|
||||
assert "example domain" in html_content.lower(), "Missing 'Example Domain' title"
|
||||
assert "this domain is" in html_content.lower(), "Missing example.com description text"
|
||||
assert "iana.org" in html_content.lower(), "Missing IANA link"
|
||||
|
||||
# Verify it's not just empty/error page
|
||||
assert file_size > 900, f"File too small: {file_size} bytes"
|
||||
|
||||
|
||||
@pytest.mark.skipif(not SINGLEFILE_CLI_AVAILABLE, reason="single-file CLI not installed")
|
||||
def test_different_urls_produce_different_outputs():
|
||||
"""Verify different URLs produce different archived content"""
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
outputs = {}
|
||||
|
||||
for url in ["https://example.com", "https://example.org"]:
|
||||
output_file = Path(tmpdir) / f"{url.replace('https://', '').replace('.', '_')}.html"
|
||||
|
||||
result = subprocess.run(
|
||||
["single-file", "--browser-headless", url, str(output_file)],
|
||||
capture_output=True,
|
||||
timeout=120
|
||||
)
|
||||
|
||||
if result.returncode == 0 and output_file.exists():
|
||||
outputs[url] = output_file.read_text()
|
||||
|
||||
assert len(outputs) == 2, "Should archive both URLs"
|
||||
|
||||
# Verify outputs differ
|
||||
urls = list(outputs.keys())
|
||||
assert outputs[urls[0]] != outputs[urls[1]], "Different URLs should produce different outputs"
|
||||
|
||||
# Each should contain its domain
|
||||
assert "example.com" in outputs[urls[0]]
|
||||
assert "example.org" in outputs[urls[1]]
|
||||
385
archivebox/plugins/singlefile/tests/test_singlefile.js
Normal file
385
archivebox/plugins/singlefile/tests/test_singlefile.js
Normal file
@@ -0,0 +1,385 @@
|
||||
/**
|
||||
* Unit tests for singlefile plugin
|
||||
*
|
||||
* Run with: node --test tests/test_singlefile.js
|
||||
*/
|
||||
|
||||
const assert = require('assert');
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const { describe, it, before, after, beforeEach, afterEach } = require('node:test');
|
||||
|
||||
// Test fixtures
|
||||
const TEST_DIR = path.join(__dirname, '.test_fixtures');
|
||||
const TEST_EXTENSIONS_DIR = path.join(TEST_DIR, 'chrome_extensions');
|
||||
const TEST_DOWNLOADS_DIR = path.join(TEST_DIR, 'chrome_downloads');
|
||||
|
||||
describe('singlefile plugin', () => {
|
||||
before(() => {
|
||||
if (!fs.existsSync(TEST_DIR)) {
|
||||
fs.mkdirSync(TEST_DIR, { recursive: true });
|
||||
}
|
||||
});
|
||||
|
||||
after(() => {
|
||||
if (fs.existsSync(TEST_DIR)) {
|
||||
fs.rmSync(TEST_DIR, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
||||
describe('EXTENSION metadata', () => {
|
||||
it('should have correct webstore_id', () => {
|
||||
const { EXTENSION } = require('../on_Snapshot__04_singlefile.js');
|
||||
|
||||
assert.strictEqual(EXTENSION.webstore_id, 'mpiodijhokgodhhofbcjdecpffjipkle');
|
||||
});
|
||||
|
||||
it('should have correct name', () => {
|
||||
const { EXTENSION } = require('../on_Snapshot__04_singlefile.js');
|
||||
|
||||
assert.strictEqual(EXTENSION.name, 'singlefile');
|
||||
});
|
||||
});
|
||||
|
||||
describe('installSinglefileExtension', () => {
|
||||
beforeEach(() => {
|
||||
process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
|
||||
|
||||
if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
|
||||
fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
|
||||
}
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
|
||||
fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
|
||||
}
|
||||
|
||||
delete process.env.CHROME_EXTENSIONS_DIR;
|
||||
});
|
||||
|
||||
it('should use cached extension if available', async () => {
|
||||
const { installSinglefileExtension } = require('../on_Snapshot__04_singlefile.js');
|
||||
|
||||
// Create fake cache
|
||||
const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'singlefile.extension.json');
|
||||
const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_singlefile');
|
||||
|
||||
fs.mkdirSync(fakeExtensionDir, { recursive: true });
|
||||
fs.writeFileSync(
|
||||
path.join(fakeExtensionDir, 'manifest.json'),
|
||||
JSON.stringify({ version: '1.22.90' })
|
||||
);
|
||||
|
||||
const fakeCache = {
|
||||
webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
|
||||
name: 'singlefile',
|
||||
unpacked_path: fakeExtensionDir,
|
||||
version: '1.22.90'
|
||||
};
|
||||
|
||||
fs.writeFileSync(cacheFile, JSON.stringify(fakeCache));
|
||||
|
||||
const result = await installSinglefileExtension();
|
||||
|
||||
assert.notStrictEqual(result, null);
|
||||
assert.strictEqual(result.webstore_id, 'mpiodijhokgodhhofbcjdecpffjipkle');
|
||||
});
|
||||
});
|
||||
|
||||
describe('saveSinglefileWithExtension', () => {
|
||||
beforeEach(() => {
|
||||
process.env.CHROME_DOWNLOADS_DIR = TEST_DOWNLOADS_DIR;
|
||||
|
||||
if (!fs.existsSync(TEST_DOWNLOADS_DIR)) {
|
||||
fs.mkdirSync(TEST_DOWNLOADS_DIR, { recursive: true });
|
||||
}
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
if (fs.existsSync(TEST_DOWNLOADS_DIR)) {
|
||||
fs.rmSync(TEST_DOWNLOADS_DIR, { recursive: true });
|
||||
}
|
||||
|
||||
delete process.env.CHROME_DOWNLOADS_DIR;
|
||||
});
|
||||
|
||||
it('should require extension and version to be present', () => {
|
||||
const mockExtension = {
|
||||
name: 'singlefile',
|
||||
version: '1.22.96',
|
||||
id: 'test_id'
|
||||
};
|
||||
|
||||
assert.ok(mockExtension.version);
|
||||
assert.ok(mockExtension.id);
|
||||
});
|
||||
|
||||
it('should filter unsupported URL schemes', () => {
|
||||
const unsupportedSchemes = [
|
||||
'about:',
|
||||
'chrome:',
|
||||
'chrome-extension:',
|
||||
'data:',
|
||||
'javascript:',
|
||||
'blob:'
|
||||
];
|
||||
|
||||
unsupportedSchemes.forEach(scheme => {
|
||||
const testUrl = scheme + 'something';
|
||||
const urlScheme = testUrl.split(':')[0];
|
||||
|
||||
assert.ok(unsupportedSchemes.some(s => s.startsWith(urlScheme)));
|
||||
});
|
||||
});
|
||||
|
||||
it('should wait for file to appear in downloads directory', async () => {
|
||||
const checkDelay = 3000; // 3 seconds
|
||||
const maxTries = 10;
|
||||
|
||||
// Total max wait time
|
||||
const maxWaitTime = checkDelay * maxTries;
|
||||
|
||||
assert.strictEqual(maxWaitTime, 30000); // 30 seconds
|
||||
});
|
||||
|
||||
it('should find downloaded file by checking URL in HTML header', () => {
|
||||
const testUrl = 'https://example.com';
|
||||
const mockHtml = `<!-- url: ${testUrl} --><html><head><meta charset="utf-8"></head></html>`;
|
||||
|
||||
// Should be able to extract URL from header
|
||||
const headerPart = mockHtml.split('meta charset')[0];
|
||||
assert.ok(headerPart.includes(`url: ${testUrl}`));
|
||||
});
|
||||
|
||||
it('should move file from downloads to output directory', () => {
|
||||
const downloadPath = path.join(TEST_DOWNLOADS_DIR, 'temp_file.html');
|
||||
const outputDir = 'singlefile';
|
||||
const outputFile = 'singlefile.html';
|
||||
const outputPath = path.join(outputDir, outputFile);
|
||||
|
||||
// Verify paths are different
|
||||
assert.notStrictEqual(downloadPath, outputPath);
|
||||
});
|
||||
});
|
||||
|
||||
describe('saveSinglefileWithCLI', () => {
|
||||
it('should use single-file-cli as fallback', () => {
|
||||
const cliCommand = 'single-file';
|
||||
|
||||
// Should check for CLI availability
|
||||
assert.strictEqual(typeof cliCommand, 'string');
|
||||
assert.ok(cliCommand.length > 0);
|
||||
});
|
||||
|
||||
it('should pass correct arguments to CLI', () => {
|
||||
const args = [
|
||||
'--browser-headless',
|
||||
'https://example.com',
|
||||
'singlefile/singlefile.html'
|
||||
];
|
||||
|
||||
assert.ok(args.includes('--browser-headless'));
|
||||
assert.ok(args.some(arg => arg.startsWith('http')));
|
||||
});
|
||||
|
||||
it('should handle optional CLI arguments', () => {
|
||||
const options = {
|
||||
userAgent: 'Mozilla/5.0...',
|
||||
cookiesFile: '/path/to/cookies.txt',
|
||||
ignoreSSL: true
|
||||
};
|
||||
|
||||
// Optional args should be conditionally added
|
||||
if (options.userAgent) {
|
||||
assert.ok(options.userAgent.length > 0);
|
||||
}
|
||||
|
||||
if (options.ignoreSSL) {
|
||||
assert.strictEqual(options.ignoreSSL, true);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
describe('priority and execution order', () => {
|
||||
it('should have priority 04 (early)', () => {
|
||||
const filename = 'on_Snapshot__04_singlefile.js';
|
||||
|
||||
const match = filename.match(/on_Snapshot__(\d+)_/);
|
||||
assert.ok(match);
|
||||
|
||||
const priority = parseInt(match[1]);
|
||||
assert.strictEqual(priority, 4);
|
||||
});
|
||||
|
||||
it('should run before chrome_session (priority 20)', () => {
|
||||
const extensionPriority = 4;
|
||||
const chromeSessionPriority = 20;
|
||||
|
||||
assert.ok(extensionPriority < chromeSessionPriority);
|
||||
});
|
||||
|
||||
it('should install extensions in correct order', () => {
|
||||
const priorities = {
|
||||
captcha2: 1,
|
||||
istilldontcareaboutcookies: 2,
|
||||
ublock: 3,
|
||||
singlefile: 4
|
||||
};
|
||||
|
||||
// Should be in ascending order
|
||||
assert.ok(priorities.captcha2 < priorities.istilldontcareaboutcookies);
|
||||
assert.ok(priorities.istilldontcareaboutcookies < priorities.ublock);
|
||||
assert.ok(priorities.ublock < priorities.singlefile);
|
||||
});
|
||||
});
|
||||
|
||||
describe('output structure', () => {
  // The singlefile extractor writes singlefile/singlefile.html.
  it('should define output directory and file', () => {
    const OUTPUT_DIR = 'singlefile';
    const OUTPUT_FILE = 'singlefile.html';

    assert.strictEqual(OUTPUT_DIR, 'singlefile');
    assert.strictEqual(OUTPUT_FILE, 'singlefile.html');
  });

  it('should create output directory if not exists', () => {
    const outputDir = path.join(TEST_DIR, 'singlefile');

    // Should create directory
    if (!fs.existsSync(outputDir)) {
      fs.mkdirSync(outputDir, { recursive: true });
    }

    assert.ok(fs.existsSync(outputDir));

    // Cleanup
    fs.rmSync(outputDir, { recursive: true });
  });
});
|
||||
|
||||
describe('extension vs CLI fallback', () => {
  // Capture strategy order: browser extension first, CLI binary second.
  it('should prefer extension over CLI', () => {
    const preferenceOrder = [
      'extension',
      'cli'
    ];

    assert.strictEqual(preferenceOrder[0], 'extension');
    assert.strictEqual(preferenceOrder[1], 'cli');
  });

  it('should fallback to CLI if extension unavailable', () => {
    const extensionAvailable = false;
    const cliAvailable = true;

    let method;
    if (extensionAvailable) {
      method = 'extension';
    } else if (cliAvailable) {
      method = 'cli';
    }

    assert.strictEqual(method, 'cli');
  });

  it('should use extension if available', () => {
    const extensionAvailable = true;

    let method;
    if (extensionAvailable) {
      method = 'extension';
    } else {
      method = 'cli';
    }

    assert.strictEqual(method, 'extension');
  });
});
|
||||
|
||||
describe('file matching and validation', () => {
  // Fresh downloads dir per test; removed again afterwards.
  beforeEach(() => {
    if (!fs.existsSync(TEST_DOWNLOADS_DIR)) {
      fs.mkdirSync(TEST_DOWNLOADS_DIR, { recursive: true });
    }
  });

  afterEach(() => {
    if (fs.existsSync(TEST_DOWNLOADS_DIR)) {
      fs.rmSync(TEST_DOWNLOADS_DIR, { recursive: true });
    }
  });

  it('should filter HTML files from downloads', () => {
    // Create mock download files
    const files = [
      'example.html',
      'test.pdf',
      'image.png',
      'page.html'
    ];

    const htmlFiles = files.filter(f => f.endsWith('.html'));

    assert.strictEqual(htmlFiles.length, 2);
    assert.ok(htmlFiles.includes('example.html'));
    assert.ok(htmlFiles.includes('page.html'));
  });

  it('should match URL in HTML header comment', () => {
    const testUrl = 'https://example.com/page';

    const htmlContent = `<!--
Page saved with SingleFile
url: ${testUrl}
saved date: 2024-01-01
-->
<html>...</html>`;

    // NOTE(review): String.split always yields a non-empty first element
    // unless the string starts with the separator, so the `||` fallback to
    // splitting on '<html>' can effectively never trigger here.
    const headerSection = htmlContent.split('meta charset')[0] || htmlContent.split('<html>')[0];

    assert.ok(headerSection.includes(`url: ${testUrl}`));
  });

  it('should handle multiple new files in downloads', () => {
    // Diffing the downloads dir before/after the capture to find new files.
    const filesBefore = new Set(['old1.html', 'old2.html']);
    const filesAfter = ['old1.html', 'old2.html', 'new1.html', 'new2.html'];

    const filesNew = filesAfter.filter(f => !filesBefore.has(f));

    assert.strictEqual(filesNew.length, 2);
    assert.ok(filesNew.includes('new1.html'));
    assert.ok(filesNew.includes('new2.html'));
  });
});
|
||||
|
||||
describe('error handling', () => {
  it('should timeout after max wait time', () => {
    // Polling parameters used while waiting for SingleFile's download.
    const checkDelay = 3000; // ms
    const maxTries = 10;
    const timeoutMs = checkDelay * maxTries;

    assert.strictEqual(timeoutMs, 30000); // 30 seconds
  });

  it('should handle missing extension gracefully', () => {
    const extension = null;

    // NOTE(review): the assertion only runs inside the if-branch, so this
    // test passes vacuously either way — it can never fail.
    if (!extension || !extension.version) {
      // Should throw error
      assert.ok(true);
    }
  });

  it('should handle file not found after waiting', () => {
    const filesNew = [];
    const maxWaitReached = true;

    if (filesNew.length === 0 && maxWaitReached) {
      // Should return null
      const result = null;
      assert.strictEqual(result, null);
    }
  });
});
|
||||
});
|
||||
141
archivebox/plugins/singlefile/tests/test_singlefile.py
Normal file
141
archivebox/plugins/singlefile/tests/test_singlefile.py
Normal file
@@ -0,0 +1,141 @@
|
||||
"""
|
||||
Unit tests for singlefile plugin
|
||||
|
||||
Tests invoke the plugin hook as an external process and verify outputs/side effects.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# Paths to the plugin under test (tests live in <plugin>/tests/)
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__04_singlefile.js"


def test_install_script_exists():
    """Verify install script exists"""
    assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}"


def test_extension_metadata():
    """Test that SingleFile extension has correct metadata"""
    with tempfile.TemporaryDirectory() as tmpdir:
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")

        # NOTE(review): embedding the filesystem path inside a JS string
        # literal breaks on Windows, where backslashes in the path act as
        # escape sequences — confirm POSIX-only is acceptable.
        result = subprocess.run(
            ["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"],
            capture_output=True,
            text=True,
            env=env
        )

        assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"

        metadata = json.loads(result.stdout)
        assert metadata["webstore_id"] == "mpiodijhokgodhhofbcjdecpffjipkle"
        assert metadata["name"] == "singlefile"


def test_install_creates_cache():
    """Test that install creates extension cache"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)

        # Run the installer hook as a subprocess, as the plugin runner would.
        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )

        # Check output mentions installation
        assert "SingleFile" in result.stdout or "singlefile" in result.stdout

        # Check cache file was created
        cache_file = ext_dir / "singlefile.extension.json"
        assert cache_file.exists(), "Cache file should be created"

        # Verify cache content
        cache_data = json.loads(cache_file.read_text())
        assert cache_data["webstore_id"] == "mpiodijhokgodhhofbcjdecpffjipkle"
        assert cache_data["name"] == "singlefile"


def test_install_uses_existing_cache():
    """Test that install uses existing cache when available"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        # Create fake cache so the installer can skip the network download
        fake_extension_dir = ext_dir / "mpiodijhokgodhhofbcjdecpffjipkle__singlefile"
        fake_extension_dir.mkdir(parents=True)

        manifest = {"version": "1.22.96", "name": "SingleFile"}
        (fake_extension_dir / "manifest.json").write_text(json.dumps(manifest))

        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)

        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=30
        )

        # Should use cache or install successfully
        assert result.returncode == 0


def test_no_configuration_required():
    """Test that SingleFile works without configuration"""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
        # No API keys needed

        result = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=60
        )

        # Should work without API keys
        assert result.returncode == 0


def test_priority_order():
    """Test that singlefile has correct priority (04)"""
    # Extract priority from filename
    filename = INSTALL_SCRIPT.name
    assert "04" in filename, "SingleFile should have priority 04"
    assert filename.startswith("on_Snapshot__04_"), "Should follow priority naming convention"


def test_output_directory_structure():
    """Test that plugin defines correct output structure"""
    # Verify the script mentions singlefile output directory
    script_content = INSTALL_SCRIPT.read_text()

    # Should mention singlefile output directory
    assert "singlefile" in script_content.lower()
    # Should mention HTML output
    assert ".html" in script_content or "html" in script_content.lower()
|
||||
243
archivebox/plugins/ssl/on_Snapshot__23_ssl.js
Executable file
243
archivebox/plugins/ssl/on_Snapshot__23_ssl.js
Executable file
@@ -0,0 +1,243 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Extract SSL/TLS certificate details from a URL.
|
||||
*
|
||||
* Connects to Chrome session and retrieves security details including:
|
||||
* - Protocol (TLS 1.2, TLS 1.3, etc.)
|
||||
* - Cipher suite
|
||||
* - Certificate issuer, validity period
|
||||
* - Security state
|
||||
*
|
||||
* Usage: on_Snapshot__16_ssl.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes ssl/ssl.json
|
||||
*
|
||||
* Environment variables:
|
||||
* SAVE_SSL: Enable SSL extraction (default: true)
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'ssl';
|
||||
const OUTPUT_DIR = 'ssl';
|
||||
const OUTPUT_FILE = 'ssl.json';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
|
||||
// Parse `--key=value` style CLI flags into a plain object.
// A flag without a value (`--flag`, or `--flag=`) maps to `true`; dashes in
// key names become underscores (`--snapshot-id` -> `snapshot_id`).
function parseArgs() {
  const parsed = {};
  for (const token of process.argv.slice(2)) {
    if (!token.startsWith('--')) continue;
    const eq = token.indexOf('=');
    const rawKey = eq === -1 ? token.slice(2) : token.slice(2, eq);
    const rawValue = eq === -1 ? '' : token.slice(eq + 1);
    parsed[rawKey.replace(/-/g, '_')] = rawValue || true;
  }
  return parsed;
}
|
||||
|
||||
// Read an environment variable, falling back to defaultValue when the
// variable is unset or empty. The result is always whitespace-trimmed.
function getEnv(name, defaultValue = '') {
  const raw = process.env[name];
  return (raw ? raw : defaultValue).trim();
}

// Interpret an environment variable as a boolean flag.
// Unrecognized values return defaultValue.
function getEnvBool(name, defaultValue = false) {
  const normalized = getEnv(name, '').toLowerCase();
  switch (normalized) {
    case 'true':
    case '1':
    case 'yes':
    case 'on':
      return true;
    case 'false':
    case '0':
    case 'no':
    case 'off':
      return false;
    default:
      return defaultValue;
  }
}
|
||||
|
||||
// Locate the CDP websocket URL written by the chrome_session extractor.
// Returns null when no session file exists (chrome_session did not run).
function getCdpUrl() {
  const sessionFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (!fs.existsSync(sessionFile)) {
    return null;
  }
  return fs.readFileSync(sessionFile, 'utf8').trim();
}
|
||||
|
||||
// Extract SSL details
// Connects to the already-running Chrome session (started by the
// chrome_session extractor) and queries the CDP Security domain for the
// current page's TLS state, writing the result to ssl/ssl.json.
// Returns { success, output, sslInfo } or { success: false, error }.
async function extractSsl(url) {
  // Create output directory
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

  // Only extract SSL for HTTPS URLs
  if (!url.startsWith('https://')) {
    return { success: false, error: 'URL is not HTTPS' };
  }

  let browser = null;
  let sslInfo = {};

  try {
    // Connect to existing Chrome session
    const cdpUrl = getCdpUrl();
    if (!cdpUrl) {
      return { success: false, error: 'No Chrome session found (chrome_session extractor must run first)' };
    }

    browser = await puppeteer.connect({
      browserWSEndpoint: cdpUrl,
    });

    // Get the page (prefer one already navigated to an http(s) URL)
    const pages = await browser.pages();
    const page = pages.find(p => p.url().startsWith('http')) || pages[0];

    if (!page) {
      return { success: false, error: 'No page found in Chrome session' };
    }

    // Get CDP client for low-level access
    const client = await page.target().createCDPSession();

    // Enable Security domain
    await client.send('Security.enable');

    // Get security details from the loaded page
    const securityState = await client.send('Security.getSecurityState');

    sslInfo = {
      url,
      securityState: securityState.securityState,
      schemeIsCryptographic: securityState.schemeIsCryptographic,
      summary: securityState.summary || '',
    };

    // Try to get detailed certificate info if available
    if (securityState.securityStateIssueIds && securityState.securityStateIssueIds.length > 0) {
      sslInfo.issues = securityState.securityStateIssueIds;
    }

    // Get response security details from navigation
    // NOTE(review): this listener is attached AFTER the page has already
    // loaded, and nothing below triggers a new navigation, so `mainResponse`
    // appears to always be null here and the securityDetails branch below
    // never runs. Confirm whether a page.reload() was intended before
    // relying on the certificate fields (protocol/issuer/validFrom/...).
    let mainResponse = null;
    page.on('response', async (response) => {
      if (response.url() === url || response.request().isNavigationRequest()) {
        mainResponse = response;
      }
    });

    // If we have security details from response
    if (mainResponse) {
      try {
        const securityDetails = await mainResponse.securityDetails();
        if (securityDetails) {
          sslInfo.protocol = securityDetails.protocol();
          sslInfo.subjectName = securityDetails.subjectName();
          sslInfo.issuer = securityDetails.issuer();
          sslInfo.validFrom = securityDetails.validFrom();
          sslInfo.validTo = securityDetails.validTo();
          sslInfo.certificateId = securityDetails.subjectName();

          const sanList = securityDetails.sanList();
          if (sanList && sanList.length > 0) {
            sslInfo.subjectAlternativeNames = sanList;
          }
        }
      } catch (e) {
        // Security details not available
      }
    }

    await client.detach();

    // Write output
    fs.writeFileSync(outputPath, JSON.stringify(sslInfo, null, 2));

    return { success: true, output: outputPath, sslInfo };

  } catch (e) {
    return { success: false, error: `${e.name}: ${e.message}` };
  } finally {
    if (browser) {
      // Disconnect only — the shared Chrome session must stay alive for
      // later extractors.
      browser.disconnect();
    }
  }
}
|
||||
|
||||
// Entry point: parse CLI args, run the extractor, and emit the
// START_TS/END_TS/STATUS/RESULT_JSON key=value lines the plugin runner parses.
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;

  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__16_ssl.js --url=<url> --snapshot-id=<uuid>');
    process.exit(1);
  }

  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';

  try {
    // Check if enabled; short-circuit with an explicit 'skipped' record
    if (!getEnvBool('SAVE_SSL', true)) {
      console.log('Skipping SSL (SAVE_SSL=False)');
      status = 'skipped';
      const endTs = new Date();
      console.log(`START_TS=${startTs.toISOString()}`);
      console.log(`END_TS=${endTs.toISOString()}`);
      console.log(`STATUS=${status}`);
      console.log(`RESULT_JSON=${JSON.stringify({extractor: EXTRACTOR_NAME, status, url, snapshot_id: snapshotId})}`);
      process.exit(0);
    }

    const result = await extractSsl(url);

    if (result.success) {
      status = 'succeeded';
      output = result.output;
      const protocol = result.sslInfo?.protocol || 'unknown';
      console.log(`SSL details extracted: ${protocol}`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }

  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;

  // Print results (machine-readable key=value protocol lines)
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);

  if (error) {
    console.error(`ERROR=${error}`);
  }

  // Print JSON result
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);

  // Exit code signals success/failure to the plugin runner
  process.exit(status === 'succeeded' ? 0 : 1);
}

main().catch(e => {
  console.error(`Fatal error: ${e.message}`);
  process.exit(1);
});
|
||||
337
archivebox/plugins/staticfile/on_Snapshot__31_staticfile.py
Normal file
337
archivebox/plugins/staticfile/on_Snapshot__31_staticfile.py
Normal file
@@ -0,0 +1,337 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Download static files (PDFs, images, archives, etc.) directly.
|
||||
|
||||
This extractor runs AFTER chrome_session and checks the Content-Type header
|
||||
from chrome_session/response_headers.json to determine if the URL points to
|
||||
a static file that should be downloaded directly.
|
||||
|
||||
Other extractors check for the presence of this extractor's output directory
|
||||
to know if they should skip (since Chrome-based extractors can't meaningfully
|
||||
process static files like PDFs, images, etc.).
|
||||
|
||||
Usage: on_Snapshot__21_staticfile.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Downloads file to staticfile/<filename>
|
||||
|
||||
Environment variables:
|
||||
STATICFILE_TIMEOUT: Timeout in seconds (default: 300)
|
||||
STATICFILE_MAX_SIZE: Maximum file size in bytes (default: 1GB)
|
||||
USER_AGENT: User agent string (optional)
|
||||
CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse, unquote
|
||||
|
||||
import rich_click as click
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'staticfile'
|
||||
OUTPUT_DIR = 'staticfile'
|
||||
CHROME_SESSION_DIR = 'chrome_session'
|
||||
|
||||
# Content-Types that indicate static files
|
||||
# These can't be meaningfully processed by Chrome-based extractors
|
||||
STATIC_CONTENT_TYPES = {
|
||||
# Documents
|
||||
'application/pdf',
|
||||
'application/msword',
|
||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
||||
'application/vnd.ms-excel',
|
||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
||||
'application/vnd.ms-powerpoint',
|
||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
||||
'application/rtf',
|
||||
'application/epub+zip',
|
||||
# Images
|
||||
'image/png',
|
||||
'image/jpeg',
|
||||
'image/gif',
|
||||
'image/webp',
|
||||
'image/svg+xml',
|
||||
'image/x-icon',
|
||||
'image/bmp',
|
||||
'image/tiff',
|
||||
'image/avif',
|
||||
'image/heic',
|
||||
'image/heif',
|
||||
# Audio
|
||||
'audio/mpeg',
|
||||
'audio/mp3',
|
||||
'audio/wav',
|
||||
'audio/flac',
|
||||
'audio/aac',
|
||||
'audio/ogg',
|
||||
'audio/webm',
|
||||
'audio/m4a',
|
||||
'audio/opus',
|
||||
# Video
|
||||
'video/mp4',
|
||||
'video/webm',
|
||||
'video/x-matroska',
|
||||
'video/avi',
|
||||
'video/quicktime',
|
||||
'video/x-ms-wmv',
|
||||
'video/x-flv',
|
||||
# Archives
|
||||
'application/zip',
|
||||
'application/x-tar',
|
||||
'application/gzip',
|
||||
'application/x-bzip2',
|
||||
'application/x-xz',
|
||||
'application/x-7z-compressed',
|
||||
'application/x-rar-compressed',
|
||||
'application/vnd.rar',
|
||||
# Data
|
||||
'application/json',
|
||||
'application/xml',
|
||||
'text/csv',
|
||||
'text/xml',
|
||||
'application/x-yaml',
|
||||
# Executables/Binaries
|
||||
'application/octet-stream', # Generic binary
|
||||
'application/x-executable',
|
||||
'application/x-msdos-program',
|
||||
'application/x-apple-diskimage',
|
||||
'application/vnd.debian.binary-package',
|
||||
'application/x-rpm',
|
||||
# Other
|
||||
'application/x-bittorrent',
|
||||
'application/wasm',
|
||||
}
|
||||
|
||||
# Also check Content-Type prefixes for categories
|
||||
STATIC_CONTENT_TYPE_PREFIXES = (
|
||||
'image/',
|
||||
'audio/',
|
||||
'video/',
|
||||
'application/zip',
|
||||
'application/x-',
|
||||
)
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
    """Return the environment variable `name`, stripped, or `default` if unset."""
    return os.environ.get(name, default).strip()


def get_env_bool(name: str, default: bool = False) -> bool:
    """Interpret an env var as a boolean flag; unrecognized values yield `default`."""
    value = get_env(name).lower()
    if value in {'true', '1', 'yes', 'on'}:
        return True
    if value in {'false', '0', 'no', 'off'}:
        return False
    return default


def get_env_int(name: str, default: int = 0) -> int:
    """Interpret an env var as an integer, falling back to `default` on bad input."""
    raw = get_env(name, str(default))
    try:
        return int(raw)
    except ValueError:
        return default
|
||||
|
||||
|
||||
def get_content_type_from_chrome_session() -> str | None:
    """Read the Content-Type recorded by the chrome_session extractor.

    Returns the bare MIME type (lowercased, `;charset=...` parameters
    stripped), or None when the headers file is missing or unreadable.
    """
    headers_file = Path(CHROME_SESSION_DIR) / 'response_headers.json'
    if not headers_file.exists():
        return None

    try:
        headers = json.loads(headers_file.read_text())
        # Header capitalization varies depending on chrome_session's format
        raw = headers.get('content-type') or headers.get('Content-Type') or ''
        return raw.split(';')[0].strip().lower()
    except Exception:
        # Best-effort: any parse/shape problem is treated as "unknown"
        return None
|
||||
|
||||
|
||||
def is_static_content_type(content_type: str) -> bool:
    """Return True when the MIME type should be downloaded as a static file.

    Matches either an exact entry in STATIC_CONTENT_TYPES or any of the
    category prefixes in STATIC_CONTENT_TYPE_PREFIXES (image/, audio/, ...).
    """
    if not content_type:
        return False
    return (
        content_type in STATIC_CONTENT_TYPES
        or content_type.startswith(STATIC_CONTENT_TYPE_PREFIXES)
    )
|
||||
|
||||
|
||||
def get_filename_from_url(url: str) -> str:
    """Derive a safe local filename from a URL's path component.

    Falls back to 'downloaded_file' for bare directory URLs, replaces path
    separators, and caps the length at 200 characters.
    """
    decoded_path = unquote(urlparse(url).path)
    basename = decoded_path.rsplit('/', 1)[-1] or 'downloaded_file'
    sanitized = basename.replace('/', '_').replace('\\', '_')
    return sanitized[:200]
|
||||
|
||||
|
||||
def download_file(url: str) -> tuple[bool, str | None, str]:
    """
    Download a static file to OUTPUT_DIR.

    Streams the response in chunks so large files don't buffer in memory,
    enforcing STATICFILE_MAX_SIZE both from the Content-Length header and
    from the actual bytes received.

    Returns: (success, output_path, error_message)
    """
    import requests

    timeout = get_env_int('STATICFILE_TIMEOUT', 300)
    max_size = get_env_int('STATICFILE_MAX_SIZE', 1024 * 1024 * 1024)  # 1GB default
    user_agent = get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    check_ssl = get_env_bool('CHECK_SSL_VALIDITY', True)

    headers = {'User-Agent': user_agent}

    try:
        # Stream download to handle large files
        response = requests.get(
            url,
            headers=headers,
            timeout=timeout,
            stream=True,
            verify=check_ssl,
            allow_redirects=True,
        )
        response.raise_for_status()

        # Reject early when the server declares a size over the limit
        content_length = response.headers.get('content-length')
        if content_length and int(content_length) > max_size:
            return False, None, f'File too large: {int(content_length)} bytes > {max_size} max'

        # Create output directory
        output_dir = Path(OUTPUT_DIR)
        output_dir.mkdir(exist_ok=True)

        # Determine filename: URL path first, then Content-Disposition override
        filename = get_filename_from_url(url)
        content_disp = response.headers.get('content-disposition', '')
        if 'filename=' in content_disp:
            import re
            match = re.search(r'filename[*]?=["\']?([^"\';\n]+)', content_disp)
            if match:
                # Strip any directory components from the server-supplied name
                # to prevent path traversal (e.g. filename=../../evil).
                candidate = os.path.basename(match.group(1).strip())
                if candidate:
                    filename = candidate

        output_path = output_dir / filename

        # Download in chunks, aborting (and removing the partial file) if the
        # body exceeds max_size despite a missing/undersized Content-Length.
        downloaded_size = 0
        too_large = False
        with open(output_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if not chunk:
                    continue
                downloaded_size += len(chunk)
                if downloaded_size > max_size:
                    too_large = True
                    break
                f.write(chunk)

        # Clean up after the file handle is closed (the original unlinked the
        # file while still inside the `with` block after a manual close()).
        if too_large:
            output_path.unlink()
            return False, None, f'File too large: exceeded {max_size} bytes'

        return True, str(output_path), ''

    except requests.exceptions.Timeout:
        return False, None, f'Timed out after {timeout} seconds'
    except requests.exceptions.SSLError as e:
        return False, None, f'SSL error: {e}'
    except requests.exceptions.RequestException as e:
        return False, None, f'Download failed: {e}'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL to download')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Download static files based on Content-Type from chrome_session."""

    start_ts = datetime.now(timezone.utc)
    output = None
    status = 'failed'
    error = ''

    # Check Content-Type from chrome_session's response headers
    content_type = get_content_type_from_chrome_session()

    # If chrome_session didn't run or no Content-Type, skip
    # (exit 0 so the runner records a permanent skip rather than a failure)
    if not content_type:
        print(f'No Content-Type found (chrome_session may not have run)')
        print(f'START_TS={start_ts.isoformat()}')
        print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
        print(f'STATUS=skipped')
        print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
        sys.exit(0)  # Permanent skip - can't determine content type

    # If not a static file type, skip (this is the normal case for HTML pages)
    if not is_static_content_type(content_type):
        print(f'Not a static file (Content-Type: {content_type})')
        print(f'START_TS={start_ts.isoformat()}')
        print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
        print(f'STATUS=skipped')
        print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id, "content_type": content_type})}')
        sys.exit(0)  # Permanent skip - not a static file

    try:
        # Download the file
        print(f'Static file detected (Content-Type: {content_type}), downloading...')
        success, output, error = download_file(url)
        status = 'succeeded' if success else 'failed'

        if success and output:
            size = Path(output).stat().st_size
            print(f'Static file downloaded ({size} bytes): {output}')

    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

    # Print results (machine-readable key=value protocol lines)
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()

    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')

    if error:
        print(f'ERROR={error}', file=sys.stderr)

    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'content_type': content_type,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')

    # Exit code: 0 only on success (the skip paths above already exited 0)
    sys.exit(0 if status == 'succeeded' else 1)


if __name__ == '__main__':
    main()
|
||||
262
archivebox/plugins/title/on_Snapshot__32_title.js
Normal file
262
archivebox/plugins/title/on_Snapshot__32_title.js
Normal file
@@ -0,0 +1,262 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Extract the title of a URL.
|
||||
*
|
||||
* If a Chrome session exists (from chrome_session extractor), connects to it via CDP
|
||||
* to get the page title (which includes JS-rendered content).
|
||||
* Otherwise falls back to fetching the URL and parsing HTML.
|
||||
*
|
||||
* Usage: on_Snapshot__10_title.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes title/title.txt
|
||||
*
|
||||
* Environment variables:
|
||||
* TIMEOUT: Timeout in seconds (default: 30)
|
||||
* USER_AGENT: User agent string (optional)
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const https = require('https');
|
||||
const http = require('http');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'title';
|
||||
const OUTPUT_DIR = 'title';
|
||||
const OUTPUT_FILE = 'title.txt';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
|
||||
// Parse `--key=value` CLI flags into an object. Values containing '=' are
// preserved; bare flags (`--flag` or `--flag=`) become `true`; dashes in
// keys become underscores.
function parseArgs() {
  return process.argv.slice(2).reduce((acc, token) => {
    if (token.startsWith('--')) {
      const [flag, ...rest] = token.slice(2).split('=');
      acc[flag.replace(/-/g, '_')] = rest.join('=') || true;
    }
    return acc;
  }, {});
}
|
||||
|
||||
// Read an environment variable (trimmed), falling back to defaultValue when
// the variable is unset or empty.
function getEnv(name, defaultValue = '') {
  const raw = process.env[name];
  return (raw ? raw : defaultValue).trim();
}

// Read an environment variable as a base-10 integer; non-numeric values
// fall back to defaultValue.
function getEnvInt(name, defaultValue = 0) {
  const parsed = parseInt(getEnv(name, String(defaultValue)), 10);
  return Number.isNaN(parsed) ? defaultValue : parsed;
}
|
||||
|
||||
// Get CDP URL from chrome_session if available
// Returns the websocket endpoint written by the chrome_session extractor,
// or null when that extractor has not run (callers then fall back to a
// plain HTTP fetch).
function getCdpUrl() {
  const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
  if (fs.existsSync(cdpFile)) {
    return fs.readFileSync(cdpFile, 'utf8').trim();
  }
  return null;
}
|
||||
|
||||
// Extract title from HTML.
// Checks <title>, then og:title, then twitter:title. Decodes common HTML
// entities and collapses runs of whitespace so multi-line titles come out
// as a single clean string (the previous version returned the raw,
// entity-encoded text). Returns null when no title can be found.
function extractTitleFromHtml(html) {
  // Decode numeric character references and the named entities commonly
  // seen in title text. `&amp;` is handled last so double-encoded input
  // (e.g. `&amp;lt;`) is not decoded twice.
  const decodeEntities = (text) => text
    .replace(/&#x([0-9a-fA-F]+);/g, (_, hex) => String.fromCodePoint(parseInt(hex, 16)))
    .replace(/&#(\d+);/g, (_, dec) => String.fromCodePoint(parseInt(dec, 10)))
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&apos;/g, "'")
    .replace(/&nbsp;/g, ' ')
    .replace(/&amp;/g, '&');

  const patterns = [
    /<title[^>]*>([^<]+)<\/title>/i,                                        // standard <title>
    /<meta[^>]+property=["']og:title["'][^>]+content=["']([^"']+)["']/i,    // Open Graph
    /<meta[^>]+name=["']twitter:title["'][^>]+content=["']([^"']+)["']/i,   // Twitter card
  ];
  for (const pattern of patterns) {
    const match = html.match(pattern);
    if (match) {
      // Titles may span lines inside <title>; normalize to single spaces.
      return decodeEntities(match[1].replace(/\s+/g, ' ').trim());
    }
  }

  return null;
}
|
||||
|
||||
// Fetch URL and extract title (fallback method, no browser needed).
// Follows redirects (bounded, resolving relative Location headers against
// the current URL) and reads at most ~64KB of the body — enough for <head>.
//
// Fixes vs. the previous version:
//  - aborting the request after 64KB destroyed the socket before 'end'
//    fired, so the returned promise never settled; we now parse what we
//    have before destroying the request;
//  - redirect chains were unbounded and relative Location values failed.
function fetchTitle(url, redirectsLeft = 5) {
  return new Promise((resolve, reject) => {
    const timeout = getEnvInt('TIMEOUT', 30) * 1000;
    const userAgent = getEnv('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)');

    const client = url.startsWith('https') ? https : http;

    const req = client.get(url, {
      headers: { 'User-Agent': userAgent },
      timeout,
    }, (res) => {
      // Handle redirects
      if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
        res.resume(); // drain the redirect body so the socket is released
        if (redirectsLeft <= 0) {
          reject(new Error('Too many redirects'));
          return;
        }
        let nextUrl;
        try {
          // Location may be relative; resolve it against the current URL
          nextUrl = new URL(res.headers.location, url).toString();
        } catch {
          reject(new Error(`Invalid redirect location: ${res.headers.location}`));
          return;
        }
        fetchTitle(nextUrl, redirectsLeft - 1).then(resolve).catch(reject);
        return;
      }

      let data = '';
      let settled = false;
      const finish = () => {
        if (settled) return;
        settled = true;
        const title = extractTitleFromHtml(data);
        if (title) {
          resolve(title);
        } else {
          reject(new Error('No title found in HTML'));
        }
      };

      res.on('data', chunk => {
        data += chunk;
        // Only need first 64KB to find title: settle with what we have,
        // then abort the transfer.
        if (data.length > 65536) {
          finish();
          req.destroy();
        }
      });
      res.on('end', finish);
    });

    // Late errors after an early settle (e.g. caused by req.destroy()) are
    // harmless: a promise can only settle once.
    req.on('error', reject);
    req.on('timeout', () => {
      req.destroy();
      reject(new Error('Request timeout'));
    });
  });
}
|
||||
|
||||
// Read the current page title from an already-running Chrome session over CDP.
// Prefers page.title(); when that is empty, queries the DOM for og:title /
// twitter:title / the first <h1>. Disconnects (without closing the browser)
// before returning.
async function getTitleFromCdp(cdpUrl) {
  const puppeteer = require('puppeteer-core');

  const browser = await puppeteer.connect({
    browserWSEndpoint: cdpUrl,
  });

  try {
    // Prefer a tab that has actually navigated somewhere over about:blank.
    const pages = await browser.pages();
    const page = pages.find(p => p.url().startsWith('http')) || pages[0];
    if (!page) {
      throw new Error('No page found in Chrome session');
    }

    const title = await page.title();
    if (title) {
      return title;
    }

    // Empty title: fall back to common metadata sources in the DOM.
    return await page.evaluate(() =>
      document.title ||
      document.querySelector('meta[property="og:title"]')?.content ||
      document.querySelector('meta[name="twitter:title"]')?.content ||
      document.querySelector('h1')?.textContent?.trim()
    );
  } finally {
    // Leave the shared browser running for other extractors.
    browser.disconnect();
  }
}
|
||||
|
||||
// Run the title extraction for one URL: try the shared Chrome session first,
// then fall back to a plain HTTP fetch. On success the title is written to
// OUTPUT_DIR/OUTPUT_FILE. Returns a result descriptor:
//   { success, output, title, method } on success, { success, error } on failure.
async function extractTitle(url) {
  // Ensure the output directory exists.
  if (!fs.existsSync(OUTPUT_DIR)) {
    fs.mkdirSync(OUTPUT_DIR, { recursive: true });
  }
  const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);

  // Preferred method: reuse the Chrome session started by an earlier hook.
  const cdpUrl = getCdpUrl();
  if (cdpUrl) {
    try {
      const title = await getTitleFromCdp(cdpUrl);
      if (title) {
        fs.writeFileSync(outputPath, title, 'utf8');
        return { success: true, output: outputPath, title, method: 'cdp' };
      }
    } catch (e) {
      console.error(`CDP title extraction failed: ${e.message}, falling back to HTTP`);
    }
  }

  // Fallback: fetch the HTML directly and scrape the title out of it.
  try {
    const title = await fetchTitle(url);
    fs.writeFileSync(outputPath, title, 'utf8');
    return { success: true, output: outputPath, title, method: 'http' };
  } catch (e) {
    return { success: false, error: e.message };
  }
}
|
||||
|
||||
// CLI entry point: parse --url/--snapshot-id, run the extraction, and emit
// the START_TS/END_TS/DURATION/OUTPUT/STATUS/RESULT_JSON lines that the
// plugin runner parses from stdout. Exits 0 on success, 1 on failure.
async function main() {
  const args = parseArgs();
  const url = args.url;
  const snapshotId = args.snapshot_id;

  if (!url || !snapshotId) {
    // BUGFIX: the usage message hard-coded a stale script name
    // (on_Snapshot__10_title.js) from before this hook was renamed; derive
    // the name from argv so it stays correct across renumbering.
    const script = path.basename(process.argv[1] || 'on_Snapshot__39_accessibility.js');
    console.error(`Usage: ${script} --url=<url> --snapshot-id=<uuid>`);
    process.exit(1);
  }

  const startTs = new Date();
  let status = 'failed';
  let output = null;
  let error = '';

  try {
    const result = await extractTitle(url);

    if (result.success) {
      status = 'succeeded';
      output = result.output;
      console.log(`Title extracted (${result.method}): ${result.title}`);
    } else {
      status = 'failed';
      error = result.error;
    }
  } catch (e) {
    error = `${e.name}: ${e.message}`;
    status = 'failed';
  }

  const endTs = new Date();
  const duration = (endTs - startTs) / 1000;

  // Key=value lines parsed by the plugin runner.
  console.log(`START_TS=${startTs.toISOString()}`);
  console.log(`END_TS=${endTs.toISOString()}`);
  console.log(`DURATION=${duration.toFixed(2)}`);
  if (output) {
    console.log(`OUTPUT=${output}`);
  }
  console.log(`STATUS=${status}`);

  if (error) {
    console.error(`ERROR=${error}`);
  }

  // Machine-readable summary of the whole run.
  const resultJson = {
    extractor: EXTRACTOR_NAME,
    url,
    snapshot_id: snapshotId,
    status,
    start_ts: startTs.toISOString(),
    end_ts: endTs.toISOString(),
    duration: Math.round(duration * 100) / 100,
    output,
    error: error || null,
  };
  console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);

  process.exit(status === 'succeeded' ? 0 : 1);
}
|
||||
|
||||
// Kick off the extractor; any unhandled rejection is fatal.
main().catch((err) => {
  console.error(`Fatal error: ${err.message}`);
  process.exit(1);
});
|
||||
241
archivebox/plugins/title/tests/test_title.py
Normal file
241
archivebox/plugins/title/tests/test_title.py
Normal file
@@ -0,0 +1,241 @@
|
||||
"""
|
||||
Integration tests for title plugin
|
||||
|
||||
Tests verify:
|
||||
1. Plugin script exists
|
||||
2. Node.js is available
|
||||
3. Title extraction works for real example.com
|
||||
4. Output file contains actual page title
|
||||
5. Handles various title sources (<title>, og:title, twitter:title)
|
||||
6. Config options work (TIMEOUT, USER_AGENT)
|
||||
7. Fallback to HTTP when chrome_session not available
|
||||
"""
|
||||
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# Paths to the plugin under test, resolved relative to this test file.
PLUGIN_DIR = Path(__file__).parent.parent
# NOTE(review): assumes the hook is numbered __32 — other references in this
# changeset use different priority numbers (__10, __18, __39); confirm this
# matches the actual filename on disk.
TITLE_HOOK = PLUGIN_DIR / 'on_Snapshot__32_title.js'
TEST_URL = 'https://example.com'
|
||||
|
||||
|
||||
def test_hook_script_exists():
    """The hook script must be present on disk before anything else can run."""
    hook = TITLE_HOOK
    assert hook.exists(), f"Hook script not found: {hook}"
|
||||
|
||||
|
||||
def test_extracts_title_from_example_com():
    """End-to-end: run the hook against the real example.com and verify outputs."""
    # Skip rather than fail on machines without a node runtime.
    if not shutil.which('node'):
        pytest.skip("node not installed")

    with tempfile.TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        proc = subprocess.run(
            ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=60,
        )

        assert proc.returncode == 0, f"Extraction failed: {proc.stderr}"

        # Runner-facing status lines must be present on stdout.
        assert 'STATUS=succeeded' in proc.stdout, "Should report success"
        assert 'Title extracted' in proc.stdout, "Should report completion"

        # The hook writes title/title.txt under its working directory.
        title_dir = workdir / 'title'
        assert title_dir.exists(), "Output directory not created"
        title_file = title_dir / 'title.txt'
        assert title_file.exists(), "title.txt not created"

        # example.com's real page title is "Example Domain".
        title_text = title_file.read_text().strip()
        assert len(title_text) > 0, "Title should not be empty"
        assert 'example' in title_text.lower(), "Title should contain 'example'"
        assert 'example domain' in title_text.lower(), f"Expected 'Example Domain', got: {title_text}"

        assert 'RESULT_JSON=' in proc.stdout, "Should output RESULT_JSON"
|
||||
|
||||
|
||||
def test_falls_back_to_http_when_chrome_session_unavailable():
    """Without a chrome_session dir the hook must fall back to plain HTTP."""
    if not shutil.which('node'):
        pytest.skip("node not installed")

    with tempfile.TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Deliberately do NOT create a chrome_session directory, which forces
        # the HTTP-fetch fallback path inside the hook.
        proc = subprocess.run(
            ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'],
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=60,
        )

        assert proc.returncode == 0, f"Extraction failed: {proc.stderr}"
        assert 'STATUS=succeeded' in proc.stdout, "Should report success"

        # The fallback must still produce a real title file.
        title_file = workdir / 'title' / 'title.txt'
        assert title_file.exists(), "Output title.txt not created"
        assert 'example' in title_file.read_text().strip().lower()
|
||||
|
||||
|
||||
def test_config_timeout_honored():
    """A short TIMEOUT must not make the hook hang; it should always terminate."""
    if not shutil.which('node'):
        pytest.skip("node not installed")

    with tempfile.TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # 5s is tight but example.com normally responds well within it.
        import os
        env = {**os.environ, 'TIMEOUT': '5'}

        proc = subprocess.run(
            ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=30,
        )

        # Either outcome is acceptable — the point is that it completes.
        assert proc.returncode in (0, 1), "Should complete without hanging"
|
||||
|
||||
|
||||
def test_config_user_agent():
    """A custom USER_AGENT must be accepted without breaking extraction."""
    if not shutil.which('node'):
        pytest.skip("node not installed")

    with tempfile.TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        import os
        env = {**os.environ, 'USER_AGENT': 'TestBot/1.0'}

        proc = subprocess.run(
            ['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testua'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=60,
        )

        # example.com does not block unusual user agents, so success is expected.
        if proc.returncode == 0:
            assert 'STATUS=succeeded' in proc.stdout
|
||||
|
||||
|
||||
def test_handles_https_urls():
    """HTTPS URLs must be fetched correctly (TLS path of the fallback client)."""
    if not shutil.which('node'):
        pytest.skip("node not installed")

    with tempfile.TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        proc = subprocess.run(
            ['node', str(TITLE_HOOK), '--url=https://example.org', '--snapshot-id=testhttps'],
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=60,
        )

        if proc.returncode == 0:
            title_file = workdir / 'title' / 'title.txt'
            if title_file.exists():
                title_text = title_file.read_text().strip()
                assert len(title_text) > 0, "Title should not be empty"
                assert 'example' in title_text.lower()
|
||||
|
||||
|
||||
def test_handles_404_gracefully():
    """A 404 URL must not crash or hang the hook.

    example.com serves valid HTML even for missing pages, so extraction may
    still succeed with the generic "Example Domain" title.
    """
    if not shutil.which('node'):
        pytest.skip("node not installed")

    with tempfile.TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        proc = subprocess.run(
            ['node', str(TITLE_HOOK), '--url=https://example.com/nonexistent-page-404', '--snapshot-id=test404'],
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=60,
        )

        # Either exit code is fine — the requirement is graceful completion.
        assert proc.returncode in (0, 1), "Should complete (may succeed or fail)"
|
||||
|
||||
|
||||
def test_handles_redirects():
    """The hook must follow HTTP→HTTPS redirects to reach the final page."""
    if not shutil.which('node'):
        pytest.skip("node not installed")

    with tempfile.TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # http://example.com 3xx-redirects to https://example.com
        proc = subprocess.run(
            ['node', str(TITLE_HOOK), '--url=http://example.com', '--snapshot-id=testredirect'],
            cwd=workdir,
            capture_output=True,
            text=True,
            timeout=60,
        )

        if proc.returncode == 0:
            title_file = workdir / 'title' / 'title.txt'
            if title_file.exists():
                assert 'example' in title_file.read_text().strip().lower()
|
||||
|
||||
|
||||
# Allow running this file directly (python test_title.py) without the pytest CLI.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])
|
||||
116
archivebox/plugins/ublock/on_Snapshot__03_ublock.js
Executable file
116
archivebox/plugins/ublock/on_Snapshot__03_ublock.js
Executable file
@@ -0,0 +1,116 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* uBlock Origin Extension Plugin
|
||||
*
|
||||
* Installs and configures the uBlock Origin Chrome extension for ad blocking
|
||||
* and privacy protection during page archiving.
|
||||
*
|
||||
* Extension: https://chromewebstore.google.com/detail/cjpalhdlnbpafiamejdnhcphjbkeiagm
|
||||
*
|
||||
* Priority: 03 (early) - Must install before Chrome session starts
|
||||
* Hook: on_Snapshot
|
||||
*
|
||||
* This extension automatically:
|
||||
* - Blocks ads, trackers, and malware domains
|
||||
* - Reduces page load time and bandwidth usage
|
||||
* - Improves privacy during archiving
|
||||
* - Removes clutter from archived pages
|
||||
* - Uses efficient blocking with filter lists
|
||||
*/
|
||||
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
|
||||
// Import extension utilities
|
||||
const extensionUtils = require('../chrome_extensions/chrome_extension_utils.js');
|
||||
|
||||
// Static descriptor for the uBlock Origin extension:
// `webstore_id` is the stable Chrome Web Store ID (used to fetch the CRX),
// `name` keys the local install cache (ublock.extension.json).
const EXTENSION = {
  webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm',
  name: 'ublock',
};
|
||||
|
||||
// Where unpacked extensions live: explicit CHROME_EXTENSIONS_DIR override,
// otherwise <DATA_DIR>/personas/<ACTIVE_PERSONA>/chrome_extensions.
// NOTE: resolved once at require time — later changes to these env vars
// are not picked up by this module.
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
  path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
|
||||
|
||||
/**
 * Download and unpack the uBlock Origin extension into EXTENSIONS_DIR.
 *
 * @returns {Promise<object|null>} extension metadata on success, null on failure
 */
async function installUblockExtension() {
  console.log('[*] Installing uBlock Origin extension...');

  const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
  if (!extension) {
    console.error('[❌] Failed to install uBlock Origin extension');
    return null;
  }

  console.log('[+] uBlock Origin extension installed');
  console.log('[+] Ads and trackers will be blocked during archiving');
  return extension;
}
|
||||
|
||||
/**
|
||||
* Note: uBlock Origin works automatically with default filter lists.
|
||||
* No configuration needed - blocks ads, trackers, and malware domains out of the box.
|
||||
*/
|
||||
|
||||
/**
 * Entry point: reuse a cached uBlock Origin install when the cache file and
 * its unpacked manifest are still intact; otherwise (re)install and persist
 * fresh metadata for chrome_session to load.
 *
 * @returns {Promise<object|null>} extension metadata, or null if install failed
 */
async function main() {
  const cacheFile = path.join(EXTENSIONS_DIR, 'ublock.extension.json');

  // Fast path: a valid cache entry pointing at an intact unpacked extension.
  if (fs.existsSync(cacheFile)) {
    try {
      const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
      const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
      if (fs.existsSync(manifestPath)) {
        console.log('[*] uBlock Origin extension already installed (using cache)');
        return cached;
      }
    } catch (e) {
      // Unreadable/invalid cache: fall through and reinstall from scratch.
      console.warn('[⚠️] Extension cache corrupted, re-installing...');
    }
  }

  const extension = await installUblockExtension();

  if (extension) {
    // Persist metadata so chrome_session can load the extension later.
    await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
    await fs.promises.writeFile(cacheFile, JSON.stringify(extension, null, 2));
    console.log(`[+] Extension metadata written to ${cacheFile}`);
  }

  return extension;
}
|
||||
|
||||
// Public API for sibling plugins (e.g. chrome_session) that want to trigger
// or inspect the uBlock install without shelling out to this script.
module.exports = {
  EXTENSION,
  installUblockExtension,
};
|
||||
|
||||
// When executed directly as a hook script (rather than required as a module),
// run the installer and map the outcome onto a process exit code.
if (require.main === module) {
  main()
    .then(() => {
      console.log('[✓] uBlock Origin extension setup complete');
      process.exit(0);
    })
    .catch((err) => {
      console.error('[❌] uBlock Origin extension setup failed:', err);
      process.exit(1);
    });
}
|
||||
321
archivebox/plugins/ublock/tests/test_ublock.js
Normal file
321
archivebox/plugins/ublock/tests/test_ublock.js
Normal file
@@ -0,0 +1,321 @@
|
||||
/**
|
||||
* Unit tests for ublock plugin
|
||||
*
|
||||
* Run with: node --test tests/test_ublock.js
|
||||
*/
|
||||
|
||||
const assert = require('assert');
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const { describe, it, before, after, beforeEach, afterEach } = require('node:test');
|
||||
|
||||
// Test fixtures
|
||||
const TEST_DIR = path.join(__dirname, '.test_fixtures');
|
||||
const TEST_EXTENSIONS_DIR = path.join(TEST_DIR, 'chrome_extensions');
|
||||
|
||||
describe('ublock plugin', () => {
|
||||
before(() => {
|
||||
if (!fs.existsSync(TEST_DIR)) {
|
||||
fs.mkdirSync(TEST_DIR, { recursive: true });
|
||||
}
|
||||
});
|
||||
|
||||
after(() => {
|
||||
if (fs.existsSync(TEST_DIR)) {
|
||||
fs.rmSync(TEST_DIR, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
||||
// Sanity checks on the static extension descriptor exported by the plugin.
describe('EXTENSION metadata', () => {
  it('should have correct webstore_id for uBlock Origin', () => {
    const mod = require('../on_Snapshot__03_ublock.js');
    assert.strictEqual(mod.EXTENSION.webstore_id, 'cjpalhdlnbpafiamejdnhcphjbkeiagm');
  });

  it('should have correct name', () => {
    const mod = require('../on_Snapshot__03_ublock.js');
    assert.strictEqual(mod.EXTENSION.name, 'ublock');
  });
});
|
||||
|
||||
// Behavior of the install routine, exercised against a temp extensions dir.
// NOTE(review): the plugin resolves EXTENSIONS_DIR at require time, so setting
// CHROME_EXTENSIONS_DIR in beforeEach only takes effect if the env var was
// already set when the module was first loaded — verify this is intended.
describe('installUblockExtension', () => {
  beforeEach(() => {
    process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
    fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
  });

  afterEach(() => {
    fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true, force: true });
    delete process.env.CHROME_EXTENSIONS_DIR;
  });

  it('should use cached extension if available', async () => {
    const { installUblockExtension } = require('../on_Snapshot__03_ublock.js');

    // Seed a fake unpacked extension plus a matching cache entry.
    const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'ublock.extension.json');
    const fakeExtensionDir = path.join(TEST_EXTENSIONS_DIR, 'fake_ublock');
    fs.mkdirSync(fakeExtensionDir, { recursive: true });
    fs.writeFileSync(
      path.join(fakeExtensionDir, 'manifest.json'),
      JSON.stringify({ version: '1.67.0' })
    );
    fs.writeFileSync(cacheFile, JSON.stringify({
      webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm',
      name: 'ublock',
      unpacked_path: fakeExtensionDir,
      version: '1.67.0',
    }));

    const result = await installUblockExtension();

    assert.notStrictEqual(result, null);
    assert.strictEqual(result.webstore_id, 'cjpalhdlnbpafiamejdnhcphjbkeiagm');
  });

  it('should not require any configuration', async () => {
    // uBlock Origin ships with default filter lists — no config fields needed.
    const { EXTENSION } = require('../on_Snapshot__03_ublock.js');
    assert.ok(EXTENSION);
  });

  it('should have large download size (filter lists)', () => {
    // Documents the expected size class: bundled filter lists make the
    // package roughly 3-5 MB.
    const typicalSize = 4 * 1024 * 1024;
    const minExpectedSize = 2 * 1024 * 1024;
    assert.ok(typicalSize > minExpectedSize);
  });
});
|
||||
|
||||
// Shape of the ublock.extension.json cache file that chrome_session consumes.
describe('cache file creation', () => {
  beforeEach(() => {
    process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;
    fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
  });

  afterEach(() => {
    fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true, force: true });
    delete process.env.CHROME_EXTENSIONS_DIR;
  });

  it('should create cache file with correct structure', async () => {
    const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'ublock.extension.json');

    const mockExtension = {
      webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm',
      name: 'ublock',
      version: '1.68.0',
      unpacked_path: path.join(TEST_EXTENSIONS_DIR, 'test_ublock'),
      crx_path: path.join(TEST_EXTENSIONS_DIR, 'test_ublock.crx'),
    };
    await fs.promises.writeFile(cacheFile, JSON.stringify(mockExtension, null, 2));

    assert.ok(fs.existsSync(cacheFile));

    // Round-trip: the file must parse back to the same identifying fields.
    const cache = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
    assert.strictEqual(cache.name, 'ublock');
    assert.strictEqual(cache.webstore_id, 'cjpalhdlnbpafiamejdnhcphjbkeiagm');
  });
});
|
||||
|
||||
// Documents uBlock's zero-config operating model; these assertions pin the
// stated expectations rather than exercising the extension itself.
describe('extension functionality', () => {
  it('should work automatically with default filter lists', () => {
    const features = {
      automaticBlocking: true,
      requiresConfiguration: false,
      requiresApiKey: false,
      defaultFilterLists: true,
      blocksAds: true,
      blocksTrackers: true,
      blocksMalware: true
    };

    for (const enabled of ['automaticBlocking', 'defaultFilterLists']) {
      assert.strictEqual(features[enabled], true);
    }
    for (const disabled of ['requiresConfiguration', 'requiresApiKey']) {
      assert.strictEqual(features[disabled], false);
    }
  });

  it('should not require runtime configuration', () => {
    // Blocking is driven entirely by filter lists and content scripts —
    // no API keys or runtime settings.
    const requiresRuntimeConfig = false;
    const requiresApiKey = false;
    assert.strictEqual(requiresRuntimeConfig, false);
    assert.strictEqual(requiresApiKey, false);
  });

  it('should support standard filter list formats', () => {
    const supportedFormats = [
      'EasyList',
      'EasyPrivacy',
      'Malware Domains',
      'Peter Lowe\'s List',
      'uBlock Origin filters'
    ];
    assert.ok(supportedFormats.length > 0);
  });
});
|
||||
|
||||
// Hook-ordering contract: the filename priority must slot this plugin
// after cookie dismissal (02) and before chrome_session (20).
describe('priority and execution order', () => {
  it('should have priority 03 (early)', () => {
    const filename = 'on_Snapshot__03_ublock.js';
    const match = filename.match(/on_Snapshot__(\d+)_/);
    assert.ok(match);
    assert.strictEqual(parseInt(match[1], 10), 3);
  });

  it('should run before chrome_session (priority 20)', () => {
    const extensionPriority = 3;
    const chromeSessionPriority = 20;
    assert.ok(extensionPriority < chromeSessionPriority);
  });

  it('should run after cookie dismissal extension', () => {
    const ublockPriority = 3;
    const cookiesPriority = 2;
    assert.ok(ublockPriority > cookiesPriority);
  });
});
|
||||
|
||||
// Documents the performance expectations around caching and blocking cost.
describe('performance considerations', () => {
  it('should benefit from caching due to large size', () => {
    // A multi-MB download vs. a stat() of the cache file: caching should be
    // orders of magnitude faster.
    const averageDownloadTime = 10;      // seconds
    const averageCacheCheckTime = 0.01;  // seconds
    const performanceGain = averageDownloadTime / averageCacheCheckTime;
    assert.ok(performanceGain > 100);
  });

  it('should not impact page load time significantly', () => {
    // Despite the large package, blocking itself is cheap per-request.
    const efficientBlocking = true;
    const minimalOverhead = true;
    assert.strictEqual(efficientBlocking, true);
    assert.strictEqual(minimalOverhead, true);
  });
});
|
||||
|
||||
// Failure-mode behavior: corrupted cache files and slow downloads.
describe('error handling', () => {
  beforeEach(() => {
    process.env.CHROME_EXTENSIONS_DIR = TEST_EXTENSIONS_DIR;

    if (!fs.existsSync(TEST_EXTENSIONS_DIR)) {
      fs.mkdirSync(TEST_EXTENSIONS_DIR, { recursive: true });
    }
  });

  afterEach(() => {
    if (fs.existsSync(TEST_EXTENSIONS_DIR)) {
      fs.rmSync(TEST_EXTENSIONS_DIR, { recursive: true });
    }
    delete process.env.CHROME_EXTENSIONS_DIR;
  });

  it('should handle corrupted cache gracefully', async () => {
    const cacheFile = path.join(TEST_EXTENSIONS_DIR, 'ublock.extension.json');

    // Create corrupted cache
    fs.writeFileSync(cacheFile, 'invalid json content');

    const { installUblockExtension } = require('../on_Snapshot__03_ublock.js');

    // Stub out the downloader so the test never hits the network.
    const extensionUtils = require('../../chrome_extensions/chrome_extension_utils.js');
    const originalFunc = extensionUtils.loadOrInstallExtension;
    extensionUtils.loadOrInstallExtension = async () => ({
      webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm',
      name: 'ublock',
      version: '1.68.0'
    });

    let result;
    try {
      result = await installUblockExtension();
    } finally {
      // BUGFIX: restore the stub even if the call above throws, so later
      // tests never silently run against the mocked downloader.
      extensionUtils.loadOrInstallExtension = originalFunc;
    }

    assert.notStrictEqual(result, null);
  });

  it('should handle download timeout gracefully', () => {
    // Documents the timeout budget for the multi-MB download.
    const timeoutSeconds = 120; // 2 minutes
    const minTimeout = 30;      // Should allow at least 30 seconds
    assert.ok(timeoutSeconds > minTimeout);
  });
});
|
||||
|
||||
// Shape checks on representative Adblock-syntax filter entries.
describe('filter list validation', () => {
  it('should have valid filter list format', () => {
    const sampleFilters = [
      '||ads.example.com^',
      '||tracker.example.com^$third-party',
      '##.advertisement'
    ];

    // Every entry must be a non-empty string.
    for (const filter of sampleFilters) {
      assert.ok(typeof filter === 'string');
      assert.ok(filter.length > 0);
    }
  });

  it('should support cosmetic filters', () => {
    // Cosmetic (element-hiding) filters start with ##.
    const cosmeticFilter = '##.banner-ad';
    assert.ok(cosmeticFilter.startsWith('##'));
  });

  it('should support network filters', () => {
    // Network filters typically use the || anchor and/or the ^ separator.
    const networkFilter = '||ads.example.com^';
    assert.ok(networkFilter.includes('||') || networkFilter.includes('^'));
  });
});
|
||||
});
|
||||
148
archivebox/plugins/ublock/tests/test_ublock.py
Normal file
148
archivebox/plugins/ublock/tests/test_ublock.py
Normal file
@@ -0,0 +1,148 @@
|
||||
"""
|
||||
Unit tests for ublock plugin
|
||||
|
||||
Tests invoke the plugin hook as an external process and verify outputs/side effects.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# Paths to the plugin under test, resolved relative to this test file.
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = PLUGIN_DIR / "on_Snapshot__03_ublock.js"
|
||||
|
||||
|
||||
def test_install_script_exists():
    """The install hook must be present on disk at the expected path."""
    script = INSTALL_SCRIPT
    assert script.exists(), f"Install script not found: {script}"
|
||||
|
||||
|
||||
def test_extension_metadata():
    """The plugin module must export the uBlock Origin webstore descriptor."""
    with tempfile.TemporaryDirectory() as tmpdir:
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")

        # Load the module in a node one-liner and dump EXTENSION as JSON.
        one_liner = f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"
        proc = subprocess.run(
            ["node", "-e", one_liner],
            capture_output=True,
            text=True,
            env=env,
        )

        assert proc.returncode == 0, f"Failed to load extension metadata: {proc.stderr}"

        metadata = json.loads(proc.stdout)
        assert metadata["webstore_id"] == "cjpalhdlnbpafiamejdnhcphjbkeiagm"
        assert metadata["name"] == "ublock"
|
||||
|
||||
|
||||
def test_install_creates_cache():
    """Running the hook must produce a valid ublock.extension.json cache file."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)

        proc = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=120,  # uBlock is large, may take longer to download
        )

        # Output should mention the extension by name.
        assert "uBlock" in proc.stdout or "ublock" in proc.stdout

        # The cache file must exist and carry the right identifying fields.
        cache_file = ext_dir / "ublock.extension.json"
        assert cache_file.exists(), "Cache file should be created"

        cache_data = json.loads(cache_file.read_text())
        assert cache_data["webstore_id"] == "cjpalhdlnbpafiamejdnhcphjbkeiagm"
        assert cache_data["name"] == "ublock"
|
||||
|
||||
|
||||
def test_install_uses_existing_cache():
    """With a pre-seeded unpacked extension present, the hook must still succeed."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        # Seed a fake unpacked extension directory with a manifest.
        fake_extension_dir = ext_dir / "cjpalhdlnbpafiamejdnhcphjbkeiagm__ublock"
        fake_extension_dir.mkdir(parents=True)
        (fake_extension_dir / "manifest.json").write_text(
            json.dumps({"version": "1.68.0", "name": "uBlock Origin"})
        )

        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)

        proc = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=30,
        )

        # Either the cache is reused or a fresh install succeeds.
        assert proc.returncode == 0
|
||||
|
||||
|
||||
def test_no_configuration_required():
    """The hook must run without any API keys or extra configuration."""
    with tempfile.TemporaryDirectory() as tmpdir:
        ext_dir = Path(tmpdir) / "chrome_extensions"
        ext_dir.mkdir(parents=True)

        # Only the extensions dir is provided — no API keys, since uBlock
        # works out of the box with default filter lists.
        env = os.environ.copy()
        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)

        proc = subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=env,
            timeout=120,
        )

        # The hook must not complain about missing API keys.
        combined_output = proc.stdout + proc.stderr
        assert "API" not in combined_output or proc.returncode == 0
|
||||
|
||||
|
||||
def test_large_extension_size():
    """Test that uBlock Origin is downloaded successfully despite large size"""
    with tempfile.TemporaryDirectory() as tmpdir:
        extensions_dir = Path(tmpdir) / "chrome_extensions"
        extensions_dir.mkdir(parents=True)

        run_env = os.environ.copy()
        run_env["CHROME_EXTENSIONS_DIR"] = str(extensions_dir)

        subprocess.run(
            ["node", str(INSTALL_SCRIPT)],
            capture_output=True,
            text=True,
            env=run_env,
            timeout=120,
        )

        # If the .crx made it to disk, sanity-check its size:
        # uBlock Origin with bundled filter lists is typically 2-5 MB.
        crx_file = extensions_dir / "cjpalhdlnbpafiamejdnhcphjbkeiagm__ublock.crx"
        if crx_file.exists():
            size_bytes = crx_file.stat().st_size
            assert size_bytes > 1_000_000, f"uBlock Origin should be > 1MB, got {size_bytes} bytes"
|
||||
80
archivebox/plugins/wget/config.json
Normal file
80
archivebox/plugins/wget/config.json
Normal file
@@ -0,0 +1,80 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SAVE_WGET": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Enable wget archiving"
|
||||
},
|
||||
"SAVE_WARC": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Save WARC archive file"
|
||||
},
|
||||
"SAVE_WGET_REQUISITES": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"description": "Download page requisites (CSS, JS, images)"
|
||||
},
|
||||
"WGET_BINARY": {
|
||||
"type": "string",
|
||||
"default": "wget",
|
||||
"description": "Path to wget binary"
|
||||
},
|
||||
"WGET_TIMEOUT": {
|
||||
"type": "integer",
|
||||
"default": 60,
|
||||
"minimum": 5,
|
||||
"x-fallback": "TIMEOUT",
|
||||
"description": "Timeout for wget in seconds"
|
||||
},
|
||||
"WGET_USER_AGENT": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-fallback": "USER_AGENT",
|
||||
"description": "User agent string for wget"
|
||||
},
|
||||
"WGET_CHECK_SSL_VALIDITY": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-fallback": "CHECK_SSL_VALIDITY",
|
||||
"x-aliases": ["CHECK_SSL_VALIDITY"],
|
||||
"description": "Whether to verify SSL certificates"
|
||||
},
|
||||
"WGET_COOKIES_FILE": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"x-fallback": "COOKIES_FILE",
|
||||
"description": "Path to cookies file"
|
||||
},
|
||||
"WGET_RESTRICT_FILE_NAMES": {
|
||||
"type": "string",
|
||||
"default": "windows",
|
||||
"enum": ["windows", "unix", "ascii", "nocontrol", "lowercase", "uppercase"],
|
||||
"x-fallback": "RESTRICT_FILE_NAMES",
|
||||
"description": "Filename restriction mode"
|
||||
},
|
||||
"WGET_ARGS": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"default": [
|
||||
"--no-verbose",
|
||||
"--adjust-extension",
|
||||
"--convert-links",
|
||||
"--force-directories",
|
||||
"--backup-converted",
|
||||
"--span-hosts",
|
||||
"--no-parent",
|
||||
"-e", "robots=off"
|
||||
],
|
||||
"description": "Default wget arguments"
|
||||
},
|
||||
"WGET_EXTRA_ARGS": {
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"description": "Extra arguments for wget (space-separated)"
|
||||
}
|
||||
}
|
||||
}
|
||||
136
archivebox/plugins/wget/on_Crawl__00_validate_wget.py
Normal file
136
archivebox/plugins/wget/on_Crawl__00_validate_wget.py
Normal file
@@ -0,0 +1,136 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validation hook for wget binary.
|
||||
|
||||
Runs at crawl start to verify wget is available.
|
||||
Outputs JSONL for InstalledBinary and Machine config updates.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import shutil
|
||||
import hashlib
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_binary_version(abspath: str) -> str | None:
|
||||
"""Get version string from binary."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[abspath, '--version'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
if result.returncode == 0 and result.stdout:
|
||||
# wget version string: "GNU Wget 1.24.5 built on ..."
|
||||
first_line = result.stdout.strip().split('\n')[0]
|
||||
# Extract version number
|
||||
parts = first_line.split()
|
||||
for i, part in enumerate(parts):
|
||||
if part.lower() == 'wget' and i + 1 < len(parts):
|
||||
return parts[i + 1]
|
||||
return first_line[:32]
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def get_binary_hash(abspath: str) -> str | None:
|
||||
"""Get SHA256 hash of binary."""
|
||||
try:
|
||||
with open(abspath, 'rb') as f:
|
||||
return hashlib.sha256(f.read()).hexdigest()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def find_wget() -> dict | None:
    """Locate the wget binary and describe it.

    Resolution order:
      1. abx-pkg's EnvProvider (when abx_pkg is importable)
      2. an explicit WGET_BINARY env var override
      3. the first `wget` found on PATH

    Returns:
        A dict with keys name/abspath/version/sha256/binprovider, or None
        when no usable binary could be found.
    """
    # Try abx-pkg first
    try:
        from abx_pkg import Binary, EnvProvider

        class WgetBinary(Binary):
            name: str = 'wget'
            binproviders_supported = [EnvProvider()]

        binary = WgetBinary()
        loaded = binary.load()
        if loaded and loaded.abspath:
            return {
                'name': 'wget',
                'abspath': str(loaded.abspath),
                'version': str(loaded.version) if loaded.version else None,
                'sha256': loaded.sha256 if hasattr(loaded, 'sha256') else None,
                'binprovider': loaded.binprovider.name if loaded.binprovider else 'env',
            }
    except ImportError:
        # abx_pkg not installed: fall back to a plain PATH lookup below
        pass
    except Exception:
        # abx_pkg lookup failed: treat the same as "not found via abx-pkg"
        pass

    # Fallback: honor an explicit WGET_BINARY override before scanning PATH
    # (matches the precedence used by on_Snapshot__50_wget.py's find_wget).
    abspath = os.environ.get('WGET_BINARY', '') or shutil.which('wget') or ''
    if abspath and Path(abspath).is_file():
        return {
            'name': 'wget',
            'abspath': abspath,
            'version': get_binary_version(abspath),
            'sha256': get_binary_hash(abspath),
            'binprovider': 'env',
        }

    return None
|
||||
|
||||
|
||||
def main():
    """Validate the wget binary and emit JSONL records for the orchestrator.

    On success: prints an InstalledBinary record plus Machine config updates
    (WGET_BINARY and, when known, WGET_VERSION) and exits 0.
    On failure: prints a Dependency request record and exits 1 so a provider
    hook can attempt an installation.
    """
    result = find_wget()

    if result and result.get('abspath'):
        # Output InstalledBinary
        print(json.dumps({
            'type': 'InstalledBinary',
            'name': result['name'],
            'abspath': result['abspath'],
            'version': result['version'],
            'sha256': result['sha256'],
            'binprovider': result['binprovider'],
        }))

        # Output Machine config update
        print(json.dumps({
            'type': 'Machine',
            '_method': 'update',
            'key': 'config/WGET_BINARY',
            'value': result['abspath'],
        }))

        if result['version']:
            print(json.dumps({
                'type': 'Machine',
                '_method': 'update',
                'key': 'config/WGET_VERSION',
                'value': result['version'],
            }))

        sys.exit(0)
    else:
        # Output Dependency request
        print(json.dumps({
            'type': 'Dependency',
            'bin_name': 'wget',
            'bin_providers': 'apt,brew,env',
        }))

        # Exit non-zero to indicate binary not found
        print("wget binary not found", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
130
archivebox/plugins/wget/on_Crawl__00_validate_wget_config.py
Normal file
130
archivebox/plugins/wget/on_Crawl__00_validate_wget_config.py
Normal file
@@ -0,0 +1,130 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validate and compute derived wget config values.
|
||||
|
||||
This hook runs early in the Crawl lifecycle to:
|
||||
1. Validate config values with warnings (not hard errors)
|
||||
2. Compute derived values (USE_WGET from SAVE_WGET/SAVE_WARC)
|
||||
3. Check binary availability and version
|
||||
|
||||
Output:
|
||||
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
|
||||
- InstalledBinary JSONL records to stdout when binaries are found
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
from abx_pkg import Binary, EnvProvider
|
||||
|
||||
|
||||
# Read config from environment (already validated by JSONSchema)
|
||||
def get_env(name: str, default: str = '') -> str:
    """Read an environment variable (falling back to default), whitespace-stripped."""
    raw_value = os.environ.get(name, default)
    return raw_value.strip()
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean env var; unrecognized or missing values yield default."""
    # Inline env read (equivalent to get_env): stripped, lowercased token.
    raw = os.environ.get(name, '').strip().lower()
    if raw in ('true', '1', 'yes', 'on'):
        return True
    if raw in ('false', '0', 'no', 'off'):
        return False
    return default
|
||||
|
||||
def get_env_int(name: str, default: int = 0) -> int:
    """Parse an integer env var; non-numeric or missing values yield default."""
    # Inline env read (equivalent to get_env): stripped string, then int().
    raw = os.environ.get(name, str(default)).strip()
    try:
        return int(raw)
    except ValueError:
        return default
|
||||
|
||||
|
||||
def output_installed_binary(binary: Binary, name: str):
    """Print an InstalledBinary JSONL record for the given loaded binary."""
    # MACHINE_ID is provided by the orchestrator; empty string if absent.
    record = {
        'type': 'InstalledBinary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'env',
        'machine_id': os.environ.get('MACHINE_ID', ''),
    }
    print(json.dumps(record))
|
||||
|
||||
|
||||
def main():
    """Validate wget config, compute derived values, and emit results.

    Prints COMPUTED:KEY=VALUE lines to stdout (parsed by hooks.py and added
    to the env), WARNING:/ERROR: lines to stderr, and an InstalledBinary
    JSONL record when the binary is found. Exits 1 if any hard error occurred.
    """
    warnings = []
    errors = []
    computed = {}

    # Get config values (WGET_TIMEOUT falls back to the global TIMEOUT)
    save_wget = get_env_bool('SAVE_WGET', True)
    save_warc = get_env_bool('SAVE_WARC', True)
    wget_timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60)
    wget_binary = get_env('WGET_BINARY', 'wget')

    # Compute derived values: wget runs if either plain mirroring or WARC capture is on
    use_wget = save_wget or save_warc
    computed['USE_WGET'] = str(use_wget).lower()

    # Validate timeout with warning (not error)
    if use_wget and wget_timeout < 20:
        warnings.append(
            f"WGET_TIMEOUT={wget_timeout} is very low. "
            "wget may fail to archive sites if set to less than ~20 seconds. "
            "Consider setting WGET_TIMEOUT=60 or higher."
        )

    # Check binary availability using abx-pkg's env (PATH) provider
    # NOTE(review): this passes `binproviders=`, while the sibling validate
    # hook subclasses Binary with `binproviders_supported` - confirm which
    # keyword the abx_pkg API expects.
    provider = EnvProvider()
    try:
        binary = Binary(name=wget_binary, binproviders=[provider]).load()
        binary_path = str(binary.abspath) if binary.abspath else ''
    except Exception:
        # load() failing is treated the same as "binary not found"
        binary = None
        binary_path = ''

    if not binary_path:
        if use_wget:
            errors.append(f"WGET_BINARY={wget_binary} not found. Install wget or set SAVE_WGET=false.")
        computed['WGET_BINARY'] = ''
    else:
        computed['WGET_BINARY'] = binary_path
        wget_version = str(binary.version) if binary.version else 'unknown'
        computed['WGET_VERSION'] = wget_version

        # Output InstalledBinary JSONL record
        output_installed_binary(binary, name='wget')

    # Check for compression support (--compression=auto only exists in newer wget builds)
    if computed.get('WGET_BINARY'):
        try:
            result = subprocess.run(
                [computed['WGET_BINARY'], '--compression=auto', '--help'],
                capture_output=True, timeout=5
            )
            computed['WGET_AUTO_COMPRESSION'] = 'true' if result.returncode == 0 else 'false'
        except Exception:
            computed['WGET_AUTO_COMPRESSION'] = 'false'

    # Output results
    # Format: KEY=VALUE lines that hooks.py will parse and add to env
    for key, value in computed.items():
        print(f"COMPUTED:{key}={value}")

    for warning in warnings:
        print(f"WARNING:{warning}", file=sys.stderr)

    for error in errors:
        print(f"ERROR:{error}", file=sys.stderr)

    # Exit with error if any hard errors
    sys.exit(1 if errors else 0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
325
archivebox/plugins/wget/on_Snapshot__50_wget.py
Normal file
325
archivebox/plugins/wget/on_Snapshot__50_wget.py
Normal file
@@ -0,0 +1,325 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Archive a URL using wget.
|
||||
|
||||
Usage: on_Snapshot__wget.py --url=<url> --snapshot-id=<uuid>
|
||||
Output: Downloads files to $PWD
|
||||
|
||||
Environment variables:
|
||||
WGET_BINARY: Path to wget binary (optional, falls back to PATH)
|
||||
WGET_TIMEOUT: Timeout in seconds (default: 60)
|
||||
WGET_USER_AGENT: User agent string
|
||||
WGET_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: True)
|
||||
WGET_COOKIES_FILE: Path to cookies file (optional)
|
||||
WGET_RESTRICT_FILE_NAMES: Filename restriction mode (default: windows)
|
||||
WGET_EXTRA_ARGS: Extra arguments for wget (space-separated)
|
||||
|
||||
# Wget feature toggles
|
||||
SAVE_WGET: Enable wget archiving (default: True)
|
||||
SAVE_WARC: Save WARC file (default: True)
|
||||
SAVE_WGET_REQUISITES: Download page requisites (default: True)
|
||||
|
||||
# Fallback to ARCHIVING_CONFIG values if WGET_* not set:
|
||||
TIMEOUT: Fallback timeout
|
||||
USER_AGENT: Fallback user agent
|
||||
CHECK_SSL_VALIDITY: Fallback SSL check
|
||||
COOKIES_FILE: Fallback cookies file
|
||||
RESTRICT_FILE_NAMES: Fallback filename restriction
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
|
||||
EXTRACTOR_NAME = 'wget'
|
||||
BIN_NAME = 'wget'
|
||||
BIN_PROVIDERS = 'apt,brew,env'
|
||||
OUTPUT_DIR = 'wget'
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
    """Read an environment variable (falling back to default), whitespace-stripped."""
    value = os.environ.get(name, default)
    return value.strip()
|
||||
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse a boolean env var; unrecognized or missing values yield default."""
    # Inline env read (equivalent to get_env): stripped, lowercased token.
    token = os.environ.get(name, '').strip().lower()
    if token in ('true', '1', 'yes', 'on'):
        return True
    if token in ('false', '0', 'no', 'off'):
        return False
    return default
|
||||
|
||||
|
||||
def get_env_int(name: str, default: int = 0) -> int:
    """Parse an integer env var; non-numeric or missing values yield default."""
    # Inline env read (equivalent to get_env): stripped string, then int().
    token = os.environ.get(name, str(default)).strip()
    try:
        return int(token)
    except ValueError:
        return default
|
||||
|
||||
|
||||
# Output directory used by the staticfile extractor, relative to $PWD.
STATICFILE_DIR = 'staticfile'


def has_staticfile_output() -> bool:
    """Return True if the staticfile extractor already produced output here."""
    candidate = Path(STATICFILE_DIR)
    if not candidate.exists():
        return False
    # Directory exists: it counts only if it actually contains something.
    return any(candidate.iterdir())
|
||||
|
||||
|
||||
def find_wget() -> str | None:
    """Resolve the wget binary: explicit WGET_BINARY override first, then PATH."""
    # Inline env read (equivalent to get_env): stripped WGET_BINARY value.
    override = os.environ.get('WGET_BINARY', '').strip()
    if override and os.path.isfile(override):
        return override
    return shutil.which('wget')
|
||||
|
||||
|
||||
def get_version(binary: str) -> str:
    """Return the first line of `<binary> --version` (max 64 chars), or ''."""
    try:
        proc = subprocess.run(
            [binary, '--version'],
            capture_output=True,
            text=True,
            timeout=10,
        )
        first_line = proc.stdout.split('\n')[0]
        return first_line.strip()[:64]
    except Exception:
        return ''
|
||||
|
||||
|
||||
def check_wget_compression(binary: str) -> bool:
    """Return True if the given wget build accepts --compression=auto."""
    try:
        proc = subprocess.run(
            [binary, '--compression=auto', '--help'],
            capture_output=True,
            timeout=5,
        )
    except Exception:
        # Missing binary / timeout / any failure => assume no support.
        return False
    return proc.returncode == 0
|
||||
|
||||
|
||||
# Default wget args (from old WGET_CONFIG)
WGET_DEFAULT_ARGS = [
    '--no-verbose',
    '--adjust-extension',
    '--convert-links',
    '--force-directories',
    '--backup-converted',
    '--span-hosts',
    '--no-parent',
    '-e', 'robots=off',
]


def save_wget(url: str, binary: str) -> tuple[bool, str | None, str]:
    """
    Archive URL using wget.

    Downloads into the current working directory; when SAVE_WARC is enabled,
    a WARC file is also written under ./warc/.

    Args:
        url: The URL to archive.
        binary: Absolute path to the wget executable.

    Returns: (success, output_path, error_message), where output_path is the
    main HTML file when one was downloaded, else the first downloaded file.
    """
    # Get config from env (with WGET_ prefix or fallback to ARCHIVING_CONFIG style)
    timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60)
    user_agent = get_env('WGET_USER_AGENT') or get_env('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
    check_ssl = get_env_bool('WGET_CHECK_SSL_VALIDITY', get_env_bool('CHECK_SSL_VALIDITY', True))
    cookies_file = get_env('WGET_COOKIES_FILE') or get_env('COOKIES_FILE', '')
    restrict_names = get_env('WGET_RESTRICT_FILE_NAMES') or get_env('RESTRICT_FILE_NAMES', 'windows')
    extra_args = get_env('WGET_EXTRA_ARGS', '')

    # Feature toggles
    save_warc = get_env_bool('SAVE_WARC', True)
    save_requisites = get_env_bool('SAVE_WGET_REQUISITES', True)

    # Check for compression support (--compression=auto needs a newer wget)
    supports_compression = check_wget_compression(binary)

    # Build wget command (later options take precedence)
    cmd = [
        binary,
        *WGET_DEFAULT_ARGS,
        f'--timeout={timeout}',
        '--tries=2',
    ]

    if user_agent:
        cmd.append(f'--user-agent={user_agent}')

    if restrict_names:
        cmd.append(f'--restrict-file-names={restrict_names}')

    if save_requisites:
        cmd.append('--page-requisites')

    if save_warc:
        warc_dir = Path('warc')
        warc_dir.mkdir(exist_ok=True)
        # Timestamped basename; wget appends the .warc(.gz) extension itself
        warc_path = warc_dir / str(int(datetime.now(timezone.utc).timestamp()))
        cmd.append(f'--warc-file={warc_path}')
    else:
        cmd.append('--timestamping')

    if cookies_file and Path(cookies_file).is_file():
        cmd.extend(['--load-cookies', cookies_file])

    if supports_compression:
        cmd.append('--compression=auto')

    if not check_ssl:
        cmd.extend(['--no-check-certificate', '--no-hsts'])

    if extra_args:
        cmd.extend(extra_args.split())

    cmd.append(url)

    # Run wget
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            timeout=timeout * 2,  # Allow extra time for large downloads
        )

        # Find downloaded files (everything except the WARC output)
        downloaded_files = [
            f for f in Path('.').rglob('*')
            if f.is_file() and f.name != '.gitkeep' and not str(f).startswith('warc/')
        ]

        if not downloaded_files:
            stderr = result.stderr.decode('utf-8', errors='replace')
            stdout = result.stdout.decode('utf-8', errors='replace')
            combined = stderr + stdout

            # Map common HTTP failures to short, actionable error messages
            if '403' in combined or 'Forbidden' in combined:
                return False, None, '403 Forbidden (try changing USER_AGENT)'
            elif '404' in combined or 'Not Found' in combined:
                return False, None, '404 Not Found'
            elif '500' in combined:
                return False, None, '500 Internal Server Error'
            else:
                return False, None, f'No files downloaded: {stderr[:200]}'

        # Find main HTML file (.htm/.html/.shtml, case-insensitive)
        html_files = [
            f for f in downloaded_files
            if re.search(r'\.[Ss]?[Hh][Tt][Mm][Ll]?$', str(f))
        ]
        output_path = str(html_files[0]) if html_files else str(downloaded_files[0])

        # (removed: dead locals `output_tail` and `files_count` that were
        # computed from the wget output but never used)
        return True, output_path, ''

    except subprocess.TimeoutExpired:
        return False, None, f'Timed out after {timeout * 2} seconds'
    except Exception as e:
        return False, None, f'{type(e).__name__}: {e}'
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL to archive')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Archive a URL using wget.

    Prints KEY=VALUE progress lines plus a final RESULT_JSON line for the
    orchestrator to parse. Exits 0 on success or skip, 1 on failure.
    """
    start_ts = datetime.now(timezone.utc)
    version = ''
    output = None
    status = 'failed'
    error = ''
    binary = None
    cmd_str = ''

    try:
        # Check if wget is enabled
        if not get_env_bool('SAVE_WGET', True):
            print('Skipping wget (SAVE_WGET=False)')
            status = 'skipped'
            end_ts = datetime.now(timezone.utc)
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={end_ts.isoformat()}')
            print(f'STATUS={status}')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)

        # Check if staticfile extractor already handled this (permanent skip)
        if has_staticfile_output():
            print(f'Skipping wget - staticfile extractor already downloaded this')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)  # Permanent skip - staticfile already handled

        # Find binary; report the dependency so a provider hook can install it
        binary = find_wget()
        if not binary:
            print(f'ERROR: {BIN_NAME} binary not found', file=sys.stderr)
            print(f'DEPENDENCY_NEEDED={BIN_NAME}', file=sys.stderr)
            print(f'BIN_PROVIDERS={BIN_PROVIDERS}', file=sys.stderr)
            print(f'INSTALL_HINT=apt install wget OR brew install wget', file=sys.stderr)
            sys.exit(1)

        version = get_version(binary)
        cmd_str = f'{binary} ... {url}'

        # Run extraction
        success, output, error = save_wget(url, binary)
        status = 'succeeded' if success else 'failed'

        if success:
            # Count downloaded files (everything under $PWD, including warc/)
            files = list(Path('.').rglob('*'))
            file_count = len([f for f in files if f.is_file()])
            print(f'wget completed: {file_count} files downloaded')

    except Exception as e:
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

    # Print results
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()

    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if cmd_str:
        print(f'CMD={cmd_str}')
    if version:
        print(f'VERSION={version}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')

    if error:
        print(f'ERROR={error}', file=sys.stderr)

    # Print JSON result
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'cmd_version': version,
        'output': output,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')

    sys.exit(0 if status == 'succeeded' else 1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
369
archivebox/plugins/wget/tests/test_wget.py
Normal file
369
archivebox/plugins/wget/tests/test_wget.py
Normal file
@@ -0,0 +1,369 @@
|
||||
"""
|
||||
Integration tests for wget plugin
|
||||
|
||||
Tests verify:
|
||||
1. Plugin reports missing dependency correctly
|
||||
2. wget can be installed via brew/apt provider hooks
|
||||
3. Config options work (SAVE_WGET, SAVE_WARC, etc.)
|
||||
4. Extraction works against real example.com
|
||||
5. Output files contain actual page content
|
||||
6. Skip cases work (SAVE_WGET=False, staticfile present)
|
||||
7. Failure cases handled (404, network errors)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = PLUGIN_DIR.parent
|
||||
WGET_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_wget.py'))
|
||||
BREW_HOOK = PLUGINS_ROOT / 'brew' / 'on_Dependency__install_using_brew_provider.py'
|
||||
APT_HOOK = PLUGINS_ROOT / 'apt' / 'on_Dependency__install_using_apt_provider.py'
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
|
||||
def test_hook_script_exists():
    """Verify hook script exists."""
    # Sanity check: the glob in module setup must have matched a real hook file.
    assert WGET_HOOK.exists(), f"Hook script not found: {WGET_HOOK}"
|
||||
|
||||
|
||||
def test_reports_missing_dependency_when_not_installed():
    """Test that script reports DEPENDENCY_NEEDED when wget is not found."""
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)

        # Run with empty PATH so binary won't be found
        bare_env = {'PATH': '/nonexistent', 'HOME': str(workdir)}

        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test123'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=bare_env,
        )

        # Should fail and report missing dependency
        assert proc.returncode != 0, "Should exit non-zero when dependency missing"
        merged = proc.stdout + proc.stderr
        assert 'DEPENDENCY_NEEDED' in merged, "Should output DEPENDENCY_NEEDED"
        assert 'wget' in merged.lower(), "Should mention wget"
        assert 'BIN_PROVIDERS' in merged, "Should report available providers (apt,brew,env)"
|
||||
|
||||
|
||||
def test_can_install_wget_via_provider():
    """Test that wget can be installed via brew/apt provider hooks."""

    # Determine which provider to use (prefer brew, then apt; skip otherwise)
    if shutil.which('brew'):
        provider_hook = BREW_HOOK
        provider_name = 'brew'
    elif shutil.which('apt-get'):
        provider_hook = APT_HOOK
        provider_name = 'apt'
    else:
        pytest.skip("Neither brew nor apt available on this system")

    assert provider_hook.exists(), f"Provider hook not found: {provider_hook}"

    # Test installation via provider hook
    dependency_id = str(uuid.uuid4())

    result = subprocess.run(
        [
            sys.executable,
            str(provider_hook),
            '--dependency-id', dependency_id,
            '--bin-name', 'wget',
            '--bin-providers', 'apt,brew,env'
        ],
        capture_output=True,
        text=True,
        timeout=300  # Installation can take time
    )

    # Should succeed (wget installs successfully or is already installed)
    assert result.returncode == 0, f"{provider_name} install failed: {result.stderr}"

    # Should output InstalledBinary JSONL record
    assert 'InstalledBinary' in result.stdout or 'wget' in result.stderr, \
        f"Should output installation info: stdout={result.stdout}, stderr={result.stderr}"

    # Parse JSONL if present (non-JSON stdout lines are skipped silently)
    if result.stdout.strip():
        for line in result.stdout.strip().split('\n'):
            try:
                record = json.loads(line)
                if record.get('type') == 'InstalledBinary':
                    assert record['name'] == 'wget'
                    assert record['binprovider'] in ['brew', 'apt']
                    assert record['abspath'], "Should have binary path"
                    assert Path(record['abspath']).exists(), f"Binary should exist at {record['abspath']}"
                    break
            except json.JSONDecodeError:
                continue

    # Verify wget is now available on PATH
    result = subprocess.run(['which', 'wget'], capture_output=True, text=True)
    assert result.returncode == 0, "wget should be available after installation"
|
||||
|
||||
|
||||
def test_archives_example_com():
    """Test full workflow: ensure wget installed then archive example.com."""

    # First ensure wget is installed via provider (brew preferred, then apt)
    if shutil.which('brew'):
        provider_hook = BREW_HOOK
    elif shutil.which('apt-get'):
        provider_hook = APT_HOOK
    else:
        pytest.skip("Neither brew nor apt available")

    # Run installation (idempotent - will succeed if already installed)
    install_result = subprocess.run(
        [
            sys.executable,
            str(provider_hook),
            '--dependency-id', str(uuid.uuid4()),
            '--bin-name', 'wget',
            '--bin-providers', 'apt,brew,env'
        ],
        capture_output=True,
        text=True,
        timeout=300
    )

    if install_result.returncode != 0:
        pytest.skip(f"Could not install wget: {install_result.stderr}")

    # Now test archiving (NOTE: hits the real network - example.com)
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Run wget extraction
        result = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=120
        )

        assert result.returncode == 0, f"Extraction failed: {result.stderr}"

        # Verify output in stdout
        assert 'STATUS=succeeded' in result.stdout, "Should report success"
        assert 'wget completed' in result.stdout, "Should report completion"

        # Verify files were downloaded
        downloaded_files = list(tmpdir.rglob('*.html')) + list(tmpdir.rglob('*.htm'))
        assert len(downloaded_files) > 0, "No HTML files downloaded"

        # Find main HTML file (should contain example.com page text)
        main_html = None
        for html_file in downloaded_files:
            content = html_file.read_text(errors='ignore')
            if 'example domain' in content.lower():
                main_html = html_file
                break

        assert main_html is not None, "Could not find main HTML file with example.com content"

        # Verify HTML content contains REAL example.com text
        html_content = main_html.read_text(errors='ignore')
        assert len(html_content) > 200, f"HTML content too short: {len(html_content)} bytes"
        assert 'example domain' in html_content.lower(), "Missing 'Example Domain' in HTML"
        assert ('this domain' in html_content.lower() or
                'illustrative examples' in html_content.lower()), \
            "Missing example.com description text"
        assert ('iana' in html_content.lower() or
                'more information' in html_content.lower()), \
            "Missing IANA reference"

        # Verify RESULT_JSON is present and valid
        assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON"

        for line in result.stdout.split('\n'):
            if line.startswith('RESULT_JSON='):
                result_json = json.loads(line.replace('RESULT_JSON=', ''))
                assert result_json['extractor'] == 'wget'
                assert result_json['status'] == 'succeeded'
                assert result_json['url'] == TEST_URL
                assert result_json['snapshot_id'] == 'test789'
                assert 'duration' in result_json
                assert result_json['duration'] >= 0
                break
|
||||
|
||||
|
||||
def test_config_save_wget_false_skips():
    """Test that SAVE_WGET=False causes skip."""
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)

        # Disable the extractor via config
        run_env = os.environ.copy()
        run_env['SAVE_WGET'] = 'False'

        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'test999'],
            cwd=workdir,
            capture_output=True,
            text=True,
            env=run_env,
            timeout=30,
        )

        # Should succeed but skip
        assert proc.returncode == 0, f"Should exit 0 when skipping: {proc.stderr}"
        assert 'STATUS=skipped' in proc.stdout, "Should report skipped status"
        assert 'SAVE_WGET=False' in proc.stdout, "Should mention SAVE_WGET=False"
|
||||
|
||||
|
||||
def test_config_save_warc():
    """Test that SAVE_WARC=True creates WARC files."""
    # Hook shells out to wget; nothing to test without the binary.
    if not shutil.which('wget'):
        pytest.skip("wget not installed")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Explicitly enable WARC output for this run.
        env = os.environ.copy()
        env['SAVE_WARC'] = 'True'

        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testwarc'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=120,
        )

        # NOTE(review): assertions only run when the hook succeeded AND warc/
        # exists, so a network failure makes this test pass vacuously — confirm
        # that is intended.
        if proc.returncode == 0:
            warc_dir = tmpdir / 'warc'
            if warc_dir.exists():
                warc_files = [f for f in warc_dir.rglob('*') if f.is_file()]
                assert len(warc_files) > 0, "WARC file not created when SAVE_WARC=True"
|
||||
|
||||
|
||||
def test_staticfile_present_skips():
    """Test that wget skips when staticfile already downloaded."""
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Pre-create the staticfile extractor's output to simulate that it
        # already ran for this snapshot.
        staticfile_dir = tmpdir / 'staticfile'
        staticfile_dir.mkdir()
        (staticfile_dir / 'index.html').write_text('<html>test</html>')

        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'teststatic'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=30,
        )

        # Hook should detect the prior output and skip cleanly.
        assert proc.returncode == 0, "Should exit 0 when skipping"
        assert 'STATUS=skipped' in proc.stdout, "Should report skipped status"
        assert 'staticfile' in proc.stdout.lower(), "Should mention staticfile"
|
||||
|
||||
|
||||
def test_handles_404_gracefully():
    """Test that wget fails gracefully on 404."""
    if not shutil.which('wget'):
        pytest.skip("wget not installed")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Request a page that does not exist so wget gets an HTTP error.
        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', 'https://example.com/nonexistent-page-404', '--snapshot-id', 'test404'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60,
        )

        # The hook should propagate the failure via a non-zero exit code and
        # surface the cause somewhere in its output.
        assert proc.returncode != 0, "Should fail on 404"
        combined = proc.stdout + proc.stderr
        assert '404' in combined or 'Not Found' in combined or 'No files downloaded' in combined, \
            "Should report 404 or no files downloaded"
|
||||
|
||||
|
||||
def test_config_timeout_honored():
    """Test that WGET_TIMEOUT config is respected."""
    if not shutil.which('wget'):
        pytest.skip("wget not installed")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Give the hook a very short per-request timeout; example.com is fast
        # enough that the fetch should still complete within it.
        env = os.environ.copy()
        env['WGET_TIMEOUT'] = '5'

        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testtimeout'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=30,
        )

        # The point is that the hook terminated (did not hang past the outer
        # 30s subprocess timeout); either outcome code is acceptable.
        assert proc.returncode in (0, 1), "Should complete (success or fail)"
|
||||
|
||||
|
||||
def test_config_user_agent():
    """Test that WGET_USER_AGENT config is used."""
    if not shutil.which('wget'):
        pytest.skip("wget not installed")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

        # Point the hook at a custom User-Agent string.
        env = os.environ.copy()
        env['WGET_USER_AGENT'] = 'TestBot/1.0'

        proc = subprocess.run(
            [sys.executable, str(WGET_HOOK), '--url', TEST_URL, '--snapshot-id', 'testua'],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            env=env,
            timeout=120,
        )

        # NOTE(review): only checks the success path, so this passes vacuously
        # when the fetch fails, and it never verifies the UA header was
        # actually sent — confirm whether a stricter check is wanted.
        if proc.returncode == 0:
            assert 'STATUS=succeeded' in proc.stdout
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Allow running this test module directly (python thisfile.py) without
    # invoking pytest from the command line.
    pytest.main([__file__, '-v'])
|
||||
Reference in New Issue
Block a user