mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 07:17:52 +10:00
wip major changes
This commit is contained in:
262
archivebox/plugins/title/on_Snapshot__32_title.js
Normal file
262
archivebox/plugins/title/on_Snapshot__32_title.js
Normal file
@@ -0,0 +1,262 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Extract the title of a URL.
|
||||
*
|
||||
* If a Chrome session exists (from chrome_session extractor), connects to it via CDP
|
||||
* to get the page title (which includes JS-rendered content).
|
||||
* Otherwise falls back to fetching the URL and parsing HTML.
|
||||
*
|
||||
 * Usage: on_Snapshot__32_title.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes title/title.txt
|
||||
*
|
||||
* Environment variables:
|
||||
* TIMEOUT: Timeout in seconds (default: 30)
|
||||
* USER_AGENT: User agent string (optional)
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const https = require('https');
|
||||
const http = require('http');
|
||||
|
||||
// Extractor metadata
|
||||
const EXTRACTOR_NAME = 'title';
|
||||
const OUTPUT_DIR = 'title';
|
||||
const OUTPUT_FILE = 'title.txt';
|
||||
const CHROME_SESSION_DIR = 'chrome_session';
|
||||
|
||||
// Parse command line arguments
|
||||
function parseArgs() {
|
||||
const args = {};
|
||||
process.argv.slice(2).forEach(arg => {
|
||||
if (arg.startsWith('--')) {
|
||||
const [key, ...valueParts] = arg.slice(2).split('=');
|
||||
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
|
||||
}
|
||||
});
|
||||
return args;
|
||||
}
|
||||
|
||||
// Get environment variable with default
|
||||
function getEnv(name, defaultValue = '') {
|
||||
return (process.env[name] || defaultValue).trim();
|
||||
}
|
||||
|
||||
function getEnvInt(name, defaultValue = 0) {
|
||||
const val = parseInt(getEnv(name, String(defaultValue)), 10);
|
||||
return isNaN(val) ? defaultValue : val;
|
||||
}
|
||||
|
||||
// Get CDP URL from chrome_session if available
|
||||
function getCdpUrl() {
|
||||
const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
|
||||
if (fs.existsSync(cdpFile)) {
|
||||
return fs.readFileSync(cdpFile, 'utf8').trim();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// Extract title from HTML
|
||||
function extractTitleFromHtml(html) {
|
||||
// Try <title> tag
|
||||
const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
|
||||
if (titleMatch) {
|
||||
return titleMatch[1].trim();
|
||||
}
|
||||
|
||||
// Try og:title
|
||||
const ogMatch = html.match(/<meta[^>]+property=["']og:title["'][^>]+content=["']([^"']+)["']/i);
|
||||
if (ogMatch) {
|
||||
return ogMatch[1].trim();
|
||||
}
|
||||
|
||||
// Try twitter:title
|
||||
const twitterMatch = html.match(/<meta[^>]+name=["']twitter:title["'][^>]+content=["']([^"']+)["']/i);
|
||||
if (twitterMatch) {
|
||||
return twitterMatch[1].trim();
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
// Fetch URL and extract title (fallback method)
|
||||
function fetchTitle(url) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const timeout = getEnvInt('TIMEOUT', 30) * 1000;
|
||||
const userAgent = getEnv('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)');
|
||||
|
||||
const client = url.startsWith('https') ? https : http;
|
||||
|
||||
const req = client.get(url, {
|
||||
headers: { 'User-Agent': userAgent },
|
||||
timeout,
|
||||
}, (res) => {
|
||||
// Handle redirects
|
||||
if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
|
||||
fetchTitle(res.headers.location).then(resolve).catch(reject);
|
||||
return;
|
||||
}
|
||||
|
||||
let data = '';
|
||||
res.on('data', chunk => {
|
||||
data += chunk;
|
||||
// Only need first 64KB to find title
|
||||
if (data.length > 65536) {
|
||||
req.destroy();
|
||||
}
|
||||
});
|
||||
res.on('end', () => {
|
||||
const title = extractTitleFromHtml(data);
|
||||
if (title) {
|
||||
resolve(title);
|
||||
} else {
|
||||
reject(new Error('No title found in HTML'));
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
req.on('error', reject);
|
||||
req.on('timeout', () => {
|
||||
req.destroy();
|
||||
reject(new Error('Request timeout'));
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// Get title using Puppeteer CDP connection
|
||||
async function getTitleFromCdp(cdpUrl) {
|
||||
const puppeteer = require('puppeteer-core');
|
||||
|
||||
const browser = await puppeteer.connect({
|
||||
browserWSEndpoint: cdpUrl,
|
||||
});
|
||||
|
||||
try {
|
||||
// Get existing pages
|
||||
const pages = await browser.pages();
|
||||
const page = pages.find(p => p.url().startsWith('http')) || pages[0];
|
||||
|
||||
if (!page) {
|
||||
throw new Error('No page found in Chrome session');
|
||||
}
|
||||
|
||||
// Get title from page
|
||||
const title = await page.title();
|
||||
|
||||
if (!title) {
|
||||
// Try getting from DOM directly
|
||||
const domTitle = await page.evaluate(() => {
|
||||
return document.title ||
|
||||
document.querySelector('meta[property="og:title"]')?.content ||
|
||||
document.querySelector('meta[name="twitter:title"]')?.content ||
|
||||
document.querySelector('h1')?.textContent?.trim();
|
||||
});
|
||||
return domTitle;
|
||||
}
|
||||
|
||||
return title;
|
||||
} finally {
|
||||
// Disconnect without closing browser
|
||||
browser.disconnect();
|
||||
}
|
||||
}
|
||||
|
||||
async function extractTitle(url) {
|
||||
// Create output directory
|
||||
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||
}
|
||||
const outputPath = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||
|
||||
// Try Chrome session first
|
||||
const cdpUrl = getCdpUrl();
|
||||
if (cdpUrl) {
|
||||
try {
|
||||
const title = await getTitleFromCdp(cdpUrl);
|
||||
if (title) {
|
||||
fs.writeFileSync(outputPath, title, 'utf8');
|
||||
return { success: true, output: outputPath, title, method: 'cdp' };
|
||||
}
|
||||
} catch (e) {
|
||||
console.error(`CDP title extraction failed: ${e.message}, falling back to HTTP`);
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to HTTP fetch
|
||||
try {
|
||||
const title = await fetchTitle(url);
|
||||
fs.writeFileSync(outputPath, title, 'utf8');
|
||||
return { success: true, output: outputPath, title, method: 'http' };
|
||||
} catch (e) {
|
||||
return { success: false, error: e.message };
|
||||
}
|
||||
}
|
||||
|
||||
async function main() {
|
||||
const args = parseArgs();
|
||||
const url = args.url;
|
||||
const snapshotId = args.snapshot_id;
|
||||
|
||||
if (!url || !snapshotId) {
|
||||
console.error('Usage: on_Snapshot__10_title.js --url=<url> --snapshot-id=<uuid>');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const startTs = new Date();
|
||||
let status = 'failed';
|
||||
let output = null;
|
||||
let error = '';
|
||||
|
||||
try {
|
||||
const result = await extractTitle(url);
|
||||
|
||||
if (result.success) {
|
||||
status = 'succeeded';
|
||||
output = result.output;
|
||||
console.log(`Title extracted (${result.method}): ${result.title}`);
|
||||
} else {
|
||||
status = 'failed';
|
||||
error = result.error;
|
||||
}
|
||||
} catch (e) {
|
||||
error = `${e.name}: ${e.message}`;
|
||||
status = 'failed';
|
||||
}
|
||||
|
||||
const endTs = new Date();
|
||||
const duration = (endTs - startTs) / 1000;
|
||||
|
||||
// Print results
|
||||
console.log(`START_TS=${startTs.toISOString()}`);
|
||||
console.log(`END_TS=${endTs.toISOString()}`);
|
||||
console.log(`DURATION=${duration.toFixed(2)}`);
|
||||
if (output) {
|
||||
console.log(`OUTPUT=${output}`);
|
||||
}
|
||||
console.log(`STATUS=${status}`);
|
||||
|
||||
if (error) {
|
||||
console.error(`ERROR=${error}`);
|
||||
}
|
||||
|
||||
// Print JSON result
|
||||
const resultJson = {
|
||||
extractor: EXTRACTOR_NAME,
|
||||
url,
|
||||
snapshot_id: snapshotId,
|
||||
status,
|
||||
start_ts: startTs.toISOString(),
|
||||
end_ts: endTs.toISOString(),
|
||||
duration: Math.round(duration * 100) / 100,
|
||||
output,
|
||||
error: error || null,
|
||||
};
|
||||
console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
|
||||
|
||||
process.exit(status === 'succeeded' ? 0 : 1);
|
||||
}
|
||||
|
||||
main().catch(e => {
|
||||
console.error(`Fatal error: ${e.message}`);
|
||||
process.exit(1);
|
||||
});
|
||||
241
archivebox/plugins/title/tests/test_title.py
Normal file
241
archivebox/plugins/title/tests/test_title.py
Normal file
@@ -0,0 +1,241 @@
|
||||
"""
|
||||
Integration tests for title plugin
|
||||
|
||||
Tests verify:
|
||||
1. Plugin script exists
|
||||
2. Node.js is available
|
||||
3. Title extraction works for real example.com
|
||||
4. Output file contains actual page title
|
||||
5. Handles various title sources (<title>, og:title, twitter:title)
|
||||
6. Config options work (TIMEOUT, USER_AGENT)
|
||||
7. Fallback to HTTP when chrome_session not available
|
||||
"""
|
||||
|
||||
import os
import shutil
import subprocess
import tempfile
from pathlib import Path

import pytest
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
TITLE_HOOK = PLUGIN_DIR / 'on_Snapshot__32_title.js'
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
|
||||
def test_hook_script_exists():
|
||||
"""Verify hook script exists."""
|
||||
assert TITLE_HOOK.exists(), f"Hook script not found: {TITLE_HOOK}"
|
||||
|
||||
|
||||
def test_extracts_title_from_example_com():
|
||||
"""Test full workflow: extract title from real example.com."""
|
||||
|
||||
# Check node is available
|
||||
if not shutil.which('node'):
|
||||
pytest.skip("node not installed")
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Run title extraction
|
||||
result = subprocess.run(
|
||||
['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
|
||||
|
||||
# Verify output in stdout
|
||||
assert 'STATUS=succeeded' in result.stdout, "Should report success"
|
||||
assert 'Title extracted' in result.stdout, "Should report completion"
|
||||
|
||||
# Verify output directory created
|
||||
title_dir = tmpdir / 'title'
|
||||
assert title_dir.exists(), "Output directory not created"
|
||||
|
||||
# Verify output file exists
|
||||
title_file = title_dir / 'title.txt'
|
||||
assert title_file.exists(), "title.txt not created"
|
||||
|
||||
# Verify title contains REAL example.com title
|
||||
title_text = title_file.read_text().strip()
|
||||
assert len(title_text) > 0, "Title should not be empty"
|
||||
assert 'example' in title_text.lower(), "Title should contain 'example'"
|
||||
|
||||
# example.com has title "Example Domain"
|
||||
assert 'example domain' in title_text.lower(), f"Expected 'Example Domain', got: {title_text}"
|
||||
|
||||
# Verify RESULT_JSON is present
|
||||
assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON"
|
||||
|
||||
|
||||
def test_falls_back_to_http_when_chrome_session_unavailable():
|
||||
"""Test that title plugin falls back to HTTP when chrome_session unavailable."""
|
||||
|
||||
if not shutil.which('node'):
|
||||
pytest.skip("node not installed")
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Don't create chrome_session directory - force HTTP fallback
|
||||
|
||||
# Run title extraction
|
||||
result = subprocess.run(
|
||||
['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testhttp'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
|
||||
assert 'STATUS=succeeded' in result.stdout, "Should report success"
|
||||
|
||||
# Verify output exists and has real title
|
||||
output_title_file = tmpdir / 'title' / 'title.txt'
|
||||
assert output_title_file.exists(), "Output title.txt not created"
|
||||
|
||||
title_text = output_title_file.read_text().strip()
|
||||
assert 'example' in title_text.lower()
|
||||
|
||||
|
||||
def test_config_timeout_honored():
|
||||
"""Test that TIMEOUT config is respected."""
|
||||
|
||||
if not shutil.which('node'):
|
||||
pytest.skip("node not installed")
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Set very short timeout (but example.com should still succeed)
|
||||
import os
|
||||
env = os.environ.copy()
|
||||
env['TIMEOUT'] = '5'
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testtimeout'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=30
|
||||
)
|
||||
|
||||
# Should complete (success or fail, but not hang)
|
||||
assert result.returncode in (0, 1), "Should complete without hanging"
|
||||
|
||||
|
||||
def test_config_user_agent():
|
||||
"""Test that USER_AGENT config is used."""
|
||||
|
||||
if not shutil.which('node'):
|
||||
pytest.skip("node not installed")
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Set custom user agent
|
||||
import os
|
||||
env = os.environ.copy()
|
||||
env['USER_AGENT'] = 'TestBot/1.0'
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(TITLE_HOOK), f'--url={TEST_URL}', '--snapshot-id=testua'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=env,
|
||||
timeout=60
|
||||
)
|
||||
|
||||
# Should succeed (example.com doesn't block)
|
||||
if result.returncode == 0:
|
||||
assert 'STATUS=succeeded' in result.stdout
|
||||
|
||||
|
||||
def test_handles_https_urls():
|
||||
"""Test that HTTPS URLs work correctly."""
|
||||
|
||||
if not shutil.which('node'):
|
||||
pytest.skip("node not installed")
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(TITLE_HOOK), '--url=https://example.org', '--snapshot-id=testhttps'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60
|
||||
)
|
||||
|
||||
if result.returncode == 0:
|
||||
output_title_file = tmpdir / 'title' / 'title.txt'
|
||||
if output_title_file.exists():
|
||||
title_text = output_title_file.read_text().strip()
|
||||
assert len(title_text) > 0, "Title should not be empty"
|
||||
assert 'example' in title_text.lower()
|
||||
|
||||
|
||||
def test_handles_404_gracefully():
|
||||
"""Test that title plugin handles 404 pages.
|
||||
|
||||
Note: example.com returns valid HTML even for 404 pages, so extraction may succeed
|
||||
with the generic "Example Domain" title.
|
||||
"""
|
||||
|
||||
if not shutil.which('node'):
|
||||
pytest.skip("node not installed")
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(TITLE_HOOK), '--url=https://example.com/nonexistent-page-404', '--snapshot-id=test404'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60
|
||||
)
|
||||
|
||||
# May succeed or fail depending on server behavior
|
||||
# example.com returns "Example Domain" even for 404s
|
||||
assert result.returncode in (0, 1), "Should complete (may succeed or fail)"
|
||||
|
||||
|
||||
def test_handles_redirects():
|
||||
"""Test that title plugin handles redirects correctly."""
|
||||
|
||||
if not shutil.which('node'):
|
||||
pytest.skip("node not installed")
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# http://example.com redirects to https://example.com
|
||||
result = subprocess.run(
|
||||
['node', str(TITLE_HOOK), '--url=http://example.com', '--snapshot-id=testredirect'],
|
||||
cwd=tmpdir,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60
|
||||
)
|
||||
|
||||
# Should succeed and follow redirect
|
||||
if result.returncode == 0:
|
||||
output_title_file = tmpdir / 'title' / 'title.txt'
|
||||
if output_title_file.exists():
|
||||
title_text = output_title_file.read_text().strip()
|
||||
assert 'example' in title_text.lower()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
Reference in New Issue
Block a user