way better plugin hooks system wip

2026-04-05 15:27:53 +10:00 · 2025-12-28 03:39:59 -08:00
parent a38624a4dd
commit 50e527ec65
156 changed files with 10275 additions and 7149 deletions
--- a/archivebox/plugins/headers/on_Snapshot__33_headers.js
+++ b/archivebox/plugins/headers/on_Snapshot__33_headers.js
@@ -2,8 +2,8 @@
 /**
 * Extract HTTP response headers for a URL.
 *
- * If a Chrome session exists (from chrome_session extractor), reads the captured
- * response headers from chrome_session/response_headers.json.
+ * If a Chrome session exists (from chrome plugin), reads the captured
+ * response headers from ../chrome/response_headers.json.
 * Otherwise falls back to making an HTTP HEAD request.
 *
 * Usage: on_Snapshot__12_headers.js --url=<url> --snapshot-id=<uuid>
@@ -24,7 +24,7 @@ const http = require('http');
 const EXTRACTOR_NAME = 'headers';
 const OUTPUT_DIR = '.';
 const OUTPUT_FILE = 'headers.json';
-const CHROME_SESSION_DIR = '../chrome_session';
+const CHROME_SESSION_DIR = '../chrome';
 const CHROME_HEADERS_FILE = 'response_headers.json';

 // Parse command line arguments
@@ -56,7 +56,7 @@ function getEnvInt(name, defaultValue = 0) {
    return isNaN(val) ? defaultValue : val;
 }

-// Get headers from chrome_session if available
+// Get headers from chrome plugin if available
 function getHeadersFromChromeSession() {
    const headersFile = path.join(CHROME_SESSION_DIR, CHROME_HEADERS_FILE);
    if (fs.existsSync(headersFile)) {
@@ -117,7 +117,7 @@ async function extractHeaders(url) {
    const chromeHeaders = getHeadersFromChromeSession();
    if (chromeHeaders && chromeHeaders.headers) {
        fs.writeFileSync(outputPath, JSON.stringify(chromeHeaders, null, 2), 'utf8');
-        return { success: true, output: outputPath, method: 'chrome_session', status: chromeHeaders.status };
+        return { success: true, output: outputPath, method: 'chrome', status: chromeHeaders.status };
    }

    // Fallback to HTTP HEAD request
--- a/archivebox/plugins/headers/tests/test_headers.py
+++ b/archivebox/plugins/headers/tests/test_headers.py
@@ -75,16 +75,24 @@ def test_extracts_headers_from_example_com():

        assert result.returncode == 0, f"Extraction failed: {result.stderr}"

-        # Verify output in stdout
-        assert 'STATUS=succeeded' in result.stdout, "Should report success"
-        assert 'Headers extracted' in result.stdout, "Should report completion"
+        # Parse clean JSONL output
+        result_json = None
+        for line in result.stdout.strip().split('\n'):
+            line = line.strip()
+            if line.startswith('{'):
+                try:
+                    record = json.loads(line)
+                    if record.get('type') == 'ArchiveResult':
+                        result_json = record
+                        break
+                except json.JSONDecodeError:
+                    pass

-        # Verify output directory created
-        headers_dir = tmpdir / 'headers'
-        assert headers_dir.exists(), "Output directory not created"
+        assert result_json, "Should have ArchiveResult JSONL output"
+        assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"

-        # Verify output file exists
-        headers_file = headers_dir / 'headers.json'
+        # Verify output file exists (hook writes to current directory)
+        headers_file = tmpdir / 'headers.json'
        assert headers_file.exists(), "headers.json not created"

        # Verify headers JSON contains REAL example.com response
@@ -106,20 +114,6 @@ def test_extracts_headers_from_example_com():
        assert 'content-type' in headers_lower or 'content-length' in headers_lower, \
            "Should have at least one common HTTP header"

-        # Verify RESULT_JSON is present and valid
-        assert 'RESULT_JSON=' in result.stdout, "Should output RESULT_JSON"
-
-        for line in result.stdout.split('\n'):
-            if line.startswith('RESULT_JSON='):
-                result_json = json.loads(line.replace('RESULT_JSON=', ''))
-                assert result_json['extractor'] == 'headers'
-                assert result_json['status'] == 'succeeded'
-                assert result_json['url'] == TEST_URL
-                assert result_json['snapshot_id'] == 'test789'
-                assert 'duration' in result_json
-                assert result_json['duration'] >= 0
-                break
-

 def test_headers_output_structure():
    """Test that headers plugin produces correctly structured output."""
@@ -140,10 +134,25 @@ def test_headers_output_structure():
        )

        assert result.returncode == 0, f"Extraction failed: {result.stderr}"
-        assert 'STATUS=succeeded' in result.stdout, "Should report success"
+
+        # Parse clean JSONL output
+        result_json = None
+        for line in result.stdout.strip().split('\n'):
+            line = line.strip()
+            if line.startswith('{'):
+                try:
+                    record = json.loads(line)
+                    if record.get('type') == 'ArchiveResult':
+                        result_json = record
+                        break
+                except json.JSONDecodeError:
+                    pass
+
+        assert result_json, "Should have ArchiveResult JSONL output"
+        assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"

        # Verify output structure
-        output_headers_file = tmpdir / 'headers' / 'headers.json'
+        output_headers_file = tmpdir / 'headers.json'
        assert output_headers_file.exists(), "Output headers.json not created"

        output_data = json.loads(output_headers_file.read_text())
@@ -162,8 +171,8 @@ def test_headers_output_structure():
        assert output_data['status'] in [200, 301, 302]


-def test_falls_back_to_http_when_chrome_session_unavailable():
-    """Test that headers plugin falls back to HTTP HEAD when chrome_session unavailable."""
+def test_falls_back_to_http_when_chrome_unavailable():
+    """Test that headers plugin falls back to HTTP HEAD when chrome unavailable."""

    if not shutil.which('node'):
        pytest.skip("node not installed")
@@ -171,7 +180,7 @@ def test_falls_back_to_http_when_chrome_session_unavailable():
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

-        # Don't create chrome_session directory - force HTTP fallback
+        # Don't create chrome directory - force HTTP fallback

        # Run headers extraction
        result = subprocess.run(
@@ -183,12 +192,25 @@ def test_falls_back_to_http_when_chrome_session_unavailable():
        )

        assert result.returncode == 0, f"Extraction failed: {result.stderr}"
-        assert 'STATUS=succeeded' in result.stdout, "Should report success"
-        assert 'http' in result.stdout.lower() or 'HEAD' not in result.stdout, \
-            "Should use HTTP method"
+
+        # Parse clean JSONL output
+        result_json = None
+        for line in result.stdout.strip().split('\n'):
+            line = line.strip()
+            if line.startswith('{'):
+                try:
+                    record = json.loads(line)
+                    if record.get('type') == 'ArchiveResult':
+                        result_json = record
+                        break
+                except json.JSONDecodeError:
+                    pass
+
+        assert result_json, "Should have ArchiveResult JSONL output"
+        assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"

        # Verify output exists and has real HTTP headers
-        output_headers_file = tmpdir / 'headers' / 'headers.json'
+        output_headers_file = tmpdir / 'headers.json'
        assert output_headers_file.exists(), "Output headers.json not created"

        output_data = json.loads(output_headers_file.read_text())
@@ -250,7 +272,21 @@ def test_config_user_agent():

        # Should succeed (example.com doesn't block)
        if result.returncode == 0:
-            assert 'STATUS=succeeded' in result.stdout
+            # Parse clean JSONL output
+            result_json = None
+            for line in result.stdout.strip().split('\n'):
+                line = line.strip()
+                if line.startswith('{'):
+                    try:
+                        record = json.loads(line)
+                        if record.get('type') == 'ArchiveResult':
+                            result_json = record
+                            break
+                    except json.JSONDecodeError:
+                        pass
+
+            assert result_json, "Should have ArchiveResult JSONL output"
+            assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"


 def test_handles_https_urls():
@@ -271,7 +307,7 @@ def test_handles_https_urls():
        )

        if result.returncode == 0:
-            output_headers_file = tmpdir / 'headers' / 'headers.json'
+            output_headers_file = tmpdir / 'headers.json'
            if output_headers_file.exists():
                output_data = json.loads(output_headers_file.read_text())
                assert output_data['url'] == 'https://example.org'
@@ -298,7 +334,7 @@ def test_handles_404_gracefully():
        # May succeed or fail depending on server behavior
        # If it succeeds, verify 404 status is captured
        if result.returncode == 0:
-            output_headers_file = tmpdir / 'headers' / 'headers.json'
+            output_headers_file = tmpdir / 'headers.json'
            if output_headers_file.exists():
                output_data = json.loads(output_headers_file.read_text())
                assert output_data['status'] == 404, "Should capture 404 status"