Update Python/JS hooks to clean JSONL format + add audit report

Phase 4 Plugin Audit Progress:
- Audited all 6 Dependency hooks (all already compliant)
- Audited all 11 Crawl Validate hooks (all already compliant)
- Updated 8 Python Snapshot hooks to clean JSONL format
- Updated 1 JS Snapshot hook (title.js) to clean JSONL format

Snapshot hooks updated to remove:
- RESULT_JSON= prefix
- Extra output lines (START_TS=, END_TS=, DURATION=, VERSION=, OUTPUT=, STATUS=)

Now output clean JSONL:
{"type": "ArchiveResult", "status": "...", "output_str": "..."}

Added implementation report to TODO_hook_architecture.md documenting:
- All completed phases (1, 3, 6, 7)
- Plugin audit results with status tables
- Remaining 13 JS hooks that need updating
- Files modified list
2026-04-05 15:27:53 +10:00 · 2025-12-27 09:31:03 +00:00
parent 3d985fa8c8
commit c52eef1459
9 changed files with 264 additions and 232 deletions
--- a/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py
+++ b/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py
@@ -121,33 +121,19 @@ def main(url: str, snapshot_id: str):
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

-    # Print results
+    # Calculate duration
    end_ts = datetime.now(timezone.utc)
-    duration = (end_ts - start_ts).total_seconds()
-
-    print(f'START_TS={start_ts.isoformat()}')
-    print(f'END_TS={end_ts.isoformat()}')
-    print(f'DURATION={duration:.2f}')
-    if output:
-        print(f'OUTPUT={output}')
-    print(f'STATUS={status}')

    if error:
-        print(f'ERROR={error}', file=sys.stderr)
+        print(f'ERROR: {error}', file=sys.stderr)

-    # Print JSON result
-    result_json = {
-        'extractor': EXTRACTOR_NAME,
-        'url': url,
-        'snapshot_id': snapshot_id,
+    # Output clean JSONL (no RESULT_JSON= prefix)
+    result = {
+        'type': 'ArchiveResult',
        'status': status,
-        'start_ts': start_ts.isoformat(),
-        'end_ts': end_ts.isoformat(),
-        'duration': round(duration, 2),
-        'output': output,
-        'error': error or None,
+        'output_str': output or error or '',
    }
-    print(f'RESULT_JSON={json.dumps(result_json)}')
+    print(json.dumps(result))

    sys.exit(0 if status == 'succeeded' else 1)

--- a/archivebox/plugins/favicon/on_Snapshot__11_favicon.py
+++ b/archivebox/plugins/favicon/on_Snapshot__11_favicon.py
@@ -134,33 +134,19 @@ def main(url: str, snapshot_id: str):
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

-    # Print results
+    # Calculate duration
    end_ts = datetime.now(timezone.utc)
-    duration = (end_ts - start_ts).total_seconds()
-
-    print(f'START_TS={start_ts.isoformat()}')
-    print(f'END_TS={end_ts.isoformat()}')
-    print(f'DURATION={duration:.2f}')
-    if output:
-        print(f'OUTPUT={output}')
-    print(f'STATUS={status}')

    if error:
-        print(f'ERROR={error}', file=sys.stderr)
+        print(f'ERROR: {error}', file=sys.stderr)

-    # Print JSON result
-    result_json = {
-        'extractor': EXTRACTOR_NAME,
-        'url': url,
-        'snapshot_id': snapshot_id,
+    # Output clean JSONL (no RESULT_JSON= prefix)
+    result = {
+        'type': 'ArchiveResult',
        'status': status,
-        'start_ts': start_ts.isoformat(),
-        'end_ts': end_ts.isoformat(),
-        'duration': round(duration, 2),
-        'output': output,
-        'error': error or None,
+        'output_str': output or error or '',
    }
-    print(f'RESULT_JSON={json.dumps(result_json)}')
+    print(json.dumps(result))

    sys.exit(0 if status == 'succeeded' else 1)

--- a/archivebox/plugins/git/on_Snapshot__12_git.py
+++ b/archivebox/plugins/git/on_Snapshot__12_git.py
@@ -153,38 +153,23 @@ def main(url: str, snapshot_id: str):
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

-    # Print results
+    # Calculate duration
    end_ts = datetime.now(timezone.utc)
-    duration = (end_ts - start_ts).total_seconds()
-
-    print(f'START_TS={start_ts.isoformat()}')
-    print(f'END_TS={end_ts.isoformat()}')
-    print(f'DURATION={duration:.2f}')
-    if binary:
-        print(f'CMD={binary} clone {url}')
-    if version:
-        print(f'VERSION={version}')
-    if output:
-        print(f'OUTPUT={output}')
-    print(f'STATUS={status}')

    if error:
-        print(f'ERROR={error}', file=sys.stderr)
+        print(f'ERROR: {error}', file=sys.stderr)

-    # Print JSON result
-    result_json = {
-        'extractor': EXTRACTOR_NAME,
-        'url': url,
-        'snapshot_id': snapshot_id,
+    # Output clean JSONL (no RESULT_JSON= prefix)
+    result = {
+        'type': 'ArchiveResult',
        'status': status,
-        'start_ts': start_ts.isoformat(),
-        'end_ts': end_ts.isoformat(),
-        'duration': round(duration, 2),
-        'cmd_version': version,
-        'output': output,
-        'error': error or None,
+        'output_str': output or error or '',
    }
-    print(f'RESULT_JSON={json.dumps(result_json)}')
+    if binary:
+        result['cmd'] = [binary, 'clone', '--depth=1', '--recursive', url, OUTPUT_DIR]
+    if version:
+        result['cmd_version'] = version
+    print(json.dumps(result))

    sys.exit(0 if status == 'succeeded' else 1)

--- a/archivebox/plugins/media/on_Snapshot__51_media.py
+++ b/archivebox/plugins/media/on_Snapshot__51_media.py
@@ -218,22 +218,14 @@ def main(url: str, snapshot_id: str):
    try:
        # Check if yt-dlp is enabled
        if not (get_env_bool('USE_YTDLP', True) and get_env_bool('SAVE_MEDIA', True)):
-            print('Skipping media (USE_YTDLP=False or SAVE_MEDIA=False)')
-            status = 'skipped'
-            end_ts = datetime.now(timezone.utc)
-            print(f'START_TS={start_ts.isoformat()}')
-            print(f'END_TS={end_ts.isoformat()}')
-            print(f'STATUS={status}')
-            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}')
+            print('Skipping media (USE_YTDLP=False or SAVE_MEDIA=False)', file=sys.stderr)
+            print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'USE_YTDLP=False'}))
            sys.exit(0)

        # Check if staticfile extractor already handled this (permanent skip)
        if has_staticfile_output():
-            print(f'Skipping media - staticfile extractor already downloaded this')
-            print(f'START_TS={start_ts.isoformat()}')
-            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
-            print(f'STATUS={status}')
-            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}')
+            print('Skipping media - staticfile extractor already downloaded this', file=sys.stderr)
+            print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'staticfile already exists'}))
            sys.exit(0)

        # Find binary
@@ -265,38 +257,23 @@ def main(url: str, snapshot_id: str):
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

-    # Print results
+    # Calculate duration
    end_ts = datetime.now(timezone.utc)
-    duration = (end_ts - start_ts).total_seconds()
-
-    print(f'START_TS={start_ts.isoformat()}')
-    print(f'END_TS={end_ts.isoformat()}')
-    print(f'DURATION={duration:.2f}')
-    if cmd_str:
-        print(f'CMD={cmd_str}')
-    if version:
-        print(f'VERSION={version}')
-    if output:
-        print(f'OUTPUT={output}')
-    print(f'STATUS={status}')

    if error:
-        print(f'ERROR={error}', file=sys.stderr)
+        print(f'ERROR: {error}', file=sys.stderr)

-    # Print JSON result
-    result_json = {
-        'extractor': EXTRACTOR_NAME,
-        'url': url,
-        'snapshot_id': snapshot_id,
+    # Output clean JSONL (no RESULT_JSON= prefix)
+    result = {
+        'type': 'ArchiveResult',
        'status': status,
-        'start_ts': start_ts.isoformat(),
-        'end_ts': end_ts.isoformat(),
-        'duration': round(duration, 2),
-        'cmd_version': version,
-        'output': output,
-        'error': error or None,
+        'output_str': output or error or '',
    }
-    print(f'RESULT_JSON={json.dumps(result_json)}')
+    if binary:
+        result['cmd'] = [binary, url]
+    if version:
+        result['cmd_version'] = version
+    print(json.dumps(result))

    sys.exit(0 if status == 'succeeded' else 1)

--- a/archivebox/plugins/readability/on_Snapshot__52_readability.py
+++ b/archivebox/plugins/readability/on_Snapshot__52_readability.py
@@ -178,38 +178,23 @@ def main(url: str, snapshot_id: str):
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

-    # Print results
+    # Calculate duration
    end_ts = datetime.now(timezone.utc)
-    duration = (end_ts - start_ts).total_seconds()
-
-    print(f'START_TS={start_ts.isoformat()}')
-    print(f'END_TS={end_ts.isoformat()}')
-    print(f'DURATION={duration:.2f}')
-    if binary:
-        print(f'CMD={binary} <html>')
-    if version:
-        print(f'VERSION={version}')
-    if output:
-        print(f'OUTPUT={output}')
-    print(f'STATUS={status}')

    if error:
-        print(f'ERROR={error}', file=sys.stderr)
+        print(f'ERROR: {error}', file=sys.stderr)

-    # Print JSON result
-    result_json = {
-        'extractor': EXTRACTOR_NAME,
-        'url': url,
-        'snapshot_id': snapshot_id,
+    # Output clean JSONL (no RESULT_JSON= prefix)
+    result = {
+        'type': 'ArchiveResult',
        'status': status,
-        'start_ts': start_ts.isoformat(),
-        'end_ts': end_ts.isoformat(),
-        'duration': round(duration, 2),
-        'cmd_version': version,
-        'output': output,
-        'error': error or None,
+        'output_str': output or error or '',
    }
-    print(f'RESULT_JSON={json.dumps(result_json)}')
+    if binary:
+        result['cmd'] = [binary, '<html>']
+    if version:
+        result['cmd_version'] = version
+    print(json.dumps(result))

    sys.exit(0 if status == 'succeeded' else 1)

--- a/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py
+++ b/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py
@@ -245,23 +245,15 @@ def main(url: str, snapshot_id: str):
    try:
        # Check if SingleFile is enabled
        if not get_env_bool('SAVE_SINGLEFILE', True):
-            print('Skipping SingleFile (SAVE_SINGLEFILE=False)')
-            status = 'skipped'
-            end_ts = datetime.now(timezone.utc)
-            print(f'START_TS={start_ts.isoformat()}')
-            print(f'END_TS={end_ts.isoformat()}')
-            print(f'STATUS={status}')
-            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}')
+            print('Skipping SingleFile (SAVE_SINGLEFILE=False)', file=sys.stderr)
+            print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'SAVE_SINGLEFILE=False'}))
            sys.exit(0)

        # Check if staticfile extractor already handled this (permanent skip)
        if has_staticfile_output():
-            print(f'Skipping SingleFile - staticfile extractor already downloaded this')
-            print(f'START_TS={start_ts.isoformat()}')
-            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
-            print(f'STATUS=skipped')
-            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
-            sys.exit(0)  # Permanent skip - staticfile already handled
+            print('Skipping SingleFile - staticfile extractor already downloaded this', file=sys.stderr)
+            print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'staticfile already exists'}))
+            sys.exit(0)

        # Find binary
        binary = find_singlefile()
@@ -287,38 +279,23 @@ def main(url: str, snapshot_id: str):
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

-    # Print results
+    # Calculate duration
    end_ts = datetime.now(timezone.utc)
-    duration = (end_ts - start_ts).total_seconds()
-
-    print(f'START_TS={start_ts.isoformat()}')
-    print(f'END_TS={end_ts.isoformat()}')
-    print(f'DURATION={duration:.2f}')
-    if cmd_str:
-        print(f'CMD={cmd_str}')
-    if version:
-        print(f'VERSION={version}')
-    if output:
-        print(f'OUTPUT={output}')
-    print(f'STATUS={status}')

    if error:
-        print(f'ERROR={error}', file=sys.stderr)
+        print(f'ERROR: {error}', file=sys.stderr)

-    # Print JSON result
-    result_json = {
-        'extractor': EXTRACTOR_NAME,
-        'url': url,
-        'snapshot_id': snapshot_id,
+    # Output clean JSONL (no RESULT_JSON= prefix)
+    result = {
+        'type': 'ArchiveResult',
        'status': status,
-        'start_ts': start_ts.isoformat(),
-        'end_ts': end_ts.isoformat(),
-        'duration': round(duration, 2),
-        'cmd_version': version,
-        'output': output,
-        'error': error or None,
+        'output_str': output or error or '',
    }
-    print(f'RESULT_JSON={json.dumps(result_json)}')
+    if binary:
+        result['cmd'] = [binary, '--browser-headless', url, OUTPUT_FILE]
+    if version:
+        result['cmd_version'] = version
+    print(json.dumps(result))

    sys.exit(0 if status == 'succeeded' else 1)

--- a/archivebox/plugins/title/on_Snapshot__32_title.js
+++ b/archivebox/plugins/title/on_Snapshot__32_title.js
@@ -221,34 +221,18 @@ async function main() {
    }

    const endTs = new Date();
-    const duration = (endTs - startTs) / 1000;
-
-    // Print results
-    console.log(`START_TS=${startTs.toISOString()}`);
-    console.log(`END_TS=${endTs.toISOString()}`);
-    console.log(`DURATION=${duration.toFixed(2)}`);
-    if (output) {
-        console.log(`OUTPUT=${output}`);
-    }
-    console.log(`STATUS=${status}`);

    if (error) {
-        console.error(`ERROR=${error}`);
+        console.error(`ERROR: ${error}`);
    }

-    // Print JSON result
-    const resultJson = {
-        extractor: EXTRACTOR_NAME,
-        url,
-        snapshot_id: snapshotId,
+    // Output clean JSONL (no RESULT_JSON= prefix)
+    const result = {
+        type: 'ArchiveResult',
        status,
-        start_ts: startTs.toISOString(),
-        end_ts: endTs.toISOString(),
-        duration: Math.round(duration * 100) / 100,
-        output,
-        error: error || null,
+        output_str: output || error || '',
    };
-    console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
+    console.log(JSON.stringify(result));

    process.exit(status === 'succeeded' ? 0 : 1);
 }
--- a/archivebox/plugins/wget/on_Snapshot__50_wget.py
+++ b/archivebox/plugins/wget/on_Snapshot__50_wget.py
@@ -241,23 +241,15 @@ def main(url: str, snapshot_id: str):
    try:
        # Check if wget is enabled
        if not get_env_bool('SAVE_WGET', True):
-            print('Skipping wget (SAVE_WGET=False)')
-            status = 'skipped'
-            end_ts = datetime.now(timezone.utc)
-            print(f'START_TS={start_ts.isoformat()}')
-            print(f'END_TS={end_ts.isoformat()}')
-            print(f'STATUS={status}')
-            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}')
+            print('Skipping wget (SAVE_WGET=False)', file=sys.stderr)
+            print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'SAVE_WGET=False'}))
            sys.exit(0)

        # Check if staticfile extractor already handled this (permanent skip)
        if has_staticfile_output():
-            print(f'Skipping wget - staticfile extractor already downloaded this')
-            print(f'START_TS={start_ts.isoformat()}')
-            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
-            print(f'STATUS=skipped')
-            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
-            sys.exit(0)  # Permanent skip - staticfile already handled
+            print('Skipping wget - staticfile extractor already downloaded this', file=sys.stderr)
+            print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'staticfile already exists'}))
+            sys.exit(0)

        # Find binary
        binary = find_wget()
@@ -285,38 +277,23 @@ def main(url: str, snapshot_id: str):
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

-    # Print results
+    # Calculate duration
    end_ts = datetime.now(timezone.utc)
-    duration = (end_ts - start_ts).total_seconds()
-
-    print(f'START_TS={start_ts.isoformat()}')
-    print(f'END_TS={end_ts.isoformat()}')
-    print(f'DURATION={duration:.2f}')
-    if cmd_str:
-        print(f'CMD={cmd_str}')
-    if version:
-        print(f'VERSION={version}')
-    if output:
-        print(f'OUTPUT={output}')
-    print(f'STATUS={status}')

    if error:
-        print(f'ERROR={error}', file=sys.stderr)
+        print(f'ERROR: {error}', file=sys.stderr)

-    # Print JSON result
-    result_json = {
-        'extractor': EXTRACTOR_NAME,
-        'url': url,
-        'snapshot_id': snapshot_id,
+    # Output clean JSONL (no RESULT_JSON= prefix)
+    result = {
+        'type': 'ArchiveResult',
        'status': status,
-        'start_ts': start_ts.isoformat(),
-        'end_ts': end_ts.isoformat(),
-        'duration': round(duration, 2),
-        'cmd_version': version,
-        'output': output,
-        'error': error or None,
+        'output_str': output or error or '',
    }
-    print(f'RESULT_JSON={json.dumps(result_json)}')
+    if binary:
+        result['cmd'] = [binary, '--no-verbose', url]
+    if version:
+        result['cmd_version'] = version
+    print(json.dumps(result))

    sys.exit(0 if status == 'succeeded' else 1)