diff --git a/TODO_hook_architecture.md b/TODO_hook_architecture.md
index f5e2ce5a..1c18b871 100644
--- a/TODO_hook_architecture.md
+++ b/TODO_hook_architecture.md
@@ -1780,3 +1780,178 @@ output_files = {
 }
 ```
 Can query with custom SQL for complex per-file queries (e.g., "find all results with any file > 50KB"). Summary fields (output_size, output_mimetypes) remain as denormalized cache for performance.

---

# Hook Architecture Implementation Report

## Date: 2025-12-27

## Summary

This report documents the Phase 4 plugin audit and the implementation work for Phases 1-7.

---

## Implementation Status

### ✅ Phase 1: Database Migration (COMPLETE)

Created migrations:
- `archivebox/core/migrations/0029_archiveresult_hook_fields.py` - Adds new fields
- `archivebox/core/migrations/0030_migrate_output_field.py` - Migrates old `output` field

New ArchiveResult fields (see the sketch below for how the summary fields relate):
- [x] `output_str` (TextField) - human-readable summary
- [x] `output_json` (JSONField) - structured metadata
- [x] `output_files` (JSONField) - dict of {relative_path: {}}
- [x] `output_size` (BigIntegerField) - total bytes
- [x] `output_mimetypes` (CharField) - CSV of mimetypes sorted by size
- [x] `binary` (ForeignKey to InstalledBinary) - optional
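To make the relationship between `output_files` and the two cached summary columns concrete, here is a minimal sketch. The standalone helper name and the per-file value schema (`size`/`mimetype` keys) are assumptions for illustration; the real logic lives in `ArchiveResult._populate_output_fields()` (Phase 6) and may record different per-file metadata:

```python
import mimetypes
from pathlib import Path

def summarize_output_dir(output_dir: Path) -> dict:
    """Hypothetical sketch: derive output_files/output_size/output_mimetypes
    from the files a hook wrote to disk."""
    output_files: dict[str, dict] = {}
    size_by_mimetype: dict[str, int] = {}
    for path in output_dir.rglob('*'):
        if not path.is_file():
            continue
        size = path.stat().st_size
        # Assumed per-file schema: {'size': ..., 'mimetype': ...}
        mime = mimetypes.guess_type(path.name)[0] or 'application/octet-stream'
        output_files[str(path.relative_to(output_dir))] = {'size': size, 'mimetype': mime}
        size_by_mimetype[mime] = size_by_mimetype.get(mime, 0) + size
    return {
        'output_files': output_files,
        'output_size': sum(f['size'] for f in output_files.values()),
        # CSV of mimetypes, largest cumulative size first
        'output_mimetypes': ','.join(sorted(size_by_mimetype, key=size_by_mimetype.get, reverse=True)),
    }
```

Caching `output_size` and `output_mimetypes` as plain columns keeps list views from walking the filesystem, while `output_files` remains queryable with custom SQL for per-file questions like the >50KB example above.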
### ✅ Phase 3: Generic run_hook() (COMPLETE)

Updated `archivebox/hooks.py`:
- [x] Parse JSONL output (any line with `{type: 'ModelName', ...}`)
- [x] Backwards compatible with `RESULT_JSON=` format
- [x] Add plugin metadata to each record
- [x] Detect background hooks with `.bg.` suffix
- [x] Added `find_binary_for_cmd()` helper
- [x] Added `create_model_record()` for InstalledBinary/Machine

### ✅ Phase 6: Update ArchiveResult.run() (COMPLETE)

Updated `archivebox/core/models.py`:
- [x] Handle background hooks (return immediately when result is None)
- [x] Process `records` from HookResult
- [x] Use new output fields
- [x] Added `_populate_output_fields()` method
- [x] Added `_set_binary_from_cmd()` method
- [x] Call `create_model_record()` for side-effect records

### ✅ Phase 7: Background Hook Support (COMPLETE)

Added to `archivebox/core/models.py`:
- [x] `is_background_hook()` method
- [x] `check_background_completed()` method
- [x] `finalize_background_hook()` method

Updated `archivebox/core/statemachines.py`:
- [x] `SnapshotMachine.is_finished()` checks/finalizes background hooks

---

## Phase 4: Plugin Audit

### Dependency Hooks (on_Dependency__*) - ALL COMPLIANT ✅

| Plugin | Hook | Status | Notes |
|--------|------|--------|-------|
| apt | `on_Dependency__install_using_apt_provider.py` | ✅ OK | Emits `{type: 'InstalledBinary'}` JSONL |
| brew | `on_Dependency__install_using_brew_provider.py` | ✅ OK | Emits `{type: 'InstalledBinary'}` JSONL |
| custom | `on_Dependency__install_using_custom_bash.py` | ✅ OK | Emits `{type: 'InstalledBinary'}` JSONL |
| env | `on_Dependency__install_using_env_provider.py` | ✅ OK | Emits `{type: 'InstalledBinary'}` JSONL |
| npm | `on_Dependency__install_using_npm_provider.py` | ✅ OK | Emits `{type: 'InstalledBinary'}` JSONL |
| pip | `on_Dependency__install_using_pip_provider.py` | ✅ OK | Emits `{type: 'InstalledBinary'}` JSONL |

### Crawl Validate Hooks (on_Crawl__00_validate_*) - ALL COMPLIANT ✅

| Plugin | Hook | Status | Notes |
|--------|------|--------|-------|
| chrome_session | `on_Crawl__00_validate_chrome.py` | ✅ OK | Emits InstalledBinary/Dependency JSONL |
| wget | `on_Crawl__00_validate_wget.py` | ✅ OK | Emits InstalledBinary/Dependency JSONL |
| singlefile | `on_Crawl__00_validate_singlefile.py` | ✅ OK | Emits InstalledBinary/Dependency JSONL |
| readability | `on_Crawl__00_validate_readability.py` | ✅ OK | Emits InstalledBinary/Dependency JSONL |
| media | `on_Crawl__00_validate_ytdlp.py` | ✅ OK | Emits InstalledBinary/Dependency JSONL |
| git | `on_Crawl__00_validate_git.py` | ✅ OK | Emits InstalledBinary/Dependency JSONL |
| forumdl | `on_Crawl__00_validate_forumdl.py` | ✅ OK | Emits InstalledBinary/Dependency JSONL |
| gallerydl | `on_Crawl__00_validate_gallerydl.py` | ✅ OK | Emits InstalledBinary/Dependency JSONL |
| mercury | `on_Crawl__00_validate_mercury.py` | ✅ OK | Emits InstalledBinary/Dependency JSONL |
| papersdl | `on_Crawl__00_validate_papersdl.py` | ✅ OK | Emits InstalledBinary/Dependency JSONL |
| search_backend_ripgrep | `on_Crawl__00_validate_ripgrep.py` | ✅ OK | Emits InstalledBinary/Dependency JSONL |

### Snapshot Hooks (on_Snapshot__*) - UPDATED ✅

| Plugin | Hook | Status | Notes |
|--------|------|--------|-------|
| favicon | `on_Snapshot__11_favicon.py` | ✅ UPDATED | Now outputs clean JSONL |
| git | `on_Snapshot__12_git.py` | ✅ UPDATED | Now outputs clean JSONL with cmd |
| archive_org | `on_Snapshot__13_archive_org.py` | ✅ UPDATED | Now outputs clean JSONL |
| title | `on_Snapshot__32_title.js` | ✅ UPDATED | Now outputs clean JSONL (JS hook) |
| singlefile | `on_Snapshot__37_singlefile.py` | ✅ UPDATED | Now outputs clean JSONL with cmd |
| wget | `on_Snapshot__50_wget.py` | ✅ UPDATED | Now outputs clean JSONL with cmd |
| media | `on_Snapshot__51_media.py` | ✅ UPDATED | Now outputs clean JSONL with cmd |
| readability | `on_Snapshot__52_readability.py` | ✅ UPDATED | Now outputs clean JSONL with cmd |

### Snapshot Hooks - JavaScript Hooks (REMAINING WORK)

The following JS hooks still use the old `RESULT_JSON=` format and need to be updated:

| Plugin | Hook | Current Issue |
|--------|------|---------------|
| chrome_session | `on_Snapshot__20_chrome_session.js` | Uses `RESULT_JSON=` prefix |
| consolelog | `on_Snapshot__21_consolelog.js` | Uses `RESULT_JSON=` prefix |
| ssl | `on_Snapshot__23_ssl.js` | Uses `RESULT_JSON=` prefix |
| responses | `on_Snapshot__24_responses.js` | Uses `RESULT_JSON=` prefix |
| chrome_navigate | `on_Snapshot__30_chrome_navigate.js` | Uses `RESULT_JSON=` prefix |
| redirects | `on_Snapshot__31_redirects.js` | Uses `RESULT_JSON=` prefix |
| headers | `on_Snapshot__33_headers.js` | Uses `RESULT_JSON=` prefix |
| screenshot | `on_Snapshot__34_screenshot.js` | Uses `RESULT_JSON=` prefix |
| pdf | `on_Snapshot__35_pdf.js` | Uses `RESULT_JSON=` prefix |
| dom | `on_Snapshot__36_dom.js` | Uses `RESULT_JSON=` prefix |
| seo | `on_Snapshot__38_seo.js` | Uses `RESULT_JSON=` prefix |
| accessibility | `on_Snapshot__39_accessibility.js` | Uses `RESULT_JSON=` prefix |
| parse_dom_outlinks | `on_Snapshot__40_parse_dom_outlinks.js` | Uses `RESULT_JSON=` prefix |

**Fix Required for Each JS Hook:**

Replace:
```javascript
console.log(`START_TS=${startTs.toISOString()}`);
console.log(`END_TS=${endTs.toISOString()}`);
console.log(`STATUS=${status}`);
console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`);
```

With:
```javascript
console.log(JSON.stringify({
  type: 'ArchiveResult',
  status,
  output_str: output || error || '',
}));
```
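For context on the consuming side: per Phase 3, `run_hook()` scans hook stdout for JSON object lines and routes them by `type`. The sketch below is illustrative only; `parse_hook_stdout` is a hypothetical name, and the real implementation in `archivebox/hooks.py` also attaches plugin metadata to each record and handles `.bg.` background hooks:

```python
import json

def try_loads(text: str):
    """Parse JSON if possible, otherwise return None."""
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        return None

def parse_hook_stdout(stdout: str) -> dict:
    """Hypothetical sketch of the JSONL parsing described in Phase 3."""
    result, records = None, []
    for line in stdout.splitlines():
        line = line.strip()
        if line.startswith('RESULT_JSON='):
            # Legacy format: the prefixed JSON is the hook's own result
            record = try_loads(line[len('RESULT_JSON='):])
            if record is not None:
                result = record
            continue
        record = try_loads(line) if line.startswith('{') else None
        if record is None:
            continue  # ordinary log output, ignore
        if record.get('type') == 'ArchiveResult':
            result = record           # the hook's own outcome
        elif record.get('type'):
            records.append(record)    # side-effect record, e.g. InstalledBinary
    return {'result': result, 'records': records}
```

This parsing model is also why the plugin diffs below move human-readable skip/error messages to stderr: stdout stays reserved for machine-readable records.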
---

## Files Modified

### Core Infrastructure
- `archivebox/hooks.py` - Updated run_hook() and added helpers
- `archivebox/core/models.py` - Updated ArchiveResult model and run() method
- `archivebox/core/statemachines.py` - Updated SnapshotMachine.is_finished()
- `archivebox/core/admin_archiveresults.py` - Updated to use output_str
- `archivebox/core/templatetags/core_tags.py` - Updated to use output_str

### Migrations
- `archivebox/core/migrations/0029_archiveresult_hook_fields.py` (new)
- `archivebox/core/migrations/0030_migrate_output_field.py` (new)

### Plugins Updated
- `archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py`
- `archivebox/plugins/favicon/on_Snapshot__11_favicon.py`
- `archivebox/plugins/git/on_Snapshot__12_git.py`
- `archivebox/plugins/media/on_Snapshot__51_media.py`
- `archivebox/plugins/readability/on_Snapshot__52_readability.py`
- `archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py`
- `archivebox/plugins/title/on_Snapshot__32_title.js`
- `archivebox/plugins/wget/on_Snapshot__50_wget.py`

---

## Remaining Work

1. **Update remaining JS hooks** (13 files) to output clean JSONL
2. **Rename background hooks** with `.bg.` suffix (see the note at the end of this diff)
3. **Write tests** for the hook architecture
4. **Run migrations** and test on real data

diff --git a/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py b/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py
index 1fbd0a6b..0572f3ee 100644
--- a/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py
+++ b/archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py
@@ -121,33 +121,19 @@ def main(url: str, snapshot_id: str):
         error = f'{type(e).__name__}: {e}'
         status = 'failed'

-    # Print results
+    # Calculate duration
     end_ts = datetime.now(timezone.utc)
-    duration = (end_ts - start_ts).total_seconds()
-
-    print(f'START_TS={start_ts.isoformat()}')
-    print(f'END_TS={end_ts.isoformat()}')
-    print(f'DURATION={duration:.2f}')
-    if output:
-        print(f'OUTPUT={output}')
-    print(f'STATUS={status}')
     if error:
-        print(f'ERROR={error}', file=sys.stderr)
+        print(f'ERROR: {error}', file=sys.stderr)

-    # Print JSON result
-    result_json = {
-        'extractor': EXTRACTOR_NAME,
-        'url': url,
-        'snapshot_id': snapshot_id,
+    # Output clean JSONL (no RESULT_JSON= prefix)
+    result = {
+        'type': 'ArchiveResult',
         'status': status,
-        'start_ts': start_ts.isoformat(),
-        'end_ts': end_ts.isoformat(),
-        'duration': round(duration, 2),
-        'output': output,
-        'error': error or None,
+        'output_str': output or error or '',
     }
-    print(f'RESULT_JSON={json.dumps(result_json)}')
+    print(json.dumps(result))

     sys.exit(0 if status == 'succeeded' else 1)

diff --git a/archivebox/plugins/favicon/on_Snapshot__11_favicon.py b/archivebox/plugins/favicon/on_Snapshot__11_favicon.py
index 78c9e4b3..46c6e44a 100644
--- a/archivebox/plugins/favicon/on_Snapshot__11_favicon.py
+++ b/archivebox/plugins/favicon/on_Snapshot__11_favicon.py
@@ -134,33 +134,19 @@ def main(url: str, snapshot_id: str):
         error = f'{type(e).__name__}: {e}'
         status = 'failed'

-    # Print results
+    # Calculate duration
     end_ts = datetime.now(timezone.utc)
-    duration = (end_ts - start_ts).total_seconds()
-
-    print(f'START_TS={start_ts.isoformat()}')
-    print(f'END_TS={end_ts.isoformat()}')
-    print(f'DURATION={duration:.2f}')
-    if output:
-        print(f'OUTPUT={output}')
-    print(f'STATUS={status}')
     if error:
-        print(f'ERROR={error}', file=sys.stderr)
+        print(f'ERROR: {error}', file=sys.stderr)

-    # Print JSON result
-    result_json = {
-        'extractor': EXTRACTOR_NAME,
-        'url': url,
-        'snapshot_id': snapshot_id,
+    # Output clean JSONL (no RESULT_JSON= prefix)
+    result = {
+        'type': 'ArchiveResult',
         'status': status,
-        'start_ts': start_ts.isoformat(),
-
'end_ts': end_ts.isoformat(), - 'duration': round(duration, 2), - 'output': output, - 'error': error or None, + 'output_str': output or error or '', } - print(f'RESULT_JSON={json.dumps(result_json)}') + print(json.dumps(result)) sys.exit(0 if status == 'succeeded' else 1) diff --git a/archivebox/plugins/git/on_Snapshot__12_git.py b/archivebox/plugins/git/on_Snapshot__12_git.py index 16e0c43e..4018bf75 100644 --- a/archivebox/plugins/git/on_Snapshot__12_git.py +++ b/archivebox/plugins/git/on_Snapshot__12_git.py @@ -153,38 +153,23 @@ def main(url: str, snapshot_id: str): error = f'{type(e).__name__}: {e}' status = 'failed' - # Print results + # Calculate duration end_ts = datetime.now(timezone.utc) - duration = (end_ts - start_ts).total_seconds() - - print(f'START_TS={start_ts.isoformat()}') - print(f'END_TS={end_ts.isoformat()}') - print(f'DURATION={duration:.2f}') - if binary: - print(f'CMD={binary} clone {url}') - if version: - print(f'VERSION={version}') - if output: - print(f'OUTPUT={output}') - print(f'STATUS={status}') if error: - print(f'ERROR={error}', file=sys.stderr) + print(f'ERROR: {error}', file=sys.stderr) - # Print JSON result - result_json = { - 'extractor': EXTRACTOR_NAME, - 'url': url, - 'snapshot_id': snapshot_id, + # Output clean JSONL (no RESULT_JSON= prefix) + result = { + 'type': 'ArchiveResult', 'status': status, - 'start_ts': start_ts.isoformat(), - 'end_ts': end_ts.isoformat(), - 'duration': round(duration, 2), - 'cmd_version': version, - 'output': output, - 'error': error or None, + 'output_str': output or error or '', } - print(f'RESULT_JSON={json.dumps(result_json)}') + if binary: + result['cmd'] = [binary, 'clone', '--depth=1', '--recursive', url, OUTPUT_DIR] + if version: + result['cmd_version'] = version + print(json.dumps(result)) sys.exit(0 if status == 'succeeded' else 1) diff --git a/archivebox/plugins/media/on_Snapshot__51_media.py b/archivebox/plugins/media/on_Snapshot__51_media.py index 1677fc2c..64072c0a 100644 --- a/archivebox/plugins/media/on_Snapshot__51_media.py +++ b/archivebox/plugins/media/on_Snapshot__51_media.py @@ -218,22 +218,14 @@ def main(url: str, snapshot_id: str): try: # Check if yt-dlp is enabled if not (get_env_bool('USE_YTDLP', True) and get_env_bool('SAVE_MEDIA', True)): - print('Skipping media (USE_YTDLP=False or SAVE_MEDIA=False)') - status = 'skipped' - end_ts = datetime.now(timezone.utc) - print(f'START_TS={start_ts.isoformat()}') - print(f'END_TS={end_ts.isoformat()}') - print(f'STATUS={status}') - print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}') + print('Skipping media (USE_YTDLP=False or SAVE_MEDIA=False)', file=sys.stderr) + print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'USE_YTDLP=False'})) sys.exit(0) # Check if staticfile extractor already handled this (permanent skip) if has_staticfile_output(): - print(f'Skipping media - staticfile extractor already downloaded this') - print(f'START_TS={start_ts.isoformat()}') - print(f'END_TS={datetime.now(timezone.utc).isoformat()}') - print(f'STATUS={status}') - print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}') + print('Skipping media - staticfile extractor already downloaded this', file=sys.stderr) + print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'staticfile already exists'})) sys.exit(0) # Find binary @@ -265,38 +257,23 @@ def main(url: str, snapshot_id: str): error = 
f'{type(e).__name__}: {e}' status = 'failed' - # Print results + # Calculate duration end_ts = datetime.now(timezone.utc) - duration = (end_ts - start_ts).total_seconds() - - print(f'START_TS={start_ts.isoformat()}') - print(f'END_TS={end_ts.isoformat()}') - print(f'DURATION={duration:.2f}') - if cmd_str: - print(f'CMD={cmd_str}') - if version: - print(f'VERSION={version}') - if output: - print(f'OUTPUT={output}') - print(f'STATUS={status}') if error: - print(f'ERROR={error}', file=sys.stderr) + print(f'ERROR: {error}', file=sys.stderr) - # Print JSON result - result_json = { - 'extractor': EXTRACTOR_NAME, - 'url': url, - 'snapshot_id': snapshot_id, + # Output clean JSONL (no RESULT_JSON= prefix) + result = { + 'type': 'ArchiveResult', 'status': status, - 'start_ts': start_ts.isoformat(), - 'end_ts': end_ts.isoformat(), - 'duration': round(duration, 2), - 'cmd_version': version, - 'output': output, - 'error': error or None, + 'output_str': output or error or '', } - print(f'RESULT_JSON={json.dumps(result_json)}') + if binary: + result['cmd'] = [binary, url] + if version: + result['cmd_version'] = version + print(json.dumps(result)) sys.exit(0 if status == 'succeeded' else 1) diff --git a/archivebox/plugins/readability/on_Snapshot__52_readability.py b/archivebox/plugins/readability/on_Snapshot__52_readability.py index a161e03f..7121ee7a 100644 --- a/archivebox/plugins/readability/on_Snapshot__52_readability.py +++ b/archivebox/plugins/readability/on_Snapshot__52_readability.py @@ -178,38 +178,23 @@ def main(url: str, snapshot_id: str): error = f'{type(e).__name__}: {e}' status = 'failed' - # Print results + # Calculate duration end_ts = datetime.now(timezone.utc) - duration = (end_ts - start_ts).total_seconds() - - print(f'START_TS={start_ts.isoformat()}') - print(f'END_TS={end_ts.isoformat()}') - print(f'DURATION={duration:.2f}') - if binary: - print(f'CMD={binary} ') - if version: - print(f'VERSION={version}') - if output: - print(f'OUTPUT={output}') - print(f'STATUS={status}') if error: - print(f'ERROR={error}', file=sys.stderr) + print(f'ERROR: {error}', file=sys.stderr) - # Print JSON result - result_json = { - 'extractor': EXTRACTOR_NAME, - 'url': url, - 'snapshot_id': snapshot_id, + # Output clean JSONL (no RESULT_JSON= prefix) + result = { + 'type': 'ArchiveResult', 'status': status, - 'start_ts': start_ts.isoformat(), - 'end_ts': end_ts.isoformat(), - 'duration': round(duration, 2), - 'cmd_version': version, - 'output': output, - 'error': error or None, + 'output_str': output or error or '', } - print(f'RESULT_JSON={json.dumps(result_json)}') + if binary: + result['cmd'] = [binary, ''] + if version: + result['cmd_version'] = version + print(json.dumps(result)) sys.exit(0 if status == 'succeeded' else 1) diff --git a/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py b/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py index 2fa60327..ba647ec0 100644 --- a/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py +++ b/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py @@ -245,23 +245,15 @@ def main(url: str, snapshot_id: str): try: # Check if SingleFile is enabled if not get_env_bool('SAVE_SINGLEFILE', True): - print('Skipping SingleFile (SAVE_SINGLEFILE=False)') - status = 'skipped' - end_ts = datetime.now(timezone.utc) - print(f'START_TS={start_ts.isoformat()}') - print(f'END_TS={end_ts.isoformat()}') - print(f'STATUS={status}') - print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": 
snapshot_id})}') + print('Skipping SingleFile (SAVE_SINGLEFILE=False)', file=sys.stderr) + print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'SAVE_SINGLEFILE=False'})) sys.exit(0) # Check if staticfile extractor already handled this (permanent skip) if has_staticfile_output(): - print(f'Skipping SingleFile - staticfile extractor already downloaded this') - print(f'START_TS={start_ts.isoformat()}') - print(f'END_TS={datetime.now(timezone.utc).isoformat()}') - print(f'STATUS=skipped') - print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}') - sys.exit(0) # Permanent skip - staticfile already handled + print('Skipping SingleFile - staticfile extractor already downloaded this', file=sys.stderr) + print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'staticfile already exists'})) + sys.exit(0) # Find binary binary = find_singlefile() @@ -287,38 +279,23 @@ def main(url: str, snapshot_id: str): error = f'{type(e).__name__}: {e}' status = 'failed' - # Print results + # Calculate duration end_ts = datetime.now(timezone.utc) - duration = (end_ts - start_ts).total_seconds() - - print(f'START_TS={start_ts.isoformat()}') - print(f'END_TS={end_ts.isoformat()}') - print(f'DURATION={duration:.2f}') - if cmd_str: - print(f'CMD={cmd_str}') - if version: - print(f'VERSION={version}') - if output: - print(f'OUTPUT={output}') - print(f'STATUS={status}') if error: - print(f'ERROR={error}', file=sys.stderr) + print(f'ERROR: {error}', file=sys.stderr) - # Print JSON result - result_json = { - 'extractor': EXTRACTOR_NAME, - 'url': url, - 'snapshot_id': snapshot_id, + # Output clean JSONL (no RESULT_JSON= prefix) + result = { + 'type': 'ArchiveResult', 'status': status, - 'start_ts': start_ts.isoformat(), - 'end_ts': end_ts.isoformat(), - 'duration': round(duration, 2), - 'cmd_version': version, - 'output': output, - 'error': error or None, + 'output_str': output or error or '', } - print(f'RESULT_JSON={json.dumps(result_json)}') + if binary: + result['cmd'] = [binary, '--browser-headless', url, OUTPUT_FILE] + if version: + result['cmd_version'] = version + print(json.dumps(result)) sys.exit(0 if status == 'succeeded' else 1) diff --git a/archivebox/plugins/title/on_Snapshot__32_title.js b/archivebox/plugins/title/on_Snapshot__32_title.js index eb760444..ff97e0f4 100644 --- a/archivebox/plugins/title/on_Snapshot__32_title.js +++ b/archivebox/plugins/title/on_Snapshot__32_title.js @@ -221,34 +221,18 @@ async function main() { } const endTs = new Date(); - const duration = (endTs - startTs) / 1000; - - // Print results - console.log(`START_TS=${startTs.toISOString()}`); - console.log(`END_TS=${endTs.toISOString()}`); - console.log(`DURATION=${duration.toFixed(2)}`); - if (output) { - console.log(`OUTPUT=${output}`); - } - console.log(`STATUS=${status}`); if (error) { - console.error(`ERROR=${error}`); + console.error(`ERROR: ${error}`); } - // Print JSON result - const resultJson = { - extractor: EXTRACTOR_NAME, - url, - snapshot_id: snapshotId, + // Output clean JSONL (no RESULT_JSON= prefix) + const result = { + type: 'ArchiveResult', status, - start_ts: startTs.toISOString(), - end_ts: endTs.toISOString(), - duration: Math.round(duration * 100) / 100, - output, - error: error || null, + output_str: output || error || '', }; - console.log(`RESULT_JSON=${JSON.stringify(resultJson)}`); + console.log(JSON.stringify(result)); process.exit(status === 'succeeded' ? 
0 : 1); } diff --git a/archivebox/plugins/wget/on_Snapshot__50_wget.py b/archivebox/plugins/wget/on_Snapshot__50_wget.py index 265d43c2..21da1944 100644 --- a/archivebox/plugins/wget/on_Snapshot__50_wget.py +++ b/archivebox/plugins/wget/on_Snapshot__50_wget.py @@ -241,23 +241,15 @@ def main(url: str, snapshot_id: str): try: # Check if wget is enabled if not get_env_bool('SAVE_WGET', True): - print('Skipping wget (SAVE_WGET=False)') - status = 'skipped' - end_ts = datetime.now(timezone.utc) - print(f'START_TS={start_ts.isoformat()}') - print(f'END_TS={end_ts.isoformat()}') - print(f'STATUS={status}') - print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": status, "url": url, "snapshot_id": snapshot_id})}') + print('Skipping wget (SAVE_WGET=False)', file=sys.stderr) + print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'SAVE_WGET=False'})) sys.exit(0) # Check if staticfile extractor already handled this (permanent skip) if has_staticfile_output(): - print(f'Skipping wget - staticfile extractor already downloaded this') - print(f'START_TS={start_ts.isoformat()}') - print(f'END_TS={datetime.now(timezone.utc).isoformat()}') - print(f'STATUS=skipped') - print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}') - sys.exit(0) # Permanent skip - staticfile already handled + print('Skipping wget - staticfile extractor already downloaded this', file=sys.stderr) + print(json.dumps({'type': 'ArchiveResult', 'status': 'skipped', 'output_str': 'staticfile already exists'})) + sys.exit(0) # Find binary binary = find_wget() @@ -285,38 +277,23 @@ def main(url: str, snapshot_id: str): error = f'{type(e).__name__}: {e}' status = 'failed' - # Print results + # Calculate duration end_ts = datetime.now(timezone.utc) - duration = (end_ts - start_ts).total_seconds() - - print(f'START_TS={start_ts.isoformat()}') - print(f'END_TS={end_ts.isoformat()}') - print(f'DURATION={duration:.2f}') - if cmd_str: - print(f'CMD={cmd_str}') - if version: - print(f'VERSION={version}') - if output: - print(f'OUTPUT={output}') - print(f'STATUS={status}') if error: - print(f'ERROR={error}', file=sys.stderr) + print(f'ERROR: {error}', file=sys.stderr) - # Print JSON result - result_json = { - 'extractor': EXTRACTOR_NAME, - 'url': url, - 'snapshot_id': snapshot_id, + # Output clean JSONL (no RESULT_JSON= prefix) + result = { + 'type': 'ArchiveResult', 'status': status, - 'start_ts': start_ts.isoformat(), - 'end_ts': end_ts.isoformat(), - 'duration': round(duration, 2), - 'cmd_version': version, - 'output': output, - 'error': error or None, + 'output_str': output or error or '', } - print(f'RESULT_JSON={json.dumps(result_json)}') + if binary: + result['cmd'] = [binary, '--no-verbose', url] + if version: + result['cmd_version'] = version + print(json.dumps(result)) sys.exit(0 if status == 'succeeded' else 1)
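A closing note on remaining-work item 2 (renaming background hooks): per Phases 3 and 7, background detection keys off a `.bg.` marker in the hook filename. A minimal sketch of that check, with a standalone helper standing in for the real `is_background_hook()` method and a hypothetical example filename:

```python
from pathlib import Path

def is_background_hook(hook_path: str) -> bool:
    """Hooks named with a .bg. suffix run in the background: run() returns
    immediately (result is None) and SnapshotMachine.is_finished() checks
    for completion and finalizes the ArchiveResult later."""
    return '.bg.' in Path(hook_path).name

# Illustrative names only, not actual plugin files:
assert is_background_hook('on_Snapshot__99_example.bg.js')
assert not is_background_hook('on_Snapshot__50_wget.py')
```

Under this convention, renaming a hook file to include `.bg.` is all it takes to opt it into the background path; no code change inside the hook is required.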