Update views, API, and exports for new ArchiveResult output fields

Replace the old `output` field with new fields across the codebase:

- output_str: Human-readable output summary
- output_json: Structured metadata (optional)
- output_files: Dict of output files with metadata
- output_size: Total size in bytes
- output_mimetypes: CSV of file mimetypes

Files updated:

- api/v1_core.py: Update MinimalArchiveResultSchema to expose new fields
- api/v1_core.py: Update ArchiveResultFilterSchema to search output_str
- cli/archivebox_extract.py: Use output_str in CLI output
- core/admin_archiveresults.py: Update admin fields, search, and fieldsets
- core/admin_archiveresults.py: Fix output_html variable name bug in output_summary
- misc/jsonl.py: Update archiveresult_to_jsonl() to include new fields
- plugins/extractor_utils.py: Update ExtractorResult helper class

The embed_path() method already uses output_files and output_str, so the snapshot detail page and template tags work correctly.
2026-04-04 06:47:57 +10:00 · 2025-12-27 20:28:22 +00:00
parent d65eb587d9
commit b632894bc9
5 changed files with 46 additions and 33 deletions
--- a/archivebox/plugins/extractor_utils.py
+++ b/archivebox/plugins/extractor_utils.py
@@ -105,7 +105,7 @@ class ExtractorResult:

        # ... do extraction ...

-        result.output = 'example.com/index.html'
+        result.output_str = 'example.com/index.html'
        result.status = 'succeeded'
        result.finish()

@@ -121,7 +121,7 @@ class ExtractorResult:

        self.cmd: list[str] = []
        self.version: str = ''
-        self.output: str | Path | None = None
+        self.output_str: str = ''  # Human-readable output summary
        self.status: str = 'failed'  # 'succeeded', 'failed', 'skipped'

        self.stdout: str = ''
@@ -174,8 +174,8 @@ class ExtractorResult:
            print(f"VERSION={self.version}")

        # Print output path
-        if self.output:
-            print(f"OUTPUT={self.output}")
+        if self.output_str:
+            print(f"OUTPUT={self.output_str}")

        # Print status
        print(f"STATUS={self.status}")
@@ -192,22 +192,17 @@ class ExtractorResult:
        for hint in self.hints:
            print(f"HINT={hint}", file=sys.stderr)

-        # Print JSON result for structured parsing
+        # Print clean JSONL result for hooks.py to parse
        result_json = {
-            'extractor': self.name,
-            'url': self.url,
-            'snapshot_id': self.snapshot_id,
+            'type': 'ArchiveResult',
            'status': self.status,
-            'start_ts': self.start_ts.isoformat(),
-            'end_ts': self.end_ts.isoformat() if self.end_ts else None,
-            'duration': round(self.duration, 2),
-            'cmd': self.cmd,
-            'cmd_version': self.version,
-            'output': str(self.output) if self.output else None,
-            'returncode': self.returncode,
-            'error': self.error or None,
+            'output_str': self.output_str or self.error or '',
        }
-        print(f"RESULT_JSON={json.dumps(result_json)}")
+        if self.cmd:
+            result_json['cmd'] = self.cmd
+        if self.version:
+            result_json['cmd_version'] = self.version
+        print(json.dumps(result_json))


 def run_shell_command(