Update views, API, and exports for new ArchiveResult output fields

Replace old `output` field with new fields across the codebase:
- output_str: Human-readable output summary
- output_json: Structured metadata (optional)
- output_files: Dict of output files with metadata
- output_size: Total size in bytes
- output_mimetypes: Comma-separated list of the output files' MIME types

Files updated:
- api/v1_core.py: Update MinimalArchiveResultSchema to expose new fields
- api/v1_core.py: Update ArchiveResultFilterSchema to search output_str
- cli/archivebox_extract.py: Use output_str in CLI output
- core/admin_archiveresults.py: Update admin fields, search, and fieldsets
- core/admin_archiveresults.py: Fix output_html variable name bug in output_summary
- misc/jsonl.py: Update archiveresult_to_jsonl() to include new fields
- plugins/extractor_utils.py: Update ExtractorResult helper class

The embed_path() method already uses output_files and output_str,
so the snapshot detail page and template tags work correctly without further changes.
This commit is contained in:
Claude
2025-12-27 20:28:22 +00:00
parent d65eb587d9
commit b632894bc9
5 changed files with 46 additions and 33 deletions

View File

@@ -105,7 +105,7 @@ class ExtractorResult:
# ... do extraction ...
result.output = 'example.com/index.html'
result.output_str = 'example.com/index.html'
result.status = 'succeeded'
result.finish()
@@ -121,7 +121,7 @@ class ExtractorResult:
self.cmd: list[str] = []
self.version: str = ''
self.output: str | Path | None = None
self.output_str: str = '' # Human-readable output summary
self.status: str = 'failed' # 'succeeded', 'failed', 'skipped'
self.stdout: str = ''
@@ -174,8 +174,8 @@ class ExtractorResult:
print(f"VERSION={self.version}")
# Print output path
if self.output:
print(f"OUTPUT={self.output}")
if self.output_str:
print(f"OUTPUT={self.output_str}")
# Print status
print(f"STATUS={self.status}")
@@ -192,22 +192,17 @@ class ExtractorResult:
for hint in self.hints:
print(f"HINT={hint}", file=sys.stderr)
# Print JSON result for structured parsing
# Print clean JSONL result for hooks.py to parse
result_json = {
'extractor': self.name,
'url': self.url,
'snapshot_id': self.snapshot_id,
'type': 'ArchiveResult',
'status': self.status,
'start_ts': self.start_ts.isoformat(),
'end_ts': self.end_ts.isoformat() if self.end_ts else None,
'duration': round(self.duration, 2),
'cmd': self.cmd,
'cmd_version': self.version,
'output': str(self.output) if self.output else None,
'returncode': self.returncode,
'error': self.error or None,
'output_str': self.output_str or self.error or '',
}
print(f"RESULT_JSON={json.dumps(result_json)}")
if self.cmd:
result_json['cmd'] = self.cmd
if self.version:
result_json['cmd_version'] = self.version
print(json.dumps(result_json))
def run_shell_command(