mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-03 09:25:42 +10:00
Update views, API, and exports for new ArchiveResult output fields
Replace the old `output` field with new fields across the codebase:
- output_str: Human-readable output summary
- output_json: Structured metadata (optional)
- output_files: Dict of output files with metadata
- output_size: Total size in bytes
- output_mimetypes: CSV of file mimetypes

Files updated:
- api/v1_core.py: Update MinimalArchiveResultSchema to expose new fields
- api/v1_core.py: Update ArchiveResultFilterSchema to search output_str
- cli/archivebox_extract.py: Use output_str in CLI output
- core/admin_archiveresults.py: Update admin fields, search, and fieldsets
- core/admin_archiveresults.py: Fix output_html variable name bug in output_summary
- misc/jsonl.py: Update archiveresult_to_jsonl() to include new fields
- plugins/extractor_utils.py: Update ExtractorResult helper class

The embed_path() method already uses output_files and output_str, so the snapshot detail page and template tags work correctly.
This commit is contained in:
@@ -69,7 +69,11 @@ class MinimalArchiveResultSchema(Schema):
|
||||
cmd_version: str | None
|
||||
cmd: list[str] | None
|
||||
pwd: str | None
|
||||
output: str | None
|
||||
output_str: str
|
||||
output_json: dict | None
|
||||
output_files: dict | None
|
||||
output_size: int
|
||||
output_mimetypes: str
|
||||
start_ts: datetime | None
|
||||
end_ts: datetime | None
|
||||
|
||||
@@ -109,12 +113,12 @@ class ArchiveResultSchema(MinimalArchiveResultSchema):
|
||||
|
||||
class ArchiveResultFilterSchema(FilterSchema):
|
||||
id: Optional[str] = Field(None, q=['id__startswith', 'snapshot__id__startswith', 'snapshot__timestamp__startswith'])
|
||||
search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'extractor', 'output__icontains', 'id__startswith', 'snapshot__id__startswith', 'snapshot__timestamp__startswith'])
|
||||
search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'extractor', 'output_str__icontains', 'id__startswith', 'snapshot__id__startswith', 'snapshot__timestamp__startswith'])
|
||||
snapshot_id: Optional[str] = Field(None, q=['snapshot__id__startswith', 'snapshot__timestamp__startswith'])
|
||||
snapshot_url: Optional[str] = Field(None, q='snapshot__url__icontains')
|
||||
snapshot_tag: Optional[str] = Field(None, q='snapshot__tags__name__icontains')
|
||||
status: Optional[str] = Field(None, q='status')
|
||||
output: Optional[str] = Field(None, q='output__icontains')
|
||||
output_str: Optional[str] = Field(None, q='output_str__icontains')
|
||||
extractor: Optional[str] = Field(None, q='extractor__icontains')
|
||||
cmd: Optional[str] = Field(None, q='cmd__0__icontains')
|
||||
pwd: Optional[str] = Field(None, q='pwd__icontains')
|
||||
|
||||
@@ -59,10 +59,10 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
|
||||
archiveresult.refresh_from_db()
|
||||
|
||||
if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
|
||||
print(f'[green]Extraction succeeded: {archiveresult.output}[/green]')
|
||||
print(f'[green]Extraction succeeded: {archiveresult.output_str}[/green]')
|
||||
return 0
|
||||
elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
|
||||
print(f'[red]Extraction failed: {archiveresult.output}[/red]', file=sys.stderr)
|
||||
print(f'[red]Extraction failed: {archiveresult.output_str}[/red]', file=sys.stderr)
|
||||
return 1
|
||||
else:
|
||||
# Still in progress or backoff - not a failure
|
||||
@@ -202,7 +202,7 @@ def run_plugins(
|
||||
'failed': 'red',
|
||||
'skipped': 'yellow',
|
||||
}.get(result.status, 'dim')
|
||||
rprint(f' [{status_color}]{result.status}[/{status_color}] {result.extractor} → {result.output or ""}', file=sys.stderr)
|
||||
rprint(f' [{status_color}]{result.status}[/{status_color}] {result.extractor} → {result.output_str or ""}', file=sys.stderr)
|
||||
else:
|
||||
write_record(archiveresult_to_jsonl(result))
|
||||
except Snapshot.DoesNotExist:
|
||||
|
||||
@@ -185,9 +185,9 @@ class ArchiveResultInline(admin.TabularInline):
|
||||
parent_model = Snapshot
|
||||
# fk_name = 'snapshot'
|
||||
extra = 0
|
||||
sort_fields = ('end_ts', 'extractor', 'output', 'status', 'cmd_version')
|
||||
sort_fields = ('end_ts', 'extractor', 'output_str', 'status', 'cmd_version')
|
||||
readonly_fields = ('id', 'result_id', 'completed', 'command', 'version')
|
||||
fields = ('start_ts', 'end_ts', *readonly_fields, 'extractor', 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'retry_at', 'output')
|
||||
fields = ('start_ts', 'end_ts', *readonly_fields, 'extractor', 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'retry_at', 'output_str')
|
||||
# exclude = ('id',)
|
||||
ordering = ('end_ts',)
|
||||
show_change_link = True
|
||||
@@ -231,7 +231,7 @@ class ArchiveResultInline(admin.TabularInline):
|
||||
formset.form.base_fields['pwd'].initial = str(snapshot.output_dir)
|
||||
formset.form.base_fields['created_by'].initial = request.user
|
||||
formset.form.base_fields['cmd'].initial = '["-"]'
|
||||
formset.form.base_fields['output'].initial = 'Manually recorded cmd output...'
|
||||
formset.form.base_fields['output_str'].initial = 'Manually recorded cmd output...'
|
||||
|
||||
if obj is not None:
|
||||
# hidden values for existing entries and new entries
|
||||
@@ -255,7 +255,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor_with_icon', 'cmd_str', 'output_str')
|
||||
sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status')
|
||||
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon', 'iface')
|
||||
search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
|
||||
search_fields = ('id', 'snapshot__url', 'extractor', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp')
|
||||
autocomplete_fields = ['snapshot']
|
||||
|
||||
fieldsets = (
|
||||
@@ -276,7 +276,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Output', {
|
||||
'fields': ('output', 'output_summary'),
|
||||
'fields': ('output_str', 'output_json', 'output_files', 'output_size', 'output_mimetypes', 'output_summary'),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Metadata', {
|
||||
@@ -370,13 +370,13 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
if depth > 2:
|
||||
continue
|
||||
indent = ' ' * 4 * (depth)
|
||||
output_str += format_html('<b style="padding: 1px">{}{}/</b><br/>', indent, os.path.basename(root))
|
||||
output_html += format_html('<b style="padding: 1px">{}{}/</b><br/>', indent, os.path.basename(root))
|
||||
indentation_str = ' ' * 4 * (depth + 1)
|
||||
for filename in sorted(files):
|
||||
is_hidden = filename.startswith('.')
|
||||
output_str += format_html('<span style="opacity: {}.2">{}{}</span><br/>', int(not is_hidden), indentation_str, filename.strip())
|
||||
output_html += format_html('<span style="opacity: {}.2">{}{}</span><br/>', int(not is_hidden), indentation_str, filename.strip())
|
||||
|
||||
return output_str + mark_safe('</code></pre>')
|
||||
return output_html + mark_safe('</code></pre>')
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -174,16 +174,30 @@ def archiveresult_to_jsonl(result) -> Dict[str, Any]:
|
||||
"""
|
||||
Convert an ArchiveResult model instance to a JSONL record.
|
||||
"""
|
||||
return {
|
||||
record = {
|
||||
'type': TYPE_ARCHIVERESULT,
|
||||
'id': str(result.id),
|
||||
'snapshot_id': str(result.snapshot_id),
|
||||
'extractor': result.extractor,
|
||||
'status': result.status,
|
||||
'output': result.output,
|
||||
'output_str': result.output_str,
|
||||
'start_ts': result.start_ts.isoformat() if result.start_ts else None,
|
||||
'end_ts': result.end_ts.isoformat() if result.end_ts else None,
|
||||
}
|
||||
# Include optional fields if set
|
||||
if result.output_json:
|
||||
record['output_json'] = result.output_json
|
||||
if result.output_files:
|
||||
record['output_files'] = result.output_files
|
||||
if result.output_size:
|
||||
record['output_size'] = result.output_size
|
||||
if result.output_mimetypes:
|
||||
record['output_mimetypes'] = result.output_mimetypes
|
||||
if result.cmd:
|
||||
record['cmd'] = result.cmd
|
||||
if result.cmd_version:
|
||||
record['cmd_version'] = result.cmd_version
|
||||
return record
|
||||
|
||||
|
||||
def tag_to_jsonl(tag) -> Dict[str, Any]:
|
||||
|
||||
@@ -105,7 +105,7 @@ class ExtractorResult:
|
||||
|
||||
# ... do extraction ...
|
||||
|
||||
result.output = 'example.com/index.html'
|
||||
result.output_str = 'example.com/index.html'
|
||||
result.status = 'succeeded'
|
||||
result.finish()
|
||||
|
||||
@@ -121,7 +121,7 @@ class ExtractorResult:
|
||||
|
||||
self.cmd: list[str] = []
|
||||
self.version: str = ''
|
||||
self.output: str | Path | None = None
|
||||
self.output_str: str = '' # Human-readable output summary
|
||||
self.status: str = 'failed' # 'succeeded', 'failed', 'skipped'
|
||||
|
||||
self.stdout: str = ''
|
||||
@@ -174,8 +174,8 @@ class ExtractorResult:
|
||||
print(f"VERSION={self.version}")
|
||||
|
||||
# Print output path
|
||||
if self.output:
|
||||
print(f"OUTPUT={self.output}")
|
||||
if self.output_str:
|
||||
print(f"OUTPUT={self.output_str}")
|
||||
|
||||
# Print status
|
||||
print(f"STATUS={self.status}")
|
||||
@@ -192,22 +192,17 @@ class ExtractorResult:
|
||||
for hint in self.hints:
|
||||
print(f"HINT={hint}", file=sys.stderr)
|
||||
|
||||
# Print JSON result for structured parsing
|
||||
# Print clean JSONL result for hooks.py to parse
|
||||
result_json = {
|
||||
'extractor': self.name,
|
||||
'url': self.url,
|
||||
'snapshot_id': self.snapshot_id,
|
||||
'type': 'ArchiveResult',
|
||||
'status': self.status,
|
||||
'start_ts': self.start_ts.isoformat(),
|
||||
'end_ts': self.end_ts.isoformat() if self.end_ts else None,
|
||||
'duration': round(self.duration, 2),
|
||||
'cmd': self.cmd,
|
||||
'cmd_version': self.version,
|
||||
'output': str(self.output) if self.output else None,
|
||||
'returncode': self.returncode,
|
||||
'error': self.error or None,
|
||||
'output_str': self.output_str or self.error or '',
|
||||
}
|
||||
print(f"RESULT_JSON={json.dumps(result_json)}")
|
||||
if self.cmd:
|
||||
result_json['cmd'] = self.cmd
|
||||
if self.version:
|
||||
result_json['cmd_version'] = self.version
|
||||
print(json.dumps(result_json))
|
||||
|
||||
|
||||
def run_shell_command(
|
||||
|
||||
Reference in New Issue
Block a user