diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py index 31235e68..7f4f4f37 100644 --- a/archivebox/api/v1_core.py +++ b/archivebox/api/v1_core.py @@ -69,7 +69,11 @@ class MinimalArchiveResultSchema(Schema): cmd_version: str | None cmd: list[str] | None pwd: str | None - output: str | None + output_str: str + output_json: dict | None + output_files: dict | None + output_size: int + output_mimetypes: str start_ts: datetime | None end_ts: datetime | None @@ -109,12 +113,12 @@ class ArchiveResultSchema(MinimalArchiveResultSchema): class ArchiveResultFilterSchema(FilterSchema): id: Optional[str] = Field(None, q=['id__startswith', 'snapshot__id__startswith', 'snapshot__timestamp__startswith']) - search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'extractor', 'output__icontains', 'id__startswith', 'snapshot__id__startswith', 'snapshot__timestamp__startswith']) + search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'extractor', 'output_str__icontains', 'id__startswith', 'snapshot__id__startswith', 'snapshot__timestamp__startswith']) snapshot_id: Optional[str] = Field(None, q=['snapshot__id__startswith', 'snapshot__timestamp__startswith']) snapshot_url: Optional[str] = Field(None, q='snapshot__url__icontains') snapshot_tag: Optional[str] = Field(None, q='snapshot__tags__name__icontains') status: Optional[str] = Field(None, q='status') - output: Optional[str] = Field(None, q='output__icontains') + output_str: Optional[str] = Field(None, q='output_str__icontains') extractor: Optional[str] = Field(None, q='extractor__icontains') cmd: Optional[str] = Field(None, q='cmd__0__icontains') pwd: Optional[str] = Field(None, q='pwd__icontains') diff --git a/archivebox/cli/archivebox_extract.py b/archivebox/cli/archivebox_extract.py index affea542..7ebdc385 100644 --- a/archivebox/cli/archivebox_extract.py +++ b/archivebox/cli/archivebox_extract.py @@ -59,10 +59,10 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int: archiveresult.refresh_from_db() if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED: - print(f'[green]Extraction succeeded: {archiveresult.output}[/green]') + print(f'[green]Extraction succeeded: {archiveresult.output_str}[/green]') return 0 elif archiveresult.status == ArchiveResult.StatusChoices.FAILED: - print(f'[red]Extraction failed: {archiveresult.output}[/red]', file=sys.stderr) + print(f'[red]Extraction failed: {archiveresult.output_str}[/red]', file=sys.stderr) return 1 else: # Still in progress or backoff - not a failure @@ -202,7 +202,7 @@ def run_plugins( 'failed': 'red', 'skipped': 'yellow', }.get(result.status, 'dim') - rprint(f' [{status_color}]{result.status}[/{status_color}] {result.extractor} → {result.output or ""}', file=sys.stderr) + rprint(f' [{status_color}]{result.status}[/{status_color}] {result.extractor} → {result.output_str or ""}', file=sys.stderr) else: write_record(archiveresult_to_jsonl(result)) except Snapshot.DoesNotExist: diff --git a/archivebox/core/admin_archiveresults.py b/archivebox/core/admin_archiveresults.py index 18b5fadc..749170ab 100644 --- a/archivebox/core/admin_archiveresults.py +++ b/archivebox/core/admin_archiveresults.py @@ -185,9 +185,9 @@ class ArchiveResultInline(admin.TabularInline): parent_model = Snapshot # fk_name = 'snapshot' extra = 0 - sort_fields = ('end_ts', 'extractor', 'output', 'status', 'cmd_version') + sort_fields = ('end_ts', 'extractor', 'output_str', 'status', 'cmd_version') readonly_fields = ('id', 'result_id', 'completed', 'command', 'version') - fields = ('start_ts', 'end_ts', *readonly_fields, 'extractor', 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'retry_at', 'output') + fields = ('start_ts', 'end_ts', *readonly_fields, 'extractor', 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'retry_at', 'output_str') # exclude = ('id',) ordering = ('end_ts',) show_change_link = True @@ -231,7 +231,7 @@ class ArchiveResultInline(admin.TabularInline): formset.form.base_fields['pwd'].initial = str(snapshot.output_dir) formset.form.base_fields['created_by'].initial = request.user formset.form.base_fields['cmd'].initial = '["-"]' - formset.form.base_fields['output'].initial = 'Manually recorded cmd output...' + formset.form.base_fields['output_str'].initial = 'Manually recorded cmd output...' if obj is not None: # hidden values for existing entries and new entries @@ -255,7 +255,7 @@ class ArchiveResultAdmin(BaseModelAdmin): list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor_with_icon', 'cmd_str', 'output_str') sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status') readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon', 'iface') - search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp') + search_fields = ('id', 'snapshot__url', 'extractor', 'output_str', 'cmd_version', 'cmd', 'snapshot__timestamp') autocomplete_fields = ['snapshot'] fieldsets = ( @@ -276,7 +276,7 @@ class ArchiveResultAdmin(BaseModelAdmin): 'classes': ('card',), }), ('Output', { - 'fields': ('output', 'output_summary'), + 'fields': ('output_str', 'output_json', 'output_files', 'output_size', 'output_mimetypes', 'output_summary'), 'classes': ('card', 'wide'), }), ('Metadata', { @@ -370,13 +370,13 @@ class ArchiveResultAdmin(BaseModelAdmin): if depth > 2: continue indent = ' ' * 4 * (depth) - output_str += format_html('{}{}/
', indent, os.path.basename(root)) + output_html += format_html('{}{}/
', indent, os.path.basename(root)) indentation_str = ' ' * 4 * (depth + 1) for filename in sorted(files): is_hidden = filename.startswith('.') - output_str += format_html('{}{}
', int(not is_hidden), indentation_str, filename.strip()) + output_html += format_html('{}{}
', int(not is_hidden), indentation_str, filename.strip()) - return output_str + mark_safe('') + return output_html + mark_safe('') diff --git a/archivebox/misc/jsonl.py b/archivebox/misc/jsonl.py index 11ce6bc6..317de9b4 100644 --- a/archivebox/misc/jsonl.py +++ b/archivebox/misc/jsonl.py @@ -174,16 +174,30 @@ def archiveresult_to_jsonl(result) -> Dict[str, Any]: """ Convert an ArchiveResult model instance to a JSONL record. """ - return { + record = { 'type': TYPE_ARCHIVERESULT, 'id': str(result.id), 'snapshot_id': str(result.snapshot_id), 'extractor': result.extractor, 'status': result.status, - 'output': result.output, + 'output_str': result.output_str, 'start_ts': result.start_ts.isoformat() if result.start_ts else None, 'end_ts': result.end_ts.isoformat() if result.end_ts else None, } + # Include optional fields if set + if result.output_json: + record['output_json'] = result.output_json + if result.output_files: + record['output_files'] = result.output_files + if result.output_size: + record['output_size'] = result.output_size + if result.output_mimetypes: + record['output_mimetypes'] = result.output_mimetypes + if result.cmd: + record['cmd'] = result.cmd + if result.cmd_version: + record['cmd_version'] = result.cmd_version + return record def tag_to_jsonl(tag) -> Dict[str, Any]: diff --git a/archivebox/plugins/extractor_utils.py b/archivebox/plugins/extractor_utils.py index 45755b97..e62cae14 100644 --- a/archivebox/plugins/extractor_utils.py +++ b/archivebox/plugins/extractor_utils.py @@ -105,7 +105,7 @@ class ExtractorResult: # ... do extraction ... - result.output = 'example.com/index.html' + result.output_str = 'example.com/index.html' result.status = 'succeeded' result.finish() @@ -121,7 +121,7 @@ class ExtractorResult: self.cmd: list[str] = [] self.version: str = '' - self.output: str | Path | None = None + self.output_str: str = '' # Human-readable output summary self.status: str = 'failed' # 'succeeded', 'failed', 'skipped' self.stdout: str = '' @@ -174,8 +174,8 @@ class ExtractorResult: print(f"VERSION={self.version}") # Print output path - if self.output: - print(f"OUTPUT={self.output}") + if self.output_str: + print(f"OUTPUT={self.output_str}") # Print status print(f"STATUS={self.status}") @@ -192,22 +192,17 @@ class ExtractorResult: for hint in self.hints: print(f"HINT={hint}", file=sys.stderr) - # Print JSON result for structured parsing + # Print clean JSONL result for hooks.py to parse result_json = { - 'extractor': self.name, - 'url': self.url, - 'snapshot_id': self.snapshot_id, + 'type': 'ArchiveResult', 'status': self.status, - 'start_ts': self.start_ts.isoformat(), - 'end_ts': self.end_ts.isoformat() if self.end_ts else None, - 'duration': round(self.duration, 2), - 'cmd': self.cmd, - 'cmd_version': self.version, - 'output': str(self.output) if self.output else None, - 'returncode': self.returncode, - 'error': self.error or None, + 'output_str': self.output_str or self.error or '', } - print(f"RESULT_JSON={json.dumps(result_json)}") + if self.cmd: + result_json['cmd'] = self.cmd + if self.version: + result_json['cmd_version'] = self.version + print(json.dumps(result_json)) def run_shell_command(