From 3d985fa8c88c46e9c16a112f0dba2e8bc21acaac Mon Sep 17 00:00:00 2001
From: Claude
Date: Sat, 27 Dec 2025 08:38:49 +0000
Subject: [PATCH] Implement hook architecture with JSONL output support

Phase 1: Database migration for new ArchiveResult fields
- Add output_str (TextField) for human-readable summary
- Add output_json (JSONField) for structured metadata
- Add output_files (JSONField) for dict of {relative_path: {}}
- Add output_size (BigIntegerField) for total bytes
- Add output_mimetypes (CharField) for CSV of mimetypes
- Add binary FK to InstalledBinary (optional)
- Migrate existing 'output' field to new split fields

Phase 3: Update run_hook() for JSONL parsing
- Support new JSONL format (any line with {type: 'ModelName', ...})
- Maintain backwards compatibility with RESULT_JSON= format
- Add plugin metadata to each parsed record
- Detect background hooks with .bg. suffix in filename
- Add find_binary_for_cmd() helper function
- Add create_model_record() for processing side-effect records

Phase 6: Update ArchiveResult.run()
- Handle background hooks (return immediately when result is None)
- Process 'records' from HookResult for side-effect models
- Use new output fields (output_str, output_json, output_files, etc.)
- Call create_model_record() for InstalledBinary, Machine updates

Phase 7: Add background hook support
- Add is_background_hook() method to ArchiveResult
- Add check_background_completed() to check if process exited
- Add finalize_background_hook() to collect results from completed hooks
- Update SnapshotMachine.is_finished() to check/finalize background hooks
- Update _populate_output_fields() to walk directory and populate stats

Also updated references to old 'output' field in:
- admin_archiveresults.py
- statemachines.py
- templatetags/core_tags.py
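
For reference, a hook emits one JSON object per stdout line. An illustrative
pair of records (the values are examples only, but the field names are the
ones run_hook() and create_model_record() consume):

    {"type": "ArchiveResult", "status": "succeeded", "output_str": "Downloaded 5 files", "cmd": ["/usr/bin/wget", "-p", "https://example.com"], "cmd_version": "1.21"}
    {"type": "InstalledBinary", "name": "wget", "abspath": "/usr/bin/wget", "version": "1.21", "binprovider": "env"}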
---
 archivebox/core/admin_archiveresults.py       |  31 +-
 .../0029_archiveresult_hook_fields.py         |  80 +++++
 .../migrations/0030_migrate_output_field.py   |  64 ++++
 archivebox/core/models.py                     | 330 ++++++++++++++++--
 archivebox/core/statemachines.py              |  21 +-
 archivebox/core/templatetags/core_tags.py     |   6 +-
 archivebox/hooks.py                           | 156 ++++++++-
 7 files changed, 633 insertions(+), 55 deletions(-)
 create mode 100644 archivebox/core/migrations/0029_archiveresult_hook_fields.py
 create mode 100644 archivebox/core/migrations/0030_migrate_output_field.py

diff --git a/archivebox/core/admin_archiveresults.py b/archivebox/core/admin_archiveresults.py
index f525b84f..18b5fadc 100644
--- a/archivebox/core/admin_archiveresults.py
+++ b/archivebox/core/admin_archiveresults.py
@@ -47,7 +47,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
         end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-'
 
         # Truncate output for display
-        full_output = result.output or '-'
+        full_output = result.output_str or '-'
         output_display = full_output[:60]
         if len(full_output) > 60:
             output_display += '...'
@@ -55,8 +55,9 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
         # Get full command as tooltip
         cmd_str = ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd or '-')
 
-        # Build output link
-        output_link = f'/archive/{result.snapshot.timestamp}/{result.output}' if result.output and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/'
+        # Build output link - use embed_path() which checks output_files first
+        embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
+        output_link = f'/archive/{result.snapshot.timestamp}/{embed_path}' if embed_path and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/'
 
         # Get version - try cmd_version field
         version = result.cmd_version if result.cmd_version else '-'
@@ -336,27 +337,29 @@ class ArchiveResultAdmin(BaseModelAdmin):
             ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
         )
 
-    def output_str(self, result):
-        # Determine output link path - use output if file exists, otherwise link to index
-        output_path = result.output if (result.status == 'succeeded' and result.output) else 'index.html'
+    def output_display(self, result):
+        # Determine output link path - use embed_path() which checks output_files
+        embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
+        output_path = embed_path if (result.status == 'succeeded' and embed_path) else 'index.html'
         return format_html(
             '↗️<a href="/archive/{}/{}">{}</a>',
             result.snapshot.timestamp,
             output_path,
-            result.output,
+            result.output_str,
         )
 
    def output_summary(self, result):
         snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1]
-        output_str = format_html(
+        output_html = format_html(
             '<pre>{}</pre><br/>',
-            result.output,
+            result.output_str,
         )
-        output_str += format_html('See result files ...<br/>', str(result.snapshot.timestamp))
-        path_from_output_str = (snapshot_dir / (result.output or ''))
-        output_str += format_html('{}/{}<br/><br/>', str(snapshot_dir), str(result.output))
-        if os.access(path_from_output_str, os.R_OK):
-            root_dir = str(path_from_output_str)
+        output_html += format_html('See result files ...<br/>', str(result.snapshot.timestamp))
+        embed_path = result.embed_path() if hasattr(result, 'embed_path') else ''
+        path_from_embed = (snapshot_dir / (embed_path or ''))
+        output_html += format_html('{}/{}<br/><br/>', str(snapshot_dir), str(embed_path))
+        if os.access(path_from_embed, os.R_OK):
+            root_dir = str(path_from_embed)
         else:
             root_dir = str(snapshot_dir)
 
diff --git a/archivebox/core/migrations/0029_archiveresult_hook_fields.py b/archivebox/core/migrations/0029_archiveresult_hook_fields.py
new file mode 100644
index 00000000..0ff1f0c2
--- /dev/null
+++ b/archivebox/core/migrations/0029_archiveresult_hook_fields.py
@@ -0,0 +1,80 @@
+# Generated by Django for hook architecture support
+# Phase 1: Add new ArchiveResult fields for hook output
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0028_snapshot_fs_version'),
+        ('machine', '0002_rename_custom_cmds_to_overrides'),
+    ]
+
+    operations = [
+        # Add new output fields (keep old 'output' temporarily for migration)
+        migrations.AddField(
+            model_name='archiveresult',
+            name='output_str',
+            field=models.TextField(
+                blank=True,
+                default='',
+                help_text='Human-readable output summary (e.g., "Downloaded 5 files")'
+            ),
+        ),
+        migrations.AddField(
+            model_name='archiveresult',
+            name='output_json',
+            field=models.JSONField(
+                null=True,
+                blank=True,
+                default=None,
+                help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields'
+            ),
+        ),
+        migrations.AddField(
+            model_name='archiveresult',
+            name='output_files',
+            field=models.JSONField(
+                default=dict,
+                help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata'
+            ),
+        ),
+        migrations.AddField(
+            model_name='archiveresult',
+            name='output_size',
+            field=models.BigIntegerField(
+                default=0,
+                help_text='Total recursive size in bytes of all output files'
+            ),
+        ),
+        migrations.AddField(
+            model_name='archiveresult',
+            name='output_mimetypes',
+            field=models.CharField(
+                max_length=512,
+                blank=True,
+                default='',
+                help_text='CSV of mimetypes sorted by size descending'
+            ),
+        ),
+        # Add binary FK (optional)
+        migrations.AddField(
+            model_name='archiveresult',
+            name='binary',
+            field=models.ForeignKey(
+                'machine.InstalledBinary',
+                on_delete=django.db.models.deletion.SET_NULL,
+                null=True,
+                blank=True,
+                related_name='archiveresults',
+                help_text='Primary binary used by this hook (optional)'
+            ),
+        ),
+    ]
diff --git a/archivebox/core/migrations/0030_migrate_output_field.py b/archivebox/core/migrations/0030_migrate_output_field.py
new file mode 100644
index 00000000..5dafb7e8
--- /dev/null
+++ b/archivebox/core/migrations/0030_migrate_output_field.py
@@ -0,0 +1,64 @@
+# Generated by Django for hook architecture support
+# Phase 1: Migrate existing 'output' field to new split fields
+
+from django.db import migrations
+import json
+
+
+def migrate_output_field(apps, schema_editor):
+    """
+    Migrate existing 'output' field to new split fields.
+
+    Logic:
+    - If output contains JSON {...}, move to output_json
+    - Otherwise, move to output_str
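+
+    Example (illustrative values):
+    - output = '{"status": "succeeded"}'  ->  output_json = {'status': 'succeeded'}
+    - output = 'wget/example.com/index.html'  ->  output_str = 'wget/example.com/index.html'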
+    """
+    ArchiveResult = apps.get_model('core', 'ArchiveResult')
+
+    for ar in ArchiveResult.objects.all().iterator():
+        old_output = ar.output or ''
+
+        # Case 1: JSON output
+        if old_output.strip().startswith('{'):
+            try:
+                parsed = json.loads(old_output)
+                ar.output_json = parsed
+                ar.output_str = ''
+            except json.JSONDecodeError:
+                # Not valid JSON, treat as string
+                ar.output_str = old_output
+
+        # Case 2: File path or plain string
+        else:
+            ar.output_str = old_output
+
+        ar.save(update_fields=['output_str', 'output_json'])
+
+
+def reverse_migrate(apps, schema_editor):
+    """Reverse migration - copy output_str back to output."""
+    ArchiveResult = apps.get_model('core', 'ArchiveResult')
+
+    for ar in ArchiveResult.objects.all().iterator():
+        if ar.output_json:
+            ar.output = json.dumps(ar.output_json)
+        else:
+            ar.output = ar.output_str or ''
+        ar.save(update_fields=['output'])
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0029_archiveresult_hook_fields'),
+    ]
+
+    operations = [
+        migrations.RunPython(migrate_output_field, reverse_migrate),
+
+        # Now safe to remove old 'output' field
+        migrations.RemoveField(
+            model_name='archiveresult',
+            name='output',
+        ),
+    ]
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 6bac5679..1e5dcc0f 100755
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -36,7 +36,7 @@ from archivebox.base_models.models import (
 from workers.models import ModelWithStateMachine
 from workers.tasks import bg_archive_snapshot
 from crawls.models import Crawl
-from machine.models import NetworkInterface
+from machine.models import NetworkInterface, InstalledBinary
@@ -485,9 +485,13 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         def calc_icons():
             if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
-                archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output}
+                archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and (r.output_files or r.output_str)}
             else:
-                archive_results = {r.extractor: r for r in self.archiveresult_set.filter(status="succeeded", output__isnull=False)}
+                # Filter for results that have either output_files or output_str
+                from django.db.models import Q
+                archive_results = {r.extractor: r for r in self.archiveresult_set.filter(
+                    Q(status="succeeded") & (Q(output_files__isnull=False) | ~Q(output_str=''))
+                )}
 
             path = self.archive_path
             canon = self.canonical_outputs()
@@ -499,7 +503,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
             for extractor in all_extractors:
                 result = archive_results.get(extractor)
-                existing = result and result.status == 'succeeded' and result.output
+                existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
                 icon = get_extractor_icon(extractor)
                 output += format_html(
                     output_template,
@@ -825,17 +829,24 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         # Scan each ArchiveResult's output directory for the best file
         snap_dir = Path(self.output_dir)
         for result in self.archiveresult_set.filter(status='succeeded'):
-            if not result.output:
+            if not result.output_files and not result.output_str:
                 continue
 
             # Try to find the best output file for this extractor
             extractor_dir = snap_dir / result.extractor
             best_output = None
 
-            if result.output and (snap_dir / result.output).exists():
-                # Use the explicit output path if it exists
-                best_output = result.output
-            elif extractor_dir.exists():
+            # Check output_files first (new field)
+            if result.output_files:
+                first_file = next(iter(result.output_files.keys()), None)
+                if first_file and (extractor_dir / first_file).exists():
+                    best_output = f'{result.extractor}/{first_file}'
+
+            # Fallback to output_str if it looks like a path
+            if not best_output and result.output_str and (snap_dir / result.output_str).exists():
+                best_output = result.output_str
+
+            if not best_output and extractor_dir.exists():
                 # Intelligently find the best file in the extractor's directory
                 best_output = find_best_output_in_dir(extractor_dir, result.extractor)
@@ -873,14 +884,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
     def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
         """Get the latest output that each archive method produced"""
         from archivebox.hooks import get_extractors
+        from django.db.models import Q
 
         latest: Dict[str, Any] = {}
         for archive_method in get_extractors():
             results = self.archiveresult_set.filter(extractor=archive_method)
             if status is not None:
                 results = results.filter(status=status)
-            results = results.filter(output__isnull=False).order_by('-start_ts')
-            latest[archive_method] = results.first().output if results.exists() else None
+            # Filter for results with output_files or output_str
+            results = results.filter(Q(output_files__isnull=False) | ~Q(output_str='')).order_by('-start_ts')
+            result = results.first()
+            # Return embed_path() for backwards compatibility
+            latest[archive_method] = result.embed_path() if result else None
         return latest
 
     # =========================================================================
@@ -1021,7 +1036,23 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
     pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
     cmd = models.JSONField(default=None, null=True, blank=True)
     cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
-    output = models.CharField(max_length=1024, default=None, null=True, blank=True)
+
+    # New output fields (replacing old 'output' field)
+    output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary')
+    output_json = models.JSONField(null=True, blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)')
+    output_files = models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}')
+    output_size = models.BigIntegerField(default=0, help_text='Total bytes of all output files')
+    output_mimetypes = models.CharField(max_length=512, blank=True, default='', help_text='CSV of mimetypes sorted by size')
+
+    # Binary FK (optional - set when hook reports cmd)
+    binary = models.ForeignKey(
+        'machine.InstalledBinary',
+        on_delete=models.SET_NULL,
+        null=True, blank=True,
+        related_name='archiveresults',
+        help_text='Primary binary used by this hook'
+    )
 
     start_ts = models.DateTimeField(default=None, null=True, blank=True)
     end_ts = models.DateTimeField(default=None, null=True, blank=True)
@@ -1094,11 +1125,19 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
         """
         Get the relative path to the embeddable output file for this result.
 
-        Returns the output field if set and file exists, otherwise tries to
+        Returns the first file from output_files if set, otherwise tries to
         find a reasonable default based on the extractor type.
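+
+        Example (illustrative): with extractor='wget' and
+        output_files={'index.html': {}}, this returns 'wget/index.html'.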
         """
-        if self.output:
-            return self.output
+        # Check output_files dict for primary output
+        if self.output_files:
+            # Return first file from output_files (dict preserves insertion order)
+            first_file = next(iter(self.output_files.keys()), None)
+            if first_file:
+                return f'{self.extractor}/{first_file}'
+
+        # Fallback: check output_str if it looks like a file path
+        if self.output_str and ('/' in self.output_str or '.' in self.output_str):
+            return self.output_str
 
         # Try to find output file based on extractor's canonical output path
         canonical = self.snapshot.canonical_outputs()
@@ -1149,7 +1188,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
         if not hook:
             self.status = self.StatusChoices.FAILED
-            self.output = f'No hook found for: {self.extractor}'
+            self.output_str = f'No hook found for: {self.extractor}'
             self.retry_at = None
             self.save()
             return
@@ -1167,8 +1206,20 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
             url=self.snapshot.url,
             snapshot_id=str(self.snapshot.id),
         )
+
+        # BACKGROUND HOOK - still running, return immediately
+        if result is None:
+            self.status = self.StatusChoices.STARTED
+            self.start_ts = start_ts
+            self.pwd = str(extractor_dir)
+            self.save()
+            return
+
         end_ts = timezone.now()
 
+        # Get records from hook output (new JSONL format)
+        records = result.get('records', [])
+
         # Clean up empty output directory if no files were created
         output_files = result.get('output_files', [])
         if not output_files and extractor_dir.exists():
@@ -1179,14 +1230,17 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
             except (OSError, RuntimeError):
                 pass  # Directory not empty or can't be removed, that's fine
 
-        # Determine status from return code and JSON output
+        # Find the ArchiveResult record from hook output (if any)
+        ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
         output_json = result.get('output_json') or {}
-        json_status = output_json.get('status')
-        if json_status == 'skipped':
-            status = 'skipped'
-        elif json_status == 'failed':
-            status = 'failed'
+
+        # Determine status from records, output_json, or return code
+        if ar_records:
+            # Use status from first ArchiveResult record
+            hook_data = ar_records[0]
+            status = hook_data.get('status', 'failed')
+        elif output_json.get('status'):
+            status = output_json['status']
         elif result['returncode'] == 0:
             status = 'succeeded'
         else:
             status = 'failed'
@@ -1199,20 +1253,45 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
             'skipped': self.StatusChoices.SKIPPED,
         }
         self.status = status_map.get(status, self.StatusChoices.FAILED)
-        self.output = output_json.get('output') or result['stdout'][:1024] or result['stderr'][:1024] or None
+
+        # Set output fields from records or output_json
+        if ar_records:
+            hook_data = ar_records[0]
+            self.output_str = hook_data.get('output_str') or hook_data.get('output') or ''
+            self.output_json = hook_data.get('output_json')
+            # Set cmd from JSONL record
+            if hook_data.get('cmd'):
+                self.cmd = hook_data['cmd']
+                self._set_binary_from_cmd(hook_data['cmd'])
+            if hook_data.get('cmd_version'):
+                self.cmd_version = hook_data['cmd_version'][:128]
+        else:
+            # Fallback to legacy output_json format
+            self.output_str = output_json.get('output_str') or output_json.get('output') or result['stdout'][:1024] or result['stderr'][:1024] or ''
+            self.output_json = output_json.get('output_json') if output_json.get('output_json') else None
+            if output_json.get('cmd_version'):
+                self.cmd_version = output_json['cmd_version'][:128]
+            if output_json.get('cmd'):
+                self.cmd = output_json['cmd']
+                self._set_binary_from_cmd(output_json['cmd'])
+
         self.start_ts = start_ts
         self.end_ts = end_ts
         self.retry_at = None
         self.pwd = str(extractor_dir)
 
-        # Save cmd and cmd_version from extractor output
-        if output_json.get('cmd_version'):
-            self.cmd_version = output_json['cmd_version'][:128]  # Max length from model
-        if output_json.get('cmd'):
-            self.cmd = output_json['cmd']
+        # Populate output_files, output_size, output_mimetypes from filesystem
+        if extractor_dir.exists():
+            self._populate_output_fields(extractor_dir)
 
         self.save()
 
+        # Process side-effect records (InstalledBinary, Machine config, etc.)
+        from archivebox.hooks import create_model_record
+        for record in records:
+            if record.get('type') != 'ArchiveResult':
+                create_model_record(record.copy())  # Copy to avoid mutating original
+
         # Queue any discovered URLs for crawling (parser extractors write urls.jsonl)
         self._queue_urls_for_crawl(extractor_dir)
@@ -1226,6 +1305,84 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
         if self.status == self.StatusChoices.SUCCEEDED:
             self.trigger_search_indexing()
 
+    def _populate_output_fields(self, output_dir: Path) -> None:
+        """
+        Walk output directory and populate output_files, output_size, output_mimetypes.
+        """
+        import mimetypes
+        from collections import defaultdict
+
+        exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
+
+        # Track mimetypes and sizes for aggregation
+        mime_sizes = defaultdict(int)
+        total_size = 0
+        output_files = {}  # Dict keyed by relative path
+
+        for file_path in output_dir.rglob('*'):
+            # Skip non-files and infrastructure files
+            if not file_path.is_file():
+                continue
+            if file_path.name in exclude_names:
+                continue
+
+            # Get file stats
+            try:
+                stat = file_path.stat()
+                mime_type, _ = mimetypes.guess_type(str(file_path))
+                mime_type = mime_type or 'application/octet-stream'
+
+                # Track for ArchiveResult fields
+                relative_path = str(file_path.relative_to(output_dir))
+                output_files[relative_path] = {}  # Empty dict, extensible for future metadata
+                mime_sizes[mime_type] += stat.st_size
+                total_size += stat.st_size
+            except (OSError, IOError):
+                continue
+
+        # Populate ArchiveResult fields
+        self.output_files = output_files
+        self.output_size = total_size
+
+        # Build output_mimetypes CSV (sorted by size descending)
+        sorted_mimes = sorted(mime_sizes.items(), key=lambda x: x[1], reverse=True)
+        self.output_mimetypes = ','.join(mime for mime, _ in sorted_mimes)
+
+    def _set_binary_from_cmd(self, cmd: list) -> None:
+        """
+        Find InstalledBinary for command and set binary FK.
+
+        Tries matching by absolute path first, then by binary name.
+        Only matches binaries on the current machine.
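+
+        Example (illustrative): cmd=['/usr/bin/wget', '-p', 'https://example.com']
+        matches InstalledBinary by abspath='/usr/bin/wget' first, then by name='wget'.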
+ """ + if not cmd: + return + + from machine.models import Machine + + bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd + machine = Machine.current() + + # Try matching by absolute path first + binary = InstalledBinary.objects.filter( + abspath=bin_path_or_name, + machine=machine + ).first() + + if binary: + self.binary = binary + return + + # Fallback: match by binary name + bin_name = Path(bin_path_or_name).name + binary = InstalledBinary.objects.filter( + name=bin_name, + machine=machine + ).first() + + if binary: + self.binary = binary + def _update_snapshot_title(self, extractor_dir: Path): """ Update snapshot title from title extractor output. @@ -1325,3 +1482,120 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi def output_dir(self) -> Path: """Get the output directory for this extractor's results.""" return Path(self.snapshot.output_dir) / self.extractor + + def is_background_hook(self) -> bool: + """Check if this ArchiveResult is for a background hook.""" + extractor_dir = Path(self.pwd) if self.pwd else None + if not extractor_dir: + return False + pid_file = extractor_dir / 'hook.pid' + return pid_file.exists() + + def check_background_completed(self) -> bool: + """ + Check if background hook process has exited. + + Returns: + True if completed (process exited), False if still running + """ + extractor_dir = Path(self.pwd) if self.pwd else None + if not extractor_dir: + return True # No pwd = completed or failed to start + + pid_file = extractor_dir / 'hook.pid' + if not pid_file.exists(): + return True # No PID file = completed or failed to start + + try: + pid = int(pid_file.read_text().strip()) + os.kill(pid, 0) # Signal 0 = check if process exists + return False # Still running + except (OSError, ValueError): + return True # Process exited or invalid PID + + def finalize_background_hook(self) -> None: + """ + Collect final results from completed background hook. + + Same logic as run() but for background hooks that already started. 
+ """ + from archivebox.hooks import create_model_record + + extractor_dir = Path(self.pwd) if self.pwd else None + if not extractor_dir or not extractor_dir.exists(): + self.status = self.StatusChoices.FAILED + self.output_str = 'Background hook output directory not found' + self.end_ts = timezone.now() + self.retry_at = None + self.save() + return + + stdout_file = extractor_dir / 'stdout.log' + stderr_file = extractor_dir / 'stderr.log' + + # Read logs + stdout = stdout_file.read_text() if stdout_file.exists() else '' + + # Parse JSONL output + records = [] + for line in stdout.splitlines(): + line = line.strip() + if not line or not line.startswith('{'): + continue + try: + data = json.loads(line) + if 'type' in data: + records.append(data) + except json.JSONDecodeError: + continue + + # Find the ArchiveResult record + ar_records = [r for r in records if r.get('type') == 'ArchiveResult'] + + if ar_records: + hook_data = ar_records[0] + + # Apply hook's data + status_str = hook_data.get('status', 'failed') + status_map = { + 'succeeded': self.StatusChoices.SUCCEEDED, + 'failed': self.StatusChoices.FAILED, + 'skipped': self.StatusChoices.SKIPPED, + } + self.status = status_map.get(status_str, self.StatusChoices.FAILED) + + self.output_str = hook_data.get('output_str') or hook_data.get('output') or '' + self.output_json = hook_data.get('output_json') + + # Determine binary FK from cmd + if hook_data.get('cmd'): + self.cmd = hook_data['cmd'] + self._set_binary_from_cmd(hook_data['cmd']) + if hook_data.get('cmd_version'): + self.cmd_version = hook_data['cmd_version'][:128] + else: + # No output = failed + self.status = self.StatusChoices.FAILED + self.output_str = 'Background hook did not output ArchiveResult' + + self.end_ts = timezone.now() + self.retry_at = None + + # Populate output fields from filesystem + if extractor_dir.exists(): + self._populate_output_fields(extractor_dir) + + self.save() + + # Create any side-effect records + for record in records: + if record.get('type') != 'ArchiveResult': + create_model_record(record.copy()) + + # Cleanup PID files and empty logs + pid_file = extractor_dir / 'hook.pid' + pid_file.unlink(missing_ok=True) + if stdout_file.exists() and stdout_file.stat().st_size == 0: + stdout_file.unlink() + if stderr_file.exists() and stderr_file.stat().st_size == 0: + stderr_file.unlink() diff --git a/archivebox/core/statemachines.py b/archivebox/core/statemachines.py index 610f6fe0..9f277a5c 100644 --- a/archivebox/core/statemachines.py +++ b/archivebox/core/statemachines.py @@ -59,11 +59,22 @@ class SnapshotMachine(StateMachine, strict_states=True): # if no archiveresults exist yet, it's not finished if not self.snapshot.archiveresult_set.exists(): return False - + # if archiveresults exist but are still pending, it's not finished if self.snapshot.pending_archiveresults().exists(): return False - + + # Check for background hooks that are still running + started_results = self.snapshot.archiveresult_set.filter( + status=ArchiveResult.StatusChoices.STARTED + ) + for result in started_results: + if not result.check_background_completed(): + return False # Still running + + # Completed - finalize it + result.finalize_background_hook() + # otherwise archiveresults exist and are all finished, so it's finished return True @@ -184,10 +195,10 @@ class ArchiveResultMachine(StateMachine, strict_states=True): def is_backoff(self) -> bool: """Check if we should backoff and retry later.""" - # Backoff if status is still started (extractor didn't complete) and output is 
None + # Backoff if status is still started (extractor didn't complete) and output_str is empty return ( - self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and - self.archiveresult.output is None + self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and + not self.archiveresult.output_str ) def is_finished(self) -> bool: diff --git a/archivebox/core/templatetags/core_tags.py b/archivebox/core/templatetags/core_tags.py index b2c126cd..33a620c0 100644 --- a/archivebox/core/templatetags/core_tags.py +++ b/archivebox/core/templatetags/core_tags.py @@ -80,7 +80,7 @@ def extractor_thumbnail(context, result) -> str: return '' # Use embed_path() for the display path (includes canonical paths) - output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '') + output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '') # Create a mini template and render it with context try: @@ -109,7 +109,7 @@ def extractor_embed(context, result) -> str: if not template_str: return '' - output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '') + output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '') try: tpl = template.Template(template_str) @@ -137,7 +137,7 @@ def extractor_fullscreen(context, result) -> str: if not template_str: return '' - output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '') + output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '') try: tpl = template.Template(template_str) diff --git a/archivebox/hooks.py b/archivebox/hooks.py index 7bbbe66e..7ac15d65 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -68,6 +68,8 @@ class HookResult(TypedDict, total=False): output_files: List[str] duration_ms: int hook: str + # New fields for JSONL parsing + records: List[Dict[str, Any]] # Parsed JSONL records with 'type' field def discover_hooks(event_name: str) -> List[Path]: @@ -268,7 +270,9 @@ def run_hook( files_before = set(output_dir.rglob('*')) if output_dir.exists() else set() # Detect if this is a background hook (long-running daemon) - is_background = '__background' in script.stem + # New convention: .bg. suffix (e.g., on_Snapshot__21_consolelog.bg.js) + # Old convention: __background in stem (for backwards compatibility) + is_background = '.bg.' 
 
     # Set up output files for ALL hooks (useful for debugging)
     stdout_file = output_dir / 'stdout.log'
@@ -322,13 +326,44 @@ def run_hook(
         # Exclude the log files themselves from new_files
         new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid')]
 
-        # Parse RESULT_JSON from stdout
+        # Parse JSONL output from stdout
+        # Supports both new JSONL format (any line starting with { that has 'type')
+        # and legacy RESULT_JSON= format for backwards compatibility
         output_json = None
+        records = []
+        plugin_name = script.parent.name  # Plugin directory name (e.g., 'wget')
+
         for line in stdout.splitlines():
-            if line.startswith('RESULT_JSON='):
+            line = line.strip()
+            if not line:
+                continue
+
+            # New JSONL format: any line starting with { that has 'type' field
+            if line.startswith('{'):
                 try:
-                    output_json = json.loads(line[len('RESULT_JSON='):])
-                    break
+                    data = json.loads(line)
+                    if 'type' in data:
+                        # Add plugin metadata to every record
+                        data['plugin'] = plugin_name
+                        data['plugin_hook'] = str(script)
+                        records.append(data)
+                        # For backwards compatibility, also set output_json for first ArchiveResult
+                        if data.get('type') == 'ArchiveResult' and output_json is None:
+                            output_json = data
+                except json.JSONDecodeError:
+                    pass
+
+            # Legacy format: RESULT_JSON=...
+            elif line.startswith('RESULT_JSON='):
+                try:
+                    data = json.loads(line[len('RESULT_JSON='):])
+                    if output_json is None:
+                        output_json = data
+                    # Convert legacy format to new format
+                    data['type'] = 'ArchiveResult'
+                    data['plugin'] = plugin_name
+                    data['plugin_hook'] = str(script)
+                    records.append(data)
                 except json.JSONDecodeError:
                     pass
 
@@ -348,6 +383,7 @@ def run_hook(
             output_files=new_files,
             duration_ms=duration_ms,
             hook=str(script),
+            records=records,
         )
 
     except Exception as e:
@@ -360,6 +396,7 @@ def run_hook(
             output_files=[],
             duration_ms=duration_ms,
             hook=str(script),
+            records=[],
         )
@@ -1104,3 +1141,112 @@ def discover_plugin_templates() -> Dict[str, Dict[str, str]]:
 
     return templates
 
+
+# =============================================================================
+# Hook Result Processing Helpers
+# =============================================================================
+
+
+def find_binary_for_cmd(cmd: List[str], machine_id: str) -> Optional[str]:
+    """
+    Find InstalledBinary for a command, trying abspath first then name.
+    Only matches binaries on the current machine.
+
+    Args:
+        cmd: Command list (e.g., ['/usr/bin/wget', '-p', 'url'])
+        machine_id: Current machine ID
+
+    Returns:
+        Binary ID as string if found, None otherwise
+    """
+    if not cmd:
+        return None
+
+    from machine.models import InstalledBinary
+
+    bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
+
+    # Try matching by absolute path first
+    binary = InstalledBinary.objects.filter(
+        abspath=bin_path_or_name,
+        machine_id=machine_id
+    ).first()
+
+    if binary:
+        return str(binary.id)
+
+    # Fallback: match by binary name
+    bin_name = Path(bin_path_or_name).name
+    binary = InstalledBinary.objects.filter(
+        name=bin_name,
+        machine_id=machine_id
+    ).first()
+
+    return str(binary.id) if binary else None
+
+
+def create_model_record(record: Dict[str, Any]) -> Any:
+    """
+    Generic helper to create/update model instances from hook JSONL output.
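+
+    Example records (illustrative values only):
+        {'type': 'InstalledBinary', 'name': 'wget', 'abspath': '/usr/bin/wget', 'version': '1.21'}
+        {'type': 'Machine', '_method': 'update', 'key': 'EXAMPLE_KEY', 'value': 'example-value'}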
+
+    Args:
+        record: Dict with 'type' field and model data
+
+    Returns:
+        Created/updated model instance, or None if type unknown
+    """
+    from machine.models import InstalledBinary, Machine
+
+    record_type = record.pop('type', None)
+    if not record_type:
+        return None
+
+    # Remove plugin metadata (not model fields)
+    record.pop('plugin', None)
+    record.pop('plugin_hook', None)
+
+    if record_type == 'InstalledBinary':
+        # InstalledBinary requires machine FK
+        machine = Machine.current()
+        record.setdefault('machine', machine)
+
+        # Required fields check
+        name = record.get('name')
+        abspath = record.get('abspath')
+        if not name or not abspath:
+            return None
+
+        obj, created = InstalledBinary.objects.update_or_create(
+            machine=machine,
+            name=name,
+            defaults={
+                'abspath': abspath,
+                'version': record.get('version', ''),
+                'sha256': record.get('sha256', ''),
+                'binprovider': record.get('binprovider', 'env'),
+            }
+        )
+        return obj
+
+    elif record_type == 'Machine':
+        # Machine config update (special _method handling)
+        method = record.pop('_method', None)
+        if method == 'update':
+            key = record.get('key')
+            value = record.get('value')
+            if key and value:
+                machine = Machine.current()
+                if not machine.config:
+                    machine.config = {}
+                machine.config[key] = value
+                machine.save(update_fields=['config'])
+                return machine
+        return None
+
+    # Add more types as needed (Dependency, Snapshot, etc.)
+    else:
+        # Unknown type - log warning but don't fail
+        import sys
+        print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr)
+        return None
+