Implement hook architecture with JSONL output support

Phase 1: Database migration for new ArchiveResult fields - Add output_str (TextField) for human-readable summary - Add output_json (JSONField) for structured metadata - Add output_files (JSONField) for dict of {relative_path: {}} - Add output_size (BigIntegerField) for total bytes - Add output_mimetypes (CharField) for CSV of mimetypes - Add binary FK to InstalledBinary (optional) - Migrate existing 'output' field to new split fields Phase 3: Update run_hook() for JSONL parsing - Support new JSONL format (any line with {type: 'ModelName', ...}) - Maintain backwards compatibility with RESULT_JSON= format - Add plugin metadata to each parsed record - Detect background hooks with .bg. suffix in filename - Add find_binary_for_cmd() helper function - Add create_model_record() for processing side-effect records Phase 6: Update ArchiveResult.run() - Handle background hooks (return immediately when result is None) - Process 'records' from HookResult for side-effect models - Use new output fields (output_str, output_json, output_files, etc.) - Call create_model_record() for InstalledBinary, Machine updates Phase 7: Add background hook support - Add is_background_hook() method to ArchiveResult - Add check_background_completed() to check if process exited - Add finalize_background_hook() to collect results from completed hooks - Update SnapshotMachine.is_finished() to check/finalize background hooks - Update _populate_output_fields() to walk directory and populate stats Also updated references to old 'output' field in: - admin_archiveresults.py - statemachines.py - templatetags/core_tags.py
2026-04-05 15:27:53 +10:00 · 2025-12-27 08:38:49 +00:00
parent 35dd9acafe
commit 3d985fa8c8
7 changed files with 633 additions and 55 deletions
--- a/archivebox/core/admin_archiveresults.py
+++ b/archivebox/core/admin_archiveresults.py
@@ -47,7 +47,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
        end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-'
        # Truncate output for display
-        full_output = result.output or '-'
+        full_output = result.output_str or '-'
        output_display = full_output[:60]
        if len(full_output) > 60:
            output_display += '...'
@@ -55,8 +55,9 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
        # Get full command as tooltip
        cmd_str = ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd or '-')
-        # Build output link
+        # Build output link - use embed_path() which checks output_files first
-        output_link = f'/archive/{result.snapshot.timestamp}/{result.output}' if result.output and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/'
+        embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
        output_link = f'/archive/{result.snapshot.timestamp}/{embed_path}' if embed_path and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/'
        # Get version - try cmd_version field
        version = result.cmd_version if result.cmd_version else '-'
@@ -336,27 +337,29 @@ class ArchiveResultAdmin(BaseModelAdmin):
            ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
        )
-    def output_str(self, result):
+    def output_display(self, result):
-        # Determine output link path - use output if file exists, otherwise link to index
+        # Determine output link path - use embed_path() which checks output_files
-        output_path = result.output if (result.status == 'succeeded' and result.output) else 'index.html'
+        embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
        output_path = embed_path if (result.status == 'succeeded' and embed_path) else 'index.html'
        return format_html(
            '<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
            result.snapshot.timestamp,
            output_path,
-            result.output,
+            result.output_str,
        )
    def output_summary(self, result):
        snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1]
-        output_str = format_html(
+        output_html = format_html(
            '<pre style="display: inline-block">{}</pre><br/>',
-            result.output,
+            result.output_str,
        )
-        output_str += format_html('<a href="/archive/{}/index.html#all">See result files ...</a><br/><pre><code>', str(result.snapshot.timestamp))
+        output_html += format_html('<a href="/archive/{}/index.html#all">See result files ...</a><br/><pre><code>', str(result.snapshot.timestamp))
-        path_from_output_str = (snapshot_dir / (result.output or ''))
+        embed_path = result.embed_path() if hasattr(result, 'embed_path') else ''
-        output_str += format_html('<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>', str(snapshot_dir), str(result.output))
+        path_from_embed = (snapshot_dir / (embed_path or ''))
-        if os.access(path_from_output_str, os.R_OK):
+        output_html += format_html('<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>', str(snapshot_dir), str(embed_path))
-            root_dir = str(path_from_output_str)
+        if os.access(path_from_embed, os.R_OK):
            root_dir = str(path_from_embed)
        else:
            root_dir = str(snapshot_dir)
--- a/archivebox/core/migrations/0029_archiveresult_hook_fields.py
+++ b/archivebox/core/migrations/0029_archiveresult_hook_fields.py
@@ -0,0 +1,80 @@
 # Generated by Django for hook architecture support
 # Phase 1: Add new ArchiveResult fields for hook output
 from django.db import migrations, models
 import django.db.models.deletion
 class Migration(migrations.Migration):
    dependencies = [
        ('core', '0028_snapshot_fs_version'),
        ('machine', '0002_rename_custom_cmds_to_overrides'),
    ]
    operations = [
        # Add new output fields (keep old 'output' temporarily for migration)
        migrations.AddField(
            model_name='archiveresult',
            name='output_str',
            field=models.TextField(
                blank=True,
                default='',
                help_text='Human-readable output summary (e.g., "Downloaded 5 files")'
            ),
        ),
        migrations.AddField(
            model_name='archiveresult',
            name='output_json',
            field=models.JSONField(
                null=True,
                blank=True,
                default=None,
                help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields'
            ),
        ),
        migrations.AddField(
            model_name='archiveresult',
            name='output_files',
            field=models.JSONField(
                default=dict,
                help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata'
            ),
        ),
        migrations.AddField(
            model_name='archiveresult',
            name='output_size',
            field=models.BigIntegerField(
                default=0,
                help_text='Total recursive size in bytes of all output files'
            ),
        ),
        migrations.AddField(
            model_name='archiveresult',
            name='output_mimetypes',
            field=models.CharField(
                max_length=512,
                blank=True,
                default='',
                help_text='CSV of mimetypes sorted by size descending'
            ),
        ),
        # Add binary FK (optional)
        migrations.AddField(
            model_name='archiveresult',
            name='binary',
            field=models.ForeignKey(
                'machine.InstalledBinary',
                on_delete=models.SET_NULL,
                null=True,
                blank=True,
                related_name='archiveresults',
                help_text='Primary binary used by this hook (optional)'
            ),
        ),
    ]
--- a/archivebox/core/migrations/0030_migrate_output_field.py
+++ b/archivebox/core/migrations/0030_migrate_output_field.py
@@ -0,0 +1,64 @@
 # Generated by Django for hook architecture support
 # Phase 1: Migrate existing 'output' field to new split fields
 from django.db import migrations
 import json
 def migrate_output_field(apps, schema_editor):
    """
    Migrate existing 'output' field to new split fields.
    Logic:
    - If output contains JSON {...}, move to output_json
    - Otherwise, move to output_str
    """
    ArchiveResult = apps.get_model('core', 'ArchiveResult')
    for ar in ArchiveResult.objects.all().iterator():
        old_output = ar.output or ''
        # Case 1: JSON output
        if old_output.strip().startswith('{'):
            try:
                parsed = json.loads(old_output)
                ar.output_json = parsed
                ar.output_str = ''
            except json.JSONDecodeError:
                # Not valid JSON, treat as string
                ar.output_str = old_output
        # Case 2: File path or plain string
        else:
            ar.output_str = old_output
        ar.save(update_fields=['output_str', 'output_json'])
 def reverse_migrate(apps, schema_editor):
    """Reverse migration - copy output_str back to output."""
    ArchiveResult = apps.get_model('core', 'ArchiveResult')
    for ar in ArchiveResult.objects.all().iterator():
        if ar.output_json:
            ar.output = json.dumps(ar.output_json)
        else:
            ar.output = ar.output_str or ''
        ar.save(update_fields=['output'])
 class Migration(migrations.Migration):
    dependencies = [
        ('core', '0029_archiveresult_hook_fields'),
    ]
    operations = [
        migrations.RunPython(migrate_output_field, reverse_migrate),
        # Now safe to remove old 'output' field
        migrations.RemoveField(
            model_name='archiveresult',
            name='output',
        ),
    ]
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -36,7 +36,7 @@ from archivebox.base_models.models import (
 from workers.models import ModelWithStateMachine
 from workers.tasks import bg_archive_snapshot
 from crawls.models import Crawl
-from machine.models import NetworkInterface
+from machine.models import NetworkInterface, InstalledBinary
@@ -485,9 +485,13 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
        def calc_icons():
            if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
-                archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output}
+                archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and (r.output_files or r.output_str)}
            else:
-                archive_results = {r.extractor: r for r in self.archiveresult_set.filter(status="succeeded", output__isnull=False)}
+                # Filter for results that have either output_files or output_str
                from django.db.models import Q
                archive_results = {r.extractor: r for r in self.archiveresult_set.filter(
                    Q(status="succeeded") & (Q(output_files__isnull=False) | ~Q(output_str=''))
                )}
            path = self.archive_path
            canon = self.canonical_outputs()
@@ -499,7 +503,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
            for extractor in all_extractors:
                result = archive_results.get(extractor)
-                existing = result and result.status == 'succeeded' and result.output
+                existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
                icon = get_extractor_icon(extractor)
                output += format_html(
                    output_template,
@@ -825,17 +829,24 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
        # Scan each ArchiveResult's output directory for the best file
        snap_dir = Path(self.output_dir)
        for result in self.archiveresult_set.filter(status='succeeded'):
-            if not result.output:
+            if not result.output_files and not result.output_str:
                continue
            # Try to find the best output file for this extractor
            extractor_dir = snap_dir / result.extractor
            best_output = None
-            if result.output and (snap_dir / result.output).exists():
+            # Check output_files first (new field)
-                # Use the explicit output path if it exists
+            if result.output_files:
-                best_output = result.output
+                first_file = next(iter(result.output_files.keys()), None)
-            elif extractor_dir.exists():
+                if first_file and (extractor_dir / first_file).exists():
                    best_output = f'{result.extractor}/{first_file}'
            # Fallback to output_str if it looks like a path
            if not best_output and result.output_str and (snap_dir / result.output_str).exists():
                best_output = result.output_str
            if not best_output and extractor_dir.exists():
                # Intelligently find the best file in the extractor's directory
                best_output = find_best_output_in_dir(extractor_dir, result.extractor)
@@ -873,14 +884,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
    def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
        """Get the latest output that each archive method produced"""
        from archivebox.hooks import get_extractors
        from django.db.models import Q
        latest: Dict[str, Any] = {}
        for archive_method in get_extractors():
            results = self.archiveresult_set.filter(extractor=archive_method)
            if status is not None:
                results = results.filter(status=status)
-            results = results.filter(output__isnull=False).order_by('-start_ts')
+            # Filter for results with output_files or output_str
-            latest[archive_method] = results.first().output if results.exists() else None
+            results = results.filter(Q(output_files__isnull=False) | ~Q(output_str='')).order_by('-start_ts')
            result = results.first()
            # Return embed_path() for backwards compatibility
            latest[archive_method] = result.embed_path() if result else None
        return latest
    # =========================================================================
@@ -1021,7 +1036,23 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
    pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
    cmd = models.JSONField(default=None, null=True, blank=True)
    cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
-    output = models.CharField(max_length=1024, default=None, null=True, blank=True)
+
    # New output fields (replacing old 'output' field)
    output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary')
    output_json = models.JSONField(null=True, blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)')
    output_files = models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}')
    output_size = models.BigIntegerField(default=0, help_text='Total bytes of all output files')
    output_mimetypes = models.CharField(max_length=512, blank=True, default='', help_text='CSV of mimetypes sorted by size')
    # Binary FK (optional - set when hook reports cmd)
    binary = models.ForeignKey(
        'machine.InstalledBinary',
        on_delete=models.SET_NULL,
        null=True, blank=True,
        related_name='archiveresults',
        help_text='Primary binary used by this hook'
    )
    start_ts = models.DateTimeField(default=None, null=True, blank=True)
    end_ts = models.DateTimeField(default=None, null=True, blank=True)
@@ -1094,11 +1125,19 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
        """
        Get the relative path to the embeddable output file for this result.
-        Returns the output field if set and file exists, otherwise tries to
+        Returns the first file from output_files if set, otherwise tries to
        find a reasonable default based on the extractor type.
        """
-        if self.output:
+        # Check output_files dict for primary output
-            return self.output
+        if self.output_files:
            # Return first file from output_files (dict preserves insertion order)
            first_file = next(iter(self.output_files.keys()), None)
            if first_file:
                return f'{self.extractor}/{first_file}'
        # Fallback: check output_str if it looks like a file path
        if self.output_str and ('/' in self.output_str or '.' in self.output_str):
            return self.output_str
        # Try to find output file based on extractor's canonical output path
        canonical = self.snapshot.canonical_outputs()
@@ -1149,7 +1188,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
        if not hook:
            self.status = self.StatusChoices.FAILED
-            self.output = f'No hook found for: {self.extractor}'
+            self.output_str = f'No hook found for: {self.extractor}'
            self.retry_at = None
            self.save()
            return
@@ -1167,8 +1206,20 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
            url=self.snapshot.url,
            snapshot_id=str(self.snapshot.id),
        )
        # BACKGROUND HOOK - still running, return immediately
        if result is None:
            self.status = self.StatusChoices.STARTED
            self.start_ts = start_ts
            self.pwd = str(extractor_dir)
            self.save()
            return
        end_ts = timezone.now()
        # Get records from hook output (new JSONL format)
        records = result.get('records', [])
        # Clean up empty output directory if no files were created
        output_files = result.get('output_files', [])
        if not output_files and extractor_dir.exists():
@@ -1179,14 +1230,17 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
            except (OSError, RuntimeError):
                pass  # Directory not empty or can't be removed, that's fine
-        # Determine status from return code and JSON output
+        # Find the ArchiveResult record from hook output (if any)
        ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
        output_json = result.get('output_json') or {}
        json_status = output_json.get('status')
-        if json_status == 'skipped':
+        # Determine status from records, output_json, or return code
-            status = 'skipped'
+        if ar_records:
-        elif json_status == 'failed':
+            # Use status from first ArchiveResult record
-            status = 'failed'
+            hook_data = ar_records[0]
            status = hook_data.get('status', 'failed')
        elif output_json.get('status'):
            status = output_json['status']
        elif result['returncode'] == 0:
            status = 'succeeded'
        else:
@@ -1199,20 +1253,45 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
            'skipped': self.StatusChoices.SKIPPED,
        }
        self.status = status_map.get(status, self.StatusChoices.FAILED)
-        self.output = output_json.get('output') or result['stdout'][:1024] or result['stderr'][:1024] or None
+
        # Set output fields from records or output_json
        if ar_records:
            hook_data = ar_records[0]
            self.output_str = hook_data.get('output_str') or hook_data.get('output') or ''
            self.output_json = hook_data.get('output_json')
            # Set cmd from JSONL record
            if hook_data.get('cmd'):
                self.cmd = hook_data['cmd']
                self._set_binary_from_cmd(hook_data['cmd'])
            if hook_data.get('cmd_version'):
                self.cmd_version = hook_data['cmd_version'][:128]
        else:
            # Fallback to legacy output_json format
            self.output_str = output_json.get('output_str') or output_json.get('output') or result['stdout'][:1024] or result['stderr'][:1024] or ''
            self.output_json = output_json.get('output_json') if output_json.get('output_json') else None
            if output_json.get('cmd_version'):
                self.cmd_version = output_json['cmd_version'][:128]
            if output_json.get('cmd'):
                self.cmd = output_json['cmd']
                self._set_binary_from_cmd(output_json['cmd'])
        self.start_ts = start_ts
        self.end_ts = end_ts
        self.retry_at = None
        self.pwd = str(extractor_dir)
-        # Save cmd and cmd_version from extractor output
+        # Populate output_files, output_size, output_mimetypes from filesystem
-        if output_json.get('cmd_version'):
+        if extractor_dir.exists():
-            self.cmd_version = output_json['cmd_version'][:128]  # Max length from model
+            self._populate_output_fields(extractor_dir)
        if output_json.get('cmd'):
            self.cmd = output_json['cmd']
        self.save()
        # Process side-effect records (InstalledBinary, Machine config, etc.)
        from archivebox.hooks import create_model_record
        for record in records:
            if record.get('type') != 'ArchiveResult':
                create_model_record(record.copy())  # Copy to avoid mutating original
        # Queue any discovered URLs for crawling (parser extractors write urls.jsonl)
        self._queue_urls_for_crawl(extractor_dir)
@@ -1226,6 +1305,84 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
        if self.status == self.StatusChoices.SUCCEEDED:
            self.trigger_search_indexing()
    def _populate_output_fields(self, output_dir: Path) -> None:
        """
        Walk output directory and populate output_files, output_size, output_mimetypes.
        """
        import mimetypes
        from collections import defaultdict
        exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
        # Track mimetypes and sizes for aggregation
        mime_sizes = defaultdict(int)
        total_size = 0
        output_files = {}  # Dict keyed by relative path
        for file_path in output_dir.rglob('*'):
            # Skip non-files and infrastructure files
            if not file_path.is_file():
                continue
            if file_path.name in exclude_names:
                continue
            # Get file stats
            try:
                stat = file_path.stat()
                mime_type, _ = mimetypes.guess_type(str(file_path))
                mime_type = mime_type or 'application/octet-stream'
                # Track for ArchiveResult fields
                relative_path = str(file_path.relative_to(output_dir))
                output_files[relative_path] = {}  # Empty dict, extensible for future metadata
                mime_sizes[mime_type] += stat.st_size
                total_size += stat.st_size
            except (OSError, IOError):
                continue
        # Populate ArchiveResult fields
        self.output_files = output_files
        self.output_size = total_size
        # Build output_mimetypes CSV (sorted by size descending)
        sorted_mimes = sorted(mime_sizes.items(), key=lambda x: x[1], reverse=True)
        self.output_mimetypes = ','.join(mime for mime, _ in sorted_mimes)
    def _set_binary_from_cmd(self, cmd: list) -> None:
        """
        Find InstalledBinary for command and set binary FK.
        Tries matching by absolute path first, then by binary name.
        Only matches binaries on the current machine.
        """
        if not cmd:
            return
        from machine.models import Machine
        bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
        machine = Machine.current()
        # Try matching by absolute path first
        binary = InstalledBinary.objects.filter(
            abspath=bin_path_or_name,
            machine=machine
        ).first()
        if binary:
            self.binary = binary
            return
        # Fallback: match by binary name
        bin_name = Path(bin_path_or_name).name
        binary = InstalledBinary.objects.filter(
            name=bin_name,
            machine=machine
        ).first()
        if binary:
            self.binary = binary
    def _update_snapshot_title(self, extractor_dir: Path):
        """
        Update snapshot title from title extractor output.
@@ -1325,3 +1482,120 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
    def output_dir(self) -> Path:
        """Get the output directory for this extractor's results."""
        return Path(self.snapshot.output_dir) / self.extractor
    def is_background_hook(self) -> bool:
        """Check if this ArchiveResult is for a background hook."""
        extractor_dir = Path(self.pwd) if self.pwd else None
        if not extractor_dir:
            return False
        pid_file = extractor_dir / 'hook.pid'
        return pid_file.exists()
    def check_background_completed(self) -> bool:
        """
        Check if background hook process has exited.
        Returns:
            True if completed (process exited), False if still running
        """
        extractor_dir = Path(self.pwd) if self.pwd else None
        if not extractor_dir:
            return True  # No pwd = completed or failed to start
        pid_file = extractor_dir / 'hook.pid'
        if not pid_file.exists():
            return True  # No PID file = completed or failed to start
        try:
            pid = int(pid_file.read_text().strip())
            os.kill(pid, 0)  # Signal 0 = check if process exists
            return False  # Still running
        except (OSError, ValueError):
            return True  # Process exited or invalid PID
    def finalize_background_hook(self) -> None:
        """
        Collect final results from completed background hook.
        Same logic as run() but for background hooks that already started.
        """
        from archivebox.hooks import create_model_record
        extractor_dir = Path(self.pwd) if self.pwd else None
        if not extractor_dir or not extractor_dir.exists():
            self.status = self.StatusChoices.FAILED
            self.output_str = 'Background hook output directory not found'
            self.end_ts = timezone.now()
            self.retry_at = None
            self.save()
            return
        stdout_file = extractor_dir / 'stdout.log'
        stderr_file = extractor_dir / 'stderr.log'
        # Read logs
        stdout = stdout_file.read_text() if stdout_file.exists() else ''
        # Parse JSONL output
        records = []
        for line in stdout.splitlines():
            line = line.strip()
            if not line or not line.startswith('{'):
                continue
            try:
                data = json.loads(line)
                if 'type' in data:
                    records.append(data)
            except json.JSONDecodeError:
                continue
        # Find the ArchiveResult record
        ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
        if ar_records:
            hook_data = ar_records[0]
            # Apply hook's data
            status_str = hook_data.get('status', 'failed')
            status_map = {
                'succeeded': self.StatusChoices.SUCCEEDED,
                'failed': self.StatusChoices.FAILED,
                'skipped': self.StatusChoices.SKIPPED,
            }
            self.status = status_map.get(status_str, self.StatusChoices.FAILED)
            self.output_str = hook_data.get('output_str') or hook_data.get('output') or ''
            self.output_json = hook_data.get('output_json')
            # Determine binary FK from cmd
            if hook_data.get('cmd'):
                self.cmd = hook_data['cmd']
                self._set_binary_from_cmd(hook_data['cmd'])
            if hook_data.get('cmd_version'):
                self.cmd_version = hook_data['cmd_version'][:128]
        else:
            # No output = failed
            self.status = self.StatusChoices.FAILED
            self.output_str = 'Background hook did not output ArchiveResult'
        self.end_ts = timezone.now()
        self.retry_at = None
        # Populate output fields from filesystem
        if extractor_dir.exists():
            self._populate_output_fields(extractor_dir)
        self.save()
        # Create any side-effect records
        for record in records:
            if record.get('type') != 'ArchiveResult':
                create_model_record(record.copy())
        # Cleanup PID files and empty logs
        pid_file = extractor_dir / 'hook.pid'
        pid_file.unlink(missing_ok=True)
        if stdout_file.exists() and stdout_file.stat().st_size == 0:
            stdout_file.unlink()
        if stderr_file.exists() and stderr_file.stat().st_size == 0:
            stderr_file.unlink()
--- a/archivebox/core/statemachines.py
+++ b/archivebox/core/statemachines.py
@@ -59,11 +59,22 @@ class SnapshotMachine(StateMachine, strict_states=True):
        # if no archiveresults exist yet, it's not finished
        if not self.snapshot.archiveresult_set.exists():
            return False
-        
+
        # if archiveresults exist but are still pending, it's not finished
        if self.snapshot.pending_archiveresults().exists():
            return False
-        
+
        # Check for background hooks that are still running
        started_results = self.snapshot.archiveresult_set.filter(
            status=ArchiveResult.StatusChoices.STARTED
        )
        for result in started_results:
            if not result.check_background_completed():
                return False  # Still running
            # Completed - finalize it
            result.finalize_background_hook()
        # otherwise archiveresults exist and are all finished, so it's finished
        return True
@@ -184,10 +195,10 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
    def is_backoff(self) -> bool:
        """Check if we should backoff and retry later."""
-        # Backoff if status is still started (extractor didn't complete) and output is None
+        # Backoff if status is still started (extractor didn't complete) and output_str is empty
        return (
-            self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and 
+            self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
-            self.archiveresult.output is None
+            not self.archiveresult.output_str
        )
    def is_finished(self) -> bool:
--- a/archivebox/core/templatetags/core_tags.py
+++ b/archivebox/core/templatetags/core_tags.py
@@ -80,7 +80,7 @@ def extractor_thumbnail(context, result) -> str:
        return ''
    # Use embed_path() for the display path (includes canonical paths)
-    output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
+    output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')
    # Create a mini template and render it with context
    try:
@@ -109,7 +109,7 @@ def extractor_embed(context, result) -> str:
    if not template_str:
        return ''
-    output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
+    output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')
    try:
        tpl = template.Template(template_str)
@@ -137,7 +137,7 @@ def extractor_fullscreen(context, result) -> str:
    if not template_str:
        return ''
-    output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
+    output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')
    try:
        tpl = template.Template(template_str)
--- a/archivebox/hooks.py
+++ b/archivebox/hooks.py
@@ -68,6 +68,8 @@ class HookResult(TypedDict, total=False):
    output_files: List[str]
    duration_ms: int
    hook: str
    # New fields for JSONL parsing
    records: List[Dict[str, Any]]  # Parsed JSONL records with 'type' field
 def discover_hooks(event_name: str) -> List[Path]:
@@ -268,7 +270,9 @@ def run_hook(
    files_before = set(output_dir.rglob('*')) if output_dir.exists() else set()
    # Detect if this is a background hook (long-running daemon)
-    is_background = '__background' in script.stem
+    # New convention: .bg. suffix (e.g., on_Snapshot__21_consolelog.bg.js)
    # Old convention: __background in stem (for backwards compatibility)
    is_background = '.bg.' in script.name or '__background' in script.stem
    # Set up output files for ALL hooks (useful for debugging)
    stdout_file = output_dir / 'stdout.log'
@@ -322,13 +326,44 @@ def run_hook(
        # Exclude the log files themselves from new_files
        new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid')]
-        # Parse RESULT_JSON from stdout
+        # Parse JSONL output from stdout
        # Supports both new JSONL format (any line starting with { that has 'type')
        # and legacy RESULT_JSON= format for backwards compatibility
        output_json = None
        records = []
        plugin_name = script.parent.name  # Plugin directory name (e.g., 'wget')
        for line in stdout.splitlines():
-            if line.startswith('RESULT_JSON='):
+            line = line.strip()
            if not line:
                continue
            # New JSONL format: any line starting with { that has 'type' field
            if line.startswith('{'):
                try:
-                    output_json = json.loads(line[len('RESULT_JSON='):])
+                    data = json.loads(line)
-                    break
+                    if 'type' in data:
                        # Add plugin metadata to every record
                        data['plugin'] = plugin_name
                        data['plugin_hook'] = str(script)
                        records.append(data)
                        # For backwards compatibility, also set output_json for first ArchiveResult
                        if data.get('type') == 'ArchiveResult' and output_json is None:
                            output_json = data
                except json.JSONDecodeError:
                    pass
            # Legacy format: RESULT_JSON=...
            elif line.startswith('RESULT_JSON='):
                try:
                    data = json.loads(line[len('RESULT_JSON='):])
                    if output_json is None:
                        output_json = data
                    # Convert legacy format to new format
                    data['type'] = 'ArchiveResult'
                    data['plugin'] = plugin_name
                    data['plugin_hook'] = str(script)
                    records.append(data)
                except json.JSONDecodeError:
                    pass
@@ -348,6 +383,7 @@ def run_hook(
            output_files=new_files,
            duration_ms=duration_ms,
            hook=str(script),
            records=records,
        )
    except Exception as e:
@@ -360,6 +396,7 @@ def run_hook(
            output_files=[],
            duration_ms=duration_ms,
            hook=str(script),
            records=[],
        )
@@ -1104,3 +1141,112 @@ def discover_plugin_templates() -> Dict[str, Dict[str, str]]:
    return templates
 # =============================================================================
 # Hook Result Processing Helpers
 # =============================================================================
 def find_binary_for_cmd(cmd: List[str], machine_id: str) -> Optional[str]:
    """
    Find InstalledBinary for a command, trying abspath first then name.
    Only matches binaries on the current machine.
    Args:
        cmd: Command list (e.g., ['/usr/bin/wget', '-p', 'url'])
        machine_id: Current machine ID
    Returns:
        Binary ID as string if found, None otherwise
    """
    if not cmd:
        return None
    from machine.models import InstalledBinary
    bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
    # Try matching by absolute path first
    binary = InstalledBinary.objects.filter(
        abspath=bin_path_or_name,
        machine_id=machine_id
    ).first()
    if binary:
        return str(binary.id)
    # Fallback: match by binary name
    bin_name = Path(bin_path_or_name).name
    binary = InstalledBinary.objects.filter(
        name=bin_name,
        machine_id=machine_id
    ).first()
    return str(binary.id) if binary else None
 def create_model_record(record: Dict[str, Any]) -> Any:
    """
    Generic helper to create/update model instances from hook JSONL output.
    Args:
        record: Dict with 'type' field and model data
    Returns:
        Created/updated model instance, or None if type unknown
    """
    from machine.models import InstalledBinary, Machine
    record_type = record.pop('type', None)
    if not record_type:
        return None
    # Remove plugin metadata (not model fields)
    record.pop('plugin', None)
    record.pop('plugin_hook', None)
    if record_type == 'InstalledBinary':
        # InstalledBinary requires machine FK
        machine = Machine.current()
        record.setdefault('machine', machine)
        # Required fields check
        name = record.get('name')
        abspath = record.get('abspath')
        if not name or not abspath:
            return None
        obj, created = InstalledBinary.objects.update_or_create(
            machine=machine,
            name=name,
            defaults={
                'abspath': abspath,
                'version': record.get('version', ''),
                'sha256': record.get('sha256', ''),
                'binprovider': record.get('binprovider', 'env'),
            }
        )
        return obj
    elif record_type == 'Machine':
        # Machine config update (special _method handling)
        method = record.pop('_method', None)
        if method == 'update':
            key = record.get('key')
            value = record.get('value')
            if key and value:
                machine = Machine.current()
                if not machine.config:
                    machine.config = {}
                machine.config[key] = value
                machine.save(update_fields=['config'])
                return machine
        return None
    # Add more types as needed (Dependency, Snapshot, etc.)
    else:
        # Unknown type - log warning but don't fail
        import sys
        print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr)
        return None