From 3d985fa8c88c46e9c16a112f0dba2e8bc21acaac Mon Sep 17 00:00:00 2001
From: Claude
Date: Sat, 27 Dec 2025 08:38:49 +0000
Subject: [PATCH] Implement hook architecture with JSONL output support

Phase 1: Database migration for new ArchiveResult fields
- Add output_str (TextField) for human-readable summary
- Add output_json (JSONField) for structured metadata
- Add output_files (JSONField) for dict of {relative_path: {}}
- Add output_size (BigIntegerField) for total bytes
- Add output_mimetypes (CharField) for CSV of mimetypes
- Add binary FK to InstalledBinary (optional)
- Migrate existing 'output' field to new split fields

Phase 3: Update run_hook() for JSONL parsing
- Support new JSONL format (any line with {type: 'ModelName', ...})
- Maintain backwards compatibility with RESULT_JSON= format
- Add plugin metadata to each parsed record
- Detect background hooks with .bg. suffix in filename
- Add find_binary_for_cmd() helper function
- Add create_model_record() for processing side-effect records

Phase 6: Update ArchiveResult.run()
- Handle background hooks (return immediately when result is None)
- Process 'records' from HookResult for side-effect models
- Use new output fields (output_str, output_json, output_files, etc.)
- Call create_model_record() for InstalledBinary, Machine updates

Phase 7: Add background hook support
- Add is_background_hook() method to ArchiveResult
- Add check_background_completed() to check if process exited
- Add finalize_background_hook() to collect results from completed hooks
- Update SnapshotMachine.is_finished() to check/finalize background hooks
- Update _populate_output_fields() to walk directory and populate stats

Also updated references to old 'output' field in:
- admin_archiveresults.py
- statemachines.py
- templatetags/core_tags.py
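
For reference, a hook emits one JSON object per stdout line. An illustrative
pair of records (the values are examples only, but the field names are the
ones run_hook() and create_model_record() consume):

    {"type": "ArchiveResult", "status": "succeeded", "output_str": "Downloaded 5 files", "cmd": ["/usr/bin/wget", "-p", "https://example.com"], "cmd_version": "1.21"}
    {"type": "InstalledBinary", "name": "wget", "abspath": "/usr/bin/wget", "version": "1.21", "binprovider": "env"}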
---
 archivebox/core/admin_archiveresults.py       |  31 +-
 .../0029_archiveresult_hook_fields.py         |  80 +++++
 .../migrations/0030_migrate_output_field.py   |  64 ++++
 archivebox/core/models.py                     | 330 ++++++++++++++++--
 archivebox/core/statemachines.py              |  21 +-
 archivebox/core/templatetags/core_tags.py     |   6 +-
 archivebox/hooks.py                           | 156 ++++++++-
 7 files changed, 633 insertions(+), 55 deletions(-)
 create mode 100644 archivebox/core/migrations/0029_archiveresult_hook_fields.py
 create mode 100644 archivebox/core/migrations/0030_migrate_output_field.py

diff --git a/archivebox/core/admin_archiveresults.py b/archivebox/core/admin_archiveresults.py
index f525b84f..18b5fadc 100644
--- a/archivebox/core/admin_archiveresults.py
+++ b/archivebox/core/admin_archiveresults.py
@@ -47,7 +47,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
         end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-'
 
         # Truncate output for display
-        full_output = result.output or '-'
+        full_output = result.output_str or '-'
         output_display = full_output[:60]
         if len(full_output) > 60:
             output_display += '...'
@@ -55,8 +55,9 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
         # Get full command as tooltip
         cmd_str = ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd or '-')
 
-        # Build output link
-        output_link = f'/archive/{result.snapshot.timestamp}/{result.output}' if result.output and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/'
+        # Build output link - use embed_path() which checks output_files first
+        embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
+        output_link = f'/archive/{result.snapshot.timestamp}/{embed_path}' if embed_path and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/'
 
         # Get version - try cmd_version field
         version = result.cmd_version if result.cmd_version else '-'
@@ -336,27 +337,29 @@ class ArchiveResultAdmin(BaseModelAdmin):
             ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
         )
 
-    def output_str(self, result):
-        # Determine output link path - use output if file exists, otherwise link to index
-        output_path = result.output if (result.status == 'succeeded' and result.output) else 'index.html'
+    def output_display(self, result):
+        # Determine output link path - use embed_path() which checks output_files
+        embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
+        output_path = embed_path if (result.status == 'succeeded' and embed_path) else 'index.html'
         return format_html(
             '↗️<a href="/archive/{}/{}">{}</a>',
             result.snapshot.timestamp,
             output_path,
-            result.output,
+            result.output_str,
         )
 
    def output_summary(self, result):
         snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1]
-        output_str = format_html(
+        output_html = format_html(
             '<pre>{}</pre><br/>',
-            result.output,
+            result.output_str,
         )
-        output_str += format_html('See result files ...<br/>', str(result.snapshot.timestamp))
-        path_from_output_str = (snapshot_dir / (result.output or ''))
-        output_str += format_html('{}/{}<br/><br/>', str(snapshot_dir), str(result.output))
-        if os.access(path_from_output_str, os.R_OK):
-            root_dir = str(path_from_output_str)
+        output_html += format_html('See result files ...<br/>', str(result.snapshot.timestamp))
+        embed_path = result.embed_path() if hasattr(result, 'embed_path') else ''
+        path_from_embed = (snapshot_dir / (embed_path or ''))
+        output_html += format_html('{}/{}<br/><br/>', str(snapshot_dir), str(embed_path))
+        if os.access(path_from_embed, os.R_OK):
+            root_dir = str(path_from_embed)
         else:
             root_dir = str(snapshot_dir)
 
diff --git a/archivebox/core/migrations/0029_archiveresult_hook_fields.py b/archivebox/core/migrations/0029_archiveresult_hook_fields.py
new file mode 100644
index 00000000..0ff1f0c2
--- /dev/null
+++ b/archivebox/core/migrations/0029_archiveresult_hook_fields.py
@@ -0,0 +1,80 @@
+# Generated by Django for hook architecture support
+# Phase 1: Add new ArchiveResult fields for hook output
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0028_snapshot_fs_version'),
+        ('machine', '0002_rename_custom_cmds_to_overrides'),
+    ]
+
+    operations = [
+        # Add new output fields (keep old 'output' temporarily for migration)
+        migrations.AddField(
+            model_name='archiveresult',
+            name='output_str',
+            field=models.TextField(
+                blank=True,
+                default='',
+                help_text='Human-readable output summary (e.g., "Downloaded 5 files")'
+            ),
+        ),
+        migrations.AddField(
+            model_name='archiveresult',
+            name='output_json',
+            field=models.JSONField(
+                null=True,
+                blank=True,
+                default=None,
+                help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields'
+            ),
+        ),
+        migrations.AddField(
+            model_name='archiveresult',
+            name='output_files',
+            field=models.JSONField(
+                default=dict,
+                help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata'
+            ),
+        ),
+        migrations.AddField(
+            model_name='archiveresult',
+            name='output_size',
+            field=models.BigIntegerField(
+                default=0,
+                help_text='Total recursive size in bytes of all output files'
+            ),
+        ),
+        migrations.AddField(
+            model_name='archiveresult',
+            name='output_mimetypes',
+            field=models.CharField(
+                max_length=512,
+                blank=True,
+                default='',
+                help_text='CSV of mimetypes sorted by size descending'
+            ),
+        ),
+        # Add binary FK (optional)
+        migrations.AddField(
+            model_name='archiveresult',
+            name='binary',
+            field=models.ForeignKey(
+                'machine.InstalledBinary',
+                on_delete=django.db.models.deletion.SET_NULL,
+                null=True,
+                blank=True,
+                related_name='archiveresults',
+                help_text='Primary binary used by this hook (optional)'
+            ),
+        ),
+    ]
diff --git a/archivebox/core/migrations/0030_migrate_output_field.py b/archivebox/core/migrations/0030_migrate_output_field.py
new file mode 100644
index 00000000..5dafb7e8
--- /dev/null
+++ b/archivebox/core/migrations/0030_migrate_output_field.py
@@ -0,0 +1,64 @@
+# Generated by Django for hook architecture support
+# Phase 1: Migrate existing 'output' field to new split fields
+
+from django.db import migrations
+import json
+
+
+def migrate_output_field(apps, schema_editor):
+    """
+    Migrate existing 'output' field to new split fields.
+
+    Logic:
+    - If output contains JSON {...}, move to output_json
+    - Otherwise, move to output_str
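+
+    Example (illustrative values):
+    - output = '{"status": "succeeded"}'  ->  output_json = {'status': 'succeeded'}
+    - output = 'wget/example.com/index.html'  ->  output_str = 'wget/example.com/index.html'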
+    """
+    ArchiveResult = apps.get_model('core', 'ArchiveResult')
+
+    for ar in ArchiveResult.objects.all().iterator():
+        old_output = ar.output or ''
+
+        # Case 1: JSON output
+        if old_output.strip().startswith('{'):
+            try:
+                parsed = json.loads(old_output)
+                ar.output_json = parsed
+                ar.output_str = ''
+            except json.JSONDecodeError:
+                # Not valid JSON, treat as string
+                ar.output_str = old_output
+
+        # Case 2: File path or plain string
+        else:
+            ar.output_str = old_output
+
+        ar.save(update_fields=['output_str', 'output_json'])
+
+
+def reverse_migrate(apps, schema_editor):
+    """Reverse migration - copy output_str back to output."""
+    ArchiveResult = apps.get_model('core', 'ArchiveResult')
+
+    for ar in ArchiveResult.objects.all().iterator():
+        if ar.output_json:
+            ar.output = json.dumps(ar.output_json)
+        else:
+            ar.output = ar.output_str or ''
+        ar.save(update_fields=['output'])
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0029_archiveresult_hook_fields'),
+    ]
+
+    operations = [
+        migrations.RunPython(migrate_output_field, reverse_migrate),
+
+        # Now safe to remove old 'output' field
+        migrations.RemoveField(
+            model_name='archiveresult',
+            name='output',
+        ),
+    ]
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 6bac5679..1e5dcc0f 100755
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -36,7 +36,7 @@ from archivebox.base_models.models import (
 from workers.models import ModelWithStateMachine
 from workers.tasks import bg_archive_snapshot
 from crawls.models import Crawl
-from machine.models import NetworkInterface
+from machine.models import NetworkInterface, InstalledBinary
@@ -485,9 +485,13 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         def calc_icons():
             if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
-                archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output}
+                archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and (r.output_files or r.output_str)}
             else:
-                archive_results = {r.extractor: r for r in self.archiveresult_set.filter(status="succeeded", output__isnull=False)}
+                # Filter for results that have either output_files or output_str
+                from django.db.models import Q
+                archive_results = {r.extractor: r for r in self.archiveresult_set.filter(
+                    Q(status="succeeded") & (Q(output_files__isnull=False) | ~Q(output_str=''))
+                )}
 
             path = self.archive_path
             canon = self.canonical_outputs()
@@ -499,7 +503,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
             for extractor in all_extractors:
                 result = archive_results.get(extractor)
-                existing = result and result.status == 'succeeded' and result.output
+                existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
                 icon = get_extractor_icon(extractor)
                 output += format_html(
                     output_template,
@@ -825,17 +829,24 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         # Scan each ArchiveResult's output directory for the best file
         snap_dir = Path(self.output_dir)
         for result in self.archiveresult_set.filter(status='succeeded'):
-            if not result.output:
+            if not result.output_files and not result.output_str:
                 continue
 
             # Try to find the best output file for this extractor
             extractor_dir = snap_dir / result.extractor
             best_output = None
 
-            if result.output and (snap_dir / result.output).exists():
-                # Use the explicit output path if it exists
-                best_output = result.output
-            elif extractor_dir.exists():
+            # Check output_files first (new field)
+            if result.output_files:
+                first_file = next(iter(result.output_files.keys()), None)
+                if first_file and (extractor_dir / first_file).exists():
+                    best_output = f'{result.extractor}/{first_file}'
+
+            # Fallback to output_str if it looks like a path
+            if not best_output and result.output_str and (snap_dir / result.output_str).exists():
+                best_output = result.output_str
+
+            if not best_output and extractor_dir.exists():
                 # Intelligently find the best file in the extractor's directory
                 best_output = find_best_output_in_dir(extractor_dir, result.extractor)
@@ -873,14 +884,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
     def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
         """Get the latest output that each archive method produced"""
         from archivebox.hooks import get_extractors
+        from django.db.models import Q
 
         latest: Dict[str, Any] = {}
         for archive_method in get_extractors():
             results = self.archiveresult_set.filter(extractor=archive_method)
             if status is not None:
                 results = results.filter(status=status)
-            results = results.filter(output__isnull=False).order_by('-start_ts')
-            latest[archive_method] = results.first().output if results.exists() else None
+            # Filter for results with output_files or output_str
+            results = results.filter(Q(output_files__isnull=False) | ~Q(output_str='')).order_by('-start_ts')
+            result = results.first()
+            # Return embed_path() for backwards compatibility
+            latest[archive_method] = result.embed_path() if result else None
         return latest
 
     # =========================================================================
@@ -1021,7 +1036,23 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
     pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
     cmd = models.JSONField(default=None, null=True, blank=True)
     cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
-    output = models.CharField(max_length=1024, default=None, null=True, blank=True)
+
+    # New output fields (replacing old 'output' field)
+    output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary')
+    output_json = models.JSONField(null=True, blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)')
+    output_files = models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}')
+    output_size = models.BigIntegerField(default=0, help_text='Total bytes of all output files')
+    output_mimetypes = models.CharField(max_length=512, blank=True, default='', help_text='CSV of mimetypes sorted by size')
+
+    # Binary FK (optional - set when hook reports cmd)
+    binary = models.ForeignKey(
+        'machine.InstalledBinary',
+        on_delete=models.SET_NULL,
+        null=True, blank=True,
+        related_name='archiveresults',
+        help_text='Primary binary used by this hook'
+    )
 
     start_ts = models.DateTimeField(default=None, null=True, blank=True)
     end_ts = models.DateTimeField(default=None, null=True, blank=True)
@@ -1094,11 +1125,19 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
         """
         Get the relative path to the embeddable output file for this result.
 
-        Returns the output field if set and file exists, otherwise tries to
+        Returns the first file from output_files if set, otherwise tries to
         find a reasonable default based on the extractor type.
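+
+        Example (illustrative): with extractor='wget' and
+        output_files={'index.html': {}}, this returns 'wget/index.html'.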
         """
-        if self.output:
-            return self.output
+        # Check output_files dict for primary output
+        if self.output_files:
+            # Return first file from output_files (dict preserves insertion order)
+            first_file = next(iter(self.output_files.keys()), None)
+            if first_file:
+                return f'{self.extractor}/{first_file}'
+
+        # Fallback: check output_str if it looks like a file path
+        if self.output_str and ('/' in self.output_str or '.' in self.output_str):
+            return self.output_str
 
         # Try to find output file based on extractor's canonical output path
         canonical = self.snapshot.canonical_outputs()
@@ -1149,7 +1188,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
         if not hook:
             self.status = self.StatusChoices.FAILED
-            self.output = f'No hook found for: {self.extractor}'
+            self.output_str = f'No hook found for: {self.extractor}'
             self.retry_at = None
             self.save()
             return
@@ -1167,8 +1206,20 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
             url=self.snapshot.url,
             snapshot_id=str(self.snapshot.id),
         )
+
+        # BACKGROUND HOOK - still running, return immediately
+        if result is None:
+            self.status = self.StatusChoices.STARTED
+            self.start_ts = start_ts
+            self.pwd = str(extractor_dir)
+            self.save()
+            return
+
         end_ts = timezone.now()
 
+        # Get records from hook output (new JSONL format)
+        records = result.get('records', [])
+
         # Clean up empty output directory if no files were created
         output_files = result.get('output_files', [])
         if not output_files and extractor_dir.exists():
@@ -1179,14 +1230,17 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
             except (OSError, RuntimeError):
                 pass  # Directory not empty or can't be removed, that's fine
 
-        # Determine status from return code and JSON output
+        # Find the ArchiveResult record from hook output (if any)
+        ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
         output_json = result.get('output_json') or {}
-        json_status = output_json.get('status')
-        if json_status == 'skipped':
-            status = 'skipped'
-        elif json_status == 'failed':
-            status = 'failed'
+
+        # Determine status from records, output_json, or return code
+        if ar_records:
+            # Use status from first ArchiveResult record
+            hook_data = ar_records[0]
+            status = hook_data.get('status', 'failed')
+        elif output_json.get('status'):
+            status = output_json['status']
         elif result['returncode'] == 0:
             status = 'succeeded'
         else:
             status = 'failed'
@@ -1199,20 +1253,45 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
             'skipped': self.StatusChoices.SKIPPED,
         }
         self.status = status_map.get(status, self.StatusChoices.FAILED)
-        self.output = output_json.get('output') or result['stdout'][:1024] or result['stderr'][:1024] or None
+
+        # Set output fields from records or output_json
+        if ar_records:
+            hook_data = ar_records[0]
+            self.output_str = hook_data.get('output_str') or hook_data.get('output') or ''
+            self.output_json = hook_data.get('output_json')
+            # Set cmd from JSONL record
+            if hook_data.get('cmd'):
+                self.cmd = hook_data['cmd']
+                self._set_binary_from_cmd(hook_data['cmd'])
+            if hook_data.get('cmd_version'):
+                self.cmd_version = hook_data['cmd_version'][:128]
+        else:
+            # Fallback to legacy output_json format
+            self.output_str = output_json.get('output_str') or output_json.get('output') or result['stdout'][:1024] or result['stderr'][:1024] or ''
+            self.output_json = output_json.get('output_json') if output_json.get('output_json') else None
+            if output_json.get('cmd_version'):
+                self.cmd_version = output_json['cmd_version'][:128]
+            if output_json.get('cmd'):
+                self.cmd = output_json['cmd']
+                self._set_binary_from_cmd(output_json['cmd'])
+
         self.start_ts = start_ts
         self.end_ts = end_ts
         self.retry_at = None
         self.pwd = str(extractor_dir)
 
-        # Save cmd and cmd_version from extractor output
-        if output_json.get('cmd_version'):
-            self.cmd_version = output_json['cmd_version'][:128]  # Max length from model
-        if output_json.get('cmd'):
-            self.cmd = output_json['cmd']
+        # Populate output_files, output_size, output_mimetypes from filesystem
+        if extractor_dir.exists():
+            self._populate_output_fields(extractor_dir)
 
         self.save()
 
+        # Process side-effect records (InstalledBinary, Machine config, etc.)
+        from archivebox.hooks import create_model_record
+        for record in records:
+            if record.get('type') != 'ArchiveResult':
+                create_model_record(record.copy())  # Copy to avoid mutating original
+
         # Queue any discovered URLs for crawling (parser extractors write urls.jsonl)
         self._queue_urls_for_crawl(extractor_dir)
@@ -1226,6 +1305,84 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
         if self.status == self.StatusChoices.SUCCEEDED:
             self.trigger_search_indexing()
 
+    def _populate_output_fields(self, output_dir: Path) -> None:
+        """
+        Walk output directory and populate output_files, output_size, output_mimetypes.
+        """
+        import mimetypes
+        from collections import defaultdict
+
+        exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
+
+        # Track mimetypes and sizes for aggregation
+        mime_sizes = defaultdict(int)
+        total_size = 0
+        output_files = {}  # Dict keyed by relative path
+
+        for file_path in output_dir.rglob('*'):
+            # Skip non-files and infrastructure files
+            if not file_path.is_file():
+                continue
+            if file_path.name in exclude_names:
+                continue
+
+            # Get file stats
+            try:
+                stat = file_path.stat()
+                mime_type, _ = mimetypes.guess_type(str(file_path))
+                mime_type = mime_type or 'application/octet-stream'
+
+                # Track for ArchiveResult fields
+                relative_path = str(file_path.relative_to(output_dir))
+                output_files[relative_path] = {}  # Empty dict, extensible for future metadata
+                mime_sizes[mime_type] += stat.st_size
+                total_size += stat.st_size
+            except (OSError, IOError):
+                continue
+
+        # Populate ArchiveResult fields
+        self.output_files = output_files
+        self.output_size = total_size
+
+        # Build output_mimetypes CSV (sorted by size descending)
+        sorted_mimes = sorted(mime_sizes.items(), key=lambda x: x[1], reverse=True)
+        self.output_mimetypes = ','.join(mime for mime, _ in sorted_mimes)
+
+    def _set_binary_from_cmd(self, cmd: list) -> None:
+        """
+        Find InstalledBinary for command and set binary FK.
+
+        Tries matching by absolute path first, then by binary name.
+        Only matches binaries on the current machine.
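+
+        Example (illustrative): cmd=['/usr/bin/wget', '-p', 'https://example.com']
+        matches InstalledBinary by abspath='/usr/bin/wget' first, then by name='wget'.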
+ """ + if not cmd: + return + + from machine.models import Machine + + bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd + machine = Machine.current() + + # Try matching by absolute path first + binary = InstalledBinary.objects.filter( + abspath=bin_path_or_name, + machine=machine + ).first() + + if binary: + self.binary = binary + return + + # Fallback: match by binary name + bin_name = Path(bin_path_or_name).name + binary = InstalledBinary.objects.filter( + name=bin_name, + machine=machine + ).first() + + if binary: + self.binary = binary + def _update_snapshot_title(self, extractor_dir: Path): """ Update snapshot title from title extractor output. @@ -1325,3 +1482,120 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi def output_dir(self) -> Path: """Get the output directory for this extractor's results.""" return Path(self.snapshot.output_dir) / self.extractor + + def is_background_hook(self) -> bool: + """Check if this ArchiveResult is for a background hook.""" + extractor_dir = Path(self.pwd) if self.pwd else None + if not extractor_dir: + return False + pid_file = extractor_dir / 'hook.pid' + return pid_file.exists() + + def check_background_completed(self) -> bool: + """ + Check if background hook process has exited. + + Returns: + True if completed (process exited), False if still running + """ + extractor_dir = Path(self.pwd) if self.pwd else None + if not extractor_dir: + return True # No pwd = completed or failed to start + + pid_file = extractor_dir / 'hook.pid' + if not pid_file.exists(): + return True # No PID file = completed or failed to start + + try: + pid = int(pid_file.read_text().strip()) + os.kill(pid, 0) # Signal 0 = check if process exists + return False # Still running + except (OSError, ValueError): + return True # Process exited or invalid PID + + def finalize_background_hook(self) -> None: + """ + Collect final results from completed background hook. + + Same logic as run() but for background hooks that already started. 
+ """ + from archivebox.hooks import create_model_record + + extractor_dir = Path(self.pwd) if self.pwd else None + if not extractor_dir or not extractor_dir.exists(): + self.status = self.StatusChoices.FAILED + self.output_str = 'Background hook output directory not found' + self.end_ts = timezone.now() + self.retry_at = None + self.save() + return + + stdout_file = extractor_dir / 'stdout.log' + stderr_file = extractor_dir / 'stderr.log' + + # Read logs + stdout = stdout_file.read_text() if stdout_file.exists() else '' + + # Parse JSONL output + records = [] + for line in stdout.splitlines(): + line = line.strip() + if not line or not line.startswith('{'): + continue + try: + data = json.loads(line) + if 'type' in data: + records.append(data) + except json.JSONDecodeError: + continue + + # Find the ArchiveResult record + ar_records = [r for r in records if r.get('type') == 'ArchiveResult'] + + if ar_records: + hook_data = ar_records[0] + + # Apply hook's data + status_str = hook_data.get('status', 'failed') + status_map = { + 'succeeded': self.StatusChoices.SUCCEEDED, + 'failed': self.StatusChoices.FAILED, + 'skipped': self.StatusChoices.SKIPPED, + } + self.status = status_map.get(status_str, self.StatusChoices.FAILED) + + self.output_str = hook_data.get('output_str') or hook_data.get('output') or '' + self.output_json = hook_data.get('output_json') + + # Determine binary FK from cmd + if hook_data.get('cmd'): + self.cmd = hook_data['cmd'] + self._set_binary_from_cmd(hook_data['cmd']) + if hook_data.get('cmd_version'): + self.cmd_version = hook_data['cmd_version'][:128] + else: + # No output = failed + self.status = self.StatusChoices.FAILED + self.output_str = 'Background hook did not output ArchiveResult' + + self.end_ts = timezone.now() + self.retry_at = None + + # Populate output fields from filesystem + if extractor_dir.exists(): + self._populate_output_fields(extractor_dir) + + self.save() + + # Create any side-effect records + for record in records: + if record.get('type') != 'ArchiveResult': + create_model_record(record.copy()) + + # Cleanup PID files and empty logs + pid_file = extractor_dir / 'hook.pid' + pid_file.unlink(missing_ok=True) + if stdout_file.exists() and stdout_file.stat().st_size == 0: + stdout_file.unlink() + if stderr_file.exists() and stderr_file.stat().st_size == 0: + stderr_file.unlink() diff --git a/archivebox/core/statemachines.py b/archivebox/core/statemachines.py index 610f6fe0..9f277a5c 100644 --- a/archivebox/core/statemachines.py +++ b/archivebox/core/statemachines.py @@ -59,11 +59,22 @@ class SnapshotMachine(StateMachine, strict_states=True): # if no archiveresults exist yet, it's not finished if not self.snapshot.archiveresult_set.exists(): return False - + # if archiveresults exist but are still pending, it's not finished if self.snapshot.pending_archiveresults().exists(): return False - + + # Check for background hooks that are still running + started_results = self.snapshot.archiveresult_set.filter( + status=ArchiveResult.StatusChoices.STARTED + ) + for result in started_results: + if not result.check_background_completed(): + return False # Still running + + # Completed - finalize it + result.finalize_background_hook() + # otherwise archiveresults exist and are all finished, so it's finished return True @@ -184,10 +195,10 @@ class ArchiveResultMachine(StateMachine, strict_states=True): def is_backoff(self) -> bool: """Check if we should backoff and retry later.""" - # Backoff if status is still started (extractor didn't complete) and output is 
None + # Backoff if status is still started (extractor didn't complete) and output_str is empty return ( - self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and - self.archiveresult.output is None + self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and + not self.archiveresult.output_str ) def is_finished(self) -> bool: diff --git a/archivebox/core/templatetags/core_tags.py b/archivebox/core/templatetags/core_tags.py index b2c126cd..33a620c0 100644 --- a/archivebox/core/templatetags/core_tags.py +++ b/archivebox/core/templatetags/core_tags.py @@ -80,7 +80,7 @@ def extractor_thumbnail(context, result) -> str: return '' # Use embed_path() for the display path (includes canonical paths) - output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '') + output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '') # Create a mini template and render it with context try: @@ -109,7 +109,7 @@ def extractor_embed(context, result) -> str: if not template_str: return '' - output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '') + output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '') try: tpl = template.Template(template_str) @@ -137,7 +137,7 @@ def extractor_fullscreen(context, result) -> str: if not template_str: return '' - output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '') + output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '') try: tpl = template.Template(template_str) diff --git a/archivebox/hooks.py b/archivebox/hooks.py index 7bbbe66e..7ac15d65 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -68,6 +68,8 @@ class HookResult(TypedDict, total=False): output_files: List[str] duration_ms: int hook: str + # New fields for JSONL parsing + records: List[Dict[str, Any]] # Parsed JSONL records with 'type' field def discover_hooks(event_name: str) -> List[Path]: @@ -268,7 +270,9 @@ def run_hook( files_before = set(output_dir.rglob('*')) if output_dir.exists() else set() # Detect if this is a background hook (long-running daemon) - is_background = '__background' in script.stem + # New convention: .bg. suffix (e.g., on_Snapshot__21_consolelog.bg.js) + # Old convention: __background in stem (for backwards compatibility) + is_background = '.bg.' 
 
     # Set up output files for ALL hooks (useful for debugging)
     stdout_file = output_dir / 'stdout.log'
@@ -322,13 +326,44 @@ def run_hook(
         # Exclude the log files themselves from new_files
         new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid')]
 
-        # Parse RESULT_JSON from stdout
+        # Parse JSONL output from stdout
+        # Supports both new JSONL format (any line starting with { that has 'type')
+        # and legacy RESULT_JSON= format for backwards compatibility
         output_json = None
+        records = []
+        plugin_name = script.parent.name  # Plugin directory name (e.g., 'wget')
+
         for line in stdout.splitlines():
-            if line.startswith('RESULT_JSON='):
+            line = line.strip()
+            if not line:
+                continue
+
+            # New JSONL format: any line starting with { that has 'type' field
+            if line.startswith('{'):
                 try:
-                    output_json = json.loads(line[len('RESULT_JSON='):])
-                    break
+                    data = json.loads(line)
+                    if 'type' in data:
+                        # Add plugin metadata to every record
+                        data['plugin'] = plugin_name
+                        data['plugin_hook'] = str(script)
+                        records.append(data)
+                        # For backwards compatibility, also set output_json for first ArchiveResult
+                        if data.get('type') == 'ArchiveResult' and output_json is None:
+                            output_json = data
+                except json.JSONDecodeError:
+                    pass
+
+            # Legacy format: RESULT_JSON=...
+            elif line.startswith('RESULT_JSON='):
+                try:
+                    data = json.loads(line[len('RESULT_JSON='):])
+                    if output_json is None:
+                        output_json = data
+                    # Convert legacy format to new format
+                    data['type'] = 'ArchiveResult'
+                    data['plugin'] = plugin_name
+                    data['plugin_hook'] = str(script)
+                    records.append(data)
                 except json.JSONDecodeError:
                     pass
 
@@ -348,6 +383,7 @@ def run_hook(
             output_files=new_files,
             duration_ms=duration_ms,
             hook=str(script),
+            records=records,
         )
 
     except Exception as e:
@@ -360,6 +396,7 @@ def run_hook(
             output_files=[],
             duration_ms=duration_ms,
             hook=str(script),
+            records=[],
         )
@@ -1104,3 +1141,112 @@ def discover_plugin_templates() -> Dict[str, Dict[str, str]]:
 
     return templates
 
+
+# =============================================================================
+# Hook Result Processing Helpers
+# =============================================================================
+
+
+def find_binary_for_cmd(cmd: List[str], machine_id: str) -> Optional[str]:
+    """
+    Find InstalledBinary for a command, trying abspath first then name.
+    Only matches binaries on the current machine.
+
+    Args:
+        cmd: Command list (e.g., ['/usr/bin/wget', '-p', 'url'])
+        machine_id: Current machine ID
+
+    Returns:
+        Binary ID as string if found, None otherwise
+    """
+    if not cmd:
+        return None
+
+    from machine.models import InstalledBinary
+
+    bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
+
+    # Try matching by absolute path first
+    binary = InstalledBinary.objects.filter(
+        abspath=bin_path_or_name,
+        machine_id=machine_id
+    ).first()
+
+    if binary:
+        return str(binary.id)
+
+    # Fallback: match by binary name
+    bin_name = Path(bin_path_or_name).name
+    binary = InstalledBinary.objects.filter(
+        name=bin_name,
+        machine_id=machine_id
+    ).first()
+
+    return str(binary.id) if binary else None
+
+
+def create_model_record(record: Dict[str, Any]) -> Any:
+    """
+    Generic helper to create/update model instances from hook JSONL output.
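+
+    Example records (illustrative values only):
+        {'type': 'InstalledBinary', 'name': 'wget', 'abspath': '/usr/bin/wget', 'version': '1.21'}
+        {'type': 'Machine', '_method': 'update', 'key': 'EXAMPLE_KEY', 'value': 'example-value'}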
+
+    Args:
+        record: Dict with 'type' field and model data
+
+    Returns:
+        Created/updated model instance, or None if type unknown
+    """
+    from machine.models import InstalledBinary, Machine
+
+    record_type = record.pop('type', None)
+    if not record_type:
+        return None
+
+    # Remove plugin metadata (not model fields)
+    record.pop('plugin', None)
+    record.pop('plugin_hook', None)
+
+    if record_type == 'InstalledBinary':
+        # InstalledBinary requires machine FK
+        machine = Machine.current()
+        record.setdefault('machine', machine)
+
+        # Required fields check
+        name = record.get('name')
+        abspath = record.get('abspath')
+        if not name or not abspath:
+            return None
+
+        obj, created = InstalledBinary.objects.update_or_create(
+            machine=machine,
+            name=name,
+            defaults={
+                'abspath': abspath,
+                'version': record.get('version', ''),
+                'sha256': record.get('sha256', ''),
+                'binprovider': record.get('binprovider', 'env'),
+            }
+        )
+        return obj
+
+    elif record_type == 'Machine':
+        # Machine config update (special _method handling)
+        method = record.pop('_method', None)
+        if method == 'update':
+            key = record.get('key')
+            value = record.get('value')
+            if key and value:
+                machine = Machine.current()
+                if not machine.config:
+                    machine.config = {}
+                machine.config[key] = value
+                machine.save(update_fields=['config'])
+                return machine
+        return None
+
+    # Add more types as needed (Dependency, Snapshot, etc.)
+    else:
+        # Unknown type - log warning but don't fail
+        import sys
+        print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr)
+        return None
+