mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 15:27:53 +10:00
Implement hook architecture with JSONL output support
Phase 1: Database migration for new ArchiveResult fields
- Add output_str (TextField) for human-readable summary
- Add output_json (JSONField) for structured metadata
- Add output_files (JSONField) for dict of {relative_path: {}}
- Add output_size (BigIntegerField) for total bytes
- Add output_mimetypes (CharField) for CSV of mimetypes
- Add binary FK to InstalledBinary (optional)
- Migrate existing 'output' field to new split fields
Phase 3: Update run_hook() for JSONL parsing
- Support new JSONL format (any line with {type: 'ModelName', ...})
- Maintain backwards compatibility with RESULT_JSON= format
- Add plugin metadata to each parsed record
- Detect background hooks with .bg. suffix in filename
- Add find_binary_for_cmd() helper function
- Add create_model_record() for processing side-effect records
Phase 6: Update ArchiveResult.run()
- Handle background hooks (return immediately when result is None)
- Process 'records' from HookResult for side-effect models
- Use new output fields (output_str, output_json, output_files, etc.)
- Call create_model_record() for InstalledBinary, Machine updates
Phase 7: Add background hook support
- Add is_background_hook() method to ArchiveResult
- Add check_background_completed() to check if process exited
- Add finalize_background_hook() to collect results from completed hooks
- Update SnapshotMachine.is_finished() to check/finalize background hooks
- Update _populate_output_fields() to walk directory and populate stats
Also updated references to old 'output' field in:
- admin_archiveresults.py
- statemachines.py
- templatetags/core_tags.py
This commit is contained in:
@@ -47,7 +47,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
|||||||
end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-'
|
end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-'
|
||||||
|
|
||||||
# Truncate output for display
|
# Truncate output for display
|
||||||
full_output = result.output or '-'
|
full_output = result.output_str or '-'
|
||||||
output_display = full_output[:60]
|
output_display = full_output[:60]
|
||||||
if len(full_output) > 60:
|
if len(full_output) > 60:
|
||||||
output_display += '...'
|
output_display += '...'
|
||||||
@@ -55,8 +55,9 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
|
|||||||
# Get full command as tooltip
|
# Get full command as tooltip
|
||||||
cmd_str = ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd or '-')
|
cmd_str = ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd or '-')
|
||||||
|
|
||||||
# Build output link
|
# Build output link - use embed_path() which checks output_files first
|
||||||
output_link = f'/archive/{result.snapshot.timestamp}/{result.output}' if result.output and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/'
|
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
|
||||||
|
output_link = f'/archive/{result.snapshot.timestamp}/{embed_path}' if embed_path and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/'
|
||||||
|
|
||||||
# Get version - try cmd_version field
|
# Get version - try cmd_version field
|
||||||
version = result.cmd_version if result.cmd_version else '-'
|
version = result.cmd_version if result.cmd_version else '-'
|
||||||
@@ -336,27 +337,29 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
|||||||
' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
|
' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
|
||||||
)
|
)
|
||||||
|
|
||||||
def output_str(self, result):
|
def output_display(self, result):
|
||||||
# Determine output link path - use output if file exists, otherwise link to index
|
# Determine output link path - use embed_path() which checks output_files
|
||||||
output_path = result.output if (result.status == 'succeeded' and result.output) else 'index.html'
|
embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
|
||||||
|
output_path = embed_path if (result.status == 'succeeded' and embed_path) else 'index.html'
|
||||||
return format_html(
|
return format_html(
|
||||||
'<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
|
'<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
|
||||||
result.snapshot.timestamp,
|
result.snapshot.timestamp,
|
||||||
output_path,
|
output_path,
|
||||||
result.output,
|
result.output_str,
|
||||||
)
|
)
|
||||||
|
|
||||||
def output_summary(self, result):
|
def output_summary(self, result):
|
||||||
snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1]
|
snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1]
|
||||||
output_str = format_html(
|
output_html = format_html(
|
||||||
'<pre style="display: inline-block">{}</pre><br/>',
|
'<pre style="display: inline-block">{}</pre><br/>',
|
||||||
result.output,
|
result.output_str,
|
||||||
)
|
)
|
||||||
output_str += format_html('<a href="/archive/{}/index.html#all">See result files ...</a><br/><pre><code>', str(result.snapshot.timestamp))
|
output_html += format_html('<a href="/archive/{}/index.html#all">See result files ...</a><br/><pre><code>', str(result.snapshot.timestamp))
|
||||||
path_from_output_str = (snapshot_dir / (result.output or ''))
|
embed_path = result.embed_path() if hasattr(result, 'embed_path') else ''
|
||||||
output_str += format_html('<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>', str(snapshot_dir), str(result.output))
|
path_from_embed = (snapshot_dir / (embed_path or ''))
|
||||||
if os.access(path_from_output_str, os.R_OK):
|
output_html += format_html('<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>', str(snapshot_dir), str(embed_path))
|
||||||
root_dir = str(path_from_output_str)
|
if os.access(path_from_embed, os.R_OK):
|
||||||
|
root_dir = str(path_from_embed)
|
||||||
else:
|
else:
|
||||||
root_dir = str(snapshot_dir)
|
root_dir = str(snapshot_dir)
|
||||||
|
|
||||||
|
|||||||
80
archivebox/core/migrations/0029_archiveresult_hook_fields.py
Normal file
80
archivebox/core/migrations/0029_archiveresult_hook_fields.py
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
# Generated by Django for hook architecture support
|
||||||
|
# Phase 1: Add new ArchiveResult fields for hook output
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
import django.db.models.deletion
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
('core', '0028_snapshot_fs_version'),
|
||||||
|
('machine', '0002_rename_custom_cmds_to_overrides'),
|
||||||
|
]
|
||||||
|
|
||||||
|
operations = [
|
||||||
|
# Add new output fields (keep old 'output' temporarily for migration)
|
||||||
|
migrations.AddField(
|
||||||
|
model_name='archiveresult',
|
||||||
|
name='output_str',
|
||||||
|
field=models.TextField(
|
||||||
|
blank=True,
|
||||||
|
default='',
|
||||||
|
help_text='Human-readable output summary (e.g., "Downloaded 5 files")'
|
||||||
|
),
|
||||||
|
),
|
||||||
|
|
||||||
|
migrations.AddField(
|
||||||
|
model_name='archiveresult',
|
||||||
|
name='output_json',
|
||||||
|
field=models.JSONField(
|
||||||
|
null=True,
|
||||||
|
blank=True,
|
||||||
|
default=None,
|
||||||
|
help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields'
|
||||||
|
),
|
||||||
|
),
|
||||||
|
|
||||||
|
migrations.AddField(
|
||||||
|
model_name='archiveresult',
|
||||||
|
name='output_files',
|
||||||
|
field=models.JSONField(
|
||||||
|
default=dict,
|
||||||
|
help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata'
|
||||||
|
),
|
||||||
|
),
|
||||||
|
|
||||||
|
migrations.AddField(
|
||||||
|
model_name='archiveresult',
|
||||||
|
name='output_size',
|
||||||
|
field=models.BigIntegerField(
|
||||||
|
default=0,
|
||||||
|
help_text='Total recursive size in bytes of all output files'
|
||||||
|
),
|
||||||
|
),
|
||||||
|
|
||||||
|
migrations.AddField(
|
||||||
|
model_name='archiveresult',
|
||||||
|
name='output_mimetypes',
|
||||||
|
field=models.CharField(
|
||||||
|
max_length=512,
|
||||||
|
blank=True,
|
||||||
|
default='',
|
||||||
|
help_text='CSV of mimetypes sorted by size descending'
|
||||||
|
),
|
||||||
|
),
|
||||||
|
|
||||||
|
# Add binary FK (optional)
|
||||||
|
migrations.AddField(
|
||||||
|
model_name='archiveresult',
|
||||||
|
name='binary',
|
||||||
|
field=models.ForeignKey(
|
||||||
|
'machine.InstalledBinary',
|
||||||
|
on_delete=models.SET_NULL,
|
||||||
|
null=True,
|
||||||
|
blank=True,
|
||||||
|
related_name='archiveresults',
|
||||||
|
help_text='Primary binary used by this hook (optional)'
|
||||||
|
),
|
||||||
|
),
|
||||||
|
]
|
||||||
64
archivebox/core/migrations/0030_migrate_output_field.py
Normal file
64
archivebox/core/migrations/0030_migrate_output_field.py
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
# Generated by Django for hook architecture support
|
||||||
|
# Phase 1: Migrate existing 'output' field to new split fields
|
||||||
|
|
||||||
|
from django.db import migrations
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
def migrate_output_field(apps, schema_editor):
|
||||||
|
"""
|
||||||
|
Migrate existing 'output' field to new split fields.
|
||||||
|
|
||||||
|
Logic:
|
||||||
|
- If output contains JSON {...}, move to output_json
|
||||||
|
- Otherwise, move to output_str
|
||||||
|
"""
|
||||||
|
ArchiveResult = apps.get_model('core', 'ArchiveResult')
|
||||||
|
|
||||||
|
for ar in ArchiveResult.objects.all().iterator():
|
||||||
|
old_output = ar.output or ''
|
||||||
|
|
||||||
|
# Case 1: JSON output
|
||||||
|
if old_output.strip().startswith('{'):
|
||||||
|
try:
|
||||||
|
parsed = json.loads(old_output)
|
||||||
|
ar.output_json = parsed
|
||||||
|
ar.output_str = ''
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
# Not valid JSON, treat as string
|
||||||
|
ar.output_str = old_output
|
||||||
|
|
||||||
|
# Case 2: File path or plain string
|
||||||
|
else:
|
||||||
|
ar.output_str = old_output
|
||||||
|
|
||||||
|
ar.save(update_fields=['output_str', 'output_json'])
|
||||||
|
|
||||||
|
|
||||||
|
def reverse_migrate(apps, schema_editor):
|
||||||
|
"""Reverse migration - copy output_str back to output."""
|
||||||
|
ArchiveResult = apps.get_model('core', 'ArchiveResult')
|
||||||
|
|
||||||
|
for ar in ArchiveResult.objects.all().iterator():
|
||||||
|
if ar.output_json:
|
||||||
|
ar.output = json.dumps(ar.output_json)
|
||||||
|
else:
|
||||||
|
ar.output = ar.output_str or ''
|
||||||
|
ar.save(update_fields=['output'])
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
('core', '0029_archiveresult_hook_fields'),
|
||||||
|
]
|
||||||
|
|
||||||
|
operations = [
|
||||||
|
migrations.RunPython(migrate_output_field, reverse_migrate),
|
||||||
|
|
||||||
|
# Now safe to remove old 'output' field
|
||||||
|
migrations.RemoveField(
|
||||||
|
model_name='archiveresult',
|
||||||
|
name='output',
|
||||||
|
),
|
||||||
|
]
|
||||||
@@ -36,7 +36,7 @@ from archivebox.base_models.models import (
|
|||||||
from workers.models import ModelWithStateMachine
|
from workers.models import ModelWithStateMachine
|
||||||
from workers.tasks import bg_archive_snapshot
|
from workers.tasks import bg_archive_snapshot
|
||||||
from crawls.models import Crawl
|
from crawls.models import Crawl
|
||||||
from machine.models import NetworkInterface
|
from machine.models import NetworkInterface, InstalledBinary
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -485,9 +485,13 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
|||||||
|
|
||||||
def calc_icons():
|
def calc_icons():
|
||||||
if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
|
if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
|
||||||
archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output}
|
archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and (r.output_files or r.output_str)}
|
||||||
else:
|
else:
|
||||||
archive_results = {r.extractor: r for r in self.archiveresult_set.filter(status="succeeded", output__isnull=False)}
|
# Filter for results that have either output_files or output_str
|
||||||
|
from django.db.models import Q
|
||||||
|
archive_results = {r.extractor: r for r in self.archiveresult_set.filter(
|
||||||
|
Q(status="succeeded") & (Q(output_files__isnull=False) | ~Q(output_str=''))
|
||||||
|
)}
|
||||||
|
|
||||||
path = self.archive_path
|
path = self.archive_path
|
||||||
canon = self.canonical_outputs()
|
canon = self.canonical_outputs()
|
||||||
@@ -499,7 +503,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
|||||||
|
|
||||||
for extractor in all_extractors:
|
for extractor in all_extractors:
|
||||||
result = archive_results.get(extractor)
|
result = archive_results.get(extractor)
|
||||||
existing = result and result.status == 'succeeded' and result.output
|
existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
|
||||||
icon = get_extractor_icon(extractor)
|
icon = get_extractor_icon(extractor)
|
||||||
output += format_html(
|
output += format_html(
|
||||||
output_template,
|
output_template,
|
||||||
@@ -825,17 +829,24 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
|||||||
# Scan each ArchiveResult's output directory for the best file
|
# Scan each ArchiveResult's output directory for the best file
|
||||||
snap_dir = Path(self.output_dir)
|
snap_dir = Path(self.output_dir)
|
||||||
for result in self.archiveresult_set.filter(status='succeeded'):
|
for result in self.archiveresult_set.filter(status='succeeded'):
|
||||||
if not result.output:
|
if not result.output_files and not result.output_str:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Try to find the best output file for this extractor
|
# Try to find the best output file for this extractor
|
||||||
extractor_dir = snap_dir / result.extractor
|
extractor_dir = snap_dir / result.extractor
|
||||||
best_output = None
|
best_output = None
|
||||||
|
|
||||||
if result.output and (snap_dir / result.output).exists():
|
# Check output_files first (new field)
|
||||||
# Use the explicit output path if it exists
|
if result.output_files:
|
||||||
best_output = result.output
|
first_file = next(iter(result.output_files.keys()), None)
|
||||||
elif extractor_dir.exists():
|
if first_file and (extractor_dir / first_file).exists():
|
||||||
|
best_output = f'{result.extractor}/{first_file}'
|
||||||
|
|
||||||
|
# Fallback to output_str if it looks like a path
|
||||||
|
if not best_output and result.output_str and (snap_dir / result.output_str).exists():
|
||||||
|
best_output = result.output_str
|
||||||
|
|
||||||
|
if not best_output and extractor_dir.exists():
|
||||||
# Intelligently find the best file in the extractor's directory
|
# Intelligently find the best file in the extractor's directory
|
||||||
best_output = find_best_output_in_dir(extractor_dir, result.extractor)
|
best_output = find_best_output_in_dir(extractor_dir, result.extractor)
|
||||||
|
|
||||||
@@ -873,14 +884,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
|||||||
def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
|
def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
|
||||||
"""Get the latest output that each archive method produced"""
|
"""Get the latest output that each archive method produced"""
|
||||||
from archivebox.hooks import get_extractors
|
from archivebox.hooks import get_extractors
|
||||||
|
from django.db.models import Q
|
||||||
|
|
||||||
latest: Dict[str, Any] = {}
|
latest: Dict[str, Any] = {}
|
||||||
for archive_method in get_extractors():
|
for archive_method in get_extractors():
|
||||||
results = self.archiveresult_set.filter(extractor=archive_method)
|
results = self.archiveresult_set.filter(extractor=archive_method)
|
||||||
if status is not None:
|
if status is not None:
|
||||||
results = results.filter(status=status)
|
results = results.filter(status=status)
|
||||||
results = results.filter(output__isnull=False).order_by('-start_ts')
|
# Filter for results with output_files or output_str
|
||||||
latest[archive_method] = results.first().output if results.exists() else None
|
results = results.filter(Q(output_files__isnull=False) | ~Q(output_str='')).order_by('-start_ts')
|
||||||
|
result = results.first()
|
||||||
|
# Return embed_path() for backwards compatibility
|
||||||
|
latest[archive_method] = result.embed_path() if result else None
|
||||||
return latest
|
return latest
|
||||||
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
@@ -1021,7 +1036,23 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
|
pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
|
||||||
cmd = models.JSONField(default=None, null=True, blank=True)
|
cmd = models.JSONField(default=None, null=True, blank=True)
|
||||||
cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
|
cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
|
||||||
output = models.CharField(max_length=1024, default=None, null=True, blank=True)
|
|
||||||
|
# New output fields (replacing old 'output' field)
|
||||||
|
output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary')
|
||||||
|
output_json = models.JSONField(null=True, blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)')
|
||||||
|
output_files = models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}')
|
||||||
|
output_size = models.BigIntegerField(default=0, help_text='Total bytes of all output files')
|
||||||
|
output_mimetypes = models.CharField(max_length=512, blank=True, default='', help_text='CSV of mimetypes sorted by size')
|
||||||
|
|
||||||
|
# Binary FK (optional - set when hook reports cmd)
|
||||||
|
binary = models.ForeignKey(
|
||||||
|
'machine.InstalledBinary',
|
||||||
|
on_delete=models.SET_NULL,
|
||||||
|
null=True, blank=True,
|
||||||
|
related_name='archiveresults',
|
||||||
|
help_text='Primary binary used by this hook'
|
||||||
|
)
|
||||||
|
|
||||||
start_ts = models.DateTimeField(default=None, null=True, blank=True)
|
start_ts = models.DateTimeField(default=None, null=True, blank=True)
|
||||||
end_ts = models.DateTimeField(default=None, null=True, blank=True)
|
end_ts = models.DateTimeField(default=None, null=True, blank=True)
|
||||||
|
|
||||||
@@ -1094,11 +1125,19 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
"""
|
"""
|
||||||
Get the relative path to the embeddable output file for this result.
|
Get the relative path to the embeddable output file for this result.
|
||||||
|
|
||||||
Returns the output field if set and file exists, otherwise tries to
|
Returns the first file from output_files if set, otherwise tries to
|
||||||
find a reasonable default based on the extractor type.
|
find a reasonable default based on the extractor type.
|
||||||
"""
|
"""
|
||||||
if self.output:
|
# Check output_files dict for primary output
|
||||||
return self.output
|
if self.output_files:
|
||||||
|
# Return first file from output_files (dict preserves insertion order)
|
||||||
|
first_file = next(iter(self.output_files.keys()), None)
|
||||||
|
if first_file:
|
||||||
|
return f'{self.extractor}/{first_file}'
|
||||||
|
|
||||||
|
# Fallback: check output_str if it looks like a file path
|
||||||
|
if self.output_str and ('/' in self.output_str or '.' in self.output_str):
|
||||||
|
return self.output_str
|
||||||
|
|
||||||
# Try to find output file based on extractor's canonical output path
|
# Try to find output file based on extractor's canonical output path
|
||||||
canonical = self.snapshot.canonical_outputs()
|
canonical = self.snapshot.canonical_outputs()
|
||||||
@@ -1149,7 +1188,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
|
|
||||||
if not hook:
|
if not hook:
|
||||||
self.status = self.StatusChoices.FAILED
|
self.status = self.StatusChoices.FAILED
|
||||||
self.output = f'No hook found for: {self.extractor}'
|
self.output_str = f'No hook found for: {self.extractor}'
|
||||||
self.retry_at = None
|
self.retry_at = None
|
||||||
self.save()
|
self.save()
|
||||||
return
|
return
|
||||||
@@ -1167,8 +1206,20 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
url=self.snapshot.url,
|
url=self.snapshot.url,
|
||||||
snapshot_id=str(self.snapshot.id),
|
snapshot_id=str(self.snapshot.id),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# BACKGROUND HOOK - still running, return immediately
|
||||||
|
if result is None:
|
||||||
|
self.status = self.StatusChoices.STARTED
|
||||||
|
self.start_ts = start_ts
|
||||||
|
self.pwd = str(extractor_dir)
|
||||||
|
self.save()
|
||||||
|
return
|
||||||
|
|
||||||
end_ts = timezone.now()
|
end_ts = timezone.now()
|
||||||
|
|
||||||
|
# Get records from hook output (new JSONL format)
|
||||||
|
records = result.get('records', [])
|
||||||
|
|
||||||
# Clean up empty output directory if no files were created
|
# Clean up empty output directory if no files were created
|
||||||
output_files = result.get('output_files', [])
|
output_files = result.get('output_files', [])
|
||||||
if not output_files and extractor_dir.exists():
|
if not output_files and extractor_dir.exists():
|
||||||
@@ -1179,14 +1230,17 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
except (OSError, RuntimeError):
|
except (OSError, RuntimeError):
|
||||||
pass # Directory not empty or can't be removed, that's fine
|
pass # Directory not empty or can't be removed, that's fine
|
||||||
|
|
||||||
# Determine status from return code and JSON output
|
# Find the ArchiveResult record from hook output (if any)
|
||||||
|
ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
|
||||||
output_json = result.get('output_json') or {}
|
output_json = result.get('output_json') or {}
|
||||||
json_status = output_json.get('status')
|
|
||||||
|
|
||||||
if json_status == 'skipped':
|
# Determine status from records, output_json, or return code
|
||||||
status = 'skipped'
|
if ar_records:
|
||||||
elif json_status == 'failed':
|
# Use status from first ArchiveResult record
|
||||||
status = 'failed'
|
hook_data = ar_records[0]
|
||||||
|
status = hook_data.get('status', 'failed')
|
||||||
|
elif output_json.get('status'):
|
||||||
|
status = output_json['status']
|
||||||
elif result['returncode'] == 0:
|
elif result['returncode'] == 0:
|
||||||
status = 'succeeded'
|
status = 'succeeded'
|
||||||
else:
|
else:
|
||||||
@@ -1199,20 +1253,45 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
'skipped': self.StatusChoices.SKIPPED,
|
'skipped': self.StatusChoices.SKIPPED,
|
||||||
}
|
}
|
||||||
self.status = status_map.get(status, self.StatusChoices.FAILED)
|
self.status = status_map.get(status, self.StatusChoices.FAILED)
|
||||||
self.output = output_json.get('output') or result['stdout'][:1024] or result['stderr'][:1024] or None
|
|
||||||
|
# Set output fields from records or output_json
|
||||||
|
if ar_records:
|
||||||
|
hook_data = ar_records[0]
|
||||||
|
self.output_str = hook_data.get('output_str') or hook_data.get('output') or ''
|
||||||
|
self.output_json = hook_data.get('output_json')
|
||||||
|
# Set cmd from JSONL record
|
||||||
|
if hook_data.get('cmd'):
|
||||||
|
self.cmd = hook_data['cmd']
|
||||||
|
self._set_binary_from_cmd(hook_data['cmd'])
|
||||||
|
if hook_data.get('cmd_version'):
|
||||||
|
self.cmd_version = hook_data['cmd_version'][:128]
|
||||||
|
else:
|
||||||
|
# Fallback to legacy output_json format
|
||||||
|
self.output_str = output_json.get('output_str') or output_json.get('output') or result['stdout'][:1024] or result['stderr'][:1024] or ''
|
||||||
|
self.output_json = output_json.get('output_json') if output_json.get('output_json') else None
|
||||||
|
if output_json.get('cmd_version'):
|
||||||
|
self.cmd_version = output_json['cmd_version'][:128]
|
||||||
|
if output_json.get('cmd'):
|
||||||
|
self.cmd = output_json['cmd']
|
||||||
|
self._set_binary_from_cmd(output_json['cmd'])
|
||||||
|
|
||||||
self.start_ts = start_ts
|
self.start_ts = start_ts
|
||||||
self.end_ts = end_ts
|
self.end_ts = end_ts
|
||||||
self.retry_at = None
|
self.retry_at = None
|
||||||
self.pwd = str(extractor_dir)
|
self.pwd = str(extractor_dir)
|
||||||
|
|
||||||
# Save cmd and cmd_version from extractor output
|
# Populate output_files, output_size, output_mimetypes from filesystem
|
||||||
if output_json.get('cmd_version'):
|
if extractor_dir.exists():
|
||||||
self.cmd_version = output_json['cmd_version'][:128] # Max length from model
|
self._populate_output_fields(extractor_dir)
|
||||||
if output_json.get('cmd'):
|
|
||||||
self.cmd = output_json['cmd']
|
|
||||||
|
|
||||||
self.save()
|
self.save()
|
||||||
|
|
||||||
|
# Process side-effect records (InstalledBinary, Machine config, etc.)
|
||||||
|
from archivebox.hooks import create_model_record
|
||||||
|
for record in records:
|
||||||
|
if record.get('type') != 'ArchiveResult':
|
||||||
|
create_model_record(record.copy()) # Copy to avoid mutating original
|
||||||
|
|
||||||
# Queue any discovered URLs for crawling (parser extractors write urls.jsonl)
|
# Queue any discovered URLs for crawling (parser extractors write urls.jsonl)
|
||||||
self._queue_urls_for_crawl(extractor_dir)
|
self._queue_urls_for_crawl(extractor_dir)
|
||||||
|
|
||||||
@@ -1226,6 +1305,84 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
if self.status == self.StatusChoices.SUCCEEDED:
|
if self.status == self.StatusChoices.SUCCEEDED:
|
||||||
self.trigger_search_indexing()
|
self.trigger_search_indexing()
|
||||||
|
|
||||||
|
def _populate_output_fields(self, output_dir: Path) -> None:
|
||||||
|
"""
|
||||||
|
Walk output directory and populate output_files, output_size, output_mimetypes.
|
||||||
|
"""
|
||||||
|
import mimetypes
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
|
||||||
|
|
||||||
|
# Track mimetypes and sizes for aggregation
|
||||||
|
mime_sizes = defaultdict(int)
|
||||||
|
total_size = 0
|
||||||
|
output_files = {} # Dict keyed by relative path
|
||||||
|
|
||||||
|
for file_path in output_dir.rglob('*'):
|
||||||
|
# Skip non-files and infrastructure files
|
||||||
|
if not file_path.is_file():
|
||||||
|
continue
|
||||||
|
if file_path.name in exclude_names:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Get file stats
|
||||||
|
try:
|
||||||
|
stat = file_path.stat()
|
||||||
|
mime_type, _ = mimetypes.guess_type(str(file_path))
|
||||||
|
mime_type = mime_type or 'application/octet-stream'
|
||||||
|
|
||||||
|
# Track for ArchiveResult fields
|
||||||
|
relative_path = str(file_path.relative_to(output_dir))
|
||||||
|
output_files[relative_path] = {} # Empty dict, extensible for future metadata
|
||||||
|
mime_sizes[mime_type] += stat.st_size
|
||||||
|
total_size += stat.st_size
|
||||||
|
except (OSError, IOError):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Populate ArchiveResult fields
|
||||||
|
self.output_files = output_files
|
||||||
|
self.output_size = total_size
|
||||||
|
|
||||||
|
# Build output_mimetypes CSV (sorted by size descending)
|
||||||
|
sorted_mimes = sorted(mime_sizes.items(), key=lambda x: x[1], reverse=True)
|
||||||
|
self.output_mimetypes = ','.join(mime for mime, _ in sorted_mimes)
|
||||||
|
|
||||||
|
def _set_binary_from_cmd(self, cmd: list) -> None:
|
||||||
|
"""
|
||||||
|
Find InstalledBinary for command and set binary FK.
|
||||||
|
|
||||||
|
Tries matching by absolute path first, then by binary name.
|
||||||
|
Only matches binaries on the current machine.
|
||||||
|
"""
|
||||||
|
if not cmd:
|
||||||
|
return
|
||||||
|
|
||||||
|
from machine.models import Machine
|
||||||
|
|
||||||
|
bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
|
||||||
|
machine = Machine.current()
|
||||||
|
|
||||||
|
# Try matching by absolute path first
|
||||||
|
binary = InstalledBinary.objects.filter(
|
||||||
|
abspath=bin_path_or_name,
|
||||||
|
machine=machine
|
||||||
|
).first()
|
||||||
|
|
||||||
|
if binary:
|
||||||
|
self.binary = binary
|
||||||
|
return
|
||||||
|
|
||||||
|
# Fallback: match by binary name
|
||||||
|
bin_name = Path(bin_path_or_name).name
|
||||||
|
binary = InstalledBinary.objects.filter(
|
||||||
|
name=bin_name,
|
||||||
|
machine=machine
|
||||||
|
).first()
|
||||||
|
|
||||||
|
if binary:
|
||||||
|
self.binary = binary
|
||||||
|
|
||||||
def _update_snapshot_title(self, extractor_dir: Path):
|
def _update_snapshot_title(self, extractor_dir: Path):
|
||||||
"""
|
"""
|
||||||
Update snapshot title from title extractor output.
|
Update snapshot title from title extractor output.
|
||||||
@@ -1325,3 +1482,120 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
def output_dir(self) -> Path:
|
def output_dir(self) -> Path:
|
||||||
"""Get the output directory for this extractor's results."""
|
"""Get the output directory for this extractor's results."""
|
||||||
return Path(self.snapshot.output_dir) / self.extractor
|
return Path(self.snapshot.output_dir) / self.extractor
|
||||||
|
|
||||||
|
def is_background_hook(self) -> bool:
|
||||||
|
"""Check if this ArchiveResult is for a background hook."""
|
||||||
|
extractor_dir = Path(self.pwd) if self.pwd else None
|
||||||
|
if not extractor_dir:
|
||||||
|
return False
|
||||||
|
pid_file = extractor_dir / 'hook.pid'
|
||||||
|
return pid_file.exists()
|
||||||
|
|
||||||
|
def check_background_completed(self) -> bool:
|
||||||
|
"""
|
||||||
|
Check if background hook process has exited.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if completed (process exited), False if still running
|
||||||
|
"""
|
||||||
|
extractor_dir = Path(self.pwd) if self.pwd else None
|
||||||
|
if not extractor_dir:
|
||||||
|
return True # No pwd = completed or failed to start
|
||||||
|
|
||||||
|
pid_file = extractor_dir / 'hook.pid'
|
||||||
|
if not pid_file.exists():
|
||||||
|
return True # No PID file = completed or failed to start
|
||||||
|
|
||||||
|
try:
|
||||||
|
pid = int(pid_file.read_text().strip())
|
||||||
|
os.kill(pid, 0) # Signal 0 = check if process exists
|
||||||
|
return False # Still running
|
||||||
|
except (OSError, ValueError):
|
||||||
|
return True # Process exited or invalid PID
|
||||||
|
|
||||||
|
def finalize_background_hook(self) -> None:
|
||||||
|
"""
|
||||||
|
Collect final results from completed background hook.
|
||||||
|
|
||||||
|
Same logic as run() but for background hooks that already started.
|
||||||
|
"""
|
||||||
|
from archivebox.hooks import create_model_record
|
||||||
|
|
||||||
|
extractor_dir = Path(self.pwd) if self.pwd else None
|
||||||
|
if not extractor_dir or not extractor_dir.exists():
|
||||||
|
self.status = self.StatusChoices.FAILED
|
||||||
|
self.output_str = 'Background hook output directory not found'
|
||||||
|
self.end_ts = timezone.now()
|
||||||
|
self.retry_at = None
|
||||||
|
self.save()
|
||||||
|
return
|
||||||
|
|
||||||
|
stdout_file = extractor_dir / 'stdout.log'
|
||||||
|
stderr_file = extractor_dir / 'stderr.log'
|
||||||
|
|
||||||
|
# Read logs
|
||||||
|
stdout = stdout_file.read_text() if stdout_file.exists() else ''
|
||||||
|
|
||||||
|
# Parse JSONL output
|
||||||
|
records = []
|
||||||
|
for line in stdout.splitlines():
|
||||||
|
line = line.strip()
|
||||||
|
if not line or not line.startswith('{'):
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
data = json.loads(line)
|
||||||
|
if 'type' in data:
|
||||||
|
records.append(data)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Find the ArchiveResult record
|
||||||
|
ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
|
||||||
|
|
||||||
|
if ar_records:
|
||||||
|
hook_data = ar_records[0]
|
||||||
|
|
||||||
|
# Apply hook's data
|
||||||
|
status_str = hook_data.get('status', 'failed')
|
||||||
|
status_map = {
|
||||||
|
'succeeded': self.StatusChoices.SUCCEEDED,
|
||||||
|
'failed': self.StatusChoices.FAILED,
|
||||||
|
'skipped': self.StatusChoices.SKIPPED,
|
||||||
|
}
|
||||||
|
self.status = status_map.get(status_str, self.StatusChoices.FAILED)
|
||||||
|
|
||||||
|
self.output_str = hook_data.get('output_str') or hook_data.get('output') or ''
|
||||||
|
self.output_json = hook_data.get('output_json')
|
||||||
|
|
||||||
|
# Determine binary FK from cmd
|
||||||
|
if hook_data.get('cmd'):
|
||||||
|
self.cmd = hook_data['cmd']
|
||||||
|
self._set_binary_from_cmd(hook_data['cmd'])
|
||||||
|
if hook_data.get('cmd_version'):
|
||||||
|
self.cmd_version = hook_data['cmd_version'][:128]
|
||||||
|
else:
|
||||||
|
# No output = failed
|
||||||
|
self.status = self.StatusChoices.FAILED
|
||||||
|
self.output_str = 'Background hook did not output ArchiveResult'
|
||||||
|
|
||||||
|
self.end_ts = timezone.now()
|
||||||
|
self.retry_at = None
|
||||||
|
|
||||||
|
# Populate output fields from filesystem
|
||||||
|
if extractor_dir.exists():
|
||||||
|
self._populate_output_fields(extractor_dir)
|
||||||
|
|
||||||
|
self.save()
|
||||||
|
|
||||||
|
# Create any side-effect records
|
||||||
|
for record in records:
|
||||||
|
if record.get('type') != 'ArchiveResult':
|
||||||
|
create_model_record(record.copy())
|
||||||
|
|
||||||
|
# Cleanup PID files and empty logs
|
||||||
|
pid_file = extractor_dir / 'hook.pid'
|
||||||
|
pid_file.unlink(missing_ok=True)
|
||||||
|
if stdout_file.exists() and stdout_file.stat().st_size == 0:
|
||||||
|
stdout_file.unlink()
|
||||||
|
if stderr_file.exists() and stderr_file.stat().st_size == 0:
|
||||||
|
stderr_file.unlink()
|
||||||
|
|||||||
@@ -59,11 +59,22 @@ class SnapshotMachine(StateMachine, strict_states=True):
|
|||||||
# if no archiveresults exist yet, it's not finished
|
# if no archiveresults exist yet, it's not finished
|
||||||
if not self.snapshot.archiveresult_set.exists():
|
if not self.snapshot.archiveresult_set.exists():
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# if archiveresults exist but are still pending, it's not finished
|
# if archiveresults exist but are still pending, it's not finished
|
||||||
if self.snapshot.pending_archiveresults().exists():
|
if self.snapshot.pending_archiveresults().exists():
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
# Check for background hooks that are still running
|
||||||
|
started_results = self.snapshot.archiveresult_set.filter(
|
||||||
|
status=ArchiveResult.StatusChoices.STARTED
|
||||||
|
)
|
||||||
|
for result in started_results:
|
||||||
|
if not result.check_background_completed():
|
||||||
|
return False # Still running
|
||||||
|
|
||||||
|
# Completed - finalize it
|
||||||
|
result.finalize_background_hook()
|
||||||
|
|
||||||
# otherwise archiveresults exist and are all finished, so it's finished
|
# otherwise archiveresults exist and are all finished, so it's finished
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@@ -184,10 +195,10 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
|||||||
|
|
||||||
def is_backoff(self) -> bool:
|
def is_backoff(self) -> bool:
|
||||||
"""Check if we should backoff and retry later."""
|
"""Check if we should backoff and retry later."""
|
||||||
# Backoff if status is still started (extractor didn't complete) and output is None
|
# Backoff if status is still started (extractor didn't complete) and output_str is empty
|
||||||
return (
|
return (
|
||||||
self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
|
self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
|
||||||
self.archiveresult.output is None
|
not self.archiveresult.output_str
|
||||||
)
|
)
|
||||||
|
|
||||||
def is_finished(self) -> bool:
|
def is_finished(self) -> bool:
|
||||||
|
|||||||
@@ -80,7 +80,7 @@ def extractor_thumbnail(context, result) -> str:
|
|||||||
return ''
|
return ''
|
||||||
|
|
||||||
# Use embed_path() for the display path (includes canonical paths)
|
# Use embed_path() for the display path (includes canonical paths)
|
||||||
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
|
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')
|
||||||
|
|
||||||
# Create a mini template and render it with context
|
# Create a mini template and render it with context
|
||||||
try:
|
try:
|
||||||
@@ -109,7 +109,7 @@ def extractor_embed(context, result) -> str:
|
|||||||
if not template_str:
|
if not template_str:
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
|
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tpl = template.Template(template_str)
|
tpl = template.Template(template_str)
|
||||||
@@ -137,7 +137,7 @@ def extractor_fullscreen(context, result) -> str:
|
|||||||
if not template_str:
|
if not template_str:
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
|
output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
tpl = template.Template(template_str)
|
tpl = template.Template(template_str)
|
||||||
|
|||||||
@@ -68,6 +68,8 @@ class HookResult(TypedDict, total=False):
|
|||||||
output_files: List[str]
|
output_files: List[str]
|
||||||
duration_ms: int
|
duration_ms: int
|
||||||
hook: str
|
hook: str
|
||||||
|
# New fields for JSONL parsing
|
||||||
|
records: List[Dict[str, Any]] # Parsed JSONL records with 'type' field
|
||||||
|
|
||||||
|
|
||||||
def discover_hooks(event_name: str) -> List[Path]:
|
def discover_hooks(event_name: str) -> List[Path]:
|
||||||
@@ -268,7 +270,9 @@ def run_hook(
|
|||||||
files_before = set(output_dir.rglob('*')) if output_dir.exists() else set()
|
files_before = set(output_dir.rglob('*')) if output_dir.exists() else set()
|
||||||
|
|
||||||
# Detect if this is a background hook (long-running daemon)
|
# Detect if this is a background hook (long-running daemon)
|
||||||
is_background = '__background' in script.stem
|
# New convention: .bg. suffix (e.g., on_Snapshot__21_consolelog.bg.js)
|
||||||
|
# Old convention: __background in stem (for backwards compatibility)
|
||||||
|
is_background = '.bg.' in script.name or '__background' in script.stem
|
||||||
|
|
||||||
# Set up output files for ALL hooks (useful for debugging)
|
# Set up output files for ALL hooks (useful for debugging)
|
||||||
stdout_file = output_dir / 'stdout.log'
|
stdout_file = output_dir / 'stdout.log'
|
||||||
@@ -322,13 +326,44 @@ def run_hook(
|
|||||||
# Exclude the log files themselves from new_files
|
# Exclude the log files themselves from new_files
|
||||||
new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid')]
|
new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid')]
|
||||||
|
|
||||||
# Parse RESULT_JSON from stdout
|
# Parse JSONL output from stdout
|
||||||
|
# Supports both new JSONL format (any line starting with { that has 'type')
|
||||||
|
# and legacy RESULT_JSON= format for backwards compatibility
|
||||||
output_json = None
|
output_json = None
|
||||||
|
records = []
|
||||||
|
plugin_name = script.parent.name # Plugin directory name (e.g., 'wget')
|
||||||
|
|
||||||
for line in stdout.splitlines():
|
for line in stdout.splitlines():
|
||||||
if line.startswith('RESULT_JSON='):
|
line = line.strip()
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# New JSONL format: any line starting with { that has 'type' field
|
||||||
|
if line.startswith('{'):
|
||||||
try:
|
try:
|
||||||
output_json = json.loads(line[len('RESULT_JSON='):])
|
data = json.loads(line)
|
||||||
break
|
if 'type' in data:
|
||||||
|
# Add plugin metadata to every record
|
||||||
|
data['plugin'] = plugin_name
|
||||||
|
data['plugin_hook'] = str(script)
|
||||||
|
records.append(data)
|
||||||
|
# For backwards compatibility, also set output_json for first ArchiveResult
|
||||||
|
if data.get('type') == 'ArchiveResult' and output_json is None:
|
||||||
|
output_json = data
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Legacy format: RESULT_JSON=...
|
||||||
|
elif line.startswith('RESULT_JSON='):
|
||||||
|
try:
|
||||||
|
data = json.loads(line[len('RESULT_JSON='):])
|
||||||
|
if output_json is None:
|
||||||
|
output_json = data
|
||||||
|
# Convert legacy format to new format
|
||||||
|
data['type'] = 'ArchiveResult'
|
||||||
|
data['plugin'] = plugin_name
|
||||||
|
data['plugin_hook'] = str(script)
|
||||||
|
records.append(data)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -348,6 +383,7 @@ def run_hook(
|
|||||||
output_files=new_files,
|
output_files=new_files,
|
||||||
duration_ms=duration_ms,
|
duration_ms=duration_ms,
|
||||||
hook=str(script),
|
hook=str(script),
|
||||||
|
records=records,
|
||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -360,6 +396,7 @@ def run_hook(
|
|||||||
output_files=[],
|
output_files=[],
|
||||||
duration_ms=duration_ms,
|
duration_ms=duration_ms,
|
||||||
hook=str(script),
|
hook=str(script),
|
||||||
|
records=[],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -1104,3 +1141,112 @@ def discover_plugin_templates() -> Dict[str, Dict[str, str]]:
|
|||||||
return templates
|
return templates
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Hook Result Processing Helpers
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def find_binary_for_cmd(cmd: List[str], machine_id: str) -> Optional[str]:
|
||||||
|
"""
|
||||||
|
Find InstalledBinary for a command, trying abspath first then name.
|
||||||
|
Only matches binaries on the current machine.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
cmd: Command list (e.g., ['/usr/bin/wget', '-p', 'url'])
|
||||||
|
machine_id: Current machine ID
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Binary ID as string if found, None otherwise
|
||||||
|
"""
|
||||||
|
if not cmd:
|
||||||
|
return None
|
||||||
|
|
||||||
|
from machine.models import InstalledBinary
|
||||||
|
|
||||||
|
bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
|
||||||
|
|
||||||
|
# Try matching by absolute path first
|
||||||
|
binary = InstalledBinary.objects.filter(
|
||||||
|
abspath=bin_path_or_name,
|
||||||
|
machine_id=machine_id
|
||||||
|
).first()
|
||||||
|
|
||||||
|
if binary:
|
||||||
|
return str(binary.id)
|
||||||
|
|
||||||
|
# Fallback: match by binary name
|
||||||
|
bin_name = Path(bin_path_or_name).name
|
||||||
|
binary = InstalledBinary.objects.filter(
|
||||||
|
name=bin_name,
|
||||||
|
machine_id=machine_id
|
||||||
|
).first()
|
||||||
|
|
||||||
|
return str(binary.id) if binary else None
|
||||||
|
|
||||||
|
|
||||||
|
def create_model_record(record: Dict[str, Any]) -> Any:
|
||||||
|
"""
|
||||||
|
Generic helper to create/update model instances from hook JSONL output.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
record: Dict with 'type' field and model data
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Created/updated model instance, or None if type unknown
|
||||||
|
"""
|
||||||
|
from machine.models import InstalledBinary, Machine
|
||||||
|
|
||||||
|
record_type = record.pop('type', None)
|
||||||
|
if not record_type:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Remove plugin metadata (not model fields)
|
||||||
|
record.pop('plugin', None)
|
||||||
|
record.pop('plugin_hook', None)
|
||||||
|
|
||||||
|
if record_type == 'InstalledBinary':
|
||||||
|
# InstalledBinary requires machine FK
|
||||||
|
machine = Machine.current()
|
||||||
|
record.setdefault('machine', machine)
|
||||||
|
|
||||||
|
# Required fields check
|
||||||
|
name = record.get('name')
|
||||||
|
abspath = record.get('abspath')
|
||||||
|
if not name or not abspath:
|
||||||
|
return None
|
||||||
|
|
||||||
|
obj, created = InstalledBinary.objects.update_or_create(
|
||||||
|
machine=machine,
|
||||||
|
name=name,
|
||||||
|
defaults={
|
||||||
|
'abspath': abspath,
|
||||||
|
'version': record.get('version', ''),
|
||||||
|
'sha256': record.get('sha256', ''),
|
||||||
|
'binprovider': record.get('binprovider', 'env'),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
return obj
|
||||||
|
|
||||||
|
elif record_type == 'Machine':
|
||||||
|
# Machine config update (special _method handling)
|
||||||
|
method = record.pop('_method', None)
|
||||||
|
if method == 'update':
|
||||||
|
key = record.get('key')
|
||||||
|
value = record.get('value')
|
||||||
|
if key and value:
|
||||||
|
machine = Machine.current()
|
||||||
|
if not machine.config:
|
||||||
|
machine.config = {}
|
||||||
|
machine.config[key] = value
|
||||||
|
machine.save(update_fields=['config'])
|
||||||
|
return machine
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Add more types as needed (Dependency, Snapshot, etc.)
|
||||||
|
else:
|
||||||
|
# Unknown type - log warning but don't fail
|
||||||
|
import sys
|
||||||
|
print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user