diff --git a/archivebox/core/admin_archiveresults.py b/archivebox/core/admin_archiveresults.py
index f525b84f..18b5fadc 100644
--- a/archivebox/core/admin_archiveresults.py
+++ b/archivebox/core/admin_archiveresults.py
@@ -47,7 +47,7 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-'

# Truncate output for display
- full_output = result.output or '-'
+ full_output = result.output_str or '-'
output_display = full_output[:60]
if len(full_output) > 60:
output_display += '...'
@@ -55,8 +55,9 @@ def render_archiveresults_list(archiveresults_qs, limit=50):
# Get full command as tooltip
cmd_str = ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd or '-')

- # Build output link
- output_link = f'/archive/{result.snapshot.timestamp}/{result.output}' if result.output and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/'
+ # Build output link - use embed_path() which checks output_files first
+ embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
+ output_link = f'/archive/{result.snapshot.timestamp}/{embed_path}' if embed_path and result.status == 'succeeded' else f'/archive/{result.snapshot.timestamp}/'

# Get version - try cmd_version field
version = result.cmd_version if result.cmd_version else '-'
@@ -336,27 +337,29 @@ class ArchiveResultAdmin(BaseModelAdmin):
' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
)

- def output_str(self, result):
- # Determine output link path - use output if file exists, otherwise link to index
- output_path = result.output if (result.status == 'succeeded' and result.output) else 'index.html'
+ def output_display(self, result):
+ # Determine output link path - use embed_path() which checks output_files
+ embed_path = result.embed_path() if hasattr(result, 'embed_path') else None
+ output_path = embed_path if (result.status == 'succeeded' and embed_path) else 'index.html'
return format_html(
'<a href="/archive/{}/{}">↗️</a> {}',
result.snapshot.timestamp,
output_path,
- result.output,
+ result.output_str,
)

def output_summary(self, result):
snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1]
- output_str = format_html(
+ output_html = format_html(
'{}', str(result.snapshot.timestamp))
- path_from_output_str = (snapshot_dir / (result.output or ''))
- output_str += format_html('{}/{}<br/>', str(snapshot_dir), str(result.output))
- if os.access(path_from_output_str, os.R_OK):
- root_dir = str(path_from_output_str)
+ output_html += format_html('<a href="/archive/{}/index.html">See result files ...</a><br/>', str(result.snapshot.timestamp))
+ embed_path = result.embed_path() if hasattr(result, 'embed_path') else ''
+ path_from_embed = (snapshot_dir / (embed_path or ''))
+ output_html += format_html('{}/{}<br/>', str(snapshot_dir), str(embed_path))
+ if os.access(path_from_embed, os.R_OK):
+ root_dir = str(path_from_embed)
else:
root_dir = str(snapshot_dir)
diff --git a/archivebox/core/migrations/0029_archiveresult_hook_fields.py b/archivebox/core/migrations/0029_archiveresult_hook_fields.py
new file mode 100644
index 00000000..0ff1f0c2
--- /dev/null
+++ b/archivebox/core/migrations/0029_archiveresult_hook_fields.py
@@ -0,0 +1,80 @@
+# Generated by Django for hook architecture support
+# Phase 1: Add new ArchiveResult fields for hook output
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0028_snapshot_fs_version'),
+ ('machine', '0002_rename_custom_cmds_to_overrides'),
+ ]
+
+ operations = [
+ # Add new output fields (keep old 'output' temporarily for migration)
+ migrations.AddField(
+ model_name='archiveresult',
+ name='output_str',
+ field=models.TextField(
+ blank=True,
+ default='',
+ help_text='Human-readable output summary (e.g., "Downloaded 5 files")'
+ ),
+ ),
+
+ migrations.AddField(
+ model_name='archiveresult',
+ name='output_json',
+ field=models.JSONField(
+ null=True,
+ blank=True,
+ default=None,
+ help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields'
+ ),
+ ),
+
+ migrations.AddField(
+ model_name='archiveresult',
+ name='output_files',
+ field=models.JSONField(
+ default=dict,
+ help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata'
+ ),
+ ),
+
+ migrations.AddField(
+ model_name='archiveresult',
+ name='output_size',
+ field=models.BigIntegerField(
+ default=0,
+ help_text='Total recursive size in bytes of all output files'
+ ),
+ ),
+
+ migrations.AddField(
+ model_name='archiveresult',
+ name='output_mimetypes',
+ field=models.CharField(
+ max_length=512,
+ blank=True,
+ default='',
+ help_text='CSV of mimetypes sorted by size descending'
+ ),
+ ),
+
+ # Add binary FK (optional)
+ migrations.AddField(
+ model_name='archiveresult',
+ name='binary',
+ field=models.ForeignKey(
+ 'machine.InstalledBinary',
+ on_delete=models.SET_NULL,
+ null=True,
+ blank=True,
+ related_name='archiveresults',
+ help_text='Primary binary used by this hook (optional)'
+ ),
+ ),
+ ]
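
For illustration, a hypothetical row under the new schema (all values invented):

```python
# Illustrative only: what the new fields might hold for one wget result.
example = {
    'output_str': 'Downloaded 5 files (1.2 MB)',
    'output_json': {'redirects': ['http://example.com/ -> https://example.com/']},
    'output_files': {'example.com/index.html': {}, 'example.com/style.css': {}},
    'output_size': 1234567,                    # total recursive bytes
    'output_mimetypes': 'text/html,text/css',  # CSV sorted by size descending
}
print(example['output_mimetypes'].split(',')[0])  # largest type first: 'text/html'
```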
diff --git a/archivebox/core/migrations/0030_migrate_output_field.py b/archivebox/core/migrations/0030_migrate_output_field.py
new file mode 100644
index 00000000..5dafb7e8
--- /dev/null
+++ b/archivebox/core/migrations/0030_migrate_output_field.py
@@ -0,0 +1,64 @@
+# Generated by Django for hook architecture support
+# Phase 1: Migrate existing 'output' field to new split fields
+
+from django.db import migrations
+import json
+
+
+def migrate_output_field(apps, schema_editor):
+ """
+ Migrate existing 'output' field to new split fields.
+
+ Logic:
+ - If output contains JSON {...}, move to output_json
+ - Otherwise, move to output_str
+ """
+ ArchiveResult = apps.get_model('core', 'ArchiveResult')
+
+ for ar in ArchiveResult.objects.all().iterator():
+ old_output = ar.output or ''
+
+ # Case 1: JSON output
+ if old_output.strip().startswith('{'):
+ try:
+ parsed = json.loads(old_output)
+ ar.output_json = parsed
+ ar.output_str = ''
+ except json.JSONDecodeError:
+ # Not valid JSON, treat as string
+ ar.output_str = old_output
+
+ # Case 2: File path or plain string
+ else:
+ ar.output_str = old_output
+
+ ar.save(update_fields=['output_str', 'output_json'])
+
+
+def reverse_migrate(apps, schema_editor):
+ """Reverse migration - copy output_str back to output."""
+ ArchiveResult = apps.get_model('core', 'ArchiveResult')
+
+ for ar in ArchiveResult.objects.all().iterator():
+ if ar.output_json:
+ ar.output = json.dumps(ar.output_json)
+ else:
+ ar.output = ar.output_str or ''
+ ar.save(update_fields=['output'])
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0029_archiveresult_hook_fields'),
+ ]
+
+ operations = [
+ migrations.RunPython(migrate_output_field, reverse_migrate),
+
+ # Now safe to remove old 'output' field
+ migrations.RemoveField(
+ model_name='archiveresult',
+ name='output',
+ ),
+ ]
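
The data migration's branching is easiest to see on concrete values; a standalone sketch of the same logic (not the migration itself):

```python
import json

def split_legacy_output(old_output: str):
    """Mirror of the migration's branching (sketch, not the migration itself)."""
    if old_output.strip().startswith('{'):
        try:
            return '', json.loads(old_output)   # -> (output_str, output_json)
        except json.JSONDecodeError:
            return old_output, None             # malformed JSON stays a string
    return old_output, None                     # file path or plain message

assert split_legacy_output('warc/1234.warc.gz') == ('warc/1234.warc.gz', None)
assert split_legacy_output('{"status": "skipped"}') == ('', {'status': 'skipped'})
```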
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 6bac5679..1e5dcc0f 100755
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -36,7 +36,7 @@ from archivebox.base_models.models import (
from workers.models import ModelWithStateMachine
from workers.tasks import bg_archive_snapshot
from crawls.models import Crawl
-from machine.models import NetworkInterface
+from machine.models import NetworkInterface, InstalledBinary
@@ -485,9 +485,13 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def calc_icons():
if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
- archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output}
+ archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and (r.output_files or r.output_str)}
else:
- archive_results = {r.extractor: r for r in self.archiveresult_set.filter(status="succeeded", output__isnull=False)}
+ # Filter for results that have either output_files or output_str
+ from django.db.models import Q
+ archive_results = {r.extractor: r for r in self.archiveresult_set.filter(
+ Q(status="succeeded") & (~Q(output_files={}) | ~Q(output_str=''))
+ )}
path = self.archive_path
canon = self.canonical_outputs()
@@ -499,7 +503,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
for extractor in all_extractors:
result = archive_results.get(extractor)
- existing = result and result.status == 'succeeded' and result.output
+ existing = result and result.status == 'succeeded' and (result.output_files or result.output_str)
icon = get_extractor_icon(extractor)
output += format_html(
output_template,
@@ -825,17 +829,24 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# Scan each ArchiveResult's output directory for the best file
snap_dir = Path(self.output_dir)
for result in self.archiveresult_set.filter(status='succeeded'):
- if not result.output:
+ if not result.output_files and not result.output_str:
continue
# Try to find the best output file for this extractor
extractor_dir = snap_dir / result.extractor
best_output = None
- if result.output and (snap_dir / result.output).exists():
- # Use the explicit output path if it exists
- best_output = result.output
- elif extractor_dir.exists():
+ # Check output_files first (new field)
+ if result.output_files:
+ first_file = next(iter(result.output_files.keys()), None)
+ if first_file and (extractor_dir / first_file).exists():
+ best_output = f'{result.extractor}/{first_file}'
+
+ # Fallback to output_str if it looks like a path
+ if not best_output and result.output_str and (snap_dir / result.output_str).exists():
+ best_output = result.output_str
+
+ if not best_output and extractor_dir.exists():
# Intelligently find the best file in the extractor's directory
best_output = find_best_output_in_dir(extractor_dir, result.extractor)
@@ -873,14 +884,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
"""Get the latest output that each archive method produced"""
from archivebox.hooks import get_extractors
+ from django.db.models import Q
latest: Dict[str, Any] = {}
for archive_method in get_extractors():
results = self.archiveresult_set.filter(extractor=archive_method)
if status is not None:
results = results.filter(status=status)
- results = results.filter(output__isnull=False).order_by('-start_ts')
- latest[archive_method] = results.first().output if results.exists() else None
+ # Filter for results with output_files or output_str
+ results = results.filter(~Q(output_files={}) | ~Q(output_str='')).order_by('-start_ts')
+ result = results.first()
+ # Return embed_path() for backwards compatibility
+ latest[archive_method] = result.embed_path() if result else None
return latest
# =========================================================================
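
Hedged usage sketch of the changed return shape (assumes a Django context where `core.models` is importable):

```python
from core.models import Snapshot

snapshot = Snapshot.objects.first()
if snapshot:
    outputs = snapshot.latest_outputs(status='succeeded')
    # hypothetical shape: {'wget': 'wget/example.com/index.html', 'title': None, ...}
    for extractor, path in outputs.items():
        print(extractor, '->', path or '(no output)')
```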
@@ -1021,7 +1036,23 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
cmd = models.JSONField(default=None, null=True, blank=True)
cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
- output = models.CharField(max_length=1024, default=None, null=True, blank=True)
+
+ # New output fields (replacing old 'output' field)
+ output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary')
+ output_json = models.JSONField(null=True, blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)')
+ output_files = models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}')
+ output_size = models.BigIntegerField(default=0, help_text='Total bytes of all output files')
+ output_mimetypes = models.CharField(max_length=512, blank=True, default='', help_text='CSV of mimetypes sorted by size')
+
+ # Binary FK (optional - set when hook reports cmd)
+ binary = models.ForeignKey(
+ 'machine.InstalledBinary',
+ on_delete=models.SET_NULL,
+ null=True, blank=True,
+ related_name='archiveresults',
+ help_text='Primary binary used by this hook'
+ )
+
start_ts = models.DateTimeField(default=None, null=True, blank=True)
end_ts = models.DateTimeField(default=None, null=True, blank=True)
@@ -1094,11 +1125,19 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
"""
Get the relative path to the embeddable output file for this result.
- Returns the output field if set and file exists, otherwise tries to
+ Returns the first file from output_files if set, otherwise tries to
find a reasonable default based on the extractor type.
"""
- if self.output:
- return self.output
+ # Check output_files dict for primary output
+ if self.output_files:
+ # Return first file from output_files (dict preserves insertion order)
+ first_file = next(iter(self.output_files.keys()), None)
+ if first_file:
+ return f'{self.extractor}/{first_file}'
+
+ # Fallback: check output_str if it looks like a file path
+ if self.output_str and ('/' in self.output_str or '.' in self.output_str):
+ return self.output_str
# Try to find output file based on extractor's canonical output path
canonical = self.snapshot.canonical_outputs()
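
The new precedence in embed_path(), mirrored as a standalone sketch:

```python
def resolve_embed_path(extractor: str, output_files: dict, output_str: str):
    """Standalone mirror of the new embed_path() precedence (sketch only)."""
    first_file = next(iter(output_files), None)
    if first_file:
        return f'{extractor}/{first_file}'      # 1. first key of output_files
    if output_str and ('/' in output_str or '.' in output_str):
        return output_str                       # 2. output_str, if path-like
    return None  # 3. real method falls back to snapshot.canonical_outputs()

assert resolve_embed_path('wget', {'example.com/index.html': {}}, '') == 'wget/example.com/index.html'
assert resolve_embed_path('title', {}, 'Some Page Title') is None
```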
@@ -1149,7 +1188,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
if not hook:
self.status = self.StatusChoices.FAILED
- self.output = f'No hook found for: {self.extractor}'
+ self.output_str = f'No hook found for: {self.extractor}'
self.retry_at = None
self.save()
return
@@ -1167,8 +1206,20 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
url=self.snapshot.url,
snapshot_id=str(self.snapshot.id),
)
+
+ # BACKGROUND HOOK - still running, return immediately
+ if result is None:
+ self.status = self.StatusChoices.STARTED
+ self.start_ts = start_ts
+ self.pwd = str(extractor_dir)
+ self.save()
+ return
+
end_ts = timezone.now()
+ # Get records from hook output (new JSONL format)
+ records = result.get('records', [])
+
# Clean up empty output directory if no files were created
output_files = result.get('output_files', [])
if not output_files and extractor_dir.exists():
@@ -1179,14 +1230,17 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
except (OSError, RuntimeError):
pass # Directory not empty or can't be removed, that's fine
- # Determine status from return code and JSON output
+ # Find the ArchiveResult record from hook output (if any)
+ ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
output_json = result.get('output_json') or {}
- json_status = output_json.get('status')
- if json_status == 'skipped':
- status = 'skipped'
- elif json_status == 'failed':
- status = 'failed'
+ # Determine status from records, output_json, or return code
+ if ar_records:
+ # Use status from first ArchiveResult record
+ hook_data = ar_records[0]
+ status = hook_data.get('status', 'failed')
+ elif output_json.get('status'):
+ status = output_json['status']
elif result['returncode'] == 0:
status = 'succeeded'
else:
@@ -1199,20 +1253,45 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
'skipped': self.StatusChoices.SKIPPED,
}
self.status = status_map.get(status, self.StatusChoices.FAILED)
- self.output = output_json.get('output') or result['stdout'][:1024] or result['stderr'][:1024] or None
+
+ # Set output fields from records or output_json
+ if ar_records:
+ hook_data = ar_records[0]
+ self.output_str = hook_data.get('output_str') or hook_data.get('output') or ''
+ self.output_json = hook_data.get('output_json')
+ # Set cmd from JSONL record
+ if hook_data.get('cmd'):
+ self.cmd = hook_data['cmd']
+ self._set_binary_from_cmd(hook_data['cmd'])
+ if hook_data.get('cmd_version'):
+ self.cmd_version = hook_data['cmd_version'][:128]
+ else:
+ # Fallback to legacy output_json format
+ self.output_str = output_json.get('output_str') or output_json.get('output') or result['stdout'][:1024] or result['stderr'][:1024] or ''
+ self.output_json = output_json.get('output_json') if output_json.get('output_json') else None
+ if output_json.get('cmd_version'):
+ self.cmd_version = output_json['cmd_version'][:128]
+ if output_json.get('cmd'):
+ self.cmd = output_json['cmd']
+ self._set_binary_from_cmd(output_json['cmd'])
+
self.start_ts = start_ts
self.end_ts = end_ts
self.retry_at = None
self.pwd = str(extractor_dir)
- # Save cmd and cmd_version from extractor output
- if output_json.get('cmd_version'):
- self.cmd_version = output_json['cmd_version'][:128] # Max length from model
- if output_json.get('cmd'):
- self.cmd = output_json['cmd']
+ # Populate output_files, output_size, output_mimetypes from filesystem
+ if extractor_dir.exists():
+ self._populate_output_fields(extractor_dir)
self.save()
+ # Process side-effect records (InstalledBinary, Machine config, etc.)
+ from archivebox.hooks import create_model_record
+ for record in records:
+ if record.get('type') != 'ArchiveResult':
+ create_model_record(record.copy()) # Copy to avoid mutating original
+
# Queue any discovered URLs for crawling (parser extractors write urls.jsonl)
self._queue_urls_for_crawl(extractor_dir)
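
For reference, a minimal ArchiveResult record a hook could print to stdout; run() receives it via result['records'] (all values illustrative):

```python
import json

# Hypothetical JSONL line a hook emits on stdout (one JSON object per line):
record = {
    'type': 'ArchiveResult',          # routes the record to this ArchiveResult
    'status': 'succeeded',            # mapped through status_map above
    'output_str': 'Saved 3 files',
    'cmd': ['/usr/bin/wget', '-p', 'https://example.com'],
    'cmd_version': '1.21.4',
}
print(json.dumps(record))
```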
@@ -1226,6 +1305,84 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
if self.status == self.StatusChoices.SUCCEEDED:
self.trigger_search_indexing()
+ def _populate_output_fields(self, output_dir: Path) -> None:
+ """
+ Walk output directory and populate output_files, output_size, output_mimetypes.
+ """
+ import mimetypes
+ from collections import defaultdict
+
+ exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
+
+ # Track mimetypes and sizes for aggregation
+ mime_sizes = defaultdict(int)
+ total_size = 0
+ output_files = {} # Dict keyed by relative path
+
+ for file_path in output_dir.rglob('*'):
+ # Skip non-files and infrastructure files
+ if not file_path.is_file():
+ continue
+ if file_path.name in exclude_names:
+ continue
+
+ # Get file stats
+ try:
+ stat = file_path.stat()
+ mime_type, _ = mimetypes.guess_type(str(file_path))
+ mime_type = mime_type or 'application/octet-stream'
+
+ # Track for ArchiveResult fields
+ relative_path = str(file_path.relative_to(output_dir))
+ output_files[relative_path] = {} # Empty dict, extensible for future metadata
+ mime_sizes[mime_type] += stat.st_size
+ total_size += stat.st_size
+ except (OSError, IOError):
+ continue
+
+ # Populate ArchiveResult fields
+ self.output_files = output_files
+ self.output_size = total_size
+
+ # Build output_mimetypes CSV (sorted by size descending)
+ sorted_mimes = sorted(mime_sizes.items(), key=lambda x: x[1], reverse=True)
+ self.output_mimetypes = ','.join(mime for mime, _ in sorted_mimes)
+
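
Toy replay of the mimetype aggregation over (path, size) pairs instead of a real directory (guessed mimetype names vary by Python version):

```python
import mimetypes
from collections import defaultdict

files = [('example.com/app.js', 120_000), ('example.com/index.html', 40_000)]
mime_sizes = defaultdict(int)
for path, size in files:
    mime, _ = mimetypes.guess_type(path)
    mime_sizes[mime or 'application/octet-stream'] += size
csv = ','.join(m for m, _ in sorted(mime_sizes.items(), key=lambda x: x[1], reverse=True))
print(csv)  # largest mimetype first, e.g. 'text/javascript,text/html'
```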
+ def _set_binary_from_cmd(self, cmd: list) -> None:
+ """
+ Find InstalledBinary for command and set binary FK.
+
+ Tries matching by absolute path first, then by binary name.
+ Only matches binaries on the current machine.
+ """
+ if not cmd:
+ return
+
+ from machine.models import Machine
+
+ bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
+ machine = Machine.current()
+
+ # Try matching by absolute path first
+ binary = InstalledBinary.objects.filter(
+ abspath=bin_path_or_name,
+ machine=machine
+ ).first()
+
+ if binary:
+ self.binary = binary
+ return
+
+ # Fallback: match by binary name
+ bin_name = Path(bin_path_or_name).name
+ binary = InstalledBinary.objects.filter(
+ name=bin_name,
+ machine=machine
+ ).first()
+
+ if binary:
+ self.binary = binary
+
def _update_snapshot_title(self, extractor_dir: Path):
"""
Update snapshot title from title extractor output.
@@ -1325,3 +1482,120 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def output_dir(self) -> Path:
"""Get the output directory for this extractor's results."""
return Path(self.snapshot.output_dir) / self.extractor
+
+ def is_background_hook(self) -> bool:
+ """Check if this ArchiveResult is for a background hook."""
+ extractor_dir = Path(self.pwd) if self.pwd else None
+ if not extractor_dir:
+ return False
+ pid_file = extractor_dir / 'hook.pid'
+ return pid_file.exists()
+
+ def check_background_completed(self) -> bool:
+ """
+ Check if background hook process has exited.
+
+ Returns:
+ True if completed (process exited), False if still running
+ """
+ extractor_dir = Path(self.pwd) if self.pwd else None
+ if not extractor_dir:
+ return True # No pwd = completed or failed to start
+
+ pid_file = extractor_dir / 'hook.pid'
+ if not pid_file.exists():
+ return True # No PID file = completed or failed to start
+
+ try:
+ pid = int(pid_file.read_text().strip())
+ os.kill(pid, 0) # Signal 0 = check if process exists
+ return False # Still running
+ except (OSError, ValueError):
+ return True # Process exited or invalid PID
+
+ def finalize_background_hook(self) -> None:
+ """
+ Collect final results from completed background hook.
+
+ Same logic as run() but for background hooks that already started.
+ """
+ from archivebox.hooks import create_model_record
+
+ extractor_dir = Path(self.pwd) if self.pwd else None
+ if not extractor_dir or not extractor_dir.exists():
+ self.status = self.StatusChoices.FAILED
+ self.output_str = 'Background hook output directory not found'
+ self.end_ts = timezone.now()
+ self.retry_at = None
+ self.save()
+ return
+
+ stdout_file = extractor_dir / 'stdout.log'
+ stderr_file = extractor_dir / 'stderr.log'
+
+ # Read stdout log (stderr is only size-checked during cleanup below)
+ stdout = stdout_file.read_text() if stdout_file.exists() else ''
+
+ # Parse JSONL output
+ records = []
+ for line in stdout.splitlines():
+ line = line.strip()
+ if not line or not line.startswith('{'):
+ continue
+ try:
+ data = json.loads(line)
+ if 'type' in data:
+ records.append(data)
+ except json.JSONDecodeError:
+ continue
+
+ # Find the ArchiveResult record
+ ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
+
+ if ar_records:
+ hook_data = ar_records[0]
+
+ # Apply hook's data
+ status_str = hook_data.get('status', 'failed')
+ status_map = {
+ 'succeeded': self.StatusChoices.SUCCEEDED,
+ 'failed': self.StatusChoices.FAILED,
+ 'skipped': self.StatusChoices.SKIPPED,
+ }
+ self.status = status_map.get(status_str, self.StatusChoices.FAILED)
+
+ self.output_str = hook_data.get('output_str') or hook_data.get('output') or ''
+ self.output_json = hook_data.get('output_json')
+
+ # Determine binary FK from cmd
+ if hook_data.get('cmd'):
+ self.cmd = hook_data['cmd']
+ self._set_binary_from_cmd(hook_data['cmd'])
+ if hook_data.get('cmd_version'):
+ self.cmd_version = hook_data['cmd_version'][:128]
+ else:
+ # No output = failed
+ self.status = self.StatusChoices.FAILED
+ self.output_str = 'Background hook did not output ArchiveResult'
+
+ self.end_ts = timezone.now()
+ self.retry_at = None
+
+ # Populate output fields from filesystem
+ if extractor_dir.exists():
+ self._populate_output_fields(extractor_dir)
+
+ self.save()
+
+ # Create any side-effect records
+ for record in records:
+ if record.get('type') != 'ArchiveResult':
+ create_model_record(record.copy())
+
+ # Cleanup PID files and empty logs
+ pid_file = extractor_dir / 'hook.pid'
+ pid_file.unlink(missing_ok=True)
+ if stdout_file.exists() and stdout_file.stat().st_size == 0:
+ stdout_file.unlink()
+ if stderr_file.exists() and stderr_file.stat().st_size == 0:
+ stderr_file.unlink()
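
A hedged sketch of the polling flow these three methods enable (the statemachine change below does exactly this):

```python
def poll_background(ar):
    """Sketch: how a caller drives a background-hook ArchiveResult to completion."""
    if not ar.is_background_hook():
        return
    if ar.check_background_completed():   # os.kill(pid, 0) says the process exited
        ar.finalize_background_hook()     # parse stdout.log JSONL, populate fields, clean up
```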
diff --git a/archivebox/core/statemachines.py b/archivebox/core/statemachines.py
index 610f6fe0..9f277a5c 100644
--- a/archivebox/core/statemachines.py
+++ b/archivebox/core/statemachines.py
@@ -59,11 +59,22 @@ class SnapshotMachine(StateMachine, strict_states=True):
# if no archiveresults exist yet, it's not finished
if not self.snapshot.archiveresult_set.exists():
return False
-
+
# if archiveresults exist but are still pending, it's not finished
if self.snapshot.pending_archiveresults().exists():
return False
-
+
+ # Check for background hooks that are still running
+ started_results = self.snapshot.archiveresult_set.filter(
+ status=ArchiveResult.StatusChoices.STARTED
+ )
+ for result in started_results:
+ if not result.check_background_completed():
+ return False # Still running
+
+ # Completed - finalize it
+ result.finalize_background_hook()
+
# otherwise archiveresults exist and are all finished, so it's finished
return True
@@ -184,10 +195,10 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
def is_backoff(self) -> bool:
"""Check if we should backoff and retry later."""
- # Backoff if status is still started (extractor didn't complete) and output is None
+ # Backoff if status is still started (extractor didn't complete) and output_str is empty
return (
- self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
- self.archiveresult.output is None
+ self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
+ not self.archiveresult.output_str
)
def is_finished(self) -> bool:
diff --git a/archivebox/core/templatetags/core_tags.py b/archivebox/core/templatetags/core_tags.py
index b2c126cd..33a620c0 100644
--- a/archivebox/core/templatetags/core_tags.py
+++ b/archivebox/core/templatetags/core_tags.py
@@ -80,7 +80,7 @@ def extractor_thumbnail(context, result) -> str:
return ''
# Use embed_path() for the display path (includes canonical paths)
- output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
+ output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')
# Create a mini template and render it with context
try:
@@ -109,7 +109,7 @@ def extractor_embed(context, result) -> str:
if not template_str:
return ''
- output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
+ output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')
try:
tpl = template.Template(template_str)
@@ -137,7 +137,7 @@ def extractor_fullscreen(context, result) -> str:
if not template_str:
return ''
- output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
+ output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output_str or '')
try:
tpl = template.Template(template_str)
diff --git a/archivebox/hooks.py b/archivebox/hooks.py
index 7bbbe66e..7ac15d65 100644
--- a/archivebox/hooks.py
+++ b/archivebox/hooks.py
@@ -68,6 +68,8 @@ class HookResult(TypedDict, total=False):
output_files: List[str]
duration_ms: int
hook: str
+ # New fields for JSONL parsing
+ records: List[Dict[str, Any]] # Parsed JSONL records with 'type' field
def discover_hooks(event_name: str) -> List[Path]:
@@ -268,7 +270,9 @@ def run_hook(
files_before = set(output_dir.rglob('*')) if output_dir.exists() else set()
# Detect if this is a background hook (long-running daemon)
- is_background = '__background' in script.stem
+ # New convention: .bg. suffix (e.g., on_Snapshot__21_consolelog.bg.js)
+ # Old convention: __background in stem (for backwards compatibility)
+ is_background = '.bg.' in script.name or '__background' in script.stem
# Set up output files for ALL hooks (useful for debugging)
stdout_file = output_dir / 'stdout.log'
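
The naming predicate on concrete filenames (hypothetical hook names):

```python
from pathlib import Path

def is_background(script: Path) -> bool:
    return '.bg.' in script.name or '__background' in script.stem

assert is_background(Path('on_Snapshot__21_consolelog.bg.js'))           # new convention
assert is_background(Path('on_Snapshot__21_consolelog__background.js'))  # legacy
assert not is_background(Path('on_Snapshot__07_wget.py'))
```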
@@ -322,13 +326,44 @@ def run_hook(
# Exclude the log files themselves from new_files
new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid')]
- # Parse RESULT_JSON from stdout
+ # Parse JSONL output from stdout
+ # Supports both new JSONL format (any line starting with { that has 'type')
+ # and legacy RESULT_JSON= format for backwards compatibility
output_json = None
+ records = []
+ plugin_name = script.parent.name # Plugin directory name (e.g., 'wget')
+
for line in stdout.splitlines():
- if line.startswith('RESULT_JSON='):
+ line = line.strip()
+ if not line:
+ continue
+
+ # New JSONL format: any line starting with { that has 'type' field
+ if line.startswith('{'):
try:
- output_json = json.loads(line[len('RESULT_JSON='):])
- break
+ data = json.loads(line)
+ if 'type' in data:
+ # Add plugin metadata to every record
+ data['plugin'] = plugin_name
+ data['plugin_hook'] = str(script)
+ records.append(data)
+ # For backwards compatibility, also set output_json for first ArchiveResult
+ if data.get('type') == 'ArchiveResult' and output_json is None:
+ output_json = data
+ except json.JSONDecodeError:
+ pass
+
+ # Legacy format: RESULT_JSON=...
+ elif line.startswith('RESULT_JSON='):
+ try:
+ data = json.loads(line[len('RESULT_JSON='):])
+ if output_json is None:
+ output_json = data
+ # Convert legacy format to new format
+ data['type'] = 'ArchiveResult'
+ data['plugin'] = plugin_name
+ data['plugin_hook'] = str(script)
+ records.append(data)
except json.JSONDecodeError:
pass
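
A self-contained replay of these parsing rules on sample stdout:

```python
import json

stdout = '\n'.join([
    '{"type": "ArchiveResult", "status": "succeeded", "output_str": "Saved 3 files"}',
    'RESULT_JSON={"status": "succeeded", "output": "index.html"}',  # legacy line
    'plain log noise is ignored',
])
records = []
for line in stdout.splitlines():
    line = line.strip()
    if line.startswith('{'):
        try:
            data = json.loads(line)
            if 'type' in data:
                records.append(data)
        except json.JSONDecodeError:
            pass
    elif line.startswith('RESULT_JSON='):
        data = json.loads(line[len('RESULT_JSON='):])
        data['type'] = 'ArchiveResult'
        records.append(data)
print(len(records))  # 2 - legacy lines are normalized into the same record stream
```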
@@ -348,6 +383,7 @@ def run_hook(
output_files=new_files,
duration_ms=duration_ms,
hook=str(script),
+ records=records,
)
except Exception as e:
@@ -360,6 +396,7 @@ def run_hook(
output_files=[],
duration_ms=duration_ms,
hook=str(script),
+ records=[],
)
@@ -1104,3 +1141,112 @@ def discover_plugin_templates() -> Dict[str, Dict[str, str]]:
return templates
+# =============================================================================
+# Hook Result Processing Helpers
+# =============================================================================
+
+
+def find_binary_for_cmd(cmd: List[str], machine_id: str) -> Optional[str]:
+ """
+ Find InstalledBinary for a command, trying abspath first then name.
+ Only matches binaries on the current machine.
+
+ Args:
+ cmd: Command list (e.g., ['/usr/bin/wget', '-p', 'url'])
+ machine_id: Current machine ID
+
+ Returns:
+ Binary ID as string if found, None otherwise
+ """
+ if not cmd:
+ return None
+
+ from machine.models import InstalledBinary
+
+ bin_path_or_name = cmd[0] if isinstance(cmd, list) else cmd
+
+ # Try matching by absolute path first
+ binary = InstalledBinary.objects.filter(
+ abspath=bin_path_or_name,
+ machine_id=machine_id
+ ).first()
+
+ if binary:
+ return str(binary.id)
+
+ # Fallback: match by binary name
+ bin_name = Path(bin_path_or_name).name
+ binary = InstalledBinary.objects.filter(
+ name=bin_name,
+ machine_id=machine_id
+ ).first()
+
+ return str(binary.id) if binary else None
+
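
Hedged usage (assumes an InstalledBinary row for wget already exists on this machine):

```python
from machine.models import Machine

machine = Machine.current()
binary_id = find_binary_for_cmd(['/usr/bin/wget', '-p', 'https://example.com'], str(machine.id))
# matches by abspath first, then falls back to the name 'wget'; None if neither exists
```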
+
+def create_model_record(record: Dict[str, Any]) -> Any:
+ """
+ Generic helper to create/update model instances from hook JSONL output.
+
+ Args:
+ record: Dict with 'type' field and model data
+
+ Returns:
+ Created/updated model instance, or None if type unknown
+ """
+ from machine.models import InstalledBinary, Machine
+
+ record_type = record.pop('type', None)
+ if not record_type:
+ return None
+
+ # Remove plugin metadata (not model fields)
+ record.pop('plugin', None)
+ record.pop('plugin_hook', None)
+
+ if record_type == 'InstalledBinary':
+ # InstalledBinary requires machine FK
+ machine = Machine.current()
+ record.setdefault('machine', machine)
+
+ # Required fields check
+ name = record.get('name')
+ abspath = record.get('abspath')
+ if not name or not abspath:
+ return None
+
+ obj, created = InstalledBinary.objects.update_or_create(
+ machine=machine,
+ name=name,
+ defaults={
+ 'abspath': abspath,
+ 'version': record.get('version', ''),
+ 'sha256': record.get('sha256', ''),
+ 'binprovider': record.get('binprovider', 'env'),
+ }
+ )
+ return obj
+
+ elif record_type == 'Machine':
+ # Machine config update (special _method handling)
+ method = record.pop('_method', None)
+ if method == 'update':
+ key = record.get('key')
+ value = record.get('value')
+ if key and value is not None:
+ machine = Machine.current()
+ if not machine.config:
+ machine.config = {}
+ machine.config[key] = value
+ machine.save(update_fields=['config'])
+ return machine
+ return None
+
+ # Add more types as needed (Dependency, Snapshot, etc.)
+ else:
+ # Unknown type - log warning but don't fail
+ import sys
+ print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr)
+ return None
+
+
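
Two record shapes create_model_record() accepts (field values are illustrative):

```python
# Upsert an InstalledBinary for the current machine:
create_model_record({
    'type': 'InstalledBinary',
    'name': 'wget',
    'abspath': '/usr/bin/wget',
    'version': '1.21.4',
    'binprovider': 'apt',
})

# Update one key in the current Machine's config dict:
create_model_record({
    'type': 'Machine',
    '_method': 'update',
    'key': 'CHROME_USER_AGENT',           # hypothetical config key
    'value': 'Mozilla/5.0 (ArchiveBox)',
})
```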