mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
logging and admin ui improvements
This commit is contained in:
@@ -8,6 +8,7 @@ from django.contrib import admin, messages
|
||||
from django.urls import path
|
||||
from django.http import JsonResponse
|
||||
from django.views.decorators.http import require_POST
|
||||
from django.db.models import Count, Q
|
||||
|
||||
from archivebox import DATA_DIR
|
||||
|
||||
@@ -19,13 +20,155 @@ from core.models import Snapshot
|
||||
from crawls.models import Seed, Crawl, CrawlSchedule
|
||||
|
||||
|
||||
def render_snapshots_list(snapshots_qs, limit=20):
    """Render an inline HTML table of snapshots with status, title, URL, and progress.

    Args:
        snapshots_qs: an *unsliced* Snapshot queryset to render.
        limit: maximum number of rows to display (default 20).

    Returns:
        A mark_safe()'d HTML fragment suitable for a readonly admin field.
    """
    from django.utils.html import escape

    # BUGFIX: annotate() must be applied *before* slicing -- Django raises
    # TypeError ("Cannot filter a query once a slice has been taken") if you
    # try to annotate a queryset after [:limit].
    snapshots = snapshots_qs.annotate(
        total_results=Count('archiveresult'),
        succeeded_results=Count('archiveresult', filter=Q(archiveresult__status='succeeded')),
        failed_results=Count('archiveresult', filter=Q(archiveresult__status='failed')),
    ).order_by('-created_at')[:limit]

    if not snapshots:
        return mark_safe('<div style="color: #666; font-style: italic; padding: 8px 0;">No Snapshots yet...</div>')

    # Status colors matching Django admin and progress monitor
    status_colors = {
        'queued': ('#6c757d', '#f8f9fa'),   # gray
        'started': ('#856404', '#fff3cd'),  # amber
        'sealed': ('#155724', '#d4edda'),   # green
        'failed': ('#721c24', '#f8d7da'),   # red
    }

    rows = []
    for snapshot in snapshots:
        status = snapshot.status or 'queued'
        color, bg = status_colors.get(status, ('#6c757d', '#f8f9fa'))

        # Calculate progress across this snapshot's ArchiveResults
        total = snapshot.total_results
        done = snapshot.succeeded_results + snapshot.failed_results
        progress_pct = int((done / total) * 100) if total > 0 else 0
        progress_text = f'{done}/{total}' if total > 0 else '-'

        # Truncate title and URL for display
        full_title = snapshot.title or 'Untitled'
        title = full_title[:60]
        if len(snapshot.title or '') > 60:
            title += '...'
        url_display = snapshot.url[:50]
        if len(snapshot.url) > 50:
            url_display += '...'

        # Format date
        date_str = snapshot.created_at.strftime('%Y-%m-%d %H:%M') if snapshot.created_at else '-'

        # SECURITY: titles and URLs come from crawled (untrusted) content, and
        # the whole fragment goes through mark_safe() -- escape() them to
        # prevent stored XSS in the admin.
        rows.append(f'''
            <tr style="border-bottom: 1px solid #eee;">
                <td style="padding: 6px 8px; white-space: nowrap;">
                    <span style="display: inline-block; padding: 2px 8px; border-radius: 10px;
                                 font-size: 11px; font-weight: 500; text-transform: uppercase;
                                 color: {color}; background: {bg};">{status}</span>
                </td>
                <td style="padding: 6px 8px; white-space: nowrap;">
                    <a href="/archive/{snapshot.timestamp}/" style="text-decoration: none;">
                        <img src="/archive/{snapshot.timestamp}/favicon.ico"
                             style="width: 16px; height: 16px; vertical-align: middle; margin-right: 4px;"
                             onerror="this.style.display='none'"/>
                    </a>
                </td>
                <td style="padding: 6px 8px; max-width: 300px;">
                    <a href="{snapshot.admin_change_url}" style="color: #417690; text-decoration: none; font-weight: 500;"
                       title="{escape(full_title)}">{escape(title)}</a>
                </td>
                <td style="padding: 6px 8px; max-width: 250px;">
                    <a href="{escape(snapshot.url)}" target="_blank"
                       style="color: #666; text-decoration: none; font-family: monospace; font-size: 11px;"
                       title="{escape(snapshot.url)}">{escape(url_display)}</a>
                </td>
                <td style="padding: 6px 8px; white-space: nowrap; text-align: center;">
                    <div style="display: inline-flex; align-items: center; gap: 6px;">
                        <div style="width: 60px; height: 6px; background: #eee; border-radius: 3px; overflow: hidden;">
                            <div style="width: {progress_pct}%; height: 100%;
                                        background: {'#28a745' if snapshot.failed_results == 0 else '#ffc107' if snapshot.succeeded_results > 0 else '#dc3545'};
                                        transition: width 0.3s;"></div>
                        </div>
                        <a href="/admin/core/archiveresult/?snapshot__id__exact={snapshot.id}"
                           style="font-size: 11px; color: #417690; min-width: 35px; text-decoration: none;"
                           title="View archive results">{progress_text}</a>
                    </div>
                </td>
                <td style="padding: 6px 8px; white-space: nowrap; color: #888; font-size: 11px;">
                    {date_str}
                </td>
            </tr>
        ''')

    # Note: count() issues a second query against the unsliced queryset.
    total_count = snapshots_qs.count()
    footer = ''
    if total_count > limit:
        footer = f'''
            <tr>
                <td colspan="6" style="padding: 8px; text-align: center; color: #666; font-size: 12px; background: #f8f9fa;">
                    Showing {limit} of {total_count} snapshots
                </td>
            </tr>
        '''

    return mark_safe(f'''
        <div style="border: 1px solid #ddd; border-radius: 6px; overflow: hidden; max-width: 100%;">
            <table style="width: 100%; border-collapse: collapse; font-size: 13px;">
                <thead>
                    <tr style="background: #f5f5f5; border-bottom: 2px solid #ddd;">
                        <th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Status</th>
                        <th style="padding: 8px; text-align: left; font-weight: 600; color: #333; width: 24px;"></th>
                        <th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Title</th>
                        <th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">URL</th>
                        <th style="padding: 8px; text-align: center; font-weight: 600; color: #333;">Progress</th>
                        <th style="padding: 8px; text-align: left; font-weight: 600; color: #333;">Created</th>
                    </tr>
                </thead>
                <tbody>
                    {''.join(rows)}
                    {footer}
                </tbody>
            </table>
        </div>
    ''')
|
||||
|
||||
|
||||
class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots')
|
||||
sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
|
||||
search_fields = ('id', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
|
||||
|
||||
readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
|
||||
fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)
|
||||
|
||||
fieldsets = (
|
||||
('Source', {
|
||||
'fields': ('uri', 'contents'),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Info', {
|
||||
'fields': ('label', 'notes', 'tags_str'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Settings', {
|
||||
'fields': ('extractor', 'config'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Metadata', {
|
||||
'fields': ('created_by', 'created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Crawls', {
|
||||
'fields': ('scheduled_crawls', 'crawls'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Snapshots', {
|
||||
'fields': ('snapshots',),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
)
|
||||
|
||||
list_filter = ('extractor', 'created_by')
|
||||
ordering = ['-created_at']
|
||||
@@ -51,22 +194,19 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
)) or mark_safe('<i>No Crawls yet...</i>')
|
||||
|
||||
def snapshots(self, obj):
|
||||
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
||||
(snapshot.admin_change_url, snapshot)
|
||||
for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
|
||||
)) or mark_safe('<i>No Snapshots yet...</i>')
|
||||
return render_snapshots_list(obj.snapshot_set.all())
|
||||
|
||||
def contents(self, obj):
|
||||
if obj.uri.startswith('file:///data/'):
|
||||
source_file = DATA_DIR / obj.uri.replace('file:///data/', '', 1)
|
||||
source_file = obj.get_file_path()
|
||||
if source_file:
|
||||
contents = ""
|
||||
try:
|
||||
contents = source_file.read_text().strip()[:14_000]
|
||||
except Exception as e:
|
||||
contents = f'Error reading {source_file}: {e}'
|
||||
|
||||
|
||||
return format_html('<b><code>{}</code>:</b><br/><pre>{}</pre>', source_file, contents)
|
||||
|
||||
|
||||
return format_html('See URLs here: <a href="{}">{}</a>', obj.uri, obj.uri)
|
||||
|
||||
|
||||
@@ -78,7 +218,37 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri')
|
||||
|
||||
readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_urls_editor')
|
||||
fields = ('label', 'notes', 'seed_urls_editor', 'config', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', 'created_at', 'modified_at', 'snapshots')
|
||||
|
||||
fieldsets = (
|
||||
('URLs', {
|
||||
'fields': ('seed_urls_editor',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Info', {
|
||||
'fields': ('label', 'notes'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Settings', {
|
||||
'fields': ('max_depth', 'config'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Status', {
|
||||
'fields': ('status', 'retry_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Relations', {
|
||||
'fields': ('seed', 'schedule', 'created_by'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Timestamps', {
|
||||
'fields': ('created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Snapshots', {
|
||||
'fields': ('snapshots',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
)
|
||||
|
||||
list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
|
||||
ordering = ['-created_at', '-retry_at']
|
||||
@@ -90,6 +260,16 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
def recrawl(self, request, obj):
|
||||
"""Duplicate this crawl as a new crawl with the same seed and settings."""
|
||||
from django.utils import timezone
|
||||
from django.shortcuts import redirect
|
||||
|
||||
# Validate seed has a URI (required for crawl to start)
|
||||
if not obj.seed:
|
||||
messages.error(request, 'Cannot recrawl: original crawl has no seed.')
|
||||
return redirect('admin:crawls_crawl_change', obj.id)
|
||||
|
||||
if not obj.seed.uri:
|
||||
messages.error(request, 'Cannot recrawl: seed has no URI.')
|
||||
return redirect('admin:crawls_crawl_change', obj.id)
|
||||
|
||||
new_crawl = Crawl.objects.create(
|
||||
seed=obj.seed,
|
||||
@@ -110,8 +290,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
f'It will start processing shortly.'
|
||||
)
|
||||
|
||||
# Redirect to the new crawl's change page
|
||||
from django.shortcuts import redirect
|
||||
return redirect('admin:crawls_crawl_change', new_crawl.id)
|
||||
|
||||
def get_urls(self):
|
||||
@@ -133,7 +311,8 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
except Crawl.DoesNotExist:
|
||||
return JsonResponse({'success': False, 'error': 'Crawl not found'}, status=404)
|
||||
|
||||
if not (crawl.seed and crawl.seed.uri and crawl.seed.uri.startswith('file:///data/')):
|
||||
source_file = crawl.seed.get_file_path() if crawl.seed else None
|
||||
if not source_file:
|
||||
return JsonResponse({'success': False, 'error': 'Seed is not a local file'}, status=400)
|
||||
|
||||
try:
|
||||
@@ -142,8 +321,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
except json.JSONDecodeError:
|
||||
return JsonResponse({'success': False, 'error': 'Invalid JSON'}, status=400)
|
||||
|
||||
source_file = DATA_DIR / crawl.seed.uri.replace('file:///data/', '', 1)
|
||||
|
||||
try:
|
||||
# Ensure parent directory exists
|
||||
source_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
@@ -156,10 +333,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
return obj.snapshot_set.count()
|
||||
|
||||
def snapshots(self, obj):
|
||||
return format_html_join('<br/>', '<a href="{}">{}</a>', (
|
||||
(snapshot.admin_change_url, snapshot)
|
||||
for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
|
||||
)) or mark_safe('<i>No Snapshots yet...</i>')
|
||||
return render_snapshots_list(obj.snapshot_set.all())
|
||||
|
||||
@admin.display(description='Schedule', ordering='schedule')
|
||||
def schedule_str(self, obj):
|
||||
@@ -186,13 +360,12 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
seed_uri = obj.urls
|
||||
|
||||
# Check if it's a local file we can edit
|
||||
is_file = seed_uri.startswith('file:///data/')
|
||||
source_file = obj.seed.get_file_path() if obj.seed else None
|
||||
is_file = source_file is not None
|
||||
contents = ""
|
||||
error = None
|
||||
source_file = None
|
||||
|
||||
if is_file:
|
||||
source_file = DATA_DIR / seed_uri.replace('file:///data/', '', 1)
|
||||
if is_file and source_file:
|
||||
try:
|
||||
contents = source_file.read_text().strip()
|
||||
except Exception as e:
|
||||
@@ -337,7 +510,29 @@ class CrawlScheduleAdmin(BaseModelAdmin):
|
||||
search_fields = ('id', 'created_by__username', 'label', 'notes', 'schedule_id', 'template_id', 'template__seed__uri')
|
||||
|
||||
readonly_fields = ('created_at', 'modified_at', 'crawls', 'snapshots')
|
||||
fields = ('label', 'notes', 'schedule', 'template', 'created_by', *readonly_fields)
|
||||
|
||||
fieldsets = (
|
||||
('Schedule Info', {
|
||||
'fields': ('label', 'notes'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Configuration', {
|
||||
'fields': ('schedule', 'template'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Metadata', {
|
||||
'fields': ('created_by', 'created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Crawls', {
|
||||
'fields': ('crawls',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Snapshots', {
|
||||
'fields': ('snapshots',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
)
|
||||
|
||||
list_filter = ('created_by',)
|
||||
ordering = ['-created_at']
|
||||
@@ -362,10 +557,7 @@ class CrawlScheduleAdmin(BaseModelAdmin):
|
||||
|
||||
def snapshots(self, obj):
|
||||
crawl_ids = obj.crawl_set.values_list('pk', flat=True)
|
||||
return format_html_join('<br/>', ' - <a href="{}">{}</a>', (
|
||||
(snapshot.admin_change_url, snapshot)
|
||||
for snapshot in Snapshot.objects.filter(crawl_id__in=crawl_ids).order_by('-created_at')[:20]
|
||||
)) or mark_safe('<i>No Snapshots yet...</i>')
|
||||
return render_snapshots_list(Snapshot.objects.filter(crawl_id__in=crawl_ids))
|
||||
|
||||
|
||||
def register_admin(admin_site):
|
||||
|
||||
@@ -44,9 +44,27 @@ class Seed(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthS
|
||||
def __str__(self):
|
||||
return f'[{self.id}] {self.uri[:64]}'
|
||||
|
||||
def save(self, *args, **kwargs):
    """Persist the Seed, emitting a structured 'Created Seed' log event on first insert.

    ``self._state.adding`` is read *before* calling super().save() because
    Django flips it to False once the row has been inserted.
    """
    is_new = self._state.adding
    super().save(*args, **kwargs)
    if is_new:
        # Deferred import -- presumably avoids an import cycle at module load; confirm.
        from archivebox.misc.logging_util import log_worker_event
        log_worker_event(
            worker_type='DB',
            event='Created Seed',
            indent_level=0,
            metadata={
                'id': str(self.id),
                'uri': str(self.uri)[:64],  # truncated to keep log lines short
                'extractor': self.extractor,
                'label': self.label or None,
            },
        )
|
||||
|
||||
@classmethod
|
||||
def from_file(cls, source_file: Path, label: str = '', parser: str = 'auto', tag: str = '', created_by=None, config=None):
|
||||
source_path = str(source_file.resolve()).replace(str(CONSTANTS.DATA_DIR), '/data')
|
||||
# Use absolute path for file:// URLs so extractors can find the files
|
||||
source_path = str(source_file.resolve())
|
||||
seed, _ = cls.objects.get_or_create(
|
||||
label=label or source_file.name, uri=f'file://{source_path}',
|
||||
created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(),
|
||||
@@ -62,6 +80,25 @@ class Seed(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthS
|
||||
def api_url(self) -> str:
|
||||
return reverse_lazy('api-1:get_seed', args=[self.id])
|
||||
|
||||
def get_file_path(self) -> Path | None:
|
||||
"""
|
||||
Get the filesystem path for file:// URIs.
|
||||
Handles both old format (file:///data/...) and new format (file:///absolute/path).
|
||||
Returns None if URI is not a file:// URI.
|
||||
"""
|
||||
if not self.uri.startswith('file://'):
|
||||
return None
|
||||
|
||||
# Remove file:// prefix
|
||||
path_str = self.uri.replace('file://', '', 1)
|
||||
|
||||
# Handle old format: file:///data/... -> DATA_DIR/...
|
||||
if path_str.startswith('/data/'):
|
||||
return CONSTANTS.DATA_DIR / path_str.replace('/data/', '', 1)
|
||||
|
||||
# Handle new format: file:///absolute/path
|
||||
return Path(path_str)
|
||||
|
||||
@property
|
||||
def snapshot_set(self) -> QuerySet['Snapshot']:
|
||||
from core.models import Snapshot
|
||||
@@ -136,6 +173,23 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
def __str__(self):
|
||||
return f'[{self.id}] {self.seed.uri[:64] if self.seed else ""}'
|
||||
|
||||
def save(self, *args, **kwargs):
    """Persist the Crawl, logging a 'Created Crawl' worker event on first insert."""
    creating = self._state.adding  # must be captured before super().save() flips it
    super().save(*args, **kwargs)
    if not creating:
        return

    # Deferred import -- presumably avoids an import cycle at module load; confirm.
    from archivebox.misc.logging_util import log_worker_event
    log_worker_event(
        worker_type='DB',
        event='Created Crawl',
        indent_level=1,
        metadata={
            'id': str(self.id),
            'seed_uri': str(self.seed.uri)[:64] if self.seed else None,
            'max_depth': self.max_depth,
            'status': self.status,
        },
    )
|
||||
|
||||
@classmethod
|
||||
def from_seed(cls, seed: Seed, max_depth: int = 0, persona: str = 'Default', tags_str: str = '', config=None, created_by=None):
|
||||
crawl, _ = cls.objects.get_or_create(
|
||||
|
||||
@@ -36,13 +36,19 @@ class CrawlMachine(StateMachine, strict_states=True):
|
||||
super().__init__(crawl, *args, **kwargs)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f'[grey53]Crawl\\[{self.crawl.id}] 🏃♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.crawl.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
|
||||
|
||||
return f'Crawl[{self.crawl.id}]'
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.__repr__()
|
||||
|
||||
def can_start(self) -> bool:
|
||||
return bool(self.crawl.seed and self.crawl.seed.uri)
|
||||
if not self.crawl.seed:
|
||||
print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no seed[/red]')
|
||||
return False
|
||||
if not self.crawl.seed.uri:
|
||||
print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: seed has no URI[/red]')
|
||||
return False
|
||||
return True
|
||||
|
||||
def is_finished(self) -> bool:
|
||||
from core.models import Snapshot, ArchiveResult
|
||||
@@ -73,25 +79,121 @@ class CrawlMachine(StateMachine, strict_states=True):
|
||||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
print(f'{self}.on_started(): [blue]↳ STARTED[/blue] crawl.run()')
|
||||
# Suppressed: state transition logs
|
||||
# lock the crawl object while we create snapshots
|
||||
self.crawl.update_for_workers(
|
||||
retry_at=timezone.now() + timedelta(seconds=5),
|
||||
status=Crawl.StatusChoices.QUEUED,
|
||||
)
|
||||
|
||||
# Run the crawl - creates root snapshot and processes queued URLs
|
||||
self.crawl.run()
|
||||
try:
|
||||
# Run on_Crawl hooks to validate/install dependencies
|
||||
self._run_crawl_hooks()
|
||||
|
||||
# only update status to STARTED once snapshots are created
|
||||
self.crawl.update_for_workers(
|
||||
retry_at=timezone.now() + timedelta(seconds=5),
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
# Run the crawl - creates root snapshot and processes queued URLs
|
||||
self.crawl.run()
|
||||
|
||||
# only update status to STARTED once snapshots are created
|
||||
self.crawl.update_for_workers(
|
||||
retry_at=timezone.now() + timedelta(seconds=5),
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
)
|
||||
except Exception as e:
|
||||
print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]')
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
# Re-raise so the worker knows it failed
|
||||
raise
|
||||
|
||||
def _run_crawl_hooks(self):
|
||||
"""Run on_Crawl hooks to validate/install dependencies."""
|
||||
from pathlib import Path
|
||||
from archivebox.hooks import run_hooks, discover_hooks
|
||||
from archivebox.config import CONSTANTS
|
||||
|
||||
# Discover and run all on_Crawl hooks
|
||||
hooks = discover_hooks('Crawl')
|
||||
if not hooks:
|
||||
return
|
||||
|
||||
# Create a temporary output directory for hook results
|
||||
output_dir = Path(CONSTANTS.DATA_DIR) / 'tmp' / f'crawl_{self.crawl.id}'
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Run all on_Crawl hooks
|
||||
results = run_hooks(
|
||||
event_name='Crawl',
|
||||
output_dir=output_dir,
|
||||
timeout=60,
|
||||
config_objects=[self.crawl, self.crawl.seed] if self.crawl.seed else [self.crawl],
|
||||
crawl_id=str(self.crawl.id),
|
||||
seed_uri=self.crawl.seed.uri if self.crawl.seed else '',
|
||||
)
|
||||
|
||||
@sealed.enter
|
||||
# Process hook results - parse JSONL output and create DB objects
|
||||
self._process_hook_results(results)
|
||||
|
||||
def _process_hook_results(self, results: list):
    """Process JSONL output from on_Crawl hooks, recording binaries and machine config.

    Each entry in ``results`` is a dict with at least ``returncode`` and
    ``stdout`` keys (one hook run's output). Each stdout line is parsed as a
    JSON object whose ``type`` field selects the handling:

    - ``InstalledBinary``: upsert an InstalledBinary row for the current Machine.
    - ``Machine``: apply ``config/<key>`` updates to the current Machine's config.
    - ``Dependency``: currently only printed; installation is handled elsewhere.

    Failed hook runs (non-zero returncode) and non-JSON lines are skipped silently.
    """
    import json
    from machine.models import Machine, InstalledBinary

    machine = Machine.current()

    for result in results:
        if result['returncode'] != 0:
            # Hook failed - might indicate missing dependency
            continue

        # Parse JSONL output
        for line in result['stdout'].strip().split('\n'):
            if not line.strip():
                continue

            try:
                obj = json.loads(line)
                obj_type = obj.get('type')

                if obj_type == 'InstalledBinary':
                    # Create or update InstalledBinary record
                    # Skip if essential fields are missing
                    if not obj.get('name') or not obj.get('abspath') or not obj.get('version'):
                        continue

                    InstalledBinary.objects.update_or_create(
                        machine=machine,
                        name=obj['name'],
                        defaults={
                            'abspath': obj['abspath'],
                            'version': obj['version'],
                            'sha256': obj.get('sha256') or '',
                            'binprovider': obj.get('binprovider') or 'env',
                        }
                    )

                elif obj_type == 'Machine':
                    # Update Machine config
                    method = obj.get('_method', 'update')
                    if method == 'update':
                        key = obj.get('key', '')
                        value = obj.get('value')
                        if key.startswith('config/'):
                            config_key = key[7:]  # Remove 'config/' prefix (len('config/') == 7)
                            machine.config[config_key] = value
                            # NOTE(review): saves once per config line; fine for small
                            # hook outputs, batching would reduce writes if needed.
                            machine.save(update_fields=['config'])

                elif obj_type == 'Dependency':
                    # Dependency request - could trigger installation
                    # For now just log it (installation hooks would be separate)
                    print(f'[yellow]Dependency requested: {obj.get("bin_name")}[/yellow]')

            except json.JSONDecodeError:
                # Not JSON, skip
                continue
|
||||
|
||||
@sealed.enter
|
||||
def enter_sealed(self):
|
||||
print(f'{self}.on_sealed(): [blue]↳ SEALED[/blue] crawl.retry_at=None')
|
||||
# Suppressed: state transition logs
|
||||
self.crawl.update_for_workers(
|
||||
retry_at=None,
|
||||
status=Crawl.StatusChoices.SEALED,
|
||||
|
||||
Reference in New Issue
Block a user