mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 23:37:58 +10:00
logging and admin ui improvements
This commit is contained in:
@@ -19,6 +19,150 @@ from archivebox.hooks import get_extractor_icon
|
||||
from core.models import ArchiveResult, Snapshot
|
||||
|
||||
|
||||
# (text colour, background colour) for the status badge, keyed by ArchiveResult.status.
_ARCHIVERESULT_STATUS_COLORS = {
    'succeeded': ('#166534', '#dcfce7'),  # green
    'failed': ('#991b1b', '#fee2e2'),     # red
    'queued': ('#6b7280', '#f3f4f6'),     # gray
    'started': ('#92400e', '#fef3c7'),    # amber
}
# Badge colours used for any status not listed above.
_ARCHIVERESULT_DEFAULT_COLORS = ('#6b7280', '#f3f4f6')
# Number of output characters shown in the table cell before truncating with '...'.
_ARCHIVERESULT_OUTPUT_PREVIEW_CHARS = 60


def _render_archiveresult_row(idx, result):
    """Return the HTML for one archive result: a summary <tr> plus a collapsible details <tr>.

    Every value taken from the database row is HTML-escaped before interpolation,
    because the caller marks the assembled markup as safe.
    """
    from html import escape  # stdlib; escapes & < > " ' so values are safe in text and attributes

    status = result.status or 'queued'
    color, bg = _ARCHIVERESULT_STATUS_COLORS.get(status, _ARCHIVERESULT_DEFAULT_COLORS)

    # NOTE(review): the icon is inserted unescaped, as before, on the assumption that
    # get_extractor_icon() returns trusted markup -- confirm.
    icon = get_extractor_icon(result.extractor)

    end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-'

    # Truncate the raw text first, then escape, so an HTML entity is never cut in half.
    full_output = str(result.output or '-')
    output_display = full_output[:_ARCHIVERESULT_OUTPUT_PREVIEW_CHARS]
    if len(full_output) > _ARCHIVERESULT_OUTPUT_PREVIEW_CHARS:
        output_display += '...'

    # Full command line, shown in the details panel. str() each part so a list
    # holding non-string items cannot make join() raise.
    if isinstance(result.cmd, list):
        cmd_str = ' '.join(str(part) for part in result.cmd)
    else:
        cmd_str = str(result.cmd or '-')

    # Link to the produced file for successful results, otherwise to the snapshot folder.
    if result.output and result.status == 'succeeded':
        output_link = f'/archive/{result.snapshot.timestamp}/{result.output}'
    else:
        output_link = f'/archive/{result.snapshot.timestamp}/'

    version = result.cmd_version if result.cmd_version else '-'

    # Unique DOM id tying the clickable preview to this row's <details> element.
    short_id = str(result.id)[:8]
    row_id = f'output_{idx}_{short_id}'

    edit_url = reverse('admin:core_archiveresult_change', args=[result.id])

    status_html = escape(str(status))
    extractor_html = escape(str(result.extractor))
    output_display_html = escape(output_display)
    full_output_html = escape(full_output)
    cmd_html = escape(cmd_str)
    version_html = escape(str(version))
    pwd_html = escape(str(result.pwd or '-'))
    short_id_html = escape(short_id)
    output_link_html = escape(output_link)
    edit_url_html = escape(str(edit_url))

    return f'''
        <tr style="border-bottom: 1px solid #f1f5f9; transition: background 0.15s;" onmouseover="this.style.background='#f8fafc'" onmouseout="this.style.background='transparent'">
            <td style="padding: 10px 12px; white-space: nowrap;">
                <span style="display: inline-block; padding: 3px 10px; border-radius: 12px;
                             font-size: 11px; font-weight: 600; text-transform: uppercase;
                             color: {color}; background: {bg};">{status_html}</span>
            </td>
            <td style="padding: 10px 12px; white-space: nowrap; font-size: 20px;" title="{extractor_html}">
                {icon}
            </td>
            <td style="padding: 10px 12px; font-weight: 500; color: #334155;">
                {extractor_html}
            </td>
            <td style="padding: 10px 12px; max-width: 280px;">
                <span onclick="document.getElementById('{row_id}').open = !document.getElementById('{row_id}').open"
                      style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 12px; cursor: pointer;"
                      title="Click to expand full output">
                    {output_display_html}
                </span>
            </td>
            <td style="padding: 10px 12px; white-space: nowrap; color: #64748b; font-size: 12px;">
                {end_time}
            </td>
            <td style="padding: 10px 12px; white-space: nowrap; font-family: ui-monospace, monospace; font-size: 11px; color: #64748b;">
                {version_html}
            </td>
            <td style="padding: 10px 8px; white-space: nowrap;">
                <div style="display: flex; gap: 4px;">
                    <a href="{output_link_html}" target="_blank"
                       style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;"
                       title="View output">📄</a>
                    <a href="{edit_url_html}"
                       style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;"
                       title="Edit">✏️</a>
                </div>
            </td>
        </tr>
        <tr style="border-bottom: 1px solid #e2e8f0;">
            <td colspan="7" style="padding: 0 12px 10px 12px;">
                <details id="{row_id}" style="margin: 0;">
                    <summary style="cursor: pointer; font-size: 11px; color: #94a3b8; user-select: none;">
                        Details &amp; Output
                    </summary>
                    <div style="margin-top: 8px; padding: 10px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 6px; max-height: 200px; overflow: auto;">
                        <div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
                            <span style="margin-right: 16px;"><b>ID:</b> <code>{short_id_html}...</code></span>
                            <span style="margin-right: 16px;"><b>Version:</b> <code>{version_html}</code></span>
                            <span style="margin-right: 16px;"><b>PWD:</b> <code>{pwd_html}</code></span>
                        </div>
                        <div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
                            <b>Output:</b>
                        </div>
                        <pre style="margin: 0; padding: 8px; background: #1e293b; border-radius: 4px; color: #e2e8f0; font-size: 12px; white-space: pre-wrap; word-break: break-all; max-height: 120px; overflow: auto;">{full_output_html}</pre>
                        <div style="font-size: 11px; color: #64748b; margin-top: 8px;">
                            <b>Command:</b>
                        </div>
                        <pre style="margin: 0; padding: 8px; background: #1e293b; border-radius: 4px; color: #e2e8f0; font-size: 11px; white-space: pre-wrap; word-break: break-all;">{cmd_html}</pre>
                    </div>
                </details>
            </td>
        </tr>
    '''


def render_archiveresults_list(archiveresults_qs, limit=50):
    """Render an inline HTML table of archive results with status, extractor, output and actions.

    Args:
        archiveresults_qs: queryset of ArchiveResult rows; it is ordered by
            ``-end_ts`` and sliced to ``limit`` here.
        limit: maximum number of results to render. When the queryset holds
            more, a footer row with a "View all" link is appended.

    Returns:
        A ``mark_safe`` string. All per-result values are HTML-escaped before
        being interpolated, so the markup is safe to emit verbatim.
    """
    from html import escape  # stdlib; see _render_archiveresult_row

    results = list(archiveresults_qs.order_by('-end_ts').select_related('snapshot')[:limit])

    if not results:
        return mark_safe('<div style="color: #64748b; font-style: italic; padding: 16px 0;">No Archive Results yet...</div>')

    rows = [_render_archiveresult_row(idx, result) for idx, result in enumerate(results)]

    footer = ''
    # Only a full page can have been truncated, so the COUNT query is skipped otherwise.
    if len(results) >= limit:
        total_count = archiveresults_qs.count()
        if total_count > limit:
            # Reverse the changelist URL instead of hard-coding the admin mount point,
            # matching how the per-row edit link is built.
            changelist_url = reverse('admin:core_archiveresult_changelist')
            view_all_url = escape(f'{changelist_url}?snapshot__id__exact={results[0].snapshot_id}')
            footer = f'''
                <tr>
                    <td colspan="7" style="padding: 12px; text-align: center; color: #64748b; font-size: 13px; background: #f8fafc;">
                        Showing {limit} of {total_count} results
                        <a href="{view_all_url}"
                           style="color: #2563eb;">View all →</a>
                    </td>
                </tr>
            '''

    return mark_safe(f'''
        <div style="border: 1px solid #e2e8f0; border-radius: 8px; overflow: hidden; background: #fff; width: 100%;">
            <table style="width: 100%; border-collapse: collapse; font-size: 14px;">
                <thead>
                    <tr style="background: #f8fafc; border-bottom: 2px solid #e2e8f0;">
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Status</th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; width: 32px;"></th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Extractor</th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Output</th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Completed</th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Version</th>
                        <th style="padding: 10px 8px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Actions</th>
                    </tr>
                </thead>
                <tbody>
                    {''.join(rows)}
                    {footer}
                </tbody>
            </table>
        </div>
    ''')
|
||||
|
||||
|
||||
|
||||
class ArchiveResultInline(admin.TabularInline):
|
||||
name = 'Archive Results Log'
|
||||
@@ -97,18 +241,44 @@ class ArchiveResultAdmin(BaseModelAdmin):
|
||||
sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status')
|
||||
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon')
|
||||
search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
|
||||
fields = ('snapshot', 'extractor', 'status', 'retry_at', 'start_ts', 'end_ts', 'created_by', 'pwd', 'cmd_version', 'cmd', 'output', *readonly_fields)
|
||||
autocomplete_fields = ['snapshot']
|
||||
|
||||
fieldsets = (
|
||||
('Snapshot', {
|
||||
'fields': ('snapshot', 'snapshot_info', 'tags_str'),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Extractor', {
|
||||
'fields': ('extractor', 'extractor_with_icon', 'status', 'retry_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Timing', {
|
||||
'fields': ('start_ts', 'end_ts', 'created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Command', {
|
||||
'fields': ('cmd', 'cmd_str', 'cmd_version', 'pwd'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Output', {
|
||||
'fields': ('output', 'output_summary'),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Metadata', {
|
||||
'fields': ('created_by',),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
)
|
||||
|
||||
list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
|
||||
ordering = ['-start_ts']
|
||||
list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE
|
||||
|
||||
|
||||
paginator = AccelleratedPaginator
|
||||
save_on_top = True
|
||||
|
||||
|
||||
actions = ['delete_selected']
|
||||
|
||||
|
||||
class Meta:
|
||||
verbose_name = 'Archive Result'
|
||||
verbose_name_plural = 'Archive Results'
|
||||
|
||||
@@ -25,7 +25,7 @@ from archivebox.workers.tasks import bg_archive_snapshots, bg_add
|
||||
|
||||
from core.models import Tag
|
||||
from core.admin_tags import TagInline
|
||||
from core.admin_archiveresults import ArchiveResultInline
|
||||
from core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
|
||||
|
||||
|
||||
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
|
||||
@@ -54,13 +54,48 @@ class SnapshotActionForm(ActionForm):
|
||||
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
list_display = ('created_at', 'title_str', 'status', 'files', 'size', 'url_str')
|
||||
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
|
||||
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir')
|
||||
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list')
|
||||
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
|
||||
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
|
||||
fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', *readonly_fields)
|
||||
|
||||
fieldsets = (
|
||||
('URL', {
|
||||
'fields': ('url', 'title'),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Status', {
|
||||
'fields': ('status', 'retry_at', 'status_info'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Timestamps', {
|
||||
'fields': ('bookmarked_at', 'created_at', 'modified_at', 'downloaded_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Relations', {
|
||||
'fields': ('crawl', 'created_by', 'tags_str'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Config', {
|
||||
'fields': ('config',),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Files', {
|
||||
'fields': ('output_dir',),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Actions', {
|
||||
'fields': ('admin_actions',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Archive Results', {
|
||||
'fields': ('archiveresults_list',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
)
|
||||
|
||||
ordering = ['-created_at']
|
||||
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
|
||||
inlines = [TagInline, ArchiveResultInline]
|
||||
inlines = [TagInline] # Removed ArchiveResultInline, using custom renderer instead
|
||||
list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000)
|
||||
|
||||
action_form = SnapshotActionForm
|
||||
@@ -155,6 +190,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
obj.extension or '-',
|
||||
)
|
||||
|
||||
@admin.display(description='Archive Results')
def archiveresults_list(self, obj):
    """Return the rendered HTML table of every archive result belonging to ``obj``."""
    snapshot_results = obj.archiveresult_set.all()
    return render_archiveresults_list(snapshot_results)
|
||||
|
||||
@admin.display(
|
||||
description='Title',
|
||||
ordering='title',
|
||||
|
||||
@@ -51,11 +51,25 @@ class TagAdmin(BaseModelAdmin):
|
||||
sort_fields = ('name', 'slug', 'id', 'created_by', 'created_at')
|
||||
readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots')
|
||||
search_fields = ('id', 'name', 'slug')
|
||||
fields = ('name', 'created_by', *readonly_fields)
|
||||
actions = ['delete_selected', 'merge_tags']
|
||||
ordering = ['-created_at']
|
||||
# inlines = [TaggedItemInline]
|
||||
|
||||
fieldsets = (
|
||||
('Tag Info', {
|
||||
'fields': ('name', 'slug'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Metadata', {
|
||||
'fields': ('id', 'created_by', 'created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Snapshots', {
|
||||
'fields': ('snapshots',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
)
|
||||
|
||||
paginator = AccelleratedPaginator
|
||||
|
||||
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
__package__ = 'archivebox.core'
|
||||
|
||||
import sys
|
||||
|
||||
from django.apps import AppConfig
|
||||
|
||||
|
||||
@@ -12,41 +10,3 @@ class CoreConfig(AppConfig):
|
||||
"""Register the archivebox.core.admin_site as the main django admin site"""
|
||||
from core.admin_site import register_admin_site
|
||||
register_admin_site()
|
||||
|
||||
# Auto-start the orchestrator when running the web server
|
||||
self._maybe_start_orchestrator()
|
||||
|
||||
def _maybe_start_orchestrator(self):
|
||||
"""Start the orchestrator if we're running a web server."""
|
||||
import os
|
||||
|
||||
# Don't start orchestrator during migrations, shell, tests, etc.
|
||||
# Only start when running: runserver, daphne, gunicorn, uwsgi
|
||||
if not self._is_web_server():
|
||||
return
|
||||
|
||||
# Don't start if RUN_ORCHESTRATOR env var is explicitly set to false
|
||||
if os.environ.get('RUN_ORCHESTRATOR', '').lower() in ('false', '0', 'no'):
|
||||
return
|
||||
|
||||
# Don't start in autoreload child process (avoid double-start)
|
||||
if os.environ.get('RUN_MAIN') != 'true' and 'runserver' in sys.argv:
|
||||
return
|
||||
|
||||
try:
|
||||
from workers.orchestrator import Orchestrator
|
||||
|
||||
if not Orchestrator.is_running():
|
||||
# Start orchestrator as daemon (won't exit on idle when started by server)
|
||||
orchestrator = Orchestrator(exit_on_idle=False)
|
||||
orchestrator.start()
|
||||
except Exception as e:
|
||||
# Don't crash the server if orchestrator fails to start
|
||||
import logging
|
||||
logging.getLogger('archivebox').warning(f'Failed to auto-start orchestrator: {e}')
|
||||
|
||||
def _is_web_server(self) -> bool:
|
||||
"""Check if we're running a web server command."""
|
||||
# Check for common web server indicators
|
||||
server_commands = ('runserver', 'daphne', 'gunicorn', 'uwsgi', 'server')
|
||||
return any(cmd in ' '.join(sys.argv).lower() for cmd in server_commands)
|
||||
|
||||
@@ -0,0 +1,22 @@
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Replace the unique constraint on Snapshot.url with a (url, crawl) unique constraint."""

    dependencies = [('core', '0024_snapshot_crawl')]

    operations = [
        # Step 1: drop uniqueness on `url` while keeping its index.
        migrations.AlterField(
            model_name='snapshot',
            name='url',
            field=models.URLField(db_index=True, unique=False),
        ),
        # Step 2: enforce uniqueness on the (url, crawl) pair instead.
        # NOTE(review): if `crawl` can be NULL, databases that treat NULLs as distinct
        # will still allow repeated URLs among crawl-less snapshots -- confirm intended.
        migrations.AddConstraint(
            model_name='snapshot',
            constraint=models.UniqueConstraint(
                fields=['url', 'crawl'],
                name='unique_url_per_crawl',
            ),
        ),
    ]
|
||||
@@ -60,7 +60,8 @@ class Tag(ModelWithSerializers):
|
||||
return self.name
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
if self._state.adding:
|
||||
is_new = self._state.adding
|
||||
if is_new:
|
||||
self.slug = slugify(self.name)
|
||||
existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True))
|
||||
i = None
|
||||
@@ -72,6 +73,19 @@ class Tag(ModelWithSerializers):
|
||||
i = (i or 0) + 1
|
||||
super().save(*args, **kwargs)
|
||||
|
||||
if is_new:
|
||||
from archivebox.misc.logging_util import log_worker_event
|
||||
log_worker_event(
|
||||
worker_type='DB',
|
||||
event='Created Tag',
|
||||
indent_level=0,
|
||||
metadata={
|
||||
'id': self.id,
|
||||
'name': self.name,
|
||||
'slug': self.slug,
|
||||
},
|
||||
)
|
||||
|
||||
@property
|
||||
def api_url(self) -> str:
|
||||
return reverse_lazy('api-1:get_tag', args=[self.id])
|
||||
@@ -241,12 +255,13 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
|
||||
if tag.strip()
|
||||
))
|
||||
|
||||
try:
|
||||
snapshot = self.get(url=url)
|
||||
# Get most recent snapshot with this URL (URLs can exist in multiple crawls)
|
||||
snapshot = self.filter(url=url).order_by('-created_at').first()
|
||||
if snapshot:
|
||||
if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
|
||||
snapshot.title = title
|
||||
snapshot.save(update_fields=['title', 'modified_at'])
|
||||
except self.model.DoesNotExist:
|
||||
else:
|
||||
if timestamp:
|
||||
while self.filter(timestamp=timestamp).exists():
|
||||
timestamp = str(float(timestamp) + 1.0)
|
||||
@@ -284,7 +299,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
|
||||
url = models.URLField(unique=True, db_index=True)
|
||||
url = models.URLField(unique=False, db_index=True) # URLs can appear in multiple crawls
|
||||
timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
|
||||
bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True) # type: ignore
|
||||
@@ -313,11 +328,16 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
class Meta(TypedModelMeta):
|
||||
verbose_name = "Snapshot"
|
||||
verbose_name_plural = "Snapshots"
|
||||
constraints = [
|
||||
# Allow same URL in different crawls, but not duplicates within same crawl
|
||||
models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
|
||||
]
|
||||
|
||||
def __str__(self):
|
||||
return f'[{self.id}] {self.url[:64]}'
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
is_new = self._state.adding
|
||||
if not self.bookmarked_at:
|
||||
self.bookmarked_at = self.created_at or timezone.now()
|
||||
if not self.timestamp:
|
||||
@@ -327,6 +347,21 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
self.crawl.urls += f'\n{self.url}'
|
||||
self.crawl.save()
|
||||
|
||||
if is_new:
|
||||
from archivebox.misc.logging_util import log_worker_event
|
||||
log_worker_event(
|
||||
worker_type='DB',
|
||||
event='Created Snapshot',
|
||||
indent_level=2,
|
||||
url=self.url,
|
||||
metadata={
|
||||
'id': str(self.id),
|
||||
'crawl_id': str(self.crawl_id) if self.crawl_id else None,
|
||||
'depth': self.depth,
|
||||
'status': self.status,
|
||||
},
|
||||
)
|
||||
|
||||
def output_dir_parent(self) -> str:
|
||||
return 'archive'
|
||||
|
||||
@@ -807,6 +842,24 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
def __str__(self):
|
||||
return f'[{self.id}] {self.snapshot.url[:64]} -> {self.extractor}'
|
||||
|
||||
def save(self, *args, **kwargs):
    """Save the ArchiveResult and log a 'Created ArchiveResult' event on its first save.

    All positional and keyword arguments are forwarded to the parent ``save``.
    """
    # Captured before the save call so the check below reflects the pre-save state.
    creating = self._state.adding
    super().save(*args, **kwargs)
    if not creating:
        return

    from archivebox.misc.logging_util import log_worker_event

    event_metadata = {
        'id': str(self.id),
        'snapshot_id': str(self.snapshot_id),
        'snapshot_url': str(self.snapshot.url)[:64],
        'status': self.status,
    }
    log_worker_event(
        worker_type='DB',
        event='Created ArchiveResult',
        indent_level=3,
        extractor=self.extractor,
        metadata=event_metadata,
    )
|
||||
|
||||
@cached_property
|
||||
def snapshot_dir(self):
|
||||
return Path(self.snapshot.output_dir)
|
||||
@@ -879,7 +932,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
from django.utils import timezone
|
||||
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook
|
||||
|
||||
extractor_dir = Path(self.snapshot.output_dir) / self.extractor
|
||||
config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
|
||||
|
||||
# Find hook for this extractor
|
||||
@@ -899,6 +951,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
self.save()
|
||||
return
|
||||
|
||||
# Use plugin directory name instead of extractor name (removes numeric prefix)
|
||||
plugin_name = hook.parent.name
|
||||
extractor_dir = Path(self.snapshot.output_dir) / plugin_name
|
||||
|
||||
# Run the hook
|
||||
start_ts = timezone.now()
|
||||
result = run_hook(
|
||||
|
||||
@@ -45,15 +45,14 @@ class SnapshotMachine(StateMachine, strict_states=True):
|
||||
super().__init__(snapshot, *args, **kwargs)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f'[grey53]Snapshot\\[{self.snapshot.id}] 🏃♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.snapshot.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
|
||||
|
||||
return f'Snapshot[{self.snapshot.id}]'
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.__repr__()
|
||||
|
||||
|
||||
def can_start(self) -> bool:
|
||||
can_start = bool(self.snapshot.url)
|
||||
if not can_start:
|
||||
print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue] cant start yet +{timezone.now() - self.snapshot.retry_at}s')
|
||||
# Suppressed: queue waiting logs
|
||||
return can_start
|
||||
|
||||
def is_finished(self) -> bool:
|
||||
@@ -73,15 +72,15 @@ class SnapshotMachine(StateMachine, strict_states=True):
|
||||
|
||||
@queued.enter
|
||||
def enter_queued(self):
|
||||
print(f'{self}.on_queued() ↳ snapshot.retry_at = now()')
|
||||
# Suppressed: state transition logs
|
||||
self.snapshot.update_for_workers(
|
||||
retry_at=timezone.now(),
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
)
|
||||
|
||||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
print(f'{self}.on_started() ↳ snapshot.run()')
|
||||
# Suppressed: state transition logs
|
||||
# lock the snapshot while we create the pending archiveresults
|
||||
self.snapshot.update_for_workers(
|
||||
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
|
||||
@@ -95,10 +94,10 @@ class SnapshotMachine(StateMachine, strict_states=True):
|
||||
retry_at=timezone.now() + timedelta(seconds=5), # wait 5s before checking it again
|
||||
status=Snapshot.StatusChoices.STARTED,
|
||||
)
|
||||
|
||||
|
||||
@sealed.enter
|
||||
def enter_sealed(self):
|
||||
print(f'{self}.on_sealed() ↳ snapshot.retry_at=None')
|
||||
# Suppressed: state transition logs
|
||||
self.snapshot.update_for_workers(
|
||||
retry_at=None,
|
||||
status=Snapshot.StatusChoices.SEALED,
|
||||
@@ -161,15 +160,14 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||
super().__init__(archiveresult, *args, **kwargs)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f'[grey53]ArchiveResult\\[{self.archiveresult.id}] 🏃♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.archiveresult.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
|
||||
|
||||
return f'ArchiveResult[{self.archiveresult.id}]'
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.__repr__()
|
||||
|
||||
|
||||
def can_start(self) -> bool:
|
||||
can_start = bool(self.archiveresult.snapshot.url)
|
||||
if not can_start:
|
||||
print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue]: cant start yet +{timezone.now() - self.archiveresult.retry_at}s')
|
||||
# Suppressed: queue waiting logs
|
||||
return can_start
|
||||
|
||||
def is_succeeded(self) -> bool:
|
||||
@@ -202,41 +200,34 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||
|
||||
@queued.enter
|
||||
def enter_queued(self):
|
||||
print(f'{self}.on_queued() ↳ archiveresult.retry_at = now()')
|
||||
# Suppressed: state transition logs
|
||||
self.archiveresult.update_for_workers(
|
||||
retry_at=timezone.now(),
|
||||
status=ArchiveResult.StatusChoices.QUEUED,
|
||||
start_ts=None,
|
||||
) # bump the snapshot's retry_at so they pickup any new changes
|
||||
|
||||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
print(f'{self}.on_started() ↳ archiveresult.start_ts + run_extractor()')
|
||||
|
||||
# Suppressed: state transition logs
|
||||
# Lock the object and mark start time
|
||||
self.archiveresult.update_for_workers(
|
||||
retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for extractor
|
||||
status=ArchiveResult.StatusChoices.STARTED,
|
||||
start_ts=timezone.now(),
|
||||
)
|
||||
|
||||
|
||||
# Run the extractor - this updates status, output, timestamps, etc.
|
||||
self.archiveresult.run()
|
||||
|
||||
|
||||
# Save the updated result
|
||||
self.archiveresult.save()
|
||||
|
||||
# Log the result
|
||||
if self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
|
||||
print(f'{self} ✅ extractor succeeded: {self.archiveresult.output[:50] if self.archiveresult.output else ""}...')
|
||||
elif self.archiveresult.status == ArchiveResult.StatusChoices.FAILED:
|
||||
print(f'{self} ❌ extractor failed: {self.archiveresult.output[:100] if self.archiveresult.output else ""}...')
|
||||
elif self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED:
|
||||
print(f'{self} ⏭️ extractor skipped: {self.archiveresult.output[:50] if self.archiveresult.output else ""}')
|
||||
|
||||
# Suppressed: extractor result logs (already logged by worker)
|
||||
|
||||
@backoff.enter
|
||||
def enter_backoff(self):
|
||||
print(f'{self}.on_backoff() ↳ archiveresult.retries += 1, archiveresult.bump_retry_at(+60s), archiveresult.end_ts = None')
|
||||
# Suppressed: state transition logs
|
||||
self.archiveresult.update_for_workers(
|
||||
retry_at=timezone.now() + timedelta(seconds=60),
|
||||
status=ArchiveResult.StatusChoices.BACKOFF,
|
||||
@@ -244,10 +235,10 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||
# retries=F('retries') + 1, # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
|
||||
)
|
||||
self.archiveresult.save(write_indexes=True)
|
||||
|
||||
|
||||
@succeeded.enter
|
||||
def enter_succeeded(self):
|
||||
print(f'{self}.on_succeeded() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
|
||||
# Suppressed: state transition logs
|
||||
self.archiveresult.update_for_workers(
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
@@ -270,7 +261,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||
|
||||
@failed.enter
|
||||
def enter_failed(self):
|
||||
print(f'{self}.on_failed() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
|
||||
# Suppressed: state transition logs
|
||||
self.archiveresult.update_for_workers(
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.FAILED,
|
||||
@@ -291,7 +282,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||
|
||||
@skipped.enter
|
||||
def enter_skipped(self):
|
||||
print(f'{self}.on_skipped() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
|
||||
# Suppressed: state transition logs
|
||||
self.archiveresult.update_for_workers(
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.SKIPPED,
|
||||
|
||||
@@ -503,15 +503,7 @@ class AddView(UserPassesTestMixin, FormView):
|
||||
mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
|
||||
)
|
||||
|
||||
# Start orchestrator in background to process the queued crawl
|
||||
try:
|
||||
from archivebox.workers.tasks import ensure_orchestrator_running
|
||||
ensure_orchestrator_running()
|
||||
except Exception as e:
|
||||
# Orchestrator may already be running via supervisord, or fail to start
|
||||
# This is not fatal - the crawl will be processed when orchestrator runs
|
||||
print(f'[!] Failed to start orchestrator: {e}')
|
||||
|
||||
# Orchestrator (managed by supervisord) will pick up the queued crawl
|
||||
return redirect(crawl.admin_change_url)
|
||||
|
||||
|
||||
@@ -539,6 +531,7 @@ def live_progress_view(request):
|
||||
from workers.orchestrator import Orchestrator
|
||||
from crawls.models import Crawl
|
||||
from core.models import Snapshot, ArchiveResult
|
||||
from django.db.models import Case, When, Value, IntegerField
|
||||
|
||||
# Get orchestrator status
|
||||
orchestrator_running = Orchestrator.is_running()
|
||||
@@ -570,8 +563,26 @@ def live_progress_view(request):
|
||||
crawl_snapshots = Snapshot.objects.filter(crawl=crawl)
|
||||
total_snapshots = crawl_snapshots.count()
|
||||
completed_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.SEALED).count()
|
||||
started_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.STARTED).count()
|
||||
pending_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.QUEUED).count()
|
||||
|
||||
# Count URLs in the crawl (for when snapshots haven't been created yet)
|
||||
urls_count = 0
|
||||
if crawl.urls:
|
||||
urls_count = len([u for u in crawl.urls.split('\n') if u.strip()])
|
||||
elif crawl.seed and crawl.seed.uri:
|
||||
# Try to get URL count from seed
|
||||
if crawl.seed.uri.startswith('file:///'):
|
||||
try:
|
||||
from pathlib import Path
|
||||
seed_file = Path(crawl.seed.uri.replace('file://', ''))
|
||||
if seed_file.exists():
|
||||
urls_count = len([l for l in seed_file.read_text().split('\n') if l.strip() and not l.startswith('#')])
|
||||
except:
|
||||
pass
|
||||
else:
|
||||
urls_count = 1 # Single URL seed
|
||||
|
||||
# Calculate crawl progress
|
||||
crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
|
||||
|
||||
@@ -590,16 +601,24 @@ def live_progress_view(request):
|
||||
# Calculate snapshot progress
|
||||
snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0
|
||||
|
||||
# Get active extractors for this snapshot
|
||||
active_extractors = [
|
||||
# Get all extractors for this snapshot
|
||||
# Order: started first, then queued, then completed
|
||||
all_extractors = [
|
||||
{
|
||||
'id': str(ar.id),
|
||||
'extractor': ar.extractor,
|
||||
'status': ar.status,
|
||||
'started': ar.start_ts.isoformat() if ar.start_ts else None,
|
||||
'progress': 50,
|
||||
}
|
||||
for ar in snapshot_results.filter(status=ArchiveResult.StatusChoices.STARTED).order_by('-start_ts')[:5]
|
||||
for ar in snapshot_results.annotate(
|
||||
status_order=Case(
|
||||
When(status=ArchiveResult.StatusChoices.STARTED, then=Value(0)),
|
||||
When(status=ArchiveResult.StatusChoices.QUEUED, then=Value(1)),
|
||||
When(status=ArchiveResult.StatusChoices.SUCCEEDED, then=Value(2)),
|
||||
When(status=ArchiveResult.StatusChoices.FAILED, then=Value(3)),
|
||||
default=Value(4),
|
||||
output_field=IntegerField(),
|
||||
)
|
||||
).order_by('status_order', 'extractor')
|
||||
]
|
||||
|
||||
active_snapshots_for_crawl.append({
|
||||
@@ -612,9 +631,17 @@ def live_progress_view(request):
|
||||
'completed_extractors': completed_extractors,
|
||||
'failed_extractors': failed_extractors,
|
||||
'pending_extractors': pending_extractors,
|
||||
'active_extractors': active_extractors,
|
||||
'all_extractors': all_extractors,
|
||||
})
|
||||
|
||||
# Check if crawl can start (for debugging stuck crawls)
|
||||
can_start = bool(crawl.seed and crawl.seed.uri)
|
||||
seed_uri = crawl.seed.uri[:60] if crawl.seed and crawl.seed.uri else None
|
||||
|
||||
# Check if retry_at is in the future (would prevent worker from claiming)
|
||||
retry_at_future = crawl.retry_at > timezone.now() if crawl.retry_at else False
|
||||
seconds_until_retry = int((crawl.retry_at - timezone.now()).total_seconds()) if crawl.retry_at and retry_at_future else 0
|
||||
|
||||
active_crawls.append({
|
||||
'id': str(crawl.id),
|
||||
'label': str(crawl)[:60],
|
||||
@@ -622,11 +649,17 @@ def live_progress_view(request):
|
||||
'started': crawl.modified_at.isoformat() if crawl.modified_at else None,
|
||||
'progress': crawl_progress,
|
||||
'max_depth': crawl.max_depth,
|
||||
'urls_count': urls_count,
|
||||
'total_snapshots': total_snapshots,
|
||||
'completed_snapshots': completed_snapshots,
|
||||
'started_snapshots': started_snapshots,
|
||||
'failed_snapshots': 0,
|
||||
'pending_snapshots': pending_snapshots,
|
||||
'active_snapshots': active_snapshots_for_crawl,
|
||||
'can_start': can_start,
|
||||
'seed_uri': seed_uri,
|
||||
'retry_at_future': retry_at_future,
|
||||
'seconds_until_retry': seconds_until_retry,
|
||||
})
|
||||
|
||||
return JsonResponse({
|
||||
|
||||
Reference in New Issue
Block a user