logging and admin ui improvements

This commit is contained in:
Nick Sweeting
2025-12-25 01:10:41 -08:00
parent 8218675ed4
commit 866f993f26
60 changed files with 2932 additions and 497 deletions

View File

@@ -19,6 +19,150 @@ from archivebox.hooks import get_extractor_icon
from core.models import ArchiveResult, Snapshot
def render_archiveresults_list(archiveresults_qs, limit=50):
    """Render a nice inline list view of archive results with status, extractor, output, and actions.

    Args:
        archiveresults_qs: queryset of ArchiveResult rows to render.
        limit: maximum number of rows shown; a footer links to the full
            changelist when the queryset is larger.

    Returns:
        A mark_safe() HTML fragment for use in a readonly admin field.

    All extractor-controlled text (output, cmd, cmd_version, pwd, extractor
    name) is HTML-escaped before interpolation — the result is wrapped in
    mark_safe(), so unescaped values would be a stored-XSS vector in the admin.
    """
    from html import escape

    results = list(archiveresults_qs.order_by('-end_ts').select_related('snapshot')[:limit])
    if not results:
        return mark_safe('<div style="color: #64748b; font-style: italic; padding: 16px 0;">No Archive Results yet...</div>')

    # Status colors: status -> (foreground, background)
    status_colors = {
        'succeeded': ('#166534', '#dcfce7'),  # green
        'failed': ('#991b1b', '#fee2e2'),     # red
        'queued': ('#6b7280', '#f3f4f6'),     # gray
        'started': ('#92400e', '#fef3c7'),    # amber
    }

    rows = []
    for idx, result in enumerate(results):
        status = result.status or 'queued'
        color, bg = status_colors.get(status, ('#6b7280', '#f3f4f6'))
        # Get extractor icon (provided by the plugin hook system)
        icon = get_extractor_icon(result.extractor)
        # Format timestamp
        end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-'
        # Truncate output for display
        full_output = result.output or '-'
        output_display = full_output[:60]
        if len(full_output) > 60:
            output_display += '...'
        # Get full command as a single display string
        cmd_str = ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd or '-')
        # Build output link (only deep-link into the output file on success)
        if result.output and result.status == 'succeeded':
            output_link = f'/archive/{result.snapshot.timestamp}/{result.output}'
        else:
            output_link = f'/archive/{result.snapshot.timestamp}/'
        # Get version - try cmd_version field
        version = result.cmd_version if result.cmd_version else '-'
        # Unique ID for this row's expandable output (idx + id prefix is unique per table)
        row_id = f'output_{idx}_{str(result.id)[:8]}'

        # Escape every value that originates from extractor output or other
        # untrusted data before interpolating it into the mark_safe() markup.
        esc_status = escape(status)
        esc_extractor = escape(str(result.extractor))
        esc_output_display = escape(output_display)
        esc_full_output = escape(full_output)
        esc_cmd = escape(cmd_str)
        esc_version = escape(str(version))
        esc_pwd = escape(str(result.pwd or '-'))
        esc_link = escape(output_link)

        rows.append(f'''
            <tr style="border-bottom: 1px solid #f1f5f9; transition: background 0.15s;" onmouseover="this.style.background='#f8fafc'" onmouseout="this.style.background='transparent'">
                <td style="padding: 10px 12px; white-space: nowrap;">
                    <span style="display: inline-block; padding: 3px 10px; border-radius: 12px;
                                 font-size: 11px; font-weight: 600; text-transform: uppercase;
                                 color: {color}; background: {bg};">{esc_status}</span>
                </td>
                <td style="padding: 10px 12px; white-space: nowrap; font-size: 20px;" title="{esc_extractor}">
                    {icon}
                </td>
                <td style="padding: 10px 12px; font-weight: 500; color: #334155;">
                    {esc_extractor}
                </td>
                <td style="padding: 10px 12px; max-width: 280px;">
                    <span onclick="document.getElementById('{row_id}').open = !document.getElementById('{row_id}').open"
                          style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 12px; cursor: pointer;"
                          title="Click to expand full output">
                        {esc_output_display}
                    </span>
                </td>
                <td style="padding: 10px 12px; white-space: nowrap; color: #64748b; font-size: 12px;">
                    {end_time}
                </td>
                <td style="padding: 10px 12px; white-space: nowrap; font-family: ui-monospace, monospace; font-size: 11px; color: #64748b;">
                    {esc_version}
                </td>
                <td style="padding: 10px 8px; white-space: nowrap;">
                    <div style="display: flex; gap: 4px;">
                        <a href="{esc_link}" target="_blank"
                           style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;"
                           title="View output">📄</a>
                        <a href="{reverse('admin:core_archiveresult_change', args=[result.id])}"
                           style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;"
                           title="Edit">✏️</a>
                    </div>
                </td>
            </tr>
            <tr style="border-bottom: 1px solid #e2e8f0;">
                <td colspan="7" style="padding: 0 12px 10px 12px;">
                    <details id="{row_id}" style="margin: 0;">
                        <summary style="cursor: pointer; font-size: 11px; color: #94a3b8; user-select: none;">
                            Details &amp; Output
                        </summary>
                        <div style="margin-top: 8px; padding: 10px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 6px; max-height: 200px; overflow: auto;">
                            <div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
                                <span style="margin-right: 16px;"><b>ID:</b> <code>{str(result.id)[:8]}...</code></span>
                                <span style="margin-right: 16px;"><b>Version:</b> <code>{esc_version}</code></span>
                                <span style="margin-right: 16px;"><b>PWD:</b> <code>{esc_pwd}</code></span>
                            </div>
                            <div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
                                <b>Output:</b>
                            </div>
                            <pre style="margin: 0; padding: 8px; background: #1e293b; border-radius: 4px; color: #e2e8f0; font-size: 12px; white-space: pre-wrap; word-break: break-all; max-height: 120px; overflow: auto;">{esc_full_output}</pre>
                            <div style="font-size: 11px; color: #64748b; margin-top: 8px;">
                                <b>Command:</b>
                            </div>
                            <pre style="margin: 0; padding: 8px; background: #1e293b; border-radius: 4px; color: #e2e8f0; font-size: 11px; white-space: pre-wrap; word-break: break-all;">{esc_cmd}</pre>
                        </div>
                    </details>
                </td>
            </tr>
        ''')

    total_count = archiveresults_qs.count()
    footer = ''
    if total_count > limit:
        footer = f'''
            <tr>
                <td colspan="7" style="padding: 12px; text-align: center; color: #64748b; font-size: 13px; background: #f8fafc;">
                    Showing {limit} of {total_count} results &nbsp;
                    <a href="/admin/core/archiveresult/?snapshot__id__exact={results[0].snapshot_id if results else ''}"
                       style="color: #2563eb;">View all →</a>
                </td>
            </tr>
        '''

    return mark_safe(f'''
        <div style="border: 1px solid #e2e8f0; border-radius: 8px; overflow: hidden; background: #fff; width: 100%;">
            <table style="width: 100%; border-collapse: collapse; font-size: 14px;">
                <thead>
                    <tr style="background: #f8fafc; border-bottom: 2px solid #e2e8f0;">
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Status</th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; width: 32px;"></th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Extractor</th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Output</th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Completed</th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Version</th>
                        <th style="padding: 10px 8px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Actions</th>
                    </tr>
                </thead>
                <tbody>
                    {''.join(rows)}
                    {footer}
                </tbody>
            </table>
        </div>
    ''')
class ArchiveResultInline(admin.TabularInline):
name = 'Archive Results Log'
@@ -97,18 +241,44 @@ class ArchiveResultAdmin(BaseModelAdmin):
sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon')
search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
fields = ('snapshot', 'extractor', 'status', 'retry_at', 'start_ts', 'end_ts', 'created_by', 'pwd', 'cmd_version', 'cmd', 'output', *readonly_fields)
autocomplete_fields = ['snapshot']
fieldsets = (
('Snapshot', {
'fields': ('snapshot', 'snapshot_info', 'tags_str'),
'classes': ('card', 'wide'),
}),
('Extractor', {
'fields': ('extractor', 'extractor_with_icon', 'status', 'retry_at'),
'classes': ('card',),
}),
('Timing', {
'fields': ('start_ts', 'end_ts', 'created_at', 'modified_at'),
'classes': ('card',),
}),
('Command', {
'fields': ('cmd', 'cmd_str', 'cmd_version', 'pwd'),
'classes': ('card',),
}),
('Output', {
'fields': ('output', 'output_summary'),
'classes': ('card', 'wide'),
}),
('Metadata', {
'fields': ('created_by',),
'classes': ('card',),
}),
)
list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
ordering = ['-start_ts']
list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE
paginator = AccelleratedPaginator
save_on_top = True
actions = ['delete_selected']
class Meta:
verbose_name = 'Archive Result'
verbose_name_plural = 'Archive Results'

View File

@@ -25,7 +25,7 @@ from archivebox.workers.tasks import bg_archive_snapshots, bg_add
from core.models import Tag
from core.admin_tags import TagInline
from core.admin_archiveresults import ArchiveResultInline
from core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
@@ -54,13 +54,48 @@ class SnapshotActionForm(ActionForm):
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
list_display = ('created_at', 'title_str', 'status', 'files', 'size', 'url_str')
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list')
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', *readonly_fields)
fieldsets = (
('URL', {
'fields': ('url', 'title'),
'classes': ('card', 'wide'),
}),
('Status', {
'fields': ('status', 'retry_at', 'status_info'),
'classes': ('card',),
}),
('Timestamps', {
'fields': ('bookmarked_at', 'created_at', 'modified_at', 'downloaded_at'),
'classes': ('card',),
}),
('Relations', {
'fields': ('crawl', 'created_by', 'tags_str'),
'classes': ('card',),
}),
('Config', {
'fields': ('config',),
'classes': ('card',),
}),
('Files', {
'fields': ('output_dir',),
'classes': ('card',),
}),
('Actions', {
'fields': ('admin_actions',),
'classes': ('card', 'wide'),
}),
('Archive Results', {
'fields': ('archiveresults_list',),
'classes': ('card', 'wide'),
}),
)
ordering = ['-created_at']
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
inlines = [TagInline, ArchiveResultInline]
inlines = [TagInline] # Removed ArchiveResultInline, using custom renderer instead
list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000)
action_form = SnapshotActionForm
@@ -155,6 +190,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
obj.extension or '-',
)
@admin.display(description='Archive Results')
def archiveresults_list(self, obj):
    """Readonly admin field: render this Snapshot's ArchiveResults as an inline HTML table."""
    return render_archiveresults_list(obj.archiveresult_set.all())
@admin.display(
description='Title',
ordering='title',

View File

@@ -51,11 +51,25 @@ class TagAdmin(BaseModelAdmin):
sort_fields = ('name', 'slug', 'id', 'created_by', 'created_at')
readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots')
search_fields = ('id', 'name', 'slug')
fields = ('name', 'created_by', *readonly_fields)
actions = ['delete_selected', 'merge_tags']
ordering = ['-created_at']
# inlines = [TaggedItemInline]
fieldsets = (
('Tag Info', {
'fields': ('name', 'slug'),
'classes': ('card',),
}),
('Metadata', {
'fields': ('id', 'created_by', 'created_at', 'modified_at'),
'classes': ('card',),
}),
('Snapshots', {
'fields': ('snapshots',),
'classes': ('card', 'wide'),
}),
)
paginator = AccelleratedPaginator

View File

@@ -1,7 +1,5 @@
__package__ = 'archivebox.core'
import sys
from django.apps import AppConfig
@@ -12,41 +10,3 @@ class CoreConfig(AppConfig):
"""Register the archivebox.core.admin_site as the main django admin site"""
from core.admin_site import register_admin_site
register_admin_site()
# Auto-start the orchestrator when running the web server
self._maybe_start_orchestrator()
def _maybe_start_orchestrator(self):
    """Start the orchestrator if we're running a web server.

    No-op when the current command is not a web server, when the
    RUN_ORCHESTRATOR env var is explicitly disabled, or in the
    non-reloader process under `runserver`. Any failure is logged
    as a warning and never propagates, so server startup cannot crash here.
    """
    import os
    # Don't start orchestrator during migrations, shell, tests, etc.
    # Only start when running: runserver, daphne, gunicorn, uwsgi
    if not self._is_web_server():
        return
    # Don't start if RUN_ORCHESTRATOR env var is explicitly set to false
    if os.environ.get('RUN_ORCHESTRATOR', '').lower() in ('false', '0', 'no'):
        return
    # Don't start in autoreload child process (avoid double-start)
    # NOTE(review): RUN_MAIN == 'true' is set in the autoreloader's child
    # process, so this guard actually skips the *parent* under runserver —
    # confirm the intended process is the one starting the orchestrator.
    if os.environ.get('RUN_MAIN') != 'true' and 'runserver' in sys.argv:
        return
    try:
        from workers.orchestrator import Orchestrator
        if not Orchestrator.is_running():
            # Start orchestrator as daemon (won't exit on idle when started by server)
            orchestrator = Orchestrator(exit_on_idle=False)
            orchestrator.start()
    except Exception as e:
        # Don't crash the server if orchestrator fails to start
        import logging
        logging.getLogger('archivebox').warning(f'Failed to auto-start orchestrator: {e}')
def _is_web_server(self) -> bool:
"""Check if we're running a web server command."""
# Check for common web server indicators
server_commands = ('runserver', 'daphne', 'gunicorn', 'uwsgi', 'server')
return any(cmd in ' '.join(sys.argv).lower() for cmd in server_commands)

View File

@@ -0,0 +1,22 @@
from django.db import migrations, models
class Migration(migrations.Migration):
    """Drop the global unique constraint on Snapshot.url and replace it with a
    composite (url, crawl) UniqueConstraint, so the same URL can be snapshotted
    once per crawl instead of once per archive."""

    dependencies = [
        ('core', '0024_snapshot_crawl'),
    ]

    operations = [
        # Remove the unique constraint on url
        migrations.AlterField(
            model_name='snapshot',
            name='url',
            field=models.URLField(db_index=True, unique=False),
        ),
        # Add unique constraint on (url, crawl) combination
        migrations.AddConstraint(
            model_name='snapshot',
            constraint=models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
        ),
    ]

View File

@@ -60,7 +60,8 @@ class Tag(ModelWithSerializers):
return self.name
def save(self, *args, **kwargs):
if self._state.adding:
is_new = self._state.adding
if is_new:
self.slug = slugify(self.name)
existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True))
i = None
@@ -72,6 +73,19 @@ class Tag(ModelWithSerializers):
i = (i or 0) + 1
super().save(*args, **kwargs)
if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
worker_type='DB',
event='Created Tag',
indent_level=0,
metadata={
'id': self.id,
'name': self.name,
'slug': self.slug,
},
)
@property
def api_url(self) -> str:
return reverse_lazy('api-1:get_tag', args=[self.id])
@@ -241,12 +255,13 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
if tag.strip()
))
try:
snapshot = self.get(url=url)
# Get most recent snapshot with this URL (URLs can exist in multiple crawls)
snapshot = self.filter(url=url).order_by('-created_at').first()
if snapshot:
if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
snapshot.title = title
snapshot.save(update_fields=['title', 'modified_at'])
except self.model.DoesNotExist:
else:
if timestamp:
while self.filter(timestamp=timestamp).exists():
timestamp = str(float(timestamp) + 1.0)
@@ -284,7 +299,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
url = models.URLField(unique=True, db_index=True)
url = models.URLField(unique=False, db_index=True) # URLs can appear in multiple crawls
timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True)
crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True) # type: ignore
@@ -313,11 +328,16 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
class Meta(TypedModelMeta):
verbose_name = "Snapshot"
verbose_name_plural = "Snapshots"
constraints = [
# Allow same URL in different crawls, but not duplicates within same crawl
models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
]
def __str__(self):
return f'[{self.id}] {self.url[:64]}'
def save(self, *args, **kwargs):
is_new = self._state.adding
if not self.bookmarked_at:
self.bookmarked_at = self.created_at or timezone.now()
if not self.timestamp:
@@ -327,6 +347,21 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
self.crawl.urls += f'\n{self.url}'
self.crawl.save()
if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
worker_type='DB',
event='Created Snapshot',
indent_level=2,
url=self.url,
metadata={
'id': str(self.id),
'crawl_id': str(self.crawl_id) if self.crawl_id else None,
'depth': self.depth,
'status': self.status,
},
)
def output_dir_parent(self) -> str:
return 'archive'
@@ -807,6 +842,24 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def __str__(self):
return f'[{self.id}] {self.snapshot.url[:64]} -> {self.extractor}'
def save(self, *args, **kwargs):
is_new = self._state.adding
super().save(*args, **kwargs)
if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
worker_type='DB',
event='Created ArchiveResult',
indent_level=3,
extractor=self.extractor,
metadata={
'id': str(self.id),
'snapshot_id': str(self.snapshot_id),
'snapshot_url': str(self.snapshot.url)[:64],
'status': self.status,
},
)
@cached_property
def snapshot_dir(self):
return Path(self.snapshot.output_dir)
@@ -879,7 +932,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
from django.utils import timezone
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook
extractor_dir = Path(self.snapshot.output_dir) / self.extractor
config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
# Find hook for this extractor
@@ -899,6 +951,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
self.save()
return
# Use plugin directory name instead of extractor name (removes numeric prefix)
plugin_name = hook.parent.name
extractor_dir = Path(self.snapshot.output_dir) / plugin_name
# Run the hook
start_ts = timezone.now()
result = run_hook(

View File

@@ -45,15 +45,14 @@ class SnapshotMachine(StateMachine, strict_states=True):
super().__init__(snapshot, *args, **kwargs)
def __repr__(self) -> str:
return f'[grey53]Snapshot\\[{self.snapshot.id}] 🏃‍♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.snapshot.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
return f'Snapshot[{self.snapshot.id}]'
def __str__(self) -> str:
return self.__repr__()
def can_start(self) -> bool:
can_start = bool(self.snapshot.url)
if not can_start:
print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue] cant start yet +{timezone.now() - self.snapshot.retry_at}s')
# Suppressed: queue waiting logs
return can_start
def is_finished(self) -> bool:
@@ -73,15 +72,15 @@ class SnapshotMachine(StateMachine, strict_states=True):
@queued.enter
def enter_queued(self):
print(f'{self}.on_queued() ↳ snapshot.retry_at = now()')
# Suppressed: state transition logs
self.snapshot.update_for_workers(
retry_at=timezone.now(),
status=Snapshot.StatusChoices.QUEUED,
)
@started.enter
def enter_started(self):
print(f'{self}.on_started() ↳ snapshot.run()')
# Suppressed: state transition logs
# lock the snapshot while we create the pending archiveresults
self.snapshot.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
@@ -95,10 +94,10 @@ class SnapshotMachine(StateMachine, strict_states=True):
retry_at=timezone.now() + timedelta(seconds=5), # wait 5s before checking it again
status=Snapshot.StatusChoices.STARTED,
)
@sealed.enter
def enter_sealed(self):
print(f'{self}.on_sealed() ↳ snapshot.retry_at=None')
# Suppressed: state transition logs
self.snapshot.update_for_workers(
retry_at=None,
status=Snapshot.StatusChoices.SEALED,
@@ -161,15 +160,14 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
super().__init__(archiveresult, *args, **kwargs)
def __repr__(self) -> str:
return f'[grey53]ArchiveResult\\[{self.archiveresult.id}] 🏃‍♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.archiveresult.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
return f'ArchiveResult[{self.archiveresult.id}]'
def __str__(self) -> str:
return self.__repr__()
def can_start(self) -> bool:
can_start = bool(self.archiveresult.snapshot.url)
if not can_start:
print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue]: cant start yet +{timezone.now() - self.archiveresult.retry_at}s')
# Suppressed: queue waiting logs
return can_start
def is_succeeded(self) -> bool:
@@ -202,41 +200,34 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@queued.enter
def enter_queued(self):
print(f'{self}.on_queued() ↳ archiveresult.retry_at = now()')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=timezone.now(),
status=ArchiveResult.StatusChoices.QUEUED,
start_ts=None,
) # bump the snapshot's retry_at so they pickup any new changes
@started.enter
def enter_started(self):
print(f'{self}.on_started() ↳ archiveresult.start_ts + run_extractor()')
# Suppressed: state transition logs
# Lock the object and mark start time
self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for extractor
status=ArchiveResult.StatusChoices.STARTED,
start_ts=timezone.now(),
)
# Run the extractor - this updates status, output, timestamps, etc.
self.archiveresult.run()
# Save the updated result
self.archiveresult.save()
# Log the result
if self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
print(f'{self} ✅ extractor succeeded: {self.archiveresult.output[:50] if self.archiveresult.output else ""}...')
elif self.archiveresult.status == ArchiveResult.StatusChoices.FAILED:
print(f'{self} ❌ extractor failed: {self.archiveresult.output[:100] if self.archiveresult.output else ""}...')
elif self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED:
print(f'{self} ⏭️ extractor skipped: {self.archiveresult.output[:50] if self.archiveresult.output else ""}')
# Suppressed: extractor result logs (already logged by worker)
@backoff.enter
def enter_backoff(self):
print(f'{self}.on_backoff() ↳ archiveresult.retries += 1, archiveresult.bump_retry_at(+60s), archiveresult.end_ts = None')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=60),
status=ArchiveResult.StatusChoices.BACKOFF,
@@ -244,10 +235,10 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
# retries=F('retries') + 1, # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
)
self.archiveresult.save(write_indexes=True)
@succeeded.enter
def enter_succeeded(self):
print(f'{self}.on_succeeded() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.SUCCEEDED,
@@ -270,7 +261,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@failed.enter
def enter_failed(self):
print(f'{self}.on_failed() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.FAILED,
@@ -291,7 +282,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@skipped.enter
def enter_skipped(self):
print(f'{self}.on_skipped() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.SKIPPED,

View File

@@ -503,15 +503,7 @@ class AddView(UserPassesTestMixin, FormView):
mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
)
# Start orchestrator in background to process the queued crawl
try:
from archivebox.workers.tasks import ensure_orchestrator_running
ensure_orchestrator_running()
except Exception as e:
# Orchestrator may already be running via supervisord, or fail to start
# This is not fatal - the crawl will be processed when orchestrator runs
print(f'[!] Failed to start orchestrator: {e}')
# Orchestrator (managed by supervisord) will pick up the queued crawl
return redirect(crawl.admin_change_url)
@@ -539,6 +531,7 @@ def live_progress_view(request):
from workers.orchestrator import Orchestrator
from crawls.models import Crawl
from core.models import Snapshot, ArchiveResult
from django.db.models import Case, When, Value, IntegerField
# Get orchestrator status
orchestrator_running = Orchestrator.is_running()
@@ -570,8 +563,26 @@ def live_progress_view(request):
crawl_snapshots = Snapshot.objects.filter(crawl=crawl)
total_snapshots = crawl_snapshots.count()
completed_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.SEALED).count()
started_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.STARTED).count()
pending_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.QUEUED).count()
# Count URLs in the crawl (for when snapshots haven't been created yet)
urls_count = 0
if crawl.urls:
urls_count = len([u for u in crawl.urls.split('\n') if u.strip()])
elif crawl.seed and crawl.seed.uri:
# Try to get URL count from seed
if crawl.seed.uri.startswith('file:///'):
try:
from pathlib import Path
seed_file = Path(crawl.seed.uri.replace('file://', ''))
if seed_file.exists():
urls_count = len([l for l in seed_file.read_text().split('\n') if l.strip() and not l.startswith('#')])
except:
pass
else:
urls_count = 1 # Single URL seed
# Calculate crawl progress
crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
@@ -590,16 +601,24 @@ def live_progress_view(request):
# Calculate snapshot progress
snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0
# Get active extractors for this snapshot
active_extractors = [
# Get all extractors for this snapshot
# Order: started first, then queued, then completed
all_extractors = [
{
'id': str(ar.id),
'extractor': ar.extractor,
'status': ar.status,
'started': ar.start_ts.isoformat() if ar.start_ts else None,
'progress': 50,
}
for ar in snapshot_results.filter(status=ArchiveResult.StatusChoices.STARTED).order_by('-start_ts')[:5]
for ar in snapshot_results.annotate(
status_order=Case(
When(status=ArchiveResult.StatusChoices.STARTED, then=Value(0)),
When(status=ArchiveResult.StatusChoices.QUEUED, then=Value(1)),
When(status=ArchiveResult.StatusChoices.SUCCEEDED, then=Value(2)),
When(status=ArchiveResult.StatusChoices.FAILED, then=Value(3)),
default=Value(4),
output_field=IntegerField(),
)
).order_by('status_order', 'extractor')
]
active_snapshots_for_crawl.append({
@@ -612,9 +631,17 @@ def live_progress_view(request):
'completed_extractors': completed_extractors,
'failed_extractors': failed_extractors,
'pending_extractors': pending_extractors,
'active_extractors': active_extractors,
'all_extractors': all_extractors,
})
# Check if crawl can start (for debugging stuck crawls)
can_start = bool(crawl.seed and crawl.seed.uri)
seed_uri = crawl.seed.uri[:60] if crawl.seed and crawl.seed.uri else None
# Check if retry_at is in the future (would prevent worker from claiming)
retry_at_future = crawl.retry_at > timezone.now() if crawl.retry_at else False
seconds_until_retry = int((crawl.retry_at - timezone.now()).total_seconds()) if crawl.retry_at and retry_at_future else 0
active_crawls.append({
'id': str(crawl.id),
'label': str(crawl)[:60],
@@ -622,11 +649,17 @@ def live_progress_view(request):
'started': crawl.modified_at.isoformat() if crawl.modified_at else None,
'progress': crawl_progress,
'max_depth': crawl.max_depth,
'urls_count': urls_count,
'total_snapshots': total_snapshots,
'completed_snapshots': completed_snapshots,
'started_snapshots': started_snapshots,
'failed_snapshots': 0,
'pending_snapshots': pending_snapshots,
'active_snapshots': active_snapshots_for_crawl,
'can_start': can_start,
'seed_uri': seed_uri,
'retry_at_future': retry_at_future,
'seconds_until_retry': seconds_until_retry,
})
return JsonResponse({