logging and admin ui improvements

This commit is contained in:
Nick Sweeting
2025-12-25 01:10:41 -08:00
parent 8218675ed4
commit 866f993f26
60 changed files with 2932 additions and 497 deletions

View File

@@ -19,6 +19,150 @@ from archivebox.hooks import get_extractor_icon
from core.models import ArchiveResult, Snapshot
def render_archiveresults_list(archiveresults_qs, limit=50):
    """Render a nice inline list view of archive results with status, extractor, output, and actions.

    Args:
        archiveresults_qs: queryset of ArchiveResult rows to render.
        limit: maximum number of rows shown; a footer links to the full
            changelist when the queryset is larger.

    Returns:
        A mark_safe() HTML fragment for use in a readonly admin field.

    All extractor-controlled text (output, cmd, cmd_version, pwd, extractor
    name) is HTML-escaped before interpolation — the result is wrapped in
    mark_safe(), so unescaped values would be a stored-XSS vector in the admin.
    """
    from html import escape

    results = list(archiveresults_qs.order_by('-end_ts').select_related('snapshot')[:limit])
    if not results:
        return mark_safe('<div style="color: #64748b; font-style: italic; padding: 16px 0;">No Archive Results yet...</div>')

    # Status colors: status -> (foreground, background)
    status_colors = {
        'succeeded': ('#166534', '#dcfce7'),  # green
        'failed': ('#991b1b', '#fee2e2'),     # red
        'queued': ('#6b7280', '#f3f4f6'),     # gray
        'started': ('#92400e', '#fef3c7'),    # amber
    }

    rows = []
    for idx, result in enumerate(results):
        status = result.status or 'queued'
        color, bg = status_colors.get(status, ('#6b7280', '#f3f4f6'))
        # Get extractor icon (provided by the plugin hook system)
        icon = get_extractor_icon(result.extractor)
        # Format timestamp
        end_time = result.end_ts.strftime('%Y-%m-%d %H:%M:%S') if result.end_ts else '-'
        # Truncate output for display
        full_output = result.output or '-'
        output_display = full_output[:60]
        if len(full_output) > 60:
            output_display += '...'
        # Get full command as a single display string
        cmd_str = ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd or '-')
        # Build output link (only deep-link into the output file on success)
        if result.output and result.status == 'succeeded':
            output_link = f'/archive/{result.snapshot.timestamp}/{result.output}'
        else:
            output_link = f'/archive/{result.snapshot.timestamp}/'
        # Get version - try cmd_version field
        version = result.cmd_version if result.cmd_version else '-'
        # Unique ID for this row's expandable output (idx + id prefix is unique per table)
        row_id = f'output_{idx}_{str(result.id)[:8]}'

        # Escape every value that originates from extractor output or other
        # untrusted data before interpolating it into the mark_safe() markup.
        esc_status = escape(status)
        esc_extractor = escape(str(result.extractor))
        esc_output_display = escape(output_display)
        esc_full_output = escape(full_output)
        esc_cmd = escape(cmd_str)
        esc_version = escape(str(version))
        esc_pwd = escape(str(result.pwd or '-'))
        esc_link = escape(output_link)

        rows.append(f'''
            <tr style="border-bottom: 1px solid #f1f5f9; transition: background 0.15s;" onmouseover="this.style.background='#f8fafc'" onmouseout="this.style.background='transparent'">
                <td style="padding: 10px 12px; white-space: nowrap;">
                    <span style="display: inline-block; padding: 3px 10px; border-radius: 12px;
                                 font-size: 11px; font-weight: 600; text-transform: uppercase;
                                 color: {color}; background: {bg};">{esc_status}</span>
                </td>
                <td style="padding: 10px 12px; white-space: nowrap; font-size: 20px;" title="{esc_extractor}">
                    {icon}
                </td>
                <td style="padding: 10px 12px; font-weight: 500; color: #334155;">
                    {esc_extractor}
                </td>
                <td style="padding: 10px 12px; max-width: 280px;">
                    <span onclick="document.getElementById('{row_id}').open = !document.getElementById('{row_id}').open"
                          style="color: #2563eb; text-decoration: none; font-family: ui-monospace, monospace; font-size: 12px; cursor: pointer;"
                          title="Click to expand full output">
                        {esc_output_display}
                    </span>
                </td>
                <td style="padding: 10px 12px; white-space: nowrap; color: #64748b; font-size: 12px;">
                    {end_time}
                </td>
                <td style="padding: 10px 12px; white-space: nowrap; font-family: ui-monospace, monospace; font-size: 11px; color: #64748b;">
                    {esc_version}
                </td>
                <td style="padding: 10px 8px; white-space: nowrap;">
                    <div style="display: flex; gap: 4px;">
                        <a href="{esc_link}" target="_blank"
                           style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;"
                           title="View output">📄</a>
                        <a href="{reverse('admin:core_archiveresult_change', args=[result.id])}"
                           style="padding: 4px 8px; background: #f1f5f9; border-radius: 4px; color: #475569; text-decoration: none; font-size: 11px;"
                           title="Edit">✏️</a>
                    </div>
                </td>
            </tr>
            <tr style="border-bottom: 1px solid #e2e8f0;">
                <td colspan="7" style="padding: 0 12px 10px 12px;">
                    <details id="{row_id}" style="margin: 0;">
                        <summary style="cursor: pointer; font-size: 11px; color: #94a3b8; user-select: none;">
                            Details &amp; Output
                        </summary>
                        <div style="margin-top: 8px; padding: 10px; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 6px; max-height: 200px; overflow: auto;">
                            <div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
                                <span style="margin-right: 16px;"><b>ID:</b> <code>{str(result.id)[:8]}...</code></span>
                                <span style="margin-right: 16px;"><b>Version:</b> <code>{esc_version}</code></span>
                                <span style="margin-right: 16px;"><b>PWD:</b> <code>{esc_pwd}</code></span>
                            </div>
                            <div style="font-size: 11px; color: #64748b; margin-bottom: 8px;">
                                <b>Output:</b>
                            </div>
                            <pre style="margin: 0; padding: 8px; background: #1e293b; border-radius: 4px; color: #e2e8f0; font-size: 12px; white-space: pre-wrap; word-break: break-all; max-height: 120px; overflow: auto;">{esc_full_output}</pre>
                            <div style="font-size: 11px; color: #64748b; margin-top: 8px;">
                                <b>Command:</b>
                            </div>
                            <pre style="margin: 0; padding: 8px; background: #1e293b; border-radius: 4px; color: #e2e8f0; font-size: 11px; white-space: pre-wrap; word-break: break-all;">{esc_cmd}</pre>
                        </div>
                    </details>
                </td>
            </tr>
        ''')

    total_count = archiveresults_qs.count()
    footer = ''
    if total_count > limit:
        footer = f'''
            <tr>
                <td colspan="7" style="padding: 12px; text-align: center; color: #64748b; font-size: 13px; background: #f8fafc;">
                    Showing {limit} of {total_count} results &nbsp;
                    <a href="/admin/core/archiveresult/?snapshot__id__exact={results[0].snapshot_id if results else ''}"
                       style="color: #2563eb;">View all →</a>
                </td>
            </tr>
        '''

    return mark_safe(f'''
        <div style="border: 1px solid #e2e8f0; border-radius: 8px; overflow: hidden; background: #fff; width: 100%;">
            <table style="width: 100%; border-collapse: collapse; font-size: 14px;">
                <thead>
                    <tr style="background: #f8fafc; border-bottom: 2px solid #e2e8f0;">
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Status</th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; width: 32px;"></th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Extractor</th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Output</th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Completed</th>
                        <th style="padding: 10px 12px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Version</th>
                        <th style="padding: 10px 8px; text-align: left; font-weight: 600; color: #475569; font-size: 12px; text-transform: uppercase; letter-spacing: 0.05em;">Actions</th>
                    </tr>
                </thead>
                <tbody>
                    {''.join(rows)}
                    {footer}
                </tbody>
            </table>
        </div>
    ''')
class ArchiveResultInline(admin.TabularInline):
name = 'Archive Results Log'
@@ -97,18 +241,44 @@ class ArchiveResultAdmin(BaseModelAdmin):
sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon')
search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
fields = ('snapshot', 'extractor', 'status', 'retry_at', 'start_ts', 'end_ts', 'created_by', 'pwd', 'cmd_version', 'cmd', 'output', *readonly_fields)
autocomplete_fields = ['snapshot']
fieldsets = (
('Snapshot', {
'fields': ('snapshot', 'snapshot_info', 'tags_str'),
'classes': ('card', 'wide'),
}),
('Extractor', {
'fields': ('extractor', 'extractor_with_icon', 'status', 'retry_at'),
'classes': ('card',),
}),
('Timing', {
'fields': ('start_ts', 'end_ts', 'created_at', 'modified_at'),
'classes': ('card',),
}),
('Command', {
'fields': ('cmd', 'cmd_str', 'cmd_version', 'pwd'),
'classes': ('card',),
}),
('Output', {
'fields': ('output', 'output_summary'),
'classes': ('card', 'wide'),
}),
('Metadata', {
'fields': ('created_by',),
'classes': ('card',),
}),
)
list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
ordering = ['-start_ts']
list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE
paginator = AccelleratedPaginator
save_on_top = True
actions = ['delete_selected']
class Meta:
verbose_name = 'Archive Result'
verbose_name_plural = 'Archive Results'

View File

@@ -25,7 +25,7 @@ from archivebox.workers.tasks import bg_archive_snapshots, bg_add
from core.models import Tag
from core.admin_tags import TagInline
from core.admin_archiveresults import ArchiveResultInline
from core.admin_archiveresults import ArchiveResultInline, render_archiveresults_list
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
@@ -54,13 +54,48 @@ class SnapshotActionForm(ActionForm):
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
list_display = ('created_at', 'title_str', 'status', 'files', 'size', 'url_str')
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list')
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', *readonly_fields)
fieldsets = (
('URL', {
'fields': ('url', 'title'),
'classes': ('card', 'wide'),
}),
('Status', {
'fields': ('status', 'retry_at', 'status_info'),
'classes': ('card',),
}),
('Timestamps', {
'fields': ('bookmarked_at', 'created_at', 'modified_at', 'downloaded_at'),
'classes': ('card',),
}),
('Relations', {
'fields': ('crawl', 'created_by', 'tags_str'),
'classes': ('card',),
}),
('Config', {
'fields': ('config',),
'classes': ('card',),
}),
('Files', {
'fields': ('output_dir',),
'classes': ('card',),
}),
('Actions', {
'fields': ('admin_actions',),
'classes': ('card', 'wide'),
}),
('Archive Results', {
'fields': ('archiveresults_list',),
'classes': ('card', 'wide'),
}),
)
ordering = ['-created_at']
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
inlines = [TagInline, ArchiveResultInline]
inlines = [TagInline] # Removed ArchiveResultInline, using custom renderer instead
list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000)
action_form = SnapshotActionForm
@@ -155,6 +190,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
obj.extension or '-',
)
@admin.display(description='Archive Results')
def archiveresults_list(self, obj):
    """Readonly admin field: render this Snapshot's ArchiveResults as an inline HTML table."""
    return render_archiveresults_list(obj.archiveresult_set.all())
@admin.display(
description='Title',
ordering='title',

View File

@@ -51,11 +51,25 @@ class TagAdmin(BaseModelAdmin):
sort_fields = ('name', 'slug', 'id', 'created_by', 'created_at')
readonly_fields = ('slug', 'id', 'created_at', 'modified_at', 'snapshots')
search_fields = ('id', 'name', 'slug')
fields = ('name', 'created_by', *readonly_fields)
actions = ['delete_selected', 'merge_tags']
ordering = ['-created_at']
# inlines = [TaggedItemInline]
fieldsets = (
('Tag Info', {
'fields': ('name', 'slug'),
'classes': ('card',),
}),
('Metadata', {
'fields': ('id', 'created_by', 'created_at', 'modified_at'),
'classes': ('card',),
}),
('Snapshots', {
'fields': ('snapshots',),
'classes': ('card', 'wide'),
}),
)
paginator = AccelleratedPaginator

View File

@@ -1,7 +1,5 @@
__package__ = 'archivebox.core'
import sys
from django.apps import AppConfig
@@ -12,41 +10,3 @@ class CoreConfig(AppConfig):
"""Register the archivebox.core.admin_site as the main django admin site"""
from core.admin_site import register_admin_site
register_admin_site()
# Auto-start the orchestrator when running the web server
self._maybe_start_orchestrator()
def _maybe_start_orchestrator(self):
    """Start the orchestrator if we're running a web server.

    No-op when the current command is not a web server, when the
    RUN_ORCHESTRATOR env var is explicitly disabled, or in the
    non-reloader process under `runserver`. Any failure is logged
    as a warning and never propagates, so server startup cannot crash here.
    """
    import os
    # Don't start orchestrator during migrations, shell, tests, etc.
    # Only start when running: runserver, daphne, gunicorn, uwsgi
    if not self._is_web_server():
        return
    # Don't start if RUN_ORCHESTRATOR env var is explicitly set to false
    if os.environ.get('RUN_ORCHESTRATOR', '').lower() in ('false', '0', 'no'):
        return
    # Don't start in autoreload child process (avoid double-start)
    # NOTE(review): RUN_MAIN == 'true' is set in the autoreloader's child
    # process, so this guard actually skips the *parent* under runserver —
    # confirm the intended process is the one starting the orchestrator.
    if os.environ.get('RUN_MAIN') != 'true' and 'runserver' in sys.argv:
        return
    try:
        from workers.orchestrator import Orchestrator
        if not Orchestrator.is_running():
            # Start orchestrator as daemon (won't exit on idle when started by server)
            orchestrator = Orchestrator(exit_on_idle=False)
            orchestrator.start()
    except Exception as e:
        # Don't crash the server if orchestrator fails to start
        import logging
        logging.getLogger('archivebox').warning(f'Failed to auto-start orchestrator: {e}')
def _is_web_server(self) -> bool:
"""Check if we're running a web server command."""
# Check for common web server indicators
server_commands = ('runserver', 'daphne', 'gunicorn', 'uwsgi', 'server')
return any(cmd in ' '.join(sys.argv).lower() for cmd in server_commands)

View File

@@ -0,0 +1,22 @@
from django.db import migrations, models
class Migration(migrations.Migration):
    """Drop the global unique constraint on Snapshot.url and replace it with a
    composite (url, crawl) UniqueConstraint, so the same URL can be snapshotted
    once per crawl instead of once per archive."""

    dependencies = [
        ('core', '0024_snapshot_crawl'),
    ]

    operations = [
        # Remove the unique constraint on url
        migrations.AlterField(
            model_name='snapshot',
            name='url',
            field=models.URLField(db_index=True, unique=False),
        ),
        # Add unique constraint on (url, crawl) combination
        migrations.AddConstraint(
            model_name='snapshot',
            constraint=models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
        ),
    ]

View File

@@ -60,7 +60,8 @@ class Tag(ModelWithSerializers):
return self.name
def save(self, *args, **kwargs):
if self._state.adding:
is_new = self._state.adding
if is_new:
self.slug = slugify(self.name)
existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True))
i = None
@@ -72,6 +73,19 @@ class Tag(ModelWithSerializers):
i = (i or 0) + 1
super().save(*args, **kwargs)
if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
worker_type='DB',
event='Created Tag',
indent_level=0,
metadata={
'id': self.id,
'name': self.name,
'slug': self.slug,
},
)
@property
def api_url(self) -> str:
return reverse_lazy('api-1:get_tag', args=[self.id])
@@ -241,12 +255,13 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
if tag.strip()
))
try:
snapshot = self.get(url=url)
# Get most recent snapshot with this URL (URLs can exist in multiple crawls)
snapshot = self.filter(url=url).order_by('-created_at').first()
if snapshot:
if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
snapshot.title = title
snapshot.save(update_fields=['title', 'modified_at'])
except self.model.DoesNotExist:
else:
if timestamp:
while self.filter(timestamp=timestamp).exists():
timestamp = str(float(timestamp) + 1.0)
@@ -284,7 +299,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
url = models.URLField(unique=True, db_index=True)
url = models.URLField(unique=False, db_index=True) # URLs can appear in multiple crawls
timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True)
crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True) # type: ignore
@@ -313,11 +328,16 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
class Meta(TypedModelMeta):
verbose_name = "Snapshot"
verbose_name_plural = "Snapshots"
constraints = [
# Allow same URL in different crawls, but not duplicates within same crawl
models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
]
def __str__(self):
return f'[{self.id}] {self.url[:64]}'
def save(self, *args, **kwargs):
is_new = self._state.adding
if not self.bookmarked_at:
self.bookmarked_at = self.created_at or timezone.now()
if not self.timestamp:
@@ -327,6 +347,21 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
self.crawl.urls += f'\n{self.url}'
self.crawl.save()
if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
worker_type='DB',
event='Created Snapshot',
indent_level=2,
url=self.url,
metadata={
'id': str(self.id),
'crawl_id': str(self.crawl_id) if self.crawl_id else None,
'depth': self.depth,
'status': self.status,
},
)
def output_dir_parent(self) -> str:
return 'archive'
@@ -807,6 +842,24 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def __str__(self):
return f'[{self.id}] {self.snapshot.url[:64]} -> {self.extractor}'
def save(self, *args, **kwargs):
is_new = self._state.adding
super().save(*args, **kwargs)
if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
worker_type='DB',
event='Created ArchiveResult',
indent_level=3,
extractor=self.extractor,
metadata={
'id': str(self.id),
'snapshot_id': str(self.snapshot_id),
'snapshot_url': str(self.snapshot.url)[:64],
'status': self.status,
},
)
@cached_property
def snapshot_dir(self):
return Path(self.snapshot.output_dir)
@@ -879,7 +932,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
from django.utils import timezone
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook
extractor_dir = Path(self.snapshot.output_dir) / self.extractor
config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
# Find hook for this extractor
@@ -899,6 +951,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
self.save()
return
# Use plugin directory name instead of extractor name (removes numeric prefix)
plugin_name = hook.parent.name
extractor_dir = Path(self.snapshot.output_dir) / plugin_name
# Run the hook
start_ts = timezone.now()
result = run_hook(

View File

@@ -45,15 +45,14 @@ class SnapshotMachine(StateMachine, strict_states=True):
super().__init__(snapshot, *args, **kwargs)
def __repr__(self) -> str:
return f'[grey53]Snapshot\\[{self.snapshot.id}] 🏃‍♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.snapshot.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
return f'Snapshot[{self.snapshot.id}]'
def __str__(self) -> str:
return self.__repr__()
def can_start(self) -> bool:
can_start = bool(self.snapshot.url)
if not can_start:
print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue] cant start yet +{timezone.now() - self.snapshot.retry_at}s')
# Suppressed: queue waiting logs
return can_start
def is_finished(self) -> bool:
@@ -73,15 +72,15 @@ class SnapshotMachine(StateMachine, strict_states=True):
@queued.enter
def enter_queued(self):
print(f'{self}.on_queued() ↳ snapshot.retry_at = now()')
# Suppressed: state transition logs
self.snapshot.update_for_workers(
retry_at=timezone.now(),
status=Snapshot.StatusChoices.QUEUED,
)
@started.enter
def enter_started(self):
print(f'{self}.on_started() ↳ snapshot.run()')
# Suppressed: state transition logs
# lock the snapshot while we create the pending archiveresults
self.snapshot.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
@@ -95,10 +94,10 @@ class SnapshotMachine(StateMachine, strict_states=True):
retry_at=timezone.now() + timedelta(seconds=5), # wait 5s before checking it again
status=Snapshot.StatusChoices.STARTED,
)
@sealed.enter
def enter_sealed(self):
print(f'{self}.on_sealed() ↳ snapshot.retry_at=None')
# Suppressed: state transition logs
self.snapshot.update_for_workers(
retry_at=None,
status=Snapshot.StatusChoices.SEALED,
@@ -161,15 +160,14 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
super().__init__(archiveresult, *args, **kwargs)
def __repr__(self) -> str:
return f'[grey53]ArchiveResult\\[{self.archiveresult.id}] 🏃‍♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.archiveresult.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
return f'ArchiveResult[{self.archiveresult.id}]'
def __str__(self) -> str:
return self.__repr__()
def can_start(self) -> bool:
can_start = bool(self.archiveresult.snapshot.url)
if not can_start:
print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue]: cant start yet +{timezone.now() - self.archiveresult.retry_at}s')
# Suppressed: queue waiting logs
return can_start
def is_succeeded(self) -> bool:
@@ -202,41 +200,34 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@queued.enter
def enter_queued(self):
print(f'{self}.on_queued() ↳ archiveresult.retry_at = now()')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=timezone.now(),
status=ArchiveResult.StatusChoices.QUEUED,
start_ts=None,
) # bump the snapshot's retry_at so they pickup any new changes
@started.enter
def enter_started(self):
print(f'{self}.on_started() ↳ archiveresult.start_ts + run_extractor()')
# Suppressed: state transition logs
# Lock the object and mark start time
self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for extractor
status=ArchiveResult.StatusChoices.STARTED,
start_ts=timezone.now(),
)
# Run the extractor - this updates status, output, timestamps, etc.
self.archiveresult.run()
# Save the updated result
self.archiveresult.save()
# Log the result
if self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
print(f'{self} ✅ extractor succeeded: {self.archiveresult.output[:50] if self.archiveresult.output else ""}...')
elif self.archiveresult.status == ArchiveResult.StatusChoices.FAILED:
print(f'{self} ❌ extractor failed: {self.archiveresult.output[:100] if self.archiveresult.output else ""}...')
elif self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED:
print(f'{self} ⏭️ extractor skipped: {self.archiveresult.output[:50] if self.archiveresult.output else ""}')
# Suppressed: extractor result logs (already logged by worker)
@backoff.enter
def enter_backoff(self):
print(f'{self}.on_backoff() ↳ archiveresult.retries += 1, archiveresult.bump_retry_at(+60s), archiveresult.end_ts = None')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=60),
status=ArchiveResult.StatusChoices.BACKOFF,
@@ -244,10 +235,10 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
# retries=F('retries') + 1, # F() equivalent to getattr(self.archiveresult, 'retries', 0) + 1,
)
self.archiveresult.save(write_indexes=True)
@succeeded.enter
def enter_succeeded(self):
print(f'{self}.on_succeeded() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.SUCCEEDED,
@@ -270,7 +261,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@failed.enter
def enter_failed(self):
print(f'{self}.on_failed() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.FAILED,
@@ -291,7 +282,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@skipped.enter
def enter_skipped(self):
print(f'{self}.on_skipped() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
# Suppressed: state transition logs
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.SKIPPED,

View File

@@ -503,15 +503,7 @@ class AddView(UserPassesTestMixin, FormView):
mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
)
# Start orchestrator in background to process the queued crawl
try:
from archivebox.workers.tasks import ensure_orchestrator_running
ensure_orchestrator_running()
except Exception as e:
# Orchestrator may already be running via supervisord, or fail to start
# This is not fatal - the crawl will be processed when orchestrator runs
print(f'[!] Failed to start orchestrator: {e}')
# Orchestrator (managed by supervisord) will pick up the queued crawl
return redirect(crawl.admin_change_url)
@@ -539,6 +531,7 @@ def live_progress_view(request):
from workers.orchestrator import Orchestrator
from crawls.models import Crawl
from core.models import Snapshot, ArchiveResult
from django.db.models import Case, When, Value, IntegerField
# Get orchestrator status
orchestrator_running = Orchestrator.is_running()
@@ -570,8 +563,26 @@ def live_progress_view(request):
crawl_snapshots = Snapshot.objects.filter(crawl=crawl)
total_snapshots = crawl_snapshots.count()
completed_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.SEALED).count()
started_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.STARTED).count()
pending_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.QUEUED).count()
# Count URLs in the crawl (for when snapshots haven't been created yet)
urls_count = 0
if crawl.urls:
urls_count = len([u for u in crawl.urls.split('\n') if u.strip()])
elif crawl.seed and crawl.seed.uri:
# Try to get URL count from seed
if crawl.seed.uri.startswith('file:///'):
try:
from pathlib import Path
seed_file = Path(crawl.seed.uri.replace('file://', ''))
if seed_file.exists():
urls_count = len([l for l in seed_file.read_text().split('\n') if l.strip() and not l.startswith('#')])
except:
pass
else:
urls_count = 1 # Single URL seed
# Calculate crawl progress
crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
@@ -590,16 +601,24 @@ def live_progress_view(request):
# Calculate snapshot progress
snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0
# Get active extractors for this snapshot
active_extractors = [
# Get all extractors for this snapshot
# Order: started first, then queued, then completed
all_extractors = [
{
'id': str(ar.id),
'extractor': ar.extractor,
'status': ar.status,
'started': ar.start_ts.isoformat() if ar.start_ts else None,
'progress': 50,
}
for ar in snapshot_results.filter(status=ArchiveResult.StatusChoices.STARTED).order_by('-start_ts')[:5]
for ar in snapshot_results.annotate(
status_order=Case(
When(status=ArchiveResult.StatusChoices.STARTED, then=Value(0)),
When(status=ArchiveResult.StatusChoices.QUEUED, then=Value(1)),
When(status=ArchiveResult.StatusChoices.SUCCEEDED, then=Value(2)),
When(status=ArchiveResult.StatusChoices.FAILED, then=Value(3)),
default=Value(4),
output_field=IntegerField(),
)
).order_by('status_order', 'extractor')
]
active_snapshots_for_crawl.append({
@@ -612,9 +631,17 @@ def live_progress_view(request):
'completed_extractors': completed_extractors,
'failed_extractors': failed_extractors,
'pending_extractors': pending_extractors,
'active_extractors': active_extractors,
'all_extractors': all_extractors,
})
# Check if crawl can start (for debugging stuck crawls)
can_start = bool(crawl.seed and crawl.seed.uri)
seed_uri = crawl.seed.uri[:60] if crawl.seed and crawl.seed.uri else None
# Check if retry_at is in the future (would prevent worker from claiming)
retry_at_future = crawl.retry_at > timezone.now() if crawl.retry_at else False
seconds_until_retry = int((crawl.retry_at - timezone.now()).total_seconds()) if crawl.retry_at and retry_at_future else 0
active_crawls.append({
'id': str(crawl.id),
'label': str(crawl)[:60],
@@ -622,11 +649,17 @@ def live_progress_view(request):
'started': crawl.modified_at.isoformat() if crawl.modified_at else None,
'progress': crawl_progress,
'max_depth': crawl.max_depth,
'urls_count': urls_count,
'total_snapshots': total_snapshots,
'completed_snapshots': completed_snapshots,
'started_snapshots': started_snapshots,
'failed_snapshots': 0,
'pending_snapshots': pending_snapshots,
'active_snapshots': active_snapshots_for_crawl,
'can_start': can_start,
'seed_uri': seed_uri,
'retry_at_future': retry_at_future,
'seconds_until_retry': seconds_until_retry,
})
return JsonResponse({