remove huey

Nick Sweeting
2025-12-24 23:40:18 -08:00
parent 6c769d831c
commit d95f0dc186
105 changed files with 3635 additions and 1402 deletions


@@ -34,6 +34,7 @@ from archivebox.search import query_search_index
from core.models import Snapshot
from core.forms import AddLinkForm
from crawls.models import Seed, Crawl
from archivebox.hooks import get_extractors, get_extractor_name
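# Shape sketch (assumption, not from this commit): get_extractors() is taken here to
# return hook entries ordered by their numeric filename prefix, and get_extractor_name()
# to reduce each entry to its base extractor name, roughly:
#
#   get_extractors()                                   # hypothetical values
#   -> ['10_favicon', '20_wget', '50_screenshot']
#   [get_extractor_name(e) for e in get_extractors()]
#   -> ['favicon', 'wget', 'screenshot']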
@@ -54,8 +55,10 @@ class SnapshotView(View):
@staticmethod
def render_live_index(request, snapshot):
TITLE_LOADING_MSG = 'Not yet archived...'
HIDDEN_RESULTS = ('favicon', 'headers', 'title', 'htmltotext', 'warc', 'archive_org')
# Dict of extractor -> ArchiveResult object
archiveresult_objects = {}
# Dict of extractor -> result info dict (for template compatibility)
archiveresults = {}
results = snapshot.archiveresult_set.all()
@@ -65,18 +68,21 @@ class SnapshotView(View):
abs_path = result.snapshot_dir / (embed_path or 'None')
if (result.status == 'succeeded'
and (result.extractor not in HIDDEN_RESULTS)
and embed_path
and os.access(abs_path, os.R_OK)
and abs_path.exists()):
if os.path.isdir(abs_path) and not any(abs_path.glob('*.*')):
continue
# Store the full ArchiveResult object for template tags
archiveresult_objects[result.extractor] = result
result_info = {
'name': result.extractor,
'path': embed_path,
'ts': ts_to_date_str(result.end_ts),
'size': abs_path.stat().st_size or '?',
'result': result, # Include the full object for template tags
}
archiveresults[result.extractor] = result_info
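# Entry sketch (values invented for illustration): each archiveresults item keeps the
# old template-facing fields and now also carries the ArchiveResult object itself:
#
#   archiveresults['wget'] == {
#       'name': 'wget',
#       'path': 'wget/example.com/index.html',
#       'ts': '2024-11-05 23:59',
#       'size': 48213,
#       'result': <ArchiveResult: wget (succeeded)>,
#   }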
@@ -101,11 +107,11 @@ class SnapshotView(View):
}
-# iterate through all the files in the snapshot dir and add the biggest ones to1 the result list
+# iterate through all the files in the snapshot dir and add the biggest ones to the result list
snap_dir = Path(snapshot.output_dir)
if not (os.path.isdir(snap_dir) and os.access(snap_dir, os.R_OK)):
return {}
for result_file in (*snap_dir.glob('*'), *snap_dir.glob('*/*')):
extension = result_file.suffix.lstrip('.').lower()
if result_file.is_dir() or result_file.name.startswith('.') or extension not in allowed_extensions:
@@ -121,12 +127,16 @@ class SnapshotView(View):
'path': result_file.relative_to(snap_dir),
'ts': ts_to_date_str(result_file.stat().st_mtime or 0),
'size': file_size,
'result': None, # No ArchiveResult object for filesystem-discovered files
}
-preferred_types = ('singlefile', 'screenshot', 'wget', 'dom', 'media', 'pdf', 'readability', 'mercury')
+# Get available extractors from hooks (sorted by numeric prefix for ordering)
+# Convert to base names for display ordering
+all_extractors = [get_extractor_name(e) for e in get_extractors()]
+preferred_types = tuple(all_extractors)
all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types)
-best_result = {'path': 'None'}
+best_result = {'path': 'None', 'result': None}
for result_type in preferred_types:
if result_type in archiveresults:
best_result = archiveresults[result_type]
@@ -157,6 +167,7 @@ class SnapshotView(View):
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
'best_result': best_result,
'snapshot': snapshot, # Pass the snapshot object for template tags
}
return render(template_name='core/snapshot_live.html', request=request, context=context)
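# Worked example (hypothetical values) of the sort key used in the context above:
#   all_types = ('wget', 'screenshot', 'media')
#   entries = [{'name': 'screenshot', 'size': 10}, {'name': 'wget', 'size': 99}, {'name': 'zip', 'size': 5000}]
#   sorted(entries, key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size'])
#   # -> zip (key -5000), then wget (key 0), then screenshot (key 1)
# i.e. known extractor types follow the hook ordering, and anything only found on disk
# sorts ahead of them, biggest file first.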
@@ -436,7 +447,7 @@ class AddView(UserPassesTestMixin, FormView):
def form_valid(self, form):
urls = form.cleaned_data["url"]
print(f'[+] Adding URL: {urls}')
-parser = form.cleaned_data["parser"]
+parser = form.cleaned_data.get("parser", "auto")  # default to auto-detect parser
tag = form.cleaned_data["tag"]
depth = 0 if form.cleaned_data["depth"] == "0" else 1
extractors = ','.join(form.cleaned_data["archive_methods"])
@@ -452,18 +463,19 @@ class AddView(UserPassesTestMixin, FormView):
if extractors:
input_kwargs.update({"extractors": extractors})
from archivebox.config.permissions import HOSTNAME
# 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt'
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
# 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__web_ui_add_by_user_<user_pk>.txt
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
seed = Seed.from_file(
sources_file,
-label=f'{self.request.user.username}@{HOSTNAME}{self.request.path}',
+label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
parser=parser,
tag=tag,
created_by=self.request.user.pk,
@@ -472,7 +484,7 @@ class AddView(UserPassesTestMixin, FormView):
# 'INDEX_ONLY': index_only,
# 'OVERWRITE': False,
'DEPTH': depth,
-'EXTRACTORS': parser,
+'EXTRACTORS': extractors or '',
# 'DEFAULT_PERSONA': persona or 'Default',
})
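# Value sketch (example only): with the form fields above, this config might persist as
#   {'DEPTH': 1, 'EXTRACTORS': 'wget,screenshot,singlefile'}
# since extractors is the comma-joined archive_methods selection and falls back to ''
# when no archive methods were chosen.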
# 3. create a new Crawl pointing to the Seed
@@ -490,10 +502,15 @@ class AddView(UserPassesTestMixin, FormView):
self.request,
mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
)
# if not bg:
# from workers.orchestrator import Orchestrator
# orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
# orchestrator.start()
# Start orchestrator in background to process the queued crawl
try:
from archivebox.workers.tasks import ensure_orchestrator_running
ensure_orchestrator_running()
except Exception as e:
# Orchestrator may already be running via supervisord, or fail to start
# This is not fatal - the crawl will be processed when orchestrator runs
print(f'[!] Failed to start orchestrator: {e}')
return redirect(crawl.admin_change_url)
@@ -513,6 +530,141 @@ class HealthCheckView(View):
)
import json
from django.http import JsonResponse
def live_progress_view(request):
"""Simple JSON endpoint for live progress status - used by admin progress monitor."""
try:
from workers.orchestrator import Orchestrator
from crawls.models import Crawl
from core.models import Snapshot, ArchiveResult
# Get orchestrator status
orchestrator_running = Orchestrator.is_running()
total_workers = Orchestrator().get_total_worker_count() if orchestrator_running else 0
# Get model counts by status
crawls_pending = Crawl.objects.filter(status=Crawl.StatusChoices.QUEUED).count()
crawls_started = Crawl.objects.filter(status=Crawl.StatusChoices.STARTED).count()
# Get recent crawls (last 24 hours)
from datetime import timedelta
one_day_ago = timezone.now() - timedelta(days=1)
crawls_recent = Crawl.objects.filter(created_at__gte=one_day_ago).count()
snapshots_pending = Snapshot.objects.filter(status=Snapshot.StatusChoices.QUEUED).count()
snapshots_started = Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED).count()
archiveresults_pending = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.QUEUED).count()
archiveresults_started = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.STARTED).count()
archiveresults_succeeded = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.SUCCEEDED).count()
archiveresults_failed = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.FAILED).count()
# Build hierarchical active crawls with nested snapshots and archive results
active_crawls = []
for crawl in Crawl.objects.filter(
status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]
).order_by('-modified_at')[:10]:
# Get snapshots for this crawl
crawl_snapshots = Snapshot.objects.filter(crawl=crawl)
total_snapshots = crawl_snapshots.count()
completed_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.SEALED).count()
pending_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.QUEUED).count()
# Calculate crawl progress
crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
# Get active snapshots for this crawl
active_snapshots_for_crawl = []
for snapshot in crawl_snapshots.filter(
status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
).order_by('-modified_at')[:5]:
# Get archive results for this snapshot
snapshot_results = ArchiveResult.objects.filter(snapshot=snapshot)
total_extractors = snapshot_results.count()
completed_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.SUCCEEDED).count()
failed_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.FAILED).count()
pending_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.QUEUED).count()
# Calculate snapshot progress
snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0
# Get active extractors for this snapshot
active_extractors = [
{
'id': str(ar.id),
'extractor': ar.extractor,
'status': ar.status,
'started': ar.start_ts.isoformat() if ar.start_ts else None,
'progress': 50,
}
for ar in snapshot_results.filter(status=ArchiveResult.StatusChoices.STARTED).order_by('-start_ts')[:5]
]
active_snapshots_for_crawl.append({
'id': str(snapshot.id),
'url': snapshot.url[:80],
'status': snapshot.status,
'started': snapshot.modified_at.isoformat() if snapshot.modified_at else None,
'progress': snapshot_progress,
'total_extractors': total_extractors,
'completed_extractors': completed_extractors,
'failed_extractors': failed_extractors,
'pending_extractors': pending_extractors,
'active_extractors': active_extractors,
})
active_crawls.append({
'id': str(crawl.id),
'label': str(crawl)[:60],
'status': crawl.status,
'started': crawl.modified_at.isoformat() if crawl.modified_at else None,
'progress': crawl_progress,
'max_depth': crawl.max_depth,
'total_snapshots': total_snapshots,
'completed_snapshots': completed_snapshots,
'failed_snapshots': 0,
'pending_snapshots': pending_snapshots,
'active_snapshots': active_snapshots_for_crawl,
})
return JsonResponse({
'orchestrator_running': orchestrator_running,
'total_workers': total_workers,
'crawls_pending': crawls_pending,
'crawls_started': crawls_started,
'crawls_recent': crawls_recent,
'snapshots_pending': snapshots_pending,
'snapshots_started': snapshots_started,
'archiveresults_pending': archiveresults_pending,
'archiveresults_started': archiveresults_started,
'archiveresults_succeeded': archiveresults_succeeded,
'archiveresults_failed': archiveresults_failed,
'active_crawls': active_crawls,
'server_time': timezone.now().isoformat(),
})
except Exception as e:
import traceback
return JsonResponse({
'error': str(e),
'traceback': traceback.format_exc(),
'orchestrator_running': False,
'total_workers': 0,
'crawls_pending': 0,
'crawls_started': 0,
'crawls_recent': 0,
'snapshots_pending': 0,
'snapshots_started': 0,
'archiveresults_pending': 0,
'archiveresults_started': 0,
'archiveresults_succeeded': 0,
'archiveresults_failed': 0,
'active_crawls': [],
'server_time': timezone.now().isoformat(),
}, status=500)
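# Usage sketch (the URL path here is an assumption; check how urls.py registers this view):
#
#   import requests
#   status = requests.get('http://127.0.0.1:8000/admin/live_progress/').json()
#   if status['orchestrator_running']:
#       for crawl in status['active_crawls']:
#           print(crawl['label'], f"{crawl['progress']}%",
#                 f"{crawl['completed_snapshots']}/{crawl['total_snapshots']} snapshots")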
def find_config_section(key: str) -> str:
CONFIGS = get_all_configs()