Refactor ArchiveBox onto abx-dl bus runner

2026-04-06 15:57:53 +10:00 · 2026-03-21 11:47:57 -07:00
parent ee9ed440d1
commit c87079aa0a
45 changed files with 1282 additions and 6396 deletions
--- a/archivebox/core/admin_snapshots.py
+++ b/archivebox/core/admin_snapshots.py
@@ -723,7 +723,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):

        messages.success(
            request,
-            f"Queued {queued} snapshots for re-archiving. The orchestrator will process them in the background.",
+            f"Queued {queued} snapshots for re-archiving. The background runner will process them.",
        )


@@ -739,7 +739,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):

        messages.success(
            request,
-            f"Creating {queryset.count()} new fresh snapshots. The orchestrator will process them in the background.",
+            f"Creating {queryset.count()} new fresh snapshots. The background runner will process them.",
        )

    @admin.action(
@@ -750,7 +750,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):

        messages.success(
            request,
-            f"Queued {queued} snapshots for full re-archive (overwriting existing). The orchestrator will process them in the background.",
+            f"Queued {queued} snapshots for full re-archive (overwriting existing). The background runner will process them.",
        )

    @admin.action(
--- a/archivebox/core/apps.py
+++ b/archivebox/core/apps.py
@@ -3,8 +3,6 @@ __package__ = 'archivebox.core'
 from django.apps import AppConfig
 import os

-_ORCHESTRATOR_BOOTSTRAPPED = False
-

 class CoreConfig(AppConfig):
    name = 'archivebox.core'
@@ -35,32 +33,15 @@ class CoreConfig(AppConfig):
                except Exception:
                    pass

-        def _should_manage_orchestrator() -> bool:
-            if os.environ.get('ARCHIVEBOX_ORCHESTRATOR_MANAGED_BY_WATCHER') == '1':
-                return False
-            if os.environ.get('ARCHIVEBOX_ORCHESTRATOR_PROCESS') == '1':
-                return False
+        def _should_prepare_runtime() -> bool:
            if os.environ.get('ARCHIVEBOX_RUNSERVER') == '1':
                if os.environ.get('ARCHIVEBOX_AUTORELOAD') == '1':
                    return os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
                return True
+            return False

-            argv = ' '.join(sys.argv).lower()
-            if 'orchestrator' in argv:
-                return False
-            return 'daphne' in argv and '--reload' in sys.argv
-
-        if _should_manage_orchestrator():
-            global _ORCHESTRATOR_BOOTSTRAPPED
-            if _ORCHESTRATOR_BOOTSTRAPPED:
-                return
-            _ORCHESTRATOR_BOOTSTRAPPED = True
-
+        if _should_prepare_runtime():
            from archivebox.machine.models import Process, Machine
-            from archivebox.workers.orchestrator import Orchestrator

            Process.cleanup_stale_running()
            Machine.current()
-
-            if not Orchestrator.is_running():
-                Orchestrator(exit_on_idle=False).start()
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -1821,7 +1821,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
        Check if all ArchiveResults are finished.

        Note: This is only called for observability/progress tracking.
-        SnapshotWorker owns the execution and doesn't poll this.
+        The shared runner owns execution and does not poll this.
        """
        # Check if any ARs are still pending/started
        pending = self.archiveresult_set.exclude(
@@ -2325,7 +2325,7 @@ class SnapshotMachine(BaseStateMachine):

    @started.enter
    def enter_started(self):
-        """Just mark as started - SnapshotWorker will create ARs and run hooks."""
+        """Just mark as started. The shared runner creates ArchiveResults and runs hooks."""
        self.snapshot.status = Snapshot.StatusChoices.STARTED
        self.snapshot.retry_at = None  # No more polling
        self.snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
@@ -3344,8 +3344,8 @@ class ArchiveResultMachine(BaseStateMachine):
        """
        Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot.

-        Note: In the new architecture, SnapshotWorker handles step advancement and sealing.
-        This method is kept for backwards compatibility with manual CLI commands.
+        Note: In the new architecture, the shared runner handles step advancement and sealing.
+        This method is kept for direct model-driven edge cases.
        """
        import sys

--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -1068,21 +1068,27 @@ class HealthCheckView(View):
 def live_progress_view(request):
    """Simple JSON endpoint for live progress status - used by admin progress monitor."""
    try:
-        from archivebox.workers.orchestrator import Orchestrator
        from archivebox.crawls.models import Crawl
        from archivebox.core.models import Snapshot, ArchiveResult
        from archivebox.machine.models import Process, Machine

-        # Get orchestrator status
-        orchestrator_running = Orchestrator.is_running()
-        total_workers = Orchestrator().get_total_worker_count() if orchestrator_running else 0
        machine = Machine.current()
        orchestrator_proc = Process.objects.filter(
            machine=machine,
            process_type=Process.TypeChoices.ORCHESTRATOR,
            status=Process.StatusChoices.RUNNING,
        ).order_by('-started_at').first()
+        orchestrator_running = orchestrator_proc is not None
        orchestrator_pid = orchestrator_proc.pid if orchestrator_proc else None
+        total_workers = Process.objects.filter(
+            machine=machine,
+            status=Process.StatusChoices.RUNNING,
+            process_type__in=[
+                Process.TypeChoices.WORKER,
+                Process.TypeChoices.HOOK,
+                Process.TypeChoices.BINARY,
+            ],
+        ).count()

        # Get model counts by status
        crawls_pending = Crawl.objects.filter(status=Crawl.StatusChoices.QUEUED).count()
@@ -1128,43 +1134,27 @@ def live_progress_view(request):

        # Build hierarchical active crawls with nested snapshots and archive results

-        running_workers = Process.objects.filter(
+        running_processes = Process.objects.filter(
            machine=machine,
-            process_type=Process.TypeChoices.WORKER,
            status=Process.StatusChoices.RUNNING,
+            process_type__in=[
+                Process.TypeChoices.HOOK,
+                Process.TypeChoices.BINARY,
+            ],
        )
-        crawl_worker_pids: dict[str, int] = {}
-        snapshot_worker_pids: dict[str, int] = {}
-        for proc in running_workers:
+        crawl_process_pids: dict[str, int] = {}
+        snapshot_process_pids: dict[str, int] = {}
+        for proc in running_processes:
            env = proc.env or {}
            if not isinstance(env, dict):
                env = {}

-            cmd = proc.cmd or []
-            if proc.worker_type == 'crawl':
-                crawl_id = env.get('CRAWL_ID')
-                if not crawl_id:
-                    for i, part in enumerate(cmd):
-                        if part == '--crawl-id' and i + 1 < len(cmd):
-                            crawl_id = cmd[i + 1]
-                            break
-                        if part.startswith('--crawl-id='):
-                            crawl_id = part.split('=', 1)[1]
-                            break
-                if crawl_id:
-                    crawl_worker_pids[str(crawl_id)] = proc.pid
-            elif proc.worker_type == 'snapshot':
-                snapshot_id = env.get('SNAPSHOT_ID')
-                if not snapshot_id:
-                    for i, part in enumerate(cmd):
-                        if part == '--snapshot-id' and i + 1 < len(cmd):
-                            snapshot_id = cmd[i + 1]
-                            break
-                        if part.startswith('--snapshot-id='):
-                            snapshot_id = part.split('=', 1)[1]
-                            break
-                if snapshot_id:
-                    snapshot_worker_pids[str(snapshot_id)] = proc.pid
+            crawl_id = env.get('CRAWL_ID')
+            snapshot_id = env.get('SNAPSHOT_ID')
+            if crawl_id and proc.pid:
+                crawl_process_pids.setdefault(str(crawl_id), proc.pid)
+            if snapshot_id and proc.pid:
+                snapshot_process_pids.setdefault(str(snapshot_id), proc.pid)

        active_crawls_qs = Crawl.objects.filter(
            status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]
@@ -1274,7 +1264,7 @@ def live_progress_view(request):
                    'failed_plugins': failed_plugins,
                    'pending_plugins': pending_plugins,
                    'all_plugins': all_plugins,
-                    'worker_pid': snapshot_worker_pids.get(str(snapshot.id)),
+                    'worker_pid': snapshot_process_pids.get(str(snapshot.id)),
                })

            # Check if crawl can start (for debugging stuck crawls)
@@ -1303,7 +1293,7 @@ def live_progress_view(request):
                'urls_preview': urls_preview,
                'retry_at_future': retry_at_future,
                'seconds_until_retry': seconds_until_retry,
-                'worker_pid': crawl_worker_pids.get(str(crawl.id)),
+                'worker_pid': crawl_process_pids.get(str(crawl.id)),
            })

        return JsonResponse({