Refactor ArchiveBox onto abx-dl bus runner

This commit is contained in:
Nick Sweeting
2026-03-21 11:47:57 -07:00
parent ee9ed440d1
commit c87079aa0a
45 changed files with 1282 additions and 6396 deletions

View File

@@ -1,8 +1,7 @@
"""
Workers admin module.
The orchestrator/worker system doesn't need Django admin registration
as workers are managed via CLI commands and the orchestrator.
Background runner processes do not need Django admin registration.
"""
__package__ = 'archivebox.workers'

View File

@@ -1,20 +0,0 @@
from django.core.management.base import BaseCommand
from archivebox.workers.orchestrator import Orchestrator


class Command(BaseCommand):
    """Django management command that runs the ArchiveBox orchestrator loop."""

    help = 'Run the archivebox orchestrator'

    def add_arguments(self, parser):
        # Single opt-in flag; without it the orchestrator loops forever.
        parser.add_argument(
            '--exit-on-idle',
            action='store_true',
            default=False,
            help="Exit when all work is complete (default: run forever)",
        )

    def handle(self, *args, **kwargs):
        # Construct the orchestrator and block inside its run loop until it exits.
        idle_exit = kwargs.get('exit_on_idle', False)
        Orchestrator(exit_on_idle=idle_exit).runloop()

View File

@@ -2,7 +2,7 @@ from django.core.management.base import BaseCommand
class Command(BaseCommand):
help = "Watch the runserver autoreload PID file and restart orchestrator on reloads."
help = "Watch the runserver autoreload PID file and restart the background runner on reloads."
def add_arguments(self, parser):
parser.add_argument(
@@ -19,22 +19,24 @@ class Command(BaseCommand):
def handle(self, *args, **kwargs):
import os
import subprocess
import sys
import time
from archivebox.config.common import STORAGE_CONFIG
from archivebox.machine.models import Process, Machine
from archivebox.workers.orchestrator import Orchestrator
os.environ['ARCHIVEBOX_ORCHESTRATOR_WATCHER'] = '1'
from archivebox.config.common import STORAGE_CONFIG
from archivebox.machine.models import Machine, Process
pidfile = kwargs.get("pidfile") or os.environ.get("ARCHIVEBOX_RUNSERVER_PIDFILE")
if not pidfile:
pidfile = str(STORAGE_CONFIG.TMP_DIR / "runserver.pid")
interval = max(0.2, float(kwargs.get("interval", 1.0)))
last_pid = None
runner_proc: subprocess.Popen[bytes] | None = None
def restart_runner() -> None:
nonlocal runner_proc
def restart_orchestrator():
Process.cleanup_stale_running()
machine = Machine.current()
@@ -43,21 +45,39 @@ class Command(BaseCommand):
status=Process.StatusChoices.RUNNING,
process_type__in=[
Process.TypeChoices.ORCHESTRATOR,
Process.TypeChoices.WORKER,
Process.TypeChoices.HOOK,
Process.TypeChoices.BINARY,
],
)
for proc in running:
try:
if proc.process_type == Process.TypeChoices.HOOK:
proc.kill_tree(graceful_timeout=0.5)
else:
proc.terminate(graceful_timeout=1.0)
proc.kill_tree(graceful_timeout=0.5)
except Exception:
continue
if not Orchestrator.is_running():
Orchestrator(exit_on_idle=False).start()
if runner_proc and runner_proc.poll() is None:
try:
runner_proc.terminate()
runner_proc.wait(timeout=2.0)
except Exception:
try:
runner_proc.kill()
except Exception:
pass
runner_proc = subprocess.Popen(
[sys.executable, '-m', 'archivebox', 'run', '--daemon'],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
start_new_session=True,
)
def runner_running() -> bool:
    # True if a live runner process is recorded for this machine in the DB.
    # NOTE(review): still filters on TypeChoices.ORCHESTRATOR even after the
    # orchestrator -> runner rename — confirm `archivebox run --daemon`
    # registers its Process row under this same type.
    return Process.objects.filter(
        machine=Machine.current(),
        status=Process.StatusChoices.RUNNING,
        process_type=Process.TypeChoices.ORCHESTRATOR,
    ).exists()
while True:
try:
@@ -68,11 +88,10 @@ class Command(BaseCommand):
pid = None
if pid and pid != last_pid:
restart_orchestrator()
restart_runner()
last_pid = pid
elif not Orchestrator.is_running():
Orchestrator(exit_on_idle=False).start()
elif not runner_running():
restart_runner()
except Exception:
pass

File diff suppressed because it is too large Load Diff

View File

@@ -29,13 +29,12 @@ WORKERS_DIR_NAME = "workers"
# Global reference to supervisord process for cleanup
_supervisord_proc = None
ORCHESTRATOR_WORKER = {
"name": "worker_orchestrator",
# Use Django management command to avoid stdin/TTY ambiguity in `archivebox run`.
"command": "archivebox manage orchestrator",
RUNNER_WORKER = {
"name": "worker_runner",
"command": "archivebox run --daemon",
"autostart": "true",
"autorestart": "true",
"stdout_logfile": "logs/worker_orchestrator.log",
"stdout_logfile": "logs/worker_runner.log",
"redirect_stderr": "true",
}
@@ -515,9 +514,7 @@ def watch_worker(supervisor, daemon_name, interval=5):
def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
supervisor = get_or_create_supervisord_process(daemonize=daemonize)
bg_workers = [
ORCHESTRATOR_WORKER,
]
bg_workers = [RUNNER_WORKER]
print()
start_worker(supervisor, SERVER_WORKER(host=host, port=port))
@@ -532,7 +529,7 @@ def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
sys.stdout.write('Tailing worker logs (Ctrl+C to stop)...\n\n')
sys.stdout.flush()
tail_multiple_worker_logs(
log_files=['logs/worker_daphne.log', 'logs/worker_orchestrator.log'],
log_files=['logs/worker_daphne.log', 'logs/worker_runner.log'],
follow=True,
proc=_supervisord_proc, # Stop tailing when supervisord exits
)
@@ -551,7 +548,7 @@ def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
def start_cli_workers(watch=False):
supervisor = get_or_create_supervisord_process(daemonize=False)
start_worker(supervisor, ORCHESTRATOR_WORKER)
start_worker(supervisor, RUNNER_WORKER)
if watch:
try:
@@ -560,7 +557,7 @@ def start_cli_workers(watch=False):
_supervisord_proc.wait()
else:
# Fallback to watching worker if no proc reference
watch_worker(supervisor, ORCHESTRATOR_WORKER['name'])
watch_worker(supervisor, RUNNER_WORKER['name'])
except (KeyboardInterrupt, BrokenPipeError, IOError):
STDERR.print("\n[🛑] Got Ctrl+C, stopping gracefully...")
except SystemExit:
@@ -571,7 +568,7 @@ def start_cli_workers(watch=False):
# Ensure supervisord and all children are stopped
stop_existing_supervisord_process()
time.sleep(1.0) # Give processes time to fully terminate
return [ORCHESTRATOR_WORKER]
return [RUNNER_WORKER]
# def main(daemons):

View File

@@ -1,11 +1,11 @@
"""
Background task functions for queuing work to the orchestrator.
Background task functions for queuing work to the background runner.
These functions queue Snapshots/Crawls for processing by setting their status
to QUEUED, which the orchestrator workers will pick up and process.
to QUEUED so `archivebox run --daemon` or `archivebox server` can pick them up.
NOTE: These functions do NOT start the orchestrator - they assume it's already
running via `archivebox server` (supervisord) or will be run inline by the CLI.
NOTE: These functions do NOT start the runner. They assume it's already
running via `archivebox server` or will be run inline by the CLI.
"""
__package__ = 'archivebox.workers'
@@ -34,14 +34,12 @@ def bg_add(add_kwargs: dict) -> int:
def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int:
"""
Queue multiple snapshots for archiving via the state machine system.
This sets snapshots to 'queued' status so the orchestrator workers pick them up.
The actual archiving happens through the worker's process_item() method.
Queue multiple snapshots for archiving via the shared runner loop.
Returns the number of snapshots queued.
"""
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl
kwargs = kwargs or {}
@@ -49,11 +47,16 @@ def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int:
queued_count = 0
for snapshot in snapshots:
if hasattr(snapshot, 'id'):
# Update snapshot to queued state so workers pick it up
Snapshot.objects.filter(id=snapshot.id).update(
status=Snapshot.StatusChoices.QUEUED,
retry_at=timezone.now(),
)
crawl_id = getattr(snapshot, 'crawl_id', None)
if crawl_id:
Crawl.objects.filter(id=crawl_id).exclude(status=Crawl.StatusChoices.SEALED).update(
status=Crawl.StatusChoices.QUEUED,
retry_at=timezone.now(),
)
queued_count += 1
return queued_count
@@ -61,21 +64,24 @@ def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int:
def bg_archive_snapshot(snapshot, overwrite: bool = False, methods: list | None = None) -> int:
"""
Queue a single snapshot for archiving via the state machine system.
This sets the snapshot to 'queued' status so the orchestrator workers pick it up.
The actual archiving happens through the worker's process_item() method.
Queue a single snapshot for archiving via the shared runner loop.
Returns 1 if queued, 0 otherwise.
"""
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl
# Queue the snapshot by setting status to queued
if hasattr(snapshot, 'id'):
Snapshot.objects.filter(id=snapshot.id).update(
status=Snapshot.StatusChoices.QUEUED,
retry_at=timezone.now(),
)
crawl_id = getattr(snapshot, 'crawl_id', None)
if crawl_id:
Crawl.objects.filter(id=crawl_id).exclude(status=Crawl.StatusChoices.SEALED).update(
status=Crawl.StatusChoices.QUEUED,
retry_at=timezone.now(),
)
return 1
return 0

File diff suppressed because it is too large Load Diff