Refactor ArchiveBox onto abx-dl bus runner

This commit is contained in:
Nick Sweeting
2026-03-21 11:47:57 -07:00
parent ee9ed440d1
commit c87079aa0a
45 changed files with 1282 additions and 6396 deletions

View File

@@ -1,8 +1,7 @@
"""
Workers admin module.
The orchestrator/worker system doesn't need Django admin registration
as workers are managed via CLI commands and the orchestrator.
Background runner processes do not need Django admin registration.
"""
__package__ = 'archivebox.workers'

View File

@@ -1,20 +0,0 @@
from django.core.management.base import BaseCommand
from archivebox.workers.orchestrator import Orchestrator


class Command(BaseCommand):
    """Django management command that runs the ArchiveBox orchestrator loop."""

    help = 'Run the archivebox orchestrator'

    def add_arguments(self, parser):
        # Single opt-in flag; without it the orchestrator loops forever.
        parser.add_argument(
            '--exit-on-idle',
            action='store_true',
            default=False,
            help="Exit when all work is complete (default: run forever)",
        )

    def handle(self, *args, **kwargs):
        # Construct the orchestrator and block inside its run loop until it exits.
        idle_exit = kwargs.get('exit_on_idle', False)
        Orchestrator(exit_on_idle=idle_exit).runloop()

View File

@@ -2,7 +2,7 @@ from django.core.management.base import BaseCommand
class Command(BaseCommand):
help = "Watch the runserver autoreload PID file and restart orchestrator on reloads."
help = "Watch the runserver autoreload PID file and restart the background runner on reloads."
def add_arguments(self, parser):
parser.add_argument(
@@ -19,22 +19,24 @@ class Command(BaseCommand):
def handle(self, *args, **kwargs):
import os
import subprocess
import sys
import time
from archivebox.config.common import STORAGE_CONFIG
from archivebox.machine.models import Process, Machine
from archivebox.workers.orchestrator import Orchestrator
os.environ['ARCHIVEBOX_ORCHESTRATOR_WATCHER'] = '1'
from archivebox.config.common import STORAGE_CONFIG
from archivebox.machine.models import Machine, Process
pidfile = kwargs.get("pidfile") or os.environ.get("ARCHIVEBOX_RUNSERVER_PIDFILE")
if not pidfile:
pidfile = str(STORAGE_CONFIG.TMP_DIR / "runserver.pid")
interval = max(0.2, float(kwargs.get("interval", 1.0)))
last_pid = None
runner_proc: subprocess.Popen[bytes] | None = None
def restart_runner() -> None:
nonlocal runner_proc
def restart_orchestrator():
Process.cleanup_stale_running()
machine = Machine.current()
@@ -43,21 +45,39 @@ class Command(BaseCommand):
status=Process.StatusChoices.RUNNING,
process_type__in=[
Process.TypeChoices.ORCHESTRATOR,
Process.TypeChoices.WORKER,
Process.TypeChoices.HOOK,
Process.TypeChoices.BINARY,
],
)
for proc in running:
try:
if proc.process_type == Process.TypeChoices.HOOK:
proc.kill_tree(graceful_timeout=0.5)
else:
proc.terminate(graceful_timeout=1.0)
proc.kill_tree(graceful_timeout=0.5)
except Exception:
continue
if not Orchestrator.is_running():
Orchestrator(exit_on_idle=False).start()
if runner_proc and runner_proc.poll() is None:
try:
runner_proc.terminate()
runner_proc.wait(timeout=2.0)
except Exception:
try:
runner_proc.kill()
except Exception:
pass
runner_proc = subprocess.Popen(
[sys.executable, '-m', 'archivebox', 'run', '--daemon'],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
start_new_session=True,
)
def runner_running() -> bool:
    # True if a live runner process is recorded for this machine in the DB.
    # NOTE(review): still filters on TypeChoices.ORCHESTRATOR even after the
    # orchestrator -> runner rename — confirm `archivebox run --daemon`
    # registers its Process row under this same type.
    return Process.objects.filter(
        machine=Machine.current(),
        status=Process.StatusChoices.RUNNING,
        process_type=Process.TypeChoices.ORCHESTRATOR,
    ).exists()
while True:
try:
@@ -68,11 +88,10 @@ class Command(BaseCommand):
pid = None
if pid and pid != last_pid:
restart_orchestrator()
restart_runner()
last_pid = pid
elif not Orchestrator.is_running():
Orchestrator(exit_on_idle=False).start()
elif not runner_running():
restart_runner()
except Exception:
pass

File diff suppressed because it is too large Load Diff

View File

@@ -29,13 +29,12 @@ WORKERS_DIR_NAME = "workers"
# Global reference to supervisord process for cleanup
_supervisord_proc = None
ORCHESTRATOR_WORKER = {
"name": "worker_orchestrator",
# Use Django management command to avoid stdin/TTY ambiguity in `archivebox run`.
"command": "archivebox manage orchestrator",
RUNNER_WORKER = {
"name": "worker_runner",
"command": "archivebox run --daemon",
"autostart": "true",
"autorestart": "true",
"stdout_logfile": "logs/worker_orchestrator.log",
"stdout_logfile": "logs/worker_runner.log",
"redirect_stderr": "true",
}
@@ -515,9 +514,7 @@ def watch_worker(supervisor, daemon_name, interval=5):
def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
supervisor = get_or_create_supervisord_process(daemonize=daemonize)
bg_workers = [
ORCHESTRATOR_WORKER,
]
bg_workers = [RUNNER_WORKER]
print()
start_worker(supervisor, SERVER_WORKER(host=host, port=port))
@@ -532,7 +529,7 @@ def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
sys.stdout.write('Tailing worker logs (Ctrl+C to stop)...\n\n')
sys.stdout.flush()
tail_multiple_worker_logs(
log_files=['logs/worker_daphne.log', 'logs/worker_orchestrator.log'],
log_files=['logs/worker_daphne.log', 'logs/worker_runner.log'],
follow=True,
proc=_supervisord_proc, # Stop tailing when supervisord exits
)
@@ -551,7 +548,7 @@ def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
def start_cli_workers(watch=False):
supervisor = get_or_create_supervisord_process(daemonize=False)
start_worker(supervisor, ORCHESTRATOR_WORKER)
start_worker(supervisor, RUNNER_WORKER)
if watch:
try:
@@ -560,7 +557,7 @@ def start_cli_workers(watch=False):
_supervisord_proc.wait()
else:
# Fallback to watching worker if no proc reference
watch_worker(supervisor, ORCHESTRATOR_WORKER['name'])
watch_worker(supervisor, RUNNER_WORKER['name'])
except (KeyboardInterrupt, BrokenPipeError, IOError):
STDERR.print("\n[🛑] Got Ctrl+C, stopping gracefully...")
except SystemExit:
@@ -571,7 +568,7 @@ def start_cli_workers(watch=False):
# Ensure supervisord and all children are stopped
stop_existing_supervisord_process()
time.sleep(1.0) # Give processes time to fully terminate
return [ORCHESTRATOR_WORKER]
return [RUNNER_WORKER]
# def main(daemons):

View File

@@ -1,11 +1,11 @@
"""
Background task functions for queuing work to the orchestrator.
Background task functions for queuing work to the background runner.
These functions queue Snapshots/Crawls for processing by setting their status
to QUEUED, which the orchestrator workers will pick up and process.
to QUEUED so `archivebox run --daemon` or `archivebox server` can pick them up.
NOTE: These functions do NOT start the orchestrator - they assume it's already
running via `archivebox server` (supervisord) or will be run inline by the CLI.
NOTE: These functions do NOT start the runner. They assume it's already
running via `archivebox server` or will be run inline by the CLI.
"""
__package__ = 'archivebox.workers'
@@ -34,14 +34,12 @@ def bg_add(add_kwargs: dict) -> int:
def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int:
"""
Queue multiple snapshots for archiving via the state machine system.
This sets snapshots to 'queued' status so the orchestrator workers pick them up.
The actual archiving happens through the worker's process_item() method.
Queue multiple snapshots for archiving via the shared runner loop.
Returns the number of snapshots queued.
"""
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl
kwargs = kwargs or {}
@@ -49,11 +47,16 @@ def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int:
queued_count = 0
for snapshot in snapshots:
if hasattr(snapshot, 'id'):
# Update snapshot to queued state so workers pick it up
Snapshot.objects.filter(id=snapshot.id).update(
status=Snapshot.StatusChoices.QUEUED,
retry_at=timezone.now(),
)
crawl_id = getattr(snapshot, 'crawl_id', None)
if crawl_id:
Crawl.objects.filter(id=crawl_id).exclude(status=Crawl.StatusChoices.SEALED).update(
status=Crawl.StatusChoices.QUEUED,
retry_at=timezone.now(),
)
queued_count += 1
return queued_count
@@ -61,21 +64,24 @@ def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int:
def bg_archive_snapshot(snapshot, overwrite: bool = False, methods: list | None = None) -> int:
"""
Queue a single snapshot for archiving via the state machine system.
This sets the snapshot to 'queued' status so the orchestrator workers pick it up.
The actual archiving happens through the worker's process_item() method.
Queue a single snapshot for archiving via the shared runner loop.
Returns 1 if queued, 0 otherwise.
"""
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl
# Queue the snapshot by setting status to queued
if hasattr(snapshot, 'id'):
Snapshot.objects.filter(id=snapshot.id).update(
status=Snapshot.StatusChoices.QUEUED,
retry_at=timezone.now(),
)
crawl_id = getattr(snapshot, 'crawl_id', None)
if crawl_id:
Crawl.objects.filter(id=crawl_id).exclude(status=Crawl.StatusChoices.SEALED).update(
status=Crawl.StatusChoices.QUEUED,
retry_at=timezone.now(),
)
return 1
return 0

File diff suppressed because it is too large Load Diff