mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Refactor ArchiveBox onto abx-dl bus runner
This commit is contained in:
@@ -1,8 +1,7 @@
|
||||
"""
|
||||
Workers admin module.
|
||||
|
||||
The orchestrator/worker system doesn't need Django admin registration
|
||||
as workers are managed via CLI commands and the orchestrator.
|
||||
Background runner processes do not need Django admin registration.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.workers'
|
||||
|
||||
@@ -1,20 +0,0 @@
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = 'Run the archivebox orchestrator'
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument(
|
||||
'--exit-on-idle',
|
||||
action='store_true',
|
||||
default=False,
|
||||
help="Exit when all work is complete (default: run forever)"
|
||||
)
|
||||
|
||||
def handle(self, *args, **kwargs):
|
||||
exit_on_idle = kwargs.get('exit_on_idle', False)
|
||||
orchestrator = Orchestrator(exit_on_idle=exit_on_idle)
|
||||
orchestrator.runloop()
|
||||
@@ -2,7 +2,7 @@ from django.core.management.base import BaseCommand
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
help = "Watch the runserver autoreload PID file and restart orchestrator on reloads."
|
||||
help = "Watch the runserver autoreload PID file and restart the background runner on reloads."
|
||||
|
||||
def add_arguments(self, parser):
|
||||
parser.add_argument(
|
||||
@@ -19,22 +19,24 @@ class Command(BaseCommand):
|
||||
|
||||
def handle(self, *args, **kwargs):
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
from archivebox.machine.models import Process, Machine
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
|
||||
os.environ['ARCHIVEBOX_ORCHESTRATOR_WATCHER'] = '1'
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
from archivebox.machine.models import Machine, Process
|
||||
|
||||
pidfile = kwargs.get("pidfile") or os.environ.get("ARCHIVEBOX_RUNSERVER_PIDFILE")
|
||||
if not pidfile:
|
||||
pidfile = str(STORAGE_CONFIG.TMP_DIR / "runserver.pid")
|
||||
|
||||
interval = max(0.2, float(kwargs.get("interval", 1.0)))
|
||||
|
||||
last_pid = None
|
||||
runner_proc: subprocess.Popen[bytes] | None = None
|
||||
|
||||
def restart_runner() -> None:
|
||||
nonlocal runner_proc
|
||||
|
||||
def restart_orchestrator():
|
||||
Process.cleanup_stale_running()
|
||||
machine = Machine.current()
|
||||
|
||||
@@ -43,21 +45,39 @@ class Command(BaseCommand):
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
process_type__in=[
|
||||
Process.TypeChoices.ORCHESTRATOR,
|
||||
Process.TypeChoices.WORKER,
|
||||
Process.TypeChoices.HOOK,
|
||||
Process.TypeChoices.BINARY,
|
||||
],
|
||||
)
|
||||
for proc in running:
|
||||
try:
|
||||
if proc.process_type == Process.TypeChoices.HOOK:
|
||||
proc.kill_tree(graceful_timeout=0.5)
|
||||
else:
|
||||
proc.terminate(graceful_timeout=1.0)
|
||||
proc.kill_tree(graceful_timeout=0.5)
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
if not Orchestrator.is_running():
|
||||
Orchestrator(exit_on_idle=False).start()
|
||||
if runner_proc and runner_proc.poll() is None:
|
||||
try:
|
||||
runner_proc.terminate()
|
||||
runner_proc.wait(timeout=2.0)
|
||||
except Exception:
|
||||
try:
|
||||
runner_proc.kill()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
runner_proc = subprocess.Popen(
|
||||
[sys.executable, '-m', 'archivebox', 'run', '--daemon'],
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
start_new_session=True,
|
||||
)
|
||||
|
||||
def runner_running() -> bool:
|
||||
return Process.objects.filter(
|
||||
machine=Machine.current(),
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
process_type=Process.TypeChoices.ORCHESTRATOR,
|
||||
).exists()
|
||||
|
||||
while True:
|
||||
try:
|
||||
@@ -68,11 +88,10 @@ class Command(BaseCommand):
|
||||
pid = None
|
||||
|
||||
if pid and pid != last_pid:
|
||||
restart_orchestrator()
|
||||
restart_runner()
|
||||
last_pid = pid
|
||||
elif not Orchestrator.is_running():
|
||||
Orchestrator(exit_on_idle=False).start()
|
||||
|
||||
elif not runner_running():
|
||||
restart_runner()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -29,13 +29,12 @@ WORKERS_DIR_NAME = "workers"
|
||||
# Global reference to supervisord process for cleanup
|
||||
_supervisord_proc = None
|
||||
|
||||
ORCHESTRATOR_WORKER = {
|
||||
"name": "worker_orchestrator",
|
||||
# Use Django management command to avoid stdin/TTY ambiguity in `archivebox run`.
|
||||
"command": "archivebox manage orchestrator",
|
||||
RUNNER_WORKER = {
|
||||
"name": "worker_runner",
|
||||
"command": "archivebox run --daemon",
|
||||
"autostart": "true",
|
||||
"autorestart": "true",
|
||||
"stdout_logfile": "logs/worker_orchestrator.log",
|
||||
"stdout_logfile": "logs/worker_runner.log",
|
||||
"redirect_stderr": "true",
|
||||
}
|
||||
|
||||
@@ -515,9 +514,7 @@ def watch_worker(supervisor, daemon_name, interval=5):
|
||||
def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
|
||||
supervisor = get_or_create_supervisord_process(daemonize=daemonize)
|
||||
|
||||
bg_workers = [
|
||||
ORCHESTRATOR_WORKER,
|
||||
]
|
||||
bg_workers = [RUNNER_WORKER]
|
||||
|
||||
print()
|
||||
start_worker(supervisor, SERVER_WORKER(host=host, port=port))
|
||||
@@ -532,7 +529,7 @@ def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
|
||||
sys.stdout.write('Tailing worker logs (Ctrl+C to stop)...\n\n')
|
||||
sys.stdout.flush()
|
||||
tail_multiple_worker_logs(
|
||||
log_files=['logs/worker_daphne.log', 'logs/worker_orchestrator.log'],
|
||||
log_files=['logs/worker_daphne.log', 'logs/worker_runner.log'],
|
||||
follow=True,
|
||||
proc=_supervisord_proc, # Stop tailing when supervisord exits
|
||||
)
|
||||
@@ -551,7 +548,7 @@ def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
|
||||
def start_cli_workers(watch=False):
|
||||
supervisor = get_or_create_supervisord_process(daemonize=False)
|
||||
|
||||
start_worker(supervisor, ORCHESTRATOR_WORKER)
|
||||
start_worker(supervisor, RUNNER_WORKER)
|
||||
|
||||
if watch:
|
||||
try:
|
||||
@@ -560,7 +557,7 @@ def start_cli_workers(watch=False):
|
||||
_supervisord_proc.wait()
|
||||
else:
|
||||
# Fallback to watching worker if no proc reference
|
||||
watch_worker(supervisor, ORCHESTRATOR_WORKER['name'])
|
||||
watch_worker(supervisor, RUNNER_WORKER['name'])
|
||||
except (KeyboardInterrupt, BrokenPipeError, IOError):
|
||||
STDERR.print("\n[🛑] Got Ctrl+C, stopping gracefully...")
|
||||
except SystemExit:
|
||||
@@ -571,7 +568,7 @@ def start_cli_workers(watch=False):
|
||||
# Ensure supervisord and all children are stopped
|
||||
stop_existing_supervisord_process()
|
||||
time.sleep(1.0) # Give processes time to fully terminate
|
||||
return [ORCHESTRATOR_WORKER]
|
||||
return [RUNNER_WORKER]
|
||||
|
||||
|
||||
# def main(daemons):
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
"""
|
||||
Background task functions for queuing work to the orchestrator.
|
||||
Background task functions for queuing work to the background runner.
|
||||
|
||||
These functions queue Snapshots/Crawls for processing by setting their status
|
||||
to QUEUED, which the orchestrator workers will pick up and process.
|
||||
to QUEUED so `archivebox run --daemon` or `archivebox server` can pick them up.
|
||||
|
||||
NOTE: These functions do NOT start the orchestrator - they assume it's already
|
||||
running via `archivebox server` (supervisord) or will be run inline by the CLI.
|
||||
NOTE: These functions do NOT start the runner. They assume it's already
|
||||
running via `archivebox server` or will be run inline by the CLI.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.workers'
|
||||
@@ -34,14 +34,12 @@ def bg_add(add_kwargs: dict) -> int:
|
||||
|
||||
def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int:
|
||||
"""
|
||||
Queue multiple snapshots for archiving via the state machine system.
|
||||
|
||||
This sets snapshots to 'queued' status so the orchestrator workers pick them up.
|
||||
The actual archiving happens through the worker's process_item() method.
|
||||
Queue multiple snapshots for archiving via the shared runner loop.
|
||||
|
||||
Returns the number of snapshots queued.
|
||||
"""
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
kwargs = kwargs or {}
|
||||
|
||||
@@ -49,11 +47,16 @@ def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int:
|
||||
queued_count = 0
|
||||
for snapshot in snapshots:
|
||||
if hasattr(snapshot, 'id'):
|
||||
# Update snapshot to queued state so workers pick it up
|
||||
Snapshot.objects.filter(id=snapshot.id).update(
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
retry_at=timezone.now(),
|
||||
)
|
||||
crawl_id = getattr(snapshot, 'crawl_id', None)
|
||||
if crawl_id:
|
||||
Crawl.objects.filter(id=crawl_id).exclude(status=Crawl.StatusChoices.SEALED).update(
|
||||
status=Crawl.StatusChoices.QUEUED,
|
||||
retry_at=timezone.now(),
|
||||
)
|
||||
queued_count += 1
|
||||
|
||||
return queued_count
|
||||
@@ -61,21 +64,24 @@ def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int:
|
||||
|
||||
def bg_archive_snapshot(snapshot, overwrite: bool = False, methods: list | None = None) -> int:
|
||||
"""
|
||||
Queue a single snapshot for archiving via the state machine system.
|
||||
|
||||
This sets the snapshot to 'queued' status so the orchestrator workers pick it up.
|
||||
The actual archiving happens through the worker's process_item() method.
|
||||
Queue a single snapshot for archiving via the shared runner loop.
|
||||
|
||||
Returns 1 if queued, 0 otherwise.
|
||||
"""
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
# Queue the snapshot by setting status to queued
|
||||
if hasattr(snapshot, 'id'):
|
||||
Snapshot.objects.filter(id=snapshot.id).update(
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
retry_at=timezone.now(),
|
||||
)
|
||||
crawl_id = getattr(snapshot, 'crawl_id', None)
|
||||
if crawl_id:
|
||||
Crawl.objects.filter(id=crawl_id).exclude(status=Crawl.StatusChoices.SEALED).update(
|
||||
status=Crawl.StatusChoices.QUEUED,
|
||||
retry_at=timezone.now(),
|
||||
)
|
||||
return 1
|
||||
|
||||
return 0
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user