Refactor ArchiveBox onto abx-dl bus runner

This commit is contained in:
Nick Sweeting
2026-03-21 11:47:57 -07:00
parent ee9ed440d1
commit c87079aa0a
45 changed files with 1282 additions and 6396 deletions

View File

@@ -723,7 +723,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
messages.success(
request,
f"Queued {queued} snapshots for re-archiving. The orchestrator will process them in the background.",
f"Queued {queued} snapshots for re-archiving. The background runner will process them.",
)
@@ -739,7 +739,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
messages.success(
request,
f"Creating {queryset.count()} new fresh snapshots. The orchestrator will process them in the background.",
f"Creating {queryset.count()} new fresh snapshots. The background runner will process them.",
)
@admin.action(
@@ -750,7 +750,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
messages.success(
request,
f"Queued {queued} snapshots for full re-archive (overwriting existing). The orchestrator will process them in the background.",
f"Queued {queued} snapshots for full re-archive (overwriting existing). The background runner will process them.",
)
@admin.action(

View File

@@ -3,8 +3,6 @@ __package__ = 'archivebox.core'
from django.apps import AppConfig
import os
_ORCHESTRATOR_BOOTSTRAPPED = False
class CoreConfig(AppConfig):
name = 'archivebox.core'
@@ -35,32 +33,15 @@ class CoreConfig(AppConfig):
except Exception:
pass
def _should_manage_orchestrator() -> bool:
if os.environ.get('ARCHIVEBOX_ORCHESTRATOR_MANAGED_BY_WATCHER') == '1':
return False
if os.environ.get('ARCHIVEBOX_ORCHESTRATOR_PROCESS') == '1':
return False
def _should_prepare_runtime() -> bool:
if os.environ.get('ARCHIVEBOX_RUNSERVER') == '1':
if os.environ.get('ARCHIVEBOX_AUTORELOAD') == '1':
return os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
return True
return False
argv = ' '.join(sys.argv).lower()
if 'orchestrator' in argv:
return False
return 'daphne' in argv and '--reload' in sys.argv
if _should_manage_orchestrator():
global _ORCHESTRATOR_BOOTSTRAPPED
if _ORCHESTRATOR_BOOTSTRAPPED:
return
_ORCHESTRATOR_BOOTSTRAPPED = True
if _should_prepare_runtime():
from archivebox.machine.models import Process, Machine
from archivebox.workers.orchestrator import Orchestrator
Process.cleanup_stale_running()
Machine.current()
if not Orchestrator.is_running():
Orchestrator(exit_on_idle=False).start()

View File

@@ -1821,7 +1821,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
Check if all ArchiveResults are finished.
Note: This is only called for observability/progress tracking.
SnapshotWorker owns the execution and doesn't poll this.
The shared runner owns execution and does not poll this.
"""
# Check if any ARs are still pending/started
pending = self.archiveresult_set.exclude(
@@ -2325,7 +2325,7 @@ class SnapshotMachine(BaseStateMachine):
@started.enter
def enter_started(self):
"""Just mark as started - SnapshotWorker will create ARs and run hooks."""
"""Just mark as started. The shared runner creates ArchiveResults and runs hooks."""
self.snapshot.status = Snapshot.StatusChoices.STARTED
self.snapshot.retry_at = None # No more polling
self.snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
@@ -3344,8 +3344,8 @@ class ArchiveResultMachine(BaseStateMachine):
"""
Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot.
Note: In the new architecture, SnapshotWorker handles step advancement and sealing.
This method is kept for backwards compatibility with manual CLI commands.
Note: In the new architecture, the shared runner handles step advancement and sealing.
This method is kept for direct model-driven edge cases.
"""
import sys

View File

@@ -1068,21 +1068,27 @@ class HealthCheckView(View):
def live_progress_view(request):
"""Simple JSON endpoint for live progress status - used by admin progress monitor."""
try:
from archivebox.workers.orchestrator import Orchestrator
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.machine.models import Process, Machine
# Get orchestrator status
orchestrator_running = Orchestrator.is_running()
total_workers = Orchestrator().get_total_worker_count() if orchestrator_running else 0
machine = Machine.current()
orchestrator_proc = Process.objects.filter(
machine=machine,
process_type=Process.TypeChoices.ORCHESTRATOR,
status=Process.StatusChoices.RUNNING,
).order_by('-started_at').first()
orchestrator_running = orchestrator_proc is not None
orchestrator_pid = orchestrator_proc.pid if orchestrator_proc else None
total_workers = Process.objects.filter(
machine=machine,
status=Process.StatusChoices.RUNNING,
process_type__in=[
Process.TypeChoices.WORKER,
Process.TypeChoices.HOOK,
Process.TypeChoices.BINARY,
],
).count()
# Get model counts by status
crawls_pending = Crawl.objects.filter(status=Crawl.StatusChoices.QUEUED).count()
@@ -1128,43 +1134,27 @@ def live_progress_view(request):
# Build hierarchical active crawls with nested snapshots and archive results
running_workers = Process.objects.filter(
running_processes = Process.objects.filter(
machine=machine,
process_type=Process.TypeChoices.WORKER,
status=Process.StatusChoices.RUNNING,
process_type__in=[
Process.TypeChoices.HOOK,
Process.TypeChoices.BINARY,
],
)
crawl_worker_pids: dict[str, int] = {}
snapshot_worker_pids: dict[str, int] = {}
for proc in running_workers:
crawl_process_pids: dict[str, int] = {}
snapshot_process_pids: dict[str, int] = {}
for proc in running_processes:
env = proc.env or {}
if not isinstance(env, dict):
env = {}
cmd = proc.cmd or []
if proc.worker_type == 'crawl':
crawl_id = env.get('CRAWL_ID')
if not crawl_id:
for i, part in enumerate(cmd):
if part == '--crawl-id' and i + 1 < len(cmd):
crawl_id = cmd[i + 1]
break
if part.startswith('--crawl-id='):
crawl_id = part.split('=', 1)[1]
break
if crawl_id:
crawl_worker_pids[str(crawl_id)] = proc.pid
elif proc.worker_type == 'snapshot':
snapshot_id = env.get('SNAPSHOT_ID')
if not snapshot_id:
for i, part in enumerate(cmd):
if part == '--snapshot-id' and i + 1 < len(cmd):
snapshot_id = cmd[i + 1]
break
if part.startswith('--snapshot-id='):
snapshot_id = part.split('=', 1)[1]
break
if snapshot_id:
snapshot_worker_pids[str(snapshot_id)] = proc.pid
crawl_id = env.get('CRAWL_ID')
snapshot_id = env.get('SNAPSHOT_ID')
if crawl_id and proc.pid:
crawl_process_pids.setdefault(str(crawl_id), proc.pid)
if snapshot_id and proc.pid:
snapshot_process_pids.setdefault(str(snapshot_id), proc.pid)
active_crawls_qs = Crawl.objects.filter(
status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]
@@ -1274,7 +1264,7 @@ def live_progress_view(request):
'failed_plugins': failed_plugins,
'pending_plugins': pending_plugins,
'all_plugins': all_plugins,
'worker_pid': snapshot_worker_pids.get(str(snapshot.id)),
'worker_pid': snapshot_process_pids.get(str(snapshot.id)),
})
# Check if crawl can start (for debugging stuck crawls)
@@ -1303,7 +1293,7 @@ def live_progress_view(request):
'urls_preview': urls_preview,
'retry_at_future': retry_at_future,
'seconds_until_retry': seconds_until_retry,
'worker_pid': crawl_worker_pids.get(str(crawl.id)),
'worker_pid': crawl_process_pids.get(str(crawl.id)),
})
return JsonResponse({