mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 15:57:53 +10:00
Refactor ArchiveBox onto abx-dl bus runner
This commit is contained in:
@@ -723,7 +723,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
|
||||
messages.success(
|
||||
request,
|
||||
f"Queued {queued} snapshots for re-archiving. The orchestrator will process them in the background.",
|
||||
f"Queued {queued} snapshots for re-archiving. The background runner will process them.",
|
||||
)
|
||||
|
||||
|
||||
@@ -739,7 +739,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
|
||||
messages.success(
|
||||
request,
|
||||
f"Creating {queryset.count()} new fresh snapshots. The orchestrator will process them in the background.",
|
||||
f"Creating {queryset.count()} new fresh snapshots. The background runner will process them.",
|
||||
)
|
||||
|
||||
@admin.action(
|
||||
@@ -750,7 +750,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
|
||||
messages.success(
|
||||
request,
|
||||
f"Queued {queued} snapshots for full re-archive (overwriting existing). The orchestrator will process them in the background.",
|
||||
f"Queued {queued} snapshots for full re-archive (overwriting existing). The background runner will process them.",
|
||||
)
|
||||
|
||||
@admin.action(
|
||||
|
||||
@@ -3,8 +3,6 @@ __package__ = 'archivebox.core'
|
||||
from django.apps import AppConfig
|
||||
import os
|
||||
|
||||
_ORCHESTRATOR_BOOTSTRAPPED = False
|
||||
|
||||
|
||||
class CoreConfig(AppConfig):
|
||||
name = 'archivebox.core'
|
||||
@@ -35,32 +33,15 @@ class CoreConfig(AppConfig):
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _should_manage_orchestrator() -> bool:
|
||||
if os.environ.get('ARCHIVEBOX_ORCHESTRATOR_MANAGED_BY_WATCHER') == '1':
|
||||
return False
|
||||
if os.environ.get('ARCHIVEBOX_ORCHESTRATOR_PROCESS') == '1':
|
||||
return False
|
||||
def _should_prepare_runtime() -> bool:
|
||||
if os.environ.get('ARCHIVEBOX_RUNSERVER') == '1':
|
||||
if os.environ.get('ARCHIVEBOX_AUTORELOAD') == '1':
|
||||
return os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
|
||||
return True
|
||||
return False
|
||||
|
||||
argv = ' '.join(sys.argv).lower()
|
||||
if 'orchestrator' in argv:
|
||||
return False
|
||||
return 'daphne' in argv and '--reload' in sys.argv
|
||||
|
||||
if _should_manage_orchestrator():
|
||||
global _ORCHESTRATOR_BOOTSTRAPPED
|
||||
if _ORCHESTRATOR_BOOTSTRAPPED:
|
||||
return
|
||||
_ORCHESTRATOR_BOOTSTRAPPED = True
|
||||
|
||||
if _should_prepare_runtime():
|
||||
from archivebox.machine.models import Process, Machine
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
|
||||
Process.cleanup_stale_running()
|
||||
Machine.current()
|
||||
|
||||
if not Orchestrator.is_running():
|
||||
Orchestrator(exit_on_idle=False).start()
|
||||
|
||||
@@ -1821,7 +1821,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
Check if all ArchiveResults are finished.
|
||||
|
||||
Note: This is only called for observability/progress tracking.
|
||||
SnapshotWorker owns the execution and doesn't poll this.
|
||||
The shared runner owns execution and does not poll this.
|
||||
"""
|
||||
# Check if any ARs are still pending/started
|
||||
pending = self.archiveresult_set.exclude(
|
||||
@@ -2325,7 +2325,7 @@ class SnapshotMachine(BaseStateMachine):
|
||||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
"""Just mark as started - SnapshotWorker will create ARs and run hooks."""
|
||||
"""Just mark as started. The shared runner creates ArchiveResults and runs hooks."""
|
||||
self.snapshot.status = Snapshot.StatusChoices.STARTED
|
||||
self.snapshot.retry_at = None # No more polling
|
||||
self.snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
@@ -3344,8 +3344,8 @@ class ArchiveResultMachine(BaseStateMachine):
|
||||
"""
|
||||
Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot.
|
||||
|
||||
Note: In the new architecture, SnapshotWorker handles step advancement and sealing.
|
||||
This method is kept for backwards compatibility with manual CLI commands.
|
||||
Note: In the new architecture, the shared runner handles step advancement and sealing.
|
||||
This method is kept for direct model-driven edge cases.
|
||||
"""
|
||||
import sys
|
||||
|
||||
|
||||
@@ -1068,21 +1068,27 @@ class HealthCheckView(View):
|
||||
def live_progress_view(request):
|
||||
"""Simple JSON endpoint for live progress status - used by admin progress monitor."""
|
||||
try:
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from archivebox.machine.models import Process, Machine
|
||||
|
||||
# Get orchestrator status
|
||||
orchestrator_running = Orchestrator.is_running()
|
||||
total_workers = Orchestrator().get_total_worker_count() if orchestrator_running else 0
|
||||
machine = Machine.current()
|
||||
orchestrator_proc = Process.objects.filter(
|
||||
machine=machine,
|
||||
process_type=Process.TypeChoices.ORCHESTRATOR,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
).order_by('-started_at').first()
|
||||
orchestrator_running = orchestrator_proc is not None
|
||||
orchestrator_pid = orchestrator_proc.pid if orchestrator_proc else None
|
||||
total_workers = Process.objects.filter(
|
||||
machine=machine,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
process_type__in=[
|
||||
Process.TypeChoices.WORKER,
|
||||
Process.TypeChoices.HOOK,
|
||||
Process.TypeChoices.BINARY,
|
||||
],
|
||||
).count()
|
||||
|
||||
# Get model counts by status
|
||||
crawls_pending = Crawl.objects.filter(status=Crawl.StatusChoices.QUEUED).count()
|
||||
@@ -1128,43 +1134,27 @@ def live_progress_view(request):
|
||||
|
||||
# Build hierarchical active crawls with nested snapshots and archive results
|
||||
|
||||
running_workers = Process.objects.filter(
|
||||
running_processes = Process.objects.filter(
|
||||
machine=machine,
|
||||
process_type=Process.TypeChoices.WORKER,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
process_type__in=[
|
||||
Process.TypeChoices.HOOK,
|
||||
Process.TypeChoices.BINARY,
|
||||
],
|
||||
)
|
||||
crawl_worker_pids: dict[str, int] = {}
|
||||
snapshot_worker_pids: dict[str, int] = {}
|
||||
for proc in running_workers:
|
||||
crawl_process_pids: dict[str, int] = {}
|
||||
snapshot_process_pids: dict[str, int] = {}
|
||||
for proc in running_processes:
|
||||
env = proc.env or {}
|
||||
if not isinstance(env, dict):
|
||||
env = {}
|
||||
|
||||
cmd = proc.cmd or []
|
||||
if proc.worker_type == 'crawl':
|
||||
crawl_id = env.get('CRAWL_ID')
|
||||
if not crawl_id:
|
||||
for i, part in enumerate(cmd):
|
||||
if part == '--crawl-id' and i + 1 < len(cmd):
|
||||
crawl_id = cmd[i + 1]
|
||||
break
|
||||
if part.startswith('--crawl-id='):
|
||||
crawl_id = part.split('=', 1)[1]
|
||||
break
|
||||
if crawl_id:
|
||||
crawl_worker_pids[str(crawl_id)] = proc.pid
|
||||
elif proc.worker_type == 'snapshot':
|
||||
snapshot_id = env.get('SNAPSHOT_ID')
|
||||
if not snapshot_id:
|
||||
for i, part in enumerate(cmd):
|
||||
if part == '--snapshot-id' and i + 1 < len(cmd):
|
||||
snapshot_id = cmd[i + 1]
|
||||
break
|
||||
if part.startswith('--snapshot-id='):
|
||||
snapshot_id = part.split('=', 1)[1]
|
||||
break
|
||||
if snapshot_id:
|
||||
snapshot_worker_pids[str(snapshot_id)] = proc.pid
|
||||
crawl_id = env.get('CRAWL_ID')
|
||||
snapshot_id = env.get('SNAPSHOT_ID')
|
||||
if crawl_id and proc.pid:
|
||||
crawl_process_pids.setdefault(str(crawl_id), proc.pid)
|
||||
if snapshot_id and proc.pid:
|
||||
snapshot_process_pids.setdefault(str(snapshot_id), proc.pid)
|
||||
|
||||
active_crawls_qs = Crawl.objects.filter(
|
||||
status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]
|
||||
@@ -1274,7 +1264,7 @@ def live_progress_view(request):
|
||||
'failed_plugins': failed_plugins,
|
||||
'pending_plugins': pending_plugins,
|
||||
'all_plugins': all_plugins,
|
||||
'worker_pid': snapshot_worker_pids.get(str(snapshot.id)),
|
||||
'worker_pid': snapshot_process_pids.get(str(snapshot.id)),
|
||||
})
|
||||
|
||||
# Check if crawl can start (for debugging stuck crawls)
|
||||
@@ -1303,7 +1293,7 @@ def live_progress_view(request):
|
||||
'urls_preview': urls_preview,
|
||||
'retry_at_future': retry_at_future,
|
||||
'seconds_until_retry': seconds_until_retry,
|
||||
'worker_pid': crawl_worker_pids.get(str(crawl.id)),
|
||||
'worker_pid': crawl_process_pids.get(str(crawl.id)),
|
||||
})
|
||||
|
||||
return JsonResponse({
|
||||
|
||||
Reference in New Issue
Block a user