remove huey

This commit is contained in:
Nick Sweeting
2025-12-24 23:40:18 -08:00
parent 6c769d831c
commit d95f0dc186
105 changed files with 3635 additions and 1402 deletions

View File

@@ -1,23 +1,13 @@
"""
Workers admin module.
The orchestrator/worker system doesn't need Django admin registration
as workers are managed via CLI commands and the orchestrator.
"""
__package__ = 'archivebox.workers'
from django.contrib.auth import get_permission_codename
from huey_monitor.apps import HueyMonitorConfig
from huey_monitor.admin import TaskModel, TaskModelAdmin, SignalInfoModel, SignalInfoModelAdmin
HueyMonitorConfig.verbose_name = 'Background Workers'
class CustomTaskModelAdmin(TaskModelAdmin):
    """Admin for huey_monitor TaskModel rows with an explicit bulk-delete action."""

    # Explicitly expose the bulk "delete selected" admin action.
    actions = ["delete_selected"]

    def has_delete_permission(self, request, obj=None):
        # Standard Django permission check: "<app_label>.delete_<model>"
        codename = get_permission_codename("delete", self.opts)
        return request.user.has_perm("%s.%s" % (self.opts.app_label, codename))
def register_admin(admin_site):
    """Register the huey_monitor Task/Signal models on the given admin site."""
    admin_site.register(TaskModel, CustomTaskModelAdmin)
    admin_site.register(SignalInfoModel, SignalInfoModelAdmin)
"""No models to register - workers are process-based, not Django models."""
pass

View File

@@ -0,0 +1,15 @@
from django.core.management.base import BaseCommand
from workers.orchestrator import Orchestrator
class Command(BaseCommand):
    """Management command that runs the ArchiveBox orchestrator runloop."""

    help = 'Run the archivebox orchestrator'

    def add_arguments(self, parser):
        # --daemon keeps the runloop alive even when all queues are empty
        parser.add_argument('--daemon', '-d', action='store_true', help="Run forever (don't exit on idle)")

    def handle(self, *args, **kwargs):
        run_forever = kwargs.get('daemon', False)
        # Without --daemon, the orchestrator shuts itself down once idle.
        Orchestrator(exit_on_idle=not run_forever).runloop()

View File

@@ -35,6 +35,7 @@ from django.utils import timezone
from rich import print
from archivebox.misc.logging_util import log_worker_event
from .worker import Worker, CrawlWorker, SnapshotWorker, ArchiveResultWorker
from .pid_utils import (
write_pid_file,
@@ -82,22 +83,39 @@ class Orchestrator:
"""Called when orchestrator starts."""
self.pid = os.getpid()
self.pid_file = write_pid_file('orchestrator', worker_id=0)
print(f'[green]👨‍✈️ {self} STARTED[/green]')
# Clean up any stale PID files from previous runs
stale_count = cleanup_stale_pid_files()
# Collect startup metadata
metadata = {
'max_workers_per_type': self.MAX_WORKERS_PER_TYPE,
'max_total_workers': self.MAX_TOTAL_WORKERS,
'poll_interval': self.POLL_INTERVAL,
}
if stale_count:
print(f'[yellow]👨‍✈️ {self} cleaned up {stale_count} stale PID files[/yellow]')
metadata['cleaned_stale_pids'] = stale_count
log_worker_event(
worker_type='Orchestrator',
event='Starting...',
indent_level=0,
pid=self.pid,
metadata=metadata,
)
def on_shutdown(self, error: BaseException | None = None) -> None:
"""Called when orchestrator shuts down."""
if self.pid_file:
remove_pid_file(self.pid_file)
if error and not isinstance(error, KeyboardInterrupt):
print(f'[red]👨‍✈️ {self} SHUTDOWN with error:[/red] {type(error).__name__}: {error}')
else:
print(f'[grey53]👨‍✈️ {self} SHUTDOWN[/grey53]')
log_worker_event(
worker_type='Orchestrator',
event='Shutting down',
indent_level=0,
pid=self.pid,
error=error if error and not isinstance(error, KeyboardInterrupt) else None,
)
def get_total_worker_count(self) -> int:
"""Get total count of running workers across all types."""
@@ -129,10 +147,17 @@ class Orchestrator:
"""Spawn a new worker process. Returns PID or None if spawn failed."""
try:
pid = WorkerClass.start(daemon=False)
print(f'[blue]👨‍✈️ {self} spawned {WorkerClass.name} worker[/blue] pid={pid}')
# Worker spawning is logged by the worker itself in on_startup()
return pid
except Exception as e:
print(f'[red]👨‍✈️ {self} failed to spawn {WorkerClass.name} worker:[/red] {e}')
log_worker_event(
worker_type='Orchestrator',
event='Failed to spawn worker',
indent_level=0,
pid=self.pid,
metadata={'worker_type': WorkerClass.name},
error=e,
)
return None
def check_queues_and_spawn_workers(self) -> dict[str, int]:
@@ -181,26 +206,13 @@ class Orchestrator:
def on_tick(self, queue_sizes: dict[str, int]) -> None:
"""Called each orchestrator tick. Override for custom behavior."""
total_queued = sum(queue_sizes.values())
total_workers = self.get_total_worker_count()
if total_queued > 0 or total_workers > 0:
# Build status line
status_parts = []
for WorkerClass in self.WORKER_TYPES:
name = WorkerClass.name
queued = queue_sizes.get(name, 0)
workers = len(WorkerClass.get_running_workers())
if queued > 0 or workers > 0:
status_parts.append(f'{name}={queued}q/{workers}w')
if status_parts:
print(f'[grey53]👨‍✈️ {self} tick:[/grey53] {" ".join(status_parts)}')
# Tick logging suppressed to reduce noise
pass
def on_idle(self) -> None:
"""Called when orchestrator is idle (no work, no workers)."""
if self.idle_count == 1:
print(f'[grey53]👨‍✈️ {self} idle, waiting for work...[/grey53]')
# Idle logging suppressed to reduce noise
pass
def should_exit(self, queue_sizes: dict[str, int]) -> bool:
"""Determine if orchestrator should exit."""
@@ -242,7 +254,12 @@ class Orchestrator:
# Check if we should exit
if self.should_exit(queue_sizes):
print(f'[green]👨‍✈️ {self} all work complete, exiting[/green]')
log_worker_event(
worker_type='Orchestrator',
event='All work complete',
indent_level=0,
pid=self.pid,
)
break
time.sleep(self.POLL_INTERVAL)
@@ -267,9 +284,14 @@ class Orchestrator:
proc = Process(target=run_orchestrator, name='orchestrator')
proc.start()
assert proc.pid is not None
print(f'[green]👨‍✈️ Orchestrator started in background[/green] pid={proc.pid}')
log_worker_event(
worker_type='Orchestrator',
event='Started in background',
indent_level=0,
pid=proc.pid,
)
return proc.pid
@classmethod

View File

@@ -26,22 +26,6 @@ CONFIG_FILE_NAME = "supervisord.conf"
PID_FILE_NAME = "supervisord.pid"
WORKERS_DIR_NAME = "workers"
# Supervisord program definition for the huey consumer that handles the
# system_tasks (scheduler) queue: 4 thread workers, health check disabled,
# stale locks flushed on start.
SCHEDULER_WORKER = {
    "name": "worker_scheduler",
    "command": "archivebox manage djangohuey --queue system_tasks -w 4 -k thread --disable-health-check --flush-locks",
    "autostart": "true",
    "autorestart": "true",
    "stdout_logfile": "logs/worker_scheduler.log",
    "redirect_stderr": "true",
}

# Supervisord program definition for the huey consumer that handles the
# commands queue: 4 thread workers, no periodic tasks, health check disabled.
COMMAND_WORKER = {
    "name": "worker_commands",
    "command": "archivebox manage djangohuey --queue commands -w 4 -k thread --no-periodic --disable-health-check",
    "autostart": "true",
    "autorestart": "true",
    "stdout_logfile": "logs/worker_commands.log",
    "redirect_stderr": "true",
}
ORCHESTRATOR_WORKER = {
"name": "worker_orchestrator",
"command": "archivebox manage orchestrator",
@@ -391,10 +375,8 @@ def watch_worker(supervisor, daemon_name, interval=5):
def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
supervisor = get_or_create_supervisord_process(daemonize=daemonize)
bg_workers = [
SCHEDULER_WORKER,
COMMAND_WORKER,
ORCHESTRATOR_WORKER,
]
@@ -422,8 +404,7 @@ def start_server_workers(host='0.0.0.0', port='8000', daemonize=False):
def start_cli_workers(watch=False):
supervisor = get_or_create_supervisord_process(daemonize=False)
start_worker(supervisor, COMMAND_WORKER)
start_worker(supervisor, ORCHESTRATOR_WORKER)
if watch:
@@ -434,13 +415,12 @@ def start_cli_workers(watch=False):
except SystemExit:
pass
except BaseException as e:
STDERR.print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping web server gracefully...")
STDERR.print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping orchestrator gracefully...")
raise
finally:
stop_worker(supervisor, COMMAND_WORKER['name'])
stop_worker(supervisor, ORCHESTRATOR_WORKER['name'])
time.sleep(0.5)
return [COMMAND_WORKER, ORCHESTRATOR_WORKER]
return [ORCHESTRATOR_WORKER]
# def main(daemons):

View File

@@ -1,89 +1,60 @@
"""
Background task functions for queuing work to the orchestrator.
These functions queue Snapshots/Crawls for processing by setting their status
to QUEUED, which the orchestrator workers will pick up and process.
"""
__package__ = 'archivebox.workers'
from functools import wraps
# from django.utils import timezone
from django.utils import timezone
from django_huey import db_task, task
from huey_monitor.models import TaskModel
from huey_monitor.tqdm import ProcessInfo
def ensure_orchestrator_running():
    """Ensure the orchestrator is running to process queued items.

    If no orchestrator process is currently running, start one in the
    background with exit_on_idle=True so it shuts itself down once all
    work queues drain.
    """
    # NOTE: the previous version also imported get_or_create_supervisord_process
    # here but never used it — the dead import has been removed.
    from .orchestrator import Orchestrator

    if not Orchestrator.is_running():
        # Start orchestrator in background; it exits on its own when idle.
        Orchestrator(exit_on_idle=True).start()
# @db_task(queue="commands", context=True, schedule=1)
# def scheduler_tick():
# print('SCHEDULER TICK', timezone.now().isoformat())
# # abx.archivebox.events.on_scheduler_runloop_start(timezone.now(), machine=Machine.objects.get_current_machine())
# # abx.archivebox.events.on_scheduler_tick_start(timezone.now(), machine=Machine.objects.get_current_machine())
# scheduled_crawls = CrawlSchedule.objects.filter(is_enabled=True)
# scheduled_crawls_due = scheduled_crawls.filter(next_run_at__lte=timezone.now())
# for scheduled_crawl in scheduled_crawls_due:
# try:
# abx.archivebox.events.on_crawl_schedule_tick(scheduled_crawl)
# except Exception as e:
# abx.archivebox.events.on_crawl_schedule_failure(timezone.now(), machine=Machine.objects.get_current_machine(), error=e, schedule=scheduled_crawl)
# # abx.archivebox.events.on_scheduler_tick_end(timezone.now(), machine=Machine.objects.get_current_machine(), tasks=scheduled_tasks_due)
def bg_add(add_kwargs: dict) -> int:
"""
Add URLs and queue them for archiving.
def db_task_with_parent(func):
    """Decorator for db_task that sets the parent task for the db_task"""
    @wraps(func)
    def inner(*args, **kwargs):
        current_task = kwargs.get('task')
        parent_id = kwargs.get('parent_task_id')
        if current_task and parent_id:
            # Link this sub-task to its parent in huey_monitor's TaskModel.
            TaskModel.objects.set_parent_task(main_task_id=parent_id, sub_task_id=current_task.id)
        return func(*args, **kwargs)
    return inner
@db_task(queue="commands", context=True)
def bg_add(add_kwargs, task=None, parent_task_id=None):
get_or_create_supervisord_process(daemonize=False)
from ..main import add
if task and parent_task_id:
TaskModel.objects.set_parent_task(main_task_id=parent_task_id, sub_task_id=task.id)
Returns the number of snapshots created.
"""
from archivebox.cli.archivebox_add import add
assert add_kwargs and add_kwargs.get("urls")
rough_url_count = add_kwargs["urls"].count("://")
process_info = ProcessInfo(task, desc="add", parent_task_id=parent_task_id, total=rough_url_count)
# When called as background task, always run in background mode
add_kwargs = add_kwargs.copy()
add_kwargs['bg'] = True
result = add(**add_kwargs)
process_info.update(n=rough_url_count)
return result
# Ensure orchestrator is running to process the new snapshots
ensure_orchestrator_running()
return len(result) if result else 0
@task(queue="commands", context=True)
def bg_archive_snapshots(snapshots, kwargs=None, task=None, parent_task_id=None):
def bg_archive_snapshots(snapshots, kwargs: dict | None = None) -> int:
"""
Queue multiple snapshots for archiving via the state machine system.
This sets snapshots to 'queued' status so the orchestrator workers pick them up.
The actual archiving happens through ArchiveResult.run().
"""
get_or_create_supervisord_process(daemonize=False)
The actual archiving happens through the worker's process_item() method.
from django.utils import timezone
Returns the number of snapshots queued.
"""
from core.models import Snapshot
if task and parent_task_id:
TaskModel.objects.set_parent_task(main_task_id=parent_task_id, sub_task_id=task.id)
assert snapshots
kwargs = kwargs or {}
rough_count = len(snapshots) if hasattr(snapshots, '__len__') else snapshots.count()
process_info = ProcessInfo(task, desc="archive_snapshots", parent_task_id=parent_task_id, total=rough_count)
# Queue snapshots by setting status to queued with immediate retry_at
queued_count = 0
for snapshot in snapshots:
@@ -95,36 +66,33 @@ def bg_archive_snapshots(snapshots, kwargs=None, task=None, parent_task_id=None)
)
queued_count += 1
process_info.update(n=queued_count)
# Ensure orchestrator is running to process the queued snapshots
if queued_count > 0:
ensure_orchestrator_running()
return queued_count
@task(queue="commands", context=True)
def bg_archive_snapshot(snapshot, overwrite=False, methods=None, task=None, parent_task_id=None):
def bg_archive_snapshot(snapshot, overwrite: bool = False, methods: list | None = None) -> int:
"""
Queue a single snapshot for archiving via the state machine system.
This sets the snapshot to 'queued' status so the orchestrator workers pick it up.
The actual archiving happens through ArchiveResult.run().
The actual archiving happens through the worker's process_item() method.
Returns 1 if queued, 0 otherwise.
"""
get_or_create_supervisord_process(daemonize=False)
from django.utils import timezone
from core.models import Snapshot
if task and parent_task_id:
TaskModel.objects.set_parent_task(main_task_id=parent_task_id, sub_task_id=task.id)
process_info = ProcessInfo(task, desc="archive_snapshot", parent_task_id=parent_task_id, total=1)
# Queue the snapshot by setting status to queued
if hasattr(snapshot, 'id'):
Snapshot.objects.filter(id=snapshot.id).update(
status=Snapshot.StatusChoices.QUEUED,
retry_at=timezone.now(),
)
process_info.update(n=1)
# Ensure orchestrator is running to process the queued snapshot
ensure_orchestrator_running()
return 1
return 0

View File

@@ -25,6 +25,7 @@ from django.conf import settings
from rich import print
from archivebox.misc.logging_util import log_worker_event
from .pid_utils import (
write_pid_file,
remove_pid_file,
@@ -126,7 +127,7 @@ class Worker:
obj.sm.tick()
return True
except Exception as e:
print(f'[red]{self} error processing {obj.pk}:[/red] {e}')
# Error will be logged in runloop's completion event
traceback.print_exc()
return False
@@ -134,7 +135,28 @@ class Worker:
"""Called when worker starts."""
self.pid = os.getpid()
self.pid_file = write_pid_file(self.name, self.worker_id)
print(f'[green]{self} STARTED[/green] pid_file={self.pid_file}')
# Determine worker type for logging
worker_type_name = self.__class__.__name__
indent_level = 1 # Default for most workers
# Adjust indent level based on worker type
if 'Snapshot' in worker_type_name:
indent_level = 2
elif 'ArchiveResult' in worker_type_name:
indent_level = 3
log_worker_event(
worker_type=worker_type_name,
event='Starting...',
indent_level=indent_level,
pid=self.pid,
worker_id=str(self.worker_id),
metadata={
'max_concurrent': self.MAX_CONCURRENT_TASKS,
'poll_interval': self.POLL_INTERVAL,
},
)
def on_shutdown(self, error: BaseException | None = None) -> None:
"""Called when worker shuts down."""
@@ -142,10 +164,23 @@ class Worker:
if self.pid_file:
remove_pid_file(self.pid_file)
if error and not isinstance(error, KeyboardInterrupt):
print(f'[red]{self} SHUTDOWN with error:[/red] {type(error).__name__}: {error}')
else:
print(f'[grey53]{self} SHUTDOWN[/grey53]')
# Determine worker type for logging
worker_type_name = self.__class__.__name__
indent_level = 1
if 'Snapshot' in worker_type_name:
indent_level = 2
elif 'ArchiveResult' in worker_type_name:
indent_level = 3
log_worker_event(
worker_type=worker_type_name,
event='Shutting down',
indent_level=indent_level,
pid=self.pid,
worker_id=str(self.worker_id),
error=error if error and not isinstance(error, KeyboardInterrupt) else None,
)
def should_exit(self) -> bool:
"""Check if worker should exit due to idle timeout."""
@@ -161,6 +196,15 @@ class Worker:
"""Main worker loop - polls queue, processes items."""
self.on_startup()
# Determine worker type for logging
worker_type_name = self.__class__.__name__
indent_level = 1
if 'Snapshot' in worker_type_name:
indent_level = 2
elif 'ArchiveResult' in worker_type_name:
indent_level = 3
try:
while True:
# Try to claim and process an item
@@ -168,25 +212,62 @@ class Worker:
if obj is not None:
self.idle_count = 0
print(f'[blue]{self} processing:[/blue] {obj.pk}')
# Build metadata for task start
start_metadata = {'task_id': str(obj.pk)}
if hasattr(obj, 'url'):
# SnapshotWorker
url = str(obj.url) if obj.url else None
else:
url = None
extractor = None
if hasattr(obj, 'extractor'):
# ArchiveResultWorker
extractor = obj.extractor
start_metadata['extractor'] = extractor
log_worker_event(
worker_type=worker_type_name,
event='Processing...',
indent_level=indent_level,
pid=self.pid,
worker_id=str(self.worker_id),
url=url,
extractor=extractor,
metadata=start_metadata,
)
start_time = time.time()
success = self.process_item(obj)
elapsed = time.time() - start_time
if success:
print(f'[green]{self} completed ({elapsed:.1f}s):[/green] {obj.pk}')
else:
print(f'[red]{self} failed ({elapsed:.1f}s):[/red] {obj.pk}')
# Build metadata for task completion
complete_metadata = {
'task_id': str(obj.pk),
'duration': elapsed,
'status': 'success' if success else 'failed',
}
if hasattr(obj, 'status'):
complete_metadata['final_status'] = str(obj.status)
log_worker_event(
worker_type=worker_type_name,
event='Completed' if success else 'Failed',
indent_level=indent_level,
pid=self.pid,
worker_id=str(self.worker_id),
url=url,
extractor=extractor,
metadata=complete_metadata,
)
else:
# No work available
# No work available - idle logging suppressed
self.idle_count += 1
if self.idle_count == 1:
print(f'[grey53]{self} idle, waiting for work...[/grey53]')
# Check if we should exit
if self.should_exit():
print(f'[grey53]{self} idle timeout reached, exiting[/grey53]')
# Exit logging suppressed - shutdown will be logged by on_shutdown()
break
time.sleep(self.POLL_INTERVAL)
@@ -293,7 +374,7 @@ class ArchiveResultWorker(Worker):
obj.sm.tick()
return True
except Exception as e:
print(f'[red]{self} error processing {obj.pk}:[/red] {e}')
# Error will be logged in runloop's completion event
traceback.print_exc()
return False