simplify entrypoints for orchestrator and workers

Nick Sweeting
2026-01-04 13:17:07 -08:00
parent 5449971777
commit 839ae744cf
13 changed files with 301 additions and 350 deletions

View File

@@ -53,8 +53,6 @@ class ArchiveBoxGroup(click.Group):
         'manage': 'archivebox.cli.archivebox_manage.main',
         # Introspection commands
         'pluginmap': 'archivebox.cli.archivebox_pluginmap.main',
-        # Worker command
-        'worker': 'archivebox.cli.archivebox_worker.main',
     }
     all_subcommands = {
         **meta_commands,
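
Note: the registry above maps subcommand names to dotted import paths so the CLI can defer importing a command's module until that command is actually invoked. A minimal sketch of how such a mapping is typically resolved, using a hypothetical LazyGroup rather than ArchiveBox's actual ArchiveBoxGroup internals:

import importlib

import rich_click as click

class LazyGroup(click.Group):
    """Resolve subcommands from 'module.path.attr' strings on first use."""
    subcommands = {
        'run': 'archivebox.cli.archivebox_run.main',  # same shape as the registry above
    }

    def get_command(self, ctx, name):
        dotted_path = self.subcommands.get(name)
        if dotted_path is None:
            return None  # unknown subcommand; click reports the error
        module_path, attr = dotted_path.rsplit('.', 1)
        module = importlib.import_module(module_path)  # imported only when invoked
        return getattr(module, attr)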

View File

@@ -127,10 +127,11 @@ def add(urls: str | list[str],
         # Background mode: just queue work and return (orchestrator via server will pick it up)
         print('[yellow]\\[*] URLs queued. Orchestrator will process them (run `archivebox server` if not already running).[/yellow]')
     else:
-        # Foreground mode: run orchestrator inline until all work is done
-        print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]')
-        orchestrator = Orchestrator(exit_on_idle=True, crawl_id=str(crawl.id))
-        orchestrator.runloop()  # Block until complete
+        # Foreground mode: run CrawlWorker inline until all work is done
+        print(f'[green]\\[*] Starting worker to process crawl...[/green]')
+        from archivebox.workers.worker import CrawlWorker
+        worker = CrawlWorker(crawl_id=str(crawl.id), worker_id=0)
+        worker.runloop()  # Block until complete
 
     # 6. Return the list of Snapshots in this crawl
     return crawl.snapshot_set.all()
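
The net effect in add(): foreground runs no longer construct a full Orchestrator; a single CrawlWorker scoped to the crawl blocks inline until it finishes. A condensed sketch of the resulting control flow (names follow the diff above; the crawl-creation steps are elided):

def process_crawl(crawl, background: bool = False):
    if background:
        return  # queued only; the orchestrator inside `archivebox server` picks it up
    from archivebox.workers.worker import CrawlWorker
    worker = CrawlWorker(crawl_id=str(crawl.id), worker_id=0)
    worker.runloop()  # blocks until the crawl and its snapshots complete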

View File

@@ -1,67 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-archivebox orchestrator [--daemon]
-
-Start the orchestrator process that manages workers.
-
-The orchestrator polls queues for each model type (Crawl, Snapshot, ArchiveResult)
-and lazily spawns worker processes when there is work to be done.
-"""
-
-__package__ = 'archivebox.cli'
-__command__ = 'archivebox orchestrator'
-
-import sys
-
-import rich_click as click
-
-from archivebox.misc.util import docstring
-
-
-def orchestrator(daemon: bool = False, watch: bool = False) -> int:
-    """
-    Start the orchestrator process.
-
-    The orchestrator:
-    1. Polls each model queue (Crawl, Snapshot, ArchiveResult)
-    2. Spawns worker processes when there is work to do
-    3. Monitors worker health and restarts failed workers
-    4. Exits when all queues are empty (unless --daemon)
-
-    Args:
-        daemon: Run forever (don't exit when idle)
-        watch: Just watch the queues without spawning workers (for debugging)
-
-    Exit codes:
-        0: All work completed successfully
-        1: Error occurred
-    """
-    from archivebox.workers.orchestrator import Orchestrator
-
-    if Orchestrator.is_running():
-        print('[yellow]Orchestrator is already running[/yellow]')
-        return 0
-
-    try:
-        orchestrator_instance = Orchestrator(exit_on_idle=not daemon)
-        orchestrator_instance.runloop()
-        return 0
-    except KeyboardInterrupt:
-        return 0
-    except Exception as e:
-        print(f'[red]Orchestrator error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
-        return 1
-
-
-@click.command()
-@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
-@click.option('--watch', '-w', is_flag=True, help="Watch queues without spawning workers")
-@docstring(orchestrator.__doc__)
-def main(daemon: bool, watch: bool):
-    """Start the ArchiveBox orchestrator process"""
-    sys.exit(orchestrator(daemon=daemon, watch=watch))
-
-
-if __name__ == '__main__':
-    main()
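
The deleted docstring doubles as the spec for what replaced it: poll each model queue, spawn workers where there is work, exit when idle unless daemonized. A hedged reconstruction of that runloop shape, with count_pending and spawn_worker_if_needed as hypothetical stand-ins for the real queue and worker APIs:

import time

def count_pending(queue_name: str) -> int:
    """Hypothetical stand-in: count queued rows for one model type."""
    return 0

def spawn_worker_if_needed(queue_name: str) -> None:
    """Hypothetical stand-in: launch a worker process for this queue."""

def runloop(exit_on_idle: bool = True, poll_interval: float = 1.0) -> None:
    while True:
        busy = False
        for queue_name in ('crawl', 'snapshot', 'archiveresult'):
            if count_pending(queue_name):
                busy = True
                spawn_worker_if_needed(queue_name)
        if not busy and exit_on_idle:
            break  # all queues drained; --daemon would keep looping
        time.sleep(poll_interval)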

View File

@@ -1,16 +1,18 @@
 #!/usr/bin/env python3
 
 """
-archivebox run [--daemon]
+archivebox run [--daemon] [--crawl-id=...] [--snapshot-id=...]
 
 Unified command for processing queued work.
 
 Modes:
 - With stdin JSONL: Process piped records, exit when complete
 - Without stdin (TTY): Run orchestrator in foreground until killed
+- --crawl-id: Run orchestrator for specific crawl only
+- --snapshot-id: Run worker for specific snapshot only (internal use)
 
 Examples:
-    # Run orchestrator in foreground (replaces `archivebox orchestrator`)
+    # Run orchestrator in foreground
     archivebox run
 
     # Run as daemon (don't exit on idle)
@@ -23,6 +25,12 @@ Examples:
 
     # Mixed types work too
     cat mixed_records.jsonl | archivebox run
+
+    # Run orchestrator for specific crawl (shows live progress for that crawl)
+    archivebox run --crawl-id=019b7e90-04d0-73ed-adec-aad9cfcd863e
+
+    # Run worker for specific snapshot (internal use by orchestrator)
+    archivebox run --snapshot-id=019b7e90-5a8e-712c-9877-2c70eebe80ad
 """
 
 __package__ = 'archivebox.cli'
@@ -187,15 +195,62 @@ def run_orchestrator(daemon: bool = False) -> int:
         return 1
 
 
+def run_snapshot_worker(snapshot_id: str) -> int:
+    """
+    Run a SnapshotWorker for a specific snapshot.
+
+    Args:
+        snapshot_id: Snapshot UUID to process
+
+    Returns exit code (0 = success, 1 = error).
+    """
+    from archivebox.workers.worker import _run_snapshot_worker
+
+    try:
+        _run_snapshot_worker(snapshot_id=snapshot_id, worker_id=0)
+        return 0
+    except KeyboardInterrupt:
+        return 0
+    except Exception as e:
+        rprint(f'[red]Worker error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
+        import traceback
+        traceback.print_exc()
+        return 1
+
+
 @click.command()
 @click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
-def main(daemon: bool):
+@click.option('--crawl-id', help="Run orchestrator for specific crawl only")
+@click.option('--snapshot-id', help="Run worker for specific snapshot only")
+def main(daemon: bool, crawl_id: str, snapshot_id: str):
     """
     Process queued work.
 
-    When stdin is piped: Process those specific records and exit.
-    When run standalone: Run orchestrator in foreground.
+    Modes:
+    - No args + stdin piped: Process piped JSONL records
+    - No args + TTY: Run orchestrator for all work
+    - --crawl-id: Run orchestrator for that crawl only
+    - --snapshot-id: Run worker for that snapshot only
     """
+    # Snapshot worker mode
+    if snapshot_id:
+        sys.exit(run_snapshot_worker(snapshot_id))
+
+    # Crawl worker mode
+    if crawl_id:
+        from archivebox.workers.worker import CrawlWorker
+        try:
+            worker = CrawlWorker(crawl_id=crawl_id, worker_id=0)
+            worker.runloop()
+            sys.exit(0)
+        except KeyboardInterrupt:
+            sys.exit(0)
+        except Exception as e:
+            rprint(f'[red]Worker error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
+            import traceback
+            traceback.print_exc()
+            sys.exit(1)
+
     # Check if stdin has data (non-TTY means piped input)
     if not sys.stdin.isatty():
         sys.exit(process_stdin_records())
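
Of the modes above, the stdin path is the least self-explanatory: when stdin is not a TTY, `archivebox run` consumes one JSON record per line and exits when the stream ends. A minimal sketch of that loop; the record schema and handler are assumptions for illustration, not the real process_stdin_records:

import json
import sys

def handle_record(record: dict) -> None:
    """Hypothetical dispatch target; real code would claim the row and run a worker."""
    print(f"processing {record.get('type')} {record.get('id')}")

def process_stdin_records() -> int:
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines in the stream
        try:
            record = json.loads(line)
        except json.JSONDecodeError as err:
            print(f'malformed JSONL line: {err}', file=sys.stderr)
            return 1
        handle_record(record)
    return 0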

View File

@@ -1,50 +0,0 @@
-#!/usr/bin/env python3
-
-__package__ = 'archivebox.cli'
-__command__ = 'archivebox worker'
-
-import sys
-
-import rich_click as click
-
-from archivebox.misc.util import docstring
-
-
-def worker(worker_type: str, daemon: bool = False, plugin: str | None = None):
-    """
-    Start a worker process to process items from the queue.
-
-    Worker types:
-    - crawl: Process Crawl objects (parse seeds, create snapshots)
-    - snapshot: Process Snapshot objects (create archive results)
-    - archiveresult: Process ArchiveResult objects (run plugins)
-
-    Workers poll the database for queued items, claim them atomically,
-    and spawn subprocess tasks to handle each item.
-    """
-    from archivebox.workers.worker import get_worker_class
-
-    WorkerClass = get_worker_class(worker_type)
-
-    # Build kwargs
-    kwargs = {'daemon': daemon}
-    if plugin and worker_type == 'archiveresult':
-        kwargs['extractor'] = plugin  # internal field still called extractor
-
-    # Create and run worker
-    worker_instance = WorkerClass(**kwargs)
-    worker_instance.runloop()
-
-
-@click.command()
-@click.argument('worker_type', type=click.Choice(['crawl', 'snapshot', 'archiveresult']))
-@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
-@click.option('--plugin', '-p', default=None, help='Filter by plugin (archiveresult only)')
-@docstring(worker.__doc__)
-def main(worker_type: str, daemon: bool, plugin: str | None):
-    """Start an ArchiveBox worker process"""
-    worker(worker_type, daemon=daemon, plugin=plugin)
-
-
-if __name__ == '__main__':
-    main()
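
The deleted docstring's "claim them atomically" is the load-bearing design constraint here: concurrent workers must never grab the same row. One standard way to get that with the Django ORM, sketched under assumed model/field names (status, created_at) rather than ArchiveBox's actual schema:

from django.db import transaction

def claim_next_batch(model, batch_size: int = 1) -> list:
    """Atomically claim up to batch_size queued rows for this worker."""
    with transaction.atomic():
        rows = list(
            model.objects
                 .select_for_update(skip_locked=True)  # skip rows locked by other workers
                 .filter(status='queued')
                 .order_by('created_at')[:batch_size]  # oldest work first
        )
        for row in rows:
            row.status = 'started'
            row.save(update_fields=['status'])
        return rows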