mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
simplify entrypoints for orchestrator and workers
This commit is contained in:
@@ -53,8 +53,6 @@ class ArchiveBoxGroup(click.Group):
|
||||
'manage': 'archivebox.cli.archivebox_manage.main',
|
||||
# Introspection commands
|
||||
'pluginmap': 'archivebox.cli.archivebox_pluginmap.main',
|
||||
# Worker command
|
||||
'worker': 'archivebox.cli.archivebox_worker.main',
|
||||
}
|
||||
all_subcommands = {
|
||||
**meta_commands,
|
||||
|
||||
@@ -127,10 +127,11 @@ def add(urls: str | list[str],
|
||||
# Background mode: just queue work and return (orchestrator via server will pick it up)
|
||||
print('[yellow]\\[*] URLs queued. Orchestrator will process them (run `archivebox server` if not already running).[/yellow]')
|
||||
else:
|
||||
# Foreground mode: run orchestrator inline until all work is done
|
||||
print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]')
|
||||
orchestrator = Orchestrator(exit_on_idle=True, crawl_id=str(crawl.id))
|
||||
orchestrator.runloop() # Block until complete
|
||||
# Foreground mode: run CrawlWorker inline until all work is done
|
||||
print(f'[green]\\[*] Starting worker to process crawl...[/green]')
|
||||
from archivebox.workers.worker import CrawlWorker
|
||||
worker = CrawlWorker(crawl_id=str(crawl.id), worker_id=0)
|
||||
worker.runloop() # Block until complete
|
||||
|
||||
# 6. Return the list of Snapshots in this crawl
|
||||
return crawl.snapshot_set.all()
|
||||
|
||||
@@ -1,67 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox orchestrator [--daemon]
|
||||
|
||||
Start the orchestrator process that manages workers.
|
||||
|
||||
The orchestrator polls queues for each model type (Crawl, Snapshot, ArchiveResult)
|
||||
and lazily spawns worker processes when there is work to be done.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox orchestrator'
|
||||
|
||||
import sys
|
||||
|
||||
import rich_click as click
|
||||
|
||||
from archivebox.misc.util import docstring
|
||||
|
||||
|
||||
def orchestrator(daemon: bool = False, watch: bool = False) -> int:
    """
    Start the orchestrator process.

    The orchestrator:
    1. Polls each model queue (Crawl, Snapshot, ArchiveResult)
    2. Spawns worker processes when there is work to do
    3. Monitors worker health and restarts failed workers
    4. Exits when all queues are empty (unless --daemon)

    Args:
        daemon: Run forever (don't exit when idle)
        watch: Just watch the queues without spawning workers (for debugging)

    Exit codes:
        0: All work completed successfully
        1: Error occurred
    """
    # NOTE: the docstring above is reused as CLI help text via
    # @docstring(orchestrator.__doc__), so it is kept word-for-word.
    # NOTE(review): `watch` is accepted but never read in this function.

    # Imported lazily, inside the function, rather than at module import time.
    from archivebox.workers.orchestrator import Orchestrator

    # Nothing to start if an orchestrator is already running; report success.
    if Orchestrator.is_running():
        print('[yellow]Orchestrator is already running[/yellow]')
        return 0

    exit_code = 0
    try:
        # exit_on_idle is the inverse of --daemon; runloop() blocks until done.
        Orchestrator(exit_on_idle=not daemon).runloop()
    except KeyboardInterrupt:
        # Ctrl-C is treated as a normal shutdown, not an error.
        pass
    except Exception as err:
        print(f'[red]Orchestrator error: {type(err).__name__}: {err}[/red]', file=sys.stderr)
        exit_code = 1
    return exit_code
|
||||
|
||||
|
||||
@click.command()
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
@click.option('--watch', '-w', is_flag=True, help="Watch queues without spawning workers")
@docstring(orchestrator.__doc__)
def main(daemon: bool, watch: bool):
    """Start the ArchiveBox orchestrator process"""
    # Forward the CLI flags unchanged and use the returned int as the
    # process exit status.
    exit_code = orchestrator(daemon=daemon, watch=watch)
    sys.exit(exit_code)


# Allow running this module directly as a script.
if __name__ == '__main__':
    main()
|
||||
@@ -1,16 +1,18 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
archivebox run [--daemon]
|
||||
archivebox run [--daemon] [--crawl-id=...] [--snapshot-id=...]
|
||||
|
||||
Unified command for processing queued work.
|
||||
|
||||
Modes:
|
||||
- With stdin JSONL: Process piped records, exit when complete
|
||||
- Without stdin (TTY): Run orchestrator in foreground until killed
|
||||
- --crawl-id: Run orchestrator for specific crawl only
|
||||
- --snapshot-id: Run worker for specific snapshot only (internal use)
|
||||
|
||||
Examples:
|
||||
# Run orchestrator in foreground (replaces `archivebox orchestrator`)
|
||||
# Run orchestrator in foreground
|
||||
archivebox run
|
||||
|
||||
# Run as daemon (don't exit on idle)
|
||||
@@ -23,6 +25,12 @@ Examples:
|
||||
|
||||
# Mixed types work too
|
||||
cat mixed_records.jsonl | archivebox run
|
||||
|
||||
# Run orchestrator for specific crawl (shows live progress for that crawl)
|
||||
archivebox run --crawl-id=019b7e90-04d0-73ed-adec-aad9cfcd863e
|
||||
|
||||
# Run worker for specific snapshot (internal use by orchestrator)
|
||||
archivebox run --snapshot-id=019b7e90-5a8e-712c-9877-2c70eebe80ad
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
@@ -187,15 +195,62 @@ def run_orchestrator(daemon: bool = False) -> int:
|
||||
return 1
|
||||
|
||||
|
||||
def run_snapshot_worker(snapshot_id: str) -> int:
    """
    Process one snapshot in the current process and report an exit code.

    Delegates to ``archivebox.workers.worker._run_snapshot_worker`` with
    ``worker_id=0``.

    Args:
        snapshot_id: Snapshot UUID to process

    Returns:
        0 when the worker call returns or is interrupted by
        KeyboardInterrupt; 1 when any other exception is raised (the error
        message and traceback are printed first).
    """
    import traceback

    # Imported lazily, inside the function, rather than at module import time.
    from archivebox.workers.worker import _run_snapshot_worker

    exit_code = 0
    try:
        _run_snapshot_worker(snapshot_id=snapshot_id, worker_id=0)
    except KeyboardInterrupt:
        # Ctrl-C is treated as a normal shutdown, not an error.
        pass
    except Exception as err:
        rprint(f'[red]Worker error: {type(err).__name__}: {err}[/red]', file=sys.stderr)
        traceback.print_exc()
        exit_code = 1
    return exit_code
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
|
||||
def main(daemon: bool):
|
||||
@click.option('--crawl-id', help="Run orchestrator for specific crawl only")
|
||||
@click.option('--snapshot-id', help="Run worker for specific snapshot only")
|
||||
def main(daemon: bool, crawl_id: str, snapshot_id: str):
|
||||
"""
|
||||
Process queued work.
|
||||
|
||||
When stdin is piped: Process those specific records and exit.
|
||||
When run standalone: Run orchestrator in foreground.
|
||||
Modes:
|
||||
- No args + stdin piped: Process piped JSONL records
|
||||
- No args + TTY: Run orchestrator for all work
|
||||
- --crawl-id: Run orchestrator for that crawl only
|
||||
- --snapshot-id: Run worker for that snapshot only
|
||||
"""
|
||||
# Snapshot worker mode
|
||||
if snapshot_id:
|
||||
sys.exit(run_snapshot_worker(snapshot_id))
|
||||
|
||||
# Crawl worker mode
|
||||
if crawl_id:
|
||||
from archivebox.workers.worker import CrawlWorker
|
||||
try:
|
||||
worker = CrawlWorker(crawl_id=crawl_id, worker_id=0)
|
||||
worker.runloop()
|
||||
sys.exit(0)
|
||||
except KeyboardInterrupt:
|
||||
sys.exit(0)
|
||||
except Exception as e:
|
||||
rprint(f'[red]Worker error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
sys.exit(1)
|
||||
|
||||
# Check if stdin has data (non-TTY means piped input)
|
||||
if not sys.stdin.isatty():
|
||||
sys.exit(process_stdin_records())
|
||||
|
||||
@@ -1,50 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox worker'
|
||||
|
||||
import sys
|
||||
|
||||
import rich_click as click
|
||||
|
||||
from archivebox.misc.util import docstring
|
||||
|
||||
|
||||
def worker(worker_type: str, daemon: bool = False, plugin: str | None = None):
    """
    Start a worker process to process items from the queue.

    Worker types:
        - crawl: Process Crawl objects (parse seeds, create snapshots)
        - snapshot: Process Snapshot objects (create archive results)
        - archiveresult: Process ArchiveResult objects (run plugins)

    Workers poll the database for queued items, claim them atomically,
    and spawn subprocess tasks to handle each item.
    """
    # NOTE: the docstring above is reused as CLI help text via
    # @docstring(worker.__doc__), so it is kept word-for-word.

    # Imported lazily, inside the function, rather than at module import time.
    from archivebox.workers.worker import get_worker_class

    worker_cls = get_worker_class(worker_type)

    # Constructor arguments: `daemon` is always passed; the plugin filter is
    # only forwarded for archiveresult workers, under the internal field
    # name `extractor`.
    init_kwargs = {'daemon': daemon}
    if worker_type == 'archiveresult' and plugin:
        init_kwargs['extractor'] = plugin

    # Instantiate the selected worker class and run its loop.
    worker_cls(**init_kwargs).runloop()
|
||||
|
||||
|
||||
@click.command()
@click.argument('worker_type', type=click.Choice(['crawl', 'snapshot', 'archiveresult']))
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
@click.option('--plugin', '-p', default=None, help='Filter by plugin (archiveresult only)')
@docstring(worker.__doc__)
def main(worker_type: str, daemon: bool, plugin: str | None):
    """Start an ArchiveBox worker process"""
    # Thin CLI wrapper: pass the parsed arguments straight through.
    worker(worker_type=worker_type, daemon=daemon, plugin=plugin)


# Allow running this module directly as a script.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user