simplify entrypoints for orchestrator and workers

Nick Sweeting
2026-01-04 13:17:07 -08:00
parent 5449971777
commit 839ae744cf
13 changed files with 301 additions and 350 deletions

View File

@@ -53,8 +53,6 @@ class ArchiveBoxGroup(click.Group):
         'manage': 'archivebox.cli.archivebox_manage.main',
         # Introspection commands
         'pluginmap': 'archivebox.cli.archivebox_pluginmap.main',
-        # Worker command
-        'worker': 'archivebox.cli.archivebox_worker.main',
     }
     all_subcommands = {
         **meta_commands,
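
Note: the registry above maps subcommand names to dotted import paths so the CLI can defer importing a command's module until that command is actually invoked. A minimal sketch of how such a mapping is typically resolved, using a hypothetical LazyGroup rather than ArchiveBox's actual ArchiveBoxGroup internals:

import importlib

import rich_click as click

class LazyGroup(click.Group):
    """Resolve subcommands from 'module.path.attr' strings on first use."""
    subcommands = {
        'run': 'archivebox.cli.archivebox_run.main',  # same shape as the registry above
    }

    def get_command(self, ctx, name):
        dotted_path = self.subcommands.get(name)
        if dotted_path is None:
            return None  # unknown subcommand; click reports the error
        module_path, attr = dotted_path.rsplit('.', 1)
        module = importlib.import_module(module_path)  # imported only when invoked
        return getattr(module, attr)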

View File

@@ -127,10 +127,11 @@ def add(urls: str | list[str],
         # Background mode: just queue work and return (orchestrator via server will pick it up)
         print('[yellow]\\[*] URLs queued. Orchestrator will process them (run `archivebox server` if not already running).[/yellow]')
     else:
-        # Foreground mode: run orchestrator inline until all work is done
-        print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]')
-        orchestrator = Orchestrator(exit_on_idle=True, crawl_id=str(crawl.id))
-        orchestrator.runloop()  # Block until complete
+        # Foreground mode: run CrawlWorker inline until all work is done
+        print(f'[green]\\[*] Starting worker to process crawl...[/green]')
+        from archivebox.workers.worker import CrawlWorker
+        worker = CrawlWorker(crawl_id=str(crawl.id), worker_id=0)
+        worker.runloop()  # Block until complete
 
     # 6. Return the list of Snapshots in this crawl
     return crawl.snapshot_set.all()
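
The net effect in add(): foreground runs no longer construct a full Orchestrator; a single CrawlWorker scoped to the crawl blocks inline until it finishes. A condensed sketch of the resulting control flow (names follow the diff above; the crawl-creation steps are elided):

def process_crawl(crawl, background: bool = False):
    if background:
        return  # queued only; the orchestrator inside `archivebox server` picks it up
    from archivebox.workers.worker import CrawlWorker
    worker = CrawlWorker(crawl_id=str(crawl.id), worker_id=0)
    worker.runloop()  # blocks until the crawl and its snapshots complete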

View File

@@ -1,67 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-archivebox orchestrator [--daemon]
-
-Start the orchestrator process that manages workers.
-
-The orchestrator polls queues for each model type (Crawl, Snapshot, ArchiveResult)
-and lazily spawns worker processes when there is work to be done.
-"""
-
-__package__ = 'archivebox.cli'
-__command__ = 'archivebox orchestrator'
-
-import sys
-
-import rich_click as click
-
-from archivebox.misc.util import docstring
-
-
-def orchestrator(daemon: bool = False, watch: bool = False) -> int:
-    """
-    Start the orchestrator process.
-
-    The orchestrator:
-    1. Polls each model queue (Crawl, Snapshot, ArchiveResult)
-    2. Spawns worker processes when there is work to do
-    3. Monitors worker health and restarts failed workers
-    4. Exits when all queues are empty (unless --daemon)
-
-    Args:
-        daemon: Run forever (don't exit when idle)
-        watch: Just watch the queues without spawning workers (for debugging)
-
-    Exit codes:
-        0: All work completed successfully
-        1: Error occurred
-    """
-    from archivebox.workers.orchestrator import Orchestrator
-
-    if Orchestrator.is_running():
-        print('[yellow]Orchestrator is already running[/yellow]')
-        return 0
-
-    try:
-        orchestrator_instance = Orchestrator(exit_on_idle=not daemon)
-        orchestrator_instance.runloop()
-        return 0
-    except KeyboardInterrupt:
-        return 0
-    except Exception as e:
-        print(f'[red]Orchestrator error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
-        return 1
-
-
-@click.command()
-@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
-@click.option('--watch', '-w', is_flag=True, help="Watch queues without spawning workers")
-@docstring(orchestrator.__doc__)
-def main(daemon: bool, watch: bool):
-    """Start the ArchiveBox orchestrator process"""
-    sys.exit(orchestrator(daemon=daemon, watch=watch))
-
-
-if __name__ == '__main__':
-    main()
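
The deleted docstring doubles as the spec for what replaced it: poll each model queue, spawn workers where there is work, exit when idle unless daemonized. A hedged reconstruction of that runloop shape, with count_pending and spawn_worker_if_needed as hypothetical stand-ins for the real queue and worker APIs:

import time

def count_pending(queue_name: str) -> int:
    """Hypothetical stand-in: count queued rows for one model type."""
    return 0

def spawn_worker_if_needed(queue_name: str) -> None:
    """Hypothetical stand-in: launch a worker process for this queue."""

def runloop(exit_on_idle: bool = True, poll_interval: float = 1.0) -> None:
    while True:
        busy = False
        for queue_name in ('crawl', 'snapshot', 'archiveresult'):
            if count_pending(queue_name):
                busy = True
                spawn_worker_if_needed(queue_name)
        if not busy and exit_on_idle:
            break  # all queues drained; --daemon would keep looping
        time.sleep(poll_interval)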

View File

@@ -1,16 +1,18 @@
 #!/usr/bin/env python3
 
 """
-archivebox run [--daemon]
+archivebox run [--daemon] [--crawl-id=...] [--snapshot-id=...]
 
 Unified command for processing queued work.
 
 Modes:
 - With stdin JSONL: Process piped records, exit when complete
 - Without stdin (TTY): Run orchestrator in foreground until killed
+- --crawl-id: Run orchestrator for specific crawl only
+- --snapshot-id: Run worker for specific snapshot only (internal use)
 
 Examples:
-    # Run orchestrator in foreground (replaces `archivebox orchestrator`)
+    # Run orchestrator in foreground
     archivebox run
 
     # Run as daemon (don't exit on idle)
@@ -23,6 +25,12 @@ Examples:
 
     # Mixed types work too
     cat mixed_records.jsonl | archivebox run
+
+    # Run orchestrator for specific crawl (shows live progress for that crawl)
+    archivebox run --crawl-id=019b7e90-04d0-73ed-adec-aad9cfcd863e
+
+    # Run worker for specific snapshot (internal use by orchestrator)
+    archivebox run --snapshot-id=019b7e90-5a8e-712c-9877-2c70eebe80ad
 """
 
 __package__ = 'archivebox.cli'
@@ -187,15 +195,62 @@ def run_orchestrator(daemon: bool = False) -> int:
         return 1
 
 
+def run_snapshot_worker(snapshot_id: str) -> int:
+    """
+    Run a SnapshotWorker for a specific snapshot.
+
+    Args:
+        snapshot_id: Snapshot UUID to process
+
+    Returns exit code (0 = success, 1 = error).
+    """
+    from archivebox.workers.worker import _run_snapshot_worker
+
+    try:
+        _run_snapshot_worker(snapshot_id=snapshot_id, worker_id=0)
+        return 0
+    except KeyboardInterrupt:
+        return 0
+    except Exception as e:
+        rprint(f'[red]Worker error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
+        import traceback
+        traceback.print_exc()
+        return 1
+
+
 @click.command()
 @click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
-def main(daemon: bool):
+@click.option('--crawl-id', help="Run orchestrator for specific crawl only")
+@click.option('--snapshot-id', help="Run worker for specific snapshot only")
+def main(daemon: bool, crawl_id: str, snapshot_id: str):
     """
     Process queued work.
 
-    When stdin is piped: Process those specific records and exit.
-    When run standalone: Run orchestrator in foreground.
+    Modes:
+    - No args + stdin piped: Process piped JSONL records
+    - No args + TTY: Run orchestrator for all work
+    - --crawl-id: Run orchestrator for that crawl only
+    - --snapshot-id: Run worker for that snapshot only
     """
+    # Snapshot worker mode
+    if snapshot_id:
+        sys.exit(run_snapshot_worker(snapshot_id))
+
+    # Crawl worker mode
+    if crawl_id:
+        from archivebox.workers.worker import CrawlWorker
+        try:
+            worker = CrawlWorker(crawl_id=crawl_id, worker_id=0)
+            worker.runloop()
+            sys.exit(0)
+        except KeyboardInterrupt:
+            sys.exit(0)
+        except Exception as e:
+            rprint(f'[red]Worker error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
+            import traceback
+            traceback.print_exc()
+            sys.exit(1)
+
     # Check if stdin has data (non-TTY means piped input)
     if not sys.stdin.isatty():
         sys.exit(process_stdin_records())
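
Of the modes above, the stdin path is the least self-explanatory: when stdin is not a TTY, `archivebox run` consumes one JSON record per line and exits when the stream ends. A minimal sketch of that loop; the record schema and handler are assumptions for illustration, not the real process_stdin_records:

import json
import sys

def handle_record(record: dict) -> None:
    """Hypothetical dispatch target; real code would claim the row and run a worker."""
    print(f"processing {record.get('type')} {record.get('id')}")

def process_stdin_records() -> int:
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines in the stream
        try:
            record = json.loads(line)
        except json.JSONDecodeError as err:
            print(f'malformed JSONL line: {err}', file=sys.stderr)
            return 1
        handle_record(record)
    return 0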

View File

@@ -1,50 +0,0 @@
-#!/usr/bin/env python3
-
-__package__ = 'archivebox.cli'
-__command__ = 'archivebox worker'
-
-import sys
-
-import rich_click as click
-
-from archivebox.misc.util import docstring
-
-
-def worker(worker_type: str, daemon: bool = False, plugin: str | None = None):
-    """
-    Start a worker process to process items from the queue.
-
-    Worker types:
-    - crawl: Process Crawl objects (parse seeds, create snapshots)
-    - snapshot: Process Snapshot objects (create archive results)
-    - archiveresult: Process ArchiveResult objects (run plugins)
-
-    Workers poll the database for queued items, claim them atomically,
-    and spawn subprocess tasks to handle each item.
-    """
-    from archivebox.workers.worker import get_worker_class
-
-    WorkerClass = get_worker_class(worker_type)
-
-    # Build kwargs
-    kwargs = {'daemon': daemon}
-    if plugin and worker_type == 'archiveresult':
-        kwargs['extractor'] = plugin  # internal field still called extractor
-
-    # Create and run worker
-    worker_instance = WorkerClass(**kwargs)
-    worker_instance.runloop()
-
-
-@click.command()
-@click.argument('worker_type', type=click.Choice(['crawl', 'snapshot', 'archiveresult']))
-@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
-@click.option('--plugin', '-p', default=None, help='Filter by plugin (archiveresult only)')
-@docstring(worker.__doc__)
-def main(worker_type: str, daemon: bool, plugin: str | None):
-    """Start an ArchiveBox worker process"""
-    worker(worker_type, daemon=daemon, plugin=plugin)
-
-
-if __name__ == '__main__':
-    main()
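
The deleted docstring's "claim them atomically" is the load-bearing design constraint here: concurrent workers must never grab the same row. One standard way to get that with the Django ORM, sketched under assumed model/field names (status, created_at) rather than ArchiveBox's actual schema:

from django.db import transaction

def claim_next_batch(model, batch_size: int = 1) -> list:
    """Atomically claim up to batch_size queued rows for this worker."""
    with transaction.atomic():
        rows = list(
            model.objects
                 .select_for_update(skip_locked=True)  # skip rows locked by other workers
                 .filter(status='queued')
                 .order_by('created_at')[:batch_size]  # oldest work first
        )
        for row in rows:
            row.status = 'started'
            row.save(update_fields=['status'])
        return rows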