#!/usr/bin/env python3
"""
archivebox run [--daemon] [--crawl-id=...] [--snapshot-id=...] [--binary-id=...]

Unified command for processing queued work on the shared abx-dl bus.

Modes:
- With stdin JSONL: Process piped records, exit when complete
- Without stdin (TTY): Run the background runner in foreground until killed
- --crawl-id: Run the crawl runner for a specific crawl only
- --snapshot-id: Run a specific snapshot through its parent crawl
- --binary-id: Emit a BinaryEvent for a specific Binary row

Examples:
    # Run the background runner in foreground
    archivebox run

    # Run as daemon (don't exit on idle)
    archivebox run --daemon

    # Process specific records (pipe any JSONL type, exits when done)
    archivebox snapshot list --status=queued | archivebox run
    archivebox archiveresult list --status=failed | archivebox run
    archivebox crawl list --status=queued | archivebox run

    # Mixed types work too
    cat mixed_records.jsonl | archivebox run

    # Run the crawl runner for a specific crawl
    archivebox run --crawl-id=019b7e90-04d0-73ed-adec-aad9cfcd863e

    # Run one snapshot from an existing crawl
    archivebox run --snapshot-id=019b7e90-5a8e-712c-9877-2c70eebe80ad

    # Run one queued binary install directly on the bus
    archivebox run --binary-id=019b7e90-5a8e-712c-9877-2c70eebe80ad
"""

__package__ = "archivebox.cli"
__command__ = "archivebox run"

import sys
from collections import defaultdict

import rich_click as click
from rich import print as rprint


def process_stdin_records() -> int:
    """
    Process JSONL records from stdin.

    Create-or-update behavior:
    - Records WITHOUT id: Create via Model.from_json(), then queue
    - Records WITH id: Lookup existing, re-queue for processing

    Outputs JSONL of all processed records (for chaining).
    Handles any record type: Crawl, Snapshot, ArchiveResult.
    Auto-cascades: Crawl -> Snapshots -> ArchiveResults.

    Returns exit code (0 = success, 1 = error).
    """
    # Imports are deferred to call time so Django setup happens lazily.
    from django.utils import timezone

    from archivebox.misc.jsonl import (
        read_stdin,
        write_record,
        TYPE_CRAWL,
        TYPE_SNAPSHOT,
        TYPE_ARCHIVERESULT,
        TYPE_BINARY,
    )
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.core.models import Snapshot, ArchiveResult
    from archivebox.crawls.models import Crawl
    from archivebox.machine.models import Binary
    from archivebox.services.runner import run_binary, run_crawl

    records = list(read_stdin())
    is_tty = sys.stdout.isatty()

    if not records:
        return 0  # Nothing to process

    created_by_id = get_or_create_system_user_pk()
    queued_count = 0
    output_records = []

    # Work accumulated while scanning records, dispatched after the loop:
    # crawls to run in full, per-crawl snapshot subsets, per-crawl plugin
    # subsets, crawls that should run every plugin, and binaries to install.
    full_crawl_ids: set[str] = set()
    snapshot_ids_by_crawl: dict[str, set[str]] = defaultdict(set)
    plugin_names_by_crawl: dict[str, set[str]] = defaultdict(set)
    run_all_plugins_for_crawl: set[str] = set()
    binary_ids: list[str] = []

    for record in records:
        record_type = record.get("type", "")
        record_id = record.get("id")

        try:
            if record_type == TYPE_CRAWL:
                if record_id:
                    # Existing crawl - re-queue (fall back to create if the id is unknown)
                    try:
                        crawl = Crawl.objects.get(id=record_id)
                    except Crawl.DoesNotExist:
                        crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
                else:
                    # New crawl - create it
                    crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})

                if crawl:
                    crawl.retry_at = timezone.now()
                    if crawl.status not in [Crawl.StatusChoices.SEALED]:
                        crawl.status = Crawl.StatusChoices.QUEUED
                    crawl.save()
                    full_crawl_ids.add(str(crawl.id))
                    run_all_plugins_for_crawl.add(str(crawl.id))
                    output_records.append(crawl.to_json())
                    queued_count += 1

            elif record_type == TYPE_SNAPSHOT or (record.get("url") and not record_type):
                # Bare records with a url but no type are treated as snapshots.
                if record_id:
                    # Existing snapshot - re-queue (fall back to create if the id is unknown)
                    try:
                        snapshot = Snapshot.objects.get(id=record_id)
                    except Snapshot.DoesNotExist:
                        snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id})
                else:
                    # New snapshot - create it
                    snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id})

                if snapshot:
                    snapshot.retry_at = timezone.now()
                    if snapshot.status not in [Snapshot.StatusChoices.SEALED]:
                        snapshot.status = Snapshot.StatusChoices.QUEUED
                    snapshot.save()

                    # Also wake up the parent crawl so the runner picks it up.
                    crawl = snapshot.crawl
                    crawl.retry_at = timezone.now()
                    if crawl.status != Crawl.StatusChoices.STARTED:
                        crawl.status = Crawl.StatusChoices.QUEUED
                    crawl.save(update_fields=["status", "retry_at", "modified_at"])

                    crawl_id = str(snapshot.crawl_id)
                    snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id))
                    run_all_plugins_for_crawl.add(crawl_id)
                    output_records.append(snapshot.to_json())
                    queued_count += 1

            elif record_type == TYPE_ARCHIVERESULT:
                if record_id:
                    # Existing archiveresult - re-queue
                    try:
                        archiveresult = ArchiveResult.objects.get(id=record_id)
                    except ArchiveResult.DoesNotExist:
                        archiveresult = None
                else:
                    archiveresult = None

                snapshot_id = record.get("snapshot_id")
                plugin_name = record.get("plugin")
                snapshot = None

                if archiveresult:
                    # Only retryable terminal states get reset before re-running.
                    if archiveresult.status in [
                        ArchiveResult.StatusChoices.FAILED,
                        ArchiveResult.StatusChoices.SKIPPED,
                        ArchiveResult.StatusChoices.NORESULTS,
                        ArchiveResult.StatusChoices.BACKOFF,
                    ]:
                        archiveresult.reset_for_retry()
                    snapshot = archiveresult.snapshot
                    plugin_name = plugin_name or archiveresult.plugin
                elif snapshot_id:
                    try:
                        snapshot = Snapshot.objects.get(id=snapshot_id)
                    except Snapshot.DoesNotExist:
                        snapshot = None

                if snapshot:
                    snapshot.retry_at = timezone.now()
                    if snapshot.status != Snapshot.StatusChoices.STARTED:
                        snapshot.status = Snapshot.StatusChoices.QUEUED
                    snapshot.save(update_fields=["status", "retry_at", "modified_at"])

                    # Wake up the parent crawl as well.
                    crawl = snapshot.crawl
                    crawl.retry_at = timezone.now()
                    if crawl.status != Crawl.StatusChoices.STARTED:
                        crawl.status = Crawl.StatusChoices.QUEUED
                    crawl.save(update_fields=["status", "retry_at", "modified_at"])

                    crawl_id = str(snapshot.crawl_id)
                    snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id))
                    if plugin_name:
                        plugin_names_by_crawl[crawl_id].add(str(plugin_name))

                    output_records.append(record if not archiveresult else archiveresult.to_json())
                    queued_count += 1

            elif record_type == TYPE_BINARY:
                if record_id:
                    try:
                        binary = Binary.objects.get(id=record_id)
                    except Binary.DoesNotExist:
                        binary = Binary.from_json(record)
                else:
                    binary = Binary.from_json(record)

                if binary:
                    binary.retry_at = timezone.now()
                    if binary.status != Binary.StatusChoices.INSTALLED:
                        binary.status = Binary.StatusChoices.QUEUED
                    binary.save()
                    binary_ids.append(str(binary.id))
                    output_records.append(binary.to_json())
                    queued_count += 1

            else:
                # Unknown type - pass through
                output_records.append(record)

        except Exception as e:
            # Best-effort: a bad record is reported but never aborts the batch.
            rprint(f"[yellow]Error processing record: {e}[/yellow]", file=sys.stderr)
            continue

    # Output all processed records (for chaining)
    if not is_tty:
        for rec in output_records:
            write_record(rec)

    if queued_count == 0:
        rprint("[yellow]No records to process[/yellow]", file=sys.stderr)
        return 0

    rprint(f"[blue]Processing {queued_count} records...[/blue]", file=sys.stderr)

    for binary_id in binary_ids:
        run_binary(binary_id)

    # Run each touched crawl once. Full-crawl records trump snapshot subsets;
    # plugin filtering only applies when no record asked for all plugins.
    targeted_crawl_ids = full_crawl_ids | set(snapshot_ids_by_crawl)
    if targeted_crawl_ids:
        for crawl_id in sorted(targeted_crawl_ids):
            run_crawl(
                crawl_id,
                snapshot_ids=None if crawl_id in full_crawl_ids else sorted(snapshot_ids_by_crawl[crawl_id]),
                selected_plugins=None if crawl_id in run_all_plugins_for_crawl else sorted(plugin_names_by_crawl[crawl_id]),
            )

    return 0


def run_runner(daemon: bool = False) -> int:
    """
    Run the background runner loop.

    Args:
        daemon: Run forever (don't exit when idle)

    Returns exit code (0 = success, 1 = error).
    """
    from django.utils import timezone

    from archivebox.machine.models import Machine, Process
    from archivebox.services.runner import (
        recover_orphaned_crawls,
        recover_orphaned_snapshots,
        run_pending_crawls,
    )

    # Clean up leftovers from previous runs before starting new work.
    Process.cleanup_stale_running()
    Process.cleanup_orphaned_workers()
    recover_orphaned_snapshots()
    recover_orphaned_crawls()

    # Register this host/process as the orchestrator.
    Machine.current()
    current = Process.current()
    if current.process_type != Process.TypeChoices.ORCHESTRATOR:
        current.process_type = Process.TypeChoices.ORCHESTRATOR
        current.save(update_fields=["process_type", "modified_at"])

    try:
        run_pending_crawls(daemon=daemon)
        return 0
    except KeyboardInterrupt:
        return 0
    except Exception as e:
        rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
        return 1
    finally:
        # Always mark this process row as exited, however we got here.
        current.refresh_from_db()
        if current.status != Process.StatusChoices.EXITED:
            current.status = Process.StatusChoices.EXITED
            current.ended_at = current.ended_at or timezone.now()
            current.save(update_fields=["status", "ended_at", "modified_at"])


@click.command()
@click.option("--daemon", "-d", is_flag=True, help="Run forever (don't exit on idle)")
@click.option("--crawl-id", help="Run the crawl runner for a specific crawl only")
@click.option("--snapshot-id", help="Run one snapshot through its crawl")
@click.option("--binary-id", help="Run one queued binary install directly on the bus")
def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str):
    """
    Process queued work.

    Modes:
    - No args + stdin piped: Process piped JSONL records
    - No args + TTY: Run the crawl runner for all work
    - --crawl-id: Run the crawl runner for that crawl only
    - --snapshot-id: Run one snapshot through its crawl only
    - --binary-id: Run one queued binary install directly on the bus
    """
    if snapshot_id:
        sys.exit(run_snapshot_worker(snapshot_id))

    if binary_id:
        try:
            from archivebox.services.runner import run_binary

            run_binary(binary_id)
            sys.exit(0)
        except KeyboardInterrupt:
            sys.exit(0)
        except Exception as e:
            rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
            import traceback

            traceback.print_exc()
            sys.exit(1)

    if crawl_id:
        try:
            from archivebox.services.runner import run_crawl

            run_crawl(crawl_id)
            sys.exit(0)
        except KeyboardInterrupt:
            sys.exit(0)
        except Exception as e:
            rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
            import traceback

            traceback.print_exc()
            sys.exit(1)

    if daemon:
        # Drain any piped records first, then stay resident as the runner.
        if not sys.stdin.isatty():
            exit_code = process_stdin_records()
            if exit_code != 0:
                sys.exit(exit_code)
        sys.exit(run_runner(daemon=True))

    if not sys.stdin.isatty():
        sys.exit(process_stdin_records())
    else:
        sys.exit(run_runner(daemon=daemon))


def run_snapshot_worker(snapshot_id: str) -> int:
    """Run a single snapshot through its parent crawl; return an exit code."""
    from archivebox.core.models import Snapshot
    from archivebox.services.runner import run_crawl

    try:
        snapshot = Snapshot.objects.select_related("crawl").get(id=snapshot_id)
        run_crawl(str(snapshot.crawl_id), snapshot_ids=[str(snapshot.id)])
        return 0
    except KeyboardInterrupt:
        return 0
    except Exception as e:
        rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
        import traceback

        traceback.print_exc()
        return 1


if __name__ == "__main__":
    main()