ArchiveBox/archivebox/cli/archivebox_run.py

#!/usr/bin/env python3

"""
archivebox run [--daemon] [--crawl-id=...] [--snapshot-id=...] [--binary-id=...]

Unified command for processing queued work on the shared abx-dl bus.

Modes:
    - With stdin JSONL: Process piped records, then exit (with --daemon, keep running as the runner afterwards)
    - Without stdin (TTY): Run the background runner in the foreground; it exits when idle unless --daemon is given
    - --crawl-id: Run the crawl runner for a specific crawl only
    - --snapshot-id: Run a specific snapshot through its parent crawl
    - --binary-id: Emit a BinaryEvent for a specific Binary row

Examples:
    # Run the background runner in foreground
    archivebox run

    # Run as daemon (don't exit on idle)
    archivebox run --daemon

    # Process specific records (pipe any JSONL type, exits when done)
    archivebox snapshot list --status=queued | archivebox run
    archivebox archiveresult list --status=failed | archivebox run
    archivebox crawl list --status=queued | archivebox run

    # Mixed types work too
    cat mixed_records.jsonl | archivebox run

    # Run the crawl runner for a specific crawl
    archivebox run --crawl-id=019b7e90-04d0-73ed-adec-aad9cfcd863e

    # Run one snapshot from an existing crawl
    archivebox run --snapshot-id=019b7e90-5a8e-712c-9877-2c70eebe80ad

    # Run one queued binary install directly on the bus
    archivebox run --binary-id=019b7e90-5a8e-712c-9877-2c70eebe80ad
"""

# Explicitly pin this module's package name (module-level dunder, set before any imports run).
__package__ = "archivebox.cli"
# Human-readable CLI invocation for this module. Not referenced anywhere else in this file;
# presumably read by external CLI tooling — TODO confirm.
__command__ = "archivebox run"

import sys
from collections import defaultdict

import rich_click as click
from rich import print as rprint


def _load_or_create_from_record(model, record, **from_json_kwargs):
    """
    Return the existing row for a record, or build one from the record.

    If the record carries an ``id`` and ``model.objects.get(id=...)`` finds a row,
    that row is returned unchanged. Otherwise (no id, or the id is unknown) the
    result of ``model.from_json(record, **from_json_kwargs)`` is returned; callers
    check it for truthiness before using it.
    """
    record_id = record.get("id")
    if record_id:
        try:
            return model.objects.get(id=record_id)
        except model.DoesNotExist:
            # Unknown id: fall through and create from the record's own fields.
            pass
    return model.from_json(record, **from_json_kwargs)


def _requeue_parent_crawl(snapshot) -> str:
    """
    Mark the snapshot's crawl as due now and return the crawl id as a string.

    A crawl that is already STARTED keeps its status; any other status is reset
    to QUEUED. Only status, retry_at and modified_at are written.
    """
    from django.utils import timezone

    from archivebox.crawls.models import Crawl

    crawl = snapshot.crawl
    crawl.retry_at = timezone.now()
    if crawl.status != Crawl.StatusChoices.STARTED:
        crawl.status = Crawl.StatusChoices.QUEUED
    crawl.save(update_fields=["status", "retry_at", "modified_at"])
    return str(snapshot.crawl_id)


def process_stdin_records() -> int:
    """
    Process JSONL records from stdin.

    Create-or-update behavior:
    - Crawl / Snapshot / Binary records WITH id: look up the existing row and
      re-queue it; if the id is unknown, create via Model.from_json()
    - Crawl / Snapshot / Binary records WITHOUT id: create via Model.from_json(),
      then queue
    - ArchiveResult records are never created here: they resolve to an existing
      ArchiveResult (by id) or to a Snapshot (by snapshot_id), whose snapshot and
      crawl are then re-queued; records resolving to neither are skipped with a
      warning on stderr
    - Records with no type but a "url" field are treated as Snapshots
    - Records of any other type are passed through to the output untouched

    Outputs JSONL of all processed records (for chaining) when stdout is not a TTY.

    After queueing, binaries are run first, then every affected crawl:
    - A crawl named by a Crawl record runs in full (snapshot_ids=None); otherwise
      only the snapshots collected from Snapshot/ArchiveResult records are run
    - Crawl and Snapshot records run all plugins (selected_plugins=None); a crawl
      touched only by ArchiveResult records runs just the plugins those name

    A record that raises while being queued is reported on stderr and skipped; it
    does not affect the exit code.

    Returns exit code (0 = success, 1 = a binary or crawl run raised an error).
    KeyboardInterrupt is intentionally not caught here and propagates to the caller.
    """
    from django.utils import timezone

    from archivebox.misc.jsonl import read_stdin, write_record, TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_BINARY
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.core.models import Snapshot, ArchiveResult
    from archivebox.crawls.models import Crawl
    from archivebox.machine.models import Binary
    from archivebox.services.runner import run_binary, run_crawl

    records = list(read_stdin())
    is_tty = sys.stdout.isatty()

    if not records:
        return 0  # Nothing to process

    created_by_id = get_or_create_system_user_pk()
    queued_count = 0
    output_records = []
    full_crawl_ids: set[str] = set()
    snapshot_ids_by_crawl: dict[str, set[str]] = defaultdict(set)
    plugin_names_by_crawl: dict[str, set[str]] = defaultdict(set)
    run_all_plugins_for_crawl: set[str] = set()
    binary_ids: list[str] = []

    for record in records:
        record_type = record.get("type", "")
        record_id = record.get("id")

        try:
            if record_type == TYPE_CRAWL:
                crawl = _load_or_create_from_record(Crawl, record, overrides={"created_by_id": created_by_id})

                if crawl:
                    crawl.retry_at = timezone.now()
                    # SEALED crawls keep their status; everything else is re-queued
                    if crawl.status not in [Crawl.StatusChoices.SEALED]:
                        crawl.status = Crawl.StatusChoices.QUEUED
                    crawl.save()
                    full_crawl_ids.add(str(crawl.id))
                    run_all_plugins_for_crawl.add(str(crawl.id))
                    output_records.append(crawl.to_json())
                    queued_count += 1

            elif record_type == TYPE_SNAPSHOT or (record.get("url") and not record_type):
                snapshot = _load_or_create_from_record(Snapshot, record, overrides={"created_by_id": created_by_id})

                if snapshot:
                    snapshot.retry_at = timezone.now()
                    # SEALED snapshots keep their status; everything else is re-queued
                    if snapshot.status not in [Snapshot.StatusChoices.SEALED]:
                        snapshot.status = Snapshot.StatusChoices.QUEUED
                    snapshot.save()
                    crawl_id = _requeue_parent_crawl(snapshot)
                    snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id))
                    run_all_plugins_for_crawl.add(crawl_id)
                    output_records.append(snapshot.to_json())
                    queued_count += 1

            elif record_type == TYPE_ARCHIVERESULT:
                # ArchiveResults are only looked up, never created from the record
                archiveresult = None
                if record_id:
                    try:
                        archiveresult = ArchiveResult.objects.get(id=record_id)
                    except ArchiveResult.DoesNotExist:
                        archiveresult = None

                snapshot_id = record.get("snapshot_id")
                plugin_name = record.get("plugin")
                snapshot = None
                if archiveresult:
                    if archiveresult.status in [
                        ArchiveResult.StatusChoices.FAILED,
                        ArchiveResult.StatusChoices.SKIPPED,
                        ArchiveResult.StatusChoices.NORESULTS,
                        ArchiveResult.StatusChoices.BACKOFF,
                    ]:
                        archiveresult.reset_for_retry()
                    snapshot = archiveresult.snapshot
                    # The plugin named in the record wins over the stored one
                    plugin_name = plugin_name or archiveresult.plugin
                elif snapshot_id:
                    try:
                        snapshot = Snapshot.objects.get(id=snapshot_id)
                    except Snapshot.DoesNotExist:
                        snapshot = None

                if snapshot:
                    snapshot.retry_at = timezone.now()
                    if snapshot.status != Snapshot.StatusChoices.STARTED:
                        snapshot.status = Snapshot.StatusChoices.QUEUED
                    snapshot.save(update_fields=["status", "retry_at", "modified_at"])
                    crawl_id = _requeue_parent_crawl(snapshot)
                    snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id))
                    if plugin_name:
                        plugin_names_by_crawl[crawl_id].add(str(plugin_name))
                    output_records.append(archiveresult.to_json() if archiveresult else record)
                    queued_count += 1
                else:
                    # Previously dropped without a trace; surface it so the user can tell why nothing ran
                    rprint(
                        f"[yellow]Skipping ArchiveResult record (id={record_id!r}, snapshot_id={snapshot_id!r}): "
                        "no matching ArchiveResult or Snapshot found[/yellow]",
                        file=sys.stderr,
                    )

            elif record_type == TYPE_BINARY:
                binary = _load_or_create_from_record(Binary, record)

                if binary:
                    binary.retry_at = timezone.now()
                    # Already-INSTALLED binaries keep their status
                    if binary.status != Binary.StatusChoices.INSTALLED:
                        binary.status = Binary.StatusChoices.QUEUED
                    binary.save()
                    binary_ids.append(str(binary.id))
                    output_records.append(binary.to_json())
                    queued_count += 1

            else:
                # Unknown type - pass through
                output_records.append(record)

        except Exception as e:
            # Best-effort per record: report and move on to the next one
            rprint(f"[yellow]Error processing record: {e}[/yellow]", file=sys.stderr)
            continue

    # Output all processed records (for chaining)
    if not is_tty:
        for rec in output_records:
            write_record(rec)

    if queued_count == 0:
        rprint("[yellow]No records to process[/yellow]", file=sys.stderr)
        return 0

    rprint(f"[blue]Processing {queued_count} records...[/blue]", file=sys.stderr)

    try:
        for binary_id in binary_ids:
            run_binary(binary_id)

        for crawl_id in sorted(full_crawl_ids | set(snapshot_ids_by_crawl)):
            run_crawl(
                crawl_id,
                snapshot_ids=None if crawl_id in full_crawl_ids else sorted(snapshot_ids_by_crawl[crawl_id]),
                selected_plugins=None if crawl_id in run_all_plugins_for_crawl else sorted(plugin_names_by_crawl[crawl_id]),
            )
    except Exception as e:
        # Same reporting as the --crawl-id / --binary-id / --snapshot-id entry points.
        # The first failure still aborts the remaining runs, as it did when the
        # exception simply propagated.
        rprint(f"[red]Runner error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
        import traceback

        traceback.print_exc()
        return 1
    return 0


def run_runner(daemon: bool = False) -> int:
    """
    Run the background runner loop in the current process.

    Before the loop starts, the stale-process / orphaned-worker cleanups and the
    orphaned snapshot / crawl recovery helpers are invoked, and the current
    Process row is tagged as an ORCHESTRATOR. When the loop ends for any reason,
    the Process row is marked EXITED (if it is not already).

    Args:
        daemon: Run forever (don't exit when idle)

    Returns exit code (0 = loop finished or was interrupted with Ctrl-C,
    1 = the loop raised an error).
    """
    from django.utils import timezone
    from archivebox.machine.models import Machine, Process
    from archivebox.services.runner import recover_orphaned_crawls, recover_orphaned_snapshots, run_pending_crawls

    # Housekeeping first, in this exact order
    Process.cleanup_stale_running()
    Process.cleanup_orphaned_workers()
    recover_orphaned_snapshots()
    recover_orphaned_crawls()
    Machine.current()  # return value unused; called before Process.current()

    this_process = Process.current()
    orchestrator_type = Process.TypeChoices.ORCHESTRATOR
    if this_process.process_type != orchestrator_type:
        this_process.process_type = orchestrator_type
        this_process.save(update_fields=["process_type", "modified_at"])

    exit_code = 0
    try:
        run_pending_crawls(daemon=daemon)
    except KeyboardInterrupt:
        # Ctrl-C is a normal way to stop the runner
        pass
    except Exception as exc:
        rprint(f"[red]Runner error: {type(exc).__name__}: {exc}[/red]", file=sys.stderr)
        exit_code = 1
    finally:
        # Re-read the row so a status written elsewhere during the run is respected
        this_process.refresh_from_db()
        already_exited = this_process.status == Process.StatusChoices.EXITED
        if not already_exited:
            this_process.status = Process.StatusChoices.EXITED
            this_process.ended_at = this_process.ended_at or timezone.now()
            this_process.save(update_fields=["status", "ended_at", "modified_at"])
    return exit_code


@click.command()
@click.option("--daemon", "-d", is_flag=True, help="Run forever (don't exit on idle)")
@click.option("--crawl-id", help="Run the crawl runner for a specific crawl only")
@click.option("--snapshot-id", help="Run one snapshot through its crawl")
@click.option("--binary-id", help="Run one queued binary install directly on the bus")
def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str):
    """
    Process queued work.

    Modes:
    - No args + stdin piped: Process piped JSONL records
    - No args + TTY: Run the crawl runner for all work
    - --crawl-id: Run the crawl runner for that crawl only
    - --snapshot-id: Run one snapshot through its crawl only
    - --binary-id: Run one queued binary install directly on the bus
    """
    # Targeted modes are checked in the order snapshot, binary, crawl; the first
    # one supplied wins and the process exits without looking at the others.
    if snapshot_id:
        sys.exit(run_snapshot_worker(snapshot_id))

    if binary_id or crawl_id:
        try:
            if binary_id:
                from archivebox.services.runner import run_binary

                run_binary(binary_id)
            else:
                from archivebox.services.runner import run_crawl

                run_crawl(crawl_id)
            sys.exit(0)
        except KeyboardInterrupt:
            sys.exit(0)
        except Exception as exc:
            rprint(f"[red]Runner error: {type(exc).__name__}: {exc}[/red]", file=sys.stderr)
            import traceback

            traceback.print_exc()
            sys.exit(1)

    # Untargeted modes: piped stdin is consumed first. Without --daemon that is
    # the whole job; with --daemon a clean result falls through to the runner.
    stdin_is_piped = not sys.stdin.isatty()
    if stdin_is_piped:
        status = process_stdin_records()
        if status != 0 or not daemon:
            sys.exit(status)

    sys.exit(run_runner(daemon=daemon))


def run_snapshot_worker(snapshot_id: str) -> int:
    """
    Run a single snapshot through its parent crawl.

    Looks the snapshot up by id and calls run_crawl() on its crawl, restricted
    to that one snapshot.

    Returns exit code (0 = finished or interrupted with Ctrl-C, 1 = error,
    including the snapshot lookup failing).
    """
    from archivebox.core.models import Snapshot
    from archivebox.services.runner import run_crawl

    try:
        target = Snapshot.objects.select_related("crawl").get(id=snapshot_id)
        parent_crawl_id = str(target.crawl_id)
        only_this_snapshot = [str(target.id)]
        run_crawl(parent_crawl_id, snapshot_ids=only_this_snapshot)
    except KeyboardInterrupt:
        return 0
    except Exception as exc:
        rprint(f"[red]Runner error: {type(exc).__name__}: {exc}[/red]", file=sys.stderr)
        import traceback

        traceback.print_exc()
        return 1
    return 0


# Script entry point: invoke the click command when this file is executed directly.
if __name__ == "__main__":
    main()