ArchiveBox/archivebox/cli/archivebox_crawl.py

#!/usr/bin/env python3

"""
archivebox crawl <action> [args...] [--filters]

Manage Crawl records.

Actions:
    create  - Create Crawl jobs from URLs
    list    - List Crawls as JSONL (with optional filters)
    update  - Update Crawls from stdin JSONL
    delete  - Delete Crawls from stdin JSONL

Examples:
    # Create
    archivebox crawl create https://example.com https://foo.com --depth=1
    archivebox crawl create --tag=news https://example.com

    # List with filters
    archivebox crawl list --status=queued
    archivebox crawl list --urls__icontains=example.com

    # Update
    archivebox crawl list --status=started | archivebox crawl update --status=queued

    # Delete
    archivebox crawl list --urls__icontains=spam.com | archivebox crawl delete --yes

    # Full pipeline
    archivebox crawl create https://example.com | archivebox snapshot create | archivebox run
"""

# Pin the package this module belongs to. Python consults __package__ when
# resolving relative imports, which matters if the file is executed directly.
__package__ = "archivebox.cli"
# Full CLI command name for this module. It is not referenced anywhere in this
# file -- presumably read by project tooling; TODO confirm against the caller.
__command__ = "archivebox crawl"

import sys
from collections.abc import Iterable

import rich_click as click
from rich import print as rprint

from archivebox.cli.cli_utils import apply_filters


# =============================================================================
# CREATE
# =============================================================================


def create_crawl(
    urls: Iterable[str],
    depth: int = 0,
    tag: str = "",
    status: str = "queued",
    created_by_id: int | None = None,
) -> int:
    """
    Create a single Crawl job from a set of URLs.

    Input records are read with ``read_args_or_stdin(urls)``. Every URL found
    in a record's ``url`` field or newline-separated ``urls`` field is merged
    into one Crawl. Records that carry a non-Crawl ``type`` and no URL fields,
    and Crawl records that already have an ``id``, are passed through
    unchanged so the command can sit in the middle of a pipeline.

    JSONL is only written to stdout when stdout is not a TTY; human-readable
    progress always goes to stderr.

    Args:
        urls: URLs (or other input accepted by ``read_args_or_stdin``).
        depth: Stored as the crawl's ``max_depth``.
        tag: Stored as the crawl's ``tags_str``.
        status: Initial status stored on the crawl.
        created_by_id: Owner user pk; falls back to
            ``get_or_create_system_user_pk()`` when falsy.

    Returns:
        0 on success (including "only pass-through records, nothing to
        create"), 1 when there is no input, no usable URL, or creation fails.
    """
    from rich.markup import escape

    from archivebox.misc.jsonl import read_args_or_stdin, write_record, TYPE_CRAWL
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl

    created_by_id = created_by_id or get_or_create_system_user_pk()
    is_tty = sys.stdout.isatty()

    # Materialise the input up front so "no input at all" can be reported.
    records = list(read_args_or_stdin(urls))

    if not records:
        rprint("[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]", file=sys.stderr)
        return 1

    url_list = []
    pass_through_records = []

    for record in records:
        record_type = record.get("type", "")
        url = record.get("url")
        urls_field = record.get("urls")

        # Typed, non-Crawl records that carry no URLs are not ours to consume.
        is_foreign = record_type and record_type != TYPE_CRAWL and not url and not urls_field
        # A Crawl record that already has an id refers to an existing crawl.
        is_existing_crawl = record_type == TYPE_CRAWL and record.get("id")
        if is_foreign or is_existing_crawl:
            pass_through_records.append(record)
            continue

        if url:
            url_list.append(url)

        # 'urls' is a newline-separated blob; blank lines and '#' comments are skipped.
        if urls_field:
            stripped_lines = (raw.strip() for raw in urls_field.split("\n"))
            url_list.extend(line for line in stripped_lines if line and not line.startswith("#"))

    # Emit pass-through records before the newly created crawl record.
    if not is_tty:
        for record in pass_through_records:
            write_record(record)

    if not url_list:
        if pass_through_records:
            # Pass-through records without any new URLs is a valid no-op.
            rprint(f"[dim]Passed through {len(pass_through_records)} records, no new URLs[/dim]", file=sys.stderr)
            return 0
        rprint("[red]No valid URLs found[/red]", file=sys.stderr)
        return 1

    try:
        # All URLs are stored on the crawl as one newline-separated string.
        crawl_record = {
            "urls": "\n".join(url_list),
            "max_depth": depth,
            "tags_str": tag,
            "status": status,
            "label": "",
        }

        crawl = Crawl.from_json(crawl_record, overrides={"created_by_id": created_by_id})
        if not crawl:
            rprint("[red]Failed to create crawl[/red]", file=sys.stderr)
            return 1

        # Machine-readable output only when piped.
        if not is_tty:
            write_record(crawl.to_json())

        rprint(f"[green]Created crawl with {len(url_list)} URLs[/green]", file=sys.stderr)

        if is_tty:
            rprint(f"  [dim]{crawl.id}[/dim]", file=sys.stderr)
            for url in url_list[:5]:  # Preview at most the first 5 URLs
                # URLs are untrusted text: escape them so square brackets are
                # printed literally instead of being parsed as Rich markup
                # (a stray "[/x]" would otherwise raise MarkupError here and
                # make a successful creation look like a failure).
                rprint(f"    {escape(url[:70])}", file=sys.stderr)
            if len(url_list) > 5:
                rprint(f"    ... and {len(url_list) - 5} more", file=sys.stderr)

        return 0

    except Exception as e:
        # Exception text may contain brackets too; escape it before printing.
        rprint(f"[red]Error creating crawl: {escape(str(e))}[/red]", file=sys.stderr)
        return 1


# =============================================================================
# LIST
# =============================================================================


def list_crawls(
    status: str | None = None,
    urls__icontains: str | None = None,
    max_depth: int | None = None,
    limit: int | None = None,
) -> int:
    """
    List Crawls, newest first, with optional filters.

    When stdout is a TTY each crawl is printed as a coloured one-line summary;
    otherwise each crawl is written as a JSONL record via ``write_record``.
    A count summary is always printed to stderr.

    Args:
        status: Passed to ``apply_filters`` under the ``status`` key.
        urls__icontains: Passed to ``apply_filters`` under the
            ``urls__icontains`` key.
        max_depth: Passed to ``apply_filters`` under the ``max_depth`` key.
        limit: Passed to ``apply_filters`` as ``limit``.

    Returns:
        Always 0, even when nothing matched.
    """
    from rich.markup import escape

    from archivebox.misc.jsonl import write_record
    from archivebox.crawls.models import Crawl

    is_tty = sys.stdout.isatty()

    queryset = Crawl.objects.all().order_by("-created_at")

    filter_kwargs = {
        "status": status,
        "urls__icontains": urls__icontains,
        "max_depth": max_depth,
    }
    queryset = apply_filters(queryset, filter_kwargs, limit=limit)

    # Colour per known status; anything else is rendered dim. Built once
    # instead of on every loop iteration.
    status_colors = {
        "queued": "yellow",
        "started": "blue",
        "sealed": "green",
    }

    count = 0
    for crawl in queryset:
        if is_tty:
            status_color = status_colors.get(crawl.status, "dim")
            # Stored URLs are arbitrary text: escape them so brackets are shown
            # literally rather than being interpreted as Rich markup (which
            # can drop text or raise MarkupError and abort the listing).
            url_preview = escape(crawl.urls[:50].replace("\n", " "))
            rprint(f"[{status_color}]{crawl.status:8}[/{status_color}] [dim]{crawl.id}[/dim] {url_preview}...")
        else:
            write_record(crawl.to_json())
        count += 1

    rprint(f"[dim]Listed {count} crawls[/dim]", file=sys.stderr)
    return 0


# =============================================================================
# UPDATE
# =============================================================================


def update_crawls(
    status: str | None = None,
    max_depth: int | None = None,
) -> int:
    """
    Update Crawls identified by JSONL records read from stdin.

    Each record's ``id`` is looked up as a Crawl. Only the fields given as
    arguments are changed (PATCH semantics); setting ``status`` also resets
    ``retry_at`` to the current time. Records without an ``id``, with an id
    that matches no Crawl, or with a malformed id are skipped with a warning
    rather than aborting the rest of the batch. Updated crawls are echoed as
    JSONL when stdout is not a TTY.

    Args:
        status: New status to set, if truthy.
        max_depth: New max depth to set, if not ``None``.

    Returns:
        1 when stdin yielded no records, otherwise 0.
    """
    from django.core.exceptions import ValidationError
    from django.utils import timezone
    from rich.markup import escape

    from archivebox.misc.jsonl import read_stdin, write_record
    from archivebox.crawls.models import Crawl

    is_tty = sys.stdout.isatty()

    records = list(read_stdin())
    if not records:
        rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
        return 1

    updated_count = 0
    for record in records:
        crawl_id = record.get("id")
        if not crawl_id:
            continue

        # Only the lookup is guarded, so errors raised while saving are not
        # mistaken for a bad id.
        try:
            crawl = Crawl.objects.get(id=crawl_id)
        except Crawl.DoesNotExist:
            rprint(f"[yellow]Crawl not found: {escape(str(crawl_id))}[/yellow]", file=sys.stderr)
            continue
        except (ValidationError, ValueError, TypeError):
            # The id comes from untrusted stdin; a value the pk field cannot
            # parse makes the lookup raise instead of returning "not found".
            # Skip the record so one bad line does not abort the whole batch.
            rprint(f"[yellow]Skipping invalid crawl id: {escape(str(crawl_id))}[/yellow]", file=sys.stderr)
            continue

        # Apply updates from CLI flags
        if status:
            crawl.status = status
            crawl.retry_at = timezone.now()
        if max_depth is not None:
            crawl.max_depth = max_depth

        crawl.save()
        updated_count += 1

        if not is_tty:
            write_record(crawl.to_json())

    rprint(f"[green]Updated {updated_count} crawls[/green]", file=sys.stderr)
    return 0


# =============================================================================
# DELETE
# =============================================================================


def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
    """
    Delete Crawls identified by JSONL records read from stdin.

    The ``id`` of every input record is collected and matched against Crawl
    rows. With ``dry_run`` the matches are only listed on stderr. Otherwise
    ``yes`` must be set for the deletion to happen.

    Args:
        yes: Confirm the deletion.
        dry_run: Show what would be deleted without deleting; takes
            precedence over ``yes``.

    Returns:
        0 on success, dry run, or when nothing matched; 1 when there is no
        input, no record carries an id, or ``yes`` was not given.
    """
    from rich.markup import escape

    from archivebox.misc.jsonl import read_stdin
    from archivebox.crawls.models import Crawl

    records = list(read_stdin())
    if not records:
        rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
        return 1

    crawl_ids = [r.get("id") for r in records if r.get("id")]

    if not crawl_ids:
        rprint("[yellow]No valid crawl IDs in input[/yellow]", file=sys.stderr)
        return 1

    crawls = Crawl.objects.filter(id__in=crawl_ids)
    count = crawls.count()

    if count == 0:
        rprint("[yellow]No matching crawls found[/yellow]", file=sys.stderr)
        return 0

    if dry_run:
        rprint(f"[yellow]Would delete {count} crawls (dry run)[/yellow]", file=sys.stderr)
        for crawl in crawls:
            # Escape stored URL text so brackets are not parsed as Rich markup.
            url_preview = escape(crawl.urls[:50].replace("\n", " "))
            rprint(f"  [dim]{crawl.id}[/dim] {url_preview}...", file=sys.stderr)
        return 0

    if not yes:
        rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
        return 1

    # QuerySet.delete() returns (total, per_model). The total also counts rows
    # removed by cascading deletes on related models, so report only the Crawl
    # entry to keep the "N crawls" message accurate.
    _total_deleted, deleted_per_model = crawls.delete()
    deleted_count = deleted_per_model.get(Crawl._meta.label, 0)
    rprint(f"[green]Deleted {deleted_count} crawls[/green]", file=sys.stderr)
    return 0


# =============================================================================
# CLI Commands
# =============================================================================


# Root command group; the subcommands below attach themselves via
# @main.command(...). The docstring doubles as the CLI help text.
@click.group()
def main():
    """Manage Crawl records."""


@main.command("create")
@click.argument("urls", nargs=-1)
@click.option("--depth", "-d", type=int, default=0, help="Max crawl depth (default: 0)")
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
def create_cmd(urls: tuple, depth: int, tag: str, status: str):
    """Create a Crawl job from URLs or stdin."""
    # Delegate to create_crawl and use its return value as the exit status.
    exit_code = create_crawl(urls, depth=depth, tag=tag, status=status)
    sys.exit(exit_code)


@main.command("list")
@click.option("--status", "-s", help="Filter by status (queued, started, sealed)")
@click.option("--urls__icontains", help="Filter by URLs contains")
@click.option("--max-depth", type=int, help="Filter by max depth")
@click.option("--limit", "-n", type=int, help="Limit number of results")
def list_cmd(
    status: str | None,
    urls__icontains: str | None,
    max_depth: int | None,
    limit: int | None,
):
    """List Crawls as JSONL."""
    # Forward the parsed options unchanged; list_crawls' return value becomes
    # the process exit status.
    exit_code = list_crawls(
        status=status,
        urls__icontains=urls__icontains,
        max_depth=max_depth,
        limit=limit,
    )
    sys.exit(exit_code)


@main.command("update")
@click.option("--status", "-s", help="Set status")
@click.option("--max-depth", type=int, help="Set max depth")
def update_cmd(status: str | None, max_depth: int | None):
    """Update Crawls from stdin JSONL."""
    # Delegate to update_crawls and use its return value as the exit status.
    exit_code = update_crawls(status=status, max_depth=max_depth)
    sys.exit(exit_code)


@main.command("delete")
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
def delete_cmd(yes: bool, dry_run: bool):
    """Delete Crawls from stdin JSONL."""
    # Delegate to delete_crawls and use its return value as the exit status.
    exit_code = delete_crawls(yes=yes, dry_run=dry_run)
    sys.exit(exit_code)


# Invoke the click command group only when this file is run as a script,
# not when it is imported.
if __name__ == "__main__":
    main()