# ArchiveBox/archivebox/api/v1_cli.py

# Explicitly declare the package this module belongs to.
# NOTE(review): presumably so package-relative imports (e.g. the commented-out `.auth`
# import further down) resolve however the module is loaded — confirm.
__package__ = "archivebox.api"

import json
from io import StringIO
from typing import Any
from enum import Enum

from django.http import HttpRequest

from ninja import Router, Schema

from archivebox.misc.util import ansi_to_html
from archivebox.config.common import ARCHIVING_CONFIG


# from .auth import API_AUTH_METHODS

# Router for the API that exposes archivebox CLI subcommands as REST endpoints.
# Every endpoint function in this module registers itself on it via @router.post(...).
router = Router(tags=["ArchiveBox CLI Sub-Commands"])


# Schemas

# Type of the `result` field in CLICommandResponseSchema: a list, a string-keyed dict,
# a bool, an int, a str, or None (note that float is not part of the union).
JSONType = list[Any] | dict[str, Any] | bool | int | str | None


class CLICommandResponseSchema(Schema):
    """Response envelope shared by all CLI sub-command endpoints in this module."""

    # Outcome flag; the endpoints in this module always send True.
    success: bool
    # Error messages; the endpoints in this module always send an empty list.
    errors: list[str]
    # Value produced by the command (see JSONType for the accepted shapes).
    result: JSONType
    # Label describing how `result` is encoded. Endpoints in this module send
    # "json", "html", "csv" or "txt"; "str" applies when an endpoint omits the key.
    result_format: str = "str"
    # Captured output text. The endpoints pass request.stdout / request.stderr through
    # ansi_to_html when those are StringIO buffers, and send "" otherwise.
    stdout: str
    stderr: str


class FilterTypeChoices(str, Enum):
    """Named `filter_type` modes used as defaults by the command schemas in this module."""

    # Each member's value equals its name. The schemas in this module annotate
    # `filter_type` as a plain string, so this enum supplies default values only;
    # it is not used there as a validating field type.
    exact = "exact"
    substring = "substring"
    regex = "regex"
    domain = "domain"
    tag = "tag"
    timestamp = "timestamp"


class StatusChoices(str, Enum):
    """Allowed values for the `status` field of ListCommandSchema."""

    # Each member's value equals its name. Because ListCommandSchema annotates
    # `status` with this enum, the search endpoint forwards an enum member
    # (not a bare string) as search(status=...).
    indexed = "indexed"
    archived = "archived"
    unarchived = "unarchived"
    present = "present"
    valid = "valid"
    invalid = "invalid"
    duplicate = "duplicate"
    orphaned = "orphaned"
    corrupted = "corrupted"
    unrecognized = "unrecognized"


class AddCommandSchema(Schema):
    """Request body for POST /add; every field is forwarded to the `add` command."""

    # Required. Passed as add(urls=...) and echoed back as "queued_urls" in the response.
    urls: list[str]
    # Passed through unchanged as add(tag=...), add(depth=...), add(parser=...), add(plugins=...).
    tag: str = ""
    depth: int = 0
    parser: str = "auto"
    plugins: str = ""
    # The default is computed once, when this class body is evaluated at import time,
    # as the opposite of ARCHIVING_CONFIG.ONLY_NEW.
    update: bool = not ARCHIVING_CONFIG.ONLY_NEW  # Default to the opposite of ARCHIVING_CONFIG.ONLY_NEW
    # Passed through unchanged as add(overwrite=...) and add(index_only=...).
    overwrite: bool = False
    index_only: bool = False


class UpdateCommandSchema(Schema):
    """Request body for POST /update; every field is forwarded to the `update` command."""

    # Passed through unchanged as update(resume=...).
    resume: str | None = None
    # Lower/upper bounds passed as update(after=..., before=...).
    # NOTE(review): presumably timestamps — confirm against update().
    after: float | None = 0
    before: float | None = 999999999999999
    # Default is the plain string value rather than the Enum member, so the field holds
    # the same type (str) whether or not the client supplies it. str()/format() of a
    # (str, Enum) member is not guaranteed to equal its value.
    filter_type: str | None = FilterTypeChoices.substring.value
    # NOTE: a request that omits this field sends the example.com placeholder pattern
    # to the command; the endpoint turns an explicit null or empty list into [].
    filter_patterns: list[str] | None = ["https://example.com"]
    # Passed through unchanged as update(batch_size=...) and update(continuous=...).
    batch_size: int = 100
    continuous: bool = False

class ScheduleCommandSchema(Schema):
    """Request body for POST /schedule; every field is forwarded to the `schedule` command."""

    # All fields below are passed to schedule() as same-named keyword arguments.
    import_path: str | None = None
    add: bool = False
    show: bool = False
    foreground: bool = False
    run_all: bool = False
    quiet: bool = False
    every: str | None = None
    tag: str = ""
    depth: int = 0
    overwrite: bool = False
    # The default is computed once, when this class body is evaluated at import time,
    # as the opposite of ARCHIVING_CONFIG.ONLY_NEW.
    update: bool = not ARCHIVING_CONFIG.ONLY_NEW
    clear: bool = False


class ListCommandSchema(Schema):
    """Request body for POST /search; every field is forwarded to the `search` command."""

    # NOTE: a request that omits this field sends the example.com placeholder pattern.
    filter_patterns: list[str] | None = ["https://example.com"]
    # Default is the plain string value rather than the Enum member, so the field holds
    # the same type (str) whether or not the client supplies it. str()/format() of a
    # (str, Enum) member is not guaranteed to equal its value.
    filter_type: str = FilterTypeChoices.substring.value
    # Validated against StatusChoices; forwarded as search(status=...).
    status: StatusChoices = StatusChoices.indexed
    # Lower/upper bounds passed as search(after=..., before=...).
    # NOTE(review): presumably timestamps — confirm against search().
    after: float | None = 0
    before: float | None = 999999999999999
    sort: str = "bookmarked_at"
    # Output format switches, forwarded as search(json=..., html=..., csv=...).
    # The endpoint labels the response by checking them in this order: json, html, csv.
    # With the defaults below both as_json and as_csv are set, so the label is "json".
    as_json: bool = True
    as_html: bool = False
    as_csv: str | None = "timestamp,url"
    with_headers: bool = False


class RemoveCommandSchema(Schema):
    """Request body for POST /remove; every field is forwarded to the `remove` command."""

    # Forwarded as remove(delete=...). NOTE: defaults to True.
    delete: bool = True
    # Lower/upper bounds used both to select the snapshots and as remove(after=..., before=...).
    # NOTE(review): presumably timestamps — confirm against get_snapshots().
    after: float | None = 0
    before: float | None = 999999999999999
    # Default is the plain string value rather than the Enum member, so the field holds
    # the same type (str) whether or not the client supplies it. str()/format() of a
    # (str, Enum) member is not guaranteed to equal its value.
    filter_type: str = FilterTypeChoices.exact.value
    # NOTE: a request that omits this field sends the example.com placeholder pattern;
    # the endpoint turns an explicit null or empty list into [].
    filter_patterns: list[str] | None = ["https://example.com"]


@router.post("/add", response=CLICommandResponseSchema, summary="archivebox add [args] [urls]")
def cli_add(request: HttpRequest, args: AddCommandSchema):
    """Run `archivebox add` in background mode for the submitted URLs.

    Args:
        request: Incoming request. Its user's primary key is recorded as the creator,
            and `request.stdout` / `request.stderr` are echoed back when they are
            StringIO buffers.
        args: Validated request body.

    Returns:
        A dict matching CLICommandResponseSchema whose `result` holds the crawl id,
        the number and ids of the returned snapshots, and the submitted URLs.
    """
    from archivebox.cli.archivebox_add import add

    add_options = {
        "urls": args.urls,
        "tag": args.tag,
        "depth": args.depth,
        "update": args.update,
        "index_only": args.index_only,
        "overwrite": args.overwrite,
        "plugins": args.plugins,
        "parser": args.parser,
        "bg": True,  # API calls always run the command in the background
        "created_by_id": request.user.pk,
    }
    crawl, snapshots = add(**add_options)

    # Stringify the ids so they fit the JSON result payload.
    id_strings = list(map(str, snapshots.values_list("id", flat=True)))

    response = {
        "success": True,
        "errors": [],
        "result": {
            "crawl_id": str(crawl.id),
            "num_snapshots": len(id_strings),
            "snapshot_ids": id_strings,
            "queued_urls": args.urls,
        },
        "result_format": "json",
    }
    # Attach captured output only when the request carries a StringIO buffer for it.
    for stream_name in ("stdout", "stderr"):
        buffer = getattr(request, stream_name, None)
        if isinstance(buffer, StringIO):
            response[stream_name] = ansi_to_html(buffer.getvalue().strip())
        else:
            response[stream_name] = ""
    return response


@router.post("/update", response=CLICommandResponseSchema, summary="archivebox update [args] [filter_patterns]")
def cli_update(request: HttpRequest, args: UpdateCommandSchema):
    """Run `archivebox update` with the submitted options and return its result.

    Args:
        request: Incoming request; `request.stdout` / `request.stderr` are echoed back
            when they are StringIO buffers.
        args: Validated request body.

    Returns:
        A dict matching CLICommandResponseSchema. No `result_format` key is set here,
        so the schema's default label applies.
    """
    from archivebox.cli.archivebox_update import update

    result = update(
        # A null or empty pattern list is normalised to [].
        filter_patterns=args.filter_patterns or [],
        # Fall back to the plain string value, not the Enum member, so update() always
        # receives the same type a client-supplied filter_type would have (str).
        filter_type=args.filter_type or FilterTypeChoices.substring.value,
        after=args.after,
        before=args.before,
        resume=args.resume,
        batch_size=args.batch_size,
        continuous=args.continuous,
    )
    # NOTE(review): stdout/stderr are presumably attached to the request elsewhere
    # (not visible in this module); when absent the response carries empty strings.
    stdout = getattr(request, "stdout", None)
    stderr = getattr(request, "stderr", None)
    return {
        "success": True,
        "errors": [],
        "result": result,
        "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
        "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
    }


@router.post("/schedule", response=CLICommandResponseSchema, summary="archivebox schedule [args] [import_path]")
def cli_schedule(request: HttpRequest, args: ScheduleCommandSchema):
    """Run `archivebox schedule` with the submitted options and return its result.

    Args:
        request: Incoming request; `request.stdout` / `request.stderr` are echoed back
            when they are StringIO buffers.
        args: Validated request body.

    Returns:
        A dict matching CLICommandResponseSchema, labelled with result_format "json".
    """
    from archivebox.cli.archivebox_schedule import schedule

    # Every schema field maps onto the same-named keyword argument of schedule().
    schedule_options = {
        "import_path": args.import_path,
        "add": args.add,
        "show": args.show,
        "foreground": args.foreground,
        "run_all": args.run_all,
        "quiet": args.quiet,
        "clear": args.clear,
        "every": args.every,
        "tag": args.tag,
        "depth": args.depth,
        "overwrite": args.overwrite,
        "update": args.update,
    }
    outcome = schedule(**schedule_options)

    response = {
        "success": True,
        "errors": [],
        "result": outcome,
        "result_format": "json",
    }
    # Attach captured output only when the request carries a StringIO buffer for it.
    for stream_name in ("stdout", "stderr"):
        buffer = getattr(request, stream_name, None)
        if isinstance(buffer, StringIO):
            response[stream_name] = ansi_to_html(buffer.getvalue().strip())
        else:
            response[stream_name] = ""
    return response


@router.post("/search", response=CLICommandResponseSchema, summary="archivebox search [args] [filter_patterns]")
def cli_search(request: HttpRequest, args: ListCommandSchema):
    """Run `archivebox search` with the submitted filters and return its output.

    The response's `result_format` is chosen by checking the format options in this
    order: as_json ("json"), as_html ("html"), as_csv ("csv"), otherwise "txt".
    When as_json is set, the command output is parsed from JSON before it is returned.

    Args:
        request: Incoming request; `request.stdout` / `request.stderr` are echoed back
            when they are StringIO buffers.
        args: Validated request body.

    Returns:
        A dict matching CLICommandResponseSchema.
    """
    from archivebox.cli.archivebox_search import search

    search_options = {
        "filter_patterns": args.filter_patterns,
        "filter_type": args.filter_type,
        "status": args.status,
        "after": args.after,
        "before": args.before,
        "sort": args.sort,
        "csv": args.as_csv,
        "json": args.as_json,
        "html": args.as_html,
        "with_headers": args.with_headers,
    }
    output = search(**search_options)

    # Label the payload; JSON output is decoded so it is returned as structured data.
    if args.as_json:
        output = json.loads(output)
        format_label = "json"
    elif args.as_html:
        format_label = "html"
    elif args.as_csv:
        format_label = "csv"
    else:
        format_label = "txt"

    response = {
        "success": True,
        "errors": [],
        "result": output,
        "result_format": format_label,
    }
    # Attach captured output only when the request carries a StringIO buffer for it.
    for stream_name in ("stdout", "stderr"):
        buffer = getattr(request, stream_name, None)
        if isinstance(buffer, StringIO):
            response[stream_name] = ansi_to_html(buffer.getvalue().strip())
        else:
            response[stream_name] = ""
    return response


@router.post("/remove", response=CLICommandResponseSchema, summary="archivebox remove [args] [filter_patterns]")
def cli_remove(request: HttpRequest, args: RemoveCommandSchema):
    """Run `archivebox remove` on the snapshots matching the submitted filters.

    Confirmation is always forced (yes=True) because an API call cannot be prompted.

    Args:
        request: Incoming request; `request.stdout` / `request.stderr` are echoed back
            when they are StringIO buffers.
        args: Validated request body.

    Returns:
        A dict matching CLICommandResponseSchema whose `result` holds the number and
        ids of the snapshots selected for removal and the snapshot count afterwards.
    """
    from archivebox.cli.archivebox_remove import remove
    from archivebox.cli.archivebox_search import get_snapshots
    from archivebox.core.models import Snapshot

    # A null or empty pattern list is normalised to [].
    patterns = args.filter_patterns or []
    matching = get_snapshots(
        filter_patterns=patterns,
        filter_type=args.filter_type,
        after=args.after,
        before=args.before,
    )
    # Collect the ids before remove() runs, so they can still be reported afterwards.
    id_strings = list(map(str, matching.values_list("id", flat=True)))

    remove(
        yes=True,  # no interactive confirmation is possible over the API, so force yes
        delete=args.delete,
        snapshots=matching,
        before=args.before,
        after=args.after,
        filter_type=args.filter_type,
        filter_patterns=patterns,
    )

    response = {
        "success": True,
        "errors": [],
        "result": {
            "removed_count": len(id_strings),
            "removed_snapshot_ids": id_strings,
            "remaining_snapshots": Snapshot.objects.count(),
        },
        "result_format": "json",
    }
    # Attach captured output only when the request carries a StringIO buffer for it.
    for stream_name in ("stdout", "stderr"):
        buffer = getattr(request, stream_name, None)
        if isinstance(buffer, StringIO):
            response[stream_name] = ansi_to_html(buffer.getvalue().strip())
        else:
            response[stream_name] = ""
    return response