Files
ArchiveBox/archivebox/api/v1_cli.py
Nick Sweeting b749b26c5d wip
2026-03-23 03:58:32 -07:00

285 lines
8.6 KiB
Python

__package__ = "archivebox.api"
import json
from io import StringIO
from typing import Any
from enum import Enum
from django.http import HttpRequest
from ninja import Router, Schema
from archivebox.misc.util import ansi_to_html
from archivebox.config.common import ARCHIVING_CONFIG
# from .auth import API_AUTH_METHODS
# Router for the API that exposes archivebox CLI subcommands as REST endpoints.
# The handlers in this module register themselves on it via @router.post(...).
router = Router(tags=["ArchiveBox CLI Sub-Commands"])
# Schemas
# JSONType is the union of value shapes allowed for the `result` field of
# CLICommandResponseSchema.
JSONType = list[Any] | dict[str, Any] | bool | int | str | None
class CLICommandResponseSchema(Schema):
    """Response envelope declared as the `response=` schema of the CLI endpoints.

    Handlers return a plain dict with these keys.
    """

    success: bool
    errors: list[str]
    # Either the wrapped command's return value or a summary dict built by the handler.
    result: JSONType
    # Label for how `result` is encoded; handlers in this module use
    # "json", "html", "csv" or "txt". Left at "str" when a handler omits it.
    result_format: str = "str"
    # Captured command output run through ansi_to_html(); handlers send ""
    # when the request carries no StringIO buffer.
    stdout: str
    stderr: str
class FilterTypeChoices(str, Enum):
    """Filter-type names used for the `filter_type` defaults of the command schemas.

    Members also subclass `str`, so each one compares equal to its plain
    string value (e.g. ``FilterTypeChoices.exact == "exact"``).
    """

    exact = "exact"
    substring = "substring"
    regex = "regex"
    domain = "domain"
    tag = "tag"
    timestamp = "timestamp"
class StatusChoices(str, Enum):
    """Allowed values for the `status` field of ListCommandSchema.

    Members also subclass `str`, so each one compares equal to its plain
    string value.
    """

    indexed = "indexed"
    archived = "archived"
    unarchived = "unarchived"
    present = "present"
    valid = "valid"
    invalid = "invalid"
    duplicate = "duplicate"
    orphaned = "orphaned"
    corrupted = "corrupted"
    unrecognized = "unrecognized"
class AddCommandSchema(Schema):
    """Request body for POST /add.

    `cli_add` forwards every field to `add()` as the keyword argument of the
    same name.
    """

    urls: list[str]  # the only required field
    tag: str = ""
    depth: int = 0
    parser: str = "auto"
    plugins: str = ""
    # Default to the opposite of ARCHIVING_CONFIG.ONLY_NEW. The expression is
    # evaluated once, when this class body executes, not per request.
    update: bool = not ARCHIVING_CONFIG.ONLY_NEW
    overwrite: bool = False
    index_only: bool = False
class UpdateCommandSchema(Schema):
    """Request body for POST /update.

    `cli_update` forwards every field to `update()` as the keyword argument of
    the same name.
    """

    resume: str | None = None
    after: float | None = 0
    before: float | None = 999999999999999
    # Use the enum's plain string value so the default really is a `str`, as
    # annotated. An Enum member default would reach the handler unchanged, and
    # `str()` of a str-mixin Enum member is "FilterTypeChoices.substring"
    # rather than "substring".
    filter_type: str | None = FilterTypeChoices.substring.value
    # NOTE(review): a request that omits `filter_patterns` gets this example
    # URL as its pattern — confirm that is the intended default.
    filter_patterns: list[str] | None = ["https://example.com"]
    batch_size: int = 100
    continuous: bool = False
class ScheduleCommandSchema(Schema):
    """Request body for POST /schedule.

    `cli_schedule` forwards every field to `schedule()` as the keyword
    argument of the same name.
    """

    import_path: str | None = None
    add: bool = False
    show: bool = False
    foreground: bool = False
    run_all: bool = False
    quiet: bool = False
    every: str | None = None
    tag: str = ""
    depth: int = 0
    overwrite: bool = False
    # Opposite of ARCHIVING_CONFIG.ONLY_NEW; evaluated once, when this class
    # body executes, not per request.
    update: bool = not ARCHIVING_CONFIG.ONLY_NEW
    clear: bool = False
class ListCommandSchema(Schema):
    """Request body for POST /search.

    `cli_search` forwards the fields to `search()`. `as_json`, `as_html` and
    `as_csv` are passed as `json`, `html` and `csv`; the rest keep their names.
    """

    # NOTE(review): a request that omits `filter_patterns` gets this example
    # URL as its pattern — confirm that is the intended default.
    filter_patterns: list[str] | None = ["https://example.com"]
    # Use the enum's plain string value so the default really is a `str`, as
    # annotated. An Enum member default would reach the handler unchanged, and
    # `str()` of a str-mixin Enum member is "FilterTypeChoices.substring"
    # rather than "substring".
    filter_type: str = FilterTypeChoices.substring.value
    status: StatusChoices = StatusChoices.indexed
    after: float | None = 0
    before: float | None = 999999999999999
    sort: str = "bookmarked_at"
    # Output selectors. The handler labels the result "json" if as_json is
    # truthy, else "html" if as_html, else "csv" if as_csv, else "txt".
    as_json: bool = True
    as_html: bool = False
    as_csv: str | None = "timestamp,url"  # presumably a comma-separated column list — TODO confirm
    with_headers: bool = False
class RemoveCommandSchema(Schema):
    """Request body for POST /remove.

    `cli_remove` uses these fields both to select snapshots with
    `get_snapshots()` and as keyword arguments to `remove()`.
    """

    delete: bool = True
    after: float | None = 0
    before: float | None = 999999999999999
    # Use the enum's plain string value so the default really is a `str`, as
    # annotated. An Enum member default would reach the handler unchanged, and
    # `str()` of a str-mixin Enum member is "FilterTypeChoices.exact" rather
    # than "exact".
    filter_type: str = FilterTypeChoices.exact.value
    # NOTE(review): a request that omits `filter_patterns` gets this example
    # URL as its pattern — confirm that is the intended default.
    filter_patterns: list[str] | None = ["https://example.com"]
@router.post("/add", response=CLICommandResponseSchema, summary="archivebox add [args] [urls]")
def cli_add(request: HttpRequest, args: AddCommandSchema):
    """Run `archivebox add` for the submitted URLs and report what was queued.

    The command is always invoked with ``bg=True`` and is attributed to the
    requesting user through ``request.user.pk``.

    Args:
        request: Incoming request. Its optional ``stdout`` / ``stderr``
            attributes are echoed back when they are ``StringIO`` buffers
            (presumably attached by upstream middleware — verify).
        args: Validated request body.

    Returns:
        A dict shaped like CLICommandResponseSchema. Its ``result`` holds the
        crawl id, the snapshot ids with their count, and the submitted URLs.
    """
    from archivebox.cli.archivebox_add import add

    add_options = dict(
        urls=args.urls,
        tag=args.tag,
        depth=args.depth,
        update=args.update,
        index_only=args.index_only,
        overwrite=args.overwrite,
        plugins=args.plugins,
        parser=args.parser,
        bg=True,  # Always run in background for API calls
        created_by_id=request.user.pk,
    )
    crawl, snapshots = add(**add_options)

    ids = list(map(str, snapshots.values_list("id", flat=True)))
    summary = {
        "crawl_id": str(crawl.id),
        "num_snapshots": len(ids),
        "snapshot_ids": ids,
        "queued_urls": args.urls,
    }

    # Look up both buffers first, then render them, stdout before stderr.
    buffers = [getattr(request, attr, None) for attr in ("stdout", "stderr")]
    stdout_html, stderr_html = [
        ansi_to_html(buf.getvalue().strip()) if isinstance(buf, StringIO) else ""
        for buf in buffers
    ]

    return {
        "success": True,
        "errors": [],
        "result": summary,
        "result_format": "json",
        "stdout": stdout_html,
        "stderr": stderr_html,
    }
@router.post("/update", response=CLICommandResponseSchema, summary="archivebox update [args] [filter_patterns]")
def cli_update(request: HttpRequest, args: UpdateCommandSchema):
    """Run `archivebox update` with the submitted filters and return its result.

    Args:
        request: Incoming request. Its optional ``stdout`` / ``stderr``
            attributes are echoed back when they are ``StringIO`` buffers
            (presumably attached by upstream middleware — verify).
        args: Validated request body. A missing or empty ``filter_patterns``
            becomes ``[]``; a missing or empty ``filter_type`` becomes
            ``"substring"``.

    Returns:
        A dict shaped like CLICommandResponseSchema whose ``result`` is
        whatever ``update()`` returned. ``result_format`` is not set here, so
        the response schema's default applies.
    """
    from archivebox.cli.archivebox_update import update

    # Always hand update() a plain string. The fallback (and possibly the
    # schema default) is a str-mixin Enum member, whose str() is
    # "FilterTypeChoices.substring" rather than "substring".
    filter_type = args.filter_type or FilterTypeChoices.substring
    if isinstance(filter_type, Enum):
        filter_type = filter_type.value

    result = update(
        filter_patterns=args.filter_patterns or [],
        filter_type=filter_type,
        after=args.after,
        before=args.before,
        resume=args.resume,
        batch_size=args.batch_size,
        continuous=args.continuous,
    )

    stdout = getattr(request, "stdout", None)
    stderr = getattr(request, "stderr", None)
    return {
        "success": True,
        "errors": [],
        "result": result,
        "stdout": ansi_to_html(stdout.getvalue().strip()) if isinstance(stdout, StringIO) else "",
        "stderr": ansi_to_html(stderr.getvalue().strip()) if isinstance(stderr, StringIO) else "",
    }
@router.post("/schedule", response=CLICommandResponseSchema, summary="archivebox schedule [args] [import_path]")
def cli_schedule(request: HttpRequest, args: ScheduleCommandSchema):
    """Run `archivebox schedule` with the submitted options and return its result.

    Args:
        request: Incoming request. Its optional ``stdout`` / ``stderr``
            attributes are echoed back when they are ``StringIO`` buffers
            (presumably attached by upstream middleware — verify).
        args: Validated request body; every field is forwarded to
            ``schedule()`` under the same name.

    Returns:
        A dict shaped like CLICommandResponseSchema whose ``result`` is
        whatever ``schedule()`` returned, labelled as ``"json"``.
    """
    from archivebox.cli.archivebox_schedule import schedule

    schedule_options = dict(
        import_path=args.import_path,
        add=args.add,
        show=args.show,
        foreground=args.foreground,
        run_all=args.run_all,
        quiet=args.quiet,
        clear=args.clear,
        every=args.every,
        tag=args.tag,
        depth=args.depth,
        overwrite=args.overwrite,
        update=args.update,
    )
    outcome = schedule(**schedule_options)

    # Look up both buffers first, then render them, stdout before stderr.
    buffers = [getattr(request, attr, None) for attr in ("stdout", "stderr")]
    stdout_html, stderr_html = [
        ansi_to_html(buf.getvalue().strip()) if isinstance(buf, StringIO) else ""
        for buf in buffers
    ]

    return {
        "success": True,
        "errors": [],
        "result": outcome,
        "result_format": "json",
        "stdout": stdout_html,
        "stderr": stderr_html,
    }
@router.post("/search", response=CLICommandResponseSchema, summary="archivebox search [args] [filter_patterns]")
def cli_search(request: HttpRequest, args: ListCommandSchema):
    """Run `archivebox search` and return its output tagged with a format label.

    The label is chosen in priority order: ``"json"`` when ``as_json`` is
    truthy (the output is then parsed with ``json.loads``), otherwise
    ``"html"`` for ``as_html``, otherwise ``"csv"`` for ``as_csv``, otherwise
    ``"txt"``.

    Args:
        request: Incoming request. Its optional ``stdout`` / ``stderr``
            attributes are echoed back when they are ``StringIO`` buffers
            (presumably attached by upstream middleware — verify).
        args: Validated request body.

    Returns:
        A dict shaped like CLICommandResponseSchema.
    """
    from archivebox.cli.archivebox_search import search

    search_options = dict(
        filter_patterns=args.filter_patterns,
        filter_type=args.filter_type,
        status=args.status,
        after=args.after,
        before=args.before,
        sort=args.sort,
        csv=args.as_csv,
        json=args.as_json,
        html=args.as_html,
        with_headers=args.with_headers,
    )
    output = search(**search_options)

    if args.as_json:
        format_label = "json"
        output = json.loads(output)
    elif args.as_html:
        format_label = "html"
    elif args.as_csv:
        format_label = "csv"
    else:
        format_label = "txt"

    # Look up both buffers first, then render them, stdout before stderr.
    buffers = [getattr(request, attr, None) for attr in ("stdout", "stderr")]
    stdout_html, stderr_html = [
        ansi_to_html(buf.getvalue().strip()) if isinstance(buf, StringIO) else ""
        for buf in buffers
    ]

    return {
        "success": True,
        "errors": [],
        "result": output,
        "result_format": format_label,
        "stdout": stdout_html,
        "stderr": stderr_html,
    }
@router.post("/remove", response=CLICommandResponseSchema, summary="archivebox remove [args] [filter_patterns]")
def cli_remove(request: HttpRequest, args: RemoveCommandSchema):
    """Run `archivebox remove` on the matching snapshots and report what was removed.

    The matching snapshots are selected with ``get_snapshots()`` and their ids
    are collected before ``remove()`` is called. ``remove()`` is always
    invoked with ``yes=True``.

    Args:
        request: Incoming request. Its optional ``stdout`` / ``stderr``
            attributes are echoed back when they are ``StringIO`` buffers
            (presumably attached by upstream middleware — verify).
        args: Validated request body; a missing or empty ``filter_patterns``
            is treated as ``[]``.

    Returns:
        A dict shaped like CLICommandResponseSchema. Its ``result`` holds the
        removed ids with their count, and the ``Snapshot`` row count taken
        after the removal.
    """
    from archivebox.cli.archivebox_remove import remove
    from archivebox.cli.archivebox_search import get_snapshots
    from archivebox.core.models import Snapshot

    patterns = args.filter_patterns or []
    matching = get_snapshots(
        filter_patterns=patterns,
        filter_type=args.filter_type,
        after=args.after,
        before=args.before,
    )
    # Collect the ids up front, before remove() runs.
    removed_ids = list(map(str, matching.values_list("id", flat=True)))

    remove_options = dict(
        yes=True,  # no way to interactively ask for confirmation via API, so we force yes
        delete=args.delete,
        snapshots=matching,
        before=args.before,
        after=args.after,
        filter_type=args.filter_type,
        filter_patterns=patterns,
    )
    remove(**remove_options)

    summary = {
        "removed_count": len(removed_ids),
        "removed_snapshot_ids": removed_ids,
        "remaining_snapshots": Snapshot.objects.count(),
    }

    # Look up both buffers first, then render them, stdout before stderr.
    buffers = [getattr(request, attr, None) for attr in ("stdout", "stderr")]
    stdout_html, stderr_html = [
        ansi_to_html(buf.getvalue().strip()) if isinstance(buf, StringIO) else ""
        for buf in buffers
    ]

    return {
        "success": True,
        "errors": [],
        "result": summary,
        "result_format": "json",
        "stdout": stdout_html,
        "stderr": stderr_html,
    }