mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
285 lines
8.6 KiB
Python
285 lines
8.6 KiB
Python
__package__ = "archivebox.api"
|
|
|
|
import json
|
|
from io import StringIO
|
|
from typing import Any
|
|
from enum import Enum
|
|
|
|
from django.http import HttpRequest
|
|
|
|
from ninja import Router, Schema
|
|
|
|
from archivebox.misc.util import ansi_to_html
|
|
from archivebox.config.common import ARCHIVING_CONFIG
|
|
|
|
|
|
# from .auth import API_AUTH_METHODS
|
|
|
|
# router for API that exposes archivebox cli subcommands as REST endpoints
|
|
# Shared Router instance: every endpoint function below registers itself on it via @router.post(...)
router = Router(tags=["ArchiveBox CLI Sub-Commands"])
|
|
|
|
|
|
# Schemas
|
|
|
|
# Union of value shapes allowed in CLICommandResponseSchema.result
JSONType = list[Any] | dict[str, Any] | bool | int | str | None
|
|
|
|
|
|
class CLICommandResponseSchema(Schema):
    """Response envelope declared as ``response=`` by every endpoint in this module."""

    success: bool
    errors: list[str]
    # Command-specific payload; see ``result_format`` for how to interpret it.
    result: JSONType
    # Used when a handler does not set it explicitly.
    result_format: str = "str"
    # The handlers fill these from ``request.stdout`` / ``request.stderr`` when those
    # are StringIO buffers (converted with ansi_to_html), and with "" otherwise.
    stdout: str
    stderr: str
|
|
|
|
|
|
class FilterTypeChoices(str, Enum):
    """String-valued enum of filter modes.

    Members are used as the ``filter_type`` defaults of the update, list and
    remove command schemas. Because the enum mixes in ``str``, each member
    compares equal to its plain string value.
    """

    exact = "exact"
    substring = "substring"
    regex = "regex"
    domain = "domain"
    tag = "tag"
    timestamp = "timestamp"
|
|
|
|
|
|
class StatusChoices(str, Enum):
    """String-valued enum of the values accepted by ``ListCommandSchema.status``.

    Because the enum mixes in ``str``, each member compares equal to its
    plain string value.
    """

    indexed = "indexed"
    archived = "archived"
    unarchived = "unarchived"
    present = "present"
    valid = "valid"
    invalid = "invalid"
    duplicate = "duplicate"
    orphaned = "orphaned"
    corrupted = "corrupted"
    unrecognized = "unrecognized"
|
|
|
|
|
|
class AddCommandSchema(Schema):
    """Request body for the ``/add`` endpoint; each field is forwarded to ``add()`` by ``cli_add``."""

    # The only required field; also echoed back as "queued_urls" in the response.
    urls: list[str]
    tag: str = ""
    depth: int = 0
    parser: str = "auto"
    plugins: str = ""
    # Default to the opposite of ARCHIVING_CONFIG.ONLY_NEW (evaluated once, when the class body runs)
    update: bool = not ARCHIVING_CONFIG.ONLY_NEW
    overwrite: bool = False
    index_only: bool = False
|
|
|
|
|
|
class UpdateCommandSchema(Schema):
    """Request body for the ``/update`` endpoint; each field is forwarded to ``update()`` by ``cli_update``."""

    resume: str | None = None
    # Numeric lower/upper bounds passed through unchanged as ``after`` / ``before``.
    after: float | None = 0
    before: float | None = 999999999999999
    # Typed as a plain string; ``cli_update`` substitutes the substring mode when this is falsy.
    filter_type: str | None = FilterTypeChoices.substring
    # ``cli_update`` substitutes an empty list when this is falsy.
    filter_patterns: list[str] | None = ["https://example.com"]
    batch_size: int = 100
    continuous: bool = False
|
|
|
|
|
|
class ScheduleCommandSchema(Schema):
    """Request body for the ``/schedule`` endpoint; each field is forwarded to ``schedule()`` by ``cli_schedule``."""

    import_path: str | None = None

    # Boolean switches, all off by default.
    add: bool = False
    show: bool = False
    foreground: bool = False
    run_all: bool = False
    quiet: bool = False

    every: str | None = None
    tag: str = ""
    depth: int = 0
    overwrite: bool = False
    # Same default expression as AddCommandSchema.update: the opposite of ARCHIVING_CONFIG.ONLY_NEW.
    update: bool = not ARCHIVING_CONFIG.ONLY_NEW
    clear: bool = False
|
|
|
|
|
|
class ListCommandSchema(Schema):
    """Request body for the ``/search`` endpoint; each field is forwarded to ``search()`` by ``cli_search``."""

    filter_patterns: list[str] | None = ["https://example.com"]
    filter_type: str = FilterTypeChoices.substring
    status: StatusChoices = StatusChoices.indexed
    # Numeric lower/upper bounds passed through unchanged as ``after`` / ``before``.
    after: float | None = 0
    before: float | None = 999999999999999
    sort: str = "bookmarked_at"

    # Output selectors. ``as_json`` and ``as_csv`` are both set by default;
    # ``cli_search`` checks as_json first, then as_html, then as_csv when
    # deciding the reported ``result_format``.
    as_json: bool = True
    as_html: bool = False
    as_csv: str | None = "timestamp,url"
    with_headers: bool = False
|
|
|
|
|
|
class RemoveCommandSchema(Schema):
    """Request body for the ``/remove`` endpoint; consumed by ``cli_remove``."""

    # Forwarded to ``remove()`` as ``delete=``; note that it defaults to True.
    delete: bool = True
    # Numeric lower/upper bounds passed through unchanged as ``after`` / ``before``.
    after: float | None = 0
    before: float | None = 999999999999999
    filter_type: str = FilterTypeChoices.exact
    # ``cli_remove`` substitutes an empty list when this is falsy.
    filter_patterns: list[str] | None = ["https://example.com"]
|
|
|
|
|
|
@router.post("/add", response=CLICommandResponseSchema, summary="archivebox add [args] [urls]")
def cli_add(request: HttpRequest, args: AddCommandSchema):
    """Run ``archivebox add`` with the posted arguments and report what was queued.

    Returns a dict shaped like CLICommandResponseSchema whose ``result`` holds
    the crawl id, the ids of the snapshots returned by ``add()``, their count,
    and the submitted URLs.
    """
    # Imported lazily, inside the handler, like the other CLI endpoints in this module.
    from archivebox.cli.archivebox_add import add

    crawl, snapshots = add(
        urls=args.urls,
        tag=args.tag,
        depth=args.depth,
        update=args.update,
        index_only=args.index_only,
        overwrite=args.overwrite,
        plugins=args.plugins,
        parser=args.parser,
        bg=True,  # Always run in background for API calls
        created_by_id=request.user.pk,
    )

    snapshot_ids = list(map(str, snapshots.values_list("id", flat=True)))
    payload = {
        "crawl_id": str(crawl.id),
        "num_snapshots": len(snapshot_ids),
        "snapshot_ids": snapshot_ids,
        "queued_urls": args.urls,
    }

    # Captured output is only reported when the request carries StringIO buffers.
    out_buf = getattr(request, "stdout", None)
    err_buf = getattr(request, "stderr", None)
    stdout_html = ansi_to_html(out_buf.getvalue().strip()) if isinstance(out_buf, StringIO) else ""
    stderr_html = ansi_to_html(err_buf.getvalue().strip()) if isinstance(err_buf, StringIO) else ""

    return {
        "success": True,
        "errors": [],
        "result": payload,
        "result_format": "json",
        "stdout": stdout_html,
        "stderr": stderr_html,
    }
|
|
|
|
|
|
@router.post("/update", response=CLICommandResponseSchema, summary="archivebox update [args] [filter_patterns]")
def cli_update(request: HttpRequest, args: UpdateCommandSchema):
    """Run ``archivebox update`` with the posted arguments and return its result.

    Returns a dict shaped like CLICommandResponseSchema. ``result_format`` is
    not set here, so it is left to the response schema's default.
    """
    # Imported lazily, inside the handler, like the other CLI endpoints in this module.
    from archivebox.cli.archivebox_update import update

    # Falsy filter settings fall back to "no patterns" / substring matching.
    patterns = args.filter_patterns or []
    match_mode = args.filter_type or FilterTypeChoices.substring

    outcome = update(
        filter_patterns=patterns,
        filter_type=match_mode,
        after=args.after,
        before=args.before,
        resume=args.resume,
        batch_size=args.batch_size,
        continuous=args.continuous,
    )

    # Captured output is only reported when the request carries StringIO buffers.
    out_buf = getattr(request, "stdout", None)
    err_buf = getattr(request, "stderr", None)
    stdout_html = ansi_to_html(out_buf.getvalue().strip()) if isinstance(out_buf, StringIO) else ""
    stderr_html = ansi_to_html(err_buf.getvalue().strip()) if isinstance(err_buf, StringIO) else ""

    return {
        "success": True,
        "errors": [],
        "result": outcome,
        "stdout": stdout_html,
        "stderr": stderr_html,
    }
|
|
|
|
|
|
@router.post("/schedule", response=CLICommandResponseSchema, summary="archivebox schedule [args] [import_path]")
def cli_schedule(request: HttpRequest, args: ScheduleCommandSchema):
    """Run ``archivebox schedule`` with the posted arguments and return its result.

    Returns a dict shaped like CLICommandResponseSchema with ``result_format``
    set to "json".
    """
    # Imported lazily, inside the handler, like the other CLI endpoints in this module.
    from archivebox.cli.archivebox_schedule import schedule

    outcome = schedule(
        import_path=args.import_path,
        add=args.add,
        show=args.show,
        foreground=args.foreground,
        run_all=args.run_all,
        quiet=args.quiet,
        clear=args.clear,
        every=args.every,
        tag=args.tag,
        depth=args.depth,
        overwrite=args.overwrite,
        update=args.update,
    )

    # Captured output is only reported when the request carries StringIO buffers.
    out_buf = getattr(request, "stdout", None)
    err_buf = getattr(request, "stderr", None)
    stdout_html = ansi_to_html(out_buf.getvalue().strip()) if isinstance(out_buf, StringIO) else ""
    stderr_html = ansi_to_html(err_buf.getvalue().strip()) if isinstance(err_buf, StringIO) else ""

    return {
        "success": True,
        "errors": [],
        "result": outcome,
        "result_format": "json",
        "stdout": stdout_html,
        "stderr": stderr_html,
    }
|
|
|
|
|
|
@router.post("/search", response=CLICommandResponseSchema, summary="archivebox search [args] [filter_patterns]")
def cli_search(request: HttpRequest, args: ListCommandSchema):
    """Run ``archivebox search`` with the posted arguments and return its output.

    Returns a dict shaped like CLICommandResponseSchema. ``result_format`` is
    "json", "html", "csv" or "txt", chosen from the as_json / as_html / as_csv
    flags in that order of precedence. For JSON the output of ``search()`` is
    parsed with ``json.loads`` before being returned.
    """
    # Imported lazily, inside the handler, like the other CLI endpoints in this module.
    from archivebox.cli.archivebox_search import search

    result = search(
        filter_patterns=args.filter_patterns,
        filter_type=args.filter_type,
        status=args.status,
        after=args.after,
        before=args.before,
        sort=args.sort,
        csv=args.as_csv,
        json=args.as_json,
        html=args.as_html,
        with_headers=args.with_headers,
    )

    # First matching flag wins; plain text is the fallback.
    if args.as_json:
        result_format = "json"
        result = json.loads(result)
    elif args.as_html:
        result_format = "html"
    elif args.as_csv:
        result_format = "csv"
    else:
        result_format = "txt"

    # Captured output is only reported when the request carries StringIO buffers.
    out_buf = getattr(request, "stdout", None)
    err_buf = getattr(request, "stderr", None)
    stdout_html = ansi_to_html(out_buf.getvalue().strip()) if isinstance(out_buf, StringIO) else ""
    stderr_html = ansi_to_html(err_buf.getvalue().strip()) if isinstance(err_buf, StringIO) else ""

    return {
        "success": True,
        "errors": [],
        "result": result,
        "result_format": result_format,
        "stdout": stdout_html,
        "stderr": stderr_html,
    }
|
|
|
|
|
|
@router.post("/remove", response=CLICommandResponseSchema, summary="archivebox remove [args] [filter_patterns]")
def cli_remove(request: HttpRequest, args: RemoveCommandSchema):
    """Run ``archivebox remove`` on the snapshots matching the posted filters.

    Returns a dict shaped like CLICommandResponseSchema whose ``result`` holds
    the ids and count of the snapshots selected for removal, plus the total
    Snapshot count after ``remove()`` has run.
    """
    # Imported lazily, inside the handler, like the other CLI endpoints in this module.
    from archivebox.cli.archivebox_remove import remove
    from archivebox.cli.archivebox_search import get_snapshots
    from archivebox.core.models import Snapshot

    patterns = args.filter_patterns or []
    matching = get_snapshots(
        filter_patterns=patterns,
        filter_type=args.filter_type,
        after=args.after,
        before=args.before,
    )
    # Collect the ids up front, before remove() is called on the same selection.
    removed_ids = list(map(str, matching.values_list("id", flat=True)))

    remove(
        yes=True,  # no way to interactively ask for confirmation via API, so we force yes
        delete=args.delete,
        snapshots=matching,
        before=args.before,
        after=args.after,
        filter_type=args.filter_type,
        filter_patterns=patterns,
    )

    summary = {
        "removed_count": len(removed_ids),
        "removed_snapshot_ids": removed_ids,
        "remaining_snapshots": Snapshot.objects.count(),
    }

    # Captured output is only reported when the request carries StringIO buffers.
    out_buf = getattr(request, "stdout", None)
    err_buf = getattr(request, "stderr", None)
    stdout_html = ansi_to_html(out_buf.getvalue().strip()) if isinstance(out_buf, StringIO) else ""
    stderr_html = ansi_to_html(err_buf.getvalue().strip()) if isinstance(err_buf, StringIO) else ""

    return {
        "success": True,
        "errors": [],
        "result": summary,
        "result_format": "json",
        "stdout": stdout_html,
        "stderr": stderr_html,
    }
|