Improve scheduling, runtime paths, and API behavior

This commit is contained in:
Nick Sweeting
2026-03-15 18:31:56 -07:00
parent 7d42c6c8b5
commit 70c9358cf9
37 changed files with 1058 additions and 398 deletions

View File

@@ -1,30 +1,41 @@
__package__ = 'archivebox.api'
import os
import django
from io import StringIO
from types import SimpleNamespace
# from django.test import TestCase
# from ninja.testing import TestClient
# Point Django at the ArchiveBox settings module (only if the environment does not
# already name one) and initialise Django before the model imports that follow.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
django.setup()
# from .routes_cli import router
from django.contrib.auth.models import User
from django.test import TestCase
# class ArchiveBoxCLIAPITestCase(TestCase):
# def setUp(self):
# self.client = TestClient(router)
from archivebox.api.v1_cli import ScheduleCommandSchema, cli_schedule
from archivebox.crawls.models import CrawlSchedule
# def test_add_endpoint(self):
# response = self.client.post("/add", json={"urls": ["http://example.com"], "tag": "testTag1,testTag2"})
# self.assertEqual(response.status_code, 200)
# self.assertTrue(response.json()["success"])
# def test_remove_endpoint(self):
# response = self.client.post("/remove", json={"filter_patterns": ["http://example.com"]})
# self.assertEqual(response.status_code, 200)
# self.assertTrue(response.json()["success"])
class CLIScheduleAPITests(TestCase):
    """Tests for the `cli_schedule` API handler, invoked directly as a function."""

    def setUp(self):
        # User that the stand-in request below is attributed to.
        self.user = User.objects.create_user(
            username='api-user',
            password='testpass123',
            email='api@example.com',
        )

    def test_schedule_api_creates_schedule(self):
        # Minimal stand-in for the request object: the test supplies `user` plus
        # in-memory `stdout`/`stderr` buffers instead of a real HTTP request.
        request = SimpleNamespace(
            user=self.user,
            stdout=StringIO(),
            stderr=StringIO(),
        )
        args = ScheduleCommandSchema(
            every='daily',
            import_path='https://example.com/feed.xml',
            quiet=True,
        )

        response = cli_schedule(request, args)

        # The handler must report success as JSON, persist exactly one schedule,
        # and list exactly one created schedule id in its result payload.
        self.assertTrue(response['success'])
        self.assertEqual(response['result_format'], 'json')
        self.assertEqual(CrawlSchedule.objects.count(), 1)
        self.assertEqual(len(response['result']['created_schedule_ids']), 1)

View File

@@ -74,6 +74,10 @@ class UpdateCommandSchema(Schema):
class ScheduleCommandSchema(Schema):
import_path: Optional[str] = None
add: bool = False
show: bool = False
foreground: bool = False
run_all: bool = False
quiet: bool = False
every: Optional[str] = None
tag: str = ''
depth: int = 0
@@ -172,6 +176,9 @@ def cli_schedule(request, args: ScheduleCommandSchema):
import_path=args.import_path,
add=args.add,
show=args.show,
foreground=args.foreground,
run_all=args.run_all,
quiet=args.quiet,
clear=args.clear,
every=args.every,
tag=args.tag,
@@ -184,6 +191,7 @@ def cli_schedule(request, args: ScheduleCommandSchema):
"success": True,
"errors": [],
"result": result,
"result_format": "json",
"stdout": ansi_to_html(request.stdout.getvalue().strip()),
"stderr": ansi_to_html(request.stderr.getvalue().strip()),
}
@@ -230,19 +238,37 @@ def cli_search(request, args: ListCommandSchema):
@router.post("/remove", response=CLICommandResponseSchema, summary='archivebox remove [args] [filter_patterns]')
def cli_remove(request, args: RemoveCommandSchema):
    # API wrapper around `archivebox remove`: resolve the snapshots selected by the
    # request filters, remove them, and return a JSON summary of what was removed.
    from archivebox.cli.archivebox_remove import remove
    from archivebox.cli.archivebox_search import get_snapshots
    from archivebox.core.models import Snapshot

    snapshots_to_remove = get_snapshots(
        filter_patterns=args.filter_patterns,
        filter_type=args.filter_type,
        after=args.after,
        before=args.before,
    )
    # Read the matching ids before remove() runs so the response can still list them.
    removed_snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots_to_remove.values_list('id', flat=True)]

    # remove()'s return value is not used; the response payload is assembled below.
    remove(
        yes=True,  # no way to interactively ask for confirmation via API, so we force yes
        delete=args.delete,
        snapshots=snapshots_to_remove,
        before=args.before,
        after=args.after,
        filter_type=args.filter_type,
        filter_patterns=args.filter_patterns,
    )

    result = {
        "removed_count": len(removed_snapshot_ids),
        "removed_snapshot_ids": removed_snapshot_ids,
        "remaining_snapshots": Snapshot.objects.count(),
    }
    return {
        "success": True,
        "errors": [],
        "result": result,
        "result_format": "json",
        "stdout": ansi_to_html(request.stdout.getvalue().strip()),
        "stderr": ansi_to_html(request.stderr.getvalue().strip()),
    }

View File

@@ -547,7 +547,7 @@ def tags_add_to_snapshot(request, data: TagSnapshotRequestSchema):
raise HttpError(400, 'Either tag_name or tag_id is required')
# Add the tag to the snapshot
snapshot.tags.add(tag)
snapshot.tags.add(tag.pk)
return {
'success': True,
@@ -586,7 +586,7 @@ def tags_remove_from_snapshot(request, data: TagSnapshotRequestSchema):
raise HttpError(400, 'Either tag_name or tag_id is required')
# Remove the tag from the snapshot
snapshot.tags.remove(tag)
snapshot.tags.remove(tag.pk)
return {
'success': True,

View File

@@ -106,6 +106,13 @@ def get_machines(request, filters: MachineFilterSchema = Query(...)):
return filters.filter(Machine.objects.all()).distinct()
@router.get("/machine/current", response=MachineSchema, url_name="get_current_machine")
def get_current_machine(request):
    """Get the current machine."""
    # NOTE(review): this literal route is declared ahead of `/machine/{machine_id}`,
    # presumably so that "current" is not captured as a machine_id — confirm.
    # The model is imported inside the handler, as the other handlers here do.
    from archivebox.machine.models import Machine

    current_machine = Machine.current()
    return current_machine
@router.get("/machine/{machine_id}", response=MachineSchema, url_name="get_machine")
def get_machine(request, machine_id: str):
"""Get a specific machine by ID."""
@@ -114,13 +121,6 @@ def get_machine(request, machine_id: str):
return Machine.objects.get(Q(id__startswith=machine_id) | Q(hostname__iexact=machine_id))
# NOTE(review): second `/machine/current` handler with the same function name and
# url_name as the one defined before `/machine/{machine_id}`. This looks like the
# pre-move copy shown by the diff — confirm only the earlier definition is kept.
@router.get("/machine/current", response=MachineSchema, url_name="get_current_machine")
def get_current_machine(request):
"""Get the current machine."""
from archivebox.machine.models import Machine
return Machine.current()
# ============================================================================
@@ -133,18 +133,18 @@ def get_current_machine(request):
def get_binaries(request, filters: BinaryFilterSchema = Query(...)):
    """List all binaries."""
    from archivebox.machine.models import Binary

    # Join only the `machine` relation, apply the query filters, then de-duplicate.
    binaries = Binary.objects.all().select_related('machine')
    return filters.filter(binaries).distinct()
@router.get("/binary/{binary_id}", response=BinarySchema, url_name="get_binary")
def get_binary(request, binary_id: str):
    """Get a specific binary by ID."""
    from archivebox.machine.models import Binary

    # `binary_id` is matched as a prefix of the stored id (id__startswith), so a
    # shortened id is accepted; .get() raises unless exactly one row matches.
    return Binary.objects.select_related('machine').get(id__startswith=binary_id)
@router.get("/binary/by-name/{name}", response=List[BinarySchema], url_name="get_binaries_by_name")
def get_binaries_by_name(request, name: str):
    """Get all binaries with the given name."""
    from archivebox.machine.models import Binary

    # Case-insensitive exact match on the name; the result is returned as a list.
    matching_binaries = Binary.objects.filter(name__iexact=name).select_related('machine')
    return list(matching_binaries)