Improve scheduling, runtime paths, and API behavior

2026-04-06 07:47:53 +10:00 · 2026-03-15 18:31:56 -07:00
parent 7d42c6c8b5
commit 70c9358cf9
37 changed files with 1058 additions and 398 deletions
--- a/archivebox/api/tests.py
+++ b/archivebox/api/tests.py
@@ -1,30 +1,41 @@
-__package__ = 'archivebox.api'
+import os
+import django
+from io import StringIO
+from types import SimpleNamespace

-# from django.test import TestCase
-# from ninja.testing import TestClient
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
+django.setup()

-# from .routes_cli import router
+from django.contrib.auth.models import User
+from django.test import TestCase

-# class ArchiveBoxCLIAPITestCase(TestCase):
-#     def setUp(self):
-#         self.client = TestClient(router)
+from archivebox.api.v1_cli import ScheduleCommandSchema, cli_schedule
+from archivebox.crawls.models import CrawlSchedule

-#     def test_add_endpoint(self):
-#         response = self.client.post("/add", json={"urls": ["http://example.com"], "tag": "testTag1,testTag2"})
-#         self.assertEqual(response.status_code, 200)
-#         self.assertTrue(response.json()["success"])

-#     def test_remove_endpoint(self):
-#         response = self.client.post("/remove", json={"filter_patterns": ["http://example.com"]})
-#         self.assertEqual(response.status_code, 200)
-#         self.assertTrue(response.json()["success"])
+class CLIScheduleAPITests(TestCase):
+    def setUp(self):
+        self.user = User.objects.create_user(
+            username='api-user',
+            password='testpass123',
+            email='api@example.com',
+        )

-#     def test_update_endpoint(self):
-#         response = self.client.post("/update", json={})
-#         self.assertEqual(response.status_code, 200)
-#         self.assertTrue(response.json()["success"])
+    def test_schedule_api_creates_schedule(self):
+        request = SimpleNamespace(
+            user=self.user,
+            stdout=StringIO(),
+            stderr=StringIO(),
+        )
+        args = ScheduleCommandSchema(
+            every='daily',
+            import_path='https://example.com/feed.xml',
+            quiet=True,
+        )

-#     def test_list_all_endpoint(self):
-#         response = self.client.post("/list_all", json={})
-#         self.assertEqual(response.status_code, 200)
-#         self.assertTrue(response.json()["success"])
+        response = cli_schedule(request, args)
+
+        self.assertTrue(response['success'])
+        self.assertEqual(response['result_format'], 'json')
+        self.assertEqual(CrawlSchedule.objects.count(), 1)
+        self.assertEqual(len(response['result']['created_schedule_ids']), 1)
--- a/archivebox/api/v1_cli.py
+++ b/archivebox/api/v1_cli.py
@@ -74,6 +74,10 @@ class UpdateCommandSchema(Schema):
 class ScheduleCommandSchema(Schema):
    import_path: Optional[str] = None
    add: bool = False
+    show: bool = False
+    foreground: bool = False
+    run_all: bool = False
+    quiet: bool = False
    every: Optional[str] = None
    tag: str = ''
    depth: int = 0
@@ -172,6 +176,9 @@ def cli_schedule(request, args: ScheduleCommandSchema):
        import_path=args.import_path,
        add=args.add,
        show=args.show,
+        foreground=args.foreground,
+        run_all=args.run_all,
+        quiet=args.quiet,
        clear=args.clear,
        every=args.every,
        tag=args.tag,
@@ -184,6 +191,7 @@ def cli_schedule(request, args: ScheduleCommandSchema):
        "success": True,
        "errors": [],
        "result": result,
+        "result_format": "json",
        "stdout": ansi_to_html(request.stdout.getvalue().strip()),
        "stderr": ansi_to_html(request.stderr.getvalue().strip()),
    }
@@ -230,19 +238,37 @@ def cli_search(request, args: ListCommandSchema):
@router.post("/remove", response=CLICommandResponseSchema, summary='archivebox remove [args] [filter_patterns]')
 def cli_remove(request, args: RemoveCommandSchema):
    from archivebox.cli.archivebox_remove import remove
+    from archivebox.cli.archivebox_search import get_snapshots
+    from archivebox.core.models import Snapshot
+
+    snapshots_to_remove = get_snapshots(
+        filter_patterns=args.filter_patterns,
+        filter_type=args.filter_type,
+        after=args.after,
+        before=args.before,
+    )
+    removed_snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots_to_remove.values_list('id', flat=True)]
    
-    result = remove(
+    remove(
        yes=True,            # no way to interactively ask for confirmation via API, so we force yes
        delete=args.delete,
+        snapshots=snapshots_to_remove,
        before=args.before,
        after=args.after,
        filter_type=args.filter_type,
        filter_patterns=args.filter_patterns,
    )
+
+    result = {
+        "removed_count": len(removed_snapshot_ids),
+        "removed_snapshot_ids": removed_snapshot_ids,
+        "remaining_snapshots": Snapshot.objects.count(),
+    }
    return {
        "success": True,
        "errors": [],
        "result": result,
+        "result_format": "json",
        "stdout": ansi_to_html(request.stdout.getvalue().strip()),
        "stderr": ansi_to_html(request.stderr.getvalue().strip()),
    }
--- a/archivebox/api/v1_core.py
+++ b/archivebox/api/v1_core.py
@@ -547,7 +547,7 @@ def tags_add_to_snapshot(request, data: TagSnapshotRequestSchema):
        raise HttpError(400, 'Either tag_name or tag_id is required')

    # Add the tag to the snapshot
-    snapshot.tags.add(tag)
+    snapshot.tags.add(tag.pk)

    return {
        'success': True,
@@ -586,7 +586,7 @@ def tags_remove_from_snapshot(request, data: TagSnapshotRequestSchema):
        raise HttpError(400, 'Either tag_name or tag_id is required')

    # Remove the tag from the snapshot
-    snapshot.tags.remove(tag)
+    snapshot.tags.remove(tag.pk)

    return {
        'success': True,
--- a/archivebox/api/v1_machine.py
+++ b/archivebox/api/v1_machine.py
@@ -106,6 +106,13 @@ def get_machines(request, filters: MachineFilterSchema = Query(...)):
    return filters.filter(Machine.objects.all()).distinct()


+@router.get("/machine/current", response=MachineSchema, url_name="get_current_machine")
+def get_current_machine(request):
+    """Get the current machine."""
+    from archivebox.machine.models import Machine
+    return Machine.current()
+
+
@router.get("/machine/{machine_id}", response=MachineSchema, url_name="get_machine")
 def get_machine(request, machine_id: str):
    """Get a specific machine by ID."""
@@ -114,13 +121,6 @@ def get_machine(request, machine_id: str):
    return Machine.objects.get(Q(id__startswith=machine_id) | Q(hostname__iexact=machine_id))


-@router.get("/machine/current", response=MachineSchema, url_name="get_current_machine")
-def get_current_machine(request):
-    """Get the current machine."""
-    from archivebox.machine.models import Machine
-    return Machine.current()
-
-
 # ============================================================================


@@ -133,18 +133,18 @@ def get_current_machine(request):
 def get_binaries(request, filters: BinaryFilterSchema = Query(...)):
    """List all binaries."""
    from archivebox.machine.models import Binary
-    return filters.filter(Binary.objects.all().select_related('machine', 'dependency')).distinct()
+    return filters.filter(Binary.objects.all().select_related('machine')).distinct()


@router.get("/binary/{binary_id}", response=BinarySchema, url_name="get_binary")
 def get_binary(request, binary_id: str):
    """Get a specific binary by ID."""
    from archivebox.machine.models import Binary
-    return Binary.objects.select_related('machine', 'dependency').get(id__startswith=binary_id)
+    return Binary.objects.select_related('machine').get(id__startswith=binary_id)


@router.get("/binary/by-name/{name}", response=List[BinarySchema], url_name="get_binaries_by_name")
 def get_binaries_by_name(request, name: str):
    """Get all binaries with the given name."""
    from archivebox.machine.models import Binary
-    return list(Binary.objects.filter(name__iexact=name).select_related('machine', 'dependency'))
+    return list(Binary.objects.filter(name__iexact=name).select_related('machine'))