Tighten API typing and add return values

2026-04-06 07:47:53 +10:00 · 2026-03-15 19:24:54 -07:00
parent 95a105feb9
commit 5381f7584c
6 changed files with 20 additions and 20 deletions
--- a/archivebox/api/tests.py
+++ b/archivebox/api/tests.py
@@ -25,8 +25,8 @@ class CLIScheduleAPITests(TestCase):
    def test_schedule_api_creates_schedule(self):
        request = RequestFactory().post('/api/v1/cli/schedule')
        request.user = self.user
-        request.stdout = StringIO()
-        request.stderr = StringIO()
+        setattr(request, 'stdout', StringIO())
+        setattr(request, 'stderr', StringIO())
        args = ScheduleCommandSchema(
            every='daily',
            import_path='https://example.com/feed.xml',
--- a/archivebox/api/v1_cli.py
+++ b/archivebox/api/v1_cli.py
@@ -112,7 +112,7 @@ class RemoveCommandSchema(Schema):
 def cli_add(request: HttpRequest, args: AddCommandSchema):
    from archivebox.cli.archivebox_add import add

-    result = add(
+    crawl, snapshots = add(
        urls=args.urls,
        tag=args.tag,
        depth=args.depth,
@@ -125,9 +125,9 @@ def cli_add(request: HttpRequest, args: AddCommandSchema):
        created_by_id=request.user.pk,
    )

-    snapshot_ids = [str(snapshot_id) for snapshot_id in result.values_list('id', flat=True)]
+    snapshot_ids = [str(snapshot_id) for snapshot_id in snapshots.values_list('id', flat=True)]
    result_payload = {
-        "crawl_id": getattr(result, "crawl_id", None),
+        "crawl_id": str(crawl.id),
        "num_snapshots": len(snapshot_ids),
        "snapshot_ids": snapshot_ids,
        "queued_urls": args.urls,
--- a/archivebox/api/v1_core.py
+++ b/archivebox/api/v1_core.py
@@ -427,7 +427,7 @@ def get_any(request: HttpRequest, id: str):
        try:
            response = getter(request, id)
            if isinstance(response, Model):
-                return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.id}?{request.META['QUERY_STRING']}")
+                return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.pk}?{request.META['QUERY_STRING']}")
        except Exception:
            pass

@@ -435,7 +435,7 @@ def get_any(request: HttpRequest, id: str):
        from archivebox.api.v1_crawls import get_crawl
        response = get_crawl(request, id)
        if isinstance(response, Model):
-            return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.id}?{request.META['QUERY_STRING']}")
+            return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.pk}?{request.META['QUERY_STRING']}")
    except Exception:
        pass

--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@@ -21,6 +21,7 @@ from archivebox.config.permissions import USER, HOSTNAME

 if TYPE_CHECKING:
    from archivebox.core.models import Snapshot
+    from archivebox.crawls.models import Crawl


 def _collect_input_urls(args: tuple[str, ...]) -> list[str]:
@@ -53,7 +54,7 @@ def add(urls: str | list[str],
        update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
        index_only: bool=False,
        bg: bool=False,
-        created_by_id: int | None=None) -> QuerySet['Snapshot']:
+        created_by_id: int | None=None) -> tuple['Crawl', QuerySet['Snapshot']]:
    """Add a new URL or list of URLs to your archive.

    The flow is:
@@ -145,7 +146,7 @@ def add(urls: str | list[str],
            if tag:
                snapshot.save_tags(tag.split(','))
            snapshot.ensure_crawl_symlink()
-        return crawl.snapshot_set.all()
+        return crawl, crawl.snapshot_set.all()

    # 5. Start the orchestrator to process the queue
    #    The orchestrator will:
@@ -210,8 +211,7 @@ def add(urls: str | list[str],

    # 6. Return the list of Snapshots in this crawl
    snapshots = crawl.snapshot_set.all()
-    snapshots.crawl_id = str(crawl.id)
-    return snapshots
+    return crawl, snapshots


@click.command()
--- a/archivebox/misc/jsonl.py
+++ b/archivebox/misc/jsonl.py
@@ -85,21 +85,21 @@ def read_stdin(stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]:
    Yields parsed records as dicts.
    Supports both JSONL format and plain URLs (one per line).
    """
-    stream = stream or sys.stdin
+    active_stream: TextIO = sys.stdin if stream is None else stream

    # Don't block if stdin is a tty with no input
-    if stream.isatty():
+    if active_stream.isatty():
        return

    try:
-        ready, _, _ = select.select([stream], [], [], 0)
+        ready, _, _ = select.select([active_stream], [], [], 0)
    except (OSError, ValueError):
-        ready = [stream]
+        ready = [active_stream]

    if not ready:
        return

-    for line in stream:
+    for line in active_stream:
        record = parse_line(line)
        if record:
            yield record
@@ -142,9 +142,9 @@ def write_record(record: Dict[str, Any], stream: Optional[TextIO] = None) -> Non
    """
    Write a single JSONL record to stdout (or provided stream).
    """
-    stream = stream or sys.stdout
-    stream.write(json.dumps(record) + '\n')
-    stream.flush()
+    active_stream: TextIO = sys.stdout if stream is None else stream
+    active_stream.write(json.dumps(record) + '\n')
+    active_stream.flush()


 def write_records(records: Iterator[Dict[str, Any]], stream: Optional[TextIO] = None) -> int:
--- a/archivebox/workers/tasks.py
+++ b/archivebox/workers/tasks.py
@@ -27,7 +27,7 @@ def bg_add(add_kwargs: dict) -> int:
    add_kwargs = add_kwargs.copy()
    add_kwargs['bg'] = True

-    result = add(**add_kwargs)
+    _, result = add(**add_kwargs)

    return len(result) if result else 0