From 311e4340ec58c198114c0a4986cf6fba158585c8 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Sun, 15 Mar 2026 19:04:13 -0700
Subject: [PATCH] Fix add CLI input handling and lint regressions

---
 Dockerfile                                    |  4 +--
 archivebox/cli/archivebox_add.py              | 26 +++++++++++++-
 archivebox/cli/archivebox_mcp.py              |  2 +-
 archivebox/config/configset.py                | 25 +++++++-------
 archivebox/misc/jsonl.py                      | 10 +++++-
 archivebox/tests/test_cli_add.py              |  8 ++---
 archivebox/tests/test_persona_runtime.py      | 34 +++++++++++++++++++
 archivebox/workers/orchestrator.py            | 12 ++++---
 .../workers/tests/test_scheduled_crawls.py    | 14 ++++++++
 pyproject.toml                                |  6 ++--
 10 files changed, 112 insertions(+), 29 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 71f39fac..93a46bb6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -216,7 +216,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
 
 
 # Set up uv and main app /venv
-COPY --from=ghcr.io/astral-sh/uv:0.6 /uv /uvx /bin/
+RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR=/bin sh
 ENV UV_COMPILE_BYTECODE=1 \
     UV_PYTHON_PREFERENCE=managed \
     UV_PYTHON_INSTALL_DIR=/opt/uv/python \
@@ -231,7 +231,7 @@ ENV VIRTUAL_ENV=/venv PATH="/venv/bin:$PATH"
 RUN uv pip install setuptools pip \
     && ( \
         which python3 && python3 --version \
-        && which uv && uv version \
+        && which uv && uv self version \
         && uv python find --system && uv python find \
         && echo -e '\n\n' \
     ) | tee -a /VERSION.txt
diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py
index a1eecf79..ef345d8b 100644
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@@ -23,6 +23,25 @@ if TYPE_CHECKING:
     from archivebox.core.models import Snapshot
 
 
+def _collect_input_urls(args: tuple[str, ...]) -> list[str]:
+    from archivebox.misc.jsonl import read_args_or_stdin
+
+    urls: list[str] = []
+    for record in read_args_or_stdin(args):
+        url = record.get('url')
+        if isinstance(url, str) and url:
+            urls.append(url)
+
+        urls_field = record.get('urls')
+        if isinstance(urls_field, str):
+            for line in urls_field.splitlines():
+                line = line.strip()
+                if line and not line.startswith('#'):
+                    urls.append(line)
+
+    return urls
+
+
 @enforce_types
 def add(urls: str | list[str],
         depth: int | str=0,
@@ -210,7 +229,12 @@ def add(urls: str | list[str],
 def main(**kwargs):
     """Add a new URL or list of URLs to your archive"""
 
-    add(**kwargs)
+    raw_urls = kwargs.pop('urls')
+    urls = _collect_input_urls(raw_urls)
+    if not urls:
+        raise click.UsageError('No URLs provided. Pass URLs as arguments or via stdin.')
+
+    add(urls=urls, **kwargs)
 
 
 if __name__ == '__main__':
diff --git a/archivebox/cli/archivebox_mcp.py b/archivebox/cli/archivebox_mcp.py
index fbc153c4..07b3825f 100644
--- a/archivebox/cli/archivebox_mcp.py
+++ b/archivebox/cli/archivebox_mcp.py
@@ -32,7 +32,7 @@ def mcp():
         {"jsonrpc":"2.0","id":2,"method":"tools/list","params":{}}
     """
 
-    from mcp.server import run_mcp_server
+    from archivebox.mcp.server import run_mcp_server
 
     # Run the stdio server (blocks until stdin closes)
     run_mcp_server()
diff --git a/archivebox/config/configset.py b/archivebox/config/configset.py
index 39b8f51a..c54eb2bc 100644
--- a/archivebox/config/configset.py
+++ b/archivebox/config/configset.py
@@ -168,21 +168,20 @@ def get_config(
         user = crawl.created_by
 
     if persona is None and crawl is not None:
-        try:
-            from archivebox.personas.models import Persona
-
-            persona_id = getattr(crawl, "persona_id", None)
-            if persona_id:
-                persona = Persona.objects.filter(id=persona_id).first()
+        from archivebox.personas.models import Persona
 
+        persona_id = getattr(crawl, "persona_id", None)
+        if persona_id:
+            persona = Persona.objects.filter(id=persona_id).first()
             if persona is None:
-                crawl_config = getattr(crawl, "config", None) or {}
-                default_persona_name = crawl_config.get("DEFAULT_PERSONA")
-                if default_persona_name:
-                    persona, _ = Persona.objects.get_or_create(name=str(default_persona_name).strip() or "Default")
-                    persona.ensure_dirs()
-        except Exception:
-            pass
+                raise Persona.DoesNotExist(f'Crawl {getattr(crawl, "id", None)} references missing Persona {persona_id}')
+
+        if persona is None:
+            crawl_config = getattr(crawl, "config", None) or {}
+            default_persona_name = str(crawl_config.get("DEFAULT_PERSONA") or "").strip()
+            if default_persona_name:
+                persona, _ = Persona.objects.get_or_create(name=default_persona_name or "Default")
+                persona.ensure_dirs()
     from archivebox.config.constants import CONSTANTS
     from archivebox.config.common import (
         SHELL_CONFIG,
diff --git a/archivebox/misc/jsonl.py b/archivebox/misc/jsonl.py
index df1163ab..826c7368 100644
--- a/archivebox/misc/jsonl.py
+++ b/archivebox/misc/jsonl.py
@@ -24,6 +24,7 @@ __package__ = 'archivebox.misc'
 
 import sys
 import json
+import select
 from typing import Iterator, Dict, Any, Optional, TextIO
 from pathlib import Path
 
@@ -90,6 +91,14 @@ def read_stdin(stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]:
     if stream.isatty():
         return
 
+    try:
+        ready, _, _ = select.select([stream], [], [], 0)
+    except (OSError, ValueError):
+        ready = [stream]
+
+    if not ready:
+        return
+
     for line in stream:
         record = parse_line(line)
         if record:
@@ -149,4 +158,3 @@ def write_records(records: Iterator[Dict[str, Any]], stream: Optional[TextIO] =
         write_record(record, stream)
         count += 1
     return count
-
diff --git a/archivebox/tests/test_cli_add.py b/archivebox/tests/test_cli_add.py
index a34a4879..11abca82 100644
--- a/archivebox/tests/test_cli_add.py
+++ b/archivebox/tests/test_cli_add.py
@@ -90,8 +90,8 @@ def test_add_multiple_urls_single_command(tmp_path, process, disable_extractors_
 def test_add_from_file(tmp_path, process, disable_extractors_dict):
     """Test adding URLs from a file.
 
-    With --index-only, this creates a snapshot for the file itself, not the URLs inside.
-    To get snapshots for the URLs inside, you need to run without --index-only so parsers run.
+    The add command should treat a file argument as URL input and create snapshots
+    for each URL it contains.
     """
     os.chdir(tmp_path)
 
@@ -113,9 +113,9 @@ def test_add_from_file(tmp_path, process, disable_extractors_dict):
     snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
     conn.close()
 
-    # With --index-only, creates 1 snapshot for the file itself
+    # The file is parsed into two input URLs.
     assert crawl_count == 1
-    assert snapshot_count == 1
+    assert snapshot_count == 2
 
 
 def test_add_with_depth_0_flag(tmp_path, process, disable_extractors_dict):
diff --git a/archivebox/tests/test_persona_runtime.py b/archivebox/tests/test_persona_runtime.py
index 09773d18..6f4f35f5 100644
--- a/archivebox/tests/test_persona_runtime.py
+++ b/archivebox/tests/test_persona_runtime.py
@@ -144,3 +144,37 @@ def test_crawl_resolve_persona_raises_for_missing_persona_id(initialized_archive
     payload = json.loads(stdout.strip().splitlines()[-1])
     assert payload['raised'] is True
     assert 'references missing Persona' in payload['message']
+
+
+def test_get_config_raises_for_missing_persona_id(initialized_archive):
+    script = textwrap.dedent(
+        """
+        import json
+        import os
+        from uuid import uuid4
+
+        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
+        import django
+        django.setup()
+
+        from archivebox.config.configset import get_config
+        from archivebox.crawls.models import Crawl
+        from archivebox.personas.models import Persona
+
+        crawl = Crawl.objects.create(urls='https://example.com', persona_id=uuid4())
+
+        try:
+            get_config(crawl=crawl)
+        except Persona.DoesNotExist as err:
+            print(json.dumps({'raised': True, 'message': str(err)}))
+        else:
+            raise SystemExit('get_config unexpectedly succeeded')
+        """
+    )
+
+    stdout, stderr, code = run_python_cwd(script, cwd=initialized_archive, timeout=60)
+    assert code == 0, stderr
+
+    payload = json.loads(stdout.strip().splitlines()[-1])
+    assert payload['raised'] is True
+    assert 'references missing Persona' in payload['message']
diff --git a/archivebox/workers/orchestrator.py b/archivebox/workers/orchestrator.py
index 9720cde4..f0cb658c 100644
--- a/archivebox/workers/orchestrator.py
+++ b/archivebox/workers/orchestrator.py
@@ -339,7 +339,7 @@ class Orchestrator:
         queue_sizes = {}
 
         self._enforce_hard_timeouts()
-        self._materialize_due_schedules()
+        materialized_schedule_count = self._materialize_due_schedules()
 
         # Check Binary queue
         machine = Machine.current()
@@ -393,7 +393,7 @@ class Orchestrator:
 
         # CRITICAL: Only spawn CrawlWorkers if binary queue is empty AND no BinaryWorkers running
         # This ensures all binaries are installed before snapshots start processing
-        if binary_count == 0 and running_binary_workers == 0:
+        if binary_count == 0 and running_binary_workers == 0 and materialized_schedule_count == 0:
             # Spawn CrawlWorker if needed
             if self.should_spawn_worker(CrawlWorker, crawl_count):
                 # Claim next crawl
@@ -406,20 +406,24 @@ class Orchestrator:
     def _should_process_schedules(self) -> bool:
         return (not self.exit_on_idle) and (self.crawl_id is None)
 
-    def _materialize_due_schedules(self) -> None:
+    def _materialize_due_schedules(self) -> int:
         if not self._should_process_schedules():
-            return
+            return 0
 
         from archivebox.crawls.models import CrawlSchedule
 
         now = timezone.now()
         due_schedules = CrawlSchedule.objects.filter(is_enabled=True).select_related('template', 'template__created_by')
+        materialized_count = 0
 
         for schedule in due_schedules:
             if not schedule.is_due(now):
                 continue
 
             schedule.enqueue(queued_at=now)
+            materialized_count += 1
+
+        return materialized_count
 
     def _enforce_hard_timeouts(self) -> None:
         """Force-kill and seal hooks/archiveresults/snapshots that exceed hard limits."""
diff --git a/archivebox/workers/tests/test_scheduled_crawls.py b/archivebox/workers/tests/test_scheduled_crawls.py
index e0db1c77..0a7645be 100644
--- a/archivebox/workers/tests/test_scheduled_crawls.py
+++ b/archivebox/workers/tests/test_scheduled_crawls.py
@@ -1,4 +1,5 @@
 from datetime import timedelta
+from unittest.mock import patch
 
 from django.contrib.auth import get_user_model
 from django.test import TestCase
@@ -6,6 +7,7 @@ from django.utils import timezone
 
 from archivebox.crawls.models import Crawl, CrawlSchedule
 from archivebox.workers.orchestrator import Orchestrator
+from archivebox.workers.worker import CrawlWorker
 
 
 class TestScheduledCrawlMaterialization(TestCase):
@@ -63,3 +65,15 @@ class TestScheduledCrawlMaterialization(TestCase):
 
         Orchestrator(exit_on_idle=False, crawl_id=str(schedule.template_id))._materialize_due_schedules()
         self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 1)
+
+    @patch.object(CrawlWorker, 'start')
+    def test_global_orchestrator_waits_one_tick_before_spawning_materialized_schedule(self, mock_start):
+        schedule = self._create_due_schedule()
+
+        orchestrator = Orchestrator(exit_on_idle=False)
+        with patch.object(orchestrator, '_claim_crawl', return_value=True):
+            queue_sizes = orchestrator.check_queues_and_spawn_workers()
+
+        self.assertEqual(queue_sizes['crawl'], 1)
+        self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 2)
+        mock_start.assert_not_called()
diff --git a/pyproject.toml b/pyproject.toml
index d99ade95..dd9a7c87 100755
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -245,7 +245,7 @@ output = "coverage.json"
 show_contexts = true
 
 [tool.mypy]
-mypy_path = "archivebox,archivebox/typings"
+mypy_path = "typings"
 namespace_packages = true
 explicit_package_bases = true
 # follow_imports = "silent"
@@ -257,7 +257,7 @@ explicit_package_bases = true
 plugins = ["mypy_django_plugin.main"]
 
 [tool.django-stubs]
-django_settings_module = "core.settings"
+django_settings_module = "archivebox.core.settings"
 
 [tool.pyright]
 include = [
@@ -271,7 +271,7 @@ exclude = [
     "**/__pycache__",
     "**/migrations",
 ]
-stubPath = "./archivebox/typings"
+stubPath = "./typings"
 venvPath = "."
 venv = ".venv"
 # ignore = ["src/oldstuff"]