Fix `add` CLI input handling and lint regressions

This commit is contained in:
Nick Sweeting
2026-03-15 19:04:13 -07:00
parent 5f0cfe5251
commit 311e4340ec
10 changed files with 112 additions and 29 deletions

View File

@@ -216,7 +216,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
# Set up uv and main app /venv
COPY --from=ghcr.io/astral-sh/uv:0.6 /uv /uvx /bin/
RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR=/bin sh
ENV UV_COMPILE_BYTECODE=1 \
UV_PYTHON_PREFERENCE=managed \
UV_PYTHON_INSTALL_DIR=/opt/uv/python \
@@ -231,7 +231,7 @@ ENV VIRTUAL_ENV=/venv PATH="/venv/bin:$PATH"
RUN uv pip install setuptools pip \
&& ( \
which python3 && python3 --version \
&& which uv && uv version \
&& which uv && uv self version \
&& uv python find --system && uv python find \
&& echo -e '\n\n' \
) | tee -a /VERSION.txt

View File

@@ -23,6 +23,25 @@ if TYPE_CHECKING:
from archivebox.core.models import Snapshot
def _collect_input_urls(args: tuple[str, ...]) -> list[str]:
    """Gather URLs to archive from CLI arguments and/or piped-in records.

    Each record yielded by read_args_or_stdin may carry a single 'url' string
    and/or a multi-line 'urls' string; both are collected, in input order.
    Blank lines and '#'-prefixed comment lines inside 'urls' are skipped.
    """
    from archivebox.misc.jsonl import read_args_or_stdin

    collected: list[str] = []
    for entry in read_args_or_stdin(args):
        single = entry.get('url')
        if isinstance(single, str) and single:
            collected.append(single)
        multi = entry.get('urls')
        if isinstance(multi, str):
            collected.extend(
                stripped
                for stripped in (raw.strip() for raw in multi.splitlines())
                if stripped and not stripped.startswith('#')
            )
    return collected
@enforce_types
def add(urls: str | list[str],
depth: int | str=0,
@@ -210,7 +229,12 @@ def add(urls: str | list[str],
def main(**kwargs):
"""Add a new URL or list of URLs to your archive"""
add(**kwargs)
raw_urls = kwargs.pop('urls')
urls = _collect_input_urls(raw_urls)
if not urls:
raise click.UsageError('No URLs provided. Pass URLs as arguments or via stdin.')
add(urls=urls, **kwargs)
if __name__ == '__main__':

View File

@@ -32,7 +32,7 @@ def mcp():
{"jsonrpc":"2.0","id":2,"method":"tools/list","params":{}}
"""
from mcp.server import run_mcp_server
from archivebox.mcp.server import run_mcp_server
# Run the stdio server (blocks until stdin closes)
run_mcp_server()

View File

@@ -168,21 +168,20 @@ def get_config(
user = crawl.created_by
if persona is None and crawl is not None:
try:
from archivebox.personas.models import Persona
persona_id = getattr(crawl, "persona_id", None)
if persona_id:
persona = Persona.objects.filter(id=persona_id).first()
from archivebox.personas.models import Persona
persona_id = getattr(crawl, "persona_id", None)
if persona_id:
persona = Persona.objects.filter(id=persona_id).first()
if persona is None:
crawl_config = getattr(crawl, "config", None) or {}
default_persona_name = crawl_config.get("DEFAULT_PERSONA")
if default_persona_name:
persona, _ = Persona.objects.get_or_create(name=str(default_persona_name).strip() or "Default")
persona.ensure_dirs()
except Exception:
pass
raise Persona.DoesNotExist(f'Crawl {getattr(crawl, "id", None)} references missing Persona {persona_id}')
if persona is None:
crawl_config = getattr(crawl, "config", None) or {}
default_persona_name = str(crawl_config.get("DEFAULT_PERSONA") or "").strip()
if default_persona_name:
persona, _ = Persona.objects.get_or_create(name=default_persona_name or "Default")
persona.ensure_dirs()
from archivebox.config.constants import CONSTANTS
from archivebox.config.common import (
SHELL_CONFIG,

View File

@@ -24,6 +24,7 @@ __package__ = 'archivebox.misc'
import sys
import json
import select
from typing import Iterator, Dict, Any, Optional, TextIO
from pathlib import Path
@@ -90,6 +91,14 @@ def read_stdin(stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]:
if stream.isatty():
return
try:
ready, _, _ = select.select([stream], [], [], 0)
except (OSError, ValueError):
ready = [stream]
if not ready:
return
for line in stream:
record = parse_line(line)
if record:
@@ -149,4 +158,3 @@ def write_records(records: Iterator[Dict[str, Any]], stream: Optional[TextIO] =
write_record(record, stream)
count += 1
return count

View File

@@ -90,8 +90,8 @@ def test_add_multiple_urls_single_command(tmp_path, process, disable_extractors_
def test_add_from_file(tmp_path, process, disable_extractors_dict):
"""Test adding URLs from a file.
With --index-only, this creates a snapshot for the file itself, not the URLs inside.
To get snapshots for the URLs inside, you need to run without --index-only so parsers run.
The add command should treat a file argument as URL input and create snapshots
for each URL it contains.
"""
os.chdir(tmp_path)
@@ -113,9 +113,9 @@ def test_add_from_file(tmp_path, process, disable_extractors_dict):
snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
conn.close()
# With --index-only, creates 1 snapshot for the file itself
# The file is parsed into two input URLs.
assert crawl_count == 1
assert snapshot_count == 1
assert snapshot_count == 2
def test_add_with_depth_0_flag(tmp_path, process, disable_extractors_dict):

View File

@@ -144,3 +144,37 @@ def test_crawl_resolve_persona_raises_for_missing_persona_id(initialized_archive
payload = json.loads(stdout.strip().splitlines()[-1])
assert payload['raised'] is True
assert 'references missing Persona' in payload['message']
def test_get_config_raises_for_missing_persona_id(initialized_archive):
    # get_config() must fail loudly (Persona.DoesNotExist) when a Crawl references
    # a persona_id that has no matching Persona row, rather than silently falling back.
    # The check runs in a subprocess so Django can be set up fresh against the
    # initialized archive dir, independent of the pytest process's settings.
    script = textwrap.dedent(
        """
        import json
        import os
        from uuid import uuid4
        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
        import django
        django.setup()
        from archivebox.config.configset import get_config
        from archivebox.crawls.models import Crawl
        from archivebox.personas.models import Persona
        crawl = Crawl.objects.create(urls='https://example.com', persona_id=uuid4())
        try:
            get_config(crawl=crawl)
        except Persona.DoesNotExist as err:
            print(json.dumps({'raised': True, 'message': str(err)}))
        else:
            raise SystemExit('get_config unexpectedly succeeded')
        """
    )
    stdout, stderr, code = run_python_cwd(script, cwd=initialized_archive, timeout=60)
    assert code == 0, stderr
    # The subprocess prints a single JSON line describing the outcome; take the last
    # stdout line in case Django setup emitted anything before it.
    payload = json.loads(stdout.strip().splitlines()[-1])
    assert payload['raised'] is True
    assert 'references missing Persona' in payload['message']

View File

@@ -339,7 +339,7 @@ class Orchestrator:
queue_sizes = {}
self._enforce_hard_timeouts()
self._materialize_due_schedules()
materialized_schedule_count = self._materialize_due_schedules()
# Check Binary queue
machine = Machine.current()
@@ -393,7 +393,7 @@ class Orchestrator:
# CRITICAL: Only spawn CrawlWorkers if binary queue is empty AND no BinaryWorkers running
# This ensures all binaries are installed before snapshots start processing
if binary_count == 0 and running_binary_workers == 0:
if binary_count == 0 and running_binary_workers == 0 and materialized_schedule_count == 0:
# Spawn CrawlWorker if needed
if self.should_spawn_worker(CrawlWorker, crawl_count):
# Claim next crawl
@@ -406,20 +406,24 @@ class Orchestrator:
def _should_process_schedules(self) -> bool:
return (not self.exit_on_idle) and (self.crawl_id is None)
def _materialize_due_schedules(self) -> None:
def _materialize_due_schedules(self) -> int:
if not self._should_process_schedules():
return
return 0
from archivebox.crawls.models import CrawlSchedule
now = timezone.now()
due_schedules = CrawlSchedule.objects.filter(is_enabled=True).select_related('template', 'template__created_by')
materialized_count = 0
for schedule in due_schedules:
if not schedule.is_due(now):
continue
schedule.enqueue(queued_at=now)
materialized_count += 1
return materialized_count
def _enforce_hard_timeouts(self) -> None:
"""Force-kill and seal hooks/archiveresults/snapshots that exceed hard limits."""

View File

@@ -1,4 +1,5 @@
from datetime import timedelta
from unittest.mock import patch
from django.contrib.auth import get_user_model
from django.test import TestCase
@@ -6,6 +7,7 @@ from django.utils import timezone
from archivebox.crawls.models import Crawl, CrawlSchedule
from archivebox.workers.orchestrator import Orchestrator
from archivebox.workers.worker import CrawlWorker
class TestScheduledCrawlMaterialization(TestCase):
@@ -63,3 +65,15 @@ class TestScheduledCrawlMaterialization(TestCase):
Orchestrator(exit_on_idle=False, crawl_id=str(schedule.template_id))._materialize_due_schedules()
self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 1)
    @patch.object(CrawlWorker, 'start')
    def test_global_orchestrator_waits_one_tick_before_spawning_materialized_schedule(self, mock_start):
        # A crawl materialized from a due schedule during this tick must NOT get a
        # CrawlWorker spawned in the same tick (the orchestrator gates spawning on
        # materialized_schedule_count == 0); the worker starts on a later tick.
        schedule = self._create_due_schedule()
        orchestrator = Orchestrator(exit_on_idle=False)
        # Force the claim to succeed so only the one-tick gate can block spawning.
        with patch.object(orchestrator, '_claim_crawl', return_value=True):
            queue_sizes = orchestrator.check_queues_and_spawn_workers()
        self.assertEqual(queue_sizes['crawl'], 1)
        # NOTE(review): assumes _create_due_schedule leaves one crawl already attached,
        # plus the one materialized during this tick = 2 — confirm against the fixture.
        self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 2)
        # No CrawlWorker may start in the tick that materialized the schedule.
        mock_start.assert_not_called()

View File

@@ -245,7 +245,7 @@ output = "coverage.json"
show_contexts = true
[tool.mypy]
mypy_path = "archivebox,archivebox/typings"
mypy_path = "typings"
namespace_packages = true
explicit_package_bases = true
# follow_imports = "silent"
@@ -257,7 +257,7 @@ explicit_package_bases = true
plugins = ["mypy_django_plugin.main"]
[tool.django-stubs]
django_settings_module = "core.settings"
django_settings_module = "archivebox.core.settings"
[tool.pyright]
include = [
@@ -271,7 +271,7 @@ exclude = [
"**/__pycache__",
"**/migrations",
]
stubPath = "./archivebox/typings"
stubPath = "./typings"
venvPath = "."
venv = ".venv"
# ignore = ["src/oldstuff"]