From 311e4340ec58c198114c0a4986cf6fba158585c8 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sun, 15 Mar 2026 19:04:13 -0700 Subject: [PATCH] Fix add CLI input handling and lint regressions --- Dockerfile | 4 +-- archivebox/cli/archivebox_add.py | 26 +++++++++++++- archivebox/cli/archivebox_mcp.py | 2 +- archivebox/config/configset.py | 25 +++++++------- archivebox/misc/jsonl.py | 10 +++++- archivebox/tests/test_cli_add.py | 8 ++--- archivebox/tests/test_persona_runtime.py | 34 +++++++++++++++++++ archivebox/workers/orchestrator.py | 12 ++++--- .../workers/tests/test_scheduled_crawls.py | 14 ++++++++ pyproject.toml | 6 ++-- 10 files changed, 112 insertions(+), 29 deletions(-) diff --git a/Dockerfile b/Dockerfile index 71f39fac..93a46bb6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -216,7 +216,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T # Set up uv and main app /venv -COPY --from=ghcr.io/astral-sh/uv:0.6 /uv /uvx /bin/ +RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR=/bin sh ENV UV_COMPILE_BYTECODE=1 \ UV_PYTHON_PREFERENCE=managed \ UV_PYTHON_INSTALL_DIR=/opt/uv/python \ @@ -231,7 +231,7 @@ ENV VIRTUAL_ENV=/venv PATH="/venv/bin:$PATH" RUN uv pip install setuptools pip \ && ( \ which python3 && python3 --version \ - && which uv && uv version \ + && which uv && uv self version \ && uv python find --system && uv python find \ && echo -e '\n\n' \ ) | tee -a /VERSION.txt diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index a1eecf79..ef345d8b 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -23,6 +23,25 @@ if TYPE_CHECKING: from archivebox.core.models import Snapshot +def _collect_input_urls(args: tuple[str, ...]) -> list[str]: + from archivebox.misc.jsonl import read_args_or_stdin + + urls: list[str] = [] + for record in read_args_or_stdin(args): + url = record.get('url') + if isinstance(url, str) and url: + urls.append(url) + + urls_field = record.get('urls') + if isinstance(urls_field, str): + for line in urls_field.splitlines(): + line = line.strip() + if line and not line.startswith('#'): + urls.append(line) + + return urls + + @enforce_types def add(urls: str | list[str], depth: int | str=0, @@ -210,7 +229,12 @@ def add(urls: str | list[str], def main(**kwargs): """Add a new URL or list of URLs to your archive""" - add(**kwargs) + raw_urls = kwargs.pop('urls') + urls = _collect_input_urls(raw_urls) + if not urls: + raise click.UsageError('No URLs provided. Pass URLs as arguments or via stdin.') + + add(urls=urls, **kwargs) if __name__ == '__main__': diff --git a/archivebox/cli/archivebox_mcp.py b/archivebox/cli/archivebox_mcp.py index fbc153c4..07b3825f 100644 --- a/archivebox/cli/archivebox_mcp.py +++ b/archivebox/cli/archivebox_mcp.py @@ -32,7 +32,7 @@ def mcp(): {"jsonrpc":"2.0","id":2,"method":"tools/list","params":{}} """ - from mcp.server import run_mcp_server + from archivebox.mcp.server import run_mcp_server # Run the stdio server (blocks until stdin closes) run_mcp_server() diff --git a/archivebox/config/configset.py b/archivebox/config/configset.py index 39b8f51a..c54eb2bc 100644 --- a/archivebox/config/configset.py +++ b/archivebox/config/configset.py @@ -168,21 +168,20 @@ def get_config( user = crawl.created_by if persona is None and crawl is not None: - try: - from archivebox.personas.models import Persona - - persona_id = getattr(crawl, "persona_id", None) - if persona_id: - persona = Persona.objects.filter(id=persona_id).first() + from archivebox.personas.models import Persona + persona_id = getattr(crawl, "persona_id", None) + if persona_id: + persona = Persona.objects.filter(id=persona_id).first() if persona is None: - crawl_config = getattr(crawl, "config", None) or {} - default_persona_name = crawl_config.get("DEFAULT_PERSONA") - if default_persona_name: - persona, _ = Persona.objects.get_or_create(name=str(default_persona_name).strip() or "Default") - persona.ensure_dirs() - except Exception: - pass + raise Persona.DoesNotExist(f'Crawl {getattr(crawl, "id", None)} references missing Persona {persona_id}') + + if persona is None: + crawl_config = getattr(crawl, "config", None) or {} + default_persona_name = str(crawl_config.get("DEFAULT_PERSONA") or "").strip() + if default_persona_name: + persona, _ = Persona.objects.get_or_create(name=default_persona_name or "Default") + persona.ensure_dirs() from archivebox.config.constants import CONSTANTS from archivebox.config.common import ( SHELL_CONFIG, diff --git a/archivebox/misc/jsonl.py b/archivebox/misc/jsonl.py index df1163ab..826c7368 100644 --- a/archivebox/misc/jsonl.py +++ b/archivebox/misc/jsonl.py @@ -24,6 +24,7 @@ __package__ = 'archivebox.misc' import sys import json +import select from typing import Iterator, Dict, Any, Optional, TextIO from pathlib import Path @@ -90,6 +91,14 @@ def read_stdin(stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]: if stream.isatty(): return + try: + ready, _, _ = select.select([stream], [], [], 0) + except (OSError, ValueError): + ready = [stream] + + if not ready: + return + for line in stream: record = parse_line(line) if record: @@ -149,4 +158,3 @@ def write_records(records: Iterator[Dict[str, Any]], stream: Optional[TextIO] = write_record(record, stream) count += 1 return count - diff --git a/archivebox/tests/test_cli_add.py b/archivebox/tests/test_cli_add.py index a34a4879..11abca82 100644 --- a/archivebox/tests/test_cli_add.py +++ b/archivebox/tests/test_cli_add.py @@ -90,8 +90,8 @@ def test_add_multiple_urls_single_command(tmp_path, process, disable_extractors_ def test_add_from_file(tmp_path, process, disable_extractors_dict): """Test adding URLs from a file. - With --index-only, this creates a snapshot for the file itself, not the URLs inside. - To get snapshots for the URLs inside, you need to run without --index-only so parsers run. + The add command should treat a file argument as URL input and create snapshots + for each URL it contains. """ os.chdir(tmp_path) @@ -113,9 +113,9 @@ def test_add_from_file(tmp_path, process, disable_extractors_dict): snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0] conn.close() - # With --index-only, creates 1 snapshot for the file itself + # The file is parsed into two input URLs. assert crawl_count == 1 - assert snapshot_count == 1 + assert snapshot_count == 2 def test_add_with_depth_0_flag(tmp_path, process, disable_extractors_dict): diff --git a/archivebox/tests/test_persona_runtime.py b/archivebox/tests/test_persona_runtime.py index 09773d18..6f4f35f5 100644 --- a/archivebox/tests/test_persona_runtime.py +++ b/archivebox/tests/test_persona_runtime.py @@ -144,3 +144,37 @@ def test_crawl_resolve_persona_raises_for_missing_persona_id(initialized_archive payload = json.loads(stdout.strip().splitlines()[-1]) assert payload['raised'] is True assert 'references missing Persona' in payload['message'] + + +def test_get_config_raises_for_missing_persona_id(initialized_archive): + script = textwrap.dedent( + """ + import json + import os + from uuid import uuid4 + + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings') + import django + django.setup() + + from archivebox.config.configset import get_config + from archivebox.crawls.models import Crawl + from archivebox.personas.models import Persona + + crawl = Crawl.objects.create(urls='https://example.com', persona_id=uuid4()) + + try: + get_config(crawl=crawl) + except Persona.DoesNotExist as err: + print(json.dumps({'raised': True, 'message': str(err)})) + else: + raise SystemExit('get_config unexpectedly succeeded') + """ + ) + + stdout, stderr, code = run_python_cwd(script, cwd=initialized_archive, timeout=60) + assert code == 0, stderr + + payload = json.loads(stdout.strip().splitlines()[-1]) + assert payload['raised'] is True + assert 'references missing Persona' in payload['message'] diff --git a/archivebox/workers/orchestrator.py b/archivebox/workers/orchestrator.py index 9720cde4..f0cb658c 100644 --- a/archivebox/workers/orchestrator.py +++ b/archivebox/workers/orchestrator.py @@ -339,7 +339,7 @@ class Orchestrator: queue_sizes = {} self._enforce_hard_timeouts() - self._materialize_due_schedules() + materialized_schedule_count = self._materialize_due_schedules() # Check Binary queue machine = Machine.current() @@ -393,7 +393,7 @@ class Orchestrator: # CRITICAL: Only spawn CrawlWorkers if binary queue is empty AND no BinaryWorkers running # This ensures all binaries are installed before snapshots start processing - if binary_count == 0 and running_binary_workers == 0: + if binary_count == 0 and running_binary_workers == 0 and materialized_schedule_count == 0: # Spawn CrawlWorker if needed if self.should_spawn_worker(CrawlWorker, crawl_count): # Claim next crawl @@ -406,20 +406,24 @@ class Orchestrator: def _should_process_schedules(self) -> bool: return (not self.exit_on_idle) and (self.crawl_id is None) - def _materialize_due_schedules(self) -> None: + def _materialize_due_schedules(self) -> int: if not self._should_process_schedules(): - return + return 0 from archivebox.crawls.models import CrawlSchedule now = timezone.now() due_schedules = CrawlSchedule.objects.filter(is_enabled=True).select_related('template', 'template__created_by') + materialized_count = 0 for schedule in due_schedules: if not schedule.is_due(now): continue schedule.enqueue(queued_at=now) + materialized_count += 1 + + return materialized_count def _enforce_hard_timeouts(self) -> None: """Force-kill and seal hooks/archiveresults/snapshots that exceed hard limits.""" diff --git a/archivebox/workers/tests/test_scheduled_crawls.py b/archivebox/workers/tests/test_scheduled_crawls.py index e0db1c77..0a7645be 100644 --- a/archivebox/workers/tests/test_scheduled_crawls.py +++ b/archivebox/workers/tests/test_scheduled_crawls.py @@ -1,4 +1,5 @@ from datetime import timedelta +from unittest.mock import patch from django.contrib.auth import get_user_model from django.test import TestCase @@ -6,6 +7,7 @@ from django.utils import timezone from archivebox.crawls.models import Crawl, CrawlSchedule from archivebox.workers.orchestrator import Orchestrator +from archivebox.workers.worker import CrawlWorker class TestScheduledCrawlMaterialization(TestCase): @@ -63,3 +65,15 @@ class TestScheduledCrawlMaterialization(TestCase): Orchestrator(exit_on_idle=False, crawl_id=str(schedule.template_id))._materialize_due_schedules() self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 1) + + @patch.object(CrawlWorker, 'start') + def test_global_orchestrator_waits_one_tick_before_spawning_materialized_schedule(self, mock_start): + schedule = self._create_due_schedule() + + orchestrator = Orchestrator(exit_on_idle=False) + with patch.object(orchestrator, '_claim_crawl', return_value=True): + queue_sizes = orchestrator.check_queues_and_spawn_workers() + + self.assertEqual(queue_sizes['crawl'], 1) + self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 2) + mock_start.assert_not_called() diff --git a/pyproject.toml b/pyproject.toml index d99ade95..dd9a7c87 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -245,7 +245,7 @@ output = "coverage.json" show_contexts = true [tool.mypy] -mypy_path = "archivebox,archivebox/typings" +mypy_path = "typings" namespace_packages = true explicit_package_bases = true # follow_imports = "silent" @@ -257,7 +257,7 @@ explicit_package_bases = true plugins = ["mypy_django_plugin.main"] [tool.django-stubs] -django_settings_module = "core.settings" +django_settings_module = "archivebox.core.settings" [tool.pyright] include = [ @@ -271,7 +271,7 @@ exclude = [ "**/__pycache__", "**/migrations", ] -stubPath = "./archivebox/typings" +stubPath = "./typings" venvPath = "." venv = ".venv" # ignore = ["src/oldstuff"]