Fix `add` CLI input handling and lint regressions

This commit is contained in:
Nick Sweeting
2026-03-15 19:04:13 -07:00
parent 5f0cfe5251
commit 311e4340ec
10 changed files with 112 additions and 29 deletions

View File

@@ -216,7 +216,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
# Set up uv and main app /venv
COPY --from=ghcr.io/astral-sh/uv:0.6 /uv /uvx /bin/
RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR=/bin sh
ENV UV_COMPILE_BYTECODE=1 \
UV_PYTHON_PREFERENCE=managed \
UV_PYTHON_INSTALL_DIR=/opt/uv/python \
@@ -231,7 +231,7 @@ ENV VIRTUAL_ENV=/venv PATH="/venv/bin:$PATH"
RUN uv pip install setuptools pip \
&& ( \
which python3 && python3 --version \
&& which uv && uv version \
&& which uv && uv self version \
&& uv python find --system && uv python find \
&& echo -e '\n\n' \
) | tee -a /VERSION.txt

View File

@@ -23,6 +23,25 @@ if TYPE_CHECKING:
from archivebox.core.models import Snapshot
def _collect_input_urls(args: tuple[str, ...]) -> list[str]:
    """Gather URLs to archive from CLI arguments and/or piped-in records.

    Each record yielded by read_args_or_stdin may carry a single 'url' string
    and/or a multi-line 'urls' string; both are collected, in input order.
    Blank lines and '#'-prefixed comment lines inside 'urls' are skipped.
    """
    from archivebox.misc.jsonl import read_args_or_stdin

    collected: list[str] = []
    for entry in read_args_or_stdin(args):
        single = entry.get('url')
        if isinstance(single, str) and single:
            collected.append(single)
        multi = entry.get('urls')
        if isinstance(multi, str):
            collected.extend(
                stripped
                for stripped in (raw.strip() for raw in multi.splitlines())
                if stripped and not stripped.startswith('#')
            )
    return collected
@enforce_types
def add(urls: str | list[str],
depth: int | str=0,
@@ -210,7 +229,12 @@ def add(urls: str | list[str],
def main(**kwargs):
"""Add a new URL or list of URLs to your archive"""
add(**kwargs)
raw_urls = kwargs.pop('urls')
urls = _collect_input_urls(raw_urls)
if not urls:
raise click.UsageError('No URLs provided. Pass URLs as arguments or via stdin.')
add(urls=urls, **kwargs)
if __name__ == '__main__':

View File

@@ -32,7 +32,7 @@ def mcp():
{"jsonrpc":"2.0","id":2,"method":"tools/list","params":{}}
"""
from mcp.server import run_mcp_server
from archivebox.mcp.server import run_mcp_server
# Run the stdio server (blocks until stdin closes)
run_mcp_server()

View File

@@ -168,21 +168,20 @@ def get_config(
user = crawl.created_by
if persona is None and crawl is not None:
try:
from archivebox.personas.models import Persona
persona_id = getattr(crawl, "persona_id", None)
if persona_id:
persona = Persona.objects.filter(id=persona_id).first()
from archivebox.personas.models import Persona
persona_id = getattr(crawl, "persona_id", None)
if persona_id:
persona = Persona.objects.filter(id=persona_id).first()
if persona is None:
crawl_config = getattr(crawl, "config", None) or {}
default_persona_name = crawl_config.get("DEFAULT_PERSONA")
if default_persona_name:
persona, _ = Persona.objects.get_or_create(name=str(default_persona_name).strip() or "Default")
persona.ensure_dirs()
except Exception:
pass
raise Persona.DoesNotExist(f'Crawl {getattr(crawl, "id", None)} references missing Persona {persona_id}')
if persona is None:
crawl_config = getattr(crawl, "config", None) or {}
default_persona_name = str(crawl_config.get("DEFAULT_PERSONA") or "").strip()
if default_persona_name:
persona, _ = Persona.objects.get_or_create(name=default_persona_name or "Default")
persona.ensure_dirs()
from archivebox.config.constants import CONSTANTS
from archivebox.config.common import (
SHELL_CONFIG,

View File

@@ -24,6 +24,7 @@ __package__ = 'archivebox.misc'
import sys
import json
import select
from typing import Iterator, Dict, Any, Optional, TextIO
from pathlib import Path
@@ -90,6 +91,14 @@ def read_stdin(stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]:
if stream.isatty():
return
try:
ready, _, _ = select.select([stream], [], [], 0)
except (OSError, ValueError):
ready = [stream]
if not ready:
return
for line in stream:
record = parse_line(line)
if record:
@@ -149,4 +158,3 @@ def write_records(records: Iterator[Dict[str, Any]], stream: Optional[TextIO] =
write_record(record, stream)
count += 1
return count

View File

@@ -90,8 +90,8 @@ def test_add_multiple_urls_single_command(tmp_path, process, disable_extractors_
def test_add_from_file(tmp_path, process, disable_extractors_dict):
"""Test adding URLs from a file.
With --index-only, this creates a snapshot for the file itself, not the URLs inside.
To get snapshots for the URLs inside, you need to run without --index-only so parsers run.
The add command should treat a file argument as URL input and create snapshots
for each URL it contains.
"""
os.chdir(tmp_path)
@@ -113,9 +113,9 @@ def test_add_from_file(tmp_path, process, disable_extractors_dict):
snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
conn.close()
# With --index-only, creates 1 snapshot for the file itself
# The file is parsed into two input URLs.
assert crawl_count == 1
assert snapshot_count == 1
assert snapshot_count == 2
def test_add_with_depth_0_flag(tmp_path, process, disable_extractors_dict):

View File

@@ -144,3 +144,37 @@ def test_crawl_resolve_persona_raises_for_missing_persona_id(initialized_archive
payload = json.loads(stdout.strip().splitlines()[-1])
assert payload['raised'] is True
assert 'references missing Persona' in payload['message']
def test_get_config_raises_for_missing_persona_id(initialized_archive):
    # get_config() must fail loudly (Persona.DoesNotExist) when a Crawl references
    # a persona_id that has no matching Persona row, rather than silently falling back.
    # The check runs in a subprocess so Django can be set up fresh against the
    # initialized archive dir, independent of the pytest process's settings.
    script = textwrap.dedent(
        """
        import json
        import os
        from uuid import uuid4
        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
        import django
        django.setup()
        from archivebox.config.configset import get_config
        from archivebox.crawls.models import Crawl
        from archivebox.personas.models import Persona
        crawl = Crawl.objects.create(urls='https://example.com', persona_id=uuid4())
        try:
            get_config(crawl=crawl)
        except Persona.DoesNotExist as err:
            print(json.dumps({'raised': True, 'message': str(err)}))
        else:
            raise SystemExit('get_config unexpectedly succeeded')
        """
    )
    stdout, stderr, code = run_python_cwd(script, cwd=initialized_archive, timeout=60)
    assert code == 0, stderr
    # The subprocess prints a single JSON line describing the outcome; take the last
    # stdout line in case Django setup emitted anything before it.
    payload = json.loads(stdout.strip().splitlines()[-1])
    assert payload['raised'] is True
    assert 'references missing Persona' in payload['message']

View File

@@ -339,7 +339,7 @@ class Orchestrator:
queue_sizes = {}
self._enforce_hard_timeouts()
self._materialize_due_schedules()
materialized_schedule_count = self._materialize_due_schedules()
# Check Binary queue
machine = Machine.current()
@@ -393,7 +393,7 @@ class Orchestrator:
# CRITICAL: Only spawn CrawlWorkers if binary queue is empty AND no BinaryWorkers running
# This ensures all binaries are installed before snapshots start processing
if binary_count == 0 and running_binary_workers == 0:
if binary_count == 0 and running_binary_workers == 0 and materialized_schedule_count == 0:
# Spawn CrawlWorker if needed
if self.should_spawn_worker(CrawlWorker, crawl_count):
# Claim next crawl
@@ -406,20 +406,24 @@ class Orchestrator:
def _should_process_schedules(self) -> bool:
return (not self.exit_on_idle) and (self.crawl_id is None)
def _materialize_due_schedules(self) -> None:
def _materialize_due_schedules(self) -> int:
if not self._should_process_schedules():
return
return 0
from archivebox.crawls.models import CrawlSchedule
now = timezone.now()
due_schedules = CrawlSchedule.objects.filter(is_enabled=True).select_related('template', 'template__created_by')
materialized_count = 0
for schedule in due_schedules:
if not schedule.is_due(now):
continue
schedule.enqueue(queued_at=now)
materialized_count += 1
return materialized_count
def _enforce_hard_timeouts(self) -> None:
"""Force-kill and seal hooks/archiveresults/snapshots that exceed hard limits."""

View File

@@ -1,4 +1,5 @@
from datetime import timedelta
from unittest.mock import patch
from django.contrib.auth import get_user_model
from django.test import TestCase
@@ -6,6 +7,7 @@ from django.utils import timezone
from archivebox.crawls.models import Crawl, CrawlSchedule
from archivebox.workers.orchestrator import Orchestrator
from archivebox.workers.worker import CrawlWorker
class TestScheduledCrawlMaterialization(TestCase):
@@ -63,3 +65,15 @@ class TestScheduledCrawlMaterialization(TestCase):
Orchestrator(exit_on_idle=False, crawl_id=str(schedule.template_id))._materialize_due_schedules()
self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 1)
    @patch.object(CrawlWorker, 'start')
    def test_global_orchestrator_waits_one_tick_before_spawning_materialized_schedule(self, mock_start):
        # A crawl materialized from a due schedule during this tick must NOT get a
        # CrawlWorker spawned in the same tick (the orchestrator gates spawning on
        # materialized_schedule_count == 0); the worker starts on a later tick.
        schedule = self._create_due_schedule()
        orchestrator = Orchestrator(exit_on_idle=False)
        # Force the claim to succeed so only the one-tick gate can block spawning.
        with patch.object(orchestrator, '_claim_crawl', return_value=True):
            queue_sizes = orchestrator.check_queues_and_spawn_workers()
        self.assertEqual(queue_sizes['crawl'], 1)
        # NOTE(review): assumes _create_due_schedule leaves one crawl already attached,
        # plus the one materialized during this tick = 2 — confirm against the fixture.
        self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 2)
        # No CrawlWorker may start in the tick that materialized the schedule.
        mock_start.assert_not_called()

View File

@@ -245,7 +245,7 @@ output = "coverage.json"
show_contexts = true
[tool.mypy]
mypy_path = "archivebox,archivebox/typings"
mypy_path = "typings"
namespace_packages = true
explicit_package_bases = true
# follow_imports = "silent"
@@ -257,7 +257,7 @@ explicit_package_bases = true
plugins = ["mypy_django_plugin.main"]
[tool.django-stubs]
django_settings_module = "core.settings"
django_settings_module = "archivebox.core.settings"
[tool.pyright]
include = [
@@ -271,7 +271,7 @@ exclude = [
"**/__pycache__",
"**/migrations",
]
stubPath = "./archivebox/typings"
stubPath = "./typings"
venvPath = "."
venv = ".venv"
# ignore = ["src/oldstuff"]