mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Fix add CLI input handling and lint regressions
This commit is contained in:
@@ -216,7 +216,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
|
||||
|
||||
|
||||
# Set up uv and main app /venv
|
||||
COPY --from=ghcr.io/astral-sh/uv:0.6 /uv /uvx /bin/
|
||||
RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR=/bin sh
|
||||
ENV UV_COMPILE_BYTECODE=1 \
|
||||
UV_PYTHON_PREFERENCE=managed \
|
||||
UV_PYTHON_INSTALL_DIR=/opt/uv/python \
|
||||
@@ -231,7 +231,7 @@ ENV VIRTUAL_ENV=/venv PATH="/venv/bin:$PATH"
|
||||
RUN uv pip install setuptools pip \
|
||||
&& ( \
|
||||
which python3 && python3 --version \
|
||||
&& which uv && uv version \
|
||||
&& which uv && uv self version \
|
||||
&& uv python find --system && uv python find \
|
||||
&& echo -e '\n\n' \
|
||||
) | tee -a /VERSION.txt
|
||||
|
||||
@@ -23,6 +23,25 @@ if TYPE_CHECKING:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
|
||||
def _collect_input_urls(args: tuple[str, ...]) -> list[str]:
    """Flatten CLI arguments / piped stdin records into a list of URL strings.

    Every record produced by ``read_args_or_stdin(args)`` is inspected for:

    * ``url``  - appended as-is when it is a non-empty string;
    * ``urls`` - when it is a string, split into lines; each line is
      stripped, and blank lines or lines starting with ``#`` are dropped.

    Returns the URLs in the order they were encountered (no de-duplication).
    """
    from archivebox.misc.jsonl import read_args_or_stdin

    collected: list[str] = []
    for entry in read_args_or_stdin(args):
        single = entry.get('url')
        if isinstance(single, str) and single:
            collected.append(single)

        multi = entry.get('urls')
        if not isinstance(multi, str):
            continue

        # One candidate per line; comments (#...) and empty lines are ignored.
        candidates = (raw.strip() for raw in multi.splitlines())
        collected.extend(
            candidate
            for candidate in candidates
            if candidate and not candidate.startswith('#')
        )

    return collected
|
||||
|
||||
|
||||
@enforce_types
|
||||
def add(urls: str | list[str],
|
||||
depth: int | str=0,
|
||||
@@ -210,7 +229,12 @@ def add(urls: str | list[str],
|
||||
def main(**kwargs):
|
||||
"""Add a new URL or list of URLs to your archive"""
|
||||
|
||||
add(**kwargs)
|
||||
raw_urls = kwargs.pop('urls')
|
||||
urls = _collect_input_urls(raw_urls)
|
||||
if not urls:
|
||||
raise click.UsageError('No URLs provided. Pass URLs as arguments or via stdin.')
|
||||
|
||||
add(urls=urls, **kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -32,7 +32,7 @@ def mcp():
|
||||
{"jsonrpc":"2.0","id":2,"method":"tools/list","params":{}}
|
||||
"""
|
||||
|
||||
from mcp.server import run_mcp_server
|
||||
from archivebox.mcp.server import run_mcp_server
|
||||
|
||||
# Run the stdio server (blocks until stdin closes)
|
||||
run_mcp_server()
|
||||
|
||||
@@ -168,21 +168,20 @@ def get_config(
|
||||
user = crawl.created_by
|
||||
|
||||
if persona is None and crawl is not None:
|
||||
try:
|
||||
from archivebox.personas.models import Persona
|
||||
|
||||
persona_id = getattr(crawl, "persona_id", None)
|
||||
if persona_id:
|
||||
persona = Persona.objects.filter(id=persona_id).first()
|
||||
from archivebox.personas.models import Persona
|
||||
|
||||
persona_id = getattr(crawl, "persona_id", None)
|
||||
if persona_id:
|
||||
persona = Persona.objects.filter(id=persona_id).first()
|
||||
if persona is None:
|
||||
crawl_config = getattr(crawl, "config", None) or {}
|
||||
default_persona_name = crawl_config.get("DEFAULT_PERSONA")
|
||||
if default_persona_name:
|
||||
persona, _ = Persona.objects.get_or_create(name=str(default_persona_name).strip() or "Default")
|
||||
persona.ensure_dirs()
|
||||
except Exception:
|
||||
pass
|
||||
raise Persona.DoesNotExist(f'Crawl {getattr(crawl, "id", None)} references missing Persona {persona_id}')
|
||||
|
||||
if persona is None:
|
||||
crawl_config = getattr(crawl, "config", None) or {}
|
||||
default_persona_name = str(crawl_config.get("DEFAULT_PERSONA") or "").strip()
|
||||
if default_persona_name:
|
||||
persona, _ = Persona.objects.get_or_create(name=default_persona_name or "Default")
|
||||
persona.ensure_dirs()
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
from archivebox.config.common import (
|
||||
SHELL_CONFIG,
|
||||
|
||||
@@ -24,6 +24,7 @@ __package__ = 'archivebox.misc'
|
||||
|
||||
import sys
|
||||
import json
|
||||
import select
|
||||
from typing import Iterator, Dict, Any, Optional, TextIO
|
||||
from pathlib import Path
|
||||
|
||||
@@ -90,6 +91,14 @@ def read_stdin(stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]:
|
||||
if stream.isatty():
|
||||
return
|
||||
|
||||
try:
|
||||
ready, _, _ = select.select([stream], [], [], 0)
|
||||
except (OSError, ValueError):
|
||||
ready = [stream]
|
||||
|
||||
if not ready:
|
||||
return
|
||||
|
||||
for line in stream:
|
||||
record = parse_line(line)
|
||||
if record:
|
||||
@@ -149,4 +158,3 @@ def write_records(records: Iterator[Dict[str, Any]], stream: Optional[TextIO] =
|
||||
write_record(record, stream)
|
||||
count += 1
|
||||
return count
|
||||
|
||||
|
||||
@@ -90,8 +90,8 @@ def test_add_multiple_urls_single_command(tmp_path, process, disable_extractors_
|
||||
def test_add_from_file(tmp_path, process, disable_extractors_dict):
|
||||
"""Test adding URLs from a file.
|
||||
|
||||
With --index-only, this creates a snapshot for the file itself, not the URLs inside.
|
||||
To get snapshots for the URLs inside, you need to run without --index-only so parsers run.
|
||||
The add command should treat a file argument as URL input and create snapshots
|
||||
for each URL it contains.
|
||||
"""
|
||||
os.chdir(tmp_path)
|
||||
|
||||
@@ -113,9 +113,9 @@ def test_add_from_file(tmp_path, process, disable_extractors_dict):
|
||||
snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
# With --index-only, creates 1 snapshot for the file itself
|
||||
# The file is parsed into two input URLs.
|
||||
assert crawl_count == 1
|
||||
assert snapshot_count == 1
|
||||
assert snapshot_count == 2
|
||||
|
||||
|
||||
def test_add_with_depth_0_flag(tmp_path, process, disable_extractors_dict):
|
||||
|
||||
@@ -144,3 +144,37 @@ def test_crawl_resolve_persona_raises_for_missing_persona_id(initialized_archive
|
||||
payload = json.loads(stdout.strip().splitlines()[-1])
|
||||
assert payload['raised'] is True
|
||||
assert 'references missing Persona' in payload['message']
|
||||
|
||||
|
||||
def test_get_config_raises_for_missing_persona_id(initialized_archive):
    """get_config(crawl=...) must raise when the crawl's persona_id has no Persona row.

    The check runs in a child interpreter (via ``run_python_cwd``) inside the
    initialized archive directory. The child prints a JSON payload on its last
    stdout line when ``Persona.DoesNotExist`` is raised, and exits non-zero
    if ``get_config`` unexpectedly succeeds.
    """
    probe_script = textwrap.dedent(
        """
        import json
        import os
        from uuid import uuid4

        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
        import django
        django.setup()

        from archivebox.config.configset import get_config
        from archivebox.crawls.models import Crawl
        from archivebox.personas.models import Persona

        crawl = Crawl.objects.create(urls='https://example.com', persona_id=uuid4())

        try:
            get_config(crawl=crawl)
        except Persona.DoesNotExist as err:
            print(json.dumps({'raised': True, 'message': str(err)}))
        else:
            raise SystemExit('get_config unexpectedly succeeded')
        """
    )

    out, err, exit_code = run_python_cwd(probe_script, cwd=initialized_archive, timeout=60)
    assert exit_code == 0, err

    # Only the final stdout line carries the JSON payload.
    final_line = out.strip().splitlines()[-1]
    result = json.loads(final_line)
    assert result['raised'] is True
    assert 'references missing Persona' in result['message']
|
||||
|
||||
@@ -339,7 +339,7 @@ class Orchestrator:
|
||||
queue_sizes = {}
|
||||
|
||||
self._enforce_hard_timeouts()
|
||||
self._materialize_due_schedules()
|
||||
materialized_schedule_count = self._materialize_due_schedules()
|
||||
|
||||
# Check Binary queue
|
||||
machine = Machine.current()
|
||||
@@ -393,7 +393,7 @@ class Orchestrator:
|
||||
|
||||
# CRITICAL: Only spawn CrawlWorkers if binary queue is empty AND no BinaryWorkers running
|
||||
# This ensures all binaries are installed before snapshots start processing.
# CrawlWorkers are also not spawned on a tick in which due schedules were just materialized.
|
||||
if binary_count == 0 and running_binary_workers == 0:
|
||||
if binary_count == 0 and running_binary_workers == 0 and materialized_schedule_count == 0:
|
||||
# Spawn CrawlWorker if needed
|
||||
if self.should_spawn_worker(CrawlWorker, crawl_count):
|
||||
# Claim next crawl
|
||||
@@ -406,20 +406,24 @@ class Orchestrator:
|
||||
def _should_process_schedules(self) -> bool:
    """Return True only when ``exit_on_idle`` is falsy and ``crawl_id`` is None."""
    if self.exit_on_idle:
        return False
    return self.crawl_id is None
|
||||
|
||||
def _materialize_due_schedules(self) -> None:
|
||||
def _materialize_due_schedules(self) -> int:
|
||||
if not self._should_process_schedules():
|
||||
return
|
||||
return 0
|
||||
|
||||
from archivebox.crawls.models import CrawlSchedule
|
||||
|
||||
now = timezone.now()
|
||||
due_schedules = CrawlSchedule.objects.filter(is_enabled=True).select_related('template', 'template__created_by')
|
||||
materialized_count = 0
|
||||
|
||||
for schedule in due_schedules:
|
||||
if not schedule.is_due(now):
|
||||
continue
|
||||
|
||||
schedule.enqueue(queued_at=now)
|
||||
materialized_count += 1
|
||||
|
||||
return materialized_count
|
||||
|
||||
def _enforce_hard_timeouts(self) -> None:
|
||||
"""Force-kill and seal hooks/archiveresults/snapshots that exceed hard limits."""
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
from datetime import timedelta
|
||||
from unittest.mock import patch
|
||||
|
||||
from django.contrib.auth import get_user_model
|
||||
from django.test import TestCase
|
||||
@@ -6,6 +7,7 @@ from django.utils import timezone
|
||||
|
||||
from archivebox.crawls.models import Crawl, CrawlSchedule
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
from archivebox.workers.worker import CrawlWorker
|
||||
|
||||
|
||||
class TestScheduledCrawlMaterialization(TestCase):
|
||||
@@ -63,3 +65,15 @@ class TestScheduledCrawlMaterialization(TestCase):
|
||||
|
||||
Orchestrator(exit_on_idle=False, crawl_id=str(schedule.template_id))._materialize_due_schedules()
|
||||
self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 1)
|
||||
|
||||
@patch.object(CrawlWorker, 'start')
def test_global_orchestrator_waits_one_tick_before_spawning_materialized_schedule(self, mock_start):
    """A tick that materializes a due schedule must not start a CrawlWorker.

    ``CrawlWorker.start`` is patched out and ``_claim_crawl`` is forced to
    succeed, so the only thing that can prevent a start is the orchestrator's
    own gating inside ``check_queues_and_spawn_workers``.
    """
    due_schedule = self._create_due_schedule()
    global_orchestrator = Orchestrator(exit_on_idle=False)

    with patch.object(global_orchestrator, '_claim_crawl', return_value=True):
        sizes = global_orchestrator.check_queues_and_spawn_workers()

    self.assertEqual(sizes['crawl'], 1)
    # Two crawls are expected for the schedule after the tick
    # (presumably the template plus the newly materialized one - confirm against _create_due_schedule).
    self.assertEqual(Crawl.objects.filter(schedule=due_schedule).count(), 2)
    mock_start.assert_not_called()
|
||||
|
||||
@@ -245,7 +245,7 @@ output = "coverage.json"
|
||||
show_contexts = true
|
||||
|
||||
[tool.mypy]
|
||||
mypy_path = "archivebox,archivebox/typings"
|
||||
mypy_path = "typings"
|
||||
namespace_packages = true
|
||||
explicit_package_bases = true
|
||||
# follow_imports = "silent"
|
||||
@@ -257,7 +257,7 @@ explicit_package_bases = true
|
||||
plugins = ["mypy_django_plugin.main"]
|
||||
|
||||
[tool.django-stubs]
|
||||
django_settings_module = "core.settings"
|
||||
django_settings_module = "archivebox.core.settings"
|
||||
|
||||
[tool.pyright]
|
||||
include = [
|
||||
@@ -271,7 +271,7 @@ exclude = [
|
||||
"**/__pycache__",
|
||||
"**/migrations",
|
||||
]
|
||||
stubPath = "./archivebox/typings"
|
||||
stubPath = "./typings"
|
||||
venvPath = "."
|
||||
venv = ".venv"
|
||||
# ignore = ["src/oldstuff"]
|
||||
|
||||
Reference in New Issue
Block a user