add mcp server support

2026-01-03 09:25:42 +10:00 · 2025-12-25 01:50:42 -08:00
parent 866f993f26
commit 28e6c5bb65
6 changed files with 561 additions and 1 deletions
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -3,7 +3,10 @@
    "allow": [
      "Bash(python -m archivebox:*)",
      "Bash(ls:*)",
-      "Bash(xargs:*)"
+      "Bash(xargs:*)",
+      "Bash(python -c:*)",
+      "Bash(printf:*)",
+      "Bash(pkill:*)"
    ]
  }
 }
--- a/archivebox/cli/archivebox_mcp.py
+++ b/archivebox/cli/archivebox_mcp.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+"""
+archivebox mcp
+
+Start the Model Context Protocol (MCP) server in stdio mode.
+Exposes all ArchiveBox CLI commands as MCP tools for AI agents.
+"""
+
+__package__ = 'archivebox.cli'
+__command__ = 'archivebox mcp'
+
+import rich_click as click
+
+from archivebox.misc.util import docstring, enforce_types
+
+
+@enforce_types
+def mcp():
+    """
+    Start the MCP server in stdio mode for AI agent control.
+
+    The MCP (Model Context Protocol) server exposes all ArchiveBox CLI commands
+    as tools that AI agents can discover and execute. It communicates via JSON-RPC
+    2.0 over stdin/stdout.
+
+    Example usage with an MCP client:
+        archivebox mcp < requests.jsonl > responses.jsonl
+
+    Or interactively:
+        archivebox mcp
+        {"jsonrpc":"2.0","id":1,"method":"initialize","params":{}}
+        {"jsonrpc":"2.0","id":2,"method":"tools/list","params":{}}
+    """
+
+    from mcp.server import run_mcp_server
+
+    # Run the stdio server (blocks until stdin closes)
+    run_mcp_server()
+
+
+@click.command()
+@docstring(mcp.__doc__)
+def main(**kwargs):
+    """Start the MCP server in stdio mode"""
+    mcp()
+
+
+if __name__ == '__main__':
+    main()
--- a/archivebox/mcp/README.md
+++ b/archivebox/mcp/README.md
@@ -0,0 +1,138 @@
+# ArchiveBox MCP Server
+
+Model Context Protocol (MCP) server for ArchiveBox that exposes all CLI commands as tools for AI agents.
+
+## Overview
+
+This is a lightweight, stateless MCP server that dynamically introspects ArchiveBox's Click CLI commands and exposes them as MCP tools. It requires **zero manual schema definitions** - everything is auto-generated from the existing CLI metadata.
+
+## Features
+
+- ✅ **Auto-discovery**: Dynamically discovers all 19+ ArchiveBox CLI commands
+- ✅ **Zero duplication**: Reuses existing Click command definitions, types, and help text
+- ✅ **Auto-sync**: Changes to CLI commands automatically reflected in MCP tools
+- ✅ **Stateless**: No database models or state management required
+- ✅ **Lightweight**: ~200 lines of code
+
+## Usage
+
+### Start the MCP Server
+
+```bash
+archivebox mcp
+```
+
+The server runs in stdio mode, reading JSON-RPC 2.0 requests from stdin and writing responses to stdout.
+
+### Example Client
+
+```python
+import subprocess
+import json
+
+# Start MCP server
+proc = subprocess.Popen(
+    ['archivebox', 'mcp'],
+    stdin=subprocess.PIPE,
+    stdout=subprocess.PIPE,
+    text=True
+)
+
+# Send initialize request
+request = {"jsonrpc": "2.0", "id": 1, "method": "initialize", "params": {}}
+proc.stdin.write(json.dumps(request) + '\n')
+proc.stdin.flush()
+
+# Read response
+response = json.loads(proc.stdout.readline())
+print(response)
+```
+
+### Example Requests
+
+**Initialize:**
+```json
+{"jsonrpc":"2.0","id":1,"method":"initialize","params":{}}
+```
+
+**List all available tools:**
+```json
+{"jsonrpc":"2.0","id":2,"method":"tools/list","params":{}}
+```
+
+**Call a tool:**
+```json
+{
+  "jsonrpc":"2.0",
+  "id":3,
+  "method":"tools/call",
+  "params":{
+    "name":"version",
+    "arguments":{"quiet":true}
+  }
+}
+```
+
+## Supported MCP Methods
+
+- `initialize` - Handshake and capability negotiation
+- `tools/list` - List all available CLI commands as MCP tools
+- `tools/call` - Execute a CLI command with arguments
+
+## Available Tools
+
+The server exposes all ArchiveBox CLI commands:
+
+**Meta**: `help`, `version`, `mcp`
+**Setup**: `init`, `install`
+**Archive**: `add`, `remove`, `update`, `search`, `status`, `config`
+**Workers**: `orchestrator`, `worker`
+**Tasks**: `crawl`, `snapshot`, `extract`
+**Server**: `server`, `schedule`
+**Utilities**: `shell`, `manage`
+
+## Architecture
+
+### Dynamic Introspection
+
+Instead of manually defining schemas, the server uses Click's introspection API to automatically generate MCP tool definitions:
+
+```python
+# Auto-discover commands
+from archivebox.cli import ArchiveBoxGroup
+cli_group = ArchiveBoxGroup()
+all_commands = cli_group.all_subcommands
+
+# Auto-generate schemas from Click metadata
+for cmd_name in all_commands:
+    click_cmd = cli_group.get_command(None, cmd_name)
+    # Extract params, types, help text, etc.
+    tool_schema = click_command_to_mcp_tool(cmd_name, click_cmd)
+```
+
+### Tool Execution
+
+Commands are executed using Click's `CliRunner`:
+
+```python
+from click.testing import CliRunner
+
+runner = CliRunner()
+result = runner.invoke(click_command, args)
+```
+
+## Files
+
+- `server.py` (~350 lines) - Core MCP server with Click introspection
+- `archivebox/cli/archivebox_mcp.py` (~50 lines) - CLI entry point
+- `apps.py`, `__init__.py` - Django app boilerplate
+
+## MCP Specification
+
+Implements the [MCP 2025-11-25 specification](https://modelcontextprotocol.io/specification/2025-11-25).
+
+## Sources
+
+- [MCP Specification](https://modelcontextprotocol.io/specification/2025-11-25)
+- [MCP Introduction](https://www.anthropic.com/news/model-context-protocol)
+- [MCP GitHub](https://github.com/modelcontextprotocol/modelcontextprotocol)
--- a/archivebox/mcp/init.py
+++ b/archivebox/mcp/init.py
@@ -0,0 +1,8 @@
+__package__ = 'archivebox.mcp'
+
+"""
+Model Context Protocol (MCP) server for ArchiveBox.
+
+Exposes all ArchiveBox CLI commands as MCP tools via dynamic Click introspection.
+Provides a JSON-RPC 2.0 interface over stdio for AI agents to control ArchiveBox.
+"""
--- a/archivebox/mcp/apps.py
+++ b/archivebox/mcp/apps.py
@@ -0,0 +1,9 @@
+__package__ = 'archivebox.mcp'
+
+from django.apps import AppConfig
+
+
+class MCPConfig(AppConfig):
+    name = 'mcp'
+    verbose_name = 'Model Context Protocol Server'
+    default_auto_field = 'django.db.models.BigAutoField'
--- a/archivebox/mcp/server.py
+++ b/archivebox/mcp/server.py
@@ -0,0 +1,353 @@
+__package__ = 'archivebox.mcp'
+
+"""
+Model Context Protocol (MCP) server implementation for ArchiveBox.
+
+Dynamically exposes all ArchiveBox CLI commands as MCP tools by introspecting
+Click command metadata. Handles JSON-RPC 2.0 requests over stdio transport.
+"""
+
+import sys
+import json
+import traceback
+from typing import Any, Dict, List, Optional
+from io import StringIO
+from contextlib import redirect_stdout, redirect_stderr
+
+import click
+from click.testing import CliRunner
+
+from archivebox.config.version import VERSION
+
+
+class MCPJSONEncoder(json.JSONEncoder):
+    """Custom JSON encoder that handles Click sentinel values and other special types"""
+
+    def default(self, obj):
+        # Handle Click's sentinel values
+        if hasattr(click, 'core') and hasattr(click.core, '_SentinelClass'):
+            if isinstance(obj, click.core._SentinelClass):
+                return None
+
+        # Handle tuples (convert to lists)
+        if isinstance(obj, tuple):
+            return list(obj)
+
+        # Handle any other non-serializable objects
+        try:
+            return super().default(obj)
+        except TypeError:
+            return str(obj)
+
+
+# Type mapping from Click types to JSON Schema types
+def click_type_to_json_schema_type(click_type) -> dict:
+    """Convert a Click parameter type to JSON Schema type definition"""
+
+    if isinstance(click_type, click.types.StringParamType):
+        return {"type": "string"}
+    elif isinstance(click_type, click.types.IntParamType):
+        return {"type": "integer"}
+    elif isinstance(click_type, click.types.FloatParamType):
+        return {"type": "number"}
+    elif isinstance(click_type, click.types.BoolParamType):
+        return {"type": "boolean"}
+    elif isinstance(click_type, click.types.Choice):
+        return {"type": "string", "enum": click_type.choices}
+    elif isinstance(click_type, click.types.Path):
+        return {"type": "string", "description": "File or directory path"}
+    elif isinstance(click_type, click.types.File):
+        return {"type": "string", "description": "File path"}
+    elif isinstance(click_type, click.types.Tuple):
+        # Multiple arguments of same type
+        return {"type": "array", "items": {"type": "string"}}
+    else:
+        # Default to string for unknown types
+        return {"type": "string"}
+
+
+def click_command_to_mcp_tool(cmd_name: str, click_command: click.Command) -> dict:
+    """
+    Convert a Click command to an MCP tool definition with JSON Schema.
+
+    Introspects the Click command's parameters to automatically generate
+    the input schema without manual definition.
+    """
+
+    properties = {}
+    required = []
+
+    # Extract parameters from Click command
+    for param in click_command.params:
+        # Skip internal parameters
+        if param.name in ('help', 'version'):
+            continue
+
+        param_schema = click_type_to_json_schema_type(param.type)
+
+        # Add description from Click help text
+        if param.help:
+            param_schema["description"] = param.help
+
+        # Handle default values
+        if param.default is not None and param.default != ():
+            param_schema["default"] = param.default
+
+        # Handle multiple values (like multiple URLs)
+        if param.multiple:
+            properties[param.name] = {
+                "type": "array",
+                "items": param_schema,
+                "description": param_schema.get("description", f"Multiple {param.name} values")
+            }
+        else:
+            properties[param.name] = param_schema
+
+        # Mark as required if Click requires it
+        if param.required:
+            required.append(param.name)
+
+    return {
+        "name": cmd_name,
+        "description": click_command.help or click_command.short_help or f"Run archivebox {cmd_name} command",
+        "inputSchema": {
+            "type": "object",
+            "properties": properties,
+            "required": required
+        }
+    }
+
+
+def execute_click_command(cmd_name: str, click_command: click.Command, arguments: dict) -> dict:
+    """
+    Execute a Click command programmatically with given arguments.
+
+    Returns MCP-formatted result with captured output and error status.
+    """
+
+    # Use Click's test runner to invoke command programmatically
+    runner = CliRunner()
+
+    # Convert arguments dict to CLI args list
+    args = []
+    for key, value in arguments.items():
+        param_name = key.replace('_', '-')  # Click uses dashes
+
+        if isinstance(value, bool):
+            if value:
+                args.append(f'--{param_name}')
+        elif isinstance(value, list):
+            # Multiple values (e.g., multiple URLs)
+            for item in value:
+                args.append(str(item))
+        elif value is not None:
+            args.append(f'--{param_name}')
+            args.append(str(value))
+
+    # Execute the command
+    try:
+        result = runner.invoke(click_command, args, catch_exceptions=False)
+
+        # Format output as MCP content
+        content = []
+
+        if result.output:
+            content.append({
+                "type": "text",
+                "text": result.output
+            })
+
+        if result.stderr_bytes:
+            stderr_text = result.stderr_bytes.decode('utf-8', errors='replace')
+            if stderr_text.strip():
+                content.append({
+                    "type": "text",
+                    "text": f"[stderr]\n{stderr_text}"
+                })
+
+        # Check exit code
+        is_error = result.exit_code != 0
+
+        if is_error and not content:
+            content.append({
+                "type": "text",
+                "text": f"Command failed with exit code {result.exit_code}"
+            })
+
+        return {
+            "content": content or [{"type": "text", "text": "(no output)"}],
+            "isError": is_error
+        }
+
+    except Exception as e:
+        # Capture any exceptions during execution
+        error_trace = traceback.format_exc()
+        return {
+            "content": [{
+                "type": "text",
+                "text": f"Error executing {cmd_name}: {str(e)}\n\n{error_trace}"
+            }],
+            "isError": True
+        }
+
+
+class MCPServer:
+    """
+    Model Context Protocol server for ArchiveBox.
+
+    Provides JSON-RPC 2.0 interface over stdio, dynamically exposing
+    all Click commands as MCP tools.
+    """
+
+    def __init__(self):
+        # Import here to avoid circular imports
+        from archivebox.cli import ArchiveBoxGroup
+
+        self.cli_group = ArchiveBoxGroup()
+        self.protocol_version = "2025-11-25"
+        self._tool_cache = {}  # Cache loaded Click commands
+
+    def get_click_command(self, cmd_name: str) -> Optional[click.Command]:
+        """Get a Click command by name, with caching"""
+        if cmd_name not in self._tool_cache:
+            if cmd_name not in self.cli_group.all_subcommands:
+                return None
+            self._tool_cache[cmd_name] = self.cli_group.get_command(None, cmd_name)
+        return self._tool_cache[cmd_name]
+
+    def handle_initialize(self, params: dict) -> dict:
+        """Handle MCP initialize request"""
+        return {
+            "protocolVersion": self.protocol_version,
+            "capabilities": {
+                "tools": {}
+            },
+            "serverInfo": {
+                "name": "archivebox-mcp",
+                "version": VERSION
+            }
+        }
+
+    def handle_tools_list(self, params: dict) -> dict:
+        """Handle MCP tools/list request - returns all available CLI commands as tools"""
+        tools = []
+
+        for cmd_name in self.cli_group.all_subcommands.keys():
+            click_cmd = self.get_click_command(cmd_name)
+            if click_cmd:
+                try:
+                    tool_def = click_command_to_mcp_tool(cmd_name, click_cmd)
+                    tools.append(tool_def)
+                except Exception as e:
+                    # Log but don't fail - skip problematic commands
+                    print(f"Warning: Could not generate tool for {cmd_name}: {e}", file=sys.stderr)
+
+        return {"tools": tools}
+
+    def handle_tools_call(self, params: dict) -> dict:
+        """Handle MCP tools/call request - executes a CLI command"""
+        tool_name = params.get('name')
+        arguments = params.get('arguments', {})
+
+        if not tool_name:
+            raise ValueError("Missing required parameter: name")
+
+        click_cmd = self.get_click_command(tool_name)
+        if not click_cmd:
+            raise ValueError(f"Unknown tool: {tool_name}")
+
+        # Execute the command and return MCP-formatted result
+        return execute_click_command(tool_name, click_cmd, arguments)
+
+    def handle_request(self, request: dict) -> dict:
+        """
+        Handle a JSON-RPC 2.0 request and return response.
+
+        Supports MCP methods: initialize, tools/list, tools/call
+        """
+
+        method = request.get('method')
+        params = request.get('params', {})
+        request_id = request.get('id')
+
+        try:
+            # Route to appropriate handler
+            if method == 'initialize':
+                result = self.handle_initialize(params)
+            elif method == 'tools/list':
+                result = self.handle_tools_list(params)
+            elif method == 'tools/call':
+                result = self.handle_tools_call(params)
+            else:
+                # Method not found
+                return {
+                    "jsonrpc": "2.0",
+                    "id": request_id,
+                    "error": {
+                        "code": -32601,
+                        "message": f"Method not found: {method}"
+                    }
+                }
+
+            # Success response
+            return {
+                "jsonrpc": "2.0",
+                "id": request_id,
+                "result": result
+            }
+
+        except Exception as e:
+            # Error response
+            error_trace = traceback.format_exc()
+            return {
+                "jsonrpc": "2.0",
+                "id": request_id,
+                "error": {
+                    "code": -32603,
+                    "message": str(e),
+                    "data": error_trace
+                }
+            }
+
+    def run_stdio_server(self):
+        """
+        Run the MCP server in stdio mode.
+
+        Reads JSON-RPC requests from stdin (one per line),
+        writes JSON-RPC responses to stdout (one per line).
+        """
+
+        # Read requests from stdin line by line
+        for line in sys.stdin:
+            line = line.strip()
+            if not line:
+                continue
+
+            try:
+                # Parse JSON-RPC request
+                request = json.loads(line)
+
+                # Handle request
+                response = self.handle_request(request)
+
+                # Write response to stdout (use custom encoder for Click types)
+                print(json.dumps(response, cls=MCPJSONEncoder), flush=True)
+
+            except json.JSONDecodeError as e:
+                # Invalid JSON
+                error_response = {
+                    "jsonrpc": "2.0",
+                    "id": None,
+                    "error": {
+                        "code": -32700,
+                        "message": "Parse error",
+                        "data": str(e)
+                    }
+                }
+                print(json.dumps(error_response, cls=MCPJSONEncoder), flush=True)
+
+
+def run_mcp_server():
+    """Main entry point for MCP server"""
+    server = MCPServer()
+    server.run_stdio_server()