diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 80bdacbe..ac196f40 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -3,7 +3,10 @@ "allow": [ "Bash(python -m archivebox:*)", "Bash(ls:*)", - "Bash(xargs:*)" + "Bash(xargs:*)", + "Bash(python -c:*)", + "Bash(printf:*)", + "Bash(pkill:*)" ] } } diff --git a/archivebox/cli/archivebox_mcp.py b/archivebox/cli/archivebox_mcp.py new file mode 100644 index 00000000..fbc153c4 --- /dev/null +++ b/archivebox/cli/archivebox_mcp.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 +""" +archivebox mcp + +Start the Model Context Protocol (MCP) server in stdio mode. +Exposes all ArchiveBox CLI commands as MCP tools for AI agents. +""" + +__package__ = 'archivebox.cli' +__command__ = 'archivebox mcp' + +import rich_click as click + +from archivebox.misc.util import docstring, enforce_types + + +@enforce_types +def mcp(): + """ + Start the MCP server in stdio mode for AI agent control. + + The MCP (Model Context Protocol) server exposes all ArchiveBox CLI commands + as tools that AI agents can discover and execute. It communicates via JSON-RPC + 2.0 over stdin/stdout. 
+ + Example usage with an MCP client: + archivebox mcp < requests.jsonl > responses.jsonl + + Or interactively: + archivebox mcp + {"jsonrpc":"2.0","id":1,"method":"initialize","params":{}} + {"jsonrpc":"2.0","id":2,"method":"tools/list","params":{}} + """ + + from mcp.server import run_mcp_server + + # Run the stdio server (blocks until stdin closes) + run_mcp_server() + + +@click.command() +@docstring(mcp.__doc__) +def main(**kwargs): + """Start the MCP server in stdio mode""" + mcp() + + +if __name__ == '__main__': + main() diff --git a/archivebox/mcp/README.md b/archivebox/mcp/README.md new file mode 100644 index 00000000..8b0aa42b --- /dev/null +++ b/archivebox/mcp/README.md @@ -0,0 +1,138 @@ +# ArchiveBox MCP Server + +Model Context Protocol (MCP) server for ArchiveBox that exposes all CLI commands as tools for AI agents. + +## Overview + +This is a lightweight, stateless MCP server that dynamically introspects ArchiveBox's Click CLI commands and exposes them as MCP tools. It requires **zero manual schema definitions** - everything is auto-generated from the existing CLI metadata. + +## Features + +- ✅ **Auto-discovery**: Dynamically discovers all 19+ ArchiveBox CLI commands +- ✅ **Zero duplication**: Reuses existing Click command definitions, types, and help text +- ✅ **Auto-sync**: Changes to CLI commands automatically reflected in MCP tools +- ✅ **Stateless**: No database models or state management required +- ✅ **Lightweight**: ~200 lines of code + +## Usage + +### Start the MCP Server + +```bash +archivebox mcp +``` + +The server runs in stdio mode, reading JSON-RPC 2.0 requests from stdin and writing responses to stdout. 
+ +### Example Client + +```python +import subprocess +import json + +# Start MCP server +proc = subprocess.Popen( + ['archivebox', 'mcp'], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + text=True +) + +# Send initialize request +request = {"jsonrpc": "2.0", "id": 1, "method": "initialize", "params": {}} +proc.stdin.write(json.dumps(request) + '\n') +proc.stdin.flush() + +# Read response +response = json.loads(proc.stdout.readline()) +print(response) +``` + +### Example Requests + +**Initialize:** +```json +{"jsonrpc":"2.0","id":1,"method":"initialize","params":{}} +``` + +**List all available tools:** +```json +{"jsonrpc":"2.0","id":2,"method":"tools/list","params":{}} +``` + +**Call a tool:** +```json +{ + "jsonrpc":"2.0", + "id":3, + "method":"tools/call", + "params":{ + "name":"version", + "arguments":{"quiet":true} + } +} +``` + +## Supported MCP Methods + +- `initialize` - Handshake and capability negotiation +- `tools/list` - List all available CLI commands as MCP tools +- `tools/call` - Execute a CLI command with arguments + +## Available Tools + +The server exposes all ArchiveBox CLI commands: + +**Meta**: `help`, `version`, `mcp` +**Setup**: `init`, `install` +**Archive**: `add`, `remove`, `update`, `search`, `status`, `config` +**Workers**: `orchestrator`, `worker` +**Tasks**: `crawl`, `snapshot`, `extract` +**Server**: `server`, `schedule` +**Utilities**: `shell`, `manage` + +## Architecture + +### Dynamic Introspection + +Instead of manually defining schemas, the server uses Click's introspection API to automatically generate MCP tool definitions: + +```python +# Auto-discover commands +from archivebox.cli import ArchiveBoxGroup +cli_group = ArchiveBoxGroup() +all_commands = cli_group.all_subcommands + +# Auto-generate schemas from Click metadata +for cmd_name in all_commands: + click_cmd = cli_group.get_command(None, cmd_name) + # Extract params, types, help text, etc. 
+ tool_schema = click_command_to_mcp_tool(cmd_name, click_cmd) +``` + +### Tool Execution + +Commands are executed using Click's `CliRunner`: + +```python +from click.testing import CliRunner + +runner = CliRunner() +result = runner.invoke(click_command, args) +``` + +## Files + +- `server.py` (~350 lines) - Core MCP server with Click introspection +- `archivebox/cli/archivebox_mcp.py` (~50 lines) - CLI entry point +- `apps.py`, `__init__.py` - Django app boilerplate + +## MCP Specification + +Implements the [MCP 2025-11-25 specification](https://modelcontextprotocol.io/specification/2025-11-25). + +## Sources + +- [MCP Specification](https://modelcontextprotocol.io/specification/2025-11-25) +- [MCP Introduction](https://www.anthropic.com/news/model-context-protocol) +- [MCP GitHub](https://github.com/modelcontextprotocol/modelcontextprotocol) diff --git a/archivebox/mcp/__init__.py b/archivebox/mcp/__init__.py new file mode 100644 index 00000000..d05fc2fc --- /dev/null +++ b/archivebox/mcp/__init__.py @@ -0,0 +1,8 @@ +__package__ = 'archivebox.mcp' + +""" +Model Context Protocol (MCP) server for ArchiveBox. + +Exposes all ArchiveBox CLI commands as MCP tools via dynamic Click introspection. +Provides a JSON-RPC 2.0 interface over stdio for AI agents to control ArchiveBox. +""" diff --git a/archivebox/mcp/apps.py b/archivebox/mcp/apps.py new file mode 100644 index 00000000..2eeb3b2b --- /dev/null +++ b/archivebox/mcp/apps.py @@ -0,0 +1,9 @@ +__package__ = 'archivebox.mcp' + +from django.apps import AppConfig + + +class MCPConfig(AppConfig): + name = 'mcp' + verbose_name = 'Model Context Protocol Server' + default_auto_field = 'django.db.models.BigAutoField' diff --git a/archivebox/mcp/server.py b/archivebox/mcp/server.py new file mode 100644 index 00000000..1789d806 --- /dev/null +++ b/archivebox/mcp/server.py @@ -0,0 +1,353 @@ +__package__ = 'archivebox.mcp' + +""" +Model Context Protocol (MCP) server implementation for ArchiveBox. 
# Dynamically exposes all ArchiveBox CLI commands as MCP tools by introspecting
# Click command metadata. Handles JSON-RPC 2.0 requests over stdio transport.

import sys
import json
import traceback
from typing import Any, Dict, List, Optional
from io import StringIO
from contextlib import redirect_stdout, redirect_stderr

import click
from click.testing import CliRunner

from archivebox.config.version import VERSION


# Standard JSON-RPC 2.0 error codes used in error responses.
JSONRPC_PARSE_ERROR = -32700
JSONRPC_INVALID_REQUEST = -32600
JSONRPC_METHOD_NOT_FOUND = -32601
JSONRPC_INVALID_PARAMS = -32602
JSONRPC_INTERNAL_ERROR = -32603

# Python types that can be embedded verbatim as a JSON Schema "default".
_JSON_SCALAR_TYPES = (bool, int, float, str)


class MCPJSONEncoder(json.JSONEncoder):
    """Custom JSON encoder that handles Click sentinel values and other special types"""

    def default(self, obj):
        """
        Serialize objects the stock encoder rejects.

        Click sentinel instances become ``null``, tuples become lists, and
        anything else falls back to ``str(obj)`` so that one odd value can
        never prevent a response from being written.
        """
        # Handle Click's sentinel values (the class only exists in some Click versions)
        sentinel_cls = getattr(getattr(click, 'core', None), '_SentinelClass', None)
        if sentinel_cls is not None and isinstance(obj, sentinel_cls):
            return None

        # Handle tuples (convert to lists)
        if isinstance(obj, tuple):
            return list(obj)

        # json.JSONEncoder.default() always raises TypeError, so go straight
        # to the string fallback for any other non-serializable object.
        return str(obj)


# Type mapping from Click types to JSON Schema types
def click_type_to_json_schema_type(click_type) -> dict:
    """
    Convert a Click parameter type to a JSON Schema type definition.

    Always returns a fresh dict, so callers may mutate the result freely.
    Unknown types are described as plain strings.
    """

    if isinstance(click_type, click.types.StringParamType):
        return {"type": "string"}
    if isinstance(click_type, click.types.IntParamType):
        return {"type": "integer"}
    if isinstance(click_type, click.types.FloatParamType):
        return {"type": "number"}
    if isinstance(click_type, click.types.BoolParamType):
        return {"type": "boolean"}
    if isinstance(click_type, click.types.Choice):
        # Copy into a list so the schema never aliases Click's own sequence
        return {"type": "string", "enum": list(click_type.choices)}
    if isinstance(click_type, click.types.Path):
        return {"type": "string", "description": "File or directory path"}
    if isinstance(click_type, click.types.File):
        return {"type": "string", "description": "File path"}
    if isinstance(click_type, click.types.Tuple):
        # A fixed-size group of values passed together
        return {"type": "array", "items": {"type": "string"}}

    # Default to string for unknown types
    return {"type": "string"}


def _json_safe_default(value: Any) -> Any:
    """
    Return ``value`` if it can be published as a JSON Schema default, else None.

    Callables, Click "unset" sentinels, empty sequences and arbitrary objects
    are dropped instead of being stringified into a misleading default.
    """
    if isinstance(value, _JSON_SCALAR_TYPES):
        return value
    if isinstance(value, (list, tuple)) and value and all(isinstance(v, _JSON_SCALAR_TYPES) for v in value):
        return list(value)
    return None


def click_command_to_mcp_tool(cmd_name: str, click_command: click.Command) -> dict:
    """
    Convert a Click command to an MCP tool definition with JSON Schema.

    Introspects the Click command's parameters to automatically generate
    the input schema without manual definition.

    Returns a dict with ``name``, ``description`` and ``inputSchema`` keys.
    """

    properties = {}
    required = []

    # Extract parameters from Click command
    for param in click_command.params:
        # Skip unnamed and internal parameters
        if not param.name or param.name in ('help', 'version'):
            continue

        param_schema = click_type_to_json_schema_type(param.type)

        # Add description from Click help text.
        # Only click.Option defines `.help`; click.Argument does not, so a plain
        # attribute access would raise and drop the whole command from tools/list.
        help_text = getattr(param, 'help', None)
        if help_text:
            param_schema["description"] = help_text

        # Handle multiple values: repeatable options (multiple=True) and
        # parameters with nargs != 1 (e.g. a variadic list of URLs).
        takes_many = bool(getattr(param, 'multiple', False)) or (
            getattr(param, 'nargs', 1) != 1 and param_schema.get("type") != "array"
        )
        if takes_many:
            param_schema = {
                "type": "array",
                "items": param_schema,
                "description": param_schema.get("description", f"Multiple {param.name} values"),
            }

        # Handle default values (attached to the outer schema so that a list
        # default describes the array, not each of its items)
        default = _json_safe_default(param.default)
        if default is not None:
            param_schema["default"] = default

        properties[param.name] = param_schema

        # Mark as required if Click requires it
        if param.required:
            required.append(param.name)

    return {
        "name": cmd_name,
        "description": click_command.help or click_command.short_help or f"Run archivebox {cmd_name} command",
        "inputSchema": {
            "type": "object",
            "properties": properties,
            "required": required,
        },
    }


def _option_flag(param, key: str, negate: bool = False) -> Optional[str]:
    """
    Pick the command-line flag to use for an option.

    Prefers a declared long form (``--foo``), then any declared form, and
    finally derives ``--key-with-dashes`` from the argument key when the
    parameter is unknown. With ``negate=True`` the "off" switch of a
    ``--foo/--no-foo`` pair is returned, or None if there is none.
    """
    declared = list(getattr(param, 'secondary_opts' if negate else 'opts', None) or [])
    for opt in declared:
        if opt.startswith('--'):
            return opt
    if declared:
        return declared[0]
    if negate:
        return None
    return '--' + key.replace('_', '-')  # Click uses dashes


def _build_cli_args(click_command: click.Command, arguments: dict) -> List[str]:
    """
    Translate an MCP ``arguments`` object into a CLI argv list for Click.

    Each key is matched against the command's declared parameters so that
    positional arguments, flags, repeatable options and counters are each
    rendered the way Click expects. Keys that match no declared parameter
    fall back to a generic ``--key value`` rendering.
    """
    declared = {p.name: p for p in click_command.params if p.name}
    position = {name: idx for idx, name in enumerate(declared)}

    option_args: List[str] = []
    positional: Dict[int, List[str]] = {}

    for key, value in arguments.items():
        if value is None:
            continue

        param = declared.get(key)
        many = isinstance(value, (list, tuple))

        # Positional arguments are passed bare, in declaration order
        if isinstance(param, click.Argument):
            positional[position[key]] = [str(v) for v in (value if many else [value])]
            continue

        # Unknown key carrying a list: pass the items through bare
        if param is None and many:
            option_args.extend(str(v) for v in value)
            continue

        flag = _option_flag(param, key)

        if isinstance(value, bool):
            if param is None or getattr(param, 'is_flag', False) or getattr(param, 'count', False):
                # True -> --flag ; False -> --no-flag when the option has one
                chosen = flag if value else _option_flag(param, key, negate=True)
                if chosen:
                    option_args.append(chosen)
            else:
                # Boolean-typed option that still expects an explicit value
                option_args.extend([flag, str(value).lower()])
        elif getattr(param, 'count', False) and isinstance(value, int):
            # Counter options (e.g. -vvv) are repeated once per level
            option_args.extend([flag] * max(value, 0))
        elif many:
            if getattr(param, 'multiple', False):
                # Repeatable option: --opt a --opt b
                for item in value:
                    option_args.append(flag)
                    if isinstance(item, (list, tuple)):
                        option_args.extend(str(v) for v in item)
                    else:
                        option_args.append(str(item))
            elif value:
                # Single option consuming several values: --opt a b
                option_args.append(flag)
                option_args.extend(str(v) for v in value)
        else:
            option_args.extend([flag, str(value)])

    positional_args = [v for idx in sorted(positional) for v in positional[idx]]

    # Protect positional values that look like options from being parsed as such
    if any(v.startswith('-') for v in positional_args):
        option_args.append('--')

    return option_args + positional_args


def execute_click_command(cmd_name: str, click_command: click.Command, arguments: dict) -> dict:
    """
    Execute a Click command programmatically with given arguments.

    Returns MCP-formatted result with captured output and error status:
    ``{"content": [...], "isError": bool}``. Exceptions raised while building
    the argv or running the command are reported as an error result rather
    than propagated.
    """

    # Use Click's test runner to invoke command programmatically
    runner = CliRunner()

    try:
        args = _build_cli_args(click_command, arguments or {})
        result = runner.invoke(click_command, args, catch_exceptions=False)
    except Exception as e:
        # Capture any exceptions during execution
        error_trace = traceback.format_exc()
        return {
            "content": [{
                "type": "text",
                "text": f"Error executing {cmd_name}: {str(e)}\n\n{error_trace}"
            }],
            "isError": True
        }

    # Format output as MCP content
    content = []

    # Use `stdout` rather than `output`: on Click versions that capture stderr
    # separately, `output` also interleaves stderr and would duplicate it below.
    stdout_text = result.stdout
    if stdout_text:
        content.append({
            "type": "text",
            "text": stdout_text
        })

    stderr_bytes = getattr(result, 'stderr_bytes', None)
    if stderr_bytes:
        stderr_text = stderr_bytes.decode('utf-8', errors='replace')
        if stderr_text.strip():
            content.append({
                "type": "text",
                "text": f"[stderr]\n{stderr_text}"
            })

    # Check exit code
    is_error = result.exit_code != 0

    if is_error and not content:
        content.append({
            "type": "text",
            "text": f"Command failed with exit code {result.exit_code}"
        })

    return {
        "content": content or [{"type": "text", "text": "(no output)"}],
        "isError": is_error
    }


class MCPServer:
    """
    Model Context Protocol server for ArchiveBox.

    Provides JSON-RPC 2.0 interface over stdio, dynamically exposing
    all Click commands as MCP tools.
    """

    def __init__(self):
        # Import here to avoid circular imports
        from archivebox.cli import ArchiveBoxGroup

        self.cli_group = ArchiveBoxGroup()
        self.protocol_version = "2025-11-25"
        self._tool_cache = {}  # Cache loaded Click commands

    @staticmethod
    def _error_response(request_id: Any, code: int, message: str, data: Any = None) -> dict:
        """Build a JSON-RPC 2.0 error response; ``data`` is included only when given"""
        error = {"code": code, "message": message}
        if data is not None:
            error["data"] = data
        return {"jsonrpc": "2.0", "id": request_id, "error": error}

    def get_click_command(self, cmd_name: str) -> Optional[click.Command]:
        """Get a Click command by name, with caching. Returns None for unknown names"""
        if cmd_name not in self._tool_cache:
            if cmd_name not in self.cli_group.all_subcommands:
                return None
            self._tool_cache[cmd_name] = self.cli_group.get_command(None, cmd_name)
        return self._tool_cache[cmd_name]

    def handle_initialize(self, params: dict) -> dict:
        """Handle MCP initialize request"""
        return {
            "protocolVersion": self.protocol_version,
            "capabilities": {
                "tools": {}
            },
            "serverInfo": {
                "name": "archivebox-mcp",
                "version": VERSION
            }
        }

    def handle_tools_list(self, params: dict) -> dict:
        """Handle MCP tools/list request - returns all available CLI commands as tools"""
        tools = []

        for cmd_name in self.cli_group.all_subcommands:
            try:
                # Loading is inside the try so that one command that fails to
                # load cannot take the whole listing down with it.
                click_cmd = self.get_click_command(cmd_name)
                if click_cmd:
                    tools.append(click_command_to_mcp_tool(cmd_name, click_cmd))
            except Exception as e:
                # Log but don't fail - skip problematic commands
                print(f"Warning: Could not generate tool for {cmd_name}: {e}", file=sys.stderr)

        return {"tools": tools}

    def handle_tools_call(self, params: dict) -> dict:
        """
        Handle MCP tools/call request - executes a CLI command.

        Raises ValueError when the tool name is missing or unknown, or when
        ``arguments`` is not an object; handle_request reports these as
        "Invalid params" errors.
        """
        tool_name = params.get('name')
        arguments = params.get('arguments')
        if arguments is None:
            # Absent or explicit null both mean "no arguments"
            arguments = {}

        if not tool_name or not isinstance(tool_name, str):
            raise ValueError("Missing required parameter: name")

        if not isinstance(arguments, dict):
            raise ValueError("Parameter 'arguments' must be an object")

        click_cmd = self.get_click_command(tool_name)
        if not click_cmd:
            raise ValueError(f"Unknown tool: {tool_name}")

        # Execute the command and return MCP-formatted result
        return execute_click_command(tool_name, click_cmd, arguments)

    def handle_request(self, request: dict) -> dict:
        """
        Handle a JSON-RPC 2.0 request and return response.

        Supports MCP methods: initialize, tools/list, tools/call

        Never raises: malformed requests, unknown methods, invalid params and
        unexpected handler failures are all returned as JSON-RPC error objects.
        """

        # Valid JSON that is not an object (number, string, array) is not a request
        if not isinstance(request, dict):
            return self._error_response(None, JSONRPC_INVALID_REQUEST, "Invalid Request: expected a JSON object")

        method = request.get('method')
        params = request.get('params')
        request_id = request.get('id')

        if params is None:
            params = {}

        # Route to appropriate handler (looked up per call so subclass overrides apply)
        handlers = {
            'initialize': self.handle_initialize,
            'tools/list': self.handle_tools_list,
            'tools/call': self.handle_tools_call,
        }
        handler = handlers.get(method) if isinstance(method, str) else None

        if handler is None:
            # Method not found
            return self._error_response(request_id, JSONRPC_METHOD_NOT_FOUND, f"Method not found: {method}")

        if not isinstance(params, dict):
            return self._error_response(request_id, JSONRPC_INVALID_PARAMS, "Invalid params: expected an object")

        try:
            result = handler(params)
        except ValueError as e:
            # Handlers raise ValueError for bad or missing parameters
            return self._error_response(request_id, JSONRPC_INVALID_PARAMS, str(e), traceback.format_exc())
        except Exception as e:
            # Error response
            return self._error_response(request_id, JSONRPC_INTERNAL_ERROR, str(e), traceback.format_exc())

        # Success response
        return {
            "jsonrpc": "2.0",
            "id": request_id,
            "result": result
        }

    def run_stdio_server(self):
        """
        Run the MCP server in stdio mode.

        Reads JSON-RPC requests from stdin (one per line),
        writes JSON-RPC responses to stdout (one per line).
        Notifications (requests without an "id") are processed but never
        answered. Returns when stdin is closed.
        """

        # Read requests from stdin line by line
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue

            try:
                # Parse JSON-RPC request
                request = json.loads(line)
            except json.JSONDecodeError as e:
                # Invalid JSON
                response = self._error_response(None, JSONRPC_PARSE_ERROR, "Parse error", str(e))
            else:
                # stdout is the protocol channel: send any stray prints made
                # while handling to stderr so they cannot corrupt the stream.
                with redirect_stdout(sys.stderr):
                    response = self.handle_request(request)

                # JSON-RPC notifications carry no "id" and must not be answered
                if isinstance(request, dict) and 'id' not in request:
                    continue

            # Write response to stdout (use custom encoder for Click types);
            # flush per message so the client is never left waiting on a buffer.
            print(json.dumps(response, cls=MCPJSONEncoder), flush=True)


def run_mcp_server():
    """Main entry point for MCP server: build an MCPServer and serve stdio until EOF"""
    server = MCPServer()
    server.run_stdio_server()