add mcp server support

This commit is contained in:
Nick Sweeting
2025-12-25 01:50:42 -08:00
parent 866f993f26
commit 28e6c5bb65
6 changed files with 561 additions and 1 deletions

View File

@@ -3,7 +3,10 @@
"allow": [
"Bash(python -m archivebox:*)",
"Bash(ls:*)",
"Bash(xargs:*)"
"Bash(xargs:*)",
"Bash(python -c:*)",
"Bash(printf:*)",
"Bash(pkill:*)"
]
}
}

View File

@@ -0,0 +1,49 @@
#!/usr/bin/env python3
"""
archivebox mcp
Start the Model Context Protocol (MCP) server in stdio mode.
Exposes all ArchiveBox CLI commands as MCP tools for AI agents.
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox mcp'
import rich_click as click
from archivebox.misc.util import docstring, enforce_types
@enforce_types
def mcp():
    """
    Start the MCP server in stdio mode for AI agent control.

    The MCP (Model Context Protocol) server exposes all ArchiveBox CLI commands
    as tools that AI agents can discover and execute. It communicates via JSON-RPC
    2.0 over stdin/stdout.

    Example usage with an MCP client:
        archivebox mcp < requests.jsonl > responses.jsonl

    Or interactively:
        archivebox mcp
        {"jsonrpc":"2.0","id":1,"method":"initialize","params":{}}
        {"jsonrpc":"2.0","id":2,"method":"tools/list","params":{}}
    """
    # Imported inside the function so the server module is only loaded when the
    # command actually runs. The fully-qualified path is used on purpose: a bare
    # `mcp.server` import can resolve to the third-party `mcp` SDK package
    # (which also ships an `mcp.server` module) instead of ArchiveBox's own.
    from archivebox.mcp.server import run_mcp_server

    # Run the stdio server (blocks until stdin closes)
    run_mcp_server()
# Click entry point for `archivebox mcp`.
# NOTE(review): @docstring(...) presumably swaps in mcp()'s long docstring as the
# command help text -- the helper lives in archivebox.misc.util, confirm there.
@click.command()
@docstring(mcp.__doc__)
def main(**kwargs):
    """Start the MCP server in stdio mode"""
    # kwargs are accepted but intentionally not forwarded: mcp() takes no arguments.
    mcp()


# Allow running this file directly as a script.
if __name__ == '__main__':
    main()

138
archivebox/mcp/README.md Normal file
View File

@@ -0,0 +1,138 @@
# ArchiveBox MCP Server
Model Context Protocol (MCP) server for ArchiveBox that exposes all CLI commands as tools for AI agents.
## Overview
This is a lightweight, stateless MCP server that dynamically introspects ArchiveBox's Click CLI commands and exposes them as MCP tools. It requires **zero manual schema definitions** - everything is auto-generated from the existing CLI metadata.
## Features
- **Auto-discovery**: Dynamically discovers all 19+ ArchiveBox CLI commands
- **Zero duplication**: Reuses existing Click command definitions, types, and help text
- **Auto-sync**: Changes to CLI commands are automatically reflected in MCP tools
- **Stateless**: No database models or state management required
- **Lightweight**: ~350 lines of code
## Usage
### Start the MCP Server
```bash
archivebox mcp
```
The server runs in stdio mode, reading JSON-RPC 2.0 requests from stdin and writing responses to stdout.
### Example Client
```python
import subprocess
import json
# Start MCP server
proc = subprocess.Popen(
['archivebox', 'mcp'],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
text=True
)
# Send initialize request
request = {"jsonrpc": "2.0", "id": 1, "method": "initialize", "params": {}}
proc.stdin.write(json.dumps(request) + '\n')
proc.stdin.flush()
# Read response
response = json.loads(proc.stdout.readline())
print(response)
```
### Example Requests
**Initialize:**
```json
{"jsonrpc":"2.0","id":1,"method":"initialize","params":{}}
```
**List all available tools:**
```json
{"jsonrpc":"2.0","id":2,"method":"tools/list","params":{}}
```
**Call a tool:**
```json
{
"jsonrpc":"2.0",
"id":3,
"method":"tools/call",
"params":{
"name":"version",
"arguments":{"quiet":true}
}
}
```
## Supported MCP Methods
- `initialize` - Handshake and capability negotiation
- `tools/list` - List all available CLI commands as MCP tools
- `tools/call` - Execute a CLI command with arguments
## Available Tools
The server exposes all ArchiveBox CLI commands:
**Meta**: `help`, `version`, `mcp`
**Setup**: `init`, `install`
**Archive**: `add`, `remove`, `update`, `search`, `status`, `config`
**Workers**: `orchestrator`, `worker`
**Tasks**: `crawl`, `snapshot`, `extract`
**Server**: `server`, `schedule`
**Utilities**: `shell`, `manage`
## Architecture
### Dynamic Introspection
Instead of manually defining schemas, the server uses Click's introspection API to automatically generate MCP tool definitions:
```python
# Auto-discover commands
from archivebox.cli import ArchiveBoxGroup
cli_group = ArchiveBoxGroup()
all_commands = cli_group.all_subcommands
# Auto-generate schemas from Click metadata
for cmd_name in all_commands:
click_cmd = cli_group.get_command(None, cmd_name)
# Extract params, types, help text, etc.
tool_schema = click_command_to_mcp_tool(cmd_name, click_cmd)
```
### Tool Execution
Commands are executed using Click's `CliRunner`:
```python
from click.testing import CliRunner
runner = CliRunner()
result = runner.invoke(click_command, args)
```
## Files
- `server.py` (~350 lines) - Core MCP server with Click introspection
- `archivebox/cli/archivebox_mcp.py` (~50 lines) - CLI entry point
- `apps.py`, `__init__.py` - Django app boilerplate
## MCP Specification
Implements the [MCP 2025-11-25 specification](https://modelcontextprotocol.io/specification/2025-11-25).
## Sources
- [MCP Specification](https://modelcontextprotocol.io/specification/2025-11-25)
- [MCP Introduction](https://www.anthropic.com/news/model-context-protocol)
- [MCP GitHub](https://github.com/modelcontextprotocol/modelcontextprotocol)

View File

@@ -0,0 +1,8 @@
"""
Model Context Protocol (MCP) server for ArchiveBox.

Exposes all ArchiveBox CLI commands as MCP tools via dynamic Click introspection.
Provides a JSON-RPC 2.0 interface over stdio for AI agents to control ArchiveBox.
"""

# Declared after the docstring: a string literal only becomes the module's
# __doc__ when it is the first statement in the file.
__package__ = 'archivebox.mcp'

9
archivebox/mcp/apps.py Normal file
View File

@@ -0,0 +1,9 @@
__package__ = 'archivebox.mcp'
from django.apps import AppConfig
class MCPConfig(AppConfig):
    """Django AppConfig that registers the MCP server package as a Django app."""

    # Import path Django uses to locate this app.
    # NOTE(review): the bare 'mcp' (rather than 'archivebox.mcp') presumably relies on
    # the archivebox/ directory itself being on sys.path -- confirm against settings.
    name = 'mcp'
    # Human-readable app name.
    verbose_name = 'Model Context Protocol Server'
    # Primary-key field type used for models in this app that do not declare their own.
    default_auto_field = 'django.db.models.BigAutoField'

353
archivebox/mcp/server.py Normal file
View File

@@ -0,0 +1,353 @@
"""
Model Context Protocol (MCP) server implementation for ArchiveBox.

Dynamically exposes all ArchiveBox CLI commands as MCP tools by introspecting
Click command metadata. Handles JSON-RPC 2.0 requests over stdio transport.
"""

# Declared after the docstring: a string literal only becomes the module's
# __doc__ when it is the first statement in the file.
__package__ = 'archivebox.mcp'
import sys
import json
import traceback
from typing import Any, Dict, List, Optional
from io import StringIO
from contextlib import redirect_stdout, redirect_stderr
import click
from click.testing import CliRunner
from archivebox.config.version import VERSION
class MCPJSONEncoder(json.JSONEncoder):
    """
    JSON encoder for MCP responses that never fails on an unsupported value.

    ``default()`` is only consulted by ``json`` for objects it cannot encode
    natively. Click's internal sentinel placeholders are encoded as ``null``;
    every other unsupported object is encoded as ``str(obj)``.
    """

    @staticmethod
    def _is_click_sentinel(obj) -> bool:
        """
        Return True if ``obj`` is an instance of one of Click's sentinel classes.

        The check is made on the class's module and name instead of on a specific
        private Click attribute, so it does not depend on which Click release is
        installed (the private sentinel types have moved between releases).
        """
        cls = type(obj)
        top_level_module = (getattr(cls, '__module__', '') or '').split('.', 1)[0]
        return top_level_module == 'click' and 'Sentinel' in cls.__name__

    def default(self, obj):
        """Return a JSON-encodable stand-in for ``obj``."""
        # Click sentinels mean "no value was set", which maps naturally to null
        if self._is_click_sentinel(obj):
            return None

        # json encodes tuples natively, so this only matters for direct callers
        if isinstance(obj, tuple):
            return list(obj)

        # Last resort: stringify rather than abort the whole response
        try:
            return super().default(obj)
        except TypeError:
            return str(obj)
# Type mapping from Click types to JSON Schema types
def click_type_to_json_schema_type(click_type) -> dict:
    """
    Map a Click parameter type instance onto a JSON Schema fragment.

    The first matching entry in the table below wins; any type that matches
    none of them is described as a plain string. A fresh dict is returned on
    every call, so callers are free to mutate the result.
    """
    kinds = click.types

    # (Click type class, schema factory) pairs, checked in order
    schema_table = (
        (kinds.StringParamType, lambda t: {"type": "string"}),
        (kinds.IntParamType, lambda t: {"type": "integer"}),
        (kinds.FloatParamType, lambda t: {"type": "number"}),
        (kinds.BoolParamType, lambda t: {"type": "boolean"}),
        (kinds.Choice, lambda t: {"type": "string", "enum": t.choices}),
        (kinds.Path, lambda t: {"type": "string", "description": "File or directory path"}),
        (kinds.File, lambda t: {"type": "string", "description": "File path"}),
        # Multiple arguments of same type
        (kinds.Tuple, lambda t: {"type": "array", "items": {"type": "string"}}),
    )

    for type_cls, make_schema in schema_table:
        if isinstance(click_type, type_cls):
            return make_schema(click_type)

    # Default to string for unknown types
    return {"type": "string"}
def _is_click_sentinel_value(value) -> bool:
    """Return True if ``value`` is one of Click's internal "unset" sentinel objects."""
    cls = type(value)
    top_level_module = (getattr(cls, '__module__', '') or '').split('.', 1)[0]
    return top_level_module == 'click' and 'Sentinel' in cls.__name__


def click_command_to_mcp_tool(cmd_name: str, click_command: click.Command) -> dict:
    """
    Convert a Click command to an MCP tool definition with JSON Schema.

    Introspects the Click command's parameters to automatically generate
    the input schema without manual definition.

    Args:
        cmd_name: Name the tool is published under.
        click_command: Click command whose ``params`` are introspected.

    Returns:
        A dict with ``name``, ``description`` and ``inputSchema`` keys, where
        ``inputSchema`` is an object schema with one property per parameter.
    """
    properties = {}
    required = []

    for param in click_command.params:
        # Skip internal parameters
        if param.name in ('help', 'version'):
            continue

        value_schema = click_type_to_json_schema_type(param.type)

        # Only options are guaranteed to carry help text; plain click.Argument
        # objects do not define a `help` attribute at all.
        help_text = getattr(param, 'help', None)
        if help_text:
            value_schema["description"] = help_text

        # A parameter takes a list of values when it is repeatable (multiple=True),
        # variadic (nargs=-1), or fixed multi-value (nargs > 1). In the last case
        # the type schema may already be an array (Click Tuple types): don't nest it.
        nargs = getattr(param, 'nargs', 1) or 1
        repeatable = bool(getattr(param, 'multiple', False))
        multi_value = nargs == -1 or (nargs > 1 and value_schema.get("type") != "array")

        if repeatable or multi_value:
            description = value_schema.pop("description", f"Multiple {param.name} values")
            schema = {
                "type": "array",
                "items": value_schema,
                "description": description,
            }
        else:
            schema = value_schema

        # Only publish defaults that are real, JSON-representable values: skip
        # "no default" markers (None, empty tuple, Click sentinels) and callables
        # (Click evaluates those lazily, so there is no static value to show).
        default = param.default
        if (
            default is not None
            and not callable(default)
            and not _is_click_sentinel_value(default)
            and default != ()
        ):
            schema["default"] = list(default) if isinstance(default, tuple) else default

        properties[param.name] = schema

        # Mark as required if Click requires it
        if param.required:
            required.append(param.name)

    return {
        "name": cmd_name,
        "description": click_command.help or click_command.short_help or f"Run archivebox {cmd_name} command",
        "inputSchema": {
            "type": "object",
            "properties": properties,
            "required": required,
        },
    }
def _build_cli_args(click_command: click.Command, arguments: dict) -> list:
    """
    Translate an MCP ``arguments`` object into an argv-style list for ``click_command``.

    Keys are matched against the command's declared parameters so positional
    arguments are passed positionally (in declaration order) and options are
    passed by flag. Keys that match no declared parameter fall back to a generic
    mapping: ``True`` becomes a bare ``--flag``, lists become positional values,
    and any other value becomes ``--key value``.
    """
    declared_params = list(getattr(click_command, 'params', None) or [])
    params_by_name = {param.name: param for param in declared_params}

    option_args = []
    declared_positionals = {}   # param name -> list of string values
    undeclared_positionals = []

    for key, value in arguments.items():
        if value is None:
            continue

        param = params_by_name.get(key) or params_by_name.get(key.replace('-', '_'))
        fallback_flag = '--' + key.replace('_', '-')  # Click uses dashes
        is_sequence = isinstance(value, (list, tuple))

        if isinstance(param, click.Argument):
            items = value if is_sequence else [value]
            declared_positionals[param.name] = [str(item) for item in items]

        elif isinstance(param, click.Option):
            # Prefer the long spelling (e.g. --depth over -d) for readability in errors
            long_opts = [opt for opt in param.opts if opt.startswith('--')]
            flag = (long_opts or list(param.opts) or [fallback_flag])[0]

            if isinstance(value, bool) and param.is_flag:
                if value:
                    option_args.append(flag)
                elif param.secondary_opts:
                    # --flag/--no-flag style switch: select the "off" spelling
                    option_args.append(param.secondary_opts[0])
            elif is_sequence and param.multiple:
                # Repeatable option: --tag a --tag b
                for item in value:
                    option_args.append(flag)
                    if isinstance(item, (list, tuple)):
                        option_args.extend(str(part) for part in item)
                    else:
                        option_args.append(str(item))
            elif is_sequence:
                # Fixed multi-value option (nargs > 1): --point 1 2
                if value:
                    option_args.append(flag)
                    option_args.extend(str(item) for item in value)
            else:
                option_args.extend([flag, str(value)])

        # Unknown key: keep the generic mapping so pass-through still works
        elif isinstance(value, bool):
            if value:
                option_args.append(fallback_flag)
        elif is_sequence:
            undeclared_positionals.extend(str(item) for item in value)
        else:
            option_args.extend([fallback_flag, str(value)])

    positionals = []
    for param in declared_params:
        positionals.extend(declared_positionals.get(param.name, []))

    if positionals:
        # `--` ends option parsing, so a positional value that happens to start
        # with a dash cannot be interpreted as (or smuggle in) an option.
        return option_args + ['--'] + positionals + undeclared_positionals
    return option_args + undeclared_positionals


def execute_click_command(cmd_name: str, click_command: click.Command, arguments: dict) -> dict:
    """
    Execute a Click command programmatically with given arguments.

    Args:
        cmd_name: Tool/command name, used only in error messages.
        click_command: The Click command to invoke.
        arguments: MCP ``arguments`` object mapping parameter names to values
            (``None`` is treated as no arguments).

    Returns:
        MCP-formatted tool result: ``{"content": [...], "isError": bool}``.
        Exceptions raised by the command are reported in the result
        (with ``isError`` set) instead of being raised to the caller.
    """
    if arguments is None:
        arguments = {}
    if not isinstance(arguments, dict):
        return {
            "content": [{
                "type": "text",
                "text": f"Error executing {cmd_name}: arguments must be an object, got {type(arguments).__name__}",
            }],
            "isError": True,
        }

    # Use Click's test runner to invoke command programmatically
    runner = CliRunner()

    try:
        cli_args = _build_cli_args(click_command, arguments)
        result = runner.invoke(click_command, cli_args, catch_exceptions=False)
    except Exception as e:
        # Capture any exceptions during execution
        error_trace = traceback.format_exc()
        return {
            "content": [{
                "type": "text",
                "text": f"Error executing {cmd_name}: {str(e)}\n\n{error_trace}",
            }],
            "isError": True,
        }

    # Format output as MCP content.
    # `stdout` is used rather than `output`: on Click versions that always capture
    # stderr separately, `output` already interleaves stderr, so reporting `output`
    # plus the stderr block below would duplicate every stderr line.
    content = []
    if result.stdout:
        content.append({
            "type": "text",
            "text": result.stdout,
        })

    if result.stderr_bytes:
        stderr_text = result.stderr_bytes.decode('utf-8', errors='replace')
        if stderr_text.strip():
            content.append({
                "type": "text",
                "text": f"[stderr]\n{stderr_text}",
            })

    # Check exit code
    is_error = result.exit_code != 0
    if is_error and not content:
        content.append({
            "type": "text",
            "text": f"Command failed with exit code {result.exit_code}",
        })

    return {
        "content": content or [{"type": "text", "text": "(no output)"}],
        "isError": is_error,
    }
class MCPServer:
    """
    Model Context Protocol server for ArchiveBox.

    Provides JSON-RPC 2.0 interface over stdio, dynamically exposing
    all Click commands as MCP tools.

    Supported methods: ``initialize``, ``ping``, ``tools/list``, ``tools/call``.
    Messages without an ``id`` member are JSON-RPC notifications and are never
    answered.
    """

    def __init__(self):
        # Import here to avoid circular imports
        from archivebox.cli import ArchiveBoxGroup

        self.cli_group = ArchiveBoxGroup()
        self.protocol_version = "2025-11-25"
        self._tool_cache = {}  # Cache loaded Click commands

    @staticmethod
    def _error_response(request_id, code: int, message: str, data=None) -> dict:
        """Build a JSON-RPC 2.0 error response; ``data`` is included only when given."""
        error = {"code": code, "message": message}
        if data is not None:
            error["data"] = data
        return {"jsonrpc": "2.0", "id": request_id, "error": error}

    def get_click_command(self, cmd_name: str) -> "Optional[click.Command]":
        """Get a Click command by name, with caching. Returns None for unknown names."""
        if cmd_name not in self._tool_cache:
            if cmd_name not in self.cli_group.all_subcommands:
                return None
            self._tool_cache[cmd_name] = self.cli_group.get_command(None, cmd_name)
        return self._tool_cache[cmd_name]

    def handle_initialize(self, params: dict) -> dict:
        """Handle MCP initialize request: report protocol version, capabilities and server info."""
        return {
            "protocolVersion": self.protocol_version,
            "capabilities": {
                "tools": {},
            },
            "serverInfo": {
                "name": "archivebox-mcp",
                "version": VERSION,
            },
        }

    def handle_tools_list(self, params: dict) -> dict:
        """Handle MCP tools/list request - returns all available CLI commands as tools"""
        tools = []
        for cmd_name in self.cli_group.all_subcommands:
            click_cmd = self.get_click_command(cmd_name)
            if not click_cmd:
                continue
            try:
                tools.append(click_command_to_mcp_tool(cmd_name, click_cmd))
            except Exception as e:
                # Log but don't fail - skip problematic commands.
                # stderr only: stdout is reserved for JSON-RPC responses.
                print(f"Warning: Could not generate tool for {cmd_name}: {e}", file=sys.stderr)

        return {"tools": tools}

    def handle_tools_call(self, params: dict) -> dict:
        """
        Handle MCP tools/call request - executes a CLI command.

        Raises:
            ValueError: if ``name`` is missing, not a string, or not a known tool,
                or if ``arguments`` is present but not an object.
        """
        tool_name = params.get('name')
        arguments = params.get('arguments')
        if arguments is None:
            # Both an absent key and an explicit JSON null mean "no arguments"
            arguments = {}

        if not tool_name:
            raise ValueError("Missing required parameter: name")
        if not isinstance(tool_name, str):
            raise ValueError("Parameter 'name' must be a string")
        if not isinstance(arguments, dict):
            raise ValueError("Parameter 'arguments' must be an object")

        click_cmd = self.get_click_command(tool_name)
        if not click_cmd:
            raise ValueError(f"Unknown tool: {tool_name}")

        # Execute the command and return MCP-formatted result
        return execute_click_command(tool_name, click_cmd, arguments)

    def handle_request(self, request: dict) -> Optional[dict]:
        """
        Handle a JSON-RPC 2.0 request and return response.

        Supports MCP methods: initialize, ping, tools/list, tools/call

        Returns:
            The JSON-RPC response dict, or None when ``request`` is a notification
            (has no ``id`` member): JSON-RPC 2.0 forbids replying to those, even
            with an error.

        Error codes used: -32600 (message is not an object), -32601 (unknown
        method), -32602 (bad params / ValueError from a handler), -32603 (any
        other exception, with the traceback in ``data``).
        """
        if not isinstance(request, dict):
            return self._error_response(None, -32600, "Invalid Request", "Request must be a JSON object")

        method = request.get('method')
        request_id = request.get('id')
        is_notification = 'id' not in request

        params = request.get('params')
        if params is None:
            params = {}

        # Route to appropriate handler
        handlers = {
            'initialize': self.handle_initialize,
            'tools/list': self.handle_tools_list,
            'tools/call': self.handle_tools_call,
            'ping': lambda _params: {},
        }
        # isinstance guard: an unhashable `method` (e.g. a list) must not crash the lookup
        handler = handlers.get(method) if isinstance(method, str) else None

        if handler is None:
            response = self._error_response(request_id, -32601, f"Method not found: {method}")
        elif not isinstance(params, dict):
            response = self._error_response(request_id, -32602, "Invalid params", "params must be an object")
        else:
            try:
                response = {
                    "jsonrpc": "2.0",
                    "id": request_id,
                    "result": handler(params),
                }
            except ValueError as e:
                # Handlers raise ValueError for bad/missing parameters and unknown tools
                response = self._error_response(request_id, -32602, str(e))
            except Exception as e:
                response = self._error_response(request_id, -32603, str(e), traceback.format_exc())

        return None if is_notification else response

    def run_stdio_server(self):
        """
        Run the MCP server in stdio mode.

        Reads JSON-RPC requests from stdin (one per line),
        writes JSON-RPC responses to stdout (one per line).
        Returns when stdin is exhausted.
        """
        for line in sys.stdin:
            line = line.strip()
            if not line:
                continue

            try:
                request = json.loads(line)
            except json.JSONDecodeError as e:
                response = self._error_response(None, -32700, "Parse error", str(e))
            else:
                response = self.handle_request(request)

            if response is None:
                # Notification: nothing to send back
                continue

            # Use custom encoder for Click types; flush per message so an
            # interactive client sees each response immediately.
            print(json.dumps(response, cls=MCPJSONEncoder), flush=True)
def run_mcp_server():
    """Build an MCPServer and serve requests over stdio until stdin is exhausted."""
    MCPServer().run_stdio_server()