Advanced Topics - MCP Academy

Multi-Modal Integration

Multi-modal applications are becoming increasingly important in AI, enabling richer interactions and more complex tasks.

The Model Context Protocol (MCP) provides a framework for building multi-modal applications that can handle various types of data, such as text, images, and audio.

MCP supports not just text-based interactions but also multi-modal capabilities, allowing models to work with images, audio, and other data types.

Introduction

In this lesson, you'll learn how to build a multi-modal application.

Learning Objectives

By the end of this lesson, you will be able to:

Understand the options available for multi-modal support.

Implement a multi-modal app.

Architecture for Multi-Modal Support

Multi-modal MCP implementations typically involve:

Modal-Specific Parsers: Components that convert different media types into formats the model can process.

Modal-Specific Tools: Special tools designed to handle specific modalities (image analysis, audio processing)

Unified Context Management: System to maintain context across different modalities

Response Generation: Capability to generate responses that may include multiple modalities.

Multi-Modal Example: Image Analysis

In the example below, we will analyze an image and extract information from it.

C# Implementation


using ModelContextProtocol.SDK.Server;

using ModelContextProtocol.SDK.Server.Tools;

using ModelContextProtocol.SDK.Server.Content;

using System.Text.Json;

using System.IO;

using System.Threading.Tasks;

using System.Collections.Generic;



namespace MultiModalMcpExample

{

    // Tool for image analysis

    /// <summary>
    /// MCP tool that downloads an image from a URL and runs the requested kind of
    /// analysis on it through an <see cref="IImageAnalysisService"/>.
    /// </summary>
    public class ImageAnalysisTool : ITool
    {
        private const string ImageUrlParameter = "imageUrl";
        private const string AnalysisTypeParameter = "analysisType";
        private const string DefaultAnalysisType = "general";

        // One shared client for the whole process: creating an HttpClient per call
        // leaves sockets lingering and can exhaust them under load.
        private static readonly HttpClient s_httpClient = new HttpClient();

        private readonly IImageAnalysisService _imageService;

        /// <summary>Creates the tool around the service that performs the actual analysis.</summary>
        /// <param name="imageService">Analysis backend; must not be null.</param>
        public ImageAnalysisTool(IImageAnalysisService imageService)
        {
            _imageService = imageService ?? throw new System.ArgumentNullException(nameof(imageService));
        }

        /// <summary>Name under which the tool is exposed.</summary>
        public string Name => "imageAnalysis";

        /// <summary>Human-readable description of the tool.</summary>
        public string Description => "Analyzes image content and extracts information";

        /// <summary>
        /// Describes the tool's parameters: a required <c>imageUrl</c> and an optional
        /// <c>analysisType</c> that defaults to <c>general</c>.
        /// </summary>
        public ToolDefinition GetDefinition()
        {
            return new ToolDefinition
            {
                Name = Name,
                Description = Description,
                Parameters = new Dictionary<string, ParameterDefinition>
                {
                    [ImageUrlParameter] = new ParameterDefinition
                    {
                        Type = ParameterType.String,
                        Description = "URL to the image to analyze"
                    },
                    [AnalysisTypeParameter] = new ParameterDefinition
                    {
                        Type = ParameterType.String,
                        Description = "Type of analysis to perform",
                        Enum = new[] { "general", "objects", "text", "faces" },
                        Default = DefaultAnalysisType
                    }
                },
                Required = new[] { ImageUrlParameter }
            };
        }

        /// <summary>
        /// Downloads the image named by <c>imageUrl</c>, analyzes it and returns the
        /// result serialized as JSON in a single text content item.
        /// </summary>
        /// <param name="parameters">Tool arguments keyed by parameter name.</param>
        /// <returns>
        /// The analysis result, or a response with <c>IsError = true</c> when
        /// <c>imageUrl</c> is missing or is not an absolute http(s) URL.
        /// </returns>
        public async Task<ToolResponse> ExecuteAsync(IDictionary<string, object> parameters)
        {
            // Invalid input is reported through the tool result instead of letting a
            // KeyNotFoundException / NullReferenceException escape to the host.
            if (parameters is null
                || !parameters.TryGetValue(ImageUrlParameter, out var rawUrl)
                || !System.Uri.TryCreate(rawUrl?.ToString(), System.UriKind.Absolute, out var imageUri)
                || (imageUri.Scheme != System.Uri.UriSchemeHttp && imageUri.Scheme != System.Uri.UriSchemeHttps))
            {
                return CreateTextResponse(
                    "Parameter 'imageUrl' is required and must be an absolute http(s) URL.",
                    isError: true);
            }

            // Single lookup; a missing or null value falls back to the default analysis.
            string analysisType = parameters.TryGetValue(AnalysisTypeParameter, out var rawType)
                ? rawType?.ToString() ?? DefaultAnalysisType
                : DefaultAnalysisType;

            // NOTE(review): the URL is caller-supplied; a production server should also
            // restrict which hosts may be fetched to avoid server-side request forgery.
            byte[] imageData = await DownloadImageAsync(imageUri);

            // Unknown analysis types deliberately fall through to the general analysis.
            var analysisResult = analysisType switch
            {
                "objects" => await _imageService.DetectObjectsAsync(imageData),
                "text" => await _imageService.RecognizeTextAsync(imageData),
                "faces" => await _imageService.DetectFacesAsync(imageData),
                _ => await _imageService.AnalyzeGeneralAsync(imageData)
            };

            return CreateTextResponse(JsonSerializer.Serialize(analysisResult), isError: false);
        }

        // Wraps a piece of text in the single-item content list the response carries.
        private static ToolResponse CreateTextResponse(string text, bool isError)
        {
            return new ToolResponse
            {
                Content = new List<ContentItem>
                {
                    new ContentItem
                    {
                        Type = ContentType.Text,
                        Text = text
                    }
                },
                IsError = isError
            };
        }

        // Fetches the raw image bytes using the shared client.
        private static async Task<byte[]> DownloadImageAsync(System.Uri imageUri)
        {
            return await s_httpClient.GetByteArrayAsync(imageUri);
        }
    }

    

    // Multi-modal MCP server with image and text processing

    /// <summary>
    /// Entry point that assembles a multi-modal MCP server and registers its
    /// image, text and document tools.
    /// </summary>
    public class MultiModalMcpServer
    {
        /// <summary>Creates the server, builds its options and registers the tools.</summary>
        /// <param name="args">Command-line arguments (unused).</param>
        public static async Task Main(string[] args)
        {
            // Create an MCP server
            var server = new McpServer(
                name: "Multi-Modal MCP Server",
                version: "1.0.0"
            );

            // Configure server for multi-modal support
            // NOTE(review): these options are built but never handed to the server in
            // this snippet; pass them wherever the SDK expects them — TODO confirm API.
            var serverOptions = new McpServerOptions
            {
                MaxRequestSize = 10 * 1024 * 1024, // 10MB for larger payloads like images
                SupportedContentTypes = new[]
                {
                    "image/jpeg",
                    "image/png",
                    "text/plain",
                    "application/json"
                }
            };

            // Create image analysis service
            var imageService = new ComputerVisionService();

            // Register every tool on the server instance itself. The original snippet
            // mixed this with calls on an undeclared `services` collection and
            // registered ImageAnalysisTool twice.
            server.AddTool(new ImageAnalysisTool(imageService));

            // NOTE(review): assumes these tools have parameterless constructors — confirm.
            server.AddTool(new TextAnalysisTool());
            server.AddTool(new DocumentGenerationTool()); // Tool that can generate documents with text and images

            // NOTE(review): the server is never started here; call the SDK's start/run
            // method (and await it) once the registrations above are in place.
        }
    }

}

In the preceding example, we've:

Created an ImageAnalysisTool that can analyze images using a hypothetical IImageAnalysisService.

Configured the MCP server to handle larger requests and support image content types.

Registered the image analysis tool with the server.

Implemented a method to download images from a URL and analyze them based on the requested type (objects, text, faces, etc.).

Returned structured results in a format compliant with the MCP specification.

Multi-Modal Example: Audio Processing

Audio processing is another common modality in multi-modal applications. Below is an example of how to implement an audio transcription tool that can handle audio files and return transcriptions.

Java Implementation


package com.example.mcp.multimodal;



import com.mcp.server.McpServer;

import com.mcp.tools.Tool;

import com.mcp.tools.ToolRequest;

import com.mcp.tools.ToolResponse;

import com.mcp.tools.ToolExecutionException;

import com.example.audio.AudioProcessor;



import java.util.Base64;

import java.util.HashMap;

import java.util.Map;



// Audio transcription tool

/**
 * MCP tool that transcribes speech to text.
 *
 * <p>The audio is supplied either as a URL ({@code audioUrl}) or inline as
 * Base64 ({@code audioData}); an optional {@code language} code defaults to
 * {@code en-US}.
 */
public class AudioTranscriptionTool implements Tool {
    private static final String DEFAULT_LANGUAGE = "en-US";

    private final AudioProcessor audioProcessor;

    /**
     * @param audioProcessor service that performs the actual transcription
     */
    public AudioTranscriptionTool(AudioProcessor audioProcessor) {
        this.audioProcessor = audioProcessor;
    }

    @Override
    public String getName() {
        return "audioTranscription";
    }

    @Override
    public String getDescription() {
        return "Transcribes speech from audio files to text";
    }

    /**
     * Builds the JSON-Schema-style description of the tool's parameters.
     *
     * <p>No single property is marked as required: {@link #execute} accepts either
     * {@code audioUrl} or {@code audioData} and rejects requests carrying neither.
     * Marking {@code audioUrl} as required would make the {@code audioData}
     * alternative unusable for clients that validate against the schema.
     */
    @Override
    public Object getSchema() {
        Map<String, Object> language = stringProperty("Language code (e.g., 'en-US', 'es-ES')");
        language.put("default", DEFAULT_LANGUAGE);

        Map<String, Object> properties = new HashMap<>();
        properties.put("audioUrl", stringProperty("URL to the audio file to transcribe"));
        properties.put("audioData", stringProperty("Base64-encoded audio data (alternative to URL)"));
        properties.put("language", language);

        Map<String, Object> schema = new HashMap<>();
        schema.put("type", "object");
        schema.put("properties", properties);
        return schema;
    }

    /**
     * Resolves the audio bytes, transcribes them and returns the result.
     *
     * @param request tool request carrying the parameters described by the schema
     * @return response whose result is the map produced by the audio processor
     * @throws ToolExecutionException if no audio source was supplied or the
     *         transcription fails
     */
    @Override
    public ToolResponse execute(ToolRequest request) {
        try {
            String language = request.getParameters().has("language")
                ? request.getParameters().get("language").asText()
                : DEFAULT_LANGUAGE;

            byte[] audioData = resolveAudio(request);

            Map<String, Object> transcriptionResult = audioProcessor.transcribe(audioData, language);

            return new ToolResponse.Builder()
                .setResult(transcriptionResult)
                .build();
        } catch (ToolExecutionException ex) {
            // Already a tool-level error (e.g. missing audio source): do not wrap it
            // a second time under the "Audio transcription failed" prefix.
            throw ex;
        } catch (Exception ex) {
            throw new ToolExecutionException("Audio transcription failed: " + ex.getMessage(), ex);
        }
    }

    // Returns the audio bytes from the URL when given, otherwise from the Base64 payload.
    private byte[] resolveAudio(ToolRequest request) {
        if (request.getParameters().has("audioUrl")) {
            return downloadAudio(request.getParameters().get("audioUrl").asText());
        }
        if (request.getParameters().has("audioData")) {
            String base64Audio = request.getParameters().get("audioData").asText();
            return Base64.getDecoder().decode(base64Audio);
        }
        throw new ToolExecutionException("Either audioUrl or audioData must be provided");
    }

    // Creates a schema entry for a string-typed property with the given description.
    private static Map<String, Object> stringProperty(String description) {
        Map<String, Object> property = new HashMap<>();
        property.put("type", "string");
        property.put("description", description);
        return property;
    }

    private byte[] downloadAudio(String url) {
        // Placeholder: a real implementation would fetch the bytes behind the URL.
        // As written, it always yields an empty array.
        return new byte[0];
    }
}



// Main application with audio and other modalities

/**
 * Application entry point: creates the processors, builds the MCP server,
 * registers the multi-modal tools and starts the server.
 *
 * <p>NOTE(review): ImageProcessor, ImageAnalysisTool and VideoProcessingTool are
 * neither imported nor defined in this snippet; they are presumably provided
 * elsewhere in the project.
 */
public class MultiModalApplication {
    public static void main(String[] args) {
        // Processors are created first, before the server is built.
        AudioProcessor speechProcessor = new AudioProcessor();
        ImageProcessor visionProcessor = new ImageProcessor();

        McpServer mcpServer = buildServer();
        registerTools(mcpServer, speechProcessor, visionProcessor);

        mcpServer.start();
        System.out.println("Multi-Modal MCP Server started on port 5000");
    }

    // Builds the server with its name, version, port and request-size limit.
    private static McpServer buildServer() {
        return new McpServer.Builder()
            .setName("Multi-Modal MCP Server")
            .setVersion("1.0.0")
            .setPort(5000)
            .setMaxRequestSize(20 * 1024 * 1024) // 20MB for audio/video content
            .build();
    }

    // Registers the audio, image and video tools, in that order.
    private static void registerTools(McpServer mcpServer,
                                      AudioProcessor speechProcessor,
                                      ImageProcessor visionProcessor) {
        mcpServer.registerTool(new AudioTranscriptionTool(speechProcessor));
        mcpServer.registerTool(new ImageAnalysisTool(visionProcessor));
        mcpServer.registerTool(new VideoProcessingTool());
    }
}

In the preceding example, we've:

Created an AudioTranscriptionTool that can transcribe audio files.

Defined the tool's schema to accept either a URL or base64-encoded audio data.

Implemented the execute method to handle audio processing and transcription.

Configured the MCP server to handle multi-modal requests, including audio and image processing.

Registered the audio transcription tool with the server.

Implemented a method to download audio files from a URL or decode base64 audio data.

Used an AudioProcessor service to handle the actual transcription logic.

Started the MCP server to listen for requests.

Multi-Modal Example: Multi-Modal Response Generation

Python Implementation


from mcp_server import McpServer

from mcp_tools import Tool, ToolRequest, ToolResponse, ToolExecutionException

import base64

from PIL import Image

import io

import requests

import json

from typing import Dict, Any, List, Optional



# Image generation tool

class ImageGenerationTool(Tool):
    """Tool that turns a text prompt into a PNG image returned as base64."""

    _STYLES = ("realistic", "artistic", "cartoon", "sketch")
    _DEFAULT_STYLE = "realistic"
    _DEFAULT_SIZE = 512

    def get_name(self):
        """Return the name under which the tool is registered."""
        return "imageGeneration"

    def get_description(self):
        """Return a human-readable description of the tool."""
        return "Generates images based on text descriptions"

    def get_schema(self):
        """Return the JSON-Schema-style description of the tool's parameters."""
        return {
            "type": "object",
            "properties": {
                "prompt": {
                    "type": "string",
                    "description": "Text description of the image to generate"
                },
                "style": {
                    "type": "string",
                    "enum": list(self._STYLES),
                    "default": self._DEFAULT_STYLE
                },
                "width": {
                    "type": "integer",
                    "minimum": 1,
                    "default": self._DEFAULT_SIZE
                },
                "height": {
                    "type": "integer",
                    "minimum": 1,
                    "default": self._DEFAULT_SIZE
                }
            },
            "required": ["prompt"]
        }

    async def execute_async(self, request: ToolRequest) -> ToolResponse:
        """Generate an image for the request and return it with its metadata.

        Reads ``prompt`` (required), ``style``, ``width`` and ``height`` from
        ``request.parameters``; the optional values fall back to the defaults
        declared in the schema.

        Raises:
            ToolExecutionException: if a parameter is invalid or generation
                fails. The original exception is attached as the cause.
        """
        try:
            params = request.parameters

            # Validate up front so the caller gets a precise message instead of an
            # opaque failure from deep inside the imaging library.
            prompt = params.get("prompt")
            if not isinstance(prompt, str) or not prompt.strip():
                raise ValueError("'prompt' is required and must be a non-empty string")

            style = params.get("style", self._DEFAULT_STYLE)
            if style not in self._STYLES:
                raise ValueError(f"'style' must be one of {', '.join(self._STYLES)}")

            width = self._positive_int(params.get("width", self._DEFAULT_SIZE), "width")
            height = self._positive_int(params.get("height", self._DEFAULT_SIZE), "height")

            image = await self._generate_image(prompt, style, width, height)

            # Encode the image as PNG, then base64, so it can travel in the result
            with io.BytesIO() as buffer:
                image.save(buffer, format="PNG")
                img_str = base64.b64encode(buffer.getvalue()).decode("ascii")

            return ToolResponse(
                result={
                    "imageBase64": img_str,
                    "format": "image/png",
                    "width": width,
                    "height": height,
                    "generationPrompt": prompt,
                    "style": style
                }
            )
        except Exception as e:
            # Chain explicitly so the root cause stays attached to the tool error.
            raise ToolExecutionException(f"Image generation failed: {e}") from e

    @staticmethod
    def _positive_int(value: Any, name: str) -> int:
        """Return ``value`` if it is a positive integer, otherwise raise ``ValueError``."""
        # bool is a subclass of int; reject it so True/False are not read as 1/0.
        if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
            raise ValueError(f"'{name}' must be a positive integer")
        return value

    async def _generate_image(self, prompt: str, style: str, width: int, height: int) -> Image.Image:
        """Return the generated image.

        Placeholder implementation: ``prompt`` and ``style`` are ignored and a
        solid-colour image of the requested size is returned. A real tool would
        call an image generation API here.
        """
        return Image.new('RGB', (width, height), color=(73, 109, 137))



# Multi-modal response handler

class MultiModalResponseHandler:
    """Builds responses that pair text with images from the ``imageGeneration`` tool."""

    def __init__(self, mcp_client):
        # Client used to invoke tools; only ``execute_tool`` is called on it.
        self.client = mcp_client

    async def create_multi_modal_response(self,
                                         text_content: str,
                                         generate_images: bool = False,
                                         image_prompts: Optional[List[str]] = None) -> Dict[str, Any]:
        """Return ``{"text": ..., "images": [...]}`` for the given text.

        Images are generated only when ``generate_images`` is truthy and
        ``image_prompts`` is non-empty. The tool is then called once per prompt,
        one after another, and each entry records the image data, its format
        and the prompt that produced it.
        """
        images: List[Dict[str, Any]] = []
        wants_images = bool(generate_images and image_prompts)

        for prompt_text in (image_prompts if wants_images else ()):
            tool_args = {
                "prompt": prompt_text,
                "style": "realistic",
                "width": 512,
                "height": 512
            }
            generated = await self.client.execute_tool("imageGeneration", tool_args)
            payload = generated.result

            images.append({
                "imageData": payload["imageBase64"],
                "format": payload["format"],
                "prompt": prompt_text
            })

        return {"text": text_content, "images": images}



# Main application

async def main():
    """Create the multi-modal MCP server, register its tools and start it."""
    server_settings = {
        "name": "Multi-Modal MCP Server",
        "version": "1.0.0",
        "port": 5000,
    }
    mcp_server = McpServer(**server_settings)

    # NOTE(review): AudioAnalysisTool and VideoFrameExtractionTool are neither
    # defined nor imported in this snippet; presumably they live elsewhere.
    register = mcp_server.register_tool
    register(ImageGenerationTool())
    register(AudioAnalysisTool())
    register(VideoFrameExtractionTool())

    # The status line is printed only once start() has returned.
    await mcp_server.start()
    print("Multi-Modal MCP Server running on port 5000")


# Script entry point: run the async main() on a fresh event loop.
if __name__ == "__main__":
    import asyncio

    asyncio.run(main())

What's next

5.3 OAuth 2

다중 모달 통합

다중 모달 애플리케이션은 AI에서 점점 더 중요해지고 있으며, 더 풍부한 상호작용과 복잡한 작업 수행을 가능하게 합니다. Model Context Protocol(MCP)은 텍스트, 이미지, 오디오 등 다양한 유형의 데이터를 처리할 수 있는 다중 모달 애플리케이션을 구축하기 위한 프레임워크를 제공합니다.

MCP는 텍스트 기반 상호작용뿐만 아니라 이미지, 오디오 및 기타 데이터 유형을 다룰 수 있는 다중 모달 기능도 지원합니다.

소개

이번 수업에서는 다중 모달 애플리케이션을 만드는 방법을 배웁니다.

학습 목표

이 수업이 끝나면 다음을 할 수 있습니다:

다중 모달 선택지를 이해하기

다중 모달 앱 구현하기

다중 모달 지원 아키텍처

다중 모달 MCP 구현은 일반적으로 다음을 포함합니다:

모달별 파서: 모델이 처리할 수 있는 형식으로 다양한 미디어 유형을 변환하는 구성 요소

모달별 도구: 특정 모달리티(예: 이미지 분석, 오디오 처리)를 다루기 위한 특수 도구

통합 컨텍스트 관리: 서로 다른 모달리티 간의 컨텍스트를 유지하는 시스템

응답 생성: 여러 모달리티를 포함할 수 있는 응답을 생성하는 기능

다중 모달 예제: 이미지 분석

아래 예제에서는 이미지를 분석하고 정보를 추출합니다.

C# 구현


using ModelContextProtocol.SDK.Server;

using ModelContextProtocol.SDK.Server.Tools;

using ModelContextProtocol.SDK.Server.Content;

using System.Text.Json;

using System.IO;

using System.Threading.Tasks;

using System.Collections.Generic;



namespace MultiModalMcpExample

{

    // Tool for image analysis

    /// <summary>
    /// MCP tool that downloads an image from a URL and runs the requested kind of
    /// analysis on it through an <see cref="IImageAnalysisService"/>.
    /// </summary>
    public class ImageAnalysisTool : ITool
    {
        private const string ImageUrlParameter = "imageUrl";
        private const string AnalysisTypeParameter = "analysisType";
        private const string DefaultAnalysisType = "general";

        // One shared client for the whole process: creating an HttpClient per call
        // leaves sockets lingering and can exhaust them under load.
        private static readonly HttpClient s_httpClient = new HttpClient();

        private readonly IImageAnalysisService _imageService;

        /// <summary>Creates the tool around the service that performs the actual analysis.</summary>
        /// <param name="imageService">Analysis backend; must not be null.</param>
        public ImageAnalysisTool(IImageAnalysisService imageService)
        {
            _imageService = imageService ?? throw new System.ArgumentNullException(nameof(imageService));
        }

        /// <summary>Name under which the tool is exposed.</summary>
        public string Name => "imageAnalysis";

        /// <summary>Human-readable description of the tool.</summary>
        public string Description => "Analyzes image content and extracts information";

        /// <summary>
        /// Describes the tool's parameters: a required <c>imageUrl</c> and an optional
        /// <c>analysisType</c> that defaults to <c>general</c>.
        /// </summary>
        public ToolDefinition GetDefinition()
        {
            return new ToolDefinition
            {
                Name = Name,
                Description = Description,
                Parameters = new Dictionary<string, ParameterDefinition>
                {
                    [ImageUrlParameter] = new ParameterDefinition
                    {
                        Type = ParameterType.String,
                        Description = "URL to the image to analyze"
                    },
                    [AnalysisTypeParameter] = new ParameterDefinition
                    {
                        Type = ParameterType.String,
                        Description = "Type of analysis to perform",
                        Enum = new[] { "general", "objects", "text", "faces" },
                        Default = DefaultAnalysisType
                    }
                },
                Required = new[] { ImageUrlParameter }
            };
        }

        /// <summary>
        /// Downloads the image named by <c>imageUrl</c>, analyzes it and returns the
        /// result serialized as JSON in a single text content item.
        /// </summary>
        /// <param name="parameters">Tool arguments keyed by parameter name.</param>
        /// <returns>
        /// The analysis result, or a response with <c>IsError = true</c> when
        /// <c>imageUrl</c> is missing or is not an absolute http(s) URL.
        /// </returns>
        public async Task<ToolResponse> ExecuteAsync(IDictionary<string, object> parameters)
        {
            // Invalid input is reported through the tool result instead of letting a
            // KeyNotFoundException / NullReferenceException escape to the host.
            if (parameters is null
                || !parameters.TryGetValue(ImageUrlParameter, out var rawUrl)
                || !System.Uri.TryCreate(rawUrl?.ToString(), System.UriKind.Absolute, out var imageUri)
                || (imageUri.Scheme != System.Uri.UriSchemeHttp && imageUri.Scheme != System.Uri.UriSchemeHttps))
            {
                return CreateTextResponse(
                    "Parameter 'imageUrl' is required and must be an absolute http(s) URL.",
                    isError: true);
            }

            // Single lookup; a missing or null value falls back to the default analysis.
            string analysisType = parameters.TryGetValue(AnalysisTypeParameter, out var rawType)
                ? rawType?.ToString() ?? DefaultAnalysisType
                : DefaultAnalysisType;

            // NOTE(review): the URL is caller-supplied; a production server should also
            // restrict which hosts may be fetched to avoid server-side request forgery.
            byte[] imageData = await DownloadImageAsync(imageUri);

            // Unknown analysis types deliberately fall through to the general analysis.
            var analysisResult = analysisType switch
            {
                "objects" => await _imageService.DetectObjectsAsync(imageData),
                "text" => await _imageService.RecognizeTextAsync(imageData),
                "faces" => await _imageService.DetectFacesAsync(imageData),
                _ => await _imageService.AnalyzeGeneralAsync(imageData)
            };

            return CreateTextResponse(JsonSerializer.Serialize(analysisResult), isError: false);
        }

        // Wraps a piece of text in the single-item content list the response carries.
        private static ToolResponse CreateTextResponse(string text, bool isError)
        {
            return new ToolResponse
            {
                Content = new List<ContentItem>
                {
                    new ContentItem
                    {
                        Type = ContentType.Text,
                        Text = text
                    }
                },
                IsError = isError
            };
        }

        // Fetches the raw image bytes using the shared client.
        private static async Task<byte[]> DownloadImageAsync(System.Uri imageUri)
        {
            return await s_httpClient.GetByteArrayAsync(imageUri);
        }
    }

    

    // Multi-modal MCP server with image and text processing

    /// <summary>
    /// Entry point that assembles a multi-modal MCP server and registers its
    /// image, text and document tools.
    /// </summary>
    public class MultiModalMcpServer
    {
        /// <summary>Creates the server, builds its options and registers the tools.</summary>
        /// <param name="args">Command-line arguments (unused).</param>
        public static async Task Main(string[] args)
        {
            // Create an MCP server
            var server = new McpServer(
                name: "Multi-Modal MCP Server",
                version: "1.0.0"
            );

            // Configure server for multi-modal support
            // NOTE(review): these options are built but never handed to the server in
            // this snippet; pass them wherever the SDK expects them — TODO confirm API.
            var serverOptions = new McpServerOptions
            {
                MaxRequestSize = 10 * 1024 * 1024, // 10MB for larger payloads like images
                SupportedContentTypes = new[]
                {
                    "image/jpeg",
                    "image/png",
                    "text/plain",
                    "application/json"
                }
            };

            // Create image analysis service
            var imageService = new ComputerVisionService();

            // Register every tool on the server instance itself. The original snippet
            // mixed this with calls on an undeclared `services` collection and
            // registered ImageAnalysisTool twice.
            server.AddTool(new ImageAnalysisTool(imageService));

            // NOTE(review): assumes these tools have parameterless constructors — confirm.
            server.AddTool(new TextAnalysisTool());
            server.AddTool(new DocumentGenerationTool()); // Tool that can generate documents with text and images

            // NOTE(review): the server is never started here; call the SDK's start/run
            // method (and await it) once the registrations above are in place.
        }
    }

}

위 예제에서는 다음을 수행했습니다:

가상의 IImageAnalysisService를 사용하여 이미지를 분석할 수 있는 ImageAnalysisTool을 생성했습니다.

MCP 서버를 더 큰 요청을 처리하고 이미지 콘텐츠 유형을 지원하도록 구성했습니다.

이미지 분석 도구를 서버에 등록했습니다.

URL에서 이미지를 다운로드하고 요청된 유형(객체, 텍스트, 얼굴 등)에 따라 분석하는 메서드를 구현했습니다.

MCP 사양에 맞는 형식으로 구조화된 결과를 반환했습니다.

다중 모달 예제: 오디오 처리

오디오 처리는 다중 모달 애플리케이션에서 또 다른 일반적인 모달리티입니다. 아래는 오디오 파일을 처리하고 전사를 반환하는 오디오 전사 도구를 구현하는 예제입니다.

Java 구현


package com.example.mcp.multimodal;



import com.mcp.server.McpServer;

import com.mcp.tools.Tool;

import com.mcp.tools.ToolRequest;

import com.mcp.tools.ToolResponse;

import com.mcp.tools.ToolExecutionException;

import com.example.audio.AudioProcessor;



import java.util.Base64;

import java.util.HashMap;

import java.util.Map;



// Audio transcription tool

/**
 * MCP tool that transcribes speech to text.
 *
 * <p>The audio is supplied either as a URL ({@code audioUrl}) or inline as
 * Base64 ({@code audioData}); an optional {@code language} code defaults to
 * {@code en-US}.
 */
public class AudioTranscriptionTool implements Tool {
    private static final String DEFAULT_LANGUAGE = "en-US";

    private final AudioProcessor audioProcessor;

    /**
     * @param audioProcessor service that performs the actual transcription
     */
    public AudioTranscriptionTool(AudioProcessor audioProcessor) {
        this.audioProcessor = audioProcessor;
    }

    @Override
    public String getName() {
        return "audioTranscription";
    }

    @Override
    public String getDescription() {
        return "Transcribes speech from audio files to text";
    }

    /**
     * Builds the JSON-Schema-style description of the tool's parameters.
     *
     * <p>No single property is marked as required: {@link #execute} accepts either
     * {@code audioUrl} or {@code audioData} and rejects requests carrying neither.
     * Marking {@code audioUrl} as required would make the {@code audioData}
     * alternative unusable for clients that validate against the schema.
     */
    @Override
    public Object getSchema() {
        Map<String, Object> language = stringProperty("Language code (e.g., 'en-US', 'es-ES')");
        language.put("default", DEFAULT_LANGUAGE);

        Map<String, Object> properties = new HashMap<>();
        properties.put("audioUrl", stringProperty("URL to the audio file to transcribe"));
        properties.put("audioData", stringProperty("Base64-encoded audio data (alternative to URL)"));
        properties.put("language", language);

        Map<String, Object> schema = new HashMap<>();
        schema.put("type", "object");
        schema.put("properties", properties);
        return schema;
    }

    /**
     * Resolves the audio bytes, transcribes them and returns the result.
     *
     * @param request tool request carrying the parameters described by the schema
     * @return response whose result is the map produced by the audio processor
     * @throws ToolExecutionException if no audio source was supplied or the
     *         transcription fails
     */
    @Override
    public ToolResponse execute(ToolRequest request) {
        try {
            String language = request.getParameters().has("language")
                ? request.getParameters().get("language").asText()
                : DEFAULT_LANGUAGE;

            byte[] audioData = resolveAudio(request);

            Map<String, Object> transcriptionResult = audioProcessor.transcribe(audioData, language);

            return new ToolResponse.Builder()
                .setResult(transcriptionResult)
                .build();
        } catch (ToolExecutionException ex) {
            // Already a tool-level error (e.g. missing audio source): do not wrap it
            // a second time under the "Audio transcription failed" prefix.
            throw ex;
        } catch (Exception ex) {
            throw new ToolExecutionException("Audio transcription failed: " + ex.getMessage(), ex);
        }
    }

    // Returns the audio bytes from the URL when given, otherwise from the Base64 payload.
    private byte[] resolveAudio(ToolRequest request) {
        if (request.getParameters().has("audioUrl")) {
            return downloadAudio(request.getParameters().get("audioUrl").asText());
        }
        if (request.getParameters().has("audioData")) {
            String base64Audio = request.getParameters().get("audioData").asText();
            return Base64.getDecoder().decode(base64Audio);
        }
        throw new ToolExecutionException("Either audioUrl or audioData must be provided");
    }

    // Creates a schema entry for a string-typed property with the given description.
    private static Map<String, Object> stringProperty(String description) {
        Map<String, Object> property = new HashMap<>();
        property.put("type", "string");
        property.put("description", description);
        return property;
    }

    private byte[] downloadAudio(String url) {
        // Placeholder: a real implementation would fetch the bytes behind the URL.
        // As written, it always yields an empty array.
        return new byte[0];
    }
}



// Main application with audio and other modalities

/**
 * Application entry point: creates the processors, builds the MCP server,
 * registers the multi-modal tools and starts the server.
 *
 * <p>NOTE(review): ImageProcessor, ImageAnalysisTool and VideoProcessingTool are
 * neither imported nor defined in this snippet; they are presumably provided
 * elsewhere in the project.
 */
public class MultiModalApplication {
    public static void main(String[] args) {
        // Processors are created first, before the server is built.
        AudioProcessor speechProcessor = new AudioProcessor();
        ImageProcessor visionProcessor = new ImageProcessor();

        McpServer mcpServer = buildServer();
        registerTools(mcpServer, speechProcessor, visionProcessor);

        mcpServer.start();
        System.out.println("Multi-Modal MCP Server started on port 5000");
    }

    // Builds the server with its name, version, port and request-size limit.
    private static McpServer buildServer() {
        return new McpServer.Builder()
            .setName("Multi-Modal MCP Server")
            .setVersion("1.0.0")
            .setPort(5000)
            .setMaxRequestSize(20 * 1024 * 1024) // 20MB for audio/video content
            .build();
    }

    // Registers the audio, image and video tools, in that order.
    private static void registerTools(McpServer mcpServer,
                                      AudioProcessor speechProcessor,
                                      ImageProcessor visionProcessor) {
        mcpServer.registerTool(new AudioTranscriptionTool(speechProcessor));
        mcpServer.registerTool(new ImageAnalysisTool(visionProcessor));
        mcpServer.registerTool(new VideoProcessingTool());
    }
}

위 예제에서는 다음을 수행했습니다:

오디오 파일을 전사할 수 있는 AudioTranscriptionTool을 생성했습니다.

도구의 스키마를 URL 또는 base64로 인코딩된 오디오 데이터를 받을 수 있도록 정의했습니다.

오디오 처리 및 전사를 수행하는 execute 메서드를 구현했습니다.

오디오 및 이미지 처리를 포함한 다중 모달 요청을 처리하도록 MCP 서버를 구성했습니다.

오디오 전사 도구를 서버에 등록했습니다.

URL에서 오디오 파일을 다운로드하거나 base64 오디오 데이터를 디코딩하는 메서드를 구현했습니다.

실제 전사 로직을 처리하는 AudioProcessor 서비스를 사용했습니다.

요청을 수신하기 위해 MCP 서버를 시작했습니다.

다중 모달 예제: 다중 모달 응답 생성

Python 구현


from mcp_server import McpServer

from mcp_tools import Tool, ToolRequest, ToolResponse, ToolExecutionException

import base64

from PIL import Image

import io

import requests

import json

from typing import Dict, Any, List, Optional



# Image generation tool

class ImageGenerationTool(Tool):
    """Tool that turns a text prompt into a PNG image returned as base64."""

    _STYLES = ("realistic", "artistic", "cartoon", "sketch")
    _DEFAULT_STYLE = "realistic"
    _DEFAULT_SIZE = 512

    def get_name(self):
        """Return the name under which the tool is registered."""
        return "imageGeneration"

    def get_description(self):
        """Return a human-readable description of the tool."""
        return "Generates images based on text descriptions"

    def get_schema(self):
        """Return the JSON-Schema-style description of the tool's parameters."""
        return {
            "type": "object",
            "properties": {
                "prompt": {
                    "type": "string",
                    "description": "Text description of the image to generate"
                },
                "style": {
                    "type": "string",
                    "enum": list(self._STYLES),
                    "default": self._DEFAULT_STYLE
                },
                "width": {
                    "type": "integer",
                    "minimum": 1,
                    "default": self._DEFAULT_SIZE
                },
                "height": {
                    "type": "integer",
                    "minimum": 1,
                    "default": self._DEFAULT_SIZE
                }
            },
            "required": ["prompt"]
        }

    async def execute_async(self, request: ToolRequest) -> ToolResponse:
        """Generate an image for the request and return it with its metadata.

        Reads ``prompt`` (required), ``style``, ``width`` and ``height`` from
        ``request.parameters``; the optional values fall back to the defaults
        declared in the schema.

        Raises:
            ToolExecutionException: if a parameter is invalid or generation
                fails. The original exception is attached as the cause.
        """
        try:
            params = request.parameters

            # Validate up front so the caller gets a precise message instead of an
            # opaque failure from deep inside the imaging library.
            prompt = params.get("prompt")
            if not isinstance(prompt, str) or not prompt.strip():
                raise ValueError("'prompt' is required and must be a non-empty string")

            style = params.get("style", self._DEFAULT_STYLE)
            if style not in self._STYLES:
                raise ValueError(f"'style' must be one of {', '.join(self._STYLES)}")

            width = self._positive_int(params.get("width", self._DEFAULT_SIZE), "width")
            height = self._positive_int(params.get("height", self._DEFAULT_SIZE), "height")

            image = await self._generate_image(prompt, style, width, height)

            # Encode the image as PNG, then base64, so it can travel in the result
            with io.BytesIO() as buffer:
                image.save(buffer, format="PNG")
                img_str = base64.b64encode(buffer.getvalue()).decode("ascii")

            return ToolResponse(
                result={
                    "imageBase64": img_str,
                    "format": "image/png",
                    "width": width,
                    "height": height,
                    "generationPrompt": prompt,
                    "style": style
                }
            )
        except Exception as e:
            # Chain explicitly so the root cause stays attached to the tool error.
            raise ToolExecutionException(f"Image generation failed: {e}") from e

    @staticmethod
    def _positive_int(value: Any, name: str) -> int:
        """Return ``value`` if it is a positive integer, otherwise raise ``ValueError``."""
        # bool is a subclass of int; reject it so True/False are not read as 1/0.
        if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
            raise ValueError(f"'{name}' must be a positive integer")
        return value

    async def _generate_image(self, prompt: str, style: str, width: int, height: int) -> Image.Image:
        """Return the generated image.

        Placeholder implementation: ``prompt`` and ``style`` are ignored and a
        solid-colour image of the requested size is returned. A real tool would
        call an image generation API here.
        """
        return Image.new('RGB', (width, height), color=(73, 109, 137))



# Multi-modal response handler

class MultiModalResponseHandler:
    """Builds responses that pair text with images from the ``imageGeneration`` tool."""

    def __init__(self, mcp_client):
        # Client used to invoke tools; only ``execute_tool`` is called on it.
        self.client = mcp_client

    async def create_multi_modal_response(self,
                                         text_content: str,
                                         generate_images: bool = False,
                                         image_prompts: Optional[List[str]] = None) -> Dict[str, Any]:
        """Return ``{"text": ..., "images": [...]}`` for the given text.

        Images are generated only when ``generate_images`` is truthy and
        ``image_prompts`` is non-empty. The tool is then called once per prompt,
        one after another, and each entry records the image data, its format
        and the prompt that produced it.
        """
        images: List[Dict[str, Any]] = []
        wants_images = bool(generate_images and image_prompts)

        for prompt_text in (image_prompts if wants_images else ()):
            tool_args = {
                "prompt": prompt_text,
                "style": "realistic",
                "width": 512,
                "height": 512
            }
            generated = await self.client.execute_tool("imageGeneration", tool_args)
            payload = generated.result

            images.append({
                "imageData": payload["imageBase64"],
                "format": payload["format"],
                "prompt": prompt_text
            })

        return {"text": text_content, "images": images}



# Main application

async def main():
    """Create the multi-modal MCP server, register its tools and start it."""
    server_settings = {
        "name": "Multi-Modal MCP Server",
        "version": "1.0.0",
        "port": 5000,
    }
    mcp_server = McpServer(**server_settings)

    # NOTE(review): AudioAnalysisTool and VideoFrameExtractionTool are neither
    # defined nor imported in this snippet; presumably they live elsewhere.
    register = mcp_server.register_tool
    register(ImageGenerationTool())
    register(AudioAnalysisTool())
    register(VideoFrameExtractionTool())

    # The status line is printed only once start() has returned.
    await mcp_server.start()
    print("Multi-Modal MCP Server running on port 5000")


# Script entry point: run the async main() on a fresh event loop.
if __name__ == "__main__":
    import asyncio

    asyncio.run(main())

다음 단계