diff --git a/docs/CN/source/tutorial/multimodal_request.rst b/docs/CN/source/tutorial/multimodal_request.rst new file mode 100644 index 0000000000..838f6b3301 --- /dev/null +++ b/docs/CN/source/tutorial/multimodal_request.rst @@ -0,0 +1,413 @@ +多模态请求指南 +============================ + +LightLLM 提供两种 API 格式来接收多模态请求:OpenAI 兼容格式(推荐)和 Legacy 格式。本文档介绍如何向 LightLLM 服务发送包含图片和音频的多模态请求。 + +OpenAI 兼容格式(推荐) +------------------------ + +使用 ``POST /v1/chat/completions`` 端点,与 OpenAI API 格式完全兼容。 + +图片输入方式 +^^^^^^^^^^^^ + +LightLLM 支持三种图片输入方式: + +**1. URL 方式** + +.. code-block:: json + + { + "type": "image_url", + "image_url": { + "url": "https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3.5/demo/CI_Demo/mathv-1327.jpg" + } + } + +**2. Base64 编码** + +.. code-block:: json + + { + "type": "image_url", + "image_url": { + "url": "data:image/jpeg;base64,/9j/4AAQSkZJRg..." + } + } + +**3. 本地文件路径** + +.. code-block:: json + + { + "type": "image_url", + "image_url": { + "url": "file:///path/to/image.jpg" + } + } + +音频输入方式 +^^^^^^^^^^^^ + +音频同样支持 URL 和 Base64 两种方式: + +.. code-block:: json + + { + "type": "audio_url", + "audio_url": { + "url": "https://example.com/audio.wav" + } + } + +使用 curl 发送请求 +^^^^^^^^^^^^^^^^^^ + +**基本图片请求** + +.. code-block:: bash + + curl http://127.0.0.1:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "qwen", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3.5/demo/CI_Demo/mathv-1327.jpg" + } + }, + { + "type": "text", + "text": "请描述这张图片的内容。" + } + ] + } + ], + "max_completion_tokens": 512, + "temperature": 0.7 + }' + +**流式输出** + +.. 
code-block:: bash + + curl http://127.0.0.1:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "qwen", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3.5/demo/CI_Demo/mathv-1327.jpg" + } + }, + { + "type": "text", + "text": "请描述这张图片的内容。" + } + ] + } + ], + "max_completion_tokens": 512, + "stream": true + }' + +使用 Python requests 发送请求 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: python + + import requests + + url = "http://127.0.0.1:8080/v1/chat/completions" + image_url = "https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3.5/demo/CI_Demo/mathv-1327.jpg" + + payload = { + "model": "qwen", + "messages": [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "请描述这张图片的内容。"}, + ], + } + ], + "max_completion_tokens": 512, + "temperature": 0.7, + } + + response = requests.post(url, json=payload) + print(response.json()) + +使用 OpenAI SDK 发送请求 +^^^^^^^^^^^^^^^^^^^^^^^^ + +.. 
code-block:: python + + from openai import OpenAI + + client = OpenAI(base_url="http://127.0.0.1:8080/v1", api_key="none") + + image_url = "https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3.5/demo/CI_Demo/mathv-1327.jpg" + + # 非流式 + response = client.chat.completions.create( + model="qwen", + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "请描述这张图片的内容。"}, + ], + } + ], + max_completion_tokens=512, + temperature=0.7, + ) + print(response.choices[0].message.content) + + # 流式 + stream = client.chat.completions.create( + model="qwen", + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "请描述这张图片的内容。"}, + ], + } + ], + max_completion_tokens=512, + stream=True, + ) + for chunk in stream: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="", flush=True) + +多图片请求 +^^^^^^^^^^ + +在 ``content`` 列表中传入多个 ``image_url`` 即可: + +.. code-block:: python + + response = client.chat.completions.create( + model="qwen", + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": "https://example.com/image1.jpg"}}, + {"type": "image_url", "image_url": {"url": "https://example.com/image2.jpg"}}, + {"type": "text", "text": "请比较这两张图片的区别。"}, + ], + } + ], + max_completion_tokens=512, + ) + +多轮对话 +^^^^^^^^ + +多轮对话中,相同图片会自动命中 embed 缓存,无需重复推理: + +.. 
code-block:: python + + image_url = "https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3.5/demo/CI_Demo/mathv-1327.jpg" + + messages = [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "这张图片里有什么?"}, + ], + }, + ] + + # 第一轮 + response = client.chat.completions.create( + model="qwen", messages=messages, max_completion_tokens=512 + ) + answer = response.choices[0].message.content + messages.append({"role": "assistant", "content": answer}) + + # 第二轮(同一张图片会命中缓存) + messages.append({ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "请更详细地描述图片中的文字内容。"}, + ], + }) + response = client.chat.completions.create( + model="qwen", messages=messages, max_completion_tokens=512 + ) + +Legacy 格式 +------------ + +使用 ``POST /generate`` 端点,需要手动构造 prompt 模板和 multimodal_params。 + +.. code-block:: python + + import json + import requests + import base64 + + + def run(query, uris): + images = [] + for uri in uris: + if uri.startswith("http"): + images.append({"type": "url", "data": uri}) + else: + with open(uri, "rb") as fin: + b64 = base64.b64encode(fin.read()).decode("utf-8") + images.append({"type": "base64", "data": b64}) + + data = { + "inputs": query, + "parameters": { + "max_new_tokens": 512, + "do_sample": False, + }, + "multimodal_params": { + "images": images, + }, + } + + url = "http://127.0.0.1:8080/generate" + headers = {"Content-Type": "application/json"} + response = requests.post(url, headers=headers, data=json.dumps(data)) + return response + + + response = run( + uris=["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3.5/demo/CI_Demo/mathv-1327.jpg"], + query="<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n\n请描述这张图片的内容。<|im_end|>\n" + "<|im_start|>assistant\n", + ) + + if response.status_code == 200: + print(f"Result: {response.json()}") + else: + print(f"Error: {response.status_code}, 
{response.text}") + +.. note:: Legacy 格式需要手动构造 chat template,推荐使用 OpenAI 兼容格式以避免模板错误。 + +常用采样参数 +------------ + +.. list-table:: + :header-rows: 1 + :widths: 25 15 15 45 + + * - 参数 + - 类型 + - 默认值 + - 说明 + * - max_completion_tokens + - int + - 16384 + - 最大输出 token 数 + * - temperature + - float + - 1.0 + - 采样温度,越高越随机 + * - top_p + - float + - 1.0 + - 核采样参数 + * - top_k + - int + - -1 + - Top-K 采样,-1 表示禁用 + * - do_sample + - bool + - true + - 是否启用采样(false 为贪心解码) + * - repetition_penalty + - float + - 1.0 + - 重复惩罚系数,>1 抑制重复 + * - stream + - bool + - false + - 是否流式输出 + * - stop + - str/list + - - + - 停止生成的序列 + * - seed + - int + - -1 + - 随机种子,-1 为随机 + +响应格式 +-------- + +**非流式响应** + +.. code-block:: json + + { + "id": "chatcmpl-xxx", + "object": "chat.completion", + "created": 1234567890, + "model": "qwen", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "这张图片显示的是..." + }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 256, + "completion_tokens": 128, + "total_tokens": 384 + } + } + +**流式响应** + +每个 chunk 以 SSE 格式返回,带有 ``data:`` 前缀(冒号后跟一个空格): + +.. code-block:: json + + { + "id": "chatcmpl-xxx", + "object": "chat.completion.chunk", + "choices": [ + { + "index": 0, + "delta": { + "content": "这张" + }, + "finish_reason": null + } + ] + } + +环境变量 +-------- + +- **REQUEST_TIMEOUT**: 下载远程图片/音频的超时时间(秒),默认 5 +- **REQUEST_PROXY**: 下载远程资源时使用的代理地址 diff --git a/docs/EN/source/tutorial/multimodal_request.rst b/docs/EN/source/tutorial/multimodal_request.rst new file mode 100644 index 0000000000..22ecdc9fca --- /dev/null +++ b/docs/EN/source/tutorial/multimodal_request.rst @@ -0,0 +1,413 @@ +Multimodal Request Guide +============================ + +LightLLM provides two API formats for multimodal requests: OpenAI-compatible format (recommended) and Legacy format. This guide shows how to send requests with images and audio to a LightLLM server. 
+ +OpenAI-Compatible Format (Recommended) +--------------------------------------- + +Use the ``POST /v1/chat/completions`` endpoint, fully compatible with the OpenAI API. + +Image Input Methods +^^^^^^^^^^^^^^^^^^^ + +LightLLM supports three image input methods: + +**1. URL** + +.. code-block:: json + + { + "type": "image_url", + "image_url": { + "url": "https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3.5/demo/CI_Demo/mathv-1327.jpg" + } + } + +**2. Base64 Encoded** + +.. code-block:: json + + { + "type": "image_url", + "image_url": { + "url": "data:image/jpeg;base64,/9j/4AAQSkZJRg..." + } + } + +**3. Local File Path** + +.. code-block:: json + + { + "type": "image_url", + "image_url": { + "url": "file:///path/to/image.jpg" + } + } + +Audio Input Methods +^^^^^^^^^^^^^^^^^^^ + +Audio supports both URL and Base64 methods: + +.. code-block:: json + + { + "type": "audio_url", + "audio_url": { + "url": "https://example.com/audio.wav" + } + } + +Using curl +^^^^^^^^^^ + +**Basic Image Request** + +.. code-block:: bash + + curl http://127.0.0.1:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "qwen", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3.5/demo/CI_Demo/mathv-1327.jpg" + } + }, + { + "type": "text", + "text": "Describe the content of this image." + } + ] + } + ], + "max_completion_tokens": 512, + "temperature": 0.7 + }' + +**Streaming Output** + +.. code-block:: bash + + curl http://127.0.0.1:8080/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "qwen", + "messages": [ + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3.5/demo/CI_Demo/mathv-1327.jpg" + } + }, + { + "type": "text", + "text": "Describe the content of this image." 
+ } + ] + } + ], + "max_completion_tokens": 512, + "stream": true + }' + +Using Python requests +^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: python + + import requests + + url = "http://127.0.0.1:8080/v1/chat/completions" + image_url = "https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3.5/demo/CI_Demo/mathv-1327.jpg" + + payload = { + "model": "qwen", + "messages": [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Describe the content of this image."}, + ], + } + ], + "max_completion_tokens": 512, + "temperature": 0.7, + } + + response = requests.post(url, json=payload) + print(response.json()) + +Using OpenAI SDK +^^^^^^^^^^^^^^^^ + +.. code-block:: python + + from openai import OpenAI + + client = OpenAI(base_url="http://127.0.0.1:8080/v1", api_key="none") + + image_url = "https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3.5/demo/CI_Demo/mathv-1327.jpg" + + # Non-streaming + response = client.chat.completions.create( + model="qwen", + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Describe the content of this image."}, + ], + } + ], + max_completion_tokens=512, + temperature=0.7, + ) + print(response.choices[0].message.content) + + # Streaming + stream = client.chat.completions.create( + model="qwen", + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Describe the content of this image."}, + ], + } + ], + max_completion_tokens=512, + stream=True, + ) + for chunk in stream: + if chunk.choices[0].delta.content: + print(chunk.choices[0].delta.content, end="", flush=True) + +Multi-Image Request +^^^^^^^^^^^^^^^^^^^ + +Pass multiple ``image_url`` items in the ``content`` list: + +.. 
code-block:: python + + response = client.chat.completions.create( + model="qwen", + messages=[ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": "https://example.com/image1.jpg"}}, + {"type": "image_url", "image_url": {"url": "https://example.com/image2.jpg"}}, + {"type": "text", "text": "Compare the differences between these two images."}, + ], + } + ], + max_completion_tokens=512, + ) + +Multi-Turn Conversation +^^^^^^^^^^^^^^^^^^^^^^^ + +In multi-turn conversations, identical images automatically hit the embed cache, avoiding redundant inference: + +.. code-block:: python + + image_url = "https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3.5/demo/CI_Demo/mathv-1327.jpg" + + messages = [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "What is in this image?"}, + ], + }, + ] + + # First turn + response = client.chat.completions.create( + model="qwen", messages=messages, max_completion_tokens=512 + ) + answer = response.choices[0].message.content + messages.append({"role": "assistant", "content": answer}) + + # Second turn (same image hits cache) + messages.append({ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Describe the text content in the image in more detail."}, + ], + }) + response = client.chat.completions.create( + model="qwen", messages=messages, max_completion_tokens=512 + ) + +Legacy Format +-------------- + +Use the ``POST /generate`` endpoint. Requires manually constructing prompt templates and multimodal_params. + +.. 
code-block:: python + + import json + import requests + import base64 + + + def run(query, uris): + images = [] + for uri in uris: + if uri.startswith("http"): + images.append({"type": "url", "data": uri}) + else: + with open(uri, "rb") as fin: + b64 = base64.b64encode(fin.read()).decode("utf-8") + images.append({"type": "base64", "data": b64}) + + data = { + "inputs": query, + "parameters": { + "max_new_tokens": 512, + "do_sample": False, + }, + "multimodal_params": { + "images": images, + }, + } + + url = "http://127.0.0.1:8080/generate" + headers = {"Content-Type": "application/json"} + response = requests.post(url, headers=headers, data=json.dumps(data)) + return response + + + response = run( + uris=["https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3.5/demo/CI_Demo/mathv-1327.jpg"], + query="<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n\nDescribe the content of this image.<|im_end|>\n" + "<|im_start|>assistant\n", + ) + + if response.status_code == 200: + print(f"Result: {response.json()}") + else: + print(f"Error: {response.status_code}, {response.text}") + +.. note:: The Legacy format requires manually constructing chat templates. The OpenAI-compatible format is recommended to avoid template errors. + +Common Sampling Parameters +--------------------------- + +.. 
list-table:: + :header-rows: 1 + :widths: 25 15 15 45 + + * - Parameter + - Type + - Default + - Description + * - max_completion_tokens + - int + - 16384 + - Maximum output tokens + * - temperature + - float + - 1.0 + - Sampling temperature, higher is more random + * - top_p + - float + - 1.0 + - Nucleus sampling parameter + * - top_k + - int + - -1 + - Top-K sampling, -1 to disable + * - do_sample + - bool + - true + - Enable sampling (false for greedy decoding) + * - repetition_penalty + - float + - 1.0 + - Repetition penalty, >1 suppresses repetition + * - stream + - bool + - false + - Enable streaming output + * - stop + - str/list + - - + - Stop generation sequences + * - seed + - int + - -1 + - Random seed, -1 for random + +Response Format +---------------- + +**Non-Streaming Response** + +.. code-block:: json + + { + "id": "chatcmpl-xxx", + "object": "chat.completion", + "created": 1234567890, + "model": "qwen", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "The image shows..." + }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 256, + "completion_tokens": 128, + "total_tokens": 384 + } + } + +**Streaming Response** + +Each chunk is returned in SSE format, prefixed with ``data:`` followed by a space: + +.. code-block:: json + + { + "id": "chatcmpl-xxx", + "object": "chat.completion.chunk", + "choices": [ + { + "index": 0, + "delta": { + "content": "The image" + }, + "finish_reason": null + } + ] + } + +Environment Variables +--------------------- + +- **REQUEST_TIMEOUT**: Timeout (seconds) for downloading remote images/audio, default 5 +- **REQUEST_PROXY**: Proxy address for downloading remote resources