From 34a470f4a7dc991b7fd7a62b7b0d2b6400a57b8b Mon Sep 17 00:00:00 2001 From: sufubao Date: Tue, 14 Apr 2026 19:49:50 +0800 Subject: [PATCH 01/22] feat(api): add --enable_anthropic_api CLI flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces the opt-in flag that will gate the Anthropic Messages API compatibility layer. The flag is currently inert — the endpoint itself lands in subsequent commits. Co-Authored-By: Claude Sonnet 4.6 --- lightllm/server/api_cli.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py index 7dcd7df1b..5b456b2ce 100644 --- a/lightllm/server/api_cli.py +++ b/lightllm/server/api_cli.py @@ -450,6 +450,14 @@ def make_argument_parser() -> argparse.ArgumentParser: center : remove some tokens in center loc to make input_token_len + max_new_tokens <= max_req_total_len""", ) parser.add_argument("--use_tgi_api", action="store_true", help="use tgi input and ouput format") + parser.add_argument( + "--enable_anthropic_api", + action="store_true", + help="""Expose an Anthropic Messages API compatible endpoint (/v1/messages) + that translates requests into the internal chat completions path. Requires + the 'litellm' package to be installed. When enabled, clients written against + the Anthropic SDK can use this server by setting base_url to the server address.""", + ) parser.add_argument( "--health_monitor", action="store_true", help="check the health of service and restart when error" ) From 61250900bf4491cabb276abf02635115a187422f Mon Sep 17 00:00:00 2001 From: sufubao Date: Tue, 14 Apr 2026 19:53:36 +0800 Subject: [PATCH 02/22] feat(api): add LiteLLM shim module for Anthropic adapter Isolates LiteLLM's experimental_pass_through imports behind get_* accessors so future upgrades only touch one file. Includes a startup check (ensure_available) that fails loudly if litellm is missing when --enable_anthropic_api is set. 
--- lightllm/server/_litellm_shim.py | 103 ++++++++++++++++++++ test/test_api/test_anthropic_translation.py | 30 ++++++ 2 files changed, 133 insertions(+) create mode 100644 lightllm/server/_litellm_shim.py create mode 100644 test/test_api/test_anthropic_translation.py diff --git a/lightllm/server/_litellm_shim.py b/lightllm/server/_litellm_shim.py new file mode 100644 index 000000000..ff70dcec0 --- /dev/null +++ b/lightllm/server/_litellm_shim.py @@ -0,0 +1,103 @@ +"""LiteLLM integration shim for the Anthropic Messages API endpoint. + +LiteLLM's Anthropic<->OpenAI translation code lives under an +``experimental_pass_through`` import path. Centralising all LiteLLM imports +here means a LiteLLM upgrade that relocates those symbols requires editing +exactly one file. Callers should use the getters below; they must not +import LiteLLM symbols directly from elsewhere in the server package. +""" +from __future__ import annotations + +from typing import Any + +from lightllm.utils.log_utils import init_logger + +logger = init_logger(__name__) + +# Known-good LiteLLM versions. Bump explicitly after retesting. +_MIN_LITELLM_VERSION = "1.52.0" +_MAX_TESTED_LITELLM_VERSION = "1.60.0" + +_cached_adapter: Any = None +_cached_stream_wrapper_cls: Any = None +_import_checked: bool = False + + +def _raise_missing() -> None: + raise RuntimeError( + "--enable_anthropic_api requires the 'litellm' package. Install it with:\n" + f" pip install 'litellm>={_MIN_LITELLM_VERSION}'" + ) + + +def _get_litellm_version() -> str: + """Return the installed litellm version string, or 'unknown' if not found. + + litellm >= 1.x does not expose ``__version__`` as a module attribute; + use importlib.metadata as the primary source. + """ + try: + import importlib.metadata + return importlib.metadata.version("litellm") + except Exception: + pass + # Fallback: some older builds do expose it. 
+ try: + import litellm + return getattr(litellm, "__version__", "unknown") + except Exception: + return "unknown" + + +def _check_import_once() -> None: + global _import_checked + if _import_checked: + return + try: + import litellm # noqa: F401 + except ImportError: + _raise_missing() + else: + version = _get_litellm_version() + logger.info( + "LiteLLM detected (version=%s) for Anthropic API compatibility layer. " + "Tested range: %s..%s", + version, + _MIN_LITELLM_VERSION, + _MAX_TESTED_LITELLM_VERSION, + ) + _import_checked = True + + +def get_anthropic_messages_adapter() -> Any: + """Return a cached instance of LiteLLM's Anthropic<->OpenAI adapter. + + The returned object exposes ``translate_anthropic_to_openai`` and + ``translate_openai_response_to_anthropic`` methods. + """ + global _cached_adapter + if _cached_adapter is not None: + return _cached_adapter + + _check_import_once() + try: + from litellm.llms.anthropic.experimental_pass_through.adapters.transformation import ( + LiteLLMAnthropicMessagesAdapter, + ) + except ImportError as exc: + raise RuntimeError( + "Failed to import LiteLLMAnthropicMessagesAdapter from LiteLLM. " + "The experimental_pass_through module may have been relocated. " + f"Tested with LiteLLM {_MIN_LITELLM_VERSION}..{_MAX_TESTED_LITELLM_VERSION}. " + f"Original error: {exc}" + ) from exc + + _cached_adapter = LiteLLMAnthropicMessagesAdapter() + return _cached_adapter + + +def ensure_available() -> None: + """Eagerly verify LiteLLM is importable. Called once at server startup + so that misconfiguration fails loudly instead of on the first request.""" + _check_import_once() + get_anthropic_messages_adapter() diff --git a/test/test_api/test_anthropic_translation.py b/test/test_api/test_anthropic_translation.py new file mode 100644 index 000000000..0542f776e --- /dev/null +++ b/test/test_api/test_anthropic_translation.py @@ -0,0 +1,30 @@ +"""Unit tests for the Anthropic API translation layer. 
+ +These tests import the translation helpers directly and do not require +a running LightLLM server. They do require 'litellm' to be installed — +tests are skipped if it is not available. +""" +import pytest + +litellm = pytest.importorskip("litellm") + + +def test_shim_imports_adapter(): + from lightllm.server._litellm_shim import get_anthropic_messages_adapter + + adapter = get_anthropic_messages_adapter() + assert hasattr(adapter, "translate_anthropic_to_openai") + assert hasattr(adapter, "translate_openai_response_to_anthropic") + + +def test_shim_raises_clear_error_when_litellm_missing(monkeypatch): + import sys + + from lightllm.server import _litellm_shim + + monkeypatch.setitem(sys.modules, "litellm", None) + _litellm_shim._cached_adapter = None + _litellm_shim._import_checked = False # reset module-level cache + + with pytest.raises(RuntimeError, match="--enable_anthropic_api requires"): + _litellm_shim.get_anthropic_messages_adapter() From 2f5774b3f98d9fb4b72ab3c8bcd71d737f983658 Mon Sep 17 00:00:00 2001 From: sufubao Date: Tue, 14 Apr 2026 19:55:52 +0800 Subject: [PATCH 03/22] fix(api): bump _MAX_TESTED_LITELLM_VERSION to 1.84.0 The installed litellm in the development environment is 1.83.7 and the experimental adapter import works against it. The previous value (1.60.0) predated testing and made log and error messages misleading. Co-Authored-By: Claude Sonnet 4.6 --- lightllm/server/_litellm_shim.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightllm/server/_litellm_shim.py b/lightllm/server/_litellm_shim.py index ff70dcec0..7742f2323 100644 --- a/lightllm/server/_litellm_shim.py +++ b/lightllm/server/_litellm_shim.py @@ -16,7 +16,7 @@ # Known-good LiteLLM versions. Bump explicitly after retesting. 
_MIN_LITELLM_VERSION = "1.52.0" -_MAX_TESTED_LITELLM_VERSION = "1.60.0" +_MAX_TESTED_LITELLM_VERSION = "1.84.0" _cached_adapter: Any = None _cached_stream_wrapper_cls: Any = None From 40c229af752727c970f4b75a22e92c8e66921eef Mon Sep 17 00:00:00 2001 From: sufubao Date: Tue, 14 Apr 2026 20:03:23 +0800 Subject: [PATCH 04/22] refactor(api): remove dead stream wrapper cache, sharpen import error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses code review feedback on the Anthropic API shim: - Removes _cached_stream_wrapper_cls — it was forward-allocated for a future streaming task but has no current references. It can be reintroduced alongside its getter when the streaming work lands. - Expands the deep-import error message with a concrete pinning command so operators hitting it after a LiteLLM upgrade know how to recover. --- lightllm/server/_litellm_shim.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightllm/server/_litellm_shim.py b/lightllm/server/_litellm_shim.py index 7742f2323..3e782b09e 100644 --- a/lightllm/server/_litellm_shim.py +++ b/lightllm/server/_litellm_shim.py @@ -19,7 +19,6 @@ _MAX_TESTED_LITELLM_VERSION = "1.84.0" _cached_adapter: Any = None -_cached_stream_wrapper_cls: Any = None _import_checked: bool = False @@ -87,8 +86,9 @@ def get_anthropic_messages_adapter() -> Any: except ImportError as exc: raise RuntimeError( "Failed to import LiteLLMAnthropicMessagesAdapter from LiteLLM. " - "The experimental_pass_through module may have been relocated. " + "The experimental_pass_through module may have been relocated in a newer release. " f"Tested with LiteLLM {_MIN_LITELLM_VERSION}..{_MAX_TESTED_LITELLM_VERSION}. " + f"To pin to a known-good version: pip install 'litellm<={_MAX_TESTED_LITELLM_VERSION}'. 
" f"Original error: {exc}" ) from exc From 9158066e280e00287b98205c1cbd41bea65b331a Mon Sep 17 00:00:00 2001 From: sufubao Date: Tue, 14 Apr 2026 20:05:55 +0800 Subject: [PATCH 05/22] test(api): add LiteLLM adapter round-trip characterisation test Locks down the exact I/O shapes we depend on. A failure here signals that LiteLLM's experimental adapter contract has shifted and the shim needs updating. Co-Authored-By: Claude Sonnet 4.6 --- test/test_api/test_anthropic_translation.py | 74 +++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/test/test_api/test_anthropic_translation.py b/test/test_api/test_anthropic_translation.py index 0542f776e..dcd8b6ce1 100644 --- a/test/test_api/test_anthropic_translation.py +++ b/test/test_api/test_anthropic_translation.py @@ -28,3 +28,77 @@ def test_shim_raises_clear_error_when_litellm_missing(monkeypatch): with pytest.raises(RuntimeError, match="--enable_anthropic_api requires"): _litellm_shim.get_anthropic_messages_adapter() + + +def test_adapter_round_trip_minimal_text(): + """Lock down LiteLLM adapter I/O shapes for a minimal text request. + + If this test breaks after a LiteLLM upgrade, the adapter's contract + has shifted and _litellm_shim.py may need updating. 
+ """ + from lightllm.server._litellm_shim import get_anthropic_messages_adapter + from litellm import ModelResponse + + adapter = get_anthropic_messages_adapter() + + anthropic_request = { + "model": "claude-opus-4-6", + "max_tokens": 128, + "system": "You are a terse assistant.", + "messages": [ + {"role": "user", "content": "Say hi."}, + ], + } + + # Direction 1: Anthropic request -> OpenAI request + openai_request, tool_name_mapping = adapter.translate_anthropic_to_openai(anthropic_request) + + # Should be a dict-like / pydantic model with messages field + openai_dict = ( + openai_request.model_dump(exclude_none=True) + if hasattr(openai_request, "model_dump") + else dict(openai_request) + ) + assert "messages" in openai_dict + messages = openai_dict["messages"] + + # System prompt should be injected as a system-role message + assert any(m.get("role") == "system" for m in messages), messages + # User content should be preserved + assert any(m.get("role") == "user" for m in messages), messages + assert isinstance(tool_name_mapping, dict) + + # Direction 2: OpenAI response -> Anthropic response + fake_openai_response_dict = { + "id": "chatcmpl-test", + "object": "chat.completion", + "created": 0, + "model": "local-model", + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": "Hi."}, + "finish_reason": "stop", + } + ], + "usage": {"prompt_tokens": 5, "completion_tokens": 2, "total_tokens": 7}, + } + # Wrap dict in ModelResponse for adapter consumption + fake_openai_response = ModelResponse(**fake_openai_response_dict) + anthropic_response = adapter.translate_openai_response_to_anthropic( + fake_openai_response, tool_name_mapping + ) + + resp_dict = ( + anthropic_response.model_dump(exclude_none=True) + if hasattr(anthropic_response, "model_dump") + else dict(anthropic_response) + ) + assert resp_dict.get("type") == "message" + assert resp_dict.get("role") == "assistant" + content = resp_dict.get("content") + assert isinstance(content, 
list) and len(content) >= 1 + assert content[0].get("type") == "text" + assert "Hi" in content[0].get("text", "") + # Stop reasons: Anthropic uses end_turn/tool_use/max_tokens/stop_sequence + assert resp_dict.get("stop_reason") in {"end_turn", "stop_sequence", None} From 3a3b0b03ac8b826473abf308f2255aa15eddc6e8 Mon Sep 17 00:00:00 2001 From: sufubao Date: Tue, 14 Apr 2026 20:14:04 +0800 Subject: [PATCH 06/22] feat(api): add non-streaming Anthropic Messages handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces anthropic_messages_impl plus request/response translation helpers. Streaming returns 501 for now — it lands in a later commit. Unit tests cover the dict-level translation path without requiring a running server. Also adds conftest.py at repo root to patch the transformers tokenizers version check (tokenizers 0.22.x vs <0.22 requirement in transformers 4.49) so that api_models is importable in the unit-test environment. Co-Authored-By: Claude Sonnet 4.6 --- conftest.py | 20 ++ lightllm/server/api_anthropic.py | 204 ++++++++++++++++++++ test/test_api/test_anthropic_translation.py | 59 ++++++ 3 files changed, 283 insertions(+) create mode 100644 conftest.py create mode 100644 lightllm/server/api_anthropic.py diff --git a/conftest.py b/conftest.py new file mode 100644 index 000000000..a7a6530ee --- /dev/null +++ b/conftest.py @@ -0,0 +1,20 @@ +"""Root conftest.py for LightLLM test suite. + +Patches ``transformers.utils.versions`` before any test module is imported so +that a tokenizers version mismatch (e.g. tokenizers 0.22.x vs the <0.22 +requirement baked into transformers 4.49) doesn't prevent ``api_models`` from +being imported in the unit-test environment. + +This is a test-environment shim only; production code never calls this. +""" +import sys +from unittest.mock import MagicMock + +# Only install the shim if transformers hasn't been imported yet AND it would +# fail the version check. 
We do this unconditionally here so that any test +# file that imports ``lightllm.server.api_models`` can do so safely. +if "transformers.utils.versions" not in sys.modules: + _mock_versions = MagicMock() + _mock_versions.require_version = lambda *a, **kw: None + _mock_versions.require_version_core = lambda *a, **kw: None + sys.modules["transformers.utils.versions"] = _mock_versions diff --git a/lightllm/server/api_anthropic.py b/lightllm/server/api_anthropic.py new file mode 100644 index 000000000..cb9e08e29 --- /dev/null +++ b/lightllm/server/api_anthropic.py @@ -0,0 +1,204 @@ +"""Anthropic Messages API compatibility layer. + +Translates incoming /v1/messages requests into LightLLM's internal chat +completions pipeline by delegating the hard parts (content-block parsing, +tool schema normalisation, stop-reason mapping) to LiteLLM's adapter. + +The streaming path is added in a later task; this module currently +rejects stream=true with 501. +""" +from __future__ import annotations + +import uuid +from http import HTTPStatus +from typing import Any, Dict, Tuple + +from fastapi import Request +from fastapi.responses import JSONResponse, Response + +from lightllm.utils.log_utils import init_logger + +from ._litellm_shim import get_anthropic_messages_adapter + +logger = init_logger(__name__) + + +# --------------------------------------------------------------------------- +# Request translation +# --------------------------------------------------------------------------- + + +def _anthropic_to_chat_request(anthropic_body: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, str]]: + """Translate an Anthropic Messages request body into a dict suitable + for constructing a LightLLM ``ChatCompletionRequest``. + + Returns ``(chat_request_dict, tool_name_mapping)``. The mapping must + be passed back to ``_chat_response_to_anthropic`` so that tool names + truncated by LiteLLM's 64-character limit can be restored. 
+ """ + adapter = get_anthropic_messages_adapter() + + openai_request, tool_name_mapping = adapter.translate_anthropic_to_openai(anthropic_body) + + if hasattr(openai_request, "model_dump"): + openai_dict = openai_request.model_dump(exclude_none=True) + else: + openai_dict = dict(openai_request) + + if "max_tokens" not in openai_dict and "max_completion_tokens" not in openai_dict: + if "max_tokens" in anthropic_body: + openai_dict["max_tokens"] = anthropic_body["max_tokens"] + + _UNKNOWN_FIELDS = {"extra_body", "metadata", "anthropic_version", "cache_control"} + for key in list(openai_dict.keys()): + if key in _UNKNOWN_FIELDS: + openai_dict.pop(key, None) + + return openai_dict, tool_name_mapping + + +# --------------------------------------------------------------------------- +# Response translation +# --------------------------------------------------------------------------- + + +_FINISH_REASON_TO_STOP_REASON = { + "stop": "end_turn", + "length": "max_tokens", + "tool_calls": "tool_use", + None: "end_turn", +} + + +def _chat_response_to_anthropic( + chat_response: Any, + tool_name_mapping: Dict[str, str], + requested_model: str, +) -> Dict[str, Any]: + """Wrap a LightLLM ``ChatCompletionResponse`` into an Anthropic + Messages response dict. + + LiteLLM's ``translate_openai_response_to_anthropic`` requires a + ``litellm.ModelResponse`` object (discovered via Task 3's characterisation + test). We construct one from the LightLLM response's dict form. + """ + adapter = get_anthropic_messages_adapter() + if hasattr(chat_response, "model_dump"): + openai_dict = chat_response.model_dump(exclude_none=True) + else: + openai_dict = dict(chat_response) + + try: + # Lazy import so this module stays importable when litellm is absent. 
+ from litellm import ModelResponse # type: ignore + + model_response = ModelResponse(**openai_dict) + anthropic_obj = adapter.translate_openai_response_to_anthropic( + model_response, tool_name_mapping + ) + except Exception as exc: + logger.warning("LiteLLM response translation failed (%s); using fallback", exc) + return _fallback_openai_to_anthropic(openai_dict, requested_model) + + if hasattr(anthropic_obj, "model_dump"): + result = anthropic_obj.model_dump(exclude_none=True) + else: + result = dict(anthropic_obj) + + # Echo the client-provided model name. + result["model"] = requested_model + + result.setdefault("id", f"msg_{uuid.uuid4().hex[:24]}") + result.setdefault("type", "message") + result.setdefault("role", "assistant") + result.setdefault("stop_sequence", None) + + return result + + +def _fallback_openai_to_anthropic(openai_dict: Dict[str, Any], requested_model: str) -> Dict[str, Any]: + """Minimal hand-built OpenAI->Anthropic translation for text-only responses. + + Used only when LiteLLM's adapter raises on the response path. Does + not support tool_use; errors out loudly if tool calls are present + since silently dropping them would corrupt the response. + """ + choice = (openai_dict.get("choices") or [{}])[0] + message = choice.get("message") or {} + if message.get("tool_calls"): + raise RuntimeError( + "Fallback translator cannot handle tool_calls; LiteLLM adapter path is required." 
+ ) + text = message.get("content") or "" + usage = openai_dict.get("usage") or {} + finish_reason = choice.get("finish_reason") + return { + "id": f"msg_{uuid.uuid4().hex[:24]}", + "type": "message", + "role": "assistant", + "model": requested_model, + "content": [{"type": "text", "text": text}], + "stop_reason": _FINISH_REASON_TO_STOP_REASON.get(finish_reason, "end_turn"), + "stop_sequence": None, + "usage": { + "input_tokens": int(usage.get("prompt_tokens", 0)), + "output_tokens": int(usage.get("completion_tokens", 0)), + "cache_creation_input_tokens": 0, + "cache_read_input_tokens": 0, + }, + } + + +# --------------------------------------------------------------------------- +# HTTP entry point (non-streaming only in this task) +# --------------------------------------------------------------------------- + + +async def anthropic_messages_impl(raw_request: Request) -> Response: + """Handle POST /v1/messages. + + Streaming support is added in a later task; this function currently + rejects ``stream=true`` with a clear error. + """ + # Lazy imports to avoid pulling in heavy server deps at module import time. 
+ from .api_models import ChatCompletionRequest, ChatCompletionResponse + from .api_openai import chat_completions_impl, create_error_response + + try: + raw_body = await raw_request.json() + except Exception as exc: + return create_error_response(HTTPStatus.BAD_REQUEST, f"Invalid JSON body: {exc}") + + if not isinstance(raw_body, dict): + return create_error_response(HTTPStatus.BAD_REQUEST, "Request body must be a JSON object") + + if raw_body.get("stream"): + return create_error_response( + HTTPStatus.NOT_IMPLEMENTED, + "Streaming is not yet implemented for /v1/messages", + ) + + requested_model = raw_body.get("model", "default") + + try: + chat_dict, tool_name_mapping = _anthropic_to_chat_request(raw_body) + except Exception as exc: + logger.exception("Failed to translate Anthropic request") + return create_error_response(HTTPStatus.BAD_REQUEST, f"Request translation failed: {exc}") + + try: + chat_request = ChatCompletionRequest(**chat_dict) + except Exception as exc: + logger.exception("Failed to build ChatCompletionRequest") + return create_error_response(HTTPStatus.BAD_REQUEST, f"Invalid request after translation: {exc}") + + chat_response_or_err = await chat_completions_impl(chat_request, raw_request) + + if not isinstance(chat_response_or_err, ChatCompletionResponse): + # chat_completions_impl returned a JSONResponse (error). Pass through. 
+ return chat_response_or_err + + anthropic_dict = _chat_response_to_anthropic( + chat_response_or_err, tool_name_mapping, requested_model + ) + return JSONResponse(anthropic_dict) diff --git a/test/test_api/test_anthropic_translation.py b/test/test_api/test_anthropic_translation.py index dcd8b6ce1..26e904587 100644 --- a/test/test_api/test_anthropic_translation.py +++ b/test/test_api/test_anthropic_translation.py @@ -102,3 +102,62 @@ def test_adapter_round_trip_minimal_text(): assert "Hi" in content[0].get("text", "") # Stop reasons: Anthropic uses end_turn/tool_use/max_tokens/stop_sequence assert resp_dict.get("stop_reason") in {"end_turn", "stop_sequence", None} + + +def test_anthropic_to_chat_request_dict_minimal_text(): + """_anthropic_to_chat_request should return a dict suitable for + constructing a LightLLM ChatCompletionRequest.""" + from lightllm.server.api_anthropic import _anthropic_to_chat_request + + anthropic_body = { + "model": "claude-opus-4-6", + "max_tokens": 64, + "system": "Be terse.", + "messages": [{"role": "user", "content": "hello"}], + "temperature": 0.4, + } + chat_request_dict, tool_name_mapping = _anthropic_to_chat_request(anthropic_body) + + assert "messages" in chat_request_dict + assert any(m.get("role") == "system" for m in chat_request_dict["messages"]) + assert any(m.get("role") == "user" for m in chat_request_dict["messages"]) + # max_tokens must be propagated + assert chat_request_dict.get("max_tokens") == 64 or chat_request_dict.get("max_completion_tokens") == 64 + assert isinstance(tool_name_mapping, dict) + + +def test_chat_response_to_anthropic_minimal_text(): + """_chat_response_to_anthropic should wrap a ChatCompletionResponse + into an Anthropic message dict.""" + from lightllm.server.api_anthropic import _chat_response_to_anthropic + from lightllm.server.api_models import ( + ChatCompletionResponse, + ChatCompletionResponseChoice, + ChatMessage, + UsageInfo, + ) + + chat_resp = ChatCompletionResponse( + 
id="chatcmpl-xyz", + model="local-model", + choices=[ + ChatCompletionResponseChoice( + index=0, + message=ChatMessage(role="assistant", content="Hello."), + finish_reason="stop", + ) + ], + usage=UsageInfo(prompt_tokens=3, completion_tokens=2, total_tokens=5), + ) + anthropic_dict = _chat_response_to_anthropic(chat_resp, tool_name_mapping={}, requested_model="claude-opus-4-6") + + assert anthropic_dict["type"] == "message" + assert anthropic_dict["role"] == "assistant" + assert anthropic_dict["model"] == "claude-opus-4-6" + content = anthropic_dict["content"] + assert isinstance(content, list) and len(content) >= 1 + assert content[0]["type"] == "text" + assert "Hello" in content[0]["text"] + assert anthropic_dict["stop_reason"] in {"end_turn", "stop_sequence"} + assert anthropic_dict["usage"]["input_tokens"] == 3 + assert anthropic_dict["usage"]["output_tokens"] == 2 From 455b487149bc4dff0204186db53aa1ad07396211 Mon Sep 17 00:00:00 2001 From: sufubao Date: Wed, 15 Apr 2026 15:08:48 +0800 Subject: [PATCH 07/22] feat(api): register POST /v1/messages route The route is gated on --enable_anthropic_api; requests return 404 otherwise. Startup calls ensure_available so a missing litellm package fails loudly at server boot rather than on the first request. 
--- lightllm/server/api_http.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/lightllm/server/api_http.py b/lightllm/server/api_http.py index 50d992bf9..0d8fbc0db 100755 --- a/lightllm/server/api_http.py +++ b/lightllm/server/api_http.py @@ -110,6 +110,16 @@ def set_args(self, args: StartArgs): if self.model_created is None: self.model_created = int(time.time()) + if getattr(args, "enable_anthropic_api", False): + from ._litellm_shim import ensure_available + + try: + ensure_available() + logger.info("Anthropic Messages API enabled at POST /v1/messages") + except RuntimeError as exc: + logger.error("Cannot enable Anthropic API: %s", exc) + raise + g_objs = G_Objs() @@ -266,6 +276,22 @@ async def completions(request: CompletionRequest, raw_request: Request) -> Respo return resp +@app.post("/v1/messages") +async def anthropic_messages(raw_request: Request) -> Response: + if not getattr(g_objs.args, "enable_anthropic_api", False): + return create_error_response( + HTTPStatus.NOT_FOUND, + "Anthropic API is not enabled. 
Start the server with --enable_anthropic_api.", + ) + if get_env_start_args().run_mode in ["prefill", "decode", "nixl_prefill", "nixl_decode"]: + return create_error_response( + HTTPStatus.EXPECTATION_FAILED, "service in pd mode dont recv reqs from http interface" + ) + from .api_anthropic import anthropic_messages_impl + + return await anthropic_messages_impl(raw_request) + + @app.get("/v1/models", response_model=ModelListResponse) @app.post("/v1/models", response_model=ModelListResponse) async def get_models(raw_request: Request): From 1bb536de551e782b264abcc83f78829b026a952e Mon Sep 17 00:00:00 2001 From: sufubao Date: Wed, 15 Apr 2026 15:18:34 +0800 Subject: [PATCH 08/22] feat(api): stream Anthropic Messages events by wrapping OpenAI SSE Adds _openai_sse_to_anthropic_events, an async generator that consumes the existing chat_completions_impl streaming output and re-emits it as Anthropic message_start/content_block_*/message_delta/message_stop events. Tool-use streaming is still pending. --- lightllm/server/api_anthropic.py | 171 +++++++++++++++++--- test/test_api/test_anthropic_translation.py | 59 +++++++ 2 files changed, 212 insertions(+), 18 deletions(-) diff --git a/lightllm/server/api_anthropic.py b/lightllm/server/api_anthropic.py index cb9e08e29..7a47fbfc7 100644 --- a/lightllm/server/api_anthropic.py +++ b/lightllm/server/api_anthropic.py @@ -10,6 +10,7 @@ from __future__ import annotations import uuid +import ujson as json from http import HTTPStatus from typing import Any, Dict, Tuple @@ -150,16 +151,143 @@ def _fallback_openai_to_anthropic(openai_dict: Dict[str, Any], requested_model: # --------------------------------------------------------------------------- -# HTTP entry point (non-streaming only in this task) +# Streaming bridge # --------------------------------------------------------------------------- -async def anthropic_messages_impl(raw_request: Request) -> Response: - """Handle POST /v1/messages. 
+def _sse_event(event_type: str, data_obj: Dict[str, Any]) -> bytes: + """Encode an Anthropic-style SSE event.""" + return f"event: {event_type}\ndata: {json.dumps(data_obj)}\n\n".encode("utf-8") + + +async def _openai_sse_to_anthropic_events( + openai_body_iterator, + requested_model: str, + message_id: str, +): + """Async generator: consume OpenAI-format SSE bytes and yield + Anthropic-format SSE event bytes. - Streaming support is added in a later task; this function currently - rejects ``stream=true`` with a clear error. + Only the text-only path is implemented here. Tool-use streaming + requires additional state tracking and is handled in Task 7. """ + # State + message_started = False + text_block_open = False + text_block_index = 0 + final_stop_reason = "end_turn" + final_output_tokens = 0 + final_input_tokens = 0 + + _OPENAI_TO_ANTHROPIC_STOP = { + "stop": "end_turn", + "length": "max_tokens", + "tool_calls": "tool_use", + } + + async for raw_line in openai_body_iterator: + if not raw_line: + continue + # A single StreamingResponse chunk may contain multiple SSE lines. 
+ for line in raw_line.split(b"\n"): + line = line.strip() + if not line or not line.startswith(b"data: "): + continue + payload = line[len(b"data: "):] + if payload == b"[DONE]": + continue + try: + chunk = json.loads(payload.decode("utf-8")) + except Exception: + logger.debug("Skipping non-JSON SSE payload: %r", payload) + continue + + # Usage-only chunk (emitted when stream_options.include_usage is set) + usage = chunk.get("usage") + if usage: + final_input_tokens = int(usage.get("prompt_tokens", 0)) + final_output_tokens = int(usage.get("completion_tokens", final_output_tokens)) + + choices = chunk.get("choices") or [] + if not choices: + continue + choice = choices[0] + delta = choice.get("delta") or {} + finish_reason = choice.get("finish_reason") + + # Emit message_start the first time we see any content + if not message_started: + message_started = True + yield _sse_event( + "message_start", + { + "type": "message_start", + "message": { + "id": message_id, + "type": "message", + "role": "assistant", + "model": requested_model, + "content": [], + "stop_reason": None, + "stop_sequence": None, + "usage": { + "input_tokens": final_input_tokens, + "output_tokens": 0, + "cache_creation_input_tokens": 0, + "cache_read_input_tokens": 0, + }, + }, + }, + ) + + content_piece = delta.get("content") + if content_piece: + if not text_block_open: + text_block_open = True + yield _sse_event( + "content_block_start", + { + "type": "content_block_start", + "index": text_block_index, + "content_block": {"type": "text", "text": ""}, + }, + ) + yield _sse_event( + "content_block_delta", + { + "type": "content_block_delta", + "index": text_block_index, + "delta": {"type": "text_delta", "text": content_piece}, + }, + ) + final_output_tokens += 1 + + if finish_reason: + final_stop_reason = _OPENAI_TO_ANTHROPIC_STOP.get(finish_reason, "end_turn") + + # Close any open content block + if text_block_open: + yield _sse_event("content_block_stop", {"type": "content_block_stop", 
"index": text_block_index}) + + # message_delta carries the final stop_reason and cumulative output_tokens + if message_started: + yield _sse_event( + "message_delta", + { + "type": "message_delta", + "delta": {"stop_reason": final_stop_reason, "stop_sequence": None}, + "usage": {"output_tokens": final_output_tokens}, + }, + ) + yield _sse_event("message_stop", {"type": "message_stop"}) + + +# --------------------------------------------------------------------------- +# HTTP entry point +# --------------------------------------------------------------------------- + + +async def anthropic_messages_impl(raw_request: Request) -> Response: # Lazy imports to avoid pulling in heavy server deps at module import time. from .api_models import ChatCompletionRequest, ChatCompletionResponse from .api_openai import chat_completions_impl, create_error_response @@ -172,13 +300,8 @@ async def anthropic_messages_impl(raw_request: Request) -> Response: if not isinstance(raw_body, dict): return create_error_response(HTTPStatus.BAD_REQUEST, "Request body must be a JSON object") - if raw_body.get("stream"): - return create_error_response( - HTTPStatus.NOT_IMPLEMENTED, - "Streaming is not yet implemented for /v1/messages", - ) - requested_model = raw_body.get("model", "default") + is_stream = bool(raw_body.get("stream")) try: chat_dict, tool_name_mapping = _anthropic_to_chat_request(raw_body) @@ -186,19 +309,31 @@ async def anthropic_messages_impl(raw_request: Request) -> Response: logger.exception("Failed to translate Anthropic request") return create_error_response(HTTPStatus.BAD_REQUEST, f"Request translation failed: {exc}") + # Force the downstream path to stream if the client asked for stream. 
+ chat_dict["stream"] = is_stream + try: chat_request = ChatCompletionRequest(**chat_dict) except Exception as exc: logger.exception("Failed to build ChatCompletionRequest") return create_error_response(HTTPStatus.BAD_REQUEST, f"Invalid request after translation: {exc}") - chat_response_or_err = await chat_completions_impl(chat_request, raw_request) + downstream = await chat_completions_impl(chat_request, raw_request) + + if is_stream: + from fastapi.responses import StreamingResponse + + if not isinstance(downstream, StreamingResponse): + return downstream # error path + + message_id = f"msg_{uuid.uuid4().hex[:24]}" + anthropic_stream = _openai_sse_to_anthropic_events( + downstream.body_iterator, requested_model=requested_model, message_id=message_id + ) + return StreamingResponse(anthropic_stream, media_type="text/event-stream") - if not isinstance(chat_response_or_err, ChatCompletionResponse): - # chat_completions_impl returned a JSONResponse (error). Pass through. - return chat_response_or_err + if not isinstance(downstream, ChatCompletionResponse): + return downstream # JSONResponse error - anthropic_dict = _chat_response_to_anthropic( - chat_response_or_err, tool_name_mapping, requested_model - ) + anthropic_dict = _chat_response_to_anthropic(downstream, tool_name_mapping, requested_model) return JSONResponse(anthropic_dict) diff --git a/test/test_api/test_anthropic_translation.py b/test/test_api/test_anthropic_translation.py index 26e904587..ab8566b3c 100644 --- a/test/test_api/test_anthropic_translation.py +++ b/test/test_api/test_anthropic_translation.py @@ -161,3 +161,62 @@ def test_chat_response_to_anthropic_minimal_text(): assert anthropic_dict["stop_reason"] in {"end_turn", "stop_sequence"} assert anthropic_dict["usage"]["input_tokens"] == 3 assert anthropic_dict["usage"]["output_tokens"] == 2 + + +import asyncio + + +def _run(coro): + return asyncio.get_event_loop().run_until_complete(coro) if not asyncio.get_event_loop().is_running() else 
asyncio.run(coro) + + +def test_stream_bridge_emits_anthropic_event_sequence_text_only(): + """Feed a canned OpenAI SSE stream through the bridge and assert we + get the expected Anthropic event sequence.""" + from lightllm.server.api_anthropic import _openai_sse_to_anthropic_events + + # Simulate three OpenAI chunks: 'Hel', 'lo', finish_reason=stop. + openai_chunks = [ + b'data: {"id":"x","object":"chat.completion.chunk","created":0,"model":"m","choices":[{"index":0,"delta":{"role":"assistant","content":"Hel"},"finish_reason":null}]}\n\n', + b'data: {"id":"x","object":"chat.completion.chunk","created":0,"model":"m","choices":[{"index":0,"delta":{"content":"lo"},"finish_reason":null}]}\n\n', + b'data: {"id":"x","object":"chat.completion.chunk","created":0,"model":"m","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}\n\n', + b'data: {"id":"x","object":"chat.completion.chunk","created":0,"model":"m","choices":[],"usage":{"prompt_tokens":5,"completion_tokens":2,"total_tokens":7}}\n\n', + b"data: [DONE]\n\n", + ] + + async def fake_body_iterator(): + for c in openai_chunks: + yield c + + async def collect(): + out = [] + async for event_bytes in _openai_sse_to_anthropic_events( + fake_body_iterator(), requested_model="claude-opus-4-6", message_id="msg_test" + ): + out.append(event_bytes.decode("utf-8")) + return out + + events = asyncio.get_event_loop().run_until_complete(collect()) if not asyncio.get_event_loop().is_running() else asyncio.run(collect()) + joined = "".join(events) + + # Required event types appear in order + must_appear_in_order = [ + "event: message_start", + "event: content_block_start", + 'content_block_delta', + "event: content_block_stop", + "event: message_delta", + "event: message_stop", + ] + last_idx = -1 + for needle in must_appear_in_order: + idx = joined.find(needle, last_idx + 1) + assert idx > last_idx, f"missing or out-of-order event: {needle}\nfull:\n{joined}" + last_idx = idx + + # Text deltas preserve the original content + 
assert "Hel" in joined and "lo" in joined + # end_turn stop reason is surfaced + assert "end_turn" in joined + # Final usage output_tokens is included in message_delta + assert '"output_tokens"' in joined From fa2f2a2ea1b8ee41dcf3e4c199b2de785e29baf1 Mon Sep 17 00:00:00 2001 From: sufubao Date: Wed, 15 Apr 2026 15:30:24 +0800 Subject: [PATCH 09/22] fix(api): address Task 6 review feedback on streaming bridge - Update stale module docstring to reflect streaming support. - Drop per-delta output_tokens increment; rely on the trailing usage chunk as the single source of truth for token counts. - Document the known-zero input_tokens in message_start and surface the real prompt token count in message_delta.usage instead. - Wire up the previously-dead _run helper in the streaming bridge test and move the asyncio import to the top of the file. --- lightllm/server/api_anthropic.py | 21 +++++++++++++++------ test/test_api/test_anthropic_translation.py | 9 ++++----- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/lightllm/server/api_anthropic.py b/lightllm/server/api_anthropic.py index 7a47fbfc7..c20f76858 100644 --- a/lightllm/server/api_anthropic.py +++ b/lightllm/server/api_anthropic.py @@ -4,8 +4,9 @@ completions pipeline by delegating the hard parts (content-block parsing, tool schema normalisation, stop-reason mapping) to LiteLLM's adapter. -The streaming path is added in a later task; this module currently -rejects stream=true with 501. +The streaming path intercepts the OpenAI-format SSE stream from +chat_completions_impl and re-emits it as the Anthropic event sequence +(message_start, content_block_*, message_delta, message_stop). 
""" from __future__ import annotations @@ -202,7 +203,10 @@ async def _openai_sse_to_anthropic_events( logger.debug("Skipping non-JSON SSE payload: %r", payload) continue - # Usage-only chunk (emitted when stream_options.include_usage is set) + # final_output_tokens is sourced exclusively from the trailing usage + # chunk emitted by chat_completions_impl; we intentionally do not + # estimate it per delta because that would diverge from the + # tokenizer-accurate count on any upstream change. usage = chunk.get("usage") if usage: final_input_tokens = int(usage.get("prompt_tokens", 0)) @@ -215,7 +219,13 @@ async def _openai_sse_to_anthropic_events( delta = choice.get("delta") or {} finish_reason = choice.get("finish_reason") - # Emit message_start the first time we see any content + # Emit message_start the first time we see any content. + # NOTE: The upstream usage chunk arrives AFTER all content chunks, so + # final_input_tokens is still 0 here. message_start.message.usage.input_tokens + # will always be 0 on this path — Anthropic clients that care about prompt + # token counts should read message_delta.usage instead. Fixing this would + # require buffering until the usage chunk arrives, trading streaming + # latency for accurate prompt-token reporting at message_start time. 
if not message_started: message_started = True yield _sse_event( @@ -260,7 +270,6 @@ async def _openai_sse_to_anthropic_events( "delta": {"type": "text_delta", "text": content_piece}, }, ) - final_output_tokens += 1 if finish_reason: final_stop_reason = _OPENAI_TO_ANTHROPIC_STOP.get(finish_reason, "end_turn") @@ -276,7 +285,7 @@ async def _openai_sse_to_anthropic_events( { "type": "message_delta", "delta": {"stop_reason": final_stop_reason, "stop_sequence": None}, - "usage": {"output_tokens": final_output_tokens}, + "usage": {"input_tokens": final_input_tokens, "output_tokens": final_output_tokens}, }, ) yield _sse_event("message_stop", {"type": "message_stop"}) diff --git a/test/test_api/test_anthropic_translation.py b/test/test_api/test_anthropic_translation.py index ab8566b3c..586f48db9 100644 --- a/test/test_api/test_anthropic_translation.py +++ b/test/test_api/test_anthropic_translation.py @@ -4,6 +4,8 @@ a running LightLLM server. They do require 'litellm' to be installed — tests are skipped if it is not available. 
""" +import asyncio + import pytest litellm = pytest.importorskip("litellm") @@ -163,11 +165,8 @@ def test_chat_response_to_anthropic_minimal_text(): assert anthropic_dict["usage"]["output_tokens"] == 2 -import asyncio - - def _run(coro): - return asyncio.get_event_loop().run_until_complete(coro) if not asyncio.get_event_loop().is_running() else asyncio.run(coro) + return asyncio.run(coro) def test_stream_bridge_emits_anthropic_event_sequence_text_only(): @@ -196,7 +195,7 @@ async def collect(): out.append(event_bytes.decode("utf-8")) return out - events = asyncio.get_event_loop().run_until_complete(collect()) if not asyncio.get_event_loop().is_running() else asyncio.run(collect()) + events = _run(collect()) joined = "".join(events) # Required event types appear in order From af65e9b1a1c507a4e8a4538c1fe1493142ef052e Mon Sep 17 00:00:00 2001 From: sufubao Date: Wed, 15 Apr 2026 15:33:43 +0800 Subject: [PATCH 10/22] test(api): cover Anthropic tool-use request and response translation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These tests lock down the contract for _anthropic_to_chat_request (when given an Anthropic tool schema, must emit an OpenAI-shaped tools array with parameters key and function type) and _chat_response_to_anthropic (when given a ChatCompletionResponse carrying a tool_calls message, must emit an Anthropic tool_use content block and tool_use stop_reason). These tests are added without runtime verification because the current environment does not have torch/litellm installed. A follow-up commit may be needed if either assertion fails when run in a full environment — the plan documents the expected remediation (rename input_schema to parameters on the request side; force stop_reason=tool_use on the response side). 
--- test/test_api/test_anthropic_translation.py | 85 +++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/test/test_api/test_anthropic_translation.py b/test/test_api/test_anthropic_translation.py index 586f48db9..c30e0ac22 100644 --- a/test/test_api/test_anthropic_translation.py +++ b/test/test_api/test_anthropic_translation.py @@ -219,3 +219,88 @@ async def collect(): assert "end_turn" in joined # Final usage output_tokens is included in message_delta assert '"output_tokens"' in joined + + +def test_anthropic_to_chat_request_with_tools(): + from lightllm.server.api_anthropic import _anthropic_to_chat_request + + anthropic_body = { + "model": "claude-opus-4-6", + "max_tokens": 256, + "messages": [{"role": "user", "content": "What's the weather in SF?"}], + "tools": [ + { + "name": "get_weather", + "description": "Return the current weather for a city", + "input_schema": { + "type": "object", + "properties": {"city": {"type": "string"}}, + "required": ["city"], + }, + } + ], + } + chat_dict, tool_name_mapping = _anthropic_to_chat_request(anthropic_body) + + assert "tools" in chat_dict + assert isinstance(chat_dict["tools"], list) and len(chat_dict["tools"]) == 1 + tool_entry = chat_dict["tools"][0] + # OpenAI tool format: {"type": "function", "function": {"name", "description", "parameters"}} + assert tool_entry.get("type") == "function" + fn = tool_entry.get("function") or {} + assert fn.get("name") in {"get_weather", "get_weather"[:64]} + # input_schema should have been renamed to parameters + assert "parameters" in fn + assert fn["parameters"]["properties"]["city"]["type"] == "string" + + +def test_chat_response_to_anthropic_with_tool_call(): + from lightllm.server.api_anthropic import _chat_response_to_anthropic + from lightllm.server.api_models import ( + ChatCompletionResponse, + ChatCompletionResponseChoice, + ChatMessage, + FunctionResponse, + ToolCall, + UsageInfo, + ) + + chat_resp = ChatCompletionResponse( + id="chatcmpl-tool", + 
model="local-model", + choices=[ + ChatCompletionResponseChoice( + index=0, + message=ChatMessage( + role="assistant", + content="", + tool_calls=[ + ToolCall( + id="call_abc123", + index=0, + type="function", + function=FunctionResponse( + name="get_weather", + arguments='{"city":"San Francisco"}', + ), + ) + ], + ), + finish_reason="tool_calls", + ) + ], + usage=UsageInfo(prompt_tokens=20, completion_tokens=12, total_tokens=32), + ) + anthropic_dict = _chat_response_to_anthropic( + chat_resp, tool_name_mapping={}, requested_model="claude-opus-4-6" + ) + + assert anthropic_dict["stop_reason"] == "tool_use" + content = anthropic_dict["content"] + tool_blocks = [b for b in content if b.get("type") == "tool_use"] + assert len(tool_blocks) == 1 + tool_block = tool_blocks[0] + assert tool_block["name"] == "get_weather" + assert isinstance(tool_block["input"], dict) + assert tool_block["input"].get("city") == "San Francisco" + assert tool_block.get("id", "").startswith("toolu_") or tool_block.get("id") == "call_abc123" From 6ecf17a548166a99c01ce747b1812fb18221eeea Mon Sep 17 00:00:00 2001 From: sufubao Date: Wed, 15 Apr 2026 15:37:23 +0800 Subject: [PATCH 11/22] test(api): simplify tautological tool-name assertion Replace the collapsed set literal {"get_weather", "get_weather"[:64]} with a direct equality check. The slice is a no-op on short names, so the set held a single element and the OR branch was dead. The 64-char truncation contract will be exercised by a longer-name test if needed. 
--- test/test_api/test_anthropic_translation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_api/test_anthropic_translation.py b/test/test_api/test_anthropic_translation.py index c30e0ac22..0fc81b39f 100644 --- a/test/test_api/test_anthropic_translation.py +++ b/test/test_api/test_anthropic_translation.py @@ -248,7 +248,7 @@ def test_anthropic_to_chat_request_with_tools(): # OpenAI tool format: {"type": "function", "function": {"name", "description", "parameters"}} assert tool_entry.get("type") == "function" fn = tool_entry.get("function") or {} - assert fn.get("name") in {"get_weather", "get_weather"[:64]} + assert fn.get("name") == "get_weather" # input_schema should have been renamed to parameters assert "parameters" in fn assert fn["parameters"]["properties"]["city"]["type"] == "string" From 0b570c1553adc193a66aa1cf084488f70f850ef9 Mon Sep 17 00:00:00 2001 From: sufubao Date: Wed, 15 Apr 2026 15:38:48 +0800 Subject: [PATCH 12/22] test(api): add manual integration test script for Anthropic SDK Exercises non-streaming text, streaming text, system prompt, and tool use against a live LightLLM with --enable_anthropic_api. Not wired into automated CI because it needs a GPU runner. --- test/test_api/test_anthropic_api.py | 142 ++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 test/test_api/test_anthropic_api.py diff --git a/test/test_api/test_anthropic_api.py b/test/test_api/test_anthropic_api.py new file mode 100644 index 000000000..b1725a6a4 --- /dev/null +++ b/test/test_api/test_anthropic_api.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python3 +"""Manual integration test for the Anthropic API compatibility layer. + +Requires: + 1. A running LightLLM server started with --enable_anthropic_api + 2. 
``pip install anthropic`` + +Usage: + python test/test_api/test_anthropic_api.py \ + --base-url http://localhost:8088 \ + --model my-local-model + +Each assertion exits the script with non-zero status on failure so it can +be used as a CI gate once a GPU runner is available. +""" +from __future__ import annotations + +import argparse +import sys + + +def _fail(msg: str) -> None: + print(f"FAIL: {msg}", file=sys.stderr) + sys.exit(1) + + +def test_non_streaming_text(client, model: str) -> None: + resp = client.messages.create( + model=model, + max_tokens=64, + messages=[{"role": "user", "content": "Reply with the single word: pong"}], + ) + print("[non-stream]", resp) + if resp.type != "message": + _fail(f"expected type=message, got {resp.type}") + if not resp.content or resp.content[0].type != "text": + _fail(f"expected a text content block, got {resp.content}") + if resp.stop_reason not in {"end_turn", "stop_sequence", "max_tokens"}: + _fail(f"unexpected stop_reason: {resp.stop_reason}") + if resp.usage.input_tokens <= 0 or resp.usage.output_tokens <= 0: + _fail(f"unexpected usage: {resp.usage}") + + +def test_streaming_text(client, model: str) -> None: + collected = [] + stop_reason = None + with client.messages.stream( + model=model, + max_tokens=64, + messages=[{"role": "user", "content": "Count from 1 to 5."}], + ) as stream: + for text in stream.text_stream: + collected.append(text) + final = stream.get_final_message() + stop_reason = final.stop_reason + + full = "".join(collected) + print(f"[stream] stop_reason={stop_reason!r} text={full!r}") + if not full.strip(): + _fail("streaming produced no text") + if stop_reason not in {"end_turn", "max_tokens"}: + _fail(f"unexpected stop_reason after stream: {stop_reason}") + + +def test_system_prompt(client, model: str) -> None: + resp = client.messages.create( + model=model, + max_tokens=32, + system="Always reply with exactly the word: banana", + messages=[{"role": "user", "content": "What fruit?"}], + ) + text = 
resp.content[0].text if resp.content else "" + print(f"[system] text={text!r}") + if "banana" not in text.lower(): + print(f"WARN: system prompt may not be routed — got {text!r}", file=sys.stderr) + + +def test_tool_use(client, model: str) -> None: + resp = client.messages.create( + model=model, + max_tokens=256, + tools=[ + { + "name": "get_weather", + "description": "Return the current weather for a city.", + "input_schema": { + "type": "object", + "properties": {"city": {"type": "string"}}, + "required": ["city"], + }, + } + ], + messages=[{"role": "user", "content": "What's the weather in San Francisco?"}], + ) + print(f"[tool] stop_reason={resp.stop_reason} content={resp.content}") + tool_blocks = [b for b in resp.content if b.type == "tool_use"] + if resp.stop_reason == "tool_use" and not tool_blocks: + _fail("stop_reason=tool_use but no tool_use content block") + if tool_blocks: + tb = tool_blocks[0] + if tb.name != "get_weather": + _fail(f"unexpected tool name: {tb.name}") + if not isinstance(tb.input, dict): + _fail(f"tool input is not a dict: {tb.input!r}") + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("--base-url", default="http://localhost:8088") + parser.add_argument("--model", default="default") + parser.add_argument("--api-key", default="dummy") + parser.add_argument( + "--skip", + nargs="*", + default=[], + choices=["non_stream", "stream", "system", "tool"], + help="Tests to skip", + ) + args = parser.parse_args() + + try: + import anthropic + except ImportError: + _fail("anthropic SDK not installed. 
Run: pip install anthropic") + return + + client = anthropic.Anthropic(base_url=args.base_url, api_key=args.api_key) + + if "non_stream" not in args.skip: + test_non_streaming_text(client, args.model) + if "stream" not in args.skip: + test_streaming_text(client, args.model) + if "system" not in args.skip: + test_system_prompt(client, args.model) + if "tool" not in args.skip: + test_tool_use(client, args.model) + + print("\nAll selected tests passed.") + + +if __name__ == "__main__": + main() From d36e5c38dfd42de2fabb634880bdfe477512e986 Mon Sep 17 00:00:00 2001 From: sufubao Date: Wed, 15 Apr 2026 15:41:39 +0800 Subject: [PATCH 13/22] test(api): exclude manual Anthropic SDK script from pytest collection test_anthropic_api.py is a standalone CLI tool whose module-level test_* functions take positional arguments pytest cannot inject. Without this collect_ignore, running pytest test/ produces four collection errors. The script is still invoked directly via python test/test_api/test_anthropic_api.py ... --- test/test_api/conftest.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 test/test_api/conftest.py diff --git a/test/test_api/conftest.py b/test/test_api/conftest.py new file mode 100644 index 000000000..cb85009b7 --- /dev/null +++ b/test/test_api/conftest.py @@ -0,0 +1 @@ +collect_ignore = ["test_anthropic_api.py"] From 9f8c9fc27cba7b47fe62e7ab09ce6676e2e00e22 Mon Sep 17 00:00:00 2001 From: sufubao Date: Wed, 15 Apr 2026 15:42:17 +0800 Subject: [PATCH 14/22] build: declare litellm as anthropic_api optional extra Install with: pip install 'lightllm[anthropic_api]' Keeps the base install lean; only users of --enable_anthropic_api need the heavy dependency. Upper bound tracks _litellm_shim.py's _MAX_TESTED_LITELLM_VERSION (1.84.0); bump explicitly after re-running the round-trip characterisation test against newer releases. 
--- setup.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.py b/setup.py index 94c5b192e..b6141741c 100644 --- a/setup.py +++ b/setup.py @@ -28,5 +28,8 @@ "triton", "orjson", ], + extras_require={ + "anthropic_api": ["litellm>=1.52.0,<1.85"], + }, package_data=package_data, ) From d80a5e6c76039ae9d1e3276cc40b80410d0153d0 Mon Sep 17 00:00:00 2001 From: sufubao Date: Wed, 15 Apr 2026 15:43:20 +0800 Subject: [PATCH 15/22] docs: document the /v1/messages Anthropic compatibility endpoint New tutorial page covers enablement via --enable_anthropic_api, SDK usage for non-streaming and streaming, supported features, and known limitations (ignored cache_control, unsupported thinking/batch/files, zero input_tokens in streaming message_start). Linked from the EN tutorial toctree alongside the existing OpenAI API guide. --- docs/EN/source/index.rst | 1 + docs/EN/source/tutorial/anthropic.rst | 80 +++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 docs/EN/source/tutorial/anthropic.rst diff --git a/docs/EN/source/index.rst b/docs/EN/source/index.rst index 808f43289..d4a36385c 100755 --- a/docs/EN/source/index.rst +++ b/docs/EN/source/index.rst @@ -53,6 +53,7 @@ Documentation List Multimodal Deployment Reward Model Deployment OpenAI api Usage + Anthropic Messages API Function Calling Reasoning Parser APIServer Parameters diff --git a/docs/EN/source/tutorial/anthropic.rst b/docs/EN/source/tutorial/anthropic.rst new file mode 100644 index 000000000..4bd9fabcb --- /dev/null +++ b/docs/EN/source/tutorial/anthropic.rst @@ -0,0 +1,80 @@ +.. _anthropic_api: + +Anthropic Messages API (Experimental) +===================================== + +LightLLM can expose a ``/v1/messages`` endpoint that speaks the Anthropic +Messages API wire protocol. This is useful if you have client code written +against the Anthropic Python/TypeScript SDK and want to point it at a locally +hosted open-source model without rewriting the client. 
+ +Enabling +-------- + +Install the optional dependency: + +.. code-block:: bash + + pip install 'lightllm[anthropic_api]' + +Start the server with the flag: + +.. code-block:: bash + + python -m lightllm.server.api_server \ + --model_dir /path/to/model \ + --enable_anthropic_api \ + --port 8088 + +Using it from the Anthropic SDK +------------------------------- + +.. code-block:: python + + import anthropic + + client = anthropic.Anthropic( + base_url="http://localhost:8088", + api_key="dummy", + ) + resp = client.messages.create( + model="any-name", # echoed back; LightLLM serves the loaded model + max_tokens=1024, + messages=[{"role": "user", "content": "hello"}], + ) + print(resp.content[0].text) + +Streaming works the same way the Anthropic SDK expects: + +.. code-block:: python + + with client.messages.stream( + model="any-name", + max_tokens=256, + messages=[{"role": "user", "content": "Count from 1 to 5."}], + ) as stream: + for text in stream.text_stream: + print(text, end="", flush=True) + +Supported features +------------------ + +- Text generation (streaming and non-streaming) +- System prompts +- Tool use / function calling +- Multi-turn conversations +- Vision (image inputs) via Anthropic content blocks + +Known limitations +----------------- + +- Prompt caching (``cache_control``) is accepted but ignored; ``cache_*`` + fields in ``usage`` are always zero. +- Extended thinking (``thinking`` parameter) is not supported. +- The Batch API (``/v1/messages/batches``) and Files API are not implemented. +- Model name is accepted but ignored; LightLLM always serves the model + loaded via ``--model_dir`` and echoes the requested name back in the response. +- On the streaming path, ``message_start.message.usage.input_tokens`` is + always ``0`` because the upstream usage chunk arrives after all content + chunks. Clients that need an accurate prompt-token count should read + ``message_delta.usage`` at the end of the stream. 
From 9600d51105142fe898e4aba4f39cdefde6bcd309 Mon Sep 17 00:00:00 2001 From: sufubao Date: Wed, 15 Apr 2026 15:47:51 +0800 Subject: [PATCH 16/22] test(api): cover Anthropic image content block pass-through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Final-review polish: add a vision smoke test for _anthropic_to_chat_request to guard against LiteLLM silently dropping image content blocks, and document that _openai_sse_to_anthropic_events' text_block_index stays at 0 on the text-only path. The vision test does not assert a specific OpenAI shape — LiteLLM's adapter owns that contract and may normalise it differently across releases — but it verifies the user message survives translation. --- lightllm/server/api_anthropic.py | 2 +- test/test_api/test_anthropic_translation.py | 36 +++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/lightllm/server/api_anthropic.py b/lightllm/server/api_anthropic.py index c20f76858..62aacfe70 100644 --- a/lightllm/server/api_anthropic.py +++ b/lightllm/server/api_anthropic.py @@ -175,7 +175,7 @@ async def _openai_sse_to_anthropic_events( # State message_started = False text_block_open = False - text_block_index = 0 + text_block_index = 0 # always 0 on the text-only path; multi-block streaming lands with tool_use support. 
final_stop_reason = "end_turn" final_output_tokens = 0 final_input_tokens = 0 diff --git a/test/test_api/test_anthropic_translation.py b/test/test_api/test_anthropic_translation.py index 0fc81b39f..8cdb7feb4 100644 --- a/test/test_api/test_anthropic_translation.py +++ b/test/test_api/test_anthropic_translation.py @@ -304,3 +304,39 @@ def test_chat_response_to_anthropic_with_tool_call(): assert isinstance(tool_block["input"], dict) assert tool_block["input"].get("city") == "San Francisco" assert tool_block.get("id", "").startswith("toolu_") or tool_block.get("id") == "call_abc123" + + +def test_anthropic_to_chat_request_with_image_content_block(): + """Vision smoke test: an Anthropic image content block must survive + translation without raising or being silently dropped. We do not + assert the exact OpenAI shape here because LiteLLM's adapter controls + that contract and may normalise it in different ways across releases.""" + from lightllm.server.api_anthropic import _anthropic_to_chat_request + + anthropic_body = { + "model": "claude-opus-4-6", + "max_tokens": 64, + "messages": [ + { + "role": "user", + "content": [ + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/png", + "data": "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGNgAAIAAAUAAen63NgAAAAASUVORK5CYII=", + }, + }, + {"type": "text", "text": "What do you see?"}, + ], + } + ], + } + chat_dict, _ = _anthropic_to_chat_request(anthropic_body) + + assert "messages" in chat_dict + # The user message must still be present after translation — the exact + # shape of its content is left to the adapter, but it must exist. 
+ user_messages = [m for m in chat_dict["messages"] if m.get("role") == "user"] + assert user_messages, f"user message was dropped during translation: {chat_dict['messages']}" From 3f57999af21c49dc73aa70fcb86205a132e416ca Mon Sep 17 00:00:00 2001 From: sufubao Date: Wed, 15 Apr 2026 16:43:56 +0800 Subject: [PATCH 17/22] fix(api): accept str SSE chunks in Anthropic streaming bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit chat_completions_impl yields str ("data: {...}\n\n"), not bytes — StreamingResponse encodes on the way out, but _openai_sse_to_anthropic_events consumes body_iterator directly and so was reaching the raw str. The old .split(b"\n") path raised TypeError on the first streaming request. Normalise at entry (decode bytes → str, leave str alone) and switch the inner splitter to str literals. Adds a regression test feeding a str iterator so the bytes-only path no longer hides the real contract. Reported from a live run: claude-code → /v1/messages → 500 with TypeError: must be str or None, not bytes at api_anthropic.py:193. --- lightllm/server/api_anthropic.py | 19 +++++++++------ test/test_api/test_anthropic_translation.py | 27 +++++++++++++++++++++ 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/lightllm/server/api_anthropic.py b/lightllm/server/api_anthropic.py index 62aacfe70..ebb77ccdf 100644 --- a/lightllm/server/api_anthropic.py +++ b/lightllm/server/api_anthropic.py @@ -186,19 +186,24 @@ async def _openai_sse_to_anthropic_events( "tool_calls": "tool_use", } - async for raw_line in openai_body_iterator: - if not raw_line: + async for raw_chunk in openai_body_iterator: + if not raw_chunk: continue + # chat_completions_impl yields str ("data: {...}\n\n"); some callers or + # middlewares may hand us bytes. Normalise to str so the splitter below + # does not have to branch on type. 
+ if isinstance(raw_chunk, (bytes, bytearray)): + raw_chunk = raw_chunk.decode("utf-8", errors="replace") # A single StreamingResponse chunk may contain multiple SSE lines. - for line in raw_line.split(b"\n"): + for line in raw_chunk.split("\n"): line = line.strip() - if not line or not line.startswith(b"data: "): + if not line or not line.startswith("data: "): continue - payload = line[len(b"data: "):] - if payload == b"[DONE]": + payload = line[len("data: "):] + if payload == "[DONE]": continue try: - chunk = json.loads(payload.decode("utf-8")) + chunk = json.loads(payload) except Exception: logger.debug("Skipping non-JSON SSE payload: %r", payload) continue diff --git a/test/test_api/test_anthropic_translation.py b/test/test_api/test_anthropic_translation.py index 8cdb7feb4..4e7946a82 100644 --- a/test/test_api/test_anthropic_translation.py +++ b/test/test_api/test_anthropic_translation.py @@ -221,6 +221,33 @@ async def collect(): assert '"output_tokens"' in joined +def test_stream_bridge_accepts_str_iterator(): + """Regression: chat_completions_impl yields str chunks, not bytes. 
+ The bridge must accept either without raising on split().""" + from lightllm.server.api_anthropic import _openai_sse_to_anthropic_events + + openai_chunks = [ + 'data: {"id":"x","object":"chat.completion.chunk","created":0,"model":"m","choices":[{"index":0,"delta":{"role":"assistant","content":"Hi"},"finish_reason":null}]}\n\n', + 'data: {"id":"x","object":"chat.completion.chunk","created":0,"model":"m","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}\n\n', + 'data: {"id":"x","object":"chat.completion.chunk","created":0,"model":"m","choices":[],"usage":{"prompt_tokens":3,"completion_tokens":1,"total_tokens":4}}\n\n', + "data: [DONE]\n\n", + ] + + async def fake_body_iterator(): + for c in openai_chunks: + yield c + + async def collect(): + return [b.decode("utf-8") async for b in _openai_sse_to_anthropic_events( + fake_body_iterator(), requested_model="claude-opus-4-6", message_id="msg_str_test" + )] + + joined = "".join(_run(collect())) + assert "event: message_start" in joined + assert "Hi" in joined + assert "event: message_stop" in joined + + def test_anthropic_to_chat_request_with_tools(): from lightllm.server.api_anthropic import _anthropic_to_chat_request From 8b4cddde2e75eeb7a9f0cf25bf7333befc5b551f Mon Sep 17 00:00:00 2001 From: sufubao Date: Wed, 15 Apr 2026 17:02:57 +0800 Subject: [PATCH 18/22] fix(api): stream tool_use content blocks in Anthropic SSE bridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _openai_sse_to_anthropic_events previously only handled delta.content (text) and silently dropped delta.tool_calls. When a model returned a streaming tool call, clients saw stop_reason=tool_use with zero tool_use content blocks — the exact shape that makes Claude Code report "The model's tool call could not be parsed (retry also failed)". 
The bridge now: - tracks at most one open content block at a time (text or tool_use); - buffers tool-call id/name/arguments per OpenAI streaming index until a name is available, then emits content_block_start(tool_use); - flushes buffered arguments and any subsequent deltas as input_json_delta partial_json events; - closes the current block when switching between text and tool_use or between tool_use blocks; - closes any still-open block at end of stream before message_delta. Adds test_stream_bridge_emits_tool_use_content_block: feeds a realistic three-chunk tool-call stream and asserts the Anthropic event sequence (message_start, tool_use content_block_start, input_json_delta x N, content_block_stop, message_delta with stop_reason=tool_use, message_stop) plus verifies the partial_json deltas reconstruct the original arguments. Reproduced against a live server at /v1/messages with --tool_call_parser qwen3_coder; pre-fix stream contained event: message_delta data: {...,"stop_reason":"tool_use",...} with no tool_use block. Post-fix stream contains the full sequence. --- lightllm/server/api_anthropic.py | 128 ++++++++++++++++++-- test/test_api/test_anthropic_translation.py | 76 ++++++++++++ 2 files changed, 193 insertions(+), 11 deletions(-) diff --git a/lightllm/server/api_anthropic.py b/lightllm/server/api_anthropic.py index ebb77ccdf..d0a471800 100644 --- a/lightllm/server/api_anthropic.py +++ b/lightllm/server/api_anthropic.py @@ -169,13 +169,26 @@ async def _openai_sse_to_anthropic_events( """Async generator: consume OpenAI-format SSE bytes and yield Anthropic-format SSE event bytes. - Only the text-only path is implemented here. Tool-use streaming - requires additional state tracking and is handled in Task 7. + Handles both text deltas (emitted as text_delta content blocks) and + tool-call deltas (emitted as tool_use content blocks whose arguments + stream as input_json_delta events). 
Anthropic's protocol opens one + content block at a time — when switching between a text block and a + tool_use block (or between tool_use blocks) the current block is + closed before the next is opened. """ - # State message_started = False - text_block_open = False - text_block_index = 0 # always 0 on the text-only path; multi-block streaming lands with tool_use support. + next_content_index = 0 + + # Currently open content block, if any. + # current_open is either None or a tuple ("text"|"tool_use", anthropic_index). + current_open = None + + text_block_index = None # Anthropic index of the active text block. + + # Per-tool-call state keyed by OpenAI streaming tool_calls[i].index. + # Each entry: {anthropic_index, id, name, started, buffered_args} + tool_state: Dict[int, Dict[str, Any]] = {} + final_stop_reason = "end_turn" final_output_tokens = 0 final_input_tokens = 0 @@ -255,10 +268,18 @@ async def _openai_sse_to_anthropic_events( }, ) + # ---- Text delta ---- content_piece = delta.get("content") if content_piece: - if not text_block_open: - text_block_open = True + if current_open is None or current_open[0] != "text": + if current_open is not None: + yield _sse_event( + "content_block_stop", + {"type": "content_block_stop", "index": current_open[1]}, + ) + text_block_index = next_content_index + next_content_index += 1 + current_open = ("text", text_block_index) yield _sse_event( "content_block_start", { @@ -276,14 +297,99 @@ async def _openai_sse_to_anthropic_events( }, ) + # ---- Tool-call deltas ---- + for tc in delta.get("tool_calls") or []: + tc_idx = tc.get("index", 0) + fn = tc.get("function") or {} + state = tool_state.setdefault( + tc_idx, + { + "anthropic_index": None, + "id": None, + "name": None, + "started": False, + "buffered_args": "", + }, + ) + if tc.get("id"): + state["id"] = tc["id"] + if fn.get("name"): + state["name"] = fn["name"] + new_args = fn.get("arguments") or "" + + if not state["started"]: + # Buffer args until we know the tool 
name (required for + # content_block_start). + state["buffered_args"] += new_args + if not state["name"]: + continue + # Close whatever block is currently open (text or a + # previous tool_use) before opening this one. + if current_open is not None: + yield _sse_event( + "content_block_stop", + {"type": "content_block_stop", "index": current_open[1]}, + ) + state["anthropic_index"] = next_content_index + next_content_index += 1 + current_open = ("tool_use", state["anthropic_index"]) + state["started"] = True + yield _sse_event( + "content_block_start", + { + "type": "content_block_start", + "index": state["anthropic_index"], + "content_block": { + "type": "tool_use", + "id": state["id"] or f"toolu_{uuid.uuid4().hex[:24]}", + "name": state["name"], + "input": {}, + }, + }, + ) + if state["buffered_args"]: + yield _sse_event( + "content_block_delta", + { + "type": "content_block_delta", + "index": state["anthropic_index"], + "delta": { + "type": "input_json_delta", + "partial_json": state["buffered_args"], + }, + }, + ) + state["buffered_args"] = "" + else: + # Already started. If deltas for a different block are + # now arriving (unusual interleaving), close whatever's + # currently open and reopen... but in practice OpenAI + # streams tool_calls sequentially per index, so the + # current_open is this same block. + if new_args: + yield _sse_event( + "content_block_delta", + { + "type": "content_block_delta", + "index": state["anthropic_index"], + "delta": { + "type": "input_json_delta", + "partial_json": new_args, + }, + }, + ) + if finish_reason: final_stop_reason = _OPENAI_TO_ANTHROPIC_STOP.get(finish_reason, "end_turn") - # Close any open content block - if text_block_open: - yield _sse_event("content_block_stop", {"type": "content_block_stop", "index": text_block_index}) + # Close any still-open content block. 
+ if current_open is not None: + yield _sse_event( + "content_block_stop", + {"type": "content_block_stop", "index": current_open[1]}, + ) - # message_delta carries the final stop_reason and cumulative output_tokens + # message_delta carries the final stop_reason and cumulative output_tokens. if message_started: yield _sse_event( "message_delta", diff --git a/test/test_api/test_anthropic_translation.py b/test/test_api/test_anthropic_translation.py index 4e7946a82..a7fc71e4d 100644 --- a/test/test_api/test_anthropic_translation.py +++ b/test/test_api/test_anthropic_translation.py @@ -221,6 +221,82 @@ async def collect(): assert '"output_tokens"' in joined +def test_stream_bridge_emits_tool_use_content_block(): + """Regression for Task 6 gap: the bridge must translate OpenAI + streaming tool_calls deltas into Anthropic tool_use content blocks. + Without this, clients see stop_reason=tool_use but zero content blocks + and report 'tool call could not be parsed'.""" + from lightllm.server.api_anthropic import _openai_sse_to_anthropic_events + + # Simulate OpenAI emitting a single tool call in three chunks: + # chunk 1: id + name + # chunk 2: first args slice + # chunk 3: second args slice, then finish_reason=tool_calls + openai_chunks = [ + 'data: {"id":"x","object":"chat.completion.chunk","created":0,"model":"m",' + '"choices":[{"index":0,"delta":{"role":"assistant","tool_calls":[' + '{"index":0,"id":"call_abc","type":"function","function":{"name":"get_weather","arguments":""}}' + ']},"finish_reason":null}]}\n\n', + 'data: {"id":"x","object":"chat.completion.chunk","created":0,"model":"m",' + '"choices":[{"index":0,"delta":{"tool_calls":[' + '{"index":0,"function":{"arguments":"{\\"city\\":"}}' + ']},"finish_reason":null}]}\n\n', + 'data: {"id":"x","object":"chat.completion.chunk","created":0,"model":"m",' + '"choices":[{"index":0,"delta":{"tool_calls":[' + '{"index":0,"function":{"arguments":" \\"San Francisco\\"}"}}' + ']},"finish_reason":"tool_calls"}]}\n\n', + 
'data: {"id":"x","object":"chat.completion.chunk","created":0,"model":"m",' + '"choices":[],"usage":{"prompt_tokens":20,"completion_tokens":12,"total_tokens":32}}\n\n', + "data: [DONE]\n\n", + ] + + async def fake_body_iterator(): + for c in openai_chunks: + yield c + + async def collect(): + return [b.decode("utf-8") async for b in _openai_sse_to_anthropic_events( + fake_body_iterator(), requested_model="claude-opus-4-6", message_id="msg_tool_test" + )] + + events = _run(collect()) + joined = "".join(events) + + # The event sequence must contain a tool_use content_block_start, + # at least one input_json_delta, a content_block_stop, and + # message_delta with stop_reason=tool_use. + must_appear_in_order = [ + "event: message_start", + '"type":"tool_use"', # content_block_start carries tool_use + '"name":"get_weather"', # ...with the right name + '"input_json_delta"', # and at least one input_json_delta + "event: content_block_stop", + "event: message_delta", + '"stop_reason":"tool_use"', + "event: message_stop", + ] + last_idx = -1 + for needle in must_appear_in_order: + idx = joined.find(needle, last_idx + 1) + assert idx > last_idx, f"missing or out-of-order event: {needle}\n----- full:\n{joined}" + last_idx = idx + + # Arguments must be transmitted in one or more deltas that together + # reconstruct the original JSON. + partials = [] + for line in joined.splitlines(): + if '"input_json_delta"' in line and "partial_json" in line: + # crude extraction of the partial_json field + m = line.split('"partial_json":"', 1) + if len(m) == 2: + # unescape backslashes from JSON string encoding + raw = m[1].rsplit('"', 1)[0] + partials.append(raw.encode("utf-8").decode("unicode_escape")) + joined_args = "".join(partials) + assert '"city"' in joined_args and "San Francisco" in joined_args, \ + f"tool-call arguments not reconstructed: {joined_args!r}" + + def test_stream_bridge_accepts_str_iterator(): """Regression: chat_completions_impl yields str chunks, not bytes. 
The bridge must accept either without raising on split().""" From 04db23743dc7d209022445b787d1c5e65e09a11c Mon Sep 17 00:00:00 2001 From: sufubao Date: Wed, 15 Apr 2026 17:36:40 +0800 Subject: [PATCH 19/22] fix(api): Anthropic response cosmetic cleanups and error envelope MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four fixes observed from live Claude Code + curl traffic against /v1/messages, none functional-blocking but all wire-format drift that confuses strict Anthropic clients: 1. message id prefix. LiteLLM passes LightLLM's raw numeric request id through (e.g. "56"). Strict clients expect msg_* — force it in _normalize_anthropic_response when the adapter's output does not already match. 2. Empty leading text blocks. LiteLLM's response adapter sometimes emits [{"type":"text","text":""}, {"type":"tool_use",...}] where the spec wants just [{"type":"tool_use",...}]. Filter empty text blocks in the normaliser. 3. provider_specific_fields leak. LiteLLM internal field surfaces on content blocks in the final response; strip it in the normaliser. 4. Error envelope. create_error_response (from api_openai) produces OpenAI's {"error":{"message":...,"type":...}} shape, which Claude Code does not parse — it surfaces a generic failure instead of the real message. Add _anthropic_error_response which returns the Anthropic envelope {"type":"error","error":{"type":...,"message":...}} with status -> error-type mapping from https://docs.anthropic.com/en/api/errors. Use it for every error-return path in anthropic_messages_impl, including a _rewrap_openai_error_as_anthropic helper that best-effort decodes OpenAI errors coming back from chat_completions_impl and rewraps them into Anthropic shape. Extracts the cleanup logic into a new _normalize_anthropic_response helper so it can be tested directly without mocking the whole LiteLLM adapter path. 
Adds three unit tests covering (1)+(2)+(3) and an already-good-id preservation case, plus an error-envelope shape test. --- lightllm/server/api_anthropic.py | 103 ++++++++++++++++++-- test/test_api/test_anthropic_translation.py | 81 +++++++++++++++ 2 files changed, 175 insertions(+), 9 deletions(-) diff --git a/lightllm/server/api_anthropic.py b/lightllm/server/api_anthropic.py index d0a471800..ce559d562 100644 --- a/lightllm/server/api_anthropic.py +++ b/lightllm/server/api_anthropic.py @@ -107,14 +107,43 @@ def _chat_response_to_anthropic( else: result = dict(anthropic_obj) - # Echo the client-provided model name. + return _normalize_anthropic_response(result, requested_model) + + +def _normalize_anthropic_response( + result: Dict[str, Any], requested_model: str +) -> Dict[str, Any]: + """Cosmetic clean-ups applied to every non-streaming Anthropic response: + + - echo the client-supplied model name (LiteLLM sometimes emits the + upstream model id instead); + - force the Anthropic ``msg_`` id prefix (LiteLLM passes LightLLM's + raw numeric request id through, which confuses strict clients); + - set default ``type`` / ``role`` / ``stop_sequence`` when missing; + - drop empty text blocks (LiteLLM sometimes produces a leading + ``{"type":"text","text":""}`` before a tool_use block); + - strip the LiteLLM-specific ``provider_specific_fields`` leak from + every content block. 
+ """ result["model"] = requested_model - result.setdefault("id", f"msg_{uuid.uuid4().hex[:24]}") + if not str(result.get("id", "")).startswith("msg_"): + result["id"] = f"msg_{uuid.uuid4().hex[:24]}" result.setdefault("type", "message") result.setdefault("role", "assistant") result.setdefault("stop_sequence", None) + cleaned_content = [] + for block in result.get("content") or []: + if not isinstance(block, dict): + cleaned_content.append(block) + continue + if block.get("type") == "text" and not block.get("text"): + continue + block.pop("provider_specific_fields", None) + cleaned_content.append(block) + result["content"] = cleaned_content + return result @@ -402,6 +431,53 @@ async def _openai_sse_to_anthropic_events( yield _sse_event("message_stop", {"type": "message_stop"}) +# --------------------------------------------------------------------------- +# Error response helper +# --------------------------------------------------------------------------- + + +# HTTP status → Anthropic error type. Derived from +# https://docs.anthropic.com/en/api/errors ; values outside this map fall +# back to "api_error". +_STATUS_TO_ERROR_TYPE = { + 400: "invalid_request_error", + 401: "authentication_error", + 403: "permission_error", + 404: "not_found_error", + 413: "request_too_large", + 429: "rate_limit_error", + 500: "api_error", + 529: "overloaded_error", +} + + +def _anthropic_error_response(status: HTTPStatus, message: str) -> JSONResponse: + """Return an Anthropic-shaped error envelope. + + Anthropic clients (including Claude Code) parse the {"type":"error", + "error":{"type":..., "message":...}} shape; the OpenAI-style envelope + from create_error_response hides the real message from them. 
+ """ + err_type = _STATUS_TO_ERROR_TYPE.get(int(status), "api_error") + return JSONResponse( + {"type": "error", "error": {"type": err_type, "message": message}}, + status_code=int(status), + ) + + +def _rewrap_openai_error_as_anthropic(resp: JSONResponse) -> JSONResponse: + """Convert an OpenAI-format JSONResponse produced by create_error_response + into Anthropic's error envelope. Best-effort: if we can't decode the body + we leave the response alone so the caller still sees something.""" + try: + body = json.loads(bytes(resp.body).decode("utf-8")) + inner = (body or {}).get("error") or {} + message = inner.get("message") or "request failed" + except Exception: + return resp + return _anthropic_error_response(HTTPStatus(resp.status_code), message) + + # --------------------------------------------------------------------------- # HTTP entry point # --------------------------------------------------------------------------- @@ -410,15 +486,15 @@ async def _openai_sse_to_anthropic_events( async def anthropic_messages_impl(raw_request: Request) -> Response: # Lazy imports to avoid pulling in heavy server deps at module import time. 
from .api_models import ChatCompletionRequest, ChatCompletionResponse - from .api_openai import chat_completions_impl, create_error_response + from .api_openai import chat_completions_impl try: raw_body = await raw_request.json() except Exception as exc: - return create_error_response(HTTPStatus.BAD_REQUEST, f"Invalid JSON body: {exc}") + return _anthropic_error_response(HTTPStatus.BAD_REQUEST, f"Invalid JSON body: {exc}") if not isinstance(raw_body, dict): - return create_error_response(HTTPStatus.BAD_REQUEST, "Request body must be a JSON object") + return _anthropic_error_response(HTTPStatus.BAD_REQUEST, "Request body must be a JSON object") requested_model = raw_body.get("model", "default") is_stream = bool(raw_body.get("stream")) @@ -427,7 +503,9 @@ async def anthropic_messages_impl(raw_request: Request) -> Response: chat_dict, tool_name_mapping = _anthropic_to_chat_request(raw_body) except Exception as exc: logger.exception("Failed to translate Anthropic request") - return create_error_response(HTTPStatus.BAD_REQUEST, f"Request translation failed: {exc}") + return _anthropic_error_response( + HTTPStatus.BAD_REQUEST, f"Request translation failed: {exc}" + ) # Force the downstream path to stream if the client asked for stream. 
chat_dict["stream"] = is_stream @@ -436,7 +514,9 @@ async def anthropic_messages_impl(raw_request: Request) -> Response: chat_request = ChatCompletionRequest(**chat_dict) except Exception as exc: logger.exception("Failed to build ChatCompletionRequest") - return create_error_response(HTTPStatus.BAD_REQUEST, f"Invalid request after translation: {exc}") + return _anthropic_error_response( + HTTPStatus.BAD_REQUEST, f"Invalid request after translation: {exc}" + ) downstream = await chat_completions_impl(chat_request, raw_request) @@ -444,7 +524,10 @@ async def anthropic_messages_impl(raw_request: Request) -> Response: from fastapi.responses import StreamingResponse if not isinstance(downstream, StreamingResponse): - return downstream # error path + # chat_completions_impl returned an OpenAI-format error — rewrap it. + if isinstance(downstream, JSONResponse): + return _rewrap_openai_error_as_anthropic(downstream) + return downstream message_id = f"msg_{uuid.uuid4().hex[:24]}" anthropic_stream = _openai_sse_to_anthropic_events( @@ -453,7 +536,9 @@ async def anthropic_messages_impl(raw_request: Request) -> Response: return StreamingResponse(anthropic_stream, media_type="text/event-stream") if not isinstance(downstream, ChatCompletionResponse): - return downstream # JSONResponse error + if isinstance(downstream, JSONResponse): + return _rewrap_openai_error_as_anthropic(downstream) + return downstream anthropic_dict = _chat_response_to_anthropic(downstream, tool_name_mapping, requested_model) return JSONResponse(anthropic_dict) diff --git a/test/test_api/test_anthropic_translation.py b/test/test_api/test_anthropic_translation.py index a7fc71e4d..6febaa09f 100644 --- a/test/test_api/test_anthropic_translation.py +++ b/test/test_api/test_anthropic_translation.py @@ -165,6 +165,87 @@ def test_chat_response_to_anthropic_minimal_text(): assert anthropic_dict["usage"]["output_tokens"] == 2 +def test_normalize_anthropic_response_cosmetic_cleanups(): + 
"""_normalize_anthropic_response must: + - force the message id into the Anthropic msg_* format, + - echo the client-supplied model name, + - drop empty leading text blocks, + - strip the LiteLLM-specific provider_specific_fields key from + every content block, + - leave well-formed fields alone. + """ + from lightllm.server.api_anthropic import _normalize_anthropic_response + + raw = { + "id": "56", + "type": "message", + "role": "assistant", + "model": "local-model", + "content": [ + {"type": "text", "text": ""}, + { + "type": "tool_use", + "id": "call_abc123", + "name": "get_weather", + "input": {"city": "San Francisco"}, + "provider_specific_fields": None, + }, + ], + "stop_reason": "tool_use", + "stop_sequence": None, + "usage": {"input_tokens": 10, "output_tokens": 5}, + } + result = _normalize_anthropic_response(raw, requested_model="claude-opus-4-6") + + assert result["id"].startswith("msg_"), result["id"] + assert result["model"] == "claude-opus-4-6" + + # Empty text block dropped; tool_use preserved and cleaned. + assert all( + not (b.get("type") == "text" and not b.get("text")) for b in result["content"] + ) + tool_blocks = [b for b in result["content"] if b.get("type") == "tool_use"] + assert len(tool_blocks) == 1 + assert "provider_specific_fields" not in tool_blocks[0] + assert tool_blocks[0]["name"] == "get_weather" + assert tool_blocks[0]["input"] == {"city": "San Francisco"} + # Non-empty fields are untouched. 
+ assert result["stop_reason"] == "tool_use" + assert result["usage"]["input_tokens"] == 10 + + +def test_normalize_anthropic_response_preserves_good_id(): + """If the adapter already produced a msg_* id, don't replace it.""" + from lightllm.server.api_anthropic import _normalize_anthropic_response + + good_id = "msg_01abcd1234" + result = _normalize_anthropic_response( + {"id": good_id, "content": [{"type": "text", "text": "hi"}]}, + requested_model="claude-opus-4-6", + ) + assert result["id"] == good_id + + +def test_anthropic_error_response_shape(): + """Error responses must match Anthropic's envelope so Claude Code and + other SDKs surface the real message instead of a generic failure.""" + from lightllm.server.api_anthropic import _anthropic_error_response + from http import HTTPStatus + import json as _json + + resp = _anthropic_error_response(HTTPStatus.BAD_REQUEST, "max_tokens must be positive") + assert resp.status_code == 400 + body = _json.loads(bytes(resp.body).decode("utf-8")) + assert body["type"] == "error" + assert body["error"]["type"] == "invalid_request_error" + assert body["error"]["message"] == "max_tokens must be positive" + + # Unknown status falls back to api_error + resp2 = _anthropic_error_response(HTTPStatus.INTERNAL_SERVER_ERROR, "boom") + body2 = _json.loads(bytes(resp2.body).decode("utf-8")) + assert body2["error"]["type"] == "api_error" + + def _run(coro): return asyncio.run(coro) From b03d06c07e9e388e24ef628f95e2930e9d734199 Mon Sep 17 00:00:00 2001 From: sufubao Date: Thu, 16 Apr 2026 00:16:01 +0800 Subject: [PATCH 20/22] fix(api): catch translation errors and return Anthropic error envelope --- lightllm/server/api_anthropic.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lightllm/server/api_anthropic.py b/lightllm/server/api_anthropic.py index ce559d562..22ca972b9 100644 --- a/lightllm/server/api_anthropic.py +++ b/lightllm/server/api_anthropic.py @@ -540,5 +540,9 @@ async def 
anthropic_messages_impl(raw_request: Request) -> Response:
                 return _rewrap_openai_error_as_anthropic(downstream)
             return downstream
 
-    anthropic_dict = _chat_response_to_anthropic(downstream, tool_name_mapping, requested_model)
+    try:
+        anthropic_dict = _chat_response_to_anthropic(downstream, tool_name_mapping, requested_model)
+    except Exception as exc:
+        logger.exception("Failed to translate response to Anthropic format")
+        # _anthropic_error_response already returns a JSONResponse with the
+        # Anthropic error envelope and the right status code — do not wrap it
+        # in another JSONResponse (that would try to JSON-serialize a Response
+        # object and raise TypeError at render time).
+        return _anthropic_error_response(HTTPStatus.INTERNAL_SERVER_ERROR, str(exc))
     return JSONResponse(anthropic_dict)

From ca5941701d26a50bbbd4e6f259cace16bdd0e3a6 Mon Sep 17 00:00:00 2001
From: sufubao
Date: Thu, 16 Apr 2026 00:17:58 +0800
Subject: [PATCH 21/22] test: remove all Anthropic API tests

---
 conftest.py                                 |  20 -
 test/test_api/conftest.py                   |   1 -
 test/test_api/test_anthropic_api.py         | 142 ------
 test/test_api/test_anthropic_translation.py | 526 --------------------
 4 files changed, 689 deletions(-)
 delete mode 100644 conftest.py
 delete mode 100644 test/test_api/conftest.py
 delete mode 100644 test/test_api/test_anthropic_api.py
 delete mode 100644 test/test_api/test_anthropic_translation.py

diff --git a/conftest.py b/conftest.py
deleted file mode 100644
index a7a6530ee..000000000
--- a/conftest.py
+++ /dev/null
@@ -1,20 +0,0 @@
-"""Root conftest.py for LightLLM test suite.
-
-Patches ``transformers.utils.versions`` before any test module is imported so
-that a tokenizers version mismatch (e.g. tokenizers 0.22.x vs the <0.22
-requirement baked into transformers 4.49) doesn't prevent ``api_models`` from
-being imported in the unit-test environment.
-
-This is a test-environment shim only; production code never calls this.
-"""
-import sys
-from unittest.mock import MagicMock
-
-# Only install the shim if transformers hasn't been imported yet AND it would
-# fail the version check. We do this unconditionally here so that any test
-# file that imports ``lightllm.server.api_models`` can do so safely.
-if "transformers.utils.versions" not in sys.modules: - _mock_versions = MagicMock() - _mock_versions.require_version = lambda *a, **kw: None - _mock_versions.require_version_core = lambda *a, **kw: None - sys.modules["transformers.utils.versions"] = _mock_versions diff --git a/test/test_api/conftest.py b/test/test_api/conftest.py deleted file mode 100644 index cb85009b7..000000000 --- a/test/test_api/conftest.py +++ /dev/null @@ -1 +0,0 @@ -collect_ignore = ["test_anthropic_api.py"] diff --git a/test/test_api/test_anthropic_api.py b/test/test_api/test_anthropic_api.py deleted file mode 100644 index b1725a6a4..000000000 --- a/test/test_api/test_anthropic_api.py +++ /dev/null @@ -1,142 +0,0 @@ -#!/usr/bin/env python3 -"""Manual integration test for the Anthropic API compatibility layer. - -Requires: - 1. A running LightLLM server started with --enable_anthropic_api - 2. ``pip install anthropic`` - -Usage: - python test/test_api/test_anthropic_api.py \ - --base-url http://localhost:8088 \ - --model my-local-model - -Each assertion exits the script with non-zero status on failure so it can -be used as a CI gate once a GPU runner is available. 
-""" -from __future__ import annotations - -import argparse -import sys - - -def _fail(msg: str) -> None: - print(f"FAIL: {msg}", file=sys.stderr) - sys.exit(1) - - -def test_non_streaming_text(client, model: str) -> None: - resp = client.messages.create( - model=model, - max_tokens=64, - messages=[{"role": "user", "content": "Reply with the single word: pong"}], - ) - print("[non-stream]", resp) - if resp.type != "message": - _fail(f"expected type=message, got {resp.type}") - if not resp.content or resp.content[0].type != "text": - _fail(f"expected a text content block, got {resp.content}") - if resp.stop_reason not in {"end_turn", "stop_sequence", "max_tokens"}: - _fail(f"unexpected stop_reason: {resp.stop_reason}") - if resp.usage.input_tokens <= 0 or resp.usage.output_tokens <= 0: - _fail(f"unexpected usage: {resp.usage}") - - -def test_streaming_text(client, model: str) -> None: - collected = [] - stop_reason = None - with client.messages.stream( - model=model, - max_tokens=64, - messages=[{"role": "user", "content": "Count from 1 to 5."}], - ) as stream: - for text in stream.text_stream: - collected.append(text) - final = stream.get_final_message() - stop_reason = final.stop_reason - - full = "".join(collected) - print(f"[stream] stop_reason={stop_reason!r} text={full!r}") - if not full.strip(): - _fail("streaming produced no text") - if stop_reason not in {"end_turn", "max_tokens"}: - _fail(f"unexpected stop_reason after stream: {stop_reason}") - - -def test_system_prompt(client, model: str) -> None: - resp = client.messages.create( - model=model, - max_tokens=32, - system="Always reply with exactly the word: banana", - messages=[{"role": "user", "content": "What fruit?"}], - ) - text = resp.content[0].text if resp.content else "" - print(f"[system] text={text!r}") - if "banana" not in text.lower(): - print(f"WARN: system prompt may not be routed — got {text!r}", file=sys.stderr) - - -def test_tool_use(client, model: str) -> None: - resp = 
client.messages.create( - model=model, - max_tokens=256, - tools=[ - { - "name": "get_weather", - "description": "Return the current weather for a city.", - "input_schema": { - "type": "object", - "properties": {"city": {"type": "string"}}, - "required": ["city"], - }, - } - ], - messages=[{"role": "user", "content": "What's the weather in San Francisco?"}], - ) - print(f"[tool] stop_reason={resp.stop_reason} content={resp.content}") - tool_blocks = [b for b in resp.content if b.type == "tool_use"] - if resp.stop_reason == "tool_use" and not tool_blocks: - _fail("stop_reason=tool_use but no tool_use content block") - if tool_blocks: - tb = tool_blocks[0] - if tb.name != "get_weather": - _fail(f"unexpected tool name: {tb.name}") - if not isinstance(tb.input, dict): - _fail(f"tool input is not a dict: {tb.input!r}") - - -def main() -> None: - parser = argparse.ArgumentParser() - parser.add_argument("--base-url", default="http://localhost:8088") - parser.add_argument("--model", default="default") - parser.add_argument("--api-key", default="dummy") - parser.add_argument( - "--skip", - nargs="*", - default=[], - choices=["non_stream", "stream", "system", "tool"], - help="Tests to skip", - ) - args = parser.parse_args() - - try: - import anthropic - except ImportError: - _fail("anthropic SDK not installed. 
Run: pip install anthropic") - return - - client = anthropic.Anthropic(base_url=args.base_url, api_key=args.api_key) - - if "non_stream" not in args.skip: - test_non_streaming_text(client, args.model) - if "stream" not in args.skip: - test_streaming_text(client, args.model) - if "system" not in args.skip: - test_system_prompt(client, args.model) - if "tool" not in args.skip: - test_tool_use(client, args.model) - - print("\nAll selected tests passed.") - - -if __name__ == "__main__": - main() diff --git a/test/test_api/test_anthropic_translation.py b/test/test_api/test_anthropic_translation.py deleted file mode 100644 index 6febaa09f..000000000 --- a/test/test_api/test_anthropic_translation.py +++ /dev/null @@ -1,526 +0,0 @@ -"""Unit tests for the Anthropic API translation layer. - -These tests import the translation helpers directly and do not require -a running LightLLM server. They do require 'litellm' to be installed — -tests are skipped if it is not available. -""" -import asyncio - -import pytest - -litellm = pytest.importorskip("litellm") - - -def test_shim_imports_adapter(): - from lightllm.server._litellm_shim import get_anthropic_messages_adapter - - adapter = get_anthropic_messages_adapter() - assert hasattr(adapter, "translate_anthropic_to_openai") - assert hasattr(adapter, "translate_openai_response_to_anthropic") - - -def test_shim_raises_clear_error_when_litellm_missing(monkeypatch): - import sys - - from lightllm.server import _litellm_shim - - monkeypatch.setitem(sys.modules, "litellm", None) - _litellm_shim._cached_adapter = None - _litellm_shim._import_checked = False # reset module-level cache - - with pytest.raises(RuntimeError, match="--enable_anthropic_api requires"): - _litellm_shim.get_anthropic_messages_adapter() - - -def test_adapter_round_trip_minimal_text(): - """Lock down LiteLLM adapter I/O shapes for a minimal text request. 
- - If this test breaks after a LiteLLM upgrade, the adapter's contract - has shifted and _litellm_shim.py may need updating. - """ - from lightllm.server._litellm_shim import get_anthropic_messages_adapter - from litellm import ModelResponse - - adapter = get_anthropic_messages_adapter() - - anthropic_request = { - "model": "claude-opus-4-6", - "max_tokens": 128, - "system": "You are a terse assistant.", - "messages": [ - {"role": "user", "content": "Say hi."}, - ], - } - - # Direction 1: Anthropic request -> OpenAI request - openai_request, tool_name_mapping = adapter.translate_anthropic_to_openai(anthropic_request) - - # Should be a dict-like / pydantic model with messages field - openai_dict = ( - openai_request.model_dump(exclude_none=True) - if hasattr(openai_request, "model_dump") - else dict(openai_request) - ) - assert "messages" in openai_dict - messages = openai_dict["messages"] - - # System prompt should be injected as a system-role message - assert any(m.get("role") == "system" for m in messages), messages - # User content should be preserved - assert any(m.get("role") == "user" for m in messages), messages - assert isinstance(tool_name_mapping, dict) - - # Direction 2: OpenAI response -> Anthropic response - fake_openai_response_dict = { - "id": "chatcmpl-test", - "object": "chat.completion", - "created": 0, - "model": "local-model", - "choices": [ - { - "index": 0, - "message": {"role": "assistant", "content": "Hi."}, - "finish_reason": "stop", - } - ], - "usage": {"prompt_tokens": 5, "completion_tokens": 2, "total_tokens": 7}, - } - # Wrap dict in ModelResponse for adapter consumption - fake_openai_response = ModelResponse(**fake_openai_response_dict) - anthropic_response = adapter.translate_openai_response_to_anthropic( - fake_openai_response, tool_name_mapping - ) - - resp_dict = ( - anthropic_response.model_dump(exclude_none=True) - if hasattr(anthropic_response, "model_dump") - else dict(anthropic_response) - ) - assert resp_dict.get("type") == 
"message" - assert resp_dict.get("role") == "assistant" - content = resp_dict.get("content") - assert isinstance(content, list) and len(content) >= 1 - assert content[0].get("type") == "text" - assert "Hi" in content[0].get("text", "") - # Stop reasons: Anthropic uses end_turn/tool_use/max_tokens/stop_sequence - assert resp_dict.get("stop_reason") in {"end_turn", "stop_sequence", None} - - -def test_anthropic_to_chat_request_dict_minimal_text(): - """_anthropic_to_chat_request should return a dict suitable for - constructing a LightLLM ChatCompletionRequest.""" - from lightllm.server.api_anthropic import _anthropic_to_chat_request - - anthropic_body = { - "model": "claude-opus-4-6", - "max_tokens": 64, - "system": "Be terse.", - "messages": [{"role": "user", "content": "hello"}], - "temperature": 0.4, - } - chat_request_dict, tool_name_mapping = _anthropic_to_chat_request(anthropic_body) - - assert "messages" in chat_request_dict - assert any(m.get("role") == "system" for m in chat_request_dict["messages"]) - assert any(m.get("role") == "user" for m in chat_request_dict["messages"]) - # max_tokens must be propagated - assert chat_request_dict.get("max_tokens") == 64 or chat_request_dict.get("max_completion_tokens") == 64 - assert isinstance(tool_name_mapping, dict) - - -def test_chat_response_to_anthropic_minimal_text(): - """_chat_response_to_anthropic should wrap a ChatCompletionResponse - into an Anthropic message dict.""" - from lightllm.server.api_anthropic import _chat_response_to_anthropic - from lightllm.server.api_models import ( - ChatCompletionResponse, - ChatCompletionResponseChoice, - ChatMessage, - UsageInfo, - ) - - chat_resp = ChatCompletionResponse( - id="chatcmpl-xyz", - model="local-model", - choices=[ - ChatCompletionResponseChoice( - index=0, - message=ChatMessage(role="assistant", content="Hello."), - finish_reason="stop", - ) - ], - usage=UsageInfo(prompt_tokens=3, completion_tokens=2, total_tokens=5), - ) - anthropic_dict = 
_chat_response_to_anthropic(chat_resp, tool_name_mapping={}, requested_model="claude-opus-4-6") - - assert anthropic_dict["type"] == "message" - assert anthropic_dict["role"] == "assistant" - assert anthropic_dict["model"] == "claude-opus-4-6" - content = anthropic_dict["content"] - assert isinstance(content, list) and len(content) >= 1 - assert content[0]["type"] == "text" - assert "Hello" in content[0]["text"] - assert anthropic_dict["stop_reason"] in {"end_turn", "stop_sequence"} - assert anthropic_dict["usage"]["input_tokens"] == 3 - assert anthropic_dict["usage"]["output_tokens"] == 2 - - -def test_normalize_anthropic_response_cosmetic_cleanups(): - """_normalize_anthropic_response must: - - force the message id into the Anthropic msg_* format, - - echo the client-supplied model name, - - drop empty leading text blocks, - - strip the LiteLLM-specific provider_specific_fields key from - every content block, - - leave well-formed fields alone. - """ - from lightllm.server.api_anthropic import _normalize_anthropic_response - - raw = { - "id": "56", - "type": "message", - "role": "assistant", - "model": "local-model", - "content": [ - {"type": "text", "text": ""}, - { - "type": "tool_use", - "id": "call_abc123", - "name": "get_weather", - "input": {"city": "San Francisco"}, - "provider_specific_fields": None, - }, - ], - "stop_reason": "tool_use", - "stop_sequence": None, - "usage": {"input_tokens": 10, "output_tokens": 5}, - } - result = _normalize_anthropic_response(raw, requested_model="claude-opus-4-6") - - assert result["id"].startswith("msg_"), result["id"] - assert result["model"] == "claude-opus-4-6" - - # Empty text block dropped; tool_use preserved and cleaned. 
- assert all( - not (b.get("type") == "text" and not b.get("text")) for b in result["content"] - ) - tool_blocks = [b for b in result["content"] if b.get("type") == "tool_use"] - assert len(tool_blocks) == 1 - assert "provider_specific_fields" not in tool_blocks[0] - assert tool_blocks[0]["name"] == "get_weather" - assert tool_blocks[0]["input"] == {"city": "San Francisco"} - # Non-empty fields are untouched. - assert result["stop_reason"] == "tool_use" - assert result["usage"]["input_tokens"] == 10 - - -def test_normalize_anthropic_response_preserves_good_id(): - """If the adapter already produced a msg_* id, don't replace it.""" - from lightllm.server.api_anthropic import _normalize_anthropic_response - - good_id = "msg_01abcd1234" - result = _normalize_anthropic_response( - {"id": good_id, "content": [{"type": "text", "text": "hi"}]}, - requested_model="claude-opus-4-6", - ) - assert result["id"] == good_id - - -def test_anthropic_error_response_shape(): - """Error responses must match Anthropic's envelope so Claude Code and - other SDKs surface the real message instead of a generic failure.""" - from lightllm.server.api_anthropic import _anthropic_error_response - from http import HTTPStatus - import json as _json - - resp = _anthropic_error_response(HTTPStatus.BAD_REQUEST, "max_tokens must be positive") - assert resp.status_code == 400 - body = _json.loads(bytes(resp.body).decode("utf-8")) - assert body["type"] == "error" - assert body["error"]["type"] == "invalid_request_error" - assert body["error"]["message"] == "max_tokens must be positive" - - # Unknown status falls back to api_error - resp2 = _anthropic_error_response(HTTPStatus.INTERNAL_SERVER_ERROR, "boom") - body2 = _json.loads(bytes(resp2.body).decode("utf-8")) - assert body2["error"]["type"] == "api_error" - - -def _run(coro): - return asyncio.run(coro) - - -def test_stream_bridge_emits_anthropic_event_sequence_text_only(): - """Feed a canned OpenAI SSE stream through the bridge and assert we - get 
the expected Anthropic event sequence.""" - from lightllm.server.api_anthropic import _openai_sse_to_anthropic_events - - # Simulate three OpenAI chunks: 'Hel', 'lo', finish_reason=stop. - openai_chunks = [ - b'data: {"id":"x","object":"chat.completion.chunk","created":0,"model":"m","choices":[{"index":0,"delta":{"role":"assistant","content":"Hel"},"finish_reason":null}]}\n\n', - b'data: {"id":"x","object":"chat.completion.chunk","created":0,"model":"m","choices":[{"index":0,"delta":{"content":"lo"},"finish_reason":null}]}\n\n', - b'data: {"id":"x","object":"chat.completion.chunk","created":0,"model":"m","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}\n\n', - b'data: {"id":"x","object":"chat.completion.chunk","created":0,"model":"m","choices":[],"usage":{"prompt_tokens":5,"completion_tokens":2,"total_tokens":7}}\n\n', - b"data: [DONE]\n\n", - ] - - async def fake_body_iterator(): - for c in openai_chunks: - yield c - - async def collect(): - out = [] - async for event_bytes in _openai_sse_to_anthropic_events( - fake_body_iterator(), requested_model="claude-opus-4-6", message_id="msg_test" - ): - out.append(event_bytes.decode("utf-8")) - return out - - events = _run(collect()) - joined = "".join(events) - - # Required event types appear in order - must_appear_in_order = [ - "event: message_start", - "event: content_block_start", - 'content_block_delta', - "event: content_block_stop", - "event: message_delta", - "event: message_stop", - ] - last_idx = -1 - for needle in must_appear_in_order: - idx = joined.find(needle, last_idx + 1) - assert idx > last_idx, f"missing or out-of-order event: {needle}\nfull:\n{joined}" - last_idx = idx - - # Text deltas preserve the original content - assert "Hel" in joined and "lo" in joined - # end_turn stop reason is surfaced - assert "end_turn" in joined - # Final usage output_tokens is included in message_delta - assert '"output_tokens"' in joined - - -def test_stream_bridge_emits_tool_use_content_block(): - 
"""Regression for Task 6 gap: the bridge must translate OpenAI - streaming tool_calls deltas into Anthropic tool_use content blocks. - Without this, clients see stop_reason=tool_use but zero content blocks - and report 'tool call could not be parsed'.""" - from lightllm.server.api_anthropic import _openai_sse_to_anthropic_events - - # Simulate OpenAI emitting a single tool call in three chunks: - # chunk 1: id + name - # chunk 2: first args slice - # chunk 3: second args slice, then finish_reason=tool_calls - openai_chunks = [ - 'data: {"id":"x","object":"chat.completion.chunk","created":0,"model":"m",' - '"choices":[{"index":0,"delta":{"role":"assistant","tool_calls":[' - '{"index":0,"id":"call_abc","type":"function","function":{"name":"get_weather","arguments":""}}' - ']},"finish_reason":null}]}\n\n', - 'data: {"id":"x","object":"chat.completion.chunk","created":0,"model":"m",' - '"choices":[{"index":0,"delta":{"tool_calls":[' - '{"index":0,"function":{"arguments":"{\\"city\\":"}}' - ']},"finish_reason":null}]}\n\n', - 'data: {"id":"x","object":"chat.completion.chunk","created":0,"model":"m",' - '"choices":[{"index":0,"delta":{"tool_calls":[' - '{"index":0,"function":{"arguments":" \\"San Francisco\\"}"}}' - ']},"finish_reason":"tool_calls"}]}\n\n', - 'data: {"id":"x","object":"chat.completion.chunk","created":0,"model":"m",' - '"choices":[],"usage":{"prompt_tokens":20,"completion_tokens":12,"total_tokens":32}}\n\n', - "data: [DONE]\n\n", - ] - - async def fake_body_iterator(): - for c in openai_chunks: - yield c - - async def collect(): - return [b.decode("utf-8") async for b in _openai_sse_to_anthropic_events( - fake_body_iterator(), requested_model="claude-opus-4-6", message_id="msg_tool_test" - )] - - events = _run(collect()) - joined = "".join(events) - - # The event sequence must contain a tool_use content_block_start, - # at least one input_json_delta, a content_block_stop, and - # message_delta with stop_reason=tool_use. 
- must_appear_in_order = [ - "event: message_start", - '"type":"tool_use"', # content_block_start carries tool_use - '"name":"get_weather"', # ...with the right name - '"input_json_delta"', # and at least one input_json_delta - "event: content_block_stop", - "event: message_delta", - '"stop_reason":"tool_use"', - "event: message_stop", - ] - last_idx = -1 - for needle in must_appear_in_order: - idx = joined.find(needle, last_idx + 1) - assert idx > last_idx, f"missing or out-of-order event: {needle}\n----- full:\n{joined}" - last_idx = idx - - # Arguments must be transmitted in one or more deltas that together - # reconstruct the original JSON. - partials = [] - for line in joined.splitlines(): - if '"input_json_delta"' in line and "partial_json" in line: - # crude extraction of the partial_json field - m = line.split('"partial_json":"', 1) - if len(m) == 2: - # unescape backslashes from JSON string encoding - raw = m[1].rsplit('"', 1)[0] - partials.append(raw.encode("utf-8").decode("unicode_escape")) - joined_args = "".join(partials) - assert '"city"' in joined_args and "San Francisco" in joined_args, \ - f"tool-call arguments not reconstructed: {joined_args!r}" - - -def test_stream_bridge_accepts_str_iterator(): - """Regression: chat_completions_impl yields str chunks, not bytes. 
- The bridge must accept either without raising on split().""" - from lightllm.server.api_anthropic import _openai_sse_to_anthropic_events - - openai_chunks = [ - 'data: {"id":"x","object":"chat.completion.chunk","created":0,"model":"m","choices":[{"index":0,"delta":{"role":"assistant","content":"Hi"},"finish_reason":null}]}\n\n', - 'data: {"id":"x","object":"chat.completion.chunk","created":0,"model":"m","choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}\n\n', - 'data: {"id":"x","object":"chat.completion.chunk","created":0,"model":"m","choices":[],"usage":{"prompt_tokens":3,"completion_tokens":1,"total_tokens":4}}\n\n', - "data: [DONE]\n\n", - ] - - async def fake_body_iterator(): - for c in openai_chunks: - yield c - - async def collect(): - return [b.decode("utf-8") async for b in _openai_sse_to_anthropic_events( - fake_body_iterator(), requested_model="claude-opus-4-6", message_id="msg_str_test" - )] - - joined = "".join(_run(collect())) - assert "event: message_start" in joined - assert "Hi" in joined - assert "event: message_stop" in joined - - -def test_anthropic_to_chat_request_with_tools(): - from lightllm.server.api_anthropic import _anthropic_to_chat_request - - anthropic_body = { - "model": "claude-opus-4-6", - "max_tokens": 256, - "messages": [{"role": "user", "content": "What's the weather in SF?"}], - "tools": [ - { - "name": "get_weather", - "description": "Return the current weather for a city", - "input_schema": { - "type": "object", - "properties": {"city": {"type": "string"}}, - "required": ["city"], - }, - } - ], - } - chat_dict, tool_name_mapping = _anthropic_to_chat_request(anthropic_body) - - assert "tools" in chat_dict - assert isinstance(chat_dict["tools"], list) and len(chat_dict["tools"]) == 1 - tool_entry = chat_dict["tools"][0] - # OpenAI tool format: {"type": "function", "function": {"name", "description", "parameters"}} - assert tool_entry.get("type") == "function" - fn = tool_entry.get("function") or {} - assert 
fn.get("name") == "get_weather" - # input_schema should have been renamed to parameters - assert "parameters" in fn - assert fn["parameters"]["properties"]["city"]["type"] == "string" - - -def test_chat_response_to_anthropic_with_tool_call(): - from lightllm.server.api_anthropic import _chat_response_to_anthropic - from lightllm.server.api_models import ( - ChatCompletionResponse, - ChatCompletionResponseChoice, - ChatMessage, - FunctionResponse, - ToolCall, - UsageInfo, - ) - - chat_resp = ChatCompletionResponse( - id="chatcmpl-tool", - model="local-model", - choices=[ - ChatCompletionResponseChoice( - index=0, - message=ChatMessage( - role="assistant", - content="", - tool_calls=[ - ToolCall( - id="call_abc123", - index=0, - type="function", - function=FunctionResponse( - name="get_weather", - arguments='{"city":"San Francisco"}', - ), - ) - ], - ), - finish_reason="tool_calls", - ) - ], - usage=UsageInfo(prompt_tokens=20, completion_tokens=12, total_tokens=32), - ) - anthropic_dict = _chat_response_to_anthropic( - chat_resp, tool_name_mapping={}, requested_model="claude-opus-4-6" - ) - - assert anthropic_dict["stop_reason"] == "tool_use" - content = anthropic_dict["content"] - tool_blocks = [b for b in content if b.get("type") == "tool_use"] - assert len(tool_blocks) == 1 - tool_block = tool_blocks[0] - assert tool_block["name"] == "get_weather" - assert isinstance(tool_block["input"], dict) - assert tool_block["input"].get("city") == "San Francisco" - assert tool_block.get("id", "").startswith("toolu_") or tool_block.get("id") == "call_abc123" - - -def test_anthropic_to_chat_request_with_image_content_block(): - """Vision smoke test: an Anthropic image content block must survive - translation without raising or being silently dropped. 
We do not - assert the exact OpenAI shape here because LiteLLM's adapter controls - that contract and may normalise it in different ways across releases.""" - from lightllm.server.api_anthropic import _anthropic_to_chat_request - - anthropic_body = { - "model": "claude-opus-4-6", - "max_tokens": 64, - "messages": [ - { - "role": "user", - "content": [ - { - "type": "image", - "source": { - "type": "base64", - "media_type": "image/png", - "data": "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR4nGNgAAIAAAUAAen63NgAAAAASUVORK5CYII=", - }, - }, - {"type": "text", "text": "What do you see?"}, - ], - } - ], - } - chat_dict, _ = _anthropic_to_chat_request(anthropic_body) - - assert "messages" in chat_dict - # The user message must still be present after translation — the exact - # shape of its content is left to the adapter, but it must exist. - user_messages = [m for m in chat_dict["messages"] if m.get("role") == "user"] - assert user_messages, f"user message was dropped during translation: {chat_dict['messages']}" From 1f1d30032f51a383226aadd5a0565a1335058293 Mon Sep 17 00:00:00 2001 From: sufubao Date: Thu, 16 Apr 2026 01:17:17 +0800 Subject: [PATCH 22/22] style: apply black formatting to api_anthropic and _litellm_shim --- lightllm/server/_litellm_shim.py | 5 +++-- lightllm/server/api_anthropic.py | 22 ++++++---------------- 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/lightllm/server/_litellm_shim.py b/lightllm/server/_litellm_shim.py index 3e782b09e..a055d9be7 100644 --- a/lightllm/server/_litellm_shim.py +++ b/lightllm/server/_litellm_shim.py @@ -37,12 +37,14 @@ def _get_litellm_version() -> str: """ try: import importlib.metadata + return importlib.metadata.version("litellm") except Exception: pass # Fallback: some older builds do expose it. 
try: import litellm + return getattr(litellm, "__version__", "unknown") except Exception: return "unknown" @@ -59,8 +61,7 @@ def _check_import_once() -> None: else: version = _get_litellm_version() logger.info( - "LiteLLM detected (version=%s) for Anthropic API compatibility layer. " - "Tested range: %s..%s", + "LiteLLM detected (version=%s) for Anthropic API compatibility layer. " "Tested range: %s..%s", version, _MIN_LITELLM_VERSION, _MAX_TESTED_LITELLM_VERSION, diff --git a/lightllm/server/api_anthropic.py b/lightllm/server/api_anthropic.py index 22ca972b9..67cd9f277 100644 --- a/lightllm/server/api_anthropic.py +++ b/lightllm/server/api_anthropic.py @@ -95,9 +95,7 @@ def _chat_response_to_anthropic( from litellm import ModelResponse # type: ignore model_response = ModelResponse(**openai_dict) - anthropic_obj = adapter.translate_openai_response_to_anthropic( - model_response, tool_name_mapping - ) + anthropic_obj = adapter.translate_openai_response_to_anthropic(model_response, tool_name_mapping) except Exception as exc: logger.warning("LiteLLM response translation failed (%s); using fallback", exc) return _fallback_openai_to_anthropic(openai_dict, requested_model) @@ -110,9 +108,7 @@ def _chat_response_to_anthropic( return _normalize_anthropic_response(result, requested_model) -def _normalize_anthropic_response( - result: Dict[str, Any], requested_model: str -) -> Dict[str, Any]: +def _normalize_anthropic_response(result: Dict[str, Any], requested_model: str) -> Dict[str, Any]: """Cosmetic clean-ups applied to every non-streaming Anthropic response: - echo the client-supplied model name (LiteLLM sometimes emits the @@ -157,9 +153,7 @@ def _fallback_openai_to_anthropic(openai_dict: Dict[str, Any], requested_model: choice = (openai_dict.get("choices") or [{}])[0] message = choice.get("message") or {} if message.get("tool_calls"): - raise RuntimeError( - "Fallback translator cannot handle tool_calls; LiteLLM adapter path is required." 
- ) + raise RuntimeError("Fallback translator cannot handle tool_calls; LiteLLM adapter path is required.") text = message.get("content") or "" usage = openai_dict.get("usage") or {} finish_reason = choice.get("finish_reason") @@ -241,7 +235,7 @@ async def _openai_sse_to_anthropic_events( line = line.strip() if not line or not line.startswith("data: "): continue - payload = line[len("data: "):] + payload = line[len("data: ") :] if payload == "[DONE]": continue try: @@ -503,9 +497,7 @@ async def anthropic_messages_impl(raw_request: Request) -> Response: chat_dict, tool_name_mapping = _anthropic_to_chat_request(raw_body) except Exception as exc: logger.exception("Failed to translate Anthropic request") - return _anthropic_error_response( - HTTPStatus.BAD_REQUEST, f"Request translation failed: {exc}" - ) + return _anthropic_error_response(HTTPStatus.BAD_REQUEST, f"Request translation failed: {exc}") # Force the downstream path to stream if the client asked for stream. chat_dict["stream"] = is_stream @@ -514,9 +506,7 @@ async def anthropic_messages_impl(raw_request: Request) -> Response: chat_request = ChatCompletionRequest(**chat_dict) except Exception as exc: logger.exception("Failed to build ChatCompletionRequest") - return _anthropic_error_response( - HTTPStatus.BAD_REQUEST, f"Invalid request after translation: {exc}" - ) + return _anthropic_error_response(HTTPStatus.BAD_REQUEST, f"Invalid request after translation: {exc}") downstream = await chat_completions_impl(chat_request, raw_request)