From 4747657b37ecf5be94437175744ab1d46244fedb Mon Sep 17 00:00:00 2001 From: zerob13 Date: Wed, 27 May 2026 10:35:12 +0800 Subject: [PATCH] fix(tts): route MiMo Pro as chat --- docs/archives/chat-audio-tts-routing/plan.md | 21 +++ docs/archives/chat-audio-tts-routing/spec.md | 26 +++ docs/archives/chat-audio-tts-routing/tasks.md | 8 + .../llmProviderPresenter/aiSdk/runtime.ts | 26 ++- src/shared/ttsSettings.ts | 7 +- .../llmProviderPresenter/aiSdkRuntime.test.ts | 172 ++++++++++++++++++ test/main/shared/ttsSettings.test.ts | 14 ++ 7 files changed, 266 insertions(+), 8 deletions(-) create mode 100644 docs/archives/chat-audio-tts-routing/plan.md create mode 100644 docs/archives/chat-audio-tts-routing/spec.md create mode 100644 docs/archives/chat-audio-tts-routing/tasks.md create mode 100644 test/main/shared/ttsSettings.test.ts diff --git a/docs/archives/chat-audio-tts-routing/plan.md b/docs/archives/chat-audio-tts-routing/plan.md new file mode 100644 index 000000000..e711ff914 --- /dev/null +++ b/docs/archives/chat-audio-tts-routing/plan.md @@ -0,0 +1,21 @@ +# Chat Audio TTS Routing Plan + +## Implementation + +- Tighten `isChatAudioTtsModel` so MiMo IDs must match the known MiMo prefixes and include a standalone `tts` segment. +- Update `executeTtsPatternB` to treat `message.content` as unknown response data. +- Extract audio parts only after checking `Array.isArray(message.content)`. +- Keep `message.audio.data` as the first-preference extraction path. +- Leave the existing missing-audio error path in place for responses that contain no audio data. + +## Test Strategy + +- Add shared helper coverage for MiMo TTS and non-TTS model IDs. +- Extend `test/main/presenter/llmProviderPresenter/aiSdkRuntime.test.ts`. +- Cover `mimo-v2.5-pro` using normal chat streaming instead of direct TTS `fetch`. +- Cover a successful HTTP response with string `message.content` and no audio payload. 
+- Assert the runtime rejects with the expected missing-audio error, not `content.find is not a function`. + +## Compatibility + +This change is backward-compatible for actual MiMo TTS models. Non-TTS MiMo chat models stop being routed through TTS handling, while providers returning `message.audio.data` or array content audio parts keep the same behavior. diff --git a/docs/archives/chat-audio-tts-routing/spec.md b/docs/archives/chat-audio-tts-routing/spec.md new file mode 100644 index 000000000..bfbbca2c5 --- /dev/null +++ b/docs/archives/chat-audio-tts-routing/spec.md @@ -0,0 +1,26 @@ +# Chat Audio TTS Routing + +## User Story + +When a MiMo chat model is selected, DeepChat should only enter chat-audio TTS handling for model IDs that are actually TTS variants. Regular MiMo chat models such as `MiMo-V2.5-Pro` should use the normal chat streaming runtime. + +## Acceptance Criteria + +- `mimo-v2.5-pro` and provider-prefixed variants are not classified as TTS models. +- MiMo model IDs with a `tts` segment, such as `mimo-v2.5-tts`, continue to use chat-audio TTS Pattern B. +- Chat-audio TTS responses with `choices[0].message.audio.data` continue to emit cached audio. +- Chat-audio TTS responses with array `choices[0].message.content` can still extract an audio content part. +- Chat-audio TTS responses with string `choices[0].message.content` do not throw a `TypeError`. +- If no audio payload exists, DeepChat raises the existing missing-audio error instead of a response-shape crash. + +## Non-Goals + +- No changes to renderer audio display behavior. +- No changes to request body construction for chat-audio TTS models. + +## Constraints + +- Keep the fix localized to the AI SDK runtime. +- Keep TTS model classification in shared helpers so provider and agent runtime checks agree. +- Preserve current OpenAI-compatible chat-audio behavior. +- Add focused regression coverage for the reported MiMo Pro misrouting and response shape. 
diff --git a/docs/archives/chat-audio-tts-routing/tasks.md b/docs/archives/chat-audio-tts-routing/tasks.md
new file mode 100644
index 000000000..9ff9742f3
--- /dev/null
+++ b/docs/archives/chat-audio-tts-routing/tasks.md
@@ -0,0 +1,8 @@
+# Chat Audio TTS Routing Tasks
+
+- [x] Create SDD issue artifacts.
+- [x] Guard chat-audio TTS content audio extraction by response shape.
+- [x] Add a regression test for string `message.content`.
+- [x] Tighten MiMo chat-audio TTS classification.
+- [x] Add regression coverage for MiMo Pro chat routing.
+- [x] Run focused test coverage and quality checks.
diff --git a/src/main/presenter/llmProviderPresenter/aiSdk/runtime.ts b/src/main/presenter/llmProviderPresenter/aiSdk/runtime.ts
index bf5cf959b..6c3db6f9e 100644
--- a/src/main/presenter/llmProviderPresenter/aiSdk/runtime.ts
+++ b/src/main/presenter/llmProviderPresenter/aiSdk/runtime.ts
@@ -403,6 +403,44 @@ function extractTtsText(messages: ChatMessage[]): string {
   return ''
 }
 
+/**
+ * Reads the audio payload from a chat-audio TTS `message.content` value.
+ *
+ * The value comes straight from the provider response, so it is treated as
+ * unknown data: anything that is not an array (for example a plain string)
+ * yields `undefined` instead of throwing.
+ *
+ * @param content - Raw `choices[0].message.content` from the response body.
+ * @returns The first non-empty `audio.data` string found on a part whose
+ *   `type` is `'audio'`, or `undefined` when no part carries one.
+ */
+function extractChatAudioContentData(content: unknown): string | undefined {
+  if (!Array.isArray(content)) {
+    return undefined
+  }
+
+  // Keep scanning past audio parts that lack usable data so one malformed
+  // part cannot hide a valid payload that appears later in the array.
+  const parts: unknown[] = content
+  for (const item of parts) {
+    if (typeof item !== 'object' || item === null) {
+      continue
+    }
+
+    const part = item as { type?: unknown; audio?: unknown }
+    if (part.type !== 'audio' || typeof part.audio !== 'object' || part.audio === null) {
+      continue
+    }
+
+    const data = (part.audio as { data?: unknown }).data
+    if (typeof data === 'string' && data) {
+      return data
+    }
+  }
+
+  return undefined
+}
+
 /**
  * Pattern A: calls the standard OpenAI-compatible /audio/speech endpoint.
  */
@@ -521,15 +559,15 @@ async function executeTtsPatternB(
   const json = (await response.json()) as {
     choices?: Array<{
       message?: {
-        audio?: { data?: string }
-        content?: Array<{ type?: string; audio?: { data?: string } }>
+        audio?: { data?: unknown }
+        content?: unknown
       }
     }>
   }
   const firstMessage = json.choices?.[0]?.message
-  const audioData =
-    firstMessage?.audio?.data ??
-    firstMessage?.content?.find((item) => item?.type === 'audio')?.audio?.data
+  const directAudioData =
+    typeof firstMessage?.audio?.data === 'string' ? firstMessage.audio.data : undefined
+  const audioData = directAudioData ?? extractChatAudioContentData(firstMessage?.content)
   if (!audioData) {
     throw new Error('TTS response missing audio data in choices[0].message.audio.data')
   }
diff --git a/src/shared/ttsSettings.ts b/src/shared/ttsSettings.ts
index d7f5255aa..9609fb58b 100644
--- a/src/shared/ttsSettings.ts
+++ b/src/shared/ttsSettings.ts
@@ -28,7 +28,15 @@ export const GEMINI_GENERATE_CONTENT_TTS_MODELS = [
  * Model ID prefixes for TTS models that use the chat completions endpoint
  * with audio output (Pattern B), e.g. xiaomimimo mimo-v2.5-tts series.
  */
-export const CHAT_AUDIO_TTS_MODEL_PREFIXES = ['mimo-v'] as const
+export const CHAT_AUDIO_TTS_MODEL_PREFIXES = ['mimo-v', 'xiaomi-mimo-v'] as const
+
+/**
+ * Matches `tts` only as a standalone model ID segment, so MiMo chat models such
+ * as mimo-v2.5-pro are not treated as TTS. Any non-alphanumeric character counts
+ * as a segment boundary, so suffixed IDs like mimo-v2.5-tts:free or mimo-v2.5_tts
+ * still match. Assumes the ID has already been lowercased.
+ */
+const CHAT_AUDIO_TTS_MODEL_MARKER_PATTERN = /(^|[^a-z0-9])tts($|[^a-z0-9])/
 
 function normalizeTtsModelId(modelId: string): string {
   const trimmed = modelId.trim().toLowerCase()
@@ -59,8 +67,8 @@ export function isGeminiGenerateContentTtsModel(modelId: string): boolean {
 export function isChatAudioTtsModel(modelId: string): boolean {
   const id = normalizeTtsModelId(modelId)
   return (
-    CHAT_AUDIO_TTS_MODEL_PREFIXES.some((prefix) => id.startsWith(prefix)) ||
-    id.startsWith('xiaomi-mimo-v')
+    CHAT_AUDIO_TTS_MODEL_PREFIXES.some((prefix) => id.startsWith(prefix)) &&
+    CHAT_AUDIO_TTS_MODEL_MARKER_PATTERN.test(id)
   )
 }
 
diff --git a/test/main/presenter/llmProviderPresenter/aiSdkRuntime.test.ts b/test/main/presenter/llmProviderPresenter/aiSdkRuntime.test.ts
index b76887d89..82a500afc 100644
--- a/test/main/presenter/llmProviderPresenter/aiSdkRuntime.test.ts
+++ b/test/main/presenter/llmProviderPresenter/aiSdkRuntime.test.ts
@@ -367,6 +367,43 @@ describe('AI SDK runtime', () => {
     expect(request).not.toHaveProperty('providerOptions')
   })
 
+  it('uses normal chat streaming for non-TTS MiMo Pro 
models', async () => { + const fetchMock = vi.fn() + vi.stubGlobal('fetch', fetchMock) + + const context = { + providerKind: 'openai-compatible', + provider: { + id: 'xiaomimimo', + apiType: 'openai-compatible', + baseUrl: 'https://example.com/v1', + apiKey: 'test-key' + }, + configPresenter: {}, + defaultHeaders: {} + } as any + + const events = [] + for await (const event of runAiSdkCoreStream( + context, + [{ role: 'user', content: 'hello mimo' }], + 'mimo-v2.5-pro', + { + apiEndpoint: 'chat', + functionCall: false + } as any, + 0.7, + 1024, + [] + )) { + events.push(event) + } + + expect(fetchMock).not.toHaveBeenCalled() + expect(mockStreamText).toHaveBeenCalledTimes(1) + expect(events).toEqual([]) + }) + it('includes an assistant role message for chat-audio TTS requests', async () => { const fetchMock = vi.fn().mockResolvedValue( new Response( @@ -450,6 +487,141 @@ describe('AI SDK runtime', () => { ]) }) + it('extracts chat-audio TTS data from content audio parts', async () => { + const fetchMock = vi.fn().mockResolvedValue( + new Response( + JSON.stringify({ + choices: [ + { + message: { + content: [ + { type: 'text', text: 'ok' }, + { + type: 'audio', + audio: { + data: 'ZmFrZS1hdWRpby1wYXJ0' + } + } + ] + } + } + ] + }), + { + status: 200, + headers: { + 'Content-Type': 'application/json' + } + } + ) + ) + vi.stubGlobal('fetch', fetchMock) + + const context = { + providerKind: 'openai-compatible', + provider: { + id: 'xiaomimimo', + apiType: 'openai-compatible', + baseUrl: 'https://example.com/v1', + apiKey: 'test-key' + }, + configPresenter: {}, + defaultHeaders: {}, + shouldUseTts: () => true + } as any + + const events = [] + for await (const event of runAiSdkCoreStream( + context, + [{ role: 'user', content: 'hello tts' }], + 'mimo-v2.5-tts', + { + apiEndpoint: 'chat', + tts: { + responseFormat: 'wav' + } + } as any, + 0.7, + 1024, + [] + )) { + events.push(event) + } + + expect(events).toEqual([ + { + type: 'image_data', + image_data: { + data: 
'cached://image', + mimeType: 'audio/wav' + } + }, + { + type: 'stop', + stop_reason: 'complete' + } + ]) + }) + + it('fails cleanly when chat-audio TTS content is text without audio data', async () => { + const fetchMock = vi.fn().mockResolvedValue( + new Response( + JSON.stringify({ + choices: [ + { + message: { + content: 'plain text response without audio' + } + } + ] + }), + { + status: 200, + headers: { + 'Content-Type': 'application/json' + } + } + ) + ) + vi.stubGlobal('fetch', fetchMock) + + const context = { + providerKind: 'openai-compatible', + provider: { + id: 'xiaomimimo', + apiType: 'openai-compatible', + baseUrl: 'https://example.com/v1', + apiKey: 'test-key' + }, + configPresenter: {}, + defaultHeaders: {}, + shouldUseTts: () => true + } as any + + const drainStream = async () => { + for await (const _event of runAiSdkCoreStream( + context, + [{ role: 'user', content: 'hello tts' }], + 'mimo-v2.5-tts', + { + apiEndpoint: 'chat', + tts: { + responseFormat: 'wav' + } + } as any, + 0.7, + 1024, + [] + )) { + // Drain stream. 
+      }
+    }
+
+    await expect(drainStream()).rejects.toThrow(
+      'TTS response missing audio data in choices[0].message.audio.data'
+    )
+  })
+
   it('uses Gemini generateContent compatibility mode for AIHubMix Gemini TTS models', async () => {
     const pcmBase64 = Buffer.from([0, 0, 255, 127]).toString('base64')
     const fetchMock = vi.fn().mockResolvedValue(
diff --git a/test/main/shared/ttsSettings.test.ts b/test/main/shared/ttsSettings.test.ts
new file mode 100644
index 000000000..6dfba1ec7
--- /dev/null
+++ b/test/main/shared/ttsSettings.test.ts
@@ -0,0 +1,23 @@
+import { describe, expect, it } from 'vitest'
+import { isChatAudioTtsModel, isTtsModelId } from '@shared/ttsSettings'
+
+describe('TTS model helpers', () => {
+  it('classifies only MiMo TTS variants as chat-audio TTS models', () => {
+    expect(isChatAudioTtsModel('mimo-v2.5-tts')).toBe(true)
+    expect(isChatAudioTtsModel('xiaomi-mimo-v2.5-tts-preview')).toBe(true)
+    expect(isChatAudioTtsModel('xiaomimimo/mimo-v2.5-tts')).toBe(true)
+
+    expect(isChatAudioTtsModel('mimo-v2.5-pro')).toBe(false)
+    expect(isChatAudioTtsModel('xiaomimimo/mimo-v2.5-pro')).toBe(false)
+    expect(isTtsModelId('mimo-v2.5-pro')).toBe(false)
+  })
+
+  it('rejects IDs where tts is not a standalone segment or the prefix is not MiMo', () => {
+    // `tts` glued to other characters is not a TTS marker.
+    expect(isChatAudioTtsModel('mimo-v2.5-ttsx')).toBe(false)
+    expect(isChatAudioTtsModel('mimo-v2.5-fasttts')).toBe(false)
+
+    // A `tts` segment alone is not enough without a MiMo prefix.
+    expect(isChatAudioTtsModel('gpt-4o-mini-tts')).toBe(false)
+  })
+})