From 4747657b37ecf5be94437175744ab1d46244fedb Mon Sep 17 00:00:00 2001
From: zerob13 <zerob13@gmail.com>
Date: Wed, 27 May 2026 10:35:12 +0800
Subject: [PATCH] fix(tts): route MiMo Pro as chat

---
 docs/archives/chat-audio-tts-routing/plan.md  |  21 +++
 docs/archives/chat-audio-tts-routing/spec.md  |  26 +++
 docs/archives/chat-audio-tts-routing/tasks.md |   8 +
 .../llmProviderPresenter/aiSdk/runtime.ts     |  26 ++-
 src/shared/ttsSettings.ts                     |   7 +-
 .../llmProviderPresenter/aiSdkRuntime.test.ts | 172 ++++++++++++++++++
 test/main/shared/ttsSettings.test.ts          |  14 ++
 7 files changed, 266 insertions(+), 8 deletions(-)
 create mode 100644 docs/archives/chat-audio-tts-routing/plan.md
 create mode 100644 docs/archives/chat-audio-tts-routing/spec.md
 create mode 100644 docs/archives/chat-audio-tts-routing/tasks.md
 create mode 100644 test/main/shared/ttsSettings.test.ts

diff --git a/docs/archives/chat-audio-tts-routing/plan.md b/docs/archives/chat-audio-tts-routing/plan.md
new file mode 100644
index 000000000..e711ff914
--- /dev/null
+++ b/docs/archives/chat-audio-tts-routing/plan.md
@@ -0,0 +1,21 @@
+# Chat Audio TTS Routing Plan
+
+## Implementation
+
+- Tighten `isChatAudioTtsModel` so MiMo IDs must match the known MiMo prefixes and include a standalone `tts` segment.
+- Update `executeTtsPatternB` to treat `message.content` as unknown response data.
+- Extract audio parts only after checking `Array.isArray(message.content)`.
+- Keep `message.audio.data` as the first-preference extraction path.
+- Leave the existing missing-audio error path in place for responses that contain no audio data.
+
+## Test Strategy
+
+- Add shared helper coverage for MiMo TTS and non-TTS model IDs.
+- Extend `test/main/presenter/llmProviderPresenter/aiSdkRuntime.test.ts`.
+- Cover `mimo-v2.5-pro` using normal chat streaming instead of direct TTS `fetch`.
+- Cover a successful HTTP response with string `message.content` and no audio payload.
+- Assert the runtime rejects with the expected missing-audio error, not `content.find is not a function`.
+
+## Compatibility
+
+This change is backward-compatible for actual MiMo TTS models. Non-TTS MiMo chat models stop being routed through TTS handling, while providers returning `message.audio.data` or array content audio parts keep the same behavior.
diff --git a/docs/archives/chat-audio-tts-routing/spec.md b/docs/archives/chat-audio-tts-routing/spec.md
new file mode 100644
index 000000000..bfbbca2c5
--- /dev/null
+++ b/docs/archives/chat-audio-tts-routing/spec.md
@@ -0,0 +1,26 @@
+# Chat Audio TTS Routing
+
+## User Story
+
+When a MiMo chat model is selected, DeepChat should only enter chat-audio TTS handling for model IDs that are actually TTS variants. Regular MiMo chat models such as `MiMo-V2.5-Pro` should use the normal chat streaming runtime.
+
+## Acceptance Criteria
+
+- `mimo-v2.5-pro` and provider-prefixed variants (e.g. `xiaomimimo/mimo-v2.5-pro`) are not classified as TTS models.
+- MiMo model IDs with a `tts` segment, such as `mimo-v2.5-tts`, continue to use chat-audio TTS Pattern B.
+- Chat-audio TTS responses with `choices[0].message.audio.data` continue to emit cached audio.
+- Chat-audio TTS responses with array `choices[0].message.content` can still extract an audio content part.
+- Chat-audio TTS responses with string `choices[0].message.content` do not throw a `TypeError`.
+- If no audio payload exists, DeepChat raises the existing missing-audio error instead of a response-shape crash.
+
+## Non-Goals
+
+- No changes to renderer audio display behavior.
+- No changes to request body construction for chat-audio TTS models.
+
+## Constraints
+
+- Keep the response-shape fix localized to the AI SDK runtime.
+- Keep TTS model classification in shared helpers so provider and agent runtime checks agree.
+- Preserve current OpenAI-compatible chat-audio behavior.
+- Add focused regression coverage for the reported MiMo Pro misrouting and response shape.
diff --git a/docs/archives/chat-audio-tts-routing/tasks.md b/docs/archives/chat-audio-tts-routing/tasks.md
new file mode 100644
index 000000000..9ff9742f3
--- /dev/null
+++ b/docs/archives/chat-audio-tts-routing/tasks.md
@@ -0,0 +1,8 @@
+# Chat Audio TTS Routing Tasks
+
+- [x] Create SDD issue artifacts.
+- [x] Guard chat-audio TTS content audio extraction by response shape.
+- [x] Add a regression test for string `message.content`.
+- [x] Tighten MiMo chat-audio TTS classification.
+- [x] Add regression coverage for MiMo Pro chat routing.
+- [x] Run focused test coverage and quality checks.
diff --git a/src/main/presenter/llmProviderPresenter/aiSdk/runtime.ts b/src/main/presenter/llmProviderPresenter/aiSdk/runtime.ts
index bf5cf959b..6c3db6f9e 100644
--- a/src/main/presenter/llmProviderPresenter/aiSdk/runtime.ts
+++ b/src/main/presenter/llmProviderPresenter/aiSdk/runtime.ts
@@ -403,6 +403,22 @@ function extractTtsText(messages: ChatMessage[]): string {
   return ''
 }
 
+function extractChatAudioContentData(content: unknown): string | undefined {
+  if (!Array.isArray(content)) {
+    return undefined // non-array content (e.g. a plain string) has no parts to search
+  }
+  // Take the first array part tagged type 'audio'; falsy or non-object parts never match.
+  const audioPart = content.find(
+    (item) => item && typeof item === 'object' && 'type' in item && item.type === 'audio'
+  )
+  const audioData =
+    audioPart && typeof audioPart === 'object' && 'audio' in audioPart
+      ? (audioPart.audio as { data?: unknown } | undefined)?.data
+      : undefined
+  // Only a non-empty string counts as audio data; anything else is reported as undefined.
+  return typeof audioData === 'string' && audioData ? audioData : undefined
+}
+
 /**
  * Pattern A: calls the standard OpenAI-compatible /audio/speech endpoint.
  */
@@ -521,15 +537,15 @@ async function executeTtsPatternB(
     const json = (await response.json()) as {
       choices?: Array<{
         message?: {
-          audio?: { data?: string }
-          content?: Array<{ type?: string; audio?: { data?: string } }>
+          audio?: { data?: unknown }
+          content?: unknown
         }
       }>
     }
     const firstMessage = json.choices?.[0]?.message
-    const audioData =
-      firstMessage?.audio?.data ??
-      firstMessage?.content?.find((item) => item?.type === 'audio')?.audio?.data
+    const directAudioData =
+      typeof firstMessage?.audio?.data === 'string' ? firstMessage.audio.data : undefined
+    const audioData = directAudioData ?? extractChatAudioContentData(firstMessage?.content)
     if (!audioData) {
       throw new Error('TTS response missing audio data in choices[0].message.audio.data')
     }
diff --git a/src/shared/ttsSettings.ts b/src/shared/ttsSettings.ts
index d7f5255aa..9609fb58b 100644
--- a/src/shared/ttsSettings.ts
+++ b/src/shared/ttsSettings.ts
@@ -28,7 +28,8 @@ export const GEMINI_GENERATE_CONTENT_TTS_MODELS = [
  * Model ID prefixes for TTS models that use the chat completions endpoint
  * with audio output (Pattern B), e.g. xiaomimimo mimo-v2.5-tts series.
  */
-export const CHAT_AUDIO_TTS_MODEL_PREFIXES = ['mimo-v'] as const
+export const CHAT_AUDIO_TTS_MODEL_PREFIXES = ['mimo-v', 'xiaomi-mimo-v'] as const
+const CHAT_AUDIO_TTS_MODEL_MARKER_PATTERN = /(^|-)tts($|-)/ // "tts" bounded by "-" or string ends
 
 function normalizeTtsModelId(modelId: string): string {
   const trimmed = modelId.trim().toLowerCase()
@@ -59,8 +60,8 @@ export function isGeminiGenerateContentTtsModel(modelId: string): boolean {
 export function isChatAudioTtsModel(modelId: string): boolean {
   const id = normalizeTtsModelId(modelId)
   return (
-    CHAT_AUDIO_TTS_MODEL_PREFIXES.some((prefix) => id.startsWith(prefix)) ||
-    id.startsWith('xiaomi-mimo-v')
+    CHAT_AUDIO_TTS_MODEL_PREFIXES.some((prefix) => id.startsWith(prefix)) &&
+    CHAT_AUDIO_TTS_MODEL_MARKER_PATTERN.test(id) // a matching prefix alone does not qualify
   )
 }
 
diff --git a/test/main/presenter/llmProviderPresenter/aiSdkRuntime.test.ts b/test/main/presenter/llmProviderPresenter/aiSdkRuntime.test.ts
index b76887d89..82a500afc 100644
--- a/test/main/presenter/llmProviderPresenter/aiSdkRuntime.test.ts
+++ b/test/main/presenter/llmProviderPresenter/aiSdkRuntime.test.ts
@@ -367,6 +367,43 @@ describe('AI SDK runtime', () => {
     expect(request).not.toHaveProperty('providerOptions')
   })
 
+  it('uses normal chat streaming for non-TTS MiMo Pro models', async () => {
+    const fetchMock = vi.fn()
+    vi.stubGlobal('fetch', fetchMock)
+
+    const context = {
+      providerKind: 'openai-compatible',
+      provider: {
+        id: 'xiaomimimo',
+        apiType: 'openai-compatible',
+        baseUrl: 'https://example.com/v1',
+        apiKey: 'test-key'
+      },
+      configPresenter: {},
+      defaultHeaders: {}
+    } as any
+
+    const events = []
+    for await (const event of runAiSdkCoreStream(
+      context,
+      [{ role: 'user', content: 'hello mimo' }],
+      'mimo-v2.5-pro',
+      {
+        apiEndpoint: 'chat',
+        functionCall: false
+      } as any,
+      0.7,
+      1024,
+      []
+    )) {
+      events.push(event)
+    }
+    // No direct fetch call is expected; mockStreamText should be invoked exactly once instead.
+    expect(fetchMock).not.toHaveBeenCalled()
+    expect(mockStreamText).toHaveBeenCalledTimes(1)
+    expect(events).toEqual([])
+  })
+
   it('includes an assistant role message for chat-audio TTS requests', async () => {
     const fetchMock = vi.fn().mockResolvedValue(
       new Response(
@@ -450,6 +487,141 @@ describe('AI SDK runtime', () => {
     ])
   })
 
+  it('extracts chat-audio TTS data from content audio parts', async () => {
+    const fetchMock = vi.fn().mockResolvedValue(
+      new Response(
+        JSON.stringify({
+          choices: [
+            {
+              message: {
+                content: [
+                  { type: 'text', text: 'ok' },
+                  {
+                    type: 'audio',
+                    audio: {
+                      data: 'ZmFrZS1hdWRpby1wYXJ0'
+                    }
+                  }
+                ]
+              }
+            }
+          ]
+        }),
+        {
+          status: 200,
+          headers: {
+            'Content-Type': 'application/json'
+          }
+        }
+      )
+    )
+    vi.stubGlobal('fetch', fetchMock)
+
+    const context = {
+      providerKind: 'openai-compatible',
+      provider: {
+        id: 'xiaomimimo',
+        apiType: 'openai-compatible',
+        baseUrl: 'https://example.com/v1',
+        apiKey: 'test-key'
+      },
+      configPresenter: {},
+      defaultHeaders: {},
+      shouldUseTts: () => true
+    } as any
+
+    const events = []
+    for await (const event of runAiSdkCoreStream(
+      context,
+      [{ role: 'user', content: 'hello tts' }],
+      'mimo-v2.5-tts',
+      {
+        apiEndpoint: 'chat',
+        tts: {
+          responseFormat: 'wav'
+        }
+      } as any,
+      0.7,
+      1024,
+      []
+    )) {
+      events.push(event)
+    }
+    // Expect one image_data event carrying the audio/wav MIME type, followed by a stop event.
+    expect(events).toEqual([
+      {
+        type: 'image_data',
+        image_data: {
+          data: 'cached://image',
+          mimeType: 'audio/wav'
+        }
+      },
+      {
+        type: 'stop',
+        stop_reason: 'complete'
+      }
+    ])
+  })
+
+  it('fails cleanly when chat-audio TTS content is text without audio data', async () => {
+    const fetchMock = vi.fn().mockResolvedValue(
+      new Response(
+        JSON.stringify({
+          choices: [
+            {
+              message: {
+                content: 'plain text response without audio'
+              }
+            }
+          ]
+        }),
+        {
+          status: 200,
+          headers: {
+            'Content-Type': 'application/json'
+          }
+        }
+      )
+    )
+    vi.stubGlobal('fetch', fetchMock)
+
+    const context = {
+      providerKind: 'openai-compatible',
+      provider: {
+        id: 'xiaomimimo',
+        apiType: 'openai-compatible',
+        baseUrl: 'https://example.com/v1',
+        apiKey: 'test-key'
+      },
+      configPresenter: {},
+      defaultHeaders: {},
+      shouldUseTts: () => true
+    } as any
+    // Wrapped in a function so the rejection can be asserted with expect(...).rejects below.
+    const drainStream = async () => {
+      for await (const _event of runAiSdkCoreStream(
+        context,
+        [{ role: 'user', content: 'hello tts' }],
+        'mimo-v2.5-tts',
+        {
+          apiEndpoint: 'chat',
+          tts: {
+            responseFormat: 'wav'
+          }
+        } as any,
+        0.7,
+        1024,
+        []
+      )) {
+        // Drain stream.
+      }
+    }
+    // String content with no audio payload must reject with the missing-audio error message.
+    await expect(drainStream()).rejects.toThrow(
+      'TTS response missing audio data in choices[0].message.audio.data'
+    )
+  })
+
   it('uses Gemini generateContent compatibility mode for AIHubMix Gemini TTS models', async () => {
     const pcmBase64 = Buffer.from([0, 0, 255, 127]).toString('base64')
     const fetchMock = vi.fn().mockResolvedValue(
diff --git a/test/main/shared/ttsSettings.test.ts b/test/main/shared/ttsSettings.test.ts
new file mode 100644
index 000000000..6dfba1ec7
--- /dev/null
+++ b/test/main/shared/ttsSettings.test.ts
@@ -0,0 +1,14 @@
+import { describe, expect, it } from 'vitest'
+import { isChatAudioTtsModel, isTtsModelId } from '@shared/ttsSettings'
+
+describe('TTS model helpers', () => {
+  it('classifies only MiMo TTS variants as chat-audio TTS models', () => {
+    expect(isChatAudioTtsModel('mimo-v2.5-tts')).toBe(true)
+    expect(isChatAudioTtsModel('xiaomi-mimo-v2.5-tts-preview')).toBe(true)
+    expect(isChatAudioTtsModel('xiaomimimo/mimo-v2.5-tts')).toBe(true)
+    // IDs without a "tts" segment must be rejected, with or without a provider prefix.
+    expect(isChatAudioTtsModel('mimo-v2.5-pro')).toBe(false)
+    expect(isChatAudioTtsModel('xiaomimimo/mimo-v2.5-pro')).toBe(false)
+    expect(isTtsModelId('mimo-v2.5-pro')).toBe(false)
+  })
+})