diff --git a/testing/e2e/global-setup.ts b/testing/e2e/global-setup.ts index f869df01a..5f1bd7ca9 100644 --- a/testing/e2e/global-setup.ts +++ b/testing/e2e/global-setup.ts @@ -43,6 +43,19 @@ export default async function globalSetup() { mock.mount('/v1/text-to-speech', elevenLabsTTSMount()) mock.mount('/v1/speech-to-text', elevenLabsSTTMount()) + // Gemini TTS hits the standard Gemini generateContent endpoint + // (POST /v1beta/models/{model}:generateContent) with + // responseModalities: ['AUDIO']. aimock's native Gemini audio helper derives + // the mime type from the fixture's `format`/`contentType`, so it can't emit + // the raw `audio/L16;codec=pcm;rate=24000` PCM that real Gemini TTS returns. + // Mount the TTS model's generateContent path directly so we can hand back + // PCM and exercise the adapter's PCM→WAV normalization. The path is specific + // to the TTS model, so it doesn't intercept Gemini chat/summarize requests. + mock.mount( + '/v1beta/models/gemini-3.1-flash-tts-preview:generateContent', + geminiTTSMount(), + ) + // Anthropic server_tool_use bug reproduction (issue #604). aimock can't // natively synthesize `server_tool_use` / `web_fetch_tool_result` content // blocks, so this mount hand-crafts the raw SSE Claude would emit when a @@ -107,6 +120,14 @@ const FAKE_MP3_BYTES = Buffer.from([ 0xff, 0xfb, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ]) +/** + * Raw 16-bit little-endian PCM bytes. Gemini TTS returns audio as + * `audio/L16;codec=pcm;rate=24000` inlineData, which the adapter wraps in a + * RIFF/WAV header before handing it to the browser. The samples are arbitrary + * silence — the spec only asserts the `