From a2538547e49de913646170ea5f46e7b2093c202a Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Fri, 22 May 2026 17:14:19 +0200 Subject: [PATCH 1/3] fix(docs): update useSpeechToText and useVAD documentation --- .../useSpeechToText.md | 1 - .../01-natural-language-processing/useVAD.md | 14 ++++++++++---- .../SpeechToTextModule.md | 18 +++++++++--------- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md b/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md index 02d0008dda..5862f7d52f 100644 --- a/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md +++ b/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md @@ -207,7 +207,6 @@ The hook returns an object with: - `streamInsert(audio)`: Push audio to the stream buffer. - `streamStop()`: Finish the current stream. - `isGenerating`: Boolean indicating if the model is busy. -- `loading`: Boolean indicating if the model is being loaded. ## Supported models diff --git a/docs/docs/03-hooks/01-natural-language-processing/useVAD.md b/docs/docs/03-hooks/01-natural-language-processing/useVAD.md index f05d53c10f..7aebdaec6d 100644 --- a/docs/docs/03-hooks/01-natural-language-processing/useVAD.md +++ b/docs/docs/03-hooks/01-natural-language-processing/useVAD.md @@ -56,8 +56,10 @@ You can fine-tune the streaming behavior via the `options` object: ```tsx import { useVAD, models } from 'react-native-executorch'; +import { AudioRecorder } from 'react-native-audio-api'; const model = useVAD({ model: models.vad.fsmn_vad() }); +const recorder = new AudioRecorder(); const startLiveVAD = async () => { // Start the continuous streaming listener @@ -70,13 +72,17 @@ const startLiveVAD = async () => { }, }); - // Example: Hook into your audio recorder's data event - audioRecorder.on('data', (chunk: Float32Array) => { - model.streamInsert(chunk); - }); + // Capture microphone input at 16kHz + recorder.onAudioReady( + { sampleRate: 16000, bufferLength: 1600, channelCount: 1 }, + (chunk) => model.streamInsert(chunk.buffer.getChannelData(0)) + ); + + await recorder.start(); }; const stopLiveVAD = () => { + recorder.stop(); model.streamStop(); }; ``` diff --git a/docs/docs/04-typescript-api/01-natural-language-processing/SpeechToTextModule.md b/docs/docs/04-typescript-api/01-natural-language-processing/SpeechToTextModule.md index 2e2597397d..989fc4fe6e 100644 --- a/docs/docs/04-typescript-api/01-natural-language-processing/SpeechToTextModule.md +++ b/docs/docs/04-typescript-api/01-natural-language-processing/SpeechToTextModule.md @@ -98,20 +98,20 @@ const model = await SpeechToTextModule.fromModelName( AudioManager.setAudioSessionOptions({ iosCategory: 'playAndRecord', iosMode: 'spokenAudio', - iosOptions: ['allowBluetooth', 'defaultToSpeaker'], + iosOptions: ['allowBluetoothHFP', 'defaultToSpeaker'], }); await AudioManager.requestRecordingPermissions(); // 2. Setup Audio Recorder -const recorder = new AudioRecorder({ - sampleRate: 16000, - channelCount: 1, -}); +const recorder = new AudioRecorder(); -recorder.onAudioReady((chunk) => { - // Feed chunks directly into the model's buffer - model.streamInsert(chunk.buffer.getChannelData(0)); -}); +recorder.onAudioReady( + { sampleRate: 16000, bufferLength: 1600, channelCount: 1 }, + (chunk) => { + // Feed chunks directly into the model's buffer + model.streamInsert(chunk.buffer.getChannelData(0)); + } +); await recorder.start(); From e4d8cc8892f861e8235a396ac5544828c619f63f Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Fri, 22 May 2026 17:21:22 +0200 Subject: [PATCH 2/3] fix(docs): fix useSpeechToText and useVAD return types in docs --- .../useSpeechToText.md | 19 ++++++++++++------- .../01-natural-language-processing/useVAD.md | 2 +- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md b/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md index 5862f7d52f..53869c6fd5 100644 --- a/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md +++ b/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md @@ -200,13 +200,18 @@ const result = await model.transcribe(audioBuffer, { verbose: true }); ### Returns -The hook returns an object with: - -- `transcribe(audio, options)`: One-shot transcription. -- `stream(options)`: Async generator for streaming results. -- `streamInsert(audio)`: Push audio to the stream buffer. -- `streamStop()`: Finish the current stream. -- `isGenerating`: Boolean indicating if the model is busy. +The hook returns a [`SpeechToTextType`](../../06-api-reference/interfaces/SpeechToTextType.md) object containing: + +- `error`: `null | RnExecutorchError` - Contains the error message if the model failed to load. +- `isReady`: `boolean` - Indicates whether the model has successfully loaded and is ready for inference. +- `isGenerating`: `boolean` - Indicates whether the model is currently processing an inference. +- `downloadProgress`: `number` - Tracks the progress of the model download process as a value between `0` and `1`. +- `transcribe(audio, options)`: Starts a transcription process for a given input array, which should be a waveform at 16kHz. Returns a promise resolving to a [`TranscriptionResult`](../../06-api-reference/interfaces/TranscriptionResult.md). +- `stream(options)`: Starts a streaming transcription process. Asynchronous generator that yields objects containing `committed` and `nonCommitted` transcriptions, both of type [`TranscriptionResult`](../../06-api-reference/interfaces/TranscriptionResult.md). +- `streamInsert(audio)`: Inserts a chunk of audio data (sampled at 16kHz) into the ongoing streaming transcription. +- `streamStop()`: Stops the ongoing streaming transcription process. +- `encode(audio)`: Runs the encoding part of the model on the provided waveform. Returns a promise resolving to the encoded `Float32Array`. +- `decode(tokens, encoderOutput)`: Runs the decoder of the model with the given tokens (`Int32Array`) and encoder output (`Float32Array`). Returns a promise resolving to the decoded `Float32Array`. ## Supported models diff --git a/docs/docs/03-hooks/01-natural-language-processing/useVAD.md b/docs/docs/03-hooks/01-natural-language-processing/useVAD.md index 7aebdaec6d..99d41f1608 100644 --- a/docs/docs/03-hooks/01-natural-language-processing/useVAD.md +++ b/docs/docs/03-hooks/01-natural-language-processing/useVAD.md @@ -90,7 +90,7 @@ const stopLiveVAD = () => { ### Arguments & Returns - **Arguments**: `useVAD` takes a [`VADProps`](../../06-api-reference/interfaces/VADProps.md) object containing the `model` and an optional `preventLoad` flag. -- **Returns**: A [`VADType`](../../06-api-reference/interfaces/VADType.md) object providing `forward`, `stream`, `streamInsert`, and `streamStop` methods, along with `isReady` and `error` states. +- **Returns**: A [`VADType`](../../06-api-reference/interfaces/VADType.md) object providing `forward`, `stream`, `streamInsert`, and `streamStop` methods, along with `error`, `isReady`, `isGenerating`, and `downloadProgress` states. ## Supported models From 53bb38f46bb6e085fd0c8a60cd9c4c56b433abdb Mon Sep 17 00:00:00 2001 From: Bartosz Hanc Date: Fri, 22 May 2026 17:25:11 +0200 Subject: [PATCH 3/3] fix(docs): update TextToSpeechModule examples to include streamInsert usage --- .../01-natural-language-processing/TextToSpeechModule.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md b/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md index daf5cb735b..fb248baecc 100644 --- a/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md +++ b/docs/docs/04-typescript-api/01-natural-language-processing/TextToSpeechModule.md @@ -100,8 +100,8 @@ const tts = await TextToSpeechModule.fromModelName( const audioContext = new AudioContext({ sampleRate: 24000 }); try { + tts.streamInsert('This is a streaming test, with a sample input.'); for await (const chunk of tts.stream({ - text: 'This is a streaming test, with a sample input.', speed: 1.0, })) { // Play each chunk sequentially @@ -135,8 +135,10 @@ const tts = await TextToSpeechModule.fromModelName( const waveform = await tts.forward('həlˈO wˈɜɹld!', 1.0, false); // Or stream from phonemes +tts.streamInsert( + 'ɐ mˈæn hˌu dˈʌzᵊnt tɹˈʌst hɪmsˈɛlf, kæn nˈɛvəɹ ɹˈiᵊli tɹˈʌst ˈɛniwˌʌn ˈɛls.' +); for await (const chunk of tts.stream({ - text: 'ɐ mˈæn hˌu dˈʌzᵊnt tɹˈʌst hɪmsˈɛlf, kæn nˈɛvəɹ ɹˈiᵊli tɹˈʌst ˈɛniwˌʌn ˈɛls.', speed: 1.0, phonemize: false, })) {