diff --git a/apps/llm/package.json b/apps/llm/package.json index 8243a5eefe..acdc2379e7 100644 --- a/apps/llm/package.json +++ b/apps/llm/package.json @@ -28,7 +28,7 @@ "metro-config": "^0.83.0", "react": "19.2.5", "react-native": "0.83.4", - "react-native-audio-api": "0.12.0", + "react-native-audio-api": "0.12.2", "react-native-device-info": "^15.0.2", "react-native-executorch": "workspace:*", "react-native-executorch-expo-resource-fetcher": "workspace:*", diff --git a/apps/speech/App.tsx b/apps/speech/App.tsx index d336319805..3532fbce1a 100644 --- a/apps/speech/App.tsx +++ b/apps/speech/App.tsx @@ -2,6 +2,7 @@ import React, { useState } from 'react'; import { View, Text, StyleSheet, TouchableOpacity } from 'react-native'; import { TextToSpeechScreen } from './screens/TextToSpeechScreen'; import { SpeechToTextScreen } from './screens/SpeechToTextScreen'; +import { VoiceActivityDetectionScreen } from './screens/VoiceActivityDetectionScreen'; import ColorPalette from './colors'; import ExecutorchLogo from './assets/executorch.svg'; import { Quiz } from './screens/Quiz'; @@ -15,7 +16,12 @@ initExecutorch({ export default function App() { const [currentScreen, setCurrentScreen] = useState< - 'menu' | 'speech-to-text' | 'text-to-speech' | 'quiz' | 'text-to-speech-llm' + | 'menu' + | 'speech-to-text' + | 'text-to-speech' + | 'quiz' + | 'text-to-speech-llm' + | 'vad' >('menu'); const goToMenu = () => setCurrentScreen('menu'); @@ -28,6 +34,10 @@ export default function App() { return ; } + if (currentScreen === 'vad') { + return ; + } + if (currentScreen === 'quiz') { return ; } @@ -47,6 +57,12 @@ export default function App() { > Speech to Text + setCurrentScreen('vad')} + > + Voice Activity Detection + setCurrentScreen('text-to-speech')} diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index 5c6e2228fc..5f88f3764a 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx 
@@ -9,6 +9,7 @@ import { KeyboardAvoidingView, Platform, Switch, + Keyboard, } from 'react-native'; import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; import { @@ -19,6 +20,7 @@ import { } from 'react-native-executorch'; import { ModelPicker, ModelOption } from '../components/ModelPicker'; const speechToText = models.speech_to_text; +const vad = models.vad; type STTModelSources = SpeechToTextProps['model']; @@ -51,6 +53,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const model = useSpeechToText({ model: selectedModel, + vad: vad.fsmn_vad(), }); const [transcription, setTranscription] = @@ -65,6 +68,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { } | null>(null); const [enableTimestamps, setEnableTimestamps] = useState(false); + const [useVAD, setUseVAD] = useState(true); const [error, setError] = useState(null); const [audioURL, setAudioURL] = useState(''); const [hasMicPermission, setHasMicPermission] = useState(false); @@ -104,11 +108,15 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { } const handleTranscribeFromURL = async () => { - if (!audioURL.trim()) { - console.warn('Please provide a valid audio file URL'); + if (!audioURL.trim() || model.isGenerating) { + if (!audioURL.trim()) { + console.warn('Please provide a valid audio file URL'); + } return; } + Keyboard.dismiss(); + // Reset previous states setTranscription(null); setLiveResult(null); @@ -131,8 +139,10 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { }; const handleStartTranscribeFromMicrophone = async () => { - if (!hasMicPermission) { - setError('Microphone permission denied. Please enable it in Settings.'); + if (!hasMicPermission || model.isGenerating || liveTranscribing) { + if (!hasMicPermission) { + setError('Microphone permission denied. 
Please enable it in Settings.'); + } return; } @@ -177,7 +187,9 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { try { const streamIter = model.stream({ verbose: enableTimestamps, - timeout: 100, + timeout: 200, + useVAD: useVAD, + vadDetectionMargin: 1200, }); for await (const { committed, nonCommitted } of streamIter) { @@ -352,22 +364,64 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { Stop Live Transcription ) : ( - - - - {isSimulator - ? 'Recording is not available on Simulator' - : 'Start Live Transcription'} - - + + + + + {isSimulator ? 'No Mic' : 'Start Live'} + + + + setUseVAD(!useVAD)} + activeOpacity={0.7} + accessibilityRole="switch" + accessibilityState={{ checked: useVAD }} + accessibilityLabel={`Voice Activity Detection ${useVAD ? 'on' : 'off'}`} + style={[ + styles.vadButton, + useVAD ? styles.vadActive : styles.vadInactive, + recordingButtonDisabled && styles.disabled, + ]} + > + + + + VAD + + + {useVAD ? 'ON' : 'OFF'} + + + + )} @@ -492,6 +546,54 @@ const styles = StyleSheet.create({ backgroundBlue: { backgroundColor: '#0f186e', }, + buttonRow: { + flexDirection: 'row', + gap: 8, + marginTop: 12, + }, + flex1: { + flex: 1, + marginTop: 0, + }, + vadButton: { + flexDirection: 'row', + alignItems: 'center', + justifyContent: 'center', + paddingHorizontal: 14, + borderRadius: 12, + gap: 10, + }, + vadActive: { + backgroundColor: '#0f186e', + }, + vadInactive: { + backgroundColor: '#f1f5f9', + }, + vadTextContainer: { + alignItems: 'flex-start', + }, + vadButtonLabel: { + fontWeight: '800', + fontSize: 13, + letterSpacing: 0.5, + }, + vadButtonLabelActive: { + color: 'white', + }, + vadButtonLabelInactive: { + color: '#64748b', + }, + vadButtonState: { + fontWeight: '700', + fontSize: 10, + letterSpacing: 1, + }, + vadButtonStateActive: { + color: '#bbf7d0', + }, + vadButtonStateInactive: { + color: '#94a3b8', + }, disabled: { opacity: 0.5, }, diff --git 
a/apps/speech/screens/TextToSpeechScreen.tsx b/apps/speech/screens/TextToSpeechScreen.tsx index 198c40faf8..919076dc35 100644 --- a/apps/speech/screens/TextToSpeechScreen.tsx +++ b/apps/speech/screens/TextToSpeechScreen.tsx @@ -7,6 +7,7 @@ import { TextInput, KeyboardAvoidingView, Platform, + Keyboard, } from 'react-native'; import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; import { @@ -124,6 +125,7 @@ export const TextToSpeechScreen = ({ onBack }: { onBack: () => void }) => { return; } + Keyboard.dismiss(); setIsPlaying(true); try { diff --git a/apps/speech/screens/VoiceActivityDetectionScreen.tsx b/apps/speech/screens/VoiceActivityDetectionScreen.tsx new file mode 100644 index 0000000000..724ea52500 --- /dev/null +++ b/apps/speech/screens/VoiceActivityDetectionScreen.tsx @@ -0,0 +1,329 @@ +import React, { useEffect, useRef, useState } from 'react'; +import { + Text, + View, + StyleSheet, + TouchableOpacity, + ScrollView, + Platform, +} from 'react-native'; +import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; +import { + models, + useVAD +} from 'react-native-executorch'; +import FontAwesome from '@expo/vector-icons/FontAwesome'; +import { AudioManager, AudioRecorder } from 'react-native-audio-api'; +import SWMIcon from '../assets/swm_icon.svg'; +import DeviceInfo from 'react-native-device-info'; +import ErrorBanner from '../components/ErrorBanner'; + +const isSimulator = DeviceInfo.isEmulatorSync(); + +export const VoiceActivityDetectionScreen = ({ + onBack, +}: { + onBack: () => void; +}) => { + const model = useVAD({ + model: models.vad.fsmn_vad(), + }); + + const [isSpeaking, setIsSpeaking] = useState(false); + const [error, setError] = useState(null); + const [hasMicPermission, setHasMicPermission] = useState(false); + const [isStreaming, setIsStreaming] = useState(false); + + const recorder = useRef(new AudioRecorder()); + const logScrollRef = useRef(null); + const [logs, setLogs] = useState([]); + 
+ const addLog = (msg: string) => { + setLogs((prev) => [...prev, `${new Date().toLocaleTimeString()}: ${msg}`]); + }; + + useEffect(() => { + AudioManager.setAudioSessionOptions({ + iosCategory: 'playAndRecord', + iosMode: 'spokenAudio', + iosOptions: ['allowBluetoothHFP', 'defaultToSpeaker'], + }); + const checkPerms = async () => { + const status = await AudioManager.requestRecordingPermissions(); + setHasMicPermission(status === 'Granted'); + }; + checkPerms(); + }, []); + + const handleStartStreaming = async () => { + if (isStreaming || model.isGenerating || !model.isReady) { + return; + } + + setIsStreaming(true); + if (!hasMicPermission) { + setError('Microphone permission denied. Please enable it in Settings.'); + setIsStreaming(false); + return; + } + + setLogs([]); + addLog('Starting VAD stream...'); + + const sampleRate = 16000; + + recorder.current.onAudioReady( + { + sampleRate, + bufferLength: 0.1 * sampleRate, + channelCount: 1, + }, + ({ buffer }) => { + model.streamInsert(buffer.getChannelData(0)); + } + ); + + try { + const success = await AudioManager.setAudioSessionActivity(true); + if (!success) { + setError('Cannot start audio session correctly'); + } + const result = recorder.current.start(); + if (result.status === 'error') { + throw new Error(`Recording problems: ${result.message}`); // reported by the catch below, which also resets isStreaming so no stream starts without audio + } + + await model.stream({ + onSpeechBegin: () => { + setIsSpeaking(true); + addLog('Speech detected (Begin)'); + }, + onSpeechEnd: () => { + setIsSpeaking(false); + addLog('Silence detected (End)'); + }, + options: { + timeout: 100, + detectionMargin: 300, + }, + }); + } catch (e) { + setError(e instanceof Error ? 
e.message : String(e)); + setIsStreaming(false); + } + }; + + const handleStopStreaming = () => { + recorder.current.stop(); + model.streamStop(); + setIsStreaming(false); + setIsSpeaking(false); + addLog('VAD stream stopped'); + }; + + const getModelStatus = () => { + if (isStreaming || model.isGenerating) return 'Processing...'; + if (model.isReady) return 'Ready'; + return `Loading model: ${(100 * model.downloadProgress).toFixed(2)}%`; + }; + + useEffect(() => { + if (model.error) setError(String(model.error)); + }, [model.error]); + + const readyToStream = model.isReady; + const recordingButtonDisabled = + isSimulator || !readyToStream || model.isGenerating; + + return ( + + + + + + + + React Native ExecuTorch + Voice Activity Detection + + + + Status: {getModelStatus()} + + + setError(null)} /> + + + + + {isSpeaking ? 'SPEAKING' : 'SILENT'} + + + + + VAD Events + + logScrollRef.current?.scrollToEnd({ animated: true }) + } + > + {logs.length > 0 ? ( + logs.map((log, i) => ( + + {log} + + )) + ) : ( + + No events logged yet... + + )} + + + + + {isStreaming ? ( + + + Stop VAD Stream + + ) : ( + + + + {isSimulator + ? 
'Recording not available on Simulator' + : 'Start VAD Stream'} + + + )} + + + + ); +}; + +const styles = StyleSheet.create({ + container: { + flex: 1, + alignItems: 'center', + backgroundColor: 'white', + paddingHorizontal: 16, + }, + header: { + alignItems: 'center', + position: 'relative', + width: '100%', + }, + backButton: { + position: 'absolute', + left: 0, + top: 10, + padding: 10, + zIndex: 1, + }, + headerText: { + fontSize: 22, + fontWeight: 'bold', + color: '#0f186e', + }, + statusContainer: { + marginTop: 12, + alignItems: 'center', + }, + visualizerContainer: { + flex: 1, + justifyContent: 'center', + alignItems: 'center', + }, + visualizerText: { + marginTop: 20, + fontSize: 24, + fontWeight: '800', + letterSpacing: 2, + }, + speakingText: { + color: '#22c55e', + }, + silentText: { + color: '#ef4444', + }, + logContainer: { + height: 150, + width: '100%', + marginVertical: 12, + }, + logLabel: { + marginLeft: 12, + marginBottom: 4, + color: '#0f186e', + fontWeight: '600', + }, + logScrollContainer: { + borderRadius: 12, + borderWidth: 1, + borderColor: '#0f186e', + padding: 12, + backgroundColor: '#f8fafc', + }, + logText: { + fontSize: 12, + fontFamily: Platform.OS === 'ios' ? 
'Menlo' : 'monospace', + color: '#334155', + marginBottom: 2, + }, + placeholderText: { + color: '#aaa', + fontStyle: 'italic', + }, + inputContainer: { + marginBottom: 30, + width: '100%', + }, + liveButton: { + flexDirection: 'row', + justifyContent: 'center', + alignItems: 'center', + padding: 16, + borderRadius: 12, + gap: 8, + }, + backgroundRed: { + backgroundColor: '#ef4444', + }, + backgroundBlue: { + backgroundColor: '#0f186e', + }, + buttonText: { + color: 'white', + fontWeight: '600', + fontSize: 16, + }, + disabled: { + opacity: 0.5, + }, +}); diff --git a/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md b/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md index 80141db45f..02d0008dda 100644 --- a/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md +++ b/docs/docs/03-hooks/01-natural-language-processing/useSpeechToText.md @@ -49,7 +49,7 @@ import { AudioContext } from 'react-native-audio-api'; import * as FileSystem from 'expo-file-system'; const model = useSpeechToText({ - model: models.speech_to_text.whisper_tiny_en(), + model: models.speech_to_text.whisper_tiny_en(), // Use whisper_tiny_en for English or whisper_tiny for multilingual support }); // 1. Get audio file @@ -89,7 +89,13 @@ The `stream()` function accepts several optional parameters: - `language`: The language code (e.g., `'es'`, `'fr'`). Required for multilingual models. - `verbose`: If `true`, includes word-level timestamps and segment metadata in the result objects. +- `useVAD`: Enable the Voice Activity Detection submodule (if configured in `useSpeechToText` props) to optimize performance by filtering silence. Defaults to `false`. - `timeout`: (Advanced) The interval (in milliseconds) between processing consecutive audio chunks in streaming mode. Lower values provide more frequent updates and lower latency, while higher values reduce CPU consumption. Defaults to `100`. 
+- `vadDetectionMargin`: (Advanced) The duration of silence (in milliseconds) required after speech is detected before "committing" a segment. Defaults to `500`. Only applies when the VAD module is in use (`useVAD: true`). + +### Voice Activity Detection (VAD) + +Integrating a VAD submodule is highly recommended for streaming. It improves performance by automatically removing silence, which reduces CPU usage, saves battery, and prevents the model from "hallucinating" text during silent periods. ### Example @@ -102,6 +108,7 @@ import { AudioManager, AudioRecorder } from 'react-native-audio-api'; export default function LiveTranscriber() { const model = useSpeechToText({ model: models.speech_to_text.whisper_tiny_en(), + vad: models.vad.fsmn_vad(), }); const [text, setText] = useState(''); const isRecordingRef = useRef(false); @@ -111,7 +118,7 @@ export default function LiveTranscriber() { isRecordingRef.current = true; setText(''); - // 1. Capture microphone input + // 1. Capture microphone input recorder.onAudioReady( { sampleRate: 16000, bufferLength: 1600, channelCount: 1 }, (chunk) => model.streamInsert(chunk.buffer.getChannelData(0)) @@ -119,10 +126,14 @@ export default function LiveTranscriber() { await recorder.start(); - // 2. Process the stream + // 2.
Process the stream with VAD enabled try { let finalizedText = ''; - const streamIter = model.stream({ verbose: false }); + const streamIter = model.stream({ + verbose: false, + useVAD: true, // Enable VAD filter + vadDetectionMargin: 500, // Wait for 500ms of silence before committing + }); for await (const { committed, nonCommitted } of streamIter) { if (!isRecordingRef.current) break; @@ -163,6 +174,9 @@ To transcribe languages other than English, use a multilingual model (e.g., `mod ```typescript // Transcribe in Spanish +const model = useSpeechToText({ + model: models.speech_to_text.whisper_tiny(), +}); const result = await model.transcribe(spanishAudio, { language: 'es' }); ``` diff --git a/docs/docs/03-hooks/01-natural-language-processing/useVAD.md b/docs/docs/03-hooks/01-natural-language-processing/useVAD.md index 01cf8cd4e3..f05d53c10f 100644 --- a/docs/docs/03-hooks/01-natural-language-processing/useVAD.md +++ b/docs/docs/03-hooks/01-natural-language-processing/useVAD.md @@ -13,145 +13,79 @@ It is recommended to use models provided by us, which are available at our [Hugg - For detailed API Reference for `useVAD` see: [`useVAD` API Reference](../../06-api-reference/functions/useVAD.md). - For all VAD models available out-of-the-box in React Native ExecuTorch see: [VAD Models](../../06-api-reference/index.md#models---voice-activity-detection). -## High Level Overview +## Static Audio (Batch) processing -You can obtain waveform from audio in any way most suitable to you, however in the snippet below we utilize [`react-native-audio-api`](https://docs.swmansion.com/react-native-audio-api/) library to process a `.mp3` file. +This mode is best suited for processing pre-recorded audio files or existing buffers. You provide a full waveform to the `forward` method, which returns an array of detected speech segments. 
```typescript -import { models, useVAD } from 'react-native-executorch'; -import { AudioContext } from 'react-native-audio-api'; -import * as FileSystem from 'expo-file-system'; +import { useVAD, models } from 'react-native-executorch'; -const model = useVAD({ - model: models.vad.fsmn_vad(), -}); +const model = useVAD({ model: models.vad.fsmn_vad() }); -const { uri } = await FileSystem.downloadAsync( - 'https://some-audio-url.com/file.mp3', - FileSystem.cacheDirectory + 'audio_file' -); - -const audioContext = new AudioContext({ sampleRate: 16000 }); -const decodedAudioData = await audioContext.decodeAudioDataSource(uri); -const audioBuffer = decodedAudioData.getChannelData(0); +// ... obtain audioBuffer (Float32Array) at 16kHz ... try { - // NOTE: to obtain segments in seconds, you need to divide - // start / end of the segment by the sampling rate (16k) - const speechSegments = await model.forward(audioBuffer); - console.log(speechSegments); + console.log('Speech detected at:', await model.forward(audioBuffer)); } catch (error) { - console.error('Error during running VAD model', error); + console.error('VAD Error:', error); } ``` -### Arguments - -`useVAD` takes [`VADProps`](../../06-api-reference/interfaces/VADProps.md) that consists of: +:::note +Timestamps in `Segment[]` correspond to the indices of the input array. Divide them by your sampling rate (usually 16000) to get results in seconds. +::: -- `model` containing [`modelSource`](../../06-api-reference/interfaces/VADProps.md#modelsource). -- An optional flag [`preventLoad`](../../06-api-reference/interfaces/VADProps.md#preventload) which prevents auto-loading of the model. +## Live Streaming (Real-time detection) -You need more details? Check the following resources: +Live streaming allows you to process audio in real-time as it arrives from a microphone or network stream. It uses an internal state to track speech transitions across chunks. 
-- For detailed information about `useVAD` arguments check this section: [`useVAD` arguments](../../06-api-reference/functions/useVAD.md#parameters). -- For all VAD models available out-of-the-box in React Native ExecuTorch see: [VAD Models](../../06-api-reference/index.md#models---voice-activity-detection). -- For more information on loading resources, take a look at [loading models](../../01-fundamentals/02-loading-models.md) page. +### How it works -### Returns +1. **Start the session**: Call `model.stream()` with callbacks for speech events. This returns a promise that stays active until the stream is stopped. +2. **Feed audio**: Periodically push audio chunks using `model.streamInsert()`. +3. **Handle events**: Use `onSpeechBegin` and `onSpeechEnd` callbacks to trigger UI updates or toggle recording for other tasks (like STT). +4. **End the session**: Call `model.streamStop()` to clean up. -`useVAD` returns an object called `VADType` containing bunch of functions to interact with VAD models. To get more details please read: [`VADType` API Reference](../../06-api-reference/interfaces/VADType.md). +### Configuration Options -## Running the model +You can fine-tune the streaming behavior via the `options` object: -Before running the model's [`forward`](../../06-api-reference/interfaces/VADType.md#forward) method, make sure to extract the audio waveform you want to process. You'll need to handle this step yourself, ensuring the audio is sampled at 16 kHz. Once you have the waveform, pass it as an argument to the [`forward`](../../06-api-reference/interfaces/VADType.md#forward) method. The method returns a promise that resolves to the array of detected speech [`Segment[]`](../../06-api-reference/interfaces/Segment.md). - -:::note -Timestamps in returned speech segments, correspond to indices of input array (waveform). -::: - -## Example +- **`timeout`** (default: `100`ms): Specifies the interval between consecutive VAD inferences. 
A lower value makes the detection more responsive but increases CPU usage. +- **`detectionMargin`** (default: `100`ms): Specifies the maximum allowed gap between the last detected speech segment and the current time to still consider the speech as "ongoing." This value determines how much silence is tolerated before `onSpeechEnd` is triggered. ```tsx -import React from 'react'; -import { Button, Text, SafeAreaView } from 'react-native'; -import { models, useVAD } from 'react-native-executorch'; -import { AudioContext } from 'react-native-audio-api'; -import * as FileSystem from 'expo-file-system'; - -export default function App() { - const model = useVAD({ - model: models.vad.fsmn_vad(), +import { useVAD, models } from 'react-native-executorch'; + +const model = useVAD({ model: models.vad.fsmn_vad() }); + +const startLiveVAD = async () => { + // Start the continuous streaming listener + model.stream({ + onSpeechBegin: () => console.log('User started speaking'), + onSpeechEnd: () => console.log('User stopped speaking'), + options: { + timeout: 100, // Checks every 100ms + detectionMargin: 500, // 500ms of silence before ending speech + }, }); - const audioURL = 'https://some-audio-url.com/file.mp3'; - - const handleAudio = async () => { - if (!model) { - console.error('VAD model is not loaded yet.'); - return; - } - - console.log('Processing URL:', audioURL); - - try { - const { uri } = await FileSystem.downloadAsync( - audioURL, - FileSystem.cacheDirectory + 'vad_example.tmp' - ); - - const audioContext = new AudioContext({ sampleRate: 16000 }); - const originalDecodedBuffer = - await audioContext.decodeAudioDataSource(uri); - const originalChannelData = originalDecodedBuffer.getChannelData(0); - - const segments = await model.forward(originalChannelData); - if (segments.length === 0) { - console.log('No speech segments were found.'); - return; - } - console.log(`Found ${segments.length} speech segments.`); - - const totalLength = segments.reduce( - (sum, seg) => 
sum + (seg.end - seg.start), - 0 - ); - const newAudioBuffer = audioContext.createBuffer( - 1, // Mono - totalLength, - originalDecodedBuffer.sampleRate - ); - const newChannelData = newAudioBuffer.getChannelData(0); - - let offset = 0; - for (const segment of segments) { - const slice = originalChannelData.subarray(segment.start, segment.end); - newChannelData.set(slice, offset); - offset += slice.length; - } - - // Play the processed audio - const source = audioContext.createBufferSource(); - source.buffer = newAudioBuffer; - source.connect(audioContext.destination); - source.start(); - } catch (error) { - console.error('Error processing audio data:', error); - } - }; - - return ( - - - Press the button to process and play speech from a sample file. - -