diff --git a/examples/models/voxtral_realtime/CMakePresets.json b/examples/models/voxtral_realtime/CMakePresets.json index 707e94b0169..33ae5072908 100644 --- a/examples/models/voxtral_realtime/CMakePresets.json +++ b/examples/models/voxtral_realtime/CMakePresets.json @@ -14,12 +14,16 @@ { "name": "voxtral-realtime-cpu", "displayName": "Voxtral Realtime runner (CPU)", - "inherits": ["voxtral-realtime-base"] + "inherits": [ + "voxtral-realtime-base" + ] }, { "name": "voxtral-realtime-metal", "displayName": "Voxtral Realtime runner (Metal)", - "inherits": ["voxtral-realtime-base"], + "inherits": [ + "voxtral-realtime-base" + ], "cacheVariables": { "EXECUTORCH_BUILD_METAL": "ON" }, @@ -32,14 +36,19 @@ { "name": "voxtral-realtime-cuda", "displayName": "Voxtral Realtime runner (CUDA)", - "inherits": ["voxtral-realtime-base"], + "inherits": [ + "voxtral-realtime-base" + ], "cacheVariables": { "EXECUTORCH_BUILD_CUDA": "ON" }, "condition": { "type": "inList", "string": "${hostSystemName}", - "list": ["Linux", "Windows"] + "list": [ + "Linux", + "Windows" + ] } } ], @@ -48,20 +57,26 @@ "name": "voxtral-realtime-cpu", "displayName": "Build Voxtral Realtime runner (CPU)", "configurePreset": "voxtral-realtime-cpu", - "targets": ["voxtral_realtime_runner"] + "targets": [ + "voxtral_realtime_runner" + ] }, { "name": "voxtral-realtime-metal", "displayName": "Build Voxtral Realtime runner (Metal)", "configurePreset": "voxtral-realtime-metal", "configuration": "Release", - "targets": ["voxtral_realtime_runner"] + "targets": [ + "voxtral_realtime_runner" + ] }, { "name": "voxtral-realtime-cuda", "displayName": "Build Voxtral Realtime runner (CUDA)", "configurePreset": "voxtral-realtime-cuda", - "targets": ["voxtral_realtime_runner"] + "targets": [ + "voxtral_realtime_runner" + ] } ], "workflowPresets": [ @@ -108,4 +123,4 @@ ] } ] -} +} diff --git a/examples/models/voxtral_realtime/README.md b/examples/models/voxtral_realtime/README.md index 
7d29ba8c11b..664b99d59ef 100644 --- a/examples/models/voxtral_realtime/README.md +++ b/examples/models/voxtral_realtime/README.md @@ -89,6 +89,7 @@ python export_voxtral_rt.py \ | `xnnpack` | ✓ | ✓ | `4w`, `8w`, `8da4w`, `8da8w` | | `metal` | ✓ | ✓ | none (fp32) or `fpa4w` (Metal-specific 4-bit) | | `cuda` | ✓ | ✓ | `4w`, `8w` | +| `cuda-windows` | ✓ | ✓ | `4w`, `8w` | Metal backend provides Apple GPU acceleration. CUDA backend provides NVIDIA GPU acceleration via AOTInductor. @@ -163,6 +164,38 @@ Alternatively, you can build torchao with Metal support while installing ExecuTo EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_executorch.sh ``` +### CUDA-Windows Export + +Before running `cuda-windows` export, make sure these requirements are set up: +- `x86_64-w64-mingw32-g++` is installed and on `PATH` (mingw-w64 cross-compiler). +- `WINDOWS_CUDA_HOME` points to the extracted Windows CUDA package directory. + +Example setup on Ubuntu (refer to [Parakeet README](../../parakeet/README.md) for detailed extraction steps): + +```bash +# Ensure the WINDOWS_CUDA_HOME environment variable is set +export WINDOWS_CUDA_HOME=/opt/cuda-windows/extracted/cuda_cudart/cudart +``` + +Export the model for Windows CUDA (example with int4 quantization): + +```bash +python export_voxtral_rt.py \ + --model-path ~/models/Voxtral-Mini-4B-Realtime-2602 \ + --backend cuda-windows \ + --dtype bf16 \ + --output-dir ./voxtral_rt_exports \ + --qlinear-encoder 4w \ + --qlinear-encoder-packing-format tile_packed_to_4d \ + --qlinear 4w \ + --qlinear-packing-format tile_packed_to_4d \ + --qembedding 8w +``` + +This generates: +- `model.pte` +- `aoti_cuda_blob.ptd` + ### Options | Flag | Default | Description | @@ -220,6 +253,18 @@ make voxtral_realtime-metal This builds ExecuTorch with Metal backend support. The runner binary is at the same path as above. Metal exports can only run on macOS with Apple Silicon. 
+### CUDA-Windows + +On Windows (PowerShell), use CMake workflow presets directly from the executorch root directory. Note that if you exported the model with 4-bit quantization, you may need to specify your GPU's compute capability (e.g., `80;86;89;90;120` for Ampere, Ada Lovelace, Hopper, and Blackwell) to avoid "invalid device function" errors at runtime, as the `int4mm` kernels require SM 80 or newer. + +```powershell +$env:CMAKE_CUDA_ARCHITECTURES="80;86;89;90;120" +cmake --workflow --preset llm-release-cuda +Push-Location examples/models/voxtral_realtime +cmake --workflow --preset voxtral-realtime-cuda +Pop-Location +``` + ## Run The runner requires: @@ -229,22 +274,24 @@ The runner requires: - A 16kHz mono WAV audio file (or live audio via `--mic`) - For CUDA: `aoti_cuda_blob.ptd` — delegate data file (pass via `--data_path`) -```bash -cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner \ - --model_path voxtral_rt_exports/model.pte \ - --tokenizer_path ~/models/Voxtral-Mini-4B-Realtime-2602/tekken.json \ - --preprocessor_path voxtral_rt_exports/preprocessor.pte \ +### Windows (PowerShell) + +```powershell +.\cmake-out\examples\models\voxtral_realtime\Release\voxtral_realtime_runner.exe ` + --model_path voxtral_rt_exports\model.pte ` + --tokenizer_path C:\path\to\tekken.json ` + --preprocessor_path voxtral_rt_exports\preprocessor.pte ` + --audio_path input.wav ``` For CUDA, include the `.ptd` data file: -```bash -cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner \ - --model_path voxtral_rt_exports/model.pte \ - --data_path voxtral_rt_exports/aoti_cuda_blob.ptd \ - --tokenizer_path ~/models/Voxtral-Mini-4B-Realtime-2602/tekken.json \ - --preprocessor_path voxtral_rt_exports/preprocessor.pte \ +```powershell +.\cmake-out\examples\models\voxtral_realtime\Release\voxtral_realtime_runner.exe ` + --model_path voxtral_rt_exports\model.pte ` + --data_path voxtral_rt_exports\aoti_cuda_blob.ptd ` + --tokenizer_path C:\path\to\tekken.json ` + 
--preprocessor_path voxtral_rt_exports\preprocessor.pte ` --audio_path input.wav ``` @@ -252,12 +299,12 @@ For streaming, add `--streaming`. This requires a model exported with `--streaming`. The runner processes audio in 80ms steps, computing mel and running the encoder+decoder incrementally. -```bash -cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner \ - --model_path voxtral_rt_exports/model.pte \ - --tokenizer_path ~/models/Voxtral-Mini-4B-Realtime-2602/tekken.json \ - --preprocessor_path voxtral_rt_exports/preprocessor.pte \ - --audio_path input.wav \ +```powershell +.\cmake-out\examples\models\voxtral_realtime\Release\voxtral_realtime_runner.exe ` + --model_path voxtral_rt_exports\model.pte ` + --tokenizer_path C:\path\to\tekken.json ` + --preprocessor_path voxtral_rt_exports\preprocessor.pte ` + --audio_path input.wav ` --streaming ``` @@ -277,6 +324,17 @@ ffmpeg -f avfoundation -i ":0" -ar 16000 -ac 1 -f f32le -nostats -loglevel error Ctrl+C stops recording and flushes remaining text. +**Windows (PowerShell):** + +```powershell +.\cmake-out\examples\models\voxtral_realtime\Release\voxtral_realtime_runner.exe ` + --model_path C:\path\to\voxtral_rt_exports\model.pte ` + --data_path C:\path\to\voxtral_rt_exports\aoti_cuda_blob.ptd ` + --tokenizer_path C:\path\to\tekken.json ` + --preprocessor_path C:\path\to\voxtral_rt_exports\preprocessor.pte ` + --audio_path C:\path\to\input.wav +``` + **CUDA:** Add `--data_path voxtral_rt_exports/aoti_cuda_blob.ptd` to all run commands above when using the CUDA backend.