Skip to content

Commit 5193141

Browse files
authored
Voxtral Realtime: enable CUDA backend with int4 quantization (#17798)
1 parent 0a180ca commit 5193141

12 files changed

Lines changed: 580 additions & 116 deletions

File tree

.ci/scripts/export_model_artifact.sh

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -257,10 +257,14 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
257257

258258
# Per-component quantization flags
259259
VR_QUANT_ARGS=""
260+
VR_DTYPE_ARGS=""
260261
if [ "$QUANT_NAME" = "quantized-8da4w" ]; then
261262
VR_QUANT_ARGS="--qlinear-encoder 8da4w --qlinear 8da4w --qlinear-group-size 32 --qembedding 8w"
262263
elif [ "$QUANT_NAME" = "quantized-int4-metal" ]; then
263264
VR_QUANT_ARGS="--qlinear-encoder fpa4w --qlinear fpa4w"
265+
elif [ "$QUANT_NAME" = "quantized-int4-tile-packed" ]; then
266+
VR_QUANT_ARGS="--qlinear-encoder 4w --qlinear-encoder-packing-format tile_packed_to_4d --qlinear 4w --qlinear-packing-format tile_packed_to_4d --qembedding 8w"
267+
VR_DTYPE_ARGS="--dtype bf16"
264268
fi
265269

266270
# Determine streaming mode based on MODE parameter
@@ -284,7 +288,8 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
284288
--backend "$DEVICE" \
285289
${STREAMING_ARG} \
286290
--output-dir "${OUTPUT_DIR}" \
287-
${VR_QUANT_ARGS}
291+
${VR_QUANT_ARGS} \
292+
${VR_DTYPE_ARGS}
288293

289294
# Export preprocessor
290295
python -m executorch.extension.audio.mel_spectrogram ${PREPROCESSOR_ARGS}

.ci/scripts/test_model_e2e.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,10 @@ EOF
298298
;;
299299
voxtral_realtime)
300300
RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --preprocessor_path ${MODEL_DIR}/$PREPROCESSOR --audio_path ${MODEL_DIR}/$AUDIO_FILE --temperature 0"
301+
# Add CUDA data path if present
302+
if [ "$DEVICE" = "cuda" ] && [ -f "${MODEL_DIR}/aoti_cuda_blob.ptd" ]; then
303+
RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
304+
fi
301305
# Determine streaming mode based on MODE parameter
302306
USE_STREAMING="true"
303307
if [ "$MODE" = "vr-offline" ]; then

.github/workflows/cuda.yml

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,8 @@ jobs:
137137
model:
138138
- repo: "mistralai"
139139
name: "Voxtral-Mini-3B-2507"
140+
- repo: "mistralai"
141+
name: "Voxtral-Mini-4B-Realtime-2602"
140142
- repo: "openai"
141143
name: "whisper-small"
142144
- repo: "openai"
@@ -157,6 +159,15 @@ jobs:
157159
repo: "google"
158160
name: "gemma-3-4b-it"
159161
quant: "quantized-int4-weight-only"
162+
# Voxtral Realtime only supports int4-tile-packed on CUDA (offline mode)
163+
- model:
164+
repo: "mistralai"
165+
name: "Voxtral-Mini-4B-Realtime-2602"
166+
quant: "non-quantized"
167+
- model:
168+
repo: "mistralai"
169+
name: "Voxtral-Mini-4B-Realtime-2602"
170+
quant: "quantized-int4-weight-only"
160171
with:
161172
timeout: 90
162173
secrets-env: EXECUTORCH_HF_TOKEN
@@ -186,7 +197,12 @@ jobs:
186197
echo "::endgroup::"
187198
fi
188199
189-
source .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
200+
# Voxtral Realtime uses offline mode for CUDA CI (not streaming)
201+
VR_MODE=""
202+
if [ "${{ matrix.model.name }}" = "Voxtral-Mini-4B-Realtime-2602" ]; then
203+
VR_MODE="vr-offline"
204+
fi
205+
source .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" "$VR_MODE"
190206
191207
test-model-cuda-e2e:
192208
name: test-model-cuda-e2e
@@ -201,6 +217,8 @@ jobs:
201217
model:
202218
- repo: "mistralai"
203219
name: "Voxtral-Mini-3B-2507"
220+
- repo: "mistralai"
221+
name: "Voxtral-Mini-4B-Realtime-2602"
204222
- repo: "openai"
205223
name: "whisper-small"
206224
- repo: "openai"
@@ -219,6 +237,15 @@ jobs:
219237
repo: "google"
220238
name: "gemma-3-4b-it"
221239
quant: "quantized-int4-weight-only"
240+
# Voxtral Realtime only supports int4-tile-packed on CUDA (offline mode)
241+
- model:
242+
repo: "mistralai"
243+
name: "Voxtral-Mini-4B-Realtime-2602"
244+
quant: "non-quantized"
245+
- model:
246+
repo: "mistralai"
247+
name: "Voxtral-Mini-4B-Realtime-2602"
248+
quant: "quantized-int4-weight-only"
222249
with:
223250
timeout: 90
224251
runner: linux.g5.4xlarge.nvidia.gpu
@@ -229,7 +256,12 @@ jobs:
229256
download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-${{ matrix.quant }}
230257
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
231258
script: |
232-
source .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
259+
# Voxtral Realtime uses offline mode for CUDA CI (not streaming)
260+
VR_MODE=""
261+
if [ "${{ matrix.model.name }}" = "Voxtral-Mini-4B-Realtime-2602" ]; then
262+
VR_MODE="vr-offline"
263+
fi
264+
source .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" "$VR_MODE"
233265
234266
test-cuda-pybind:
235267
name: test-cuda-pybind

Makefile

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
# SUPPORTED MODELS:
1616
# -----------------
1717
# - voxtral: Multimodal voice + text model (CPU, CUDA, Metal)
18-
# - voxtral_realtime: Realtime speech-to-text model (CPU)
18+
# - voxtral_realtime: Realtime speech-to-text model (CPU, CUDA, Metal)
1919
# - whisper: Speech recognition model (CPU, CUDA, Metal)
2020
# - parakeet: Speech recognition model (CPU, CUDA, Metal)
2121
# - sortformer: Speaker diarization model (CPU)
@@ -91,13 +91,14 @@
9191
#
9292
# ==============================================================================
9393

94-
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
94+
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
9595

9696
help:
9797
@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
9898
@echo " voxtral-cuda - Build Voxtral runner with CUDA backend"
9999
@echo " voxtral-cpu - Build Voxtral runner with CPU backend"
100100
@echo " voxtral-metal - Build Voxtral runner with Metal backend (macOS only)"
101+
@echo " voxtral_realtime-cuda - Build Voxtral Realtime runner with CUDA backend"
101102
@echo " voxtral_realtime-cpu - Build Voxtral Realtime runner with CPU backend"
102103
@echo " voxtral_realtime-metal - Build Voxtral Realtime runner with Metal backend (macOS only)"
103104
@echo " whisper-cuda - Build Whisper runner with CUDA backend"
@@ -244,6 +245,15 @@ voxtral_realtime-metal:
244245
@echo "✓ Build complete!"
245246
@echo " Binary: cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner"
246247

248+
voxtral_realtime-cuda:
249+
@echo "==> Building and installing ExecuTorch with CUDA..."
250+
cmake --workflow --preset llm-release-cuda
251+
@echo "==> Building Voxtral Realtime runner with CUDA..."
252+
cd examples/models/voxtral_realtime && cmake --workflow --preset voxtral-realtime-cuda
253+
@echo ""
254+
@echo "✓ Build complete!"
255+
@echo " Binary: cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner"
256+
247257
silero-vad-cpu:
248258
@echo "==> Building and installing ExecuTorch..."
249259
cmake --workflow --preset llm-release

examples/models/voxtral_realtime/CMakePresets.json

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,19 @@
2828
"type": "equals",
2929
"rhs": "Darwin"
3030
}
31+
},
32+
{
33+
"name": "voxtral-realtime-cuda",
34+
"displayName": "Voxtral Realtime runner (CUDA)",
35+
"inherits": ["voxtral-realtime-base"],
36+
"cacheVariables": {
37+
"EXECUTORCH_BUILD_CUDA": "ON"
38+
},
39+
"condition": {
40+
"type": "inList",
41+
"string": "${hostSystemName}",
42+
"list": ["Linux", "Windows"]
43+
}
3144
}
3245
],
3346
"buildPresets": [
@@ -43,6 +56,12 @@
4356
"configurePreset": "voxtral-realtime-metal",
4457
"configuration": "Release",
4558
"targets": ["voxtral_realtime_runner"]
59+
},
60+
{
61+
"name": "voxtral-realtime-cuda",
62+
"displayName": "Build Voxtral Realtime runner (CUDA)",
63+
"configurePreset": "voxtral-realtime-cuda",
64+
"targets": ["voxtral_realtime_runner"]
4665
}
4766
],
4867
"workflowPresets": [
@@ -73,6 +92,20 @@
7392
"name": "voxtral-realtime-metal"
7493
}
7594
]
95+
},
96+
{
97+
"name": "voxtral-realtime-cuda",
98+
"displayName": "Configure and build Voxtral Realtime runner (CUDA)",
99+
"steps": [
100+
{
101+
"type": "configure",
102+
"name": "voxtral-realtime-cuda"
103+
},
104+
{
105+
"type": "build",
106+
"name": "voxtral-realtime-cuda"
107+
}
108+
]
76109
}
77110
]
78111
}

examples/models/voxtral_realtime/README.md

Lines changed: 65 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,43 @@ python export_voxtral_rt.py \
8888
|---------|---------|-----------|--------------|
8989
| `xnnpack` | ✓ | ✓ | `4w`, `8w`, `8da4w`, `8da8w` |
9090
| `metal` | ✓ | ✓ | none (fp32) or `fpa4w` (Metal-specific 4-bit) |
91+
| `cuda` | ✓ | ✓ | `4w`, `8w` |
9192

92-
Metal backend provides Apple GPU acceleration.
93+
Metal backend provides Apple GPU acceleration. CUDA backend provides NVIDIA GPU
94+
acceleration via AOTInductor.
95+
96+
#### CUDA export examples
97+
98+
Offline with int4 quantization:
99+
100+
```bash
101+
python export_voxtral_rt.py \
102+
--model-path ~/models/Voxtral-Mini-4B-Realtime-2602 \
103+
--backend cuda \
104+
--dtype bf16 \
105+
--output-dir ./voxtral_rt_exports \
106+
--qlinear-encoder 4w \
107+
--qlinear-encoder-packing-format tile_packed_to_4d \
108+
--qlinear 4w \
109+
--qlinear-packing-format tile_packed_to_4d \
110+
--qembedding 8w
111+
```
112+
113+
Streaming with int4 quantization:
114+
115+
```bash
116+
python export_voxtral_rt.py \
117+
--model-path ~/models/Voxtral-Mini-4B-Realtime-2602 \
118+
--backend cuda \
119+
--dtype bf16 \
120+
--streaming \
121+
--output-dir ./voxtral_rt_exports \
122+
--qlinear-encoder 4w \
123+
--qlinear-encoder-packing-format tile_packed_to_4d \
124+
--qlinear 4w \
125+
--qlinear-packing-format tile_packed_to_4d \
126+
--qembedding 8w
127+
```
93128

94129
#### Metal export examples
95130

@@ -133,14 +168,17 @@ EXECUTORCH_BUILD_KERNELS_TORCHAO=1 TORCHAO_BUILD_EXPERIMENTAL_MPS=1 ./install_ex
133168
| Flag | Default | Description |
134169
|------|---------|-------------|
135170
| `--model-path` | (required) | Directory with `params.json` + `consolidated.safetensors` |
136-
| `--backend` | `xnnpack` | `xnnpack`, `metal`, or `portable` |
171+
| `--backend` | `xnnpack` | `xnnpack`, `metal`, `cuda`, or `portable` |
172+
| `--dtype` | `fp32` | Model dtype: `fp32` or `bf16` |
137173
| `--output-dir` | `./voxtral_rt_exports` | Output directory |
138174
| `--max-seq-len` | `4096` | KV cache length |
139175
| `--delay-tokens` | `6` | Transcription delay in tokens (6 = 480ms) |
140176
| `--qlinear` | (none) | Decoder linear layer quantization (`4w`, `8w`, `8da4w`, `8da8w`, `fpa4w`) |
141177
| `--qlinear-group-size` | `32` | Group size for decoder linear quantization |
178+
| `--qlinear-packing-format` | (none) | Packing format for decoder 4w quantization (`tile_packed_to_4d` for CUDA) |
142179
| `--qlinear-encoder` | (none) | Encoder linear layer quantization (`4w`, `8w`, `8da4w`, `8da8w`, `fpa4w`) |
143180
| `--qlinear-encoder-group-size` | `32` | Group size for encoder linear quantization |
181+
| `--qlinear-encoder-packing-format` | (none) | Packing format for encoder 4w quantization (`tile_packed_to_4d` for CUDA) |
144182
| `--qembedding` | (none) | Embedding layer quantization (`8w`) |
145183
| `--streaming` | off | Export streaming encoder with KV cache |
146184
| `--max-enc-len` | `750` | Encoder sliding window size (streaming only) |
@@ -164,6 +202,15 @@ make voxtral_realtime-cpu
164202
This builds ExecuTorch core libraries with XNNPACK, then the runner binary
165203
at `cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner`.
166204

205+
### CUDA (NVIDIA GPU)
206+
207+
```bash
208+
make voxtral_realtime-cuda
209+
```
210+
211+
This builds ExecuTorch with CUDA backend support. The runner binary is at
212+
the same path as above. Requires an NVIDIA GPU and an installed CUDA toolkit.
213+
167214
### Metal (Apple GPU)
168215

169216
```bash
@@ -180,10 +227,22 @@ The runner requires:
180227
- `tekken.json` — tokenizer from the model weights directory
181228
- `preprocessor.pte` — mel spectrogram preprocessor (see [Preprocessor](#preprocessor))
182229
- A 16kHz mono WAV audio file (or live audio via `--mic`)
230+
- For CUDA: `aoti_cuda_blob.ptd` — delegate data file (pass via `--data_path`)
231+
232+
```bash
233+
cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner \
234+
--model_path voxtral_rt_exports/model.pte \
235+
--tokenizer_path ~/models/Voxtral-Mini-4B-Realtime-2602/tekken.json \
236+
--preprocessor_path voxtral_rt_exports/preprocessor.pte \
237+
--audio_path input.wav
238+
```
239+
240+
For CUDA, include the `.ptd` data file:
183241

184242
```bash
185243
cmake-out/examples/models/voxtral_realtime/voxtral_realtime_runner \
186244
--model_path voxtral_rt_exports/model.pte \
245+
--data_path voxtral_rt_exports/aoti_cuda_blob.ptd \
187246
--tokenizer_path ~/models/Voxtral-Mini-4B-Realtime-2602/tekken.json \
188247
--preprocessor_path voxtral_rt_exports/preprocessor.pte \
189248
--audio_path input.wav
@@ -218,9 +277,13 @@ ffmpeg -f avfoundation -i ":0" -ar 16000 -ac 1 -f f32le -nostats -loglevel error
218277

219278
Ctrl+C stops recording and flushes remaining text.
220279

280+
**CUDA:** Add `--data_path voxtral_rt_exports/aoti_cuda_blob.ptd` to all
281+
run commands above when using the CUDA backend.
282+
221283
| Flag | Default | Description |
222284
|------|---------|-------------|
223285
| `--model_path` | `model.pte` | Path to exported model |
286+
| `--data_path` | (none) | Path to delegate data file (`.ptd`, required for CUDA) |
224287
| `--tokenizer_path` | `tekken.json` | Path to Tekken tokenizer |
225288
| `--preprocessor_path` | (none) | Path to mel preprocessor `.pte` |
226289
| `--audio_path` | (none) | Path to 16kHz mono WAV file |

0 commit comments

Comments
 (0)