Skip to content

Commit 6ad5f75

Browse files
authored
Voxtral Realtime: enable streaming mode in CUDA CI (#17844)
Remove the vr-offline override so that the CUDA CI runs Voxtral Realtime in streaming mode (the default). The streaming encoder path exercises the full pipeline, including the ring-buffer KV cache and incremental mel processing.
1 parent 5ddbab2 commit 6ad5f75

1 file changed

Lines changed: 4 additions & 14 deletions

File tree

.github/workflows/cuda.yml

Lines changed: 4 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -159,7 +159,7 @@ jobs:
159159
repo: "google"
160160
name: "gemma-3-4b-it"
161161
quant: "quantized-int4-weight-only"
162-
# Voxtral Realtime only supports int4-tile-packed on CUDA (offline mode)
162+
# Voxtral Realtime only supports int4-tile-packed on CUDA
163163
- model:
164164
repo: "mistralai"
165165
name: "Voxtral-Mini-4B-Realtime-2602"
@@ -197,12 +197,7 @@ jobs:
197197
echo "::endgroup::"
198198
fi
199199
200-
# Voxtral Realtime uses offline mode for CUDA CI (not streaming)
201-
VR_MODE=""
202-
if [ "${{ matrix.model.name }}" = "Voxtral-Mini-4B-Realtime-2602" ]; then
203-
VR_MODE="vr-offline"
204-
fi
205-
source .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" "$VR_MODE"
200+
source .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
206201
207202
test-model-cuda-e2e:
208203
name: test-model-cuda-e2e
@@ -237,7 +232,7 @@ jobs:
237232
repo: "google"
238233
name: "gemma-3-4b-it"
239234
quant: "quantized-int4-weight-only"
240-
# Voxtral Realtime only supports int4-tile-packed on CUDA (offline mode)
235+
# Voxtral Realtime only supports int4-tile-packed on CUDA
241236
- model:
242237
repo: "mistralai"
243238
name: "Voxtral-Mini-4B-Realtime-2602"
@@ -256,12 +251,7 @@ jobs:
256251
download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-${{ matrix.quant }}
257252
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
258253
script: |
259-
# Voxtral Realtime uses offline mode for CUDA CI (not streaming)
260-
VR_MODE=""
261-
if [ "${{ matrix.model.name }}" = "Voxtral-Mini-4B-Realtime-2602" ]; then
262-
VR_MODE="vr-offline"
263-
fi
264-
source .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" "$VR_MODE"
254+
source .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
265255
266256
test-cuda-pybind:
267257
name: test-cuda-pybind

0 commit comments

Comments
 (0)