Skip to content

Commit 2dd9034

Browse files
authored
Add Windows CUDA CI coverage for Voxtral Realtime (#17852)
### Summary

This PR adds `mistralai/Voxtral-Mini-4B-Realtime-2602` coverage to Windows CUDA CI by following the existing `cuda-windows.yml` pattern (Linux export artifact + Windows runner E2E). The cuda-windows CI job previously covered Voxtral 3B and Parakeet, but not Voxtral Realtime. Voxtral Realtime also needed explicit cuda-windows export support in its exporter to generate Windows-targeted CUDA artifacts.

### What changed

* Updated the Windows CUDA workflow matrix to include Voxtral Realtime with its supported quantization:
  * `quantized-int4-tile-packed`
* Kept Voxtral Realtime in offline mode for CI:
  * passes `vr-offline` in the export job
* Extended the Windows E2E script to support Voxtral Realtime:
  * builds `voxtral_realtime_runner`
  * runs with `--preprocessor_path`, tokenizer, audio, and the CUDA `--data_path`
  * validates the expected runtime output string
* Added `cuda-windows` backend support in `export_voxtral_rt.py`:
  * accepts `--backend cuda-windows`
  * lowers using the CUDA partitioner with a Windows compile spec (`platform=windows`)
  * normalizes model/export logic to CUDA internals where appropriate
* Added export artifact validation:
  * asserts `aoti_cuda_blob.ptd` exists for Voxtral Realtime on `cuda`/`cuda-windows`
1 parent 11c8269 commit 2dd9034

4 files changed

Lines changed: 85 additions & 32 deletions

File tree

.ci/scripts/export_model_artifact.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,9 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
296296

297297
test -f "${OUTPUT_DIR}/model.pte"
298298
test -f "${OUTPUT_DIR}/preprocessor.pte"
299+
if [ "$DEVICE" = "cuda" ] || [ "$DEVICE" = "cuda-windows" ]; then
300+
test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
301+
fi
299302
# Copy tokenizer from downloaded model weights
300303
cp "$LOCAL_MODEL_DIR/tekken.json" "${OUTPUT_DIR}/tekken.json"
301304
ls -al "${OUTPUT_DIR}"

.ci/scripts/test_model_e2e_windows.ps1

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,19 @@ switch ($HfModel) {
6464
$audioUrl = "https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav"
6565
$audioFile = "test_audio.wav"
6666
}
67+
"mistralai/Voxtral-Mini-4B-Realtime-2602" {
68+
$runnerTarget = "voxtral_realtime_runner"
69+
$runnerPath = "voxtral_realtime"
70+
$runnerPreset = "voxtral-realtime-cuda"
71+
$expectedOutput = "Loading audio from"
72+
$preprocessor = "preprocessor.pte"
73+
$tokenizerUrl = ""
74+
$tokenizerFile = "tekken.json"
75+
$audioUrl = "https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
76+
$audioFile = "poem.wav"
77+
}
6778
default {
68-
throw "Unsupported model '$HfModel'. Supported: mistralai/Voxtral-Mini-3B-2507, nvidia/parakeet-tdt"
79+
throw "Unsupported model '$HfModel'. Supported: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/parakeet-tdt"
6980
}
7081
}
7182

@@ -171,6 +182,14 @@ try {
171182
"--data_path", $cudaBlob
172183
)
173184
}
185+
"mistralai/Voxtral-Mini-4B-Realtime-2602" {
186+
$runnerArgs += @(
187+
"--temperature", "0",
188+
"--tokenizer_path", (Join-Path -Path $resolvedModelDir -ChildPath $tokenizerFile),
189+
"--audio_path", (Join-Path -Path $resolvedModelDir -ChildPath $audioFile),
190+
"--preprocessor_path", (Join-Path -Path $resolvedModelDir -ChildPath $preprocessor)
191+
)
192+
}
174193
}
175194

176195
$stdoutFile = Join-Path -Path $env:TEMP -ChildPath ("et_runner_stdout_{0}.log" -f ([Guid]::NewGuid().ToString("N")))
@@ -202,10 +221,15 @@ try {
202221
Write-Warning "Runner exited with code $exitCode (may be benign)"
203222
}
204223

205-
if ($expectedOutput -ne "" -and $stdout -notmatch [Regex]::Escape($expectedOutput)) {
206-
throw "Expected output '$expectedOutput' not found in runner output"
224+
if ($expectedOutput -ne "") {
225+
if ($stdout -notmatch [Regex]::Escape($expectedOutput)) {
226+
throw "Expected output '$expectedOutput' not found in runner output"
227+
}
228+
Write-Host "Success: '$expectedOutput' found in output"
229+
}
230+
else {
231+
Write-Host "Success: runner completed"
207232
}
208-
Write-Host "Success: '$expectedOutput' found in output"
209233
Write-Host "::endgroup::"
210234
}
211235
finally {

.github/workflows/cuda-windows.yml

Lines changed: 40 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,22 @@ jobs:
2828
strategy:
2929
fail-fast: false
3030
matrix:
31-
model:
32-
- repo: "mistralai"
33-
name: "Voxtral-Mini-3B-2507"
34-
- repo: "nvidia"
35-
name: "parakeet-tdt"
36-
quant:
37-
- "non-quantized"
38-
- "quantized-int4-weight-only"
31+
include:
32+
- model_repo: "mistralai"
33+
model_name: "Voxtral-Mini-3B-2507"
34+
quant: "non-quantized"
35+
- model_repo: "mistralai"
36+
model_name: "Voxtral-Mini-3B-2507"
37+
quant: "quantized-int4-weight-only"
38+
- model_repo: "nvidia"
39+
model_name: "parakeet-tdt"
40+
quant: "non-quantized"
41+
- model_repo: "nvidia"
42+
model_name: "parakeet-tdt"
43+
quant: "quantized-int4-weight-only"
44+
- model_repo: "mistralai"
45+
model_name: "Voxtral-Mini-4B-Realtime-2602"
46+
quant: "quantized-int4-tile-packed"
3947
with:
4048
timeout: 90
4149
secrets-env: EXECUTORCH_HF_TOKEN
@@ -44,7 +52,7 @@ jobs:
4452
gpu-arch-version: 12.8
4553
docker-image: ci-image:executorch-ubuntu-22.04-cuda-windows
4654
submodules: recursive
47-
upload-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-windows-${{ matrix.quant }}
55+
upload-artifact: ${{ matrix.model_repo }}-${{ matrix.model_name }}-cuda-windows-${{ matrix.quant }}
4856
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
4957
script: |
5058
set -eux
@@ -79,7 +87,11 @@ jobs:
7987
pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
8088
echo "::endgroup::"
8189
82-
source .ci/scripts/export_model_artifact.sh cuda-windows "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
90+
VR_MODE=""
91+
if [ "${{ matrix.model_name }}" = "Voxtral-Mini-4B-Realtime-2602" ]; then
92+
VR_MODE="vr-offline"
93+
fi
94+
source .ci/scripts/export_model_artifact.sh cuda-windows "${{ matrix.model_repo }}/${{ matrix.model_name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" "${VR_MODE}"
8395
8496
test-model-cuda-windows-e2e:
8597
name: test-model-cuda-windows-e2e
@@ -88,21 +100,29 @@ jobs:
88100
strategy:
89101
fail-fast: false
90102
matrix:
91-
model:
92-
- repo: "mistralai"
93-
name: "Voxtral-Mini-3B-2507"
94-
- repo: "nvidia"
95-
name: "parakeet-tdt"
96-
quant:
97-
- "non-quantized"
98-
- "quantized-int4-weight-only"
103+
include:
104+
- model_repo: "mistralai"
105+
model_name: "Voxtral-Mini-3B-2507"
106+
quant: "non-quantized"
107+
- model_repo: "mistralai"
108+
model_name: "Voxtral-Mini-3B-2507"
109+
quant: "quantized-int4-weight-only"
110+
- model_repo: "nvidia"
111+
model_name: "parakeet-tdt"
112+
quant: "non-quantized"
113+
- model_repo: "nvidia"
114+
model_name: "parakeet-tdt"
115+
quant: "quantized-int4-weight-only"
116+
- model_repo: "mistralai"
117+
model_name: "Voxtral-Mini-4B-Realtime-2602"
118+
quant: "quantized-int4-tile-packed"
99119
with:
100120
timeout: 240
101121
runner: windows.g5.4xlarge.nvidia.gpu
102122
gpu-arch-type: cuda
103123
gpu-arch-version: 12.8
104124
submodules: recursive
105-
download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-windows-${{ matrix.quant }}
125+
download-artifact: ${{ matrix.model_repo }}-${{ matrix.model_name }}-cuda-windows-${{ matrix.quant }}
106126
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
107127
script: |
108128
conda init powershell
@@ -122,5 +142,5 @@ jobs:
122142
throw 'RUNNER_ARTIFACT_DIR is empty. Ensure download-artifact is configured for windows_job.yml.'
123143
}
124144
125-
.ci/scripts/test_model_e2e_windows.ps1 -Device cuda-windows -HfModel '${{ matrix.model.repo }}/${{ matrix.model.name }}' -QuantName '${{ matrix.quant }}' -ModelDir \$artifactDir -ExpectedCudaVersion '12.8'
145+
.ci/scripts/test_model_e2e_windows.ps1 -Device cuda-windows -HfModel '${{ matrix.model_repo }}/${{ matrix.model_name }}' -QuantName '${{ matrix.quant }}' -ModelDir \$artifactDir -ExpectedCudaVersion '12.8'
126146
}"

examples/models/voxtral_realtime/export_voxtral_rt.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -404,12 +404,15 @@ def lower_to_executorch(programs, metadata, backend="xnnpack"):
404404
for key in programs:
405405
compile_specs = [MetalBackend.generate_method_name_compile_spec(key)]
406406
partitioner[key] = [MetalPartitioner(compile_specs)]
407-
elif backend == "cuda":
407+
elif backend in ("cuda", "cuda-windows"):
408408
from executorch.backends.cuda.cuda_backend import CudaBackend
409409
from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
410+
from executorch.exir.backend.compile_spec_schema import CompileSpec
410411
from torch._inductor.decomposition import conv1d_to_conv2d
411412

412-
print("\nLowering to ExecuTorch with CUDA...")
413+
print(
414+
f"\nLowering to ExecuTorch with CUDA{' (Windows)' if backend == 'cuda-windows' else ''}..."
415+
)
413416

414417
# Run conv1d decomposition for CUDA backend
415418
updated_programs = {}
@@ -422,6 +425,8 @@ def lower_to_executorch(programs, metadata, backend="xnnpack"):
422425
partitioner = {}
423426
for key in programs:
424427
compile_specs = [CudaBackend.generate_method_name_compile_spec(key)]
428+
if backend == "cuda-windows":
429+
compile_specs.append(CompileSpec("platform", b"windows"))
425430
partitioner[key] = [CudaPartitioner(compile_specs)]
426431
else:
427432
print("\nLowering to ExecuTorch (portable)...")
@@ -463,7 +468,7 @@ def main():
463468
parser.add_argument(
464469
"--backend",
465470
default="xnnpack",
466-
choices=["portable", "xnnpack", "metal", "cuda"],
471+
choices=["portable", "xnnpack", "metal", "cuda", "cuda-windows"],
467472
help="Backend for acceleration (default: xnnpack)",
468473
)
469474
parser.add_argument(
@@ -543,11 +548,12 @@ def main():
543548
help="Model dtype (default: fp32).",
544549
)
545550
args = parser.parse_args()
551+
backend_for_export = "cuda" if args.backend == "cuda-windows" else args.backend
546552

547553
# Validate fpa4w quantization requires Metal backend
548-
if args.qlinear == "fpa4w" and args.backend != "metal":
554+
if args.qlinear == "fpa4w" and backend_for_export != "metal":
549555
parser.error("--qlinear=fpa4w can only be used with --backend=metal")
550-
if args.qlinear_encoder == "fpa4w" and args.backend != "metal":
556+
if args.qlinear_encoder == "fpa4w" and backend_for_export != "metal":
551557
parser.error("--qlinear-encoder=fpa4w can only be used with --backend=metal")
552558

553559
os.makedirs(args.output_dir, exist_ok=True)
@@ -560,11 +566,11 @@ def main():
560566
max_seq_len=args.max_seq_len,
561567
n_delay_tokens=args.delay_tokens,
562568
dtype=model_dtype,
563-
backend=args.backend,
569+
backend=backend_for_export,
564570
)
565571

566572
# Move to CUDA for CUDA backend export (AOTInductor needs CUDA tensors)
567-
if args.backend == "cuda":
573+
if backend_for_export == "cuda":
568574
print("Moving model to CUDA...")
569575
model.cuda()
570576

@@ -585,7 +591,7 @@ def main():
585591
"qlinear_group_size": args.qlinear_group_size,
586592
"qlinear_packing_format": args.qlinear_packing_format,
587593
"qembedding": args.qembedding,
588-
"backend": args.backend,
594+
"backend": backend_for_export,
589595
}
590596
if args.streaming:
591597
programs, metadata = export_streaming(

0 commit comments

Comments
 (0)