Skip to content

Commit 2dd9034

Browse files
authored
Add Windows CUDA CI coverage for Voxtral Realtime (#17852)
### Summary

This PR adds `mistralai/Voxtral-Mini-4B-Realtime-2602` coverage to Windows CUDA CI by following the existing `cuda-windows.yml` pattern (Linux export artifact + Windows runner E2E). The cuda-windows CI job previously covered Voxtral 3B and Parakeet, but not Voxtral Realtime. Voxtral Realtime also needed explicit cuda-windows export support in its exporter to generate Windows-targeted CUDA artifacts.

### What changed

* Updated the Windows CUDA workflow matrix to include Voxtral Realtime with its supported quantization:
  * `quantized-int4-tile-packed`
* Kept Voxtral Realtime in offline mode for CI:
  * passes `vr-offline` in the export job
* Extended the Windows E2E script to support Voxtral Realtime:
  * builds `voxtral_realtime_runner`
  * runs with `--preprocessor_path`, tokenizer, audio, and the CUDA `--data_path`
  * validates the expected runtime output string
* Added `cuda-windows` backend support in `export_voxtral_rt.py`:
  * accepts `--backend cuda-windows`
  * lowers using the CUDA partitioner with a Windows compile spec (`platform=windows`)
  * normalizes model/export logic to CUDA internals where appropriate
* Added export artifact validation:
  * asserts `aoti_cuda_blob.ptd` exists for Voxtral Realtime on `cuda`/`cuda-windows`
1 parent 11c8269 commit 2dd9034

4 files changed

Lines changed: 85 additions & 32 deletions

File tree

.ci/scripts/export_model_artifact.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,6 +296,9 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
296296

297297
test -f "${OUTPUT_DIR}/model.pte"
298298
test -f "${OUTPUT_DIR}/preprocessor.pte"
299+
if [ "$DEVICE" = "cuda" ] || [ "$DEVICE" = "cuda-windows" ]; then
300+
test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
301+
fi
299302
# Copy tokenizer from downloaded model weights
300303
cp "$LOCAL_MODEL_DIR/tekken.json" "${OUTPUT_DIR}/tekken.json"
301304
ls -al "${OUTPUT_DIR}"

.ci/scripts/test_model_e2e_windows.ps1

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,19 @@ switch ($HfModel) {
6464
$audioUrl = "https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav"
6565
$audioFile = "test_audio.wav"
6666
}
67+
"mistralai/Voxtral-Mini-4B-Realtime-2602" {
68+
$runnerTarget = "voxtral_realtime_runner"
69+
$runnerPath = "voxtral_realtime"
70+
$runnerPreset = "voxtral-realtime-cuda"
71+
$expectedOutput = "Loading audio from"
72+
$preprocessor = "preprocessor.pte"
73+
$tokenizerUrl = ""
74+
$tokenizerFile = "tekken.json"
75+
$audioUrl = "https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
76+
$audioFile = "poem.wav"
77+
}
6778
default {
68-
throw "Unsupported model '$HfModel'. Supported: mistralai/Voxtral-Mini-3B-2507, nvidia/parakeet-tdt"
79+
throw "Unsupported model '$HfModel'. Supported: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/parakeet-tdt"
6980
}
7081
}
7182

@@ -171,6 +182,14 @@ try {
171182
"--data_path", $cudaBlob
172183
)
173184
}
185+
"mistralai/Voxtral-Mini-4B-Realtime-2602" {
186+
$runnerArgs += @(
187+
"--temperature", "0",
188+
"--tokenizer_path", (Join-Path -Path $resolvedModelDir -ChildPath $tokenizerFile),
189+
"--audio_path", (Join-Path -Path $resolvedModelDir -ChildPath $audioFile),
190+
"--preprocessor_path", (Join-Path -Path $resolvedModelDir -ChildPath $preprocessor)
191+
)
192+
}
174193
}
175194

176195
$stdoutFile = Join-Path -Path $env:TEMP -ChildPath ("et_runner_stdout_{0}.log" -f ([Guid]::NewGuid().ToString("N")))
@@ -202,10 +221,15 @@ try {
202221
Write-Warning "Runner exited with code $exitCode (may be benign)"
203222
}
204223

205-
if ($expectedOutput -ne "" -and $stdout -notmatch [Regex]::Escape($expectedOutput)) {
206-
throw "Expected output '$expectedOutput' not found in runner output"
224+
if ($expectedOutput -ne "") {
225+
if ($stdout -notmatch [Regex]::Escape($expectedOutput)) {
226+
throw "Expected output '$expectedOutput' not found in runner output"
227+
}
228+
Write-Host "Success: '$expectedOutput' found in output"
229+
}
230+
else {
231+
Write-Host "Success: runner completed"
207232
}
208-
Write-Host "Success: '$expectedOutput' found in output"
209233
Write-Host "::endgroup::"
210234
}
211235
finally {

.github/workflows/cuda-windows.yml

Lines changed: 40 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,22 @@ jobs:
2828
strategy:
2929
fail-fast: false
3030
matrix:
31-
model:
32-
- repo: "mistralai"
33-
name: "Voxtral-Mini-3B-2507"
34-
- repo: "nvidia"
35-
name: "parakeet-tdt"
36-
quant:
37-
- "non-quantized"
38-
- "quantized-int4-weight-only"
31+
include:
32+
- model_repo: "mistralai"
33+
model_name: "Voxtral-Mini-3B-2507"
34+
quant: "non-quantized"
35+
- model_repo: "mistralai"
36+
model_name: "Voxtral-Mini-3B-2507"
37+
quant: "quantized-int4-weight-only"
38+
- model_repo: "nvidia"
39+
model_name: "parakeet-tdt"
40+
quant: "non-quantized"
41+
- model_repo: "nvidia"
42+
model_name: "parakeet-tdt"
43+
quant: "quantized-int4-weight-only"
44+
- model_repo: "mistralai"
45+
model_name: "Voxtral-Mini-4B-Realtime-2602"
46+
quant: "quantized-int4-tile-packed"
3947
with:
4048
timeout: 90
4149
secrets-env: EXECUTORCH_HF_TOKEN
@@ -44,7 +52,7 @@ jobs:
4452
gpu-arch-version: 12.8
4553
docker-image: ci-image:executorch-ubuntu-22.04-cuda-windows
4654
submodules: recursive
47-
upload-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-windows-${{ matrix.quant }}
55+
upload-artifact: ${{ matrix.model_repo }}-${{ matrix.model_name }}-cuda-windows-${{ matrix.quant }}
4856
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
4957
script: |
5058
set -eux
@@ -79,7 +87,11 @@ jobs:
7987
pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
8088
echo "::endgroup::"
8189
82-
source .ci/scripts/export_model_artifact.sh cuda-windows "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
90+
VR_MODE=""
91+
if [ "${{ matrix.model_name }}" = "Voxtral-Mini-4B-Realtime-2602" ]; then
92+
VR_MODE="vr-offline"
93+
fi
94+
source .ci/scripts/export_model_artifact.sh cuda-windows "${{ matrix.model_repo }}/${{ matrix.model_name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" "${VR_MODE}"
8395
8496
test-model-cuda-windows-e2e:
8597
name: test-model-cuda-windows-e2e
@@ -88,21 +100,29 @@ jobs:
88100
strategy:
89101
fail-fast: false
90102
matrix:
91-
model:
92-
- repo: "mistralai"
93-
name: "Voxtral-Mini-3B-2507"
94-
- repo: "nvidia"
95-
name: "parakeet-tdt"
96-
quant:
97-
- "non-quantized"
98-
- "quantized-int4-weight-only"
103+
include:
104+
- model_repo: "mistralai"
105+
model_name: "Voxtral-Mini-3B-2507"
106+
quant: "non-quantized"
107+
- model_repo: "mistralai"
108+
model_name: "Voxtral-Mini-3B-2507"
109+
quant: "quantized-int4-weight-only"
110+
- model_repo: "nvidia"
111+
model_name: "parakeet-tdt"
112+
quant: "non-quantized"
113+
- model_repo: "nvidia"
114+
model_name: "parakeet-tdt"
115+
quant: "quantized-int4-weight-only"
116+
- model_repo: "mistralai"
117+
model_name: "Voxtral-Mini-4B-Realtime-2602"
118+
quant: "quantized-int4-tile-packed"
99119
with:
100120
timeout: 240
101121
runner: windows.g5.4xlarge.nvidia.gpu
102122
gpu-arch-type: cuda
103123
gpu-arch-version: 12.8
104124
submodules: recursive
105-
download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-windows-${{ matrix.quant }}
125+
download-artifact: ${{ matrix.model_repo }}-${{ matrix.model_name }}-cuda-windows-${{ matrix.quant }}
106126
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
107127
script: |
108128
conda init powershell
@@ -122,5 +142,5 @@ jobs:
122142
throw 'RUNNER_ARTIFACT_DIR is empty. Ensure download-artifact is configured for windows_job.yml.'
123143
}
124144
125-
.ci/scripts/test_model_e2e_windows.ps1 -Device cuda-windows -HfModel '${{ matrix.model.repo }}/${{ matrix.model.name }}' -QuantName '${{ matrix.quant }}' -ModelDir \$artifactDir -ExpectedCudaVersion '12.8'
145+
.ci/scripts/test_model_e2e_windows.ps1 -Device cuda-windows -HfModel '${{ matrix.model_repo }}/${{ matrix.model_name }}' -QuantName '${{ matrix.quant }}' -ModelDir \$artifactDir -ExpectedCudaVersion '12.8'
126146
}"

examples/models/voxtral_realtime/export_voxtral_rt.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -404,12 +404,15 @@ def lower_to_executorch(programs, metadata, backend="xnnpack"):
404404
for key in programs:
405405
compile_specs = [MetalBackend.generate_method_name_compile_spec(key)]
406406
partitioner[key] = [MetalPartitioner(compile_specs)]
407-
elif backend == "cuda":
407+
elif backend in ("cuda", "cuda-windows"):
408408
from executorch.backends.cuda.cuda_backend import CudaBackend
409409
from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
410+
from executorch.exir.backend.compile_spec_schema import CompileSpec
410411
from torch._inductor.decomposition import conv1d_to_conv2d
411412

412-
print("\nLowering to ExecuTorch with CUDA...")
413+
print(
414+
f"\nLowering to ExecuTorch with CUDA{' (Windows)' if backend == 'cuda-windows' else ''}..."
415+
)
413416

414417
# Run conv1d decomposition for CUDA backend
415418
updated_programs = {}
@@ -422,6 +425,8 @@ def lower_to_executorch(programs, metadata, backend="xnnpack"):
422425
partitioner = {}
423426
for key in programs:
424427
compile_specs = [CudaBackend.generate_method_name_compile_spec(key)]
428+
if backend == "cuda-windows":
429+
compile_specs.append(CompileSpec("platform", b"windows"))
425430
partitioner[key] = [CudaPartitioner(compile_specs)]
426431
else:
427432
print("\nLowering to ExecuTorch (portable)...")
@@ -463,7 +468,7 @@ def main():
463468
parser.add_argument(
464469
"--backend",
465470
default="xnnpack",
466-
choices=["portable", "xnnpack", "metal", "cuda"],
471+
choices=["portable", "xnnpack", "metal", "cuda", "cuda-windows"],
467472
help="Backend for acceleration (default: xnnpack)",
468473
)
469474
parser.add_argument(
@@ -543,11 +548,12 @@ def main():
543548
help="Model dtype (default: fp32).",
544549
)
545550
args = parser.parse_args()
551+
backend_for_export = "cuda" if args.backend == "cuda-windows" else args.backend
546552

547553
# Validate fpa4w quantization requires Metal backend
548-
if args.qlinear == "fpa4w" and args.backend != "metal":
554+
if args.qlinear == "fpa4w" and backend_for_export != "metal":
549555
parser.error("--qlinear=fpa4w can only be used with --backend=metal")
550-
if args.qlinear_encoder == "fpa4w" and args.backend != "metal":
556+
if args.qlinear_encoder == "fpa4w" and backend_for_export != "metal":
551557
parser.error("--qlinear-encoder=fpa4w can only be used with --backend=metal")
552558

553559
os.makedirs(args.output_dir, exist_ok=True)
@@ -560,11 +566,11 @@ def main():
560566
max_seq_len=args.max_seq_len,
561567
n_delay_tokens=args.delay_tokens,
562568
dtype=model_dtype,
563-
backend=args.backend,
569+
backend=backend_for_export,
564570
)
565571

566572
# Move to CUDA for CUDA backend export (AOTInductor needs CUDA tensors)
567-
if args.backend == "cuda":
573+
if backend_for_export == "cuda":
568574
print("Moving model to CUDA...")
569575
model.cuda()
570576

@@ -585,7 +591,7 @@ def main():
585591
"qlinear_group_size": args.qlinear_group_size,
586592
"qlinear_packing_format": args.qlinear_packing_format,
587593
"qembedding": args.qembedding,
588-
"backend": args.backend,
594+
"backend": backend_for_export,
589595
}
590596
if args.streaming:
591597
programs, metadata = export_streaming(

0 commit comments

Comments
 (0)