Skip to content

Commit 8d1ff1a

Browse files
authored
MoE prefill bf16 perf improvement for qwen-3.5-35B-A3B (#18829)
|                    | Baseline  | Batched    | Speedup        |
|--------------------|-----------|------------|----------------|
| Prefill (1341 tok) | 588 tok/s | 1807 tok/s | 3.07x          |
| Decode (128 tok)   | 90 tok/s  | 86 tok/s   | ~1.0x (noise?) |
1 parent b5cf3c3 commit 8d1ff1a

12 files changed

Lines changed: 1924 additions & 36 deletions

File tree

backends/cuda/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,9 @@ set(_aoti_cuda_shim_sources runtime/shims/memory.cpp
109109

110110
# Only build int4mm shim when CUDA language/toolchain is available.
111111
if(CMAKE_CUDA_COMPILER)
112-
list(APPEND _aoti_cuda_shim_sources runtime/shims/int4mm.cu)
112+
list(APPEND _aoti_cuda_shim_sources runtime/shims/int4mm.cu
113+
runtime/shims/sort.cu
114+
)
113115
endif()
114116

115117
add_library(aoti_cuda_shims SHARED ${_aoti_cuda_shim_sources})

0 commit comments

Comments
 (0)