From dddfcd61fc9ff77715b902d2714282c366dcc9c2 Mon Sep 17 00:00:00 2001 From: Chuck Walbourn Date: Thu, 14 May 2026 11:58:33 -0700 Subject: [PATCH 1/5] SIMD implementation for XMLoadFloat3SE/XMStoreFloat3SE --- Inc/DirectXPackedVector.inl | 130 ++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) diff --git a/Inc/DirectXPackedVector.inl b/Inc/DirectXPackedVector.inl index c92cf44..03faa38 100644 --- a/Inc/DirectXPackedVector.inl +++ b/Inc/DirectXPackedVector.inl @@ -1279,6 +1279,8 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat3SE(const XMFLOAT3SE* pSource) noexcept { assert(pSource); +#if defined(_XM_NO_INTRINSICS_) + union { float f; int32_t i; } fi; fi.i = 0x33800000 + (pSource->e << 23); float Scale = fi.f; @@ -1289,6 +1291,45 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat3SE(const XMFLOAT3SE* pSource) noexcept Scale * float(pSource->zm), 1.0f } } }; return v; + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + + uint32_t v = pSource->v; + + // Build scale factor from shared exponent + union { float f; int32_t i; } fi; + fi.i = 0x33800000 + (static_cast(v >> 27) << 23); + + // Extract 9-bit mantissas into vector lanes + uint32x4_t mantissas = vdupq_n_u32(0); + mantissas = vsetq_lane_u32(v & 0x1FFu, mantissas, 0); + mantissas = vsetq_lane_u32((v >> 9) & 0x1FFu, mantissas, 1); + mantissas = vsetq_lane_u32((v >> 18) & 0x1FFu, mantissas, 2); + + // Convert to float, scale, and set w = 1.0f + float32x4_t result = vmulq_n_f32(vcvtq_f32_u32(mantissas), fi.f); + return vsetq_lane_f32(1.0f, result, 3); + +#elif defined(_XM_SSE_INTRINSICS_) + + uint32_t v = pSource->v; + + // Build scale factor from shared exponent + union { float f; int32_t i; } fi; + fi.i = 0x33800000 + (static_cast(v >> 27) << 23); + + // Extract 9-bit mantissas, convert to float, and scale + __m128i mantissas = _mm_set_epi32( + 0, + static_cast((v >> 18) & 0x1FF), + static_cast((v >> 9) & 0x1FF), + static_cast(v & 0x1FF)); + __m128 result = _mm_mul_ps(_mm_cvtepi32_ps(mantissas), _mm_set1_ps(fi.f)); + + // Set 
w = 1.0f (w lane is +0.0f so bitwise OR inserts 1.0f cleanly) + return _mm_or_ps(result, g_XMIdentityR3); + +#endif } //------------------------------------------------------------------------------ @@ -2639,6 +2680,8 @@ inline void XM_CALLCONV XMStoreFloat3SE { assert(pDestination); +#if defined(_XM_NO_INTRINSICS_) + XMFLOAT3A tmp; XMStoreFloat3A(&tmp, V); @@ -2667,6 +2710,93 @@ inline void XM_CALLCONV XMStoreFloat3SE pDestination->xm = static_cast(MathInternal::round_to_nearest(x * ScaleR)); pDestination->ym = static_cast(MathInternal::round_to_nearest(y * ScaleR)); pDestination->zm = static_cast(MathInternal::round_to_nearest(z * ScaleR)); + +#elif defined(_XM_ARM_NEON_INTRINSICS_) + + static const XMVECTORF32 MaxFloat9 = { { { float(0x1FF << 7), float(0x1FF << 7), float(0x1FF << 7), float(0x1FF << 7) } } }; + static constexpr float minf9 = float(1.f / (1 << 16)); + + // Clamp to [0, maxf9] then zero w lane + float32x4_t clamped = vminq_f32(vmaxq_f32(V, vdupq_n_f32(0)), MaxFloat9); + clamped = vsetq_lane_f32(0.0f, clamped, 3); + + // Horizontal max of xyz for shared exponent +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + float maxVal = vmaxvq_f32(clamped); +#else + float32x2_t vlow = vget_low_f32(clamped); + float32x2_t vhigh = vget_high_f32(clamped); + float32x2_t maxPair = vpmax_f32(vlow, vhigh); + maxPair = vpmax_f32(maxPair, maxPair); + float maxVal = vget_lane_f32(maxPair, 0); +#endif + + if (maxVal < minf9) maxVal = minf9; + + // Compute shared exponent (inherently scalar) + union { float f; int32_t i; } fi; + fi.f = maxVal; + fi.i += 0x00004000; // round up leaving 9 bits in fraction (including assumed 1) + + auto exp = static_cast(fi.i) >> 23; + fi.i = static_cast(0x83000000 - (exp << 23)); + + // Scale all channels and convert to integer + float32x4_t scaled = vmulq_n_f32(clamped, fi.f); +#if defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __aarch64__ + uint32x4_t ints = 
vcvtnq_u32_f32(scaled); +#else + scaled = vaddq_f32(scaled, vdupq_n_f32(0.5f)); + uint32x4_t ints = vcvtq_u32_f32(scaled); +#endif + + // Extract and pack into bitfields + pDestination->v = (vgetq_lane_u32(ints, 0) & 0x1FF) + | ((vgetq_lane_u32(ints, 1) & 0x1FF) << 9) + | ((vgetq_lane_u32(ints, 2) & 0x1FF) << 18) + | ((exp - 0x6f) << 27); + +#elif defined(_XM_SSE_INTRINSICS_) + + static const XMVECTORF32 MaxFloat9 = { { { float(0x1FF << 7), float(0x1FF << 7), float(0x1FF << 7), float(0x1FF << 7) } } }; + static constexpr float minf9 = float(1.f / (1 << 16)); + + // Clamp to [0, maxf9] then mask w to zero + __m128 clamped = _mm_min_ps(_mm_max_ps(V, _mm_setzero_ps()), MaxFloat9); + clamped = _mm_and_ps(clamped, g_XMMask3); + + // Horizontal max of xyz for shared exponent + __m128 maxV = clamped; + __m128 temp = XM_PERMUTE_PS(maxV, _MM_SHUFFLE(1, 1, 1, 1)); + maxV = _mm_max_ps(maxV, temp); + temp = XM_PERMUTE_PS(clamped, _MM_SHUFFLE(2, 2, 2, 2)); + maxV = _mm_max_ps(maxV, temp); + + // Ensure minimum threshold + maxV = _mm_max_ss(maxV, _mm_set_ss(minf9)); + + // Compute shared exponent (inherently scalar) + union { float f; int32_t i; } fi; + _mm_store_ss(&fi.f, maxV); + fi.i += 0x00004000; // round up leaving 9 bits in fraction (including assumed 1) + + auto exp = static_cast(fi.i) >> 23; + fi.i = static_cast(0x83000000 - (exp << 23)); + + // Scale all channels and round to nearest integer + __m128 scaled = _mm_mul_ps(clamped, _mm_set1_ps(fi.f)); + __m128i ints = _mm_cvtps_epi32(scaled); + + // Extract and pack into bitfields + XM_ALIGNED_DATA(16) uint32_t ivals[4]; + _mm_store_si128(reinterpret_cast<__m128i*>(ivals), ints); + + pDestination->v = (ivals[0] & 0x1FF) + | ((ivals[1] & 0x1FF) << 9) + | ((ivals[2] & 0x1FF) << 18) + | ((exp - 0x6f) << 27); + +#endif } //------------------------------------------------------------------------------ From 32dafde4e7ee869130f677ab777ebd4006710c2f Mon Sep 17 00:00:00 2001 From: Chuck Walbourn Date: Thu, 14 May 2026 14:14:42 
-0700 Subject: [PATCH 2/5] Adding deprecated arm32 basic compile tests --- .github/workflows/arm64.yml | 36 ++++++++++++++++++++++++++++++++++++ CMakePresets.json | 13 +++++++++++++ 2 files changed, 49 insertions(+) diff --git a/.github/workflows/arm64.yml b/.github/workflows/arm64.yml index 3323b18..af729dd 100644 --- a/.github/workflows/arm64.yml +++ b/.github/workflows/arm64.yml @@ -62,3 +62,39 @@ jobs: - name: 'Build' working-directory: ${{ github.workspace }} run: cmake --build out/build/${{ matrix.build_type }} + + buildarm32: + runs-on: windows-11-arm + + strategy: + fail-fast: false + + matrix: + build_type: [arm-Debug, arm-Release] + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Clone test repository + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + repository: walbourn/directxmathtest + path: Tests + ref: main + + - name: 'Install Ninja' + run: choco install ninja + + # ARM32 is deprecated, so more recent Windows SDKs no longer support it + - uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 + with: + arch: arm64_arm + sdk: 10.0.22621.0 + + - name: 'Configure CMake' + working-directory: ${{ github.workspace }} + run: cmake --preset=${{ matrix.build_type }} + + - name: 'Build' + working-directory: ${{ github.workspace }} + run: cmake --build out/build/${{ matrix.build_type }} diff --git a/CMakePresets.json b/CMakePresets.json index e8749f0..56eaceb 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -36,6 +36,17 @@ }, "hidden": true }, + { + "name": "ARM32", + "architecture": { + "value": "arm", + "strategy": "external" + }, + "cacheVariables": { + "DXMATH_ARCHITECTURE": "arm" + }, + "hidden": true + }, { "name": "ARM64", "architecture": { @@ -205,6 +216,8 @@ { "name": "arm64-Release" , "description": "MSVC for ARM64 (Release) - ARM-NEON", "inherits": [ "base", "ARM64", "Release", "MSVC" ] }, { "name": "arm64ec-Debug" , "description": 
"MSVC for ARM64EC (Debug) - ARM-NEON", "inherits": [ "base", "ARM64EC", "Debug", "MSVC" ] }, { "name": "arm64ec-Release", "description": "MSVC for ARM64EC (Release) - ARM-NEON", "inherits": [ "base", "ARM64EC", "Release", "MSVC" ] }, + { "name": "arm-Debug" , "description": "MSVC for ARM32 [Deprecated] (Debug) - ARM-NEON", "inherits": [ "base", "ARM32", "Debug", "MSVC" ] }, + { "name": "arm-Release" , "description": "MSVC for ARM32 [Deprecated] (Release) - ARM-NEON", "inherits": [ "base", "ARM32", "Release", "MSVC" ] }, { "name": "x64-Debug-Clang" , "description": "Clang/LLVM for x64 (Debug) - SSE/SSE2", "inherits": [ "base", "x64", "Debug", "Clang" ] }, { "name": "x64-Release-Clang" , "description": "Clang/LLVM for x64 (Release) - SSE/SSE2", "inherits": [ "base", "x64", "Release", "Clang" ] }, From e895f41b2a69f81e2859e55d2df620862ead5574 Mon Sep 17 00:00:00 2001 From: Chuck Walbourn Date: Thu, 14 May 2026 14:19:11 -0700 Subject: [PATCH 3/5] Code review feedback --- Inc/DirectXPackedVector.inl | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/Inc/DirectXPackedVector.inl b/Inc/DirectXPackedVector.inl index 03faa38..5de2f16 100644 --- a/Inc/DirectXPackedVector.inl +++ b/Inc/DirectXPackedVector.inl @@ -1294,17 +1294,15 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat3SE(const XMFLOAT3SE* pSource) noexcept #elif defined(_XM_ARM_NEON_INTRINSICS_) - uint32_t v = pSource->v; - // Build scale factor from shared exponent union { float f; int32_t i; } fi; - fi.i = 0x33800000 + (static_cast(v >> 27) << 23); + fi.i = 0x33800000 + (pSource->e << 23); // Extract 9-bit mantissas into vector lanes uint32x4_t mantissas = vdupq_n_u32(0); - mantissas = vsetq_lane_u32(v & 0x1FFu, mantissas, 0); - mantissas = vsetq_lane_u32((v >> 9) & 0x1FFu, mantissas, 1); - mantissas = vsetq_lane_u32((v >> 18) & 0x1FFu, mantissas, 2); + mantissas = vsetq_lane_u32(pSource->xm, mantissas, 0); + mantissas = vsetq_lane_u32(pSource->ym, mantissas, 1); + mantissas = 
vsetq_lane_u32(pSource->zm, mantissas, 2); // Convert to float, scale, and set w = 1.0f float32x4_t result = vmulq_n_f32(vcvtq_f32_u32(mantissas), fi.f); @@ -1312,18 +1310,16 @@ inline XMVECTOR XM_CALLCONV XMLoadFloat3SE(const XMFLOAT3SE* pSource) noexcept #elif defined(_XM_SSE_INTRINSICS_) - uint32_t v = pSource->v; - // Build scale factor from shared exponent union { float f; int32_t i; } fi; - fi.i = 0x33800000 + (static_cast(v >> 27) << 23); + fi.i = 0x33800000 + (pSource->e << 23); // Extract 9-bit mantissas, convert to float, and scale __m128i mantissas = _mm_set_epi32( 0, - static_cast((v >> 18) & 0x1FF), - static_cast((v >> 9) & 0x1FF), - static_cast(v & 0x1FF)); + static_cast(pSource->zm), + static_cast(pSource->ym), + static_cast(pSource->xm)); __m128 result = _mm_mul_ps(_mm_cvtepi32_ps(mantissas), _mm_set1_ps(fi.f)); // Set w = 1.0f (w lane is +0.0f so bitwise OR inserts 1.0f cleanly) From 216f7007f9cfa855749cbb4037e8f4802d2458ea Mon Sep 17 00:00:00 2001 From: Chuck Walbourn Date: Thu, 14 May 2026 14:24:28 -0700 Subject: [PATCH 4/5] More code review --- Inc/DirectXPackedVector.inl | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/Inc/DirectXPackedVector.inl b/Inc/DirectXPackedVector.inl index 5de2f16..be99cec 100644 --- a/Inc/DirectXPackedVector.inl +++ b/Inc/DirectXPackedVector.inl @@ -2747,10 +2747,10 @@ inline void XM_CALLCONV XMStoreFloat3SE #endif // Extract and pack into bitfields - pDestination->v = (vgetq_lane_u32(ints, 0) & 0x1FF) - | ((vgetq_lane_u32(ints, 1) & 0x1FF) << 9) - | ((vgetq_lane_u32(ints, 2) & 0x1FF) << 18) - | ((exp - 0x6f) << 27); + pDestination->xm = vgetq_lane_u32(ints, 0) & 0x1FF; + pDestination->ym = vgetq_lane_u32(ints, 1) & 0x1FF; + pDestination->zm = vgetq_lane_u32(ints, 2) & 0x1FF; + pDestination->e = exp - 0x6f; #elif defined(_XM_SSE_INTRINSICS_) @@ -2787,11 +2787,10 @@ inline void XM_CALLCONV XMStoreFloat3SE XM_ALIGNED_DATA(16) uint32_t ivals[4]; 
_mm_store_si128(reinterpret_cast<__m128i*>(ivals), ints); - pDestination->v = (ivals[0] & 0x1FF) - | ((ivals[1] & 0x1FF) << 9) - | ((ivals[2] & 0x1FF) << 18) - | ((exp - 0x6f) << 27); - + pDestination->xm = ivals[0] & 0x1FF; + pDestination->ym = ivals[1] & 0x1FF; + pDestination->zm = ivals[2] & 0x1FF; + pDestination->e = exp - 0x6f; #endif } From 1d9c0476a72873ce8a4be946553157333f2e9055 Mon Sep 17 00:00:00 2001 From: Chuck Walbourn Date: Thu, 14 May 2026 15:51:40 -0700 Subject: [PATCH 5/5] Added readme update --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 7e30d19..fdfdf10 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,8 @@ For a full change history, see [CHANGELOG.md](https://github.com/microsoft/Direc * The clang/LLVM toolset currently does not respect the ``float_control`` pragma for SSE instrinsics. Therefore, the use of ``/fp:fast`` is not recommended on clang/LLVM until this issue is fixed. See [55713](https://github.com/llvm/llvm-project/issues/55713). +* AArch32/ARM32 (ARMv7) support is deprecated in Windows 11. Compiler support for ARM32 is deprecated in Visual Studio 2026, and the ARM32 system libraries are no longer present in the Windows SDK (26100) and later. Therefore, support for ARM32 is deprecated in DirectXMath and will be removed in a future release. Since most codepaths are shared between AArch32 and AArch64, they will be refactored to assume AArch64 (ARMv8). + ## Support For questions, consider using [Stack Overflow](https://stackoverflow.com/questions/tagged/directxmath) with the *directxmath* tag, or the [DirectX Discord Server](https://discord.gg/directx) in the *dx12-developers* or *dx9-dx11-developers* channel.