From 3a253488cadf6835ece425e1db32c6635979f002 Mon Sep 17 00:00:00 2001
From: Marco Barbone
Date: Fri, 26 Jun 2026 15:01:59 -0400
Subject: [PATCH] fix: route avxvnni FMA ops through fma3 kernels

batch<T, avxvnni> derived from avx2, so fnma/fnms fell back to the generic
neg(x*y)+z form (vxorpd + vfmadd) instead of the hardware vfnmadd/vfnmsub
kernels registered for fma3. This bites -march=native on Alder/Meteor Lake
and Zen 5, where default_arch resolves to avxvnni.

Derive avxvnni from fma3<avx2> instead. fma3<avx2> always derives from avx2
and its kernels are guarded by XSIMD_WITH_FMA3_AVX2, so when FMA is disabled
the base is transparent and dispatch falls through to avx2 unchanged.

Validated under Intel SDE across avxvnni+FMA (-adl), fma3 (-hsw), avx2-only
(-hsw) and avxvnni-without-FMA (-adl): correct results, full test_batch suite
passes, vfnmadd emitted only where FMA is enabled.
---
 include/xsimd/types/xsimd_avxvnni_register.hpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/xsimd/types/xsimd_avxvnni_register.hpp b/include/xsimd/types/xsimd_avxvnni_register.hpp
index e1e1ee964..c46e8ec16 100644
--- a/include/xsimd/types/xsimd_avxvnni_register.hpp
+++ b/include/xsimd/types/xsimd_avxvnni_register.hpp
@@ -13,6 +13,7 @@
 #define XSIMD_AVXVNNI_REGISTER_HPP
 
 #include "./xsimd_avx2_register.hpp"
+#include "./xsimd_fma3_avx2_register.hpp"
 
 namespace xsimd
 {
@@ -21,7 +22,12 @@ namespace xsimd
      *
      * AVXVNNI instructions
      */
-    struct avxvnni : avx2
+    // Derive from fma3<avx2> rather than avx2 so the FMA3 kernels (fnma/fnms ->
+    // vfnmadd) are in avxvnni's dispatch chain instead of the generic neg(x*y)+z
+    // fallback. fma3<avx2> always derives from avx2 and its kernels are only
+    // registered when XSIMD_WITH_FMA3_AVX2, so when FMA is disabled this base is
+    // transparent (dispatch falls straight through to avx2).
+    struct avxvnni : fma3<avx2>
     {
         static constexpr bool supported() noexcept { return XSIMD_WITH_AVXVNNI; }
         static constexpr bool available() noexcept { return true; }