33/***************************
44 * Data Type
55 ***************************/
// v_f32 is the 128-bit SSE vector type used by every v_*_f32 wrapper in this
// file; v_nlanes_f32 is the number of elements per vector.
// NOTE(review): v_nlanes_f32 is defined outside the DOUBLE conditional, so it
// stays 4 even when v_f32 is __m128d (which holds two doubles, not four) --
// confirm whether the DOUBLE branch was ever used with this lane count.
6- #ifdef DOUBLE
7- typedef __m128d v_f32 ;
8- #else
96typedef __m128 v_f32 ;
10- #endif
11-
127#define v_nlanes_f32 4
138/***************************
149 * Arithmetic
1510 ***************************/
// Vector add and multiply: plain aliases for the SSE packed intrinsics
// (_pd variants under DOUBLE, _ps variants otherwise).
16- #ifdef DOUBLE
17- #define v_add_f32 _mm_add_pd
18- #define v_mul_f32 _mm_mul_pd
19- #else
2011#define v_add_f32 _mm_add_ps
2112#define v_mul_f32 _mm_mul_ps
22- #endif
// v_muladd_f32(a, b, c) evaluates a*b + c. It aliases a single intrinsic when
// HAVE_FMA3 or HAVE_FMA4 is defined; otherwise it is an inline function built
// from v_mul_f32 followed by v_add_f32.
2313#ifdef HAVE_FMA3
2414 // FMA3 intrinsic: multiply and add, a*b + c
25- #ifdef DOUBLE
26- #define v_muladd_f32 _mm_fmadd_pd
27- #else
28- #define v_muladd_f32 _mm_fmadd_ps
29- #endif
15+ #define v_muladd_f32 _mm_fmadd_ps
3016#elif defined(HAVE_FMA4 )
3117 // FMA4 intrinsic: multiply and add, a*b + c
32- #ifdef DOUBLE
33- #define v_muladd_f32 _mm_macc_pd
34- #else
35- #define v_muladd_f32 _mm_macc_ps
36- #endif
18+ #define v_muladd_f32 _mm_macc_ps
3719#else
3820 // fallback without FMA: separate multiply, then add, a*b + c
3921 BLAS_FINLINE v_f32 v_muladd_f32 (v_f32 a , v_f32 b , v_f32 c )
4022 { return v_add_f32 (v_mul_f32 (a , b ), c ); }
4123#endif // HAVE_FMA3 / HAVE_FMA4 / fallback
4224
43- // Horizontal add: Calculates the sum of all vector elements.
44- #ifdef DOUBLE
// Double-precision variant: reduces the elements of a __m128d to one double.
// NOTE(review): both paths below look defective -- see the inline notes and
// confirm against the intrinsics reference before reusing this code.
45- BLAS_FINLINE double v_sum_f32 (__m128d a )
46- {
47- #ifdef HAVE_SSE3
// NOTE(review): _mm_hadd_pd(a, a) should already hold a0+a1 in both lanes, so
// the second _mm_hadd_pd appears to return 2*(a0+a1) rather than the sum -- confirm.
48- __m128d sum_halves = _mm_hadd_pd (a , a );
49- return _mm_cvtsd_f64 (_mm_hadd_pd (sum_halves , sum_halves ));
50- #else
// NOTE(review): _mm_movehl_pd does not appear to be a real intrinsic (only
// _mm_movehl_ps is documented), and _mm_add_ss takes __m128 operands, not
// __m128d -- this path presumably does not compile; confirm.
51- __m128d t1 = _mm_movehl_pd (a , a );
52- __m128d t2 = _mm_add_pd (a , t1 );
53- __m128d t3 = _mm_shuffle_pd (t2 , t2 , 1 );
54- __m128d t4 = _mm_add_ss (t2 , t3 );
55- return _mm_cvtsd_f64 (t4 );
56- #endif
57- }
58- #else
5925// Horizontal add: Calculates the sum of all vector elements.
6026BLAS_FINLINE float v_sum_f32 (__m128 a )
6127{
@@ -70,19 +36,11 @@ BLAS_FINLINE float v_sum_f32(__m128 a)
7036 return _mm_cvtss_f32 (t4 );
7137#endif
7238}
73- #endif
7439/***************************
7540 * memory
7641 ***************************/
7742// unaligned load and store, set-all-lanes-to-VAL, and all-zero vector
78- #ifdef DOUBLE
79- #define v_loadu_f32 _mm_loadu_pd
80- #define v_storeu_f32 _mm_storeu_pd
81- #define v_setall_f32 (VAL ) _mm_set1_pd(VAL)
82- #define v_zero_f32 _mm_setzero_pd
83- #else
8443#define v_loadu_f32 _mm_loadu_ps
8544#define v_storeu_f32 _mm_storeu_ps
8645#define v_setall_f32 (VAL ) _mm_set1_ps(VAL)
87- #define v_zero_f32 _mm_setzero_ps
88- #endif
46+ #define v_zero_f32 _mm_setzero_ps
0 commit comments