33/***************************
44 * Data Type
55 ***************************/
// v_f32 is one 128-bit SSE register: packed doubles when DOUBLE is defined,
// packed floats otherwise. v_nlanes_f32 is the number of elements it holds,
// so it must follow the element type (2 doubles vs. 4 floats).
#ifdef DOUBLE
typedef __m128d v_f32;
#define v_nlanes_f32 2
#else
typedef __m128 v_f32;
#define v_nlanes_f32 4
#endif

813/***************************
914 * Arithmetic
1015 ***************************/
// Lane-wise addition and multiplication. Both names forward directly to the
// packed-double intrinsics when DOUBLE is defined, packed-single otherwise.
#ifdef DOUBLE
#define v_add_f32 _mm_add_pd
#define v_mul_f32 _mm_mul_pd
#else
#define v_add_f32 _mm_add_ps
#define v_mul_f32 _mm_mul_ps
#endif
1323#ifdef HAVE_FMA3
1424 // multiply and add, a*b + c
15- #define v_muladd_f32 _mm_fmadd_ps
25+ #ifdef DOUBLE
26+ #define v_muladd_f32 _mm_fmadd_pd
27+ #else
28+ #define v_muladd_f32 _mm_fmadd_ps
29+ #endif
1630#elif defined(HAVE_FMA4 )
1731 // multiply and add, a*b + c
18- #define v_muladd_f32 _mm_macc_ps
32+ #ifdef DOUBLE
33+ #define v_muladd_f32 _mm_macc_pd
34+ #else
35+ #define v_muladd_f32 _mm_macc_ps
36+ #endif
1937#else
2038 // multiply and add, a*b + c
2139 BLAS_FINLINE v_f32 v_muladd_f32 (v_f32 a , v_f32 b , v_f32 c )
2240 { return v_add_f32 (v_mul_f32 (a , b ), c ); }
2341#endif // HAVE_FMA3
2442
43+ // Horizontal add: Calculates the sum of all vector elements.
44+ #ifdef DOUBLE
45+ BLAS_FINLINE double v_sum_f32 (__m128d a )
46+ {
47+ #ifdef HAVE_SSE3
48+ __m128d sum_halves = _mm_hadd_pd (a , a );
49+ return _mm_cvtsd_f64 (_mm_hadd_pd (sum_halves , sum_halves ));
50+ #else
51+ __m128d t1 = _mm_movehl_pd (a , a );
52+ __m128d t2 = _mm_add_pd (a , t1 );
53+ __m128d t3 = _mm_shuffle_pd (t2 , t2 , 1 );
54+ __m128d t4 = _mm_add_ss (t2 , t3 );
55+ return _mm_cvtsd_f64 (t4 );
56+ #endif
57+ }
58+ #else
2559// Horizontal add: Calculates the sum of all vector elements.
2660BLAS_FINLINE float v_sum_f32 (__m128 a )
2761{
@@ -36,11 +70,19 @@ BLAS_FINLINE float v_sum_f32(__m128 a)
3670 return _mm_cvtss_f32 (t4 );
3771#endif
3872}
73+ #endif
3974/***************************
4075 * memory
4176 ***************************/
// Unaligned load/store, broadcast of one scalar to every lane, and the
// all-zero vector. Packed-double intrinsics are used when DOUBLE is defined,
// packed-single otherwise.
// v_setall_f32 is function-like: there must be no space between the macro
// name and its parameter list, or it becomes an object-like macro.
#ifdef DOUBLE
#define v_loadu_f32 _mm_loadu_pd
#define v_storeu_f32 _mm_storeu_pd
#define v_setall_f32(VAL) _mm_set1_pd(VAL)
#define v_zero_f32 _mm_setzero_pd
#else
#define v_loadu_f32 _mm_loadu_ps
#define v_storeu_f32 _mm_storeu_ps
#define v_setall_f32(VAL) _mm_set1_ps(VAL)
#define v_zero_f32 _mm_setzero_ps
#endif
0 commit comments