 * Data Type
 ***************************/
// 512-bit vector register types and their lane counts.
typedef __m512  v_f32;  // 16 x float
typedef __m512d v_f64;  //  8 x double
#define v_nlanes_f32 16
#define v_nlanes_f64 8
/***************************
 * Arithmetic
 ***************************/
// Element-wise arithmetic wrappers over the AVX-512 intrinsics.
#define v_add_f32 _mm512_add_ps
#define v_add_f64 _mm512_add_pd
#define v_mul_f32 _mm512_mul_ps
#define v_mul_f64 _mm512_mul_pd
// fused multiply-add: a*b + c with a single rounding step
#define v_muladd_f32 _mm512_fmadd_ps
#define v_muladd_f64 _mm512_fmadd_pd
1620BLAS_FINLINE float v_sum_f32 (v_f32 a )
1721{
1822 __m512 h64 = _mm512_shuffle_f32x4 (a , a , _MM_SHUFFLE (3 , 2 , 3 , 2 ));
@@ -25,11 +29,26 @@ BLAS_FINLINE float v_sum_f32(v_f32 a)
2529 __m512 sum4 = _mm512_add_ps (sum8 , h4 );
2630 return _mm_cvtss_f32 (_mm512_castps512_ps128 (sum4 ));
2731}
32+
33+ BLAS_FINLINE double v_sum_f64 (v_f64 a )
34+ {
35+ __m512d h64 = _mm512_shuffle_f64x2 (a , a , _MM_SHUFFLE (3 , 2 , 3 , 2 ));
36+ __m512d sum32 = _mm512_add_pd (a , h64 );
37+ __m512d h32 = _mm512_permutex_pd (sum32 , _MM_SHUFFLE (1 , 0 , 3 , 2 ));
38+ __m512d sum16 = _mm512_add_pd (sum32 , h32 );
39+ __m512d h16 = _mm512_permute_pd (sum16 , _MM_SHUFFLE (2 , 3 , 0 , 1 ));
40+ __m512d sum8 = _mm512_add_pd (sum16 , h16 );
41+ return _mm_cvtsd_f64 (_mm512_castpd512_pd128 (sum8 ));
42+ }
/***************************
 * memory
 ***************************/
// unaligned load
// NOTE(review): the original cast both loads to (const __m512*), which is the
// wrong vector type for the f64 variant. The intrinsics take the address of
// element data, so cast to the element-pointer type instead.
#define v_loadu_f32(PTR) _mm512_loadu_ps((const float*)(PTR))
#define v_loadu_f64(PTR) _mm512_loadu_pd((const double*)(PTR))
// unaligned store
#define v_storeu_f32 _mm512_storeu_ps
#define v_storeu_f64 _mm512_storeu_pd
// broadcast a scalar to all lanes
#define v_setall_f32(VAL) _mm512_set1_ps(VAL)
#define v_setall_f64(VAL) _mm512_set1_pd(VAL)
// zeroed register
#define v_zero_f32 _mm512_setzero_ps
#define v_zero_f64 _mm512_setzero_pd