@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2727
2828
2929#include "common.h"
30-
30+ #include "../simd/intrin.h"
3131#if defined(DSDOT )
3232double CNAME (BLASLONG n , FLOAT * x , BLASLONG inc_x , FLOAT * y , BLASLONG inc_y )
3333#else
@@ -47,27 +47,59 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
4747
4848 if ( (inc_x == 1 ) && (inc_y == 1 ) )
4949 {
50-
51- int n1 = n & -4 ;
52-
53- while (i < n1 )
50+ int n1 = n & -4 ;
51+ #if V_SIMD && !defined(DSDOT )
52+ const int vstep = v_nlanes_f32 ;
53+ const int unrollx4 = n & (- vstep * 4 );
54+ const int unrollx = n & - vstep ;
55+ v_f32 vsum0 = v_zero_f32 ();
56+ v_f32 vsum1 = v_zero_f32 ();
57+ v_f32 vsum2 = v_zero_f32 ();
58+ v_f32 vsum3 = v_zero_f32 ();
59+ while (i < unrollx4 )
60+ {
61+ vsum0 = v_muladd_f32 (
62+ v_loadu_f32 (x + i ), v_loadu_f32 (y + i ), vsum0
63+ );
64+ vsum1 = v_muladd_f32 (
65+ v_loadu_f32 (x + i + vstep ), v_loadu_f32 (y + i + vstep ), vsum1
66+ );
67+ vsum2 = v_muladd_f32 (
68+ v_loadu_f32 (x + i + vstep * 2 ), v_loadu_f32 (y + i + vstep * 2 ), vsum2
69+ );
70+ vsum3 = v_muladd_f32 (
71+ v_loadu_f32 (x + i + vstep * 3 ), v_loadu_f32 (y + i + vstep * 3 ), vsum3
72+ );
73+ i += vstep * 4 ;
74+ }
75+ vsum0 = v_add_f32 (
76+ v_add_f32 (vsum0 , vsum1 ), v_add_f32 (vsum2 , vsum3 )
77+ );
78+ while (i < unrollx )
79+ {
80+ vsum0 = v_muladd_f32 (
81+ v_loadu_f32 (x + i ), v_loadu_f32 (y + i ), vsum0
82+ );
83+ i += vstep ;
84+ }
85+ dot = v_sum_f32 (vsum0 );
86+ #elif defined(DSDOT )
87+ for (; i < n1 ; i += 4 )
5488 {
55-
56- #if defined(DSDOT )
5789 dot += (double ) y [i ] * (double ) x [i ]
5890 + (double ) y [i + 1 ] * (double ) x [i + 1 ]
5991 + (double ) y [i + 2 ] * (double ) x [i + 2 ]
6092 + (double ) y [i + 3 ] * (double ) x [i + 3 ] ;
93+ }
6194#else
95+ for (; i < n1 ; i += 4 )
96+ {
6297 dot += y [i ] * x [i ]
6398 + y [i + 1 ] * x [i + 1 ]
6499 + y [i + 2 ] * x [i + 2 ]
65100 + y [i + 3 ] * x [i + 3 ] ;
66- #endif
67- i += 4 ;
68-
69101 }
70-
102+ #endif
71103 while (i < n )
72104 {
73105
0 commit comments