@@ -29,23 +29,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2929* trivial copy of asum.c with the ABS() removed *
3030**************************************************************************************/
3131
32-
3332#include "common.h"
33+ #include "../simd/intrin.h"
3434#include <math.h>
3535
3636FLOAT CNAME (BLASLONG n , FLOAT * x , BLASLONG inc_x )
3737{
38- BLASLONG i = 0 ;
38+ BLASLONG i = 0 ;
3939 FLOAT sumf = 0.0 ;
40- if (n <= 0 || inc_x <= 0 ) return ( sumf );
41-
40+ if (n <= 0 || inc_x <= 0 )
41+ return ( sumf );
4242 n *= inc_x ;
43- while (i < n )
43+ if (inc_x == 1 )
44+ {
45+ #if V_SIMD
46+ const int vstep = v_nlanes_f32 ;
47+ const int unrollx4 = n & (- vstep * 4 );
48+ const int unrollx = n & - vstep ;
49+ v_f32 vsum0 = v_zero_f32 ();
50+ v_f32 vsum1 = v_zero_f32 ();
51+ v_f32 vsum2 = v_zero_f32 ();
52+ v_f32 vsum3 = v_zero_f32 ();
53+ while (i < unrollx4 )
54+ {
55+ vsum0 = v_add_f32 (vsum0 , v_loadu_f32 (x ));
56+ vsum1 = v_add_f32 (vsum1 , v_loadu_f32 (x + vstep ));
57+ vsum2 = v_add_f32 (vsum2 , v_loadu_f32 (x + vstep * 2 ));
58+ vsum3 = v_add_f32 (vsum3 , v_loadu_f32 (x + vstep * 3 ));
59+ i += vstep * 4 ;
60+ }
61+ vsum0 = v_add_f32 (
62+ v_add_f32 (vsum0 , vsum1 ), v_add_f32 (vsum2 , vsum3 ));
63+ while (i < unrollx )
64+ {
65+ vsum0 = v_add_f32 (vsum0 , v_loadu_f32 (x + i ));
66+ i += vstep ;
67+ }
68+ sumf = v_sum_f32 (vsum0 );
69+ #else
70+ int n1 = n & -4 ;
71+ for (; i < n1 ; i += 4 )
72+ {
73+ sumf += x [i ] + x [i + 1 ] + x [i + 2 ] + x [i + 3 ];
74+ }
75+ #endif
76+ }
77+ while (i < n )
4478 {
4579 sumf += x [i ];
4680 i += inc_x ;
4781 }
48- return (sumf );
82+ return (sumf );
4983}
50-
51-
0 commit comments