@@ -27,147 +27,294 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2727
2828#include "common.h"
2929#if !defined(DOUBLE )
30- #define VSETVL (n ) RISCV_RVV(vsetvl_e32m4)(n)
31- #define FLOAT_V_T vfloat32m4_t
32- #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
33- #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
34- #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
35- #define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
36- #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
37- #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4)
30+ #define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n)
31+ #define FLOAT_V_T vfloat32m2_t
32+ #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m2)
33+ #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2)
34+ #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m2)
35+ #define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m2)
36+ #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m2)
37+ #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m2)
38+ #define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f32m2)
3840#else
39- #define VSETVL (n ) RISCV_RVV(vsetvl_e64m4)(n)
40- #define FLOAT_V_T vfloat64m4_t
41- #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
42- #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
43- #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
44- #define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
45- #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
46- #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
41+ #define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n)
42+ #define FLOAT_V_T vfloat64m2_t
43+ #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m2)
44+ #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2)
45+ #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m2)
46+ #define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m2)
47+ #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m2)
48+ #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m2)
49+ #define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f64m2)
4751#endif
4852
4953int CNAME (BLASLONG m , BLASLONG n , BLASLONG dummy1 , FLOAT alpha_r , FLOAT alpha_i , FLOAT * a , BLASLONG lda , FLOAT * x , BLASLONG inc_x , FLOAT * y , BLASLONG inc_y , FLOAT * buffer )
5054{
51- BLASLONG i = 0 , j = 0 , k = 0 ;
55+ BLASLONG i = 0 , j = 0 , k = 0 ;
5256 BLASLONG ix = 0 , iy = 0 ;
5357 FLOAT * a_ptr = a ;
54- FLOAT temp_r = 0.0 , temp_i = 0.0 ;
55- FLOAT_V_T va0 , va1 , vy0 , vy1 ;
58+ FLOAT temp_r = 0.0 , temp_i = 0.0 , temp_r1 , temp_i1 , temp_r2 , temp_i2 , temp_r3 , temp_i3 , temp_rr [ 4 ], temp_ii [ 4 ] ;
59+ FLOAT_V_T va0 , va1 , vy0 , vy1 , vy0_new , vy1_new , va2 , va3 , va4 , va5 , va6 , va7 , temp_iv , temp_rv , x_v0 , x_v1 , temp_v1 , temp_v2 , temp_v3 , temp_v4 ;
5660 unsigned int gvl = 0 ;
5761 BLASLONG stride_a = sizeof (FLOAT ) * 2 ;
5862 BLASLONG stride_y = inc_y * sizeof (FLOAT ) * 2 ;
5963 gvl = VSETVL (m );
6064 BLASLONG inc_yv = inc_y * gvl * 2 ;
6165 BLASLONG inc_x2 = inc_x * 2 ;
6266 BLASLONG lda2 = lda * 2 ;
63- for (k = 0 ,j = 0 ; k < m /gvl ; k ++ ){
67+ vy0_new = VLSEV_FLOAT (& y [iy ], stride_y , gvl );
68+ vy1_new = VLSEV_FLOAT (& y [iy + 1 ], stride_y , gvl );
69+ for (k = 0 , j = 0 ; k < m / gvl ; k ++ )
70+ {
6471 a_ptr = a ;
6572 ix = 0 ;
66- vy0 = VLSEV_FLOAT (& y [iy ], stride_y , gvl );
67- vy1 = VLSEV_FLOAT (& y [iy + 1 ], stride_y , gvl );
68- for (i = 0 ; i < n ; i ++ ){
73+ vy0 = vy0_new ;
74+ vy1 = vy1_new ;
75+
76+ if (k < m / gvl - 1 )
77+ {
78+ vy0_new = VLSEV_FLOAT (& y [iy + inc_yv ], stride_y , gvl );
79+ vy1_new = VLSEV_FLOAT (& y [iy + inc_yv + 1 ], stride_y , gvl );
80+ }
81+ for (i = 0 ; i < n % 4 ; i ++ )
82+ {
6983#if !defined(XCONJ )
70- temp_r = alpha_r * x [ix ] - alpha_i * x [ix + 1 ];
71- temp_i = alpha_r * x [ix + 1 ] + alpha_i * x [ix ];
84+ temp_r = alpha_r * x [ix ] - alpha_i * x [ix + 1 ];
85+ temp_i = alpha_r * x [ix + 1 ] + alpha_i * x [ix ];
7286#else
73- temp_r = alpha_r * x [ix ] + alpha_i * x [ix + 1 ];
74- temp_i = alpha_r * x [ix + 1 ] - alpha_i * x [ix ];
87+ temp_r = alpha_r * x [ix ] + alpha_i * x [ix + 1 ];
88+ temp_i = alpha_r * x [ix + 1 ] - alpha_i * x [ix ];
7589#endif
7690
7791 va0 = VLSEV_FLOAT (& a_ptr [j ], stride_a , gvl );
78- va1 = VLSEV_FLOAT (& a_ptr [j + 1 ], stride_a , gvl );
92+ va1 = VLSEV_FLOAT (& a_ptr [j + 1 ], stride_a , gvl );
7993#if !defined(CONJ )
8094#if !defined(XCONJ )
81- vy0 = VFMACCVF_FLOAT (vy0 , temp_r , va0 , gvl );
82- vy0 = VFNMSACVF_FLOAT (vy0 , temp_i , va1 , gvl );
83- vy1 = VFMACCVF_FLOAT (vy1 , temp_r , va1 , gvl );
84- vy1 = VFMACCVF_FLOAT (vy1 , temp_i , va0 , gvl );
95+ vy0 = VFMACCVF_FLOAT (vy0 , temp_r , va0 , gvl );
96+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_i , va1 , gvl );
97+ vy1 = VFMACCVF_FLOAT (vy1 , temp_r , va1 , gvl );
98+ vy1 = VFMACCVF_FLOAT (vy1 , temp_i , va0 , gvl );
8599#else
86100
87- vy0 = VFMACCVF_FLOAT (vy0 , temp_r , va0 , gvl );
88- vy0 = VFMACCVF_FLOAT (vy0 , temp_i , va1 , gvl );
89- vy1 = VFMACCVF_FLOAT (vy1 , temp_r , va1 , gvl );
90- vy1 = VFNMSACVF_FLOAT (vy1 , temp_i , va0 , gvl );
101+ vy0 = VFMACCVF_FLOAT (vy0 , temp_r , va0 , gvl );
102+ vy0 = VFMACCVF_FLOAT (vy0 , temp_i , va1 , gvl );
103+ vy1 = VFMACCVF_FLOAT (vy1 , temp_r , va1 , gvl );
104+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_i , va0 , gvl );
91105#endif
92106
93107#else
94108
95109#if !defined(XCONJ )
96- vy0 = VFMACCVF_FLOAT (vy0 , temp_r , va0 , gvl );
97- vy0 = VFMACCVF_FLOAT (vy0 , temp_i , va1 , gvl );
98- vy1 = VFNMSACVF_FLOAT (vy1 , temp_r , va1 , gvl );
99- vy1 = VFMACCVF_FLOAT (vy1 , temp_i , va0 , gvl );
110+ vy0 = VFMACCVF_FLOAT (vy0 , temp_r , va0 , gvl );
111+ vy0 = VFMACCVF_FLOAT (vy0 , temp_i , va1 , gvl );
112+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_r , va1 , gvl );
113+ vy1 = VFMACCVF_FLOAT (vy1 , temp_i , va0 , gvl );
100114#else
101- vy0 = VFMACCVF_FLOAT (vy0 , temp_r , va0 , gvl );
102- vy0 = VFNMSACVF_FLOAT (vy0 , temp_i , va1 , gvl );
103- vy1 = VFNMSACVF_FLOAT (vy1 , temp_r , va1 , gvl );
104- vy1 = VFNMSACVF_FLOAT (vy1 , temp_i , va0 , gvl );
115+ vy0 = VFMACCVF_FLOAT (vy0 , temp_r , va0 , gvl );
116+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_i , va1 , gvl );
117+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_r , va1 , gvl );
118+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_i , va0 , gvl );
105119#endif
106120
107121#endif
108122 a_ptr += lda2 ;
109123 ix += inc_x2 ;
110124 }
125+
126+ for (; i < n ; i += 4 )
127+ {
128+ #if !defined(XCONJ )
129+
130+ x_v0 = VLSEV_FLOAT (& x [ix ], inc_x2 * sizeof (FLOAT ), 4 );
131+ x_v1 = VLSEV_FLOAT (& x [ix + 1 ], inc_x2 * sizeof (FLOAT ), 4 );
132+ temp_rv = VFMUL_VF_FLOAT (x_v0 , alpha_r , 4 );
133+ temp_iv = VFMUL_VF_FLOAT (x_v0 , alpha_i , 4 );
134+ temp_rv = VFNMSACVF_FLOAT (temp_rv , alpha_i , x_v1 , 4 );
135+ temp_iv = VFMACCVF_FLOAT (temp_iv , alpha_r , x_v1 , 4 );
136+ VSEV_FLOAT (& temp_rr [0 ], temp_rv , 4 );
137+ VSEV_FLOAT (& temp_ii [0 ], temp_iv , 4 );
138+
139+ #else
140+ x_v0 = VLSEV_FLOAT (& x [ix ], inc_x2 * sizeof (FLOAT ), 4 );
141+ x_v1 = VLSEV_FLOAT (& x [ix + 1 ], inc_x2 * sizeof (FLOAT ), 4 );
142+ temp_rv = VFMUL_VF_FLOAT (x_v0 , alpha_r , 4 );
143+ temp_iv = VFMUL_VF_FLOAT (x_v0 , alpha_i , 4 );
144+ temp_rv = VFMACCVF_FLOAT (temp_rv , alpha_i , x_v1 , 4 );
145+ temp_iv = VFNMSACVF_FLOAT (temp_iv , alpha_r , x_v1 , 4 );
146+ VSEV_FLOAT (& temp_rr [0 ], temp_rv , 4 );
147+ VSEV_FLOAT (& temp_ii [0 ], temp_iv , 4 );
148+
149+ #endif
150+
151+ va0 = VLSEV_FLOAT (& a_ptr [j ], stride_a , gvl );
152+ va1 = VLSEV_FLOAT (& a_ptr [j + 1 ], stride_a , gvl );
153+ va2 = VLSEV_FLOAT (& a_ptr [j + lda2 ], stride_a , gvl );
154+ va3 = VLSEV_FLOAT (& a_ptr [j + lda2 + 1 ], stride_a , gvl );
155+ va4 = VLSEV_FLOAT (& a_ptr [j + lda2 * 2 ], stride_a , gvl );
156+ va5 = VLSEV_FLOAT (& a_ptr [j + lda2 * 2 + 1 ], stride_a , gvl );
157+ va6 = VLSEV_FLOAT (& a_ptr [j + lda2 * 3 ], stride_a , gvl );
158+ va7 = VLSEV_FLOAT (& a_ptr [j + lda2 * 3 + 1 ], stride_a , gvl );
159+
160+ #if !defined(CONJ )
161+ #if !defined(XCONJ )
162+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [0 ], va0 , gvl );
163+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [0 ], va1 , gvl );
164+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [0 ], va1 , gvl );
165+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [0 ], va0 , gvl );
166+
167+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [1 ], va2 , gvl );
168+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [1 ], va3 , gvl );
169+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [1 ], va3 , gvl );
170+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [1 ], va2 , gvl );
171+
172+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [2 ], va4 , gvl );
173+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [2 ], va5 , gvl );
174+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [2 ], va5 , gvl );
175+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [2 ], va4 , gvl );
176+
177+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [3 ], va6 , gvl );
178+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [3 ], va7 , gvl );
179+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [3 ], va7 , gvl );
180+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [3 ], va6 , gvl );
181+
182+ #else
183+
184+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [0 ], va0 , gvl );
185+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [0 ], va1 , gvl );
186+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [0 ], va1 , gvl );
187+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [0 ], va0 , gvl );
188+
189+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [1 ], va2 , gvl );
190+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [1 ], va3 , gvl );
191+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [1 ], va3 , gvl );
192+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [1 ], va2 , gvl );
193+
194+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [2 ], va4 , gvl );
195+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [2 ], va5 , gvl );
196+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [2 ], va5 , gvl );
197+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [2 ], va4 , gvl );
198+
199+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [3 ], va6 , gvl );
200+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [3 ], va7 , gvl );
201+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [3 ], va7 , gvl );
202+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [3 ], va6 , gvl );
203+
204+ #endif
205+
206+ #else
207+
208+ #if !defined(XCONJ )
209+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [0 ], va0 , gvl );
210+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [0 ], va1 , gvl );
211+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [0 ], va1 , gvl );
212+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [0 ], va0 , gvl );
213+
214+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [1 ], va2 , gvl );
215+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [1 ], va3 , gvl );
216+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [1 ], va3 , gvl );
217+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [1 ], va2 , gvl );
218+
219+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [2 ], va4 , gvl );
220+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [2 ], va5 , gvl );
221+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [2 ], va5 , gvl );
222+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [2 ], va4 , gvl );
223+
224+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [3 ], va6 , gvl );
225+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [3 ], va7 , gvl );
226+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [3 ], va7 , gvl );
227+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [3 ], va6 , gvl );
228+
229+ #else
230+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [0 ], va0 , gvl );
231+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [0 ], va1 , gvl );
232+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [0 ], va1 , gvl );
233+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [0 ], va0 , gvl );
234+
235+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [1 ], va2 , gvl );
236+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [1 ], va3 , gvl );
237+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [1 ], va3 , gvl );
238+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [1 ], va2 , gvl );
239+
240+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [2 ], va4 , gvl );
241+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [2 ], va5 , gvl );
242+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [2 ], va5 , gvl );
243+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [2 ], va4 , gvl );
244+
245+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [3 ], va6 , gvl );
246+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [3 ], va7 , gvl );
247+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [3 ], va7 , gvl );
248+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [3 ], va6 , gvl );
249+
250+ #endif
251+
252+ #endif
253+ a_ptr += lda2 * 4 ;
254+ ix += inc_x2 * 4 ;
255+ }
256+
111257 VSSEV_FLOAT (& y [iy ], stride_y , vy0 , gvl );
112- VSSEV_FLOAT (& y [iy + 1 ], stride_y , vy1 , gvl );
258+ VSSEV_FLOAT (& y [iy + 1 ], stride_y , vy1 , gvl );
113259 j += gvl * 2 ;
114260 iy += inc_yv ;
115261 }
116- //tail
117- if (j /2 < m ){
118- gvl = VSETVL (m - j /2 );
262+ // tail
263+ if (j / 2 < m )
264+ {
265+ gvl = VSETVL (m - j / 2 );
119266 a_ptr = a ;
120267 ix = 0 ;
121268 vy0 = VLSEV_FLOAT (& y [iy ], stride_y , gvl );
122- vy1 = VLSEV_FLOAT (& y [iy + 1 ], stride_y , gvl );
123- for (i = 0 ; i < n ; i ++ ){
269+ vy1 = VLSEV_FLOAT (& y [iy + 1 ], stride_y , gvl );
270+ for (i = 0 ; i < n ; i ++ )
271+ {
124272#if !defined(XCONJ )
125- temp_r = alpha_r * x [ix ] - alpha_i * x [ix + 1 ];
126- temp_i = alpha_r * x [ix + 1 ] + alpha_i * x [ix ];
273+ temp_r = alpha_r * x [ix ] - alpha_i * x [ix + 1 ];
274+ temp_i = alpha_r * x [ix + 1 ] + alpha_i * x [ix ];
127275#else
128- temp_r = alpha_r * x [ix ] + alpha_i * x [ix + 1 ];
129- temp_i = alpha_r * x [ix + 1 ] - alpha_i * x [ix ];
276+ temp_r = alpha_r * x [ix ] + alpha_i * x [ix + 1 ];
277+ temp_i = alpha_r * x [ix + 1 ] - alpha_i * x [ix ];
130278#endif
131279
132280 va0 = VLSEV_FLOAT (& a_ptr [j ], stride_a , gvl );
133- va1 = VLSEV_FLOAT (& a_ptr [j + 1 ], stride_a , gvl );
281+ va1 = VLSEV_FLOAT (& a_ptr [j + 1 ], stride_a , gvl );
134282#if !defined(CONJ )
135283
136284#if !defined(XCONJ )
137- vy0 = VFMACCVF_FLOAT (vy0 , temp_r , va0 , gvl );
138- vy0 = VFNMSACVF_FLOAT (vy0 , temp_i , va1 , gvl );
139- vy1 = VFMACCVF_FLOAT (vy1 , temp_r , va1 , gvl );
140- vy1 = VFMACCVF_FLOAT (vy1 , temp_i , va0 , gvl );
285+ vy0 = VFMACCVF_FLOAT (vy0 , temp_r , va0 , gvl );
286+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_i , va1 , gvl );
287+ vy1 = VFMACCVF_FLOAT (vy1 , temp_r , va1 , gvl );
288+ vy1 = VFMACCVF_FLOAT (vy1 , temp_i , va0 , gvl );
141289#else
142290
143- vy0 = VFMACCVF_FLOAT (vy0 , temp_r , va0 , gvl );
144- vy0 = VFMACCVF_FLOAT (vy0 , temp_i , va1 , gvl );
145- vy1 = VFMACCVF_FLOAT (vy1 , temp_r , va1 , gvl );
146- vy1 = VFNMSACVF_FLOAT (vy1 , temp_i , va0 , gvl );
291+ vy0 = VFMACCVF_FLOAT (vy0 , temp_r , va0 , gvl );
292+ vy0 = VFMACCVF_FLOAT (vy0 , temp_i , va1 , gvl );
293+ vy1 = VFMACCVF_FLOAT (vy1 , temp_r , va1 , gvl );
294+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_i , va0 , gvl );
147295#endif
148296
149297#else
150298
151299#if !defined(XCONJ )
152- vy0 = VFMACCVF_FLOAT (vy0 , temp_r , va0 , gvl );
153- vy0 = VFMACCVF_FLOAT (vy0 , temp_i , va1 , gvl );
154- vy1 = VFNMSACVF_FLOAT (vy1 , temp_r , va1 , gvl );
155- vy1 = VFMACCVF_FLOAT (vy1 , temp_i , va0 , gvl );
300+ vy0 = VFMACCVF_FLOAT (vy0 , temp_r , va0 , gvl );
301+ vy0 = VFMACCVF_FLOAT (vy0 , temp_i , va1 , gvl );
302+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_r , va1 , gvl );
303+ vy1 = VFMACCVF_FLOAT (vy1 , temp_i , va0 , gvl );
156304#else
157- vy0 = VFMACCVF_FLOAT (vy0 , temp_r , va0 , gvl );
158- vy0 = VFNMSACVF_FLOAT (vy0 , temp_i , va1 , gvl );
159- vy1 = VFNMSACVF_FLOAT (vy1 , temp_r , va1 , gvl );
160- vy1 = VFNMSACVF_FLOAT (vy1 , temp_i , va0 , gvl );
305+ vy0 = VFMACCVF_FLOAT (vy0 , temp_r , va0 , gvl );
306+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_i , va1 , gvl );
307+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_r , va1 , gvl );
308+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_i , va0 , gvl );
161309#endif
162310
163311#endif
164312 a_ptr += lda2 ;
165313 ix += inc_x2 ;
166314 }
167315 VSSEV_FLOAT (& y [iy ], stride_y , vy0 , gvl );
168- VSSEV_FLOAT (& y [iy + 1 ], stride_y , vy1 , gvl );
316+ VSSEV_FLOAT (& y [iy + 1 ], stride_y , vy1 , gvl );
169317 }
170- return (0 );
318+ return (0 );
171319}
172320
173-
0 commit comments