@@ -27,45 +27,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2727
2828#include "common.h"
/*
 * Precision-dependent aliases for the vector type and the vector intrinsics
 * used by the kernel below: the e32/f32 variants are selected when DOUBLE is
 * not defined, the e64/f64 variants otherwise.
 *
 * This text is a diff rendering.  Lines tagged "-" carry the previous
 * m4-suffixed definitions; lines tagged "+" carry their m2-suffixed
 * replacements, which additionally introduce VFMUL_VF_FLOAT.
 */
2929#if !defined(DOUBLE )
30- #define VSETVL (n ) RISCV_RVV(vsetvl_e32m4)(n)
31- #define FLOAT_V_T vfloat32m4_t
32- #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
33- #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
34- #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
35- #define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
36- #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
37- #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4)
30+ #define VSETVL (n ) RISCV_RVV(vsetvl_e32m2)(n)
31+ #define FLOAT_V_T vfloat32m2_t
32+ #define VLEV_FLOAT RISCV_RVV(vle32_v_f32m2)
33+ #define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2)
34+ #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m2)
35+ #define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m2)
36+ #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m2)
37+ #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m2)
38+ #define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f32m2)
/* NOTE(review): VSEV_FLOAT is already defined a few lines above in this
 * branch with the same expansion; this second definition looks redundant. */
39+ #define VSEV_FLOAT RISCV_RVV(vse32_v_f32m2)
3840#else
39- #define VSETVL (n ) RISCV_RVV(vsetvl_e64m4)(n)
40- #define FLOAT_V_T vfloat64m4_t
41- #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
42- #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
43- #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
44- #define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
45- #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
46- #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
41+ #define VSETVL (n ) RISCV_RVV(vsetvl_e64m2)(n)
42+ #define FLOAT_V_T vfloat64m2_t
43+ #define VLEV_FLOAT RISCV_RVV(vle64_v_f64m2)
44+ #define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2)
45+ #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m2)
46+ #define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m2)
47+ #define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m2)
48+ #define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m2)
49+ #define VFMUL_VF_FLOAT RISCV_RVV(vfmul_vf_f64m2)
/* NOTE(review): same duplication as in the single-precision branch —
 * VSEV_FLOAT is defined twice with an identical expansion. */
50+ #define VSEV_FLOAT RISCV_RVV(vse64_v_f64m2)
4751#endif
4852
/*
 * CNAME: for each block of gvl rows, accumulates into y the products of the
 * columns of a with the corresponding entries of x scaled by the complex
 * factor (alpha_r, alpha_i).  Real and imaginary parts are read from adjacent
 * FLOATs (index j / j + 1 in a, ix / ix + 1 in x, iy / iy + 1 in y).  The
 * CONJ and XCONJ macros select the sign pattern of the multiply-accumulate
 * steps.
 *
 * Parameters as used in the visible lines:
 *   m, n             - row count (drives VSETVL and the block loop) and
 *                      column count (drives the column loops).
 *   alpha_r, alpha_i - real and imaginary part of the scaling factor.
 *   a, lda           - matrix data; consecutive columns are lda * 2 FLOATs
 *                      apart.
 *   x, inc_x         - input vector; consecutive entries are inc_x * 2 FLOATs
 *                      apart.
 *   y, inc_y         - vector that is read, updated and written back;
 *                      consecutive entries are inc_y * 2 FLOATs apart.
 *   dummy1, buffer   - not referenced in the visible lines.
 *
 * This text is a diff rendering.  Lines tagged "-" are the previous version,
 * lines tagged "+" the new one, and "@@" hunk headers mark places where part
 * of the function body is not shown (including whatever handles rows beyond
 * the m / gvl full blocks and the return statement).
 */
4953int CNAME (BLASLONG m , BLASLONG n , BLASLONG dummy1 , FLOAT alpha_r , FLOAT alpha_i , FLOAT * a , BLASLONG lda , FLOAT * x , BLASLONG inc_x , FLOAT * y , BLASLONG inc_y , FLOAT * buffer )
5054{
5155 BLASLONG i = 0 , j = 0 , k = 0 ;
5256 BLASLONG ix = 0 , iy = 0 ;
5357 FLOAT * a_ptr = a ;
54- FLOAT temp_r = 0.0 , temp_i = 0.0 ;
55- FLOAT_V_T va0 , va1 , vy0 , vy1 ;
/* NOTE(review): temp_r1..temp_i3 and temp_v1..temp_v4 are declared here but
 * are not referenced in any visible line — confirm against the elided hunks
 * whether they are unused. */
58+ FLOAT temp_r = 0.0 , temp_i = 0.0 , temp_r1 , temp_i1 , temp_r2 , temp_i2 , temp_r3 , temp_i3 , temp_rr [ 4 ] , temp_ii [ 4 ] ;
59+ FLOAT_V_T va0 , va1 , vy0 , vy1 , vy0_new , vy1_new , va2 , va3 , va4 , va5 , va6 , va7 , temp_iv , temp_rv , x_v0 , x_v1 , temp_v1 , temp_v2 , temp_v3 , temp_v4 ;
5660 unsigned int gvl = 0 ;
/* Byte strides for the strided loads/stores: one complex element of a is two
 * FLOATs; one step in y is inc_y complex elements. */
5761 BLASLONG stride_a = sizeof (FLOAT ) * 2 ;
5862 BLASLONG stride_y = inc_y * sizeof (FLOAT ) * 2 ;
/* gvl is the element count returned by VSETVL for m rows; it is the block
 * height used by every gvl-length vector operation below. */
5963 gvl = VSETVL (m );
/* Offsets, in FLOATs, between row blocks of y, entries of x and columns of a. */
6064 BLASLONG inc_yv = inc_y * gvl * 2 ;
6165 BLASLONG inc_x2 = inc_x * 2 ;
6266 BLASLONG lda2 = lda * 2 ;
/* New version: load the first block of y (real and imaginary parts) ahead of
 * the block loop so each iteration can start from an already loaded block. */
67+ vy0_new = VLSEV_FLOAT (& y [iy ], stride_y , gvl );
68+ vy1_new = VLSEV_FLOAT (& y [iy + 1 ], stride_y , gvl );
/* One iteration per full block of gvl rows.
 * NOTE(review): m / gvl is not guarded against gvl == 0 in the visible lines
 * — confirm this function is never reached with m == 0. */
6369 for (k = 0 ,j = 0 ; k < m /gvl ; k ++ ){
6470 a_ptr = a ;
6571 ix = 0 ;
66- vy0 = VLSEV_FLOAT (& y [iy ], stride_y , gvl );
67- vy1 = VLSEV_FLOAT (& y [iy + 1 ], stride_y , gvl );
68- for (i = 0 ; i < n ; i ++ ){
/* Take the block of y loaded earlier as the accumulator for this iteration. */
72+ vy0 = vy0_new ;
73+ vy1 = vy1_new ;
74+ // vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
75+ // vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
/* Unless this is the last full block, load the following block of y now,
 * before the column loops run.  The matching update of iy is not in the
 * visible lines; presumably it advances by inc_yv per block — verify. */
76+ if (k < m /gvl - 1 ){
77+ vy0_new = VLSEV_FLOAT (& y [iy + inc_yv ], stride_y , gvl );
78+ vy1_new = VLSEV_FLOAT (& y [iy + inc_yv + 1 ], stride_y , gvl );
79+ }
/* New version: only the first n % 4 columns go through this one-column loop,
 * so the column count left for the next loop is a multiple of four. */
80+ for (i = 0 ; i < n %4 ; i ++ ){
6981#if !defined(XCONJ )
/* (temp_r, temp_i) = (alpha_r, alpha_i) * (x[ix], x[ix + 1]) as a complex product. */
7082 temp_r = alpha_r * x [ix ] - alpha_i * x [ix + 1 ];
7183 temp_i = alpha_r * x [ix + 1 ] + alpha_i * x [ix ];
@@ -74,8 +86,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
7486 temp_i = alpha_r * x [ix + 1 ] - alpha_i * x [ix ];
7587#endif
7688
77- va0 = VLSEV_FLOAT (& a_ptr [j ], stride_a , gvl );
78- va1 = VLSEV_FLOAT (& a_ptr [j + 1 ], stride_a , gvl );
/* Real parts (va0) and imaginary parts (va1) of gvl rows of the current column. */
89+ va0 = VLSEV_FLOAT (& a_ptr [j ], stride_a , gvl );
90+ va1 = VLSEV_FLOAT (& a_ptr [j + 1 ], stride_a , gvl );
7991#if !defined(CONJ )
8092#if !defined(XCONJ )
8193 vy0 = VFMACCVF_FLOAT (vy0 , temp_r , va0 , gvl );
@@ -108,6 +120,144 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
/* Step to the next column of a and the next entry of x. */
108120 a_ptr += lda2 ;
109121 ix += inc_x2 ;
110122 }
123+
/* New version: process the remaining columns four at a time; i continues
 * from the value left by the loop above. */
124+ for (; i < n ; i += 4 ){
125+ #if !defined(XCONJ )
126+
127+
/* Load four consecutive entries of x (real parts in x_v0, imaginary parts in
 * x_v1), scale them by alpha with vector operations, and spill the four
 * results to temp_rr / temp_ii so they can be used as scalar operands below.
 * NOTE(review): the literal vector length 4 assumes FLOAT_V_T can hold at
 * least four elements; if fewer are processed, temp_rr / temp_ii stay partly
 * unwritten — confirm for the smallest supported vector register width. */
128+ x_v0 = VLSEV_FLOAT (& x [ix ], inc_x2 * sizeof (FLOAT ), 4 );
129+ x_v1 = VLSEV_FLOAT (& x [ix + 1 ], inc_x2 * sizeof (FLOAT ), 4 );
130+ temp_rv = VFMUL_VF_FLOAT (x_v0 , alpha_r , 4 );
131+ temp_iv = VFMUL_VF_FLOAT (x_v0 , alpha_i , 4 );
132+ temp_rv = VFNMSACVF_FLOAT (temp_rv , alpha_i , x_v1 , 4 );
133+ temp_iv = VFMACCVF_FLOAT (temp_iv , alpha_r , x_v1 , 4 );
134+ VSEV_FLOAT (& temp_rr [0 ],temp_rv , 4 );
135+ VSEV_FLOAT (& temp_ii [0 ],temp_iv , 4 );
136+
137+
138+ #else
/* XCONJ variant of the scaling above.
 * NOTE(review): assuming VFNMSACVF subtracts the product from its first
 * operand, temp_iv here evaluates to alpha_i * x_r - alpha_r * x_i.  The
 * scalar line visible just before the first "#endif" of the one-column loop
 * computes temp_i = alpha_r * x[ix + 1] - alpha_i * x[ix], i.e. the opposite
 * sign, while the XCONJ accumulate steps further down use the same sign
 * pattern for temp_ii in every column — confirm that the two paths agree. */
139+ x_v0 = VLSEV_FLOAT (& x [ix ], inc_x2 * sizeof (FLOAT ), 4 );
140+ x_v1 = VLSEV_FLOAT (& x [ix + 1 ], inc_x2 * sizeof (FLOAT ), 4 );
141+ temp_rv = VFMUL_VF_FLOAT (x_v0 , alpha_r , 4 );
142+ temp_iv = VFMUL_VF_FLOAT (x_v0 , alpha_i , 4 );
143+ temp_rv = VFMACCVF_FLOAT (temp_rv , alpha_i , x_v1 , 4 );
144+ temp_iv = VFNMSACVF_FLOAT (temp_iv , alpha_r , x_v1 , 4 );
145+ VSEV_FLOAT (& temp_rr [0 ],temp_rv , 4 );
146+ VSEV_FLOAT (& temp_ii [0 ],temp_iv , 4 );
147+
148+ #endif
149+
/* Load gvl rows of four consecutive columns of a: va0/va1, va2/va3, va4/va5
 * and va6/va7 are the real/imaginary parts of columns 0..3 of this group,
 * each column lda2 FLOATs after the previous one. */
150+ va0 = VLSEV_FLOAT (& a_ptr [j ], stride_a , gvl );
151+ va1 = VLSEV_FLOAT (& a_ptr [j + 1 ], stride_a , gvl );
152+ va2 = VLSEV_FLOAT (& a_ptr [j + lda2 ], stride_a , gvl );
153+ va3 = VLSEV_FLOAT (& a_ptr [j + lda2 + 1 ], stride_a , gvl );
154+ va4 = VLSEV_FLOAT (& a_ptr [j + lda2 * 2 ], stride_a , gvl );
155+ va5 = VLSEV_FLOAT (& a_ptr [j + lda2 * 2 + 1 ], stride_a , gvl );
156+ va6 = VLSEV_FLOAT (& a_ptr [j + lda2 * 3 ], stride_a , gvl );
157+ va7 = VLSEV_FLOAT (& a_ptr [j + lda2 * 3 + 1 ], stride_a , gvl );
158+
159+
/* Accumulate the four columns into vy0 (real) and vy1 (imaginary).  Each of
 * the four CONJ / XCONJ combinations below repeats one four-line pattern per
 * column; only the choice between VFMACCVF and VFNMSACVF differs. */
160+ #if !defined(CONJ )
161+ #if !defined(XCONJ )
162+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [0 ], va0 , gvl );
163+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [0 ], va1 , gvl );
164+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [0 ], va1 , gvl );
165+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [0 ], va0 , gvl );
166+
167+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [1 ], va2 , gvl );
168+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [1 ], va3 , gvl );
169+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [1 ], va3 , gvl );
170+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [1 ], va2 , gvl );
171+
172+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [2 ], va4 , gvl );
173+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [2 ], va5 , gvl );
174+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [2 ], va5 , gvl );
175+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [2 ], va4 , gvl );
176+
177+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [3 ], va6 , gvl );
178+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [3 ], va7 , gvl );
179+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [3 ], va7 , gvl );
180+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [3 ], va6 , gvl );
181+
182+
183+ #else
184+
185+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [0 ], va0 , gvl );
186+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [0 ], va1 , gvl );
187+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [0 ], va1 , gvl );
188+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [0 ], va0 , gvl );
189+
190+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [1 ], va2 , gvl );
191+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [1 ], va3 , gvl );
192+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [1 ], va3 , gvl );
193+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [1 ], va2 , gvl );
194+
195+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [2 ], va4 , gvl );
196+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [2 ], va5 , gvl );
197+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [2 ], va5 , gvl );
198+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [2 ], va4 , gvl );
199+
200+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [3 ], va6 , gvl );
201+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [3 ], va7 , gvl );
202+ vy1 = VFMACCVF_FLOAT (vy1 , temp_rr [3 ], va7 , gvl );
203+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [3 ], va6 , gvl );
204+
205+
206+ #endif
207+
208+ #else
209+
210+ #if !defined(XCONJ )
211+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [0 ], va0 , gvl );
212+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [0 ], va1 , gvl );
213+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [0 ], va1 , gvl );
214+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [0 ], va0 , gvl );
215+
216+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [1 ], va2 , gvl );
217+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [1 ], va3 , gvl );
218+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [1 ], va3 , gvl );
219+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [1 ], va2 , gvl );
220+
221+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [2 ], va4 , gvl );
222+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [2 ], va5 , gvl );
223+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [2 ], va5 , gvl );
224+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [2 ], va4 , gvl );
225+
226+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [3 ], va6 , gvl );
227+ vy0 = VFMACCVF_FLOAT (vy0 , temp_ii [3 ], va7 , gvl );
228+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [3 ], va7 , gvl );
229+ vy1 = VFMACCVF_FLOAT (vy1 , temp_ii [3 ], va6 , gvl );
230+
231+
232+ #else
233+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [0 ], va0 , gvl );
234+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [0 ], va1 , gvl );
235+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [0 ], va1 , gvl );
236+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [0 ], va0 , gvl );
237+
238+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [1 ], va2 , gvl );
239+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [1 ], va3 , gvl );
240+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [1 ], va3 , gvl );
241+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [1 ], va2 , gvl );
242+
243+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [2 ], va4 , gvl );
244+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [2 ], va5 , gvl );
245+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [2 ], va5 , gvl );
246+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [2 ], va4 , gvl );
247+
248+ vy0 = VFMACCVF_FLOAT (vy0 , temp_rr [3 ], va6 , gvl );
249+ vy0 = VFNMSACVF_FLOAT (vy0 , temp_ii [3 ], va7 , gvl );
250+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_rr [3 ], va7 , gvl );
251+ vy1 = VFNMSACVF_FLOAT (vy1 , temp_ii [3 ], va6 , gvl );
252+
253+ #endif
254+
255+ #endif
/* Step past the four columns of a and the four entries of x just consumed. */
256+ a_ptr += lda2 * 4 ;
257+ ix += inc_x2 * 4 ;
258+ }
259+
260+
/* Write the accumulated block back to y and move j to the next block of rows. */
111261 VSSEV_FLOAT (& y [iy ], stride_y , vy0 , gvl );
112262 VSSEV_FLOAT (& y [iy + 1 ], stride_y , vy1 , gvl );
113263 j += gvl * 2 ;
@@ -171,3 +321,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
171321}
172322
173323
324+
0 commit comments