@@ -36,12 +36,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSSEV_FLOAT __riscv_vsse32_v_f32m4
 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu
 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
 #define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu
 #else
 #define VSETVL(n) __riscv_vsetvl_e64m4(n)
 #define VSETVL_MAX __riscv_vsetvlmax_e64m1()
@@ -52,12 +54,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSSEV_FLOAT __riscv_vsse64_v_f64m4
 #define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
 #define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu
 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
 #define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
 #define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
 #define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu
 #endif

 int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
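Note on the new macros: the `_TU` variants map to the tail-undisturbed (`_tu`) forms of the RVV fused multiply-add intrinsics. Under the default tail-agnostic policy, destination lanes at or beyond `vl` may be clobbered; the `_tu` policy guarantees they keep their previous contents, which is what lets the remainder iteration in the hunk below accumulate into the same vr0/vr1 registers the main loop filled. A minimal standalone sketch of the pattern, assuming a plain real-valued dot product rather than this HEMV kernel (the function name is illustrative):

#include <riscv_vector.h>
#include <stddef.h>

/* Hypothetical standalone example, not part of the kernel: accumulate a
 * dot product across the full-width main loop and a shorter tail
 * iteration in one vector register, then reduce once at the end. */
static float dot_tail_undisturbed(const float *x, const float *y, size_t n)
{
    size_t vlmax = __riscv_vsetvlmax_e32m4();
    vfloat32m4_t acc = __riscv_vfmv_v_f_f32m4(0.0f, vlmax);

    size_t i = 0;
    for (; i + vlmax <= n; i += vlmax) {
        vfloat32m4_t vx = __riscv_vle32_v_f32m4(&x[i], vlmax);
        vfloat32m4_t vy = __riscv_vle32_v_f32m4(&y[i], vlmax);
        acc = __riscv_vfmacc_vv_f32m4(acc, vx, vy, vlmax);
    }
    if (i < n) {
        size_t vl = __riscv_vsetvl_e32m4(n - i);  /* vl < vlmax */
        vfloat32m4_t vx = __riscv_vle32_v_f32m4(&x[i], vl);
        vfloat32m4_t vy = __riscv_vle32_v_f32m4(&y[i], vl);
        /* _tu: lanes [vl, vlmax) of acc keep the main-loop partial sums */
        acc = __riscv_vfmacc_vv_f32m4_tu(acc, vx, vy, vl);
    }
    /* one reduction over all vlmax lanes covers main loop and tail */
    vfloat32m1_t zero = __riscv_vfmv_v_f_f32m1(0.0f, __riscv_vsetvlmax_e32m1());
    vfloat32m1_t sum = __riscv_vfredusum_vs_f32m4_f32m1(acc, zero, vlmax);
    return __riscv_vfmv_f_s_f32m1_f32(sum);
}

The key point is that the final reduction runs over all vlmax lanes, so the high lanes preserved by the `_tu` policy still contribute to the sum.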
@@ -143,49 +147,45 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
                 iy += inc_yv;
                 ia += inc_av;
             }
-            v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
-            temp_r2 = VFMVFS_FLOAT(v_res);
-            v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
-            temp_i2 = VFMVFS_FLOAT(v_res);
+
             if(i < m){
-                gvl = VSETVL(m-i);
-                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl);
-                va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl);
-                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
-                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+                unsigned int gvl_rem = VSETVL(m-i);
+                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl_rem);
+                va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl_rem);
+                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl_rem);
+                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl_rem);
 #ifndef HEMVREV
-                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
-                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl);
-                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl);
-                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem);
+                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl_rem);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl_rem);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem);
 #else
-                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
-                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl);
-                vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl);
-                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem);
+                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl_rem);
+                vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl_rem);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem);
 #endif
-                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
-                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
+                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl_rem);
+                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl_rem);

-                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl_rem);
+                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl_rem);
 #ifndef HEMVREV
-                vr0 = VFMULVV_FLOAT(vx0, va0, gvl);
-                vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl);
-                vr1 = VFMULVV_FLOAT(vx1, va0, gvl);
-                vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl);
+                vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem);
+                vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, va1, gvl_rem);
+                vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem);
+                vr1 = VFNMSACVV_FLOAT_TU(vr1, vx0, va1, gvl_rem);
 #else
-                vr0 = VFMULVV_FLOAT(vx0, va0, gvl);
-                vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl);
-                vr1 = VFMULVV_FLOAT(vx1, va0, gvl);
-                vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
+                vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem);
+                vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, va1, gvl_rem);
+                vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem);
+                vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, va1, gvl_rem);
 #endif
-
-                v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
-                temp_r2 += VFMVFS_FLOAT(v_res);
-                v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
-                temp_i2 += VFMVFS_FLOAT(v_res);
             }
+            v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
+            temp_r2 = VFMVFS_FLOAT(v_res);
+            v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
+            temp_i2 = VFMVFS_FLOAT(v_res);
         }
         y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2;
         y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2;
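Net effect of the last hunk: before this change the kernel reduced vr0/vr1 into temp_r2/temp_i2 ahead of the remainder block and then ran a second pair of VFREDSUM reductions inside it, accumulating with `+=`. With the tail-undisturbed forms, the remainder folds its products directly into vr0/vr1 (the VFMULVV initialisation goes away, since those registers already hold the main-loop partial sums), so a single pair of reductions after the `if` suffices. This relies on two conditions: vr0/vr1 must be zero-initialised across all gvl lanes before the main loop, and the final VFREDSUM must run at the full gvl rather than gvl_rem, so the lanes preserved by the `_tu` policy are included in the sum.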