@@ -2214,67 +2214,37 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, F
22142214
22152215 FLOAT * C2 = C ;
22162216
2217- vfloat32m2_t c00 ;
2218- c00 = __riscv_vle32_v_f32m2 (C , 16 ); C += ldc ;
2219- vfloat32m1_t c0 = __riscv_vget_v_f32m2_f32m1 (c00 , 0 );
2220- vfloat32m1_t c1 = __riscv_vget_v_f32m2_f32m1 (c00 , 1 );
2221- c00 = __riscv_vle32_v_f32m2 (C , 16 ); C += ldc ;
2222- vfloat32m1_t c2 = __riscv_vget_v_f32m2_f32m1 (c00 , 0 );
2223- vfloat32m1_t c3 = __riscv_vget_v_f32m2_f32m1 (c00 , 1 );
2224- c00 = __riscv_vle32_v_f32m2 (C , 16 ); C += ldc ;
2225- vfloat32m1_t c4 = __riscv_vget_v_f32m2_f32m1 (c00 , 0 );
2226- vfloat32m1_t c5 = __riscv_vget_v_f32m2_f32m1 (c00 , 1 );
2227- c00 = __riscv_vle32_v_f32m2 (C , 16 ); C += ldc ;
2228- vfloat32m1_t c6 = __riscv_vget_v_f32m2_f32m1 (c00 , 0 );
2229- vfloat32m1_t c7 = __riscv_vget_v_f32m2_f32m1 (c00 , 1 );
2230- c00 = __riscv_vle32_v_f32m2 (C , 16 ); C += ldc ;
2231- vfloat32m1_t c8 = __riscv_vget_v_f32m2_f32m1 (c00 , 0 );
2232- vfloat32m1_t c9 = __riscv_vget_v_f32m2_f32m1 (c00 , 1 );
2233- c00 = __riscv_vle32_v_f32m2 (C , 16 ); C += ldc ;
2234- vfloat32m1_t c10 = __riscv_vget_v_f32m2_f32m1 (c00 , 0 );
2235- vfloat32m1_t c11 = __riscv_vget_v_f32m2_f32m1 (c00 , 1 );
2236- c00 = __riscv_vle32_v_f32m2 (C , 16 ); C += ldc ;
2237- vfloat32m1_t c12 = __riscv_vget_v_f32m2_f32m1 (c00 , 0 );
2238- vfloat32m1_t c13 = __riscv_vget_v_f32m2_f32m1 (c00 , 1 );
2239- c00 = __riscv_vle32_v_f32m2 (C , 16 );
2240- vfloat32m1_t c14 = __riscv_vget_v_f32m2_f32m1 (c00 , 0 );
2241- vfloat32m1_t c15 = __riscv_vget_v_f32m2_f32m1 (c00 , 1 );
2242-
2243- c0 = __riscv_vfmacc_vf_f32m1 ( c0 , alpha , result0 , 8 );
2244- c1 = __riscv_vfmacc_vf_f32m1 ( c1 , alpha , result1 , 8 );
2245- c2 = __riscv_vfmacc_vf_f32m1 ( c2 , alpha , result2 , 8 );
2246- c3 = __riscv_vfmacc_vf_f32m1 ( c3 , alpha , result3 , 8 );
2247- c4 = __riscv_vfmacc_vf_f32m1 ( c4 , alpha , result4 , 8 );
2248- c5 = __riscv_vfmacc_vf_f32m1 ( c5 , alpha , result5 , 8 );
2249- c6 = __riscv_vfmacc_vf_f32m1 ( c6 , alpha , result6 , 8 );
2250- c7 = __riscv_vfmacc_vf_f32m1 ( c7 , alpha , result7 , 8 );
2251- c8 = __riscv_vfmacc_vf_f32m1 ( c8 , alpha , result8 , 8 );
2252- c9 = __riscv_vfmacc_vf_f32m1 ( c9 , alpha , result9 , 8 );
2253- c10 = __riscv_vfmacc_vf_f32m1 ( c10 , alpha , result10 , 8 );
2254- c11 = __riscv_vfmacc_vf_f32m1 ( c11 , alpha , result11 , 8 );
2255- c12 = __riscv_vfmacc_vf_f32m1 ( c12 , alpha , result12 , 8 );
2256- c13 = __riscv_vfmacc_vf_f32m1 ( c13 , alpha , result13 , 8 );
2257- c14 = __riscv_vfmacc_vf_f32m1 ( c14 , alpha , result14 , 8 );
2258- c15 = __riscv_vfmacc_vf_f32m1 ( c15 , alpha , result15 , 8 );
2217+ vfloat32m2_t c01 = __riscv_vle32_v_f32m2 (C , 16 ); C += ldc ;
2218+ vfloat32m2_t c23 = __riscv_vle32_v_f32m2 (C , 16 ); C += ldc ;
2219+ vfloat32m2_t c45 = __riscv_vle32_v_f32m2 (C , 16 ); C += ldc ;
2220+ vfloat32m2_t c67 = __riscv_vle32_v_f32m2 (C , 16 ); C += ldc ;
2221+
2222+ c01 = __riscv_vfmacc_vf_f32m2 ( c01 , alpha , result01 , 16 );
2223+ c23 = __riscv_vfmacc_vf_f32m2 ( c23 , alpha , result23 , 16 );
2224+ c45 = __riscv_vfmacc_vf_f32m2 ( c45 , alpha , result45 , 16 );
2225+ c67 = __riscv_vfmacc_vf_f32m2 ( c67 , alpha , result67 , 16 );
2226+
2227+ vfloat32m2_t c89 = __riscv_vle32_v_f32m2 (C , 16 ); C += ldc ;
2228+ vfloat32m2_t cAB = __riscv_vle32_v_f32m2 (C , 16 ); C += ldc ;
2229+ vfloat32m2_t cCD = __riscv_vle32_v_f32m2 (C , 16 ); C += ldc ;
2230+ vfloat32m2_t cEF = __riscv_vle32_v_f32m2 (C , 16 );
2231+
2232+ c89 = __riscv_vfmacc_vf_f32m2 ( c89 , alpha , result89 , 16 );
2233+ cAB = __riscv_vfmacc_vf_f32m2 ( cAB , alpha , resultAB , 16 );
2234+ cCD = __riscv_vfmacc_vf_f32m2 ( cCD , alpha , resultCD , 16 );
2235+ cEF = __riscv_vfmacc_vf_f32m2 ( cEF , alpha , resultEF , 16 );
22592236
22602237 C = C2 ;
22612238
2262- c00 = __riscv_vcreate_v_f32m1_f32m2 (c0 , c1 );
2263- __riscv_vse32_v_f32m2 (C , c00 , 16 ); C += ldc ;
2264- c00 = __riscv_vcreate_v_f32m1_f32m2 (c2 , c3 );
2265- __riscv_vse32_v_f32m2 (C , c00 , 16 ); C += ldc ;
2266- c00 = __riscv_vcreate_v_f32m1_f32m2 (c4 , c5 );
2267- __riscv_vse32_v_f32m2 (C , c00 , 16 ); C += ldc ;
2268- c00 = __riscv_vcreate_v_f32m1_f32m2 (c6 , c7 );
2269- __riscv_vse32_v_f32m2 (C , c00 , 16 ); C += ldc ;
2270- c00 = __riscv_vcreate_v_f32m1_f32m2 (c8 , c9 );
2271- __riscv_vse32_v_f32m2 (C , c00 , 16 ); C += ldc ;
2272- c00 = __riscv_vcreate_v_f32m1_f32m2 (c10 , c11 );
2273- __riscv_vse32_v_f32m2 (C , c00 , 16 ); C += ldc ;
2274- c00 = __riscv_vcreate_v_f32m1_f32m2 (c12 , c13 );
2275- __riscv_vse32_v_f32m2 (C , c00 , 16 ); C += ldc ;
2276- c00 = __riscv_vcreate_v_f32m1_f32m2 (c14 , c15 );
2277- __riscv_vse32_v_f32m2 (C , c00 , 16 );
2239+ __riscv_vse32_v_f32m2 (C , c01 , 16 ); C += ldc ;
2240+ __riscv_vse32_v_f32m2 (C , c23 , 16 ); C += ldc ;
2241+ __riscv_vse32_v_f32m2 (C , c45 , 16 ); C += ldc ;
2242+ __riscv_vse32_v_f32m2 (C , c67 , 16 ); C += ldc ;
2243+ __riscv_vse32_v_f32m2 (C , c89 , 16 ); C += ldc ;
2244+ __riscv_vse32_v_f32m2 (C , cAB , 16 ); C += ldc ;
2245+ __riscv_vse32_v_f32m2 (C , cCD , 16 ); C += ldc ;
2246+ __riscv_vse32_v_f32m2 (C , cEF , 16 );
2247+
22782248 C = C2 + 16 ;
22792249 }
22802250
0 commit comments