@@ -671,6 +671,7 @@ static void FORCEINLINE M_TAIL_ONE(BLASLONG K, const BLASLONG M, const BLASLONG
671671 c18 = __riscv_vfmacc_vf_f32m8 (c18 , alpha , c28 , N );
672672 __riscv_vse32_v_f32m8 (C , c18 , N * 8 );
673673 } else {
674+ // Can swap A and B and remove transpose when compilers get better
674675 FLOAT temp [8 * 8 ];
675676 vfloat32m1x8_t c28 = __riscv_vcreate_v_f32m1x8 (result0 , result1 , result2 , result3 , result4 , result5 , result6 , result7 );
676677 __riscv_vsseg8e32_v_f32m1x8 (temp , c28 , N );
@@ -1149,6 +1150,7 @@ static void FORCEINLINE M_TAIL_ONE(BLASLONG K, const BLASLONG M, const BLASLONG
11491150 c14 = __riscv_vfmacc_vf_f32m4 (c14 , alpha , c24 , 4 * 8 );
11501151 __riscv_vse32_v_f32m4 (C , c14 , 4 * 8 );
11511152 } else {
1153+ // Can swap A and B and remove transpose when compilers get better
11521154 FLOAT temp [8 * 4 ];
11531155 vfloat32mf2x8_t c18 = __riscv_vcreate_v_f32mf2x8 (result0 , result1 , result2 , result3 , result4 , result5 , result6 , result7 );
11541156 __riscv_vsseg8e32_v_f32mf2x8 (temp , c18 , 4 );
@@ -1503,6 +1505,12 @@ static void FORCEINLINE N_TAIL_ONE(BLASLONG K, BLASLONG M, const BLASLONG N, FLO
15031505 if (N & 1 ) {
15041506 B04 = B + ((N & 6 ) * K );
15051507 }
1508+ #endif
1509+ #ifdef GEMM_BOTTOM_CHUNK
1510+ FLOAT K2 ;
1511+ if (N <= 4 ) {
1512+ K2 = K ;
1513+ }
15061514#endif
15071515 do {
15081516 FLOAT B0 , B1 , B2 , B3 , B4 , B5 , B6 ;
@@ -1530,6 +1538,9 @@ static void FORCEINLINE N_TAIL_ONE(BLASLONG K, BLASLONG M, const BLASLONG N, FLO
15301538 vfloat32m1_t A2 , A3 , A4 , A5 , A6 , A7 ;
15311539 vfloat32m1_t resultE , resultF ;
15321540 FLOAT B7 ;
1541+ if (N <= 4 ) {
1542+ K = K2 ;
1543+ }
15331544
15341545 if (N == 1 ) {
15351546 if (K >= 8 ) {
0 commit comments