@@ -98,6 +98,30 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
9898 rowC = (v2sf_t *) &CO[7* ldc+J]; \
9999 rowC[0] += result[6] * alpha;
100100
/*
 * SAVE4x2_ACC_SCALAR(ACC)
 *
 * Unpacks the MMA accumulator pointed to by ACC into result[0..3], scales
 * each of the four vectors by alpha into res[0..3], and adds lane 0 of
 * res[i] to CO[i * ldc] for i = 0..3.  Only one scalar per ldc-stride is
 * written; lanes 1..3 of every vector are discarded.
 *
 * The macro relies on names from the expansion site rather than parameters:
 *   result, res - arrays of at least four 4-float vectors (scratch)
 *   alpha       - scalar multiplier
 *   CO, ldc     - output pointer and stride between successive stores
 *
 * NOTE(review): presumably this exists so that a one-element-wide tail never
 * stores a full vector past the valid element of C -- confirm against the
 * vector SAVE4x2_ACC variant.
 *
 * There must be no whitespace between the macro name and '(' in the
 * definition, otherwise the preprocessor treats it as an object-like macro.
 * The do { } while (0) wrapper makes the expansion a single statement, so
 * "SAVE4x2_ACC_SCALAR (&acc);" is safe in an unbraced if/else.
 */
#define SAVE4x2_ACC_SCALAR(ACC) \
  do { \
    __builtin_mma_disassemble_acc ((void *) result, (ACC)); \
    res[0] = result[0] * alpha; \
    res[1] = result[1] * alpha; \
    res[2] = result[2] * alpha; \
    res[3] = result[3] * alpha; \
    CO[0 * ldc] += res[0][0]; \
    CO[1 * ldc] += res[1][0]; \
    CO[2 * ldc] += res[2][0]; \
    CO[3 * ldc] += res[3][0]; \
  } while (0)
112+
/*
 * SAVE4x2_ACC1_SCALAR(ACC)
 *
 * Unpacks the MMA accumulator pointed to by ACC into result[0..3], scales
 * each of the four vectors by alpha into res[0..3], and adds lane 0 of
 * res[i] to CO[(4 + i) * ldc] for i = 0..3.  Only one scalar per ldc-stride
 * is written; lanes 1..3 of every vector are discarded.
 *
 * The macro relies on names from the expansion site rather than parameters:
 *   result, res - arrays of at least four 4-float vectors (scratch)
 *   alpha       - scalar multiplier
 *   CO, ldc     - output pointer and stride between successive stores
 *
 * NOTE(review): presumably this is the second-accumulator half of an
 * eight-stride scalar tail store (strides 4..7) -- confirm against the
 * call site.
 *
 * There must be no whitespace between the macro name and '(' in the
 * definition, otherwise the preprocessor treats it as an object-like macro.
 * The do { } while (0) wrapper makes the expansion a single statement, so
 * "SAVE4x2_ACC1_SCALAR (&acc);" is safe in an unbraced if/else.
 */
#define SAVE4x2_ACC1_SCALAR(ACC) \
  do { \
    __builtin_mma_disassemble_acc ((void *) result, (ACC)); \
    res[0] = result[0] * alpha; \
    res[1] = result[1] * alpha; \
    res[2] = result[2] * alpha; \
    res[3] = result[3] * alpha; \
    CO[4 * ldc] += res[0][0]; \
    CO[5 * ldc] += res[1][0]; \
    CO[6 * ldc] += res[2][0]; \
    CO[7 * ldc] += res[3][0]; \
  } while (0)
124+
/* MMA is a short alias for the compiler builtin __builtin_mma_xvbf16ger2pp;
   the kernel bodies invoke it as MMA (&acc, b_vector, a_vector). */
101125#define MMA __builtin_mma_xvbf16ger2pp
102126
103127#define SAVE2x4_ACC (ACC , J ) \
@@ -313,7 +337,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A,
313337 {
314338 IFLOAT * BO = B ;
315339 v2sf_t * rowC ;
316- v2sf_t result [8 ];
340+ v4sf_t result [4 ], res [ 4 ];
317341 __vector_quad acc0 , acc1 ;
318342 __builtin_mma_xxsetaccz (& acc0 );
319343 __builtin_mma_xxsetaccz (& acc1 );
@@ -335,8 +359,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A,
335359 MMA (& acc0 , MERGE_HIGH (rowB [0 ], vzero ), (vec_t ) rowA );
336360 MMA (& acc1 , MERGE_LOW (rowB [0 ], vzero ), (vec_t ) rowA );
337361 }
338- SAVE4x2_ACC (& acc0 , 0 );
339- SAVE4x2_ACC1 (& acc1 , 0 );
362+ SAVE4x2_ACC_SCALAR (& acc0 );
363+ SAVE4x2_ACC1_SCALAR (& acc1 );
340364 CO += 1 ;
341365 AO += k ;
342366 BO += (k << 3 );
@@ -547,7 +571,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A,
547571 {
548572 IFLOAT * BO = B ;
549573 v2sf_t * rowC ;
550- v2sf_t result [8 ];
574+ v4sf_t result [4 ], res [ 4 ];
551575 __vector_quad acc0 ;
552576 BLASLONG l = 0 ;
553577 __builtin_mma_xxsetaccz (& acc0 );
@@ -571,7 +595,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A,
571595 };
572596 MMA (& acc0 , (vec_t )(rowB_mrg ), (vec_t ) rowA );
573597 }
574- SAVE4x2_ACC (& acc0 , 0 );
598+ SAVE4x2_ACC_SCALAR (& acc0 );
575599 AO += k ;
576600 BO += (k << 2 );
577601 CO += 1 ;
0 commit comments