@@ -29,7 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2929
/* vec_t: raw vector of unsigned chars; the kernel casts the A and B panel
   pointers to vec_t * before handing them to the pair/MMA builtins. */
3030typedef __vector unsigned char vec_t ;
/* v4sf_t: GCC vector of FLOAT elements occupying 16 bytes in total; used
   for the `result` scratch arrays declared alongside the accumulators. */
3131typedef FLOAT v4sf_t __attribute__ ((vector_size (16 )));
32- typedef FLOAT v2sf_t __attribute__ ((vector_size (8 )));
/* Compatibility shim: when the compiler does not report a
   __builtin_vsx_assemble_pair builtin, map that name onto
   __builtin_mma_assemble_pair so the kernel below can use the vsx
   spelling unconditionally.
   NOTE(review): presumably targets compilers that only ship the
   __builtin_mma_* spelling -- confirm against the supported toolchains. */
32+ #if !__has_builtin (__builtin_vsx_assemble_pair )
33+ #define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
34+ #endif
35+
/* Same fallback for the disassemble counterpart. */
36+ #if !__has_builtin (__builtin_vsx_disassemble_pair )
37+ #define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair
38+ #endif
3339
3440#ifdef TRMMKERNEL
3541#define SAVE_ACC (ACC , J ) \
@@ -186,8 +192,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
186192 vec_t * rowA = (vec_t * ) & AO [0 ];
187193 vec_t * rb = (vec_t * ) & BO [0 ];
188194 __vector_pair rowB , rowB1 ;
189- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
190- __builtin_mma_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
195+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
196+ __builtin_vsx_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
191197 __builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
192198 __builtin_mma_xvf64ger (& acc1 , rowB1 , rowA [0 ]);
193199 __builtin_mma_xvf64ger (& acc2 , rowB , rowA [1 ]);
@@ -200,8 +206,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
200206 {
201207 rowA = (vec_t * ) & AO [l << 3 ];
202208 rb = (vec_t * ) & BO [l << 3 ];
203- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
204- __builtin_mma_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
209+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
210+ __builtin_vsx_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
205211 __builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
206212 __builtin_mma_xvf64gerpp (& acc1 , rowB1 , rowA [0 ]);
207213 __builtin_mma_xvf64gerpp (& acc2 , rowB , rowA [1 ]);
@@ -242,8 +248,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
242248 vec_t * rowA = (vec_t * ) & AO [0 ];
243249 __vector_pair rowB , rowB1 ;
244250 vec_t * rb = (vec_t * ) & BO [0 ];
245- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
246- __builtin_mma_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
251+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
252+ __builtin_vsx_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
247253 __builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
248254 __builtin_mma_xvf64ger (& acc1 , rowB1 , rowA [0 ]);
249255 __builtin_mma_xvf64ger (& acc2 , rowB , rowA [1 ]);
@@ -252,8 +258,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
252258 {
253259 rowA = (vec_t * ) & AO [l << 2 ];
254260 rb = (vec_t * ) & BO [l << 3 ];
255- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
256- __builtin_mma_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
261+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
262+ __builtin_vsx_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
257263 __builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
258264 __builtin_mma_xvf64gerpp (& acc1 , rowB1 , rowA [0 ]);
259265 __builtin_mma_xvf64gerpp (& acc2 , rowB , rowA [1 ]);
@@ -286,16 +292,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
286292 vec_t * rowA = (vec_t * ) & AO [0 ];
287293 __vector_pair rowB , rowB1 ;
288294 vec_t * rb = (vec_t * ) & BO [0 ];
289- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
290- __builtin_mma_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
295+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
296+ __builtin_vsx_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
291297 __builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
292298 __builtin_mma_xvf64ger (& acc1 , rowB1 , rowA [0 ]);
293299 for (l = 1 ; l < temp ; l ++ )
294300 {
295301 rowA = (vec_t * ) & AO [l << 1 ];
296302 rb = (vec_t * ) & BO [l << 3 ];
297- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
298- __builtin_mma_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
303+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
304+ __builtin_vsx_assemble_pair (& rowB1 , rb [3 ], rb [2 ]);
299305 __builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
300306 __builtin_mma_xvf64gerpp (& acc1 , rowB1 , rowA [0 ]);
301307 }
@@ -398,7 +404,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
398404 vec_t * rowA = (vec_t * ) & AO [0 ];
399405 __vector_pair rowB ;
400406 vec_t * rb = (vec_t * ) & BO [0 ];
401- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
407+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
402408 __builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
403409 __builtin_mma_xvf64ger (& acc1 , rowB , rowA [1 ]);
404410 __builtin_mma_xvf64ger (& acc2 , rowB , rowA [2 ]);
@@ -407,7 +413,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
407413 {
408414 rowA = (vec_t * ) & AO [l << 3 ];
409415 rb = (vec_t * ) & BO [l << 2 ];
410- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
416+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
411417 __builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
412418 __builtin_mma_xvf64gerpp (& acc1 , rowB , rowA [1 ]);
413419 __builtin_mma_xvf64gerpp (& acc2 , rowB , rowA [2 ]);
@@ -440,14 +446,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
440446 vec_t * rowA = (vec_t * ) & AO [0 ];
441447 __vector_pair rowB ;
442448 vec_t * rb = (vec_t * ) & BO [0 ];
443- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
449+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
444450 __builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
445451 __builtin_mma_xvf64ger (& acc1 , rowB , rowA [1 ]);
446452 for (l = 1 ; l < temp ; l ++ )
447453 {
448454 rowA = (vec_t * ) & AO [l << 2 ];
449455 rb = (vec_t * ) & BO [l << 2 ];
450- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
456+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
451457 __builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
452458 __builtin_mma_xvf64gerpp (& acc1 , rowB , rowA [1 ]);
453459 }
@@ -476,13 +482,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
476482 vec_t * rowA = (vec_t * ) & AO [0 ];
477483 __vector_pair rowB ;
478484 vec_t * rb = (vec_t * ) & BO [0 ];
479- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
485+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
480486 __builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
481487 for (l = 1 ; l < temp ; l ++ )
482488 {
483489 rowA = (vec_t * ) & AO [l << 1 ];
484490 rb = (vec_t * ) & BO [l << 2 ];
485- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
491+ __builtin_vsx_assemble_pair (& rowB , rb [1 ], rb [0 ]);
486492 __builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
487493 }
488494 SAVE_ACC (& acc0 , 0 );
@@ -562,21 +568,18 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
562568 v4sf_t result [4 ];
563569 __vector_quad acc0 , acc1 , acc2 , acc3 ;
564570 BLASLONG l = 0 ;
565- FLOAT t [4 ] = { 0 , 0 , 0 , 0 };
566- t [0 ] = BO [0 ], t [1 ] = BO [1 ];
567571 __vector_pair rowB ;
568- vec_t * rb = (vec_t * ) & t [0 ];
569- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
572+ vec_t * rb = (vec_t * ) & BO [0 ];
573+ __builtin_vsx_assemble_pair (& rowB , rb [0 ], rb [0 ]);
570574 vec_t * rowA = (vec_t * ) & AO [0 ];
571575 __builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
572576 __builtin_mma_xvf64ger (& acc1 , rowB , rowA [1 ]);
573577 __builtin_mma_xvf64ger (& acc2 , rowB , rowA [2 ]);
574578 __builtin_mma_xvf64ger (& acc3 , rowB , rowA [3 ]);
575579 for (l = 1 ; l < temp ; l ++ )
576580 {
577- t [0 ] = BO [l << 1 ], t [1 ] = BO [(l << 1 ) + 1 ];
578- rb = (vec_t * ) & t [0 ];
579- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
581+ rb = (vec_t * ) & BO [l << 1 ];
582+ __builtin_vsx_assemble_pair (& rowB , rb [0 ], rb [0 ]);
580583 rowA = (vec_t * ) & AO [l << 3 ];
581584 __builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
582585 __builtin_mma_xvf64gerpp (& acc1 , rowB , rowA [1 ]);
@@ -607,19 +610,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
607610 v4sf_t result [4 ];
608611 __vector_quad acc0 , acc1 ;
609612 BLASLONG l = 0 ;
610- FLOAT t [4 ] = { 0 , 0 , 0 , 0 };
611- t [0 ] = BO [0 ], t [1 ] = BO [1 ];
612613 __vector_pair rowB ;
613- vec_t * rb = (vec_t * ) & t [0 ];
614- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
614+ vec_t * rb = (vec_t * ) & BO [0 ];
615+ __builtin_vsx_assemble_pair (& rowB , rb [0 ], rb [0 ]);
615616 vec_t * rowA = (vec_t * ) & AO [0 ];
616617 __builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
617618 __builtin_mma_xvf64ger (& acc1 , rowB , rowA [1 ]);
618619 for (l = 1 ; l < temp ; l ++ )
619620 {
620- t [0 ] = BO [l << 1 ], t [1 ] = BO [(l << 1 ) + 1 ];
621- rb = (vec_t * ) & t [0 ];
622- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
621+ rb = (vec_t * ) & BO [l << 1 ];
622+ __builtin_vsx_assemble_pair (& rowB , rb [0 ], rb [0 ]);
623623 rowA = (vec_t * ) & AO [l << 2 ];
624624 __builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
625625 __builtin_mma_xvf64gerpp (& acc1 , rowB , rowA [1 ]);
@@ -646,18 +646,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
646646 v4sf_t result [4 ];
647647 __vector_quad acc0 ;
648648 BLASLONG l = 0 ;
649- FLOAT t [4 ] = { 0 , 0 , 0 , 0 };
650- t [0 ] = BO [0 ], t [1 ] = BO [1 ];
651649 __vector_pair rowB ;
652- vec_t * rb = (vec_t * ) & t [0 ];
653- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
650+ vec_t * rb = (vec_t * ) & BO [0 ];
651+ __builtin_vsx_assemble_pair (& rowB , rb [0 ], rb [0 ]);
654652 vec_t * rowA = (vec_t * ) & AO [0 ];
655653 __builtin_mma_xvf64ger (& acc0 , rowB , rowA [0 ]);
656654 for (l = 1 ; l < temp ; l ++ )
657655 {
658- t [0 ] = BO [l << 1 ], t [1 ] = BO [(l << 1 ) + 1 ];
659- rb = (vec_t * ) & t [0 ];
660- __builtin_mma_assemble_pair (& rowB , rb [1 ], rb [0 ]);
656+ rb = (vec_t * ) & BO [l << 1 ];
657+ __builtin_vsx_assemble_pair (& rowB , rb [0 ], rb [0 ]);
661658 rowA = (vec_t * ) & AO [l << 1 ];
662659 __builtin_mma_xvf64gerpp (& acc0 , rowB , rowA [0 ]);
663660 }
0 commit comments