@@ -26,8 +26,94 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2626*****************************************************************************/
2727
#include <stdio.h>
#include <immintrin.h>
#include "common.h"
3031
/*
 * Pack the m x n block of the 16-bit-element matrix `a` (row stride `lda`,
 * IFLOAT elements) into the contiguous GEMM work buffer `b`.
 *
 * Columns are processed in panels of 8.  Within a panel, rows are emitted in
 * interleaved pairs (row i, row i+1 alternating element-by-element), and the
 * panel is split in two halves: columns j..j+3 of all rows are written first
 * (starting at boffset0, m*4 elements), then columns j+4..j+7 (starting at
 * boffset1 = boffset0 + m*4).  A 1..7-column tail is handled with AVX-512
 * VL/BW masked loads and stores.
 *
 * Returns 0 unconditionally.
 *
 * NOTE(review): the masked intrinsics require AVX-512VL + AVX-512BW; the
 * _mm_loadu_pd reinterpretation of 8 IFLOAT elements as two doubles is a
 * type-punning read inherited from the original code.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
  BLASLONG i, j;

  /* Write cursors for the two column halves of the current 8-wide panel. */
  IFLOAT *boffset0, *boffset1;

  boffset0 = b;

  /* Round n down to a multiple of 8 and m down to multiples of 4 and 2
     for the unrolled main loops; the remainders are handled by tails. */
  BLASLONG n8 = n & ~7;
  BLASLONG m4 = m & ~3;
  BLASLONG m2 = m & ~1;

  for (j = 0; j < n8; j += 8) {
    /* Second half of this panel (columns j+4..j+7) starts m*4 elements in. */
    boffset1 = boffset0 + m * 4;
    for (i = 0; i < m4; i += 4) {
      /* Load 8 consecutive 16-bit elements (columns j..j+7) from each of
         4 consecutive rows. */
      __m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]);
      __m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]);
      __m128i a2 = _mm_loadu_si128((void *)&a[(i + 2)*lda + j]);
      __m128i a3 = _mm_loadu_si128((void *)&a[(i + 3)*lda + j]);
      /* unpacklo_epi16(a0, a1) interleaves the low 4 lanes of the row pair:
         a0[0],a1[0],a0[1],a1[1],a0[2],a1[2],a0[3],a1[3]  -> columns j..j+3;
         unpackhi_epi16 does the same for the high lanes -> columns j+4..j+7. */
      __m128i a00 = _mm_unpacklo_epi16(a0, a1);
      __m128i a01 = _mm_unpackhi_epi16(a0, a1);
      __m128i a10 = _mm_unpacklo_epi16(a2, a3);
      __m128i a11 = _mm_unpackhi_epi16(a2, a3);
      /* Low-column halves of both row pairs go to boffset0, high-column
         halves to boffset1; each pair contributes 8 elements per half. */
      _mm_storeu_si128((void *)(boffset0 + 0), a00);
      _mm_storeu_si128((void *)(boffset0 + 8), a10);
      _mm_storeu_si128((void *)(boffset1 + 0), a01);
      _mm_storeu_si128((void *)(boffset1 + 8), a11);
      boffset0 += 16;
      boffset1 += 16;
    }
    /* Row tail: one remaining full pair of rows (runs at most once). */
    for (; i < m2; i += 2) {
      __m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]);
      __m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]);
      __m128i a00 = _mm_unpacklo_epi16(a0, a1);
      __m128i a01 = _mm_unpackhi_epi16(a0, a1);
      _mm_storeu_si128((void *)(boffset0 + 0), a00);
      _mm_storeu_si128((void *)(boffset1 + 0), a01);
      boffset0 += 8;
      boffset1 += 8;
    }
    /* Row tail: a single leftover row (m odd; runs at most once).  The 8
       16-bit elements are reinterpreted as two doubles so the low/high
       4-column halves can be moved with 64-bit scalar stores. */
    for (; i < m; i++) {
      __m128d a0 = _mm_loadu_pd((void *)&a[(i + 0)*lda + j]);
      _mm_store_sd((void *)boffset0, a0);
      /* permute_pd(a0, 0x1) swaps the two 64-bit halves, exposing
         columns j+4..j+7 in the low half for the scalar store. */
      _mm_store_sd((void *)boffset1, _mm_permute_pd(a0, 0x1));
      boffset0 += 4;
      boffset1 += 4;
    }
    /* Next panel is packed immediately after this panel's second half. */
    boffset0 = boffset1;
  }
  /* Column tail: 1..7 columns remain; use masked loads so we never read
     past column n-1 of a row. */
  if (j < n) {
    uint32_t remains = n - j;
    /* r_mask selects `remains` 16-bit lanes for the masked row loads
       (maskz zeroes the unselected high lanes). */
    __mmask8 r_mask = (1UL << remains) - 1;
    if (remains > 4) {
      /* 5..7 columns: the first 4 go to boffset0, the remaining `tail1`
         go to the second half at boffset1, mirroring the full-panel split. */
      boffset1 = boffset0 + m * 4;
      uint32_t tail1 = remains - 4;
      /* w_mask1 selects `tail1` output units: 32-bit row pairs in the
         two-row loop below, 16-bit elements in the single-row loop. */
      __mmask8 w_mask1 = (1UL << tail1) - 1;
      for (i = 0; i < m2; i += 2) {
        __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]);
        __m128i a1 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]);
        __m128i a00 = _mm_unpacklo_epi16(a0, a1);
        __m128i a01 = _mm_unpackhi_epi16(a0, a1);
        /* First 4 columns are always full: unmasked 8-element store. */
        _mm_storeu_si128((void *)boffset0, a00);
        /* Each 32-bit unit is one interleaved (row i, row i+1) column pair. */
        _mm_mask_storeu_epi32((void *)boffset1, w_mask1, a01);
        boffset0 += 8;
        boffset1 += 2 * tail1;
      }
      /* Leftover odd row (runs at most once). */
      for (; i < m; i++) {
        __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]);
        /* Low 64 bits = first 4 columns. */
        _mm_store_sd((void *)boffset0, (__m128d) a0);
        /* Swap halves, then store only the `tail1` valid high columns. */
        _mm_mask_storeu_epi16((void *)boffset1, w_mask1, (__m128i) _mm_permute_pd((__m128d) a0, 0x1));
        boffset0 += 4;
        boffset1 += tail1;
      }
    } else {
      /* 1..4 columns: everything fits in the low half; no boffset1 needed. */
      for (i = 0; i < m2; i += 2) {
        __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]);
        __m128i a1 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]);
        __m128i a00 = _mm_unpacklo_epi16(a0, a1);
        /* r_mask doubles as the store mask: `remains` 32-bit column pairs. */
        _mm_mask_storeu_epi32((void *)boffset0, r_mask, a00);
        boffset0 += 2 * remains;
      }
      /* Leftover odd row (runs at most once, so boffset0 need not advance). */
      for (; i < m; i++) {
        __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]);
        _mm_mask_storeu_epi16((void *)boffset0, r_mask, a0);
      }
    }
  }
  return 0;
}
0 commit comments