Skip to content

Commit 5fcacad

Browse files
committed
sbgemm: cooperlake: implement tcopy_4
1 parent bb1c4fa commit 5fcacad

2 files changed

Lines changed: 87 additions & 0 deletions

File tree

kernel/x86_64/sbgemm_tcopy_16_cooperlake.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,4 +160,5 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
160160
}
161161
}
162162
}
163+
return 0;
163164
}

kernel/x86_64/sbgemm_tcopy_4_cooperlake.c

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,94 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2626
*****************************************************************************/
2727

2828
#include <stdio.h>
29+
#include <immintrin.h>
2930
#include "common.h"
3031

3132
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
33+
BLASLONG i, j;
3234

35+
IFLOAT *boffset0, *boffset1;
36+
37+
boffset0 = b;
38+
39+
BLASLONG n8 = n & ~7;
40+
BLASLONG m4 = m & ~3;
41+
BLASLONG m2 = m & ~1;
42+
43+
for (j = 0; j < n8; j += 8) {
44+
boffset1 = boffset0 + m * 4;
45+
for (i = 0; i < m4; i +=4) {
46+
__m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]);
47+
__m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]);
48+
__m128i a2 = _mm_loadu_si128((void *)&a[(i + 2)*lda + j]);
49+
__m128i a3 = _mm_loadu_si128((void *)&a[(i + 3)*lda + j]);
50+
__m128i a00 = _mm_unpacklo_epi16(a0, a1);
51+
__m128i a01 = _mm_unpackhi_epi16(a0, a1);
52+
__m128i a10 = _mm_unpacklo_epi16(a2, a3);
53+
__m128i a11 = _mm_unpackhi_epi16(a2, a3);
54+
_mm_storeu_si128((void *)(boffset0 + 0), a00);
55+
_mm_storeu_si128((void *)(boffset0 + 8), a10);
56+
_mm_storeu_si128((void *)(boffset1 + 0), a01);
57+
_mm_storeu_si128((void *)(boffset1 + 8), a11);
58+
boffset0 += 16;
59+
boffset1 += 16;
60+
}
61+
for (; i < m2; i+= 2) {
62+
__m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]);
63+
__m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]);
64+
__m128i a00 = _mm_unpacklo_epi16(a0, a1);
65+
__m128i a01 = _mm_unpackhi_epi16(a0, a1);
66+
_mm_storeu_si128((void *)(boffset0 + 0), a00);
67+
_mm_storeu_si128((void *)(boffset1 + 0), a01);
68+
boffset0 += 8;
69+
boffset1 += 8;
70+
}
71+
for (; i < m; i++) {
72+
__m128d a0 = _mm_loadu_pd((void *)&a[(i + 0)*lda + j]);
73+
_mm_store_sd((void *)boffset0, a0);
74+
_mm_store_sd((void *)boffset1, _mm_permute_pd(a0, 0x1));
75+
boffset0 += 4;
76+
boffset1 += 4;
77+
}
78+
boffset0 = boffset1;
79+
}
80+
if (j < n) {
81+
uint32_t remains = n - j;
82+
__mmask8 r_mask = (1UL << remains) - 1;
83+
if (remains > 4) {
84+
boffset1 = boffset0 + m * 4;
85+
uint32_t tail1 = remains - 4;
86+
__mmask8 w_mask1 = (1UL << tail1) - 1;
87+
for (i = 0; i < m2; i += 2) {
88+
__m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]);
89+
__m128i a1 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]);
90+
__m128i a00 = _mm_unpacklo_epi16(a0, a1);
91+
__m128i a01 = _mm_unpackhi_epi16(a0, a1);
92+
_mm_storeu_si128((void *)boffset0, a00);
93+
_mm_mask_storeu_epi32((void *)boffset1, w_mask1, a01);
94+
boffset0 += 8;
95+
boffset1 += 2 * tail1;
96+
}
97+
for (; i < m; i++) {
98+
__m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]);
99+
_mm_store_sd((void *)boffset0, (__m128d) a0);
100+
_mm_mask_storeu_epi16((void *)boffset1, w_mask1, (__m128i) _mm_permute_pd((__m128d) a0, 0x1));
101+
boffset0 += 4;
102+
boffset1 += tail1;
103+
}
104+
} else {
105+
for (i = 0; i < m2; i += 2) {
106+
__m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]);
107+
__m128i a1 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]);
108+
__m128i a00 = _mm_unpacklo_epi16(a0, a1);
109+
_mm_mask_storeu_epi32((void *)boffset0, r_mask, a00);
110+
boffset0 += 2 * remains;
111+
}
112+
for (; i < m; i++) {
113+
__m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]);
114+
_mm_mask_storeu_epi16((void *)boffset0, r_mask, a0);
115+
}
116+
}
117+
}
118+
return 0;
33119
}

0 commit comments

Comments
 (0)