@@ -122,10 +122,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
122122
123123 for (jjs = 0 ; jjs < ls - js ; jjs += min_jj ){
124124 min_jj = ls - js - jjs ;
125+ #ifdef SKYLAKEX
126+ /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
127+ if (min_jj >= 6 * GEMM_UNROLL_N ) min_jj = 6 * GEMM_UNROLL_N ;
128+ #else
125129 if (min_jj > GEMM_UNROLL_N * 3 ) min_jj = GEMM_UNROLL_N * 3 ;
126130 else
127131 if (min_jj > GEMM_UNROLL_N ) min_jj = GEMM_UNROLL_N ;
128-
132+ #endif
129133#ifndef TRANSA
130134 GEMM_ONCOPY (min_l , min_jj , a + (ls + (js + jjs ) * lda ) * COMPSIZE , lda , sb + min_l * jjs * COMPSIZE );
131135#else
@@ -142,10 +146,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
142146
143147 for (jjs = 0 ; jjs < min_l ; jjs += min_jj ){
144148 min_jj = min_l - jjs ;
149+ #ifdef SKYLAKEX
150+ /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
151+ if (min_jj >= 6 * GEMM_UNROLL_N ) min_jj = 6 * GEMM_UNROLL_N ;
152+ #else
145153 if (min_jj > GEMM_UNROLL_N * 3 ) min_jj = GEMM_UNROLL_N * 3 ;
146154 else
147155 if (min_jj > GEMM_UNROLL_N ) min_jj = GEMM_UNROLL_N ;
148-
156+ #endif
149157#ifndef TRANSA
150158 TRMM_OLNCOPY (min_l , min_jj , a , lda , ls , ls + jjs , sb + min_l * (ls - js + jjs ) * COMPSIZE );
151159#else
@@ -195,10 +203,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
195203
196204 for (jjs = js ; jjs < js + min_j ; jjs += min_jj ){
197205 min_jj = min_j + js - jjs ;
206+ #ifdef SKYLAKEX
207+ /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
208+ if (min_jj >= 6 * GEMM_UNROLL_N ) min_jj = 6 * GEMM_UNROLL_N ;
209+ #else
198210 if (min_jj > GEMM_UNROLL_N * 3 ) min_jj = GEMM_UNROLL_N * 3 ;
199211 else
200212 if (min_jj > GEMM_UNROLL_N ) min_jj = GEMM_UNROLL_N ;
201-
213+ #endif
202214#ifndef TRANSA
203215 GEMM_ONCOPY (min_l , min_jj , a + (ls + jjs * lda ) * COMPSIZE , lda , sb + min_l * (jjs - js ) * COMPSIZE );
204216#else
@@ -246,10 +258,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
246258
247259 for (jjs = 0 ; jjs < min_l ; jjs += min_jj ){
248260 min_jj = min_l - jjs ;
261+ #ifdef SKYLAKEX
262+ /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
263+ if (min_jj >= 6 * GEMM_UNROLL_N ) min_jj = 6 * GEMM_UNROLL_N ;
264+ #else
249265 if (min_jj > GEMM_UNROLL_N * 3 ) min_jj = GEMM_UNROLL_N * 3 ;
250266 else
251267 if (min_jj > GEMM_UNROLL_N ) min_jj = GEMM_UNROLL_N ;
252-
268+ #endif
253269#ifndef TRANSA
254270 TRMM_OUNCOPY (min_l , min_jj , a , lda , ls , ls + jjs , sb + min_l * jjs * COMPSIZE );
255271#else
@@ -267,10 +283,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
267283
268284 for (jjs = 0 ; jjs < js - ls - min_l ; jjs += min_jj ){
269285 min_jj = js - ls - min_l - jjs ;
286+ #ifdef SKYLAKEX
287+ /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
288+ if (min_jj >= 6 * GEMM_UNROLL_N ) min_jj = 6 * GEMM_UNROLL_N ;
289+ #else
270290 if (min_jj > GEMM_UNROLL_N * 3 ) min_jj = GEMM_UNROLL_N * 3 ;
271291 else
272292 if (min_jj > GEMM_UNROLL_N ) min_jj = GEMM_UNROLL_N ;
273-
293+ #endif
274294#ifndef TRANSA
275295 GEMM_ONCOPY (min_l , min_jj , a + (ls + (ls + min_l + jjs ) * lda ) * COMPSIZE , lda ,
276296 sb + min_l * (min_l + jjs ) * COMPSIZE );
@@ -324,10 +344,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
324344
325345 for (jjs = js ; jjs < js + min_j ; jjs += min_jj ){
326346 min_jj = min_j + js - jjs ;
347+ #ifdef SKYLAKEX
348+ /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
349+ if (min_jj >= 6 * GEMM_UNROLL_N ) min_jj = 6 * GEMM_UNROLL_N ;
350+ #else
327351 if (min_jj > GEMM_UNROLL_N * 3 ) min_jj = GEMM_UNROLL_N * 3 ;
328352 else
329353 if (min_jj > GEMM_UNROLL_N ) min_jj = GEMM_UNROLL_N ;
330-
354+ #endif
331355#ifndef TRANSA
332356 GEMM_ONCOPY (min_l , min_jj , a + (ls + (jjs - min_j ) * lda ) * COMPSIZE , lda , sb + min_l * (jjs - js ) * COMPSIZE );
333357#else
0 commit comments