Skip to content

Commit 706a08d

Browse files
committed
Optimized sgemv_t for small N based on AVX512
1 parent 42f048c commit 706a08d

3 files changed

Lines changed: 1215 additions & 1 deletion

File tree

kernel/x86_64/sgemv_t_4.c

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3434
#include "sgemv_t_microk_bulldozer-4.c"
3535
#elif defined(SANDYBRIDGE)
3636
#include "sgemv_t_microk_sandy-4.c"
37-
#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE)
37+
#elif defined(HASWELL) || defined(ZEN)
3838
#include "sgemv_t_microk_haswell-4.c"
39+
#elif defined (SKYLAKEX) || defined (COOPERLAKE)
40+
#include "sgemv_t_microk_haswell-4.c"
41+
#include "sgemv_t_microk_skylakex.c"
3942
#endif
4043

4144
#if defined(STEAMROLLER) || defined(EXCAVATOR)
@@ -305,6 +308,37 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
305308
if ( m < 1 ) return(0);
306309
if ( n < 1 ) return(0);
307310

311+
#ifdef HAVE_SGEMV_T_SKYLAKE_KERNEL
312+
if (lda == m && n <= 16384 && m <= 8)
313+
{
314+
FLOAT * xbuffer_align = x;
315+
FLOAT * ybuffer_align = y;
316+
317+
if (inc_x != 1) {
318+
xbuffer_align = buffer;
319+
for(BLASLONG i=0; i<m; i++) {
320+
xbuffer_align[i] = x[i*inc_x];
321+
}
322+
}
323+
324+
if (inc_y != 1) {
325+
ybuffer_align = buffer + m;
326+
for(BLASLONG i=0; i<n; i++) {
327+
ybuffer_align[i] = y[i*inc_y];
328+
}
329+
}
330+
sgemv_kernel_t(m, n , alpha, a, xbuffer_align, ybuffer_align);
331+
332+
if(inc_y != 1) {
333+
for(BLASLONG i=0; i<n; i++) {
334+
y[i*inc_y] = ybuffer_align[i];
335+
}
336+
}
337+
return(0);
338+
}
339+
340+
#endif
341+
308342
xbuffer = buffer;
309343
ytemp = buffer + (m < NBMAX ? m : NBMAX);
310344

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
/***************************************************************************
2+
Copyright (c) 2014, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
/* Requires GCC >= 6 with AVX512CD enabled, or Clang >= 6, for AVX512 support. */
29+
#if (( defined(__GNUC__) && __GNUC__ >= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6))
30+
31+
#define HAVE_SGEMV_T_SKYLAKE_KERNEL 1
32+
#include "common.h"
33+
#include <immintrin.h>
34+
#include "sgemv_t_microk_skylakex_template.c"
35+
36+
//sgemv_t:
37+
// ----- m -----
38+
// |<-----------
39+
// |<-----------
40+
// n
41+
// |<-----------
42+
// |<-----------
43+
44+
/*
 * sgemv_kernel_t - dispatch to a size-specialised kernel selected by m.
 *
 * m      selects which kernel runs: values 1..8 map to sgemv_kernel_t_1 ..
 *        sgemv_kernel_t_8 respectively. m itself is not passed on.
 * n, alpha, a, x, y
 *        forwarded unchanged to the selected kernel.
 *
 * Returns 0 in every case, including when m is outside 1..8 (in which case
 * no kernel is called and nothing is computed).
 *
 * NOTE(review): the sgemv_kernel_t_<k> kernels are not defined in this file;
 * presumably they come from sgemv_t_microk_skylakex_template.c, which is
 * included just before this function - confirm. Their exact contract
 * (layout of a, whether y is accumulated or overwritten) cannot be
 * determined from here.
 */
static int sgemv_kernel_t(BLASLONG m, BLASLONG n, float alpha, float *a, float *x, float *y)
45+
{
46+
switch(m) {
47+
case 1: sgemv_kernel_t_1(n, alpha, a, x, y); break;
48+
case 2: sgemv_kernel_t_2(n, alpha, a, x, y); break;
49+
case 3: sgemv_kernel_t_3(n, alpha, a, x, y); break;
50+
case 4: sgemv_kernel_t_4(n, alpha, a, x, y); break;
51+
case 5: sgemv_kernel_t_5(n, alpha, a, x, y); break;
52+
case 6: sgemv_kernel_t_6(n, alpha, a, x, y); break;
53+
case 7: sgemv_kernel_t_7(n, alpha, a, x, y); break;
54+
case 8: sgemv_kernel_t_8(n, alpha, a, x, y); break;
55+
/* Unsupported m: silently do nothing. The caller is expected to have
   checked that m is in 1..8 before calling. */
default: break;
56+
}
57+
return 0;
58+
}
59+
60+
#endif

0 commit comments

Comments
 (0)