Skip to content

Commit d832ee5

Browse files
committed
More global optimization and cleanup.
1 parent 477dd40 commit d832ee5

1 file changed

Lines changed: 36 additions & 66 deletions

File tree

kernel/riscv64/sgemm_kernel_16x8_zvl256b.c

Lines changed: 36 additions & 66 deletions
Original file line number | Diff line number | Diff line change
@@ -52,16 +52,16 @@ AUTOGENERATED KERNEL
5252
#define FORCEINLINE inline __attribute__((always_inline))
5353

5454
#ifdef GEMM_NEW_PACKING
55-
static void FORCEINLINE M_TAIL_ONE(BLASLONG K, const BLASLONG M, const BLASLONG N, const bool S, FLOAT alpha, FLOAT* A0, FLOAT*, FLOAT*, FLOAT*, FLOAT* B, FLOAT* C, BLASLONG ldc)
55+
static FORCEINLINE FLOAT* M_TAIL_ONE(BLASLONG K, const BLASLONG M, const BLASLONG N, const bool S, FLOAT alpha, FLOAT* A0, FLOAT*, FLOAT*, FLOAT*, FLOAT* B, FLOAT* C, BLASLONG ldc)
5656
#else
57-
static void FORCEINLINE M_TAIL_ONE(BLASLONG K, const BLASLONG M, const BLASLONG N, const bool S, FLOAT alpha, FLOAT* A0, FLOAT* A1, FLOAT* A2, FLOAT* A3, FLOAT* B, FLOAT* C, BLASLONG ldc)
57+
static FORCEINLINE FLOAT* M_TAIL_ONE(BLASLONG K, const BLASLONG M, const BLASLONG N, const bool S, FLOAT alpha, FLOAT* A0, FLOAT* A1, FLOAT* A2, FLOAT* A3, FLOAT* B, FLOAT* C, BLASLONG ldc)
5858
#endif
5959
{
60+
const bool S2 = (S && (M == 8));
6061
if (N & 8) {
6162
vfloat32m1_t result0, result1, result2, result3, result4, result5, result6, result7;
6263
vfloat32m1_t result8, result9, resultA, resultB, resultC, resultD, resultE;
6364
vfloat32m1_t B0, A4;
64-
const bool S2 = (S && (M == 8));
6565

6666
#ifdef GEMM_RIGHT_CHUNK
6767
vfloat32m1_t B1, B2, B3, B4, B5, B6, B7;
@@ -824,7 +824,6 @@ static void FORCEINLINE M_TAIL_ONE(BLASLONG K, const BLASLONG M, const BLASLONG
824824
#ifndef GEMM_NEW_PACKING
825825
FLOAT *B00, *B01;
826826
#endif
827-
const bool S2 = (S && (M == 8));
828827
const bool S3 = ((N & 3) && (M & 8));
829828
if (S2 || S3) {
830829
result03 = __riscv_vle32_v_f32m1(A0, 8);
@@ -1374,9 +1373,10 @@ static void FORCEINLINE M_TAIL_ONE(BLASLONG K, const BLASLONG M, const BLASLONG
13741373
}
13751374
}
13761375
}
1376+
return B;
13771377
}
13781378

1379-
static void FORCEINLINE M_TAIL(BLASLONG K, const BLASLONG M, const BLASLONG N, const bool S, FLOAT alpha, FLOAT* A0, FLOAT* B, FLOAT* C, BLASLONG ldc)
1379+
static FORCEINLINE FLOAT* M_TAIL(BLASLONG K, const BLASLONG M, const BLASLONG N, const bool S, FLOAT alpha, FLOAT* A0, FLOAT* B, FLOAT* C, BLASLONG ldc)
13801380
{
13811381
FLOAT *A1, *A2, *A3;
13821382
#ifndef GEMM_NEW_PACKING
@@ -1407,32 +1407,32 @@ static void FORCEINLINE M_TAIL(BLASLONG K, const BLASLONG M, const BLASLONG N, c
14071407
if (M & 4) {
14081408
if (M & 2) {
14091409
if (M & 1) {
1410-
M_TAIL_ONE(K, 15, N, false, alpha, A0, A1, A2, A3, B, C, ldc);
1410+
return M_TAIL_ONE(K, 15, N, false, alpha, A0, A1, A2, A3, B, C, ldc);
14111411
} else {
1412-
M_TAIL_ONE(K, 14, N, false, alpha, A0, A1, A2, A0, B, C, ldc);
1412+
return M_TAIL_ONE(K, 14, N, false, alpha, A0, A1, A2, A0, B, C, ldc);
14131413
}
14141414
} else {
14151415
if (M & 1) {
1416-
M_TAIL_ONE(K, 13, N, false, alpha, A0, A1, A0, A3, B, C, ldc);
1416+
return M_TAIL_ONE(K, 13, N, false, alpha, A0, A1, A0, A3, B, C, ldc);
14171417
} else {
1418-
M_TAIL_ONE(K, 12, N, false, alpha, A0, A1, A0, A0, B, C, ldc);
1418+
return M_TAIL_ONE(K, 12, N, false, alpha, A0, A1, A0, A0, B, C, ldc);
14191419
}
14201420
}
14211421
} else {
14221422
if (M & 2) {
14231423
if (M & 1) {
1424-
M_TAIL_ONE(K, 11, N, false, alpha, A0, A0, A2, A3, B, C, ldc);
1424+
return M_TAIL_ONE(K, 11, N, false, alpha, A0, A0, A2, A3, B, C, ldc);
14251425
} else {
1426-
M_TAIL_ONE(K, 10, N, false, alpha, A0, A0, A2, A0, B, C, ldc);
1426+
return M_TAIL_ONE(K, 10, N, false, alpha, A0, A0, A2, A0, B, C, ldc);
14271427
}
14281428
} else {
14291429
if (M & 1) {
1430-
M_TAIL_ONE(K, 9, N, false, alpha, A0, A0, A0, A3, B, C, ldc);
1430+
return M_TAIL_ONE(K, 9, N, false, alpha, A0, A0, A0, A3, B, C, ldc);
14311431
} else {
14321432
if (S) {
1433-
M_TAIL_ONE(K, 8, N, true, alpha, A0, A0, A0, A0, B, C, 8);
1433+
return M_TAIL_ONE(K, 8, N, true, alpha, A0, A0, A0, A0, B, C, 8);
14341434
} else {
1435-
M_TAIL_ONE(K, 8, N, false, alpha, A0, A0, A0, A0, B, C, ldc);
1435+
return M_TAIL_ONE(K, 8, N, false, alpha, A0, A0, A0, A0, B, C, ldc);
14361436
}
14371437
}
14381438
}
@@ -1441,51 +1441,51 @@ static void FORCEINLINE M_TAIL(BLASLONG K, const BLASLONG M, const BLASLONG N, c
14411441
if (M & 2) {
14421442
if (M & 1) {
14431443
if (S) {
1444-
M_TAIL_ONE(K, 7, N, true, alpha, A0, A1, A2, A3, B, C, 7);
1444+
return M_TAIL_ONE(K, 7, N, true, alpha, A0, A1, A2, A3, B, C, 7);
14451445
} else {
1446-
M_TAIL_ONE(K, 7, N, false, alpha, A0, A1, A2, A3, B, C, ldc);
1446+
return M_TAIL_ONE(K, 7, N, false, alpha, A0, A1, A2, A3, B, C, ldc);
14471447
}
14481448
} else {
14491449
if (S) {
1450-
M_TAIL_ONE(K, 6, N, true, alpha, A0, A1, A2, A0, B, C, 6);
1450+
return M_TAIL_ONE(K, 6, N, true, alpha, A0, A1, A2, A0, B, C, 6);
14511451
} else {
1452-
M_TAIL_ONE(K, 6, N, false, alpha, A0, A1, A2, A0, B, C, ldc);
1452+
return M_TAIL_ONE(K, 6, N, false, alpha, A0, A1, A2, A0, B, C, ldc);
14531453
}
14541454
}
14551455
} else {
14561456
if (M & 1) {
14571457
if (S) {
1458-
M_TAIL_ONE(K, 5, N, true, alpha, A0, A1, A0, A3, B, C, 5);
1458+
return M_TAIL_ONE(K, 5, N, true, alpha, A0, A1, A0, A3, B, C, 5);
14591459
} else {
1460-
M_TAIL_ONE(K, 5, N, false, alpha, A0, A1, A0, A3, B, C, ldc);
1460+
return M_TAIL_ONE(K, 5, N, false, alpha, A0, A1, A0, A3, B, C, ldc);
14611461
}
14621462
} else {
14631463
if (S) {
1464-
M_TAIL_ONE(K, 4, N, true, alpha, A0, A1, A0, A0, B, C, 4);
1464+
return M_TAIL_ONE(K, 4, N, true, alpha, A0, A1, A0, A0, B, C, 4);
14651465
} else {
1466-
M_TAIL_ONE(K, 4, N, false, alpha, A0, A1, A0, A0, B, C, ldc);
1466+
return M_TAIL_ONE(K, 4, N, false, alpha, A0, A1, A0, A0, B, C, ldc);
14671467
}
14681468
}
14691469
}
14701470
} else if (M & 2) {
14711471
if (M & 1) {
14721472
if (S) {
1473-
M_TAIL_ONE(K, 3, N, true, alpha, A0, A0, A2, A3, B, C, 3);
1473+
return M_TAIL_ONE(K, 3, N, true, alpha, A0, A0, A2, A3, B, C, 3);
14741474
} else {
1475-
M_TAIL_ONE(K, 3, N, false, alpha, A0, A0, A2, A3, B, C, ldc);
1475+
return M_TAIL_ONE(K, 3, N, false, alpha, A0, A0, A2, A3, B, C, ldc);
14761476
}
14771477
} else {
14781478
if (S) {
1479-
M_TAIL_ONE(K, 2, N, true, alpha, A0, A0, A2, A0, B, C, 2);
1479+
return M_TAIL_ONE(K, 2, N, true, alpha, A0, A0, A2, A0, B, C, 2);
14801480
} else {
1481-
M_TAIL_ONE(K, 2, N, false, alpha, A0, A0, A2, A0, B, C, ldc);
1481+
return M_TAIL_ONE(K, 2, N, false, alpha, A0, A0, A2, A0, B, C, ldc);
14821482
}
14831483
}
14841484
} else {
14851485
if (S) {
1486-
M_TAIL_ONE(K, 1, N, true, alpha, A0, A0, A0, A3, B, C, 1);
1486+
return M_TAIL_ONE(K, 1, N, true, alpha, A0, A0, A0, A3, B, C, 1);
14871487
} else {
1488-
M_TAIL_ONE(K, 1, N, false, alpha, A0, A0, A0, A3, B, C, ldc);
1488+
return M_TAIL_ONE(K, 1, N, false, alpha, A0, A0, A0, A3, B, C, ldc);
14891489
}
14901490
}
14911491
}
@@ -2113,57 +2113,28 @@ static void FORCEINLINE N_TAIL_ONE(BLASLONG K, BLASLONG M, const BLASLONG N, FLO
21132113

21142114
static void FORCEINLINE N_TAIL(BLASLONG K, const BLASLONG M, const BLASLONG N, FLOAT alpha, FLOAT** A, FLOAT* B, FLOAT** C, BLASLONG ldc)
21152115
{
2116-
const bool S = (ldc == 16);
21172116
if (N & 4) {
21182117
if (N & 2) {
21192118
if (N & 1) {
2120-
if (S) {
2121-
N_TAIL_ONE(K, 1, 7, alpha, A, B, C, 16);
2122-
} else {
2123-
N_TAIL_ONE(K, M, 7, alpha, A, B, C, ldc);
2124-
}
2119+
N_TAIL_ONE(K, M, 7, alpha, A, B, C, ldc);
21252120
} else {
2126-
if (S) {
2127-
N_TAIL_ONE(K, 1, 6, alpha, A, B, C, 16);
2128-
} else {
2129-
N_TAIL_ONE(K, M, 6, alpha, A, B, C, ldc);
2130-
}
2121+
N_TAIL_ONE(K, M, 6, alpha, A, B, C, ldc);
21312122
}
21322123
} else {
21332124
if (N & 1) {
2134-
if (S) {
2135-
N_TAIL_ONE(K, 1, 5, alpha, A, B, C, 16);
2136-
} else {
2137-
N_TAIL_ONE(K, M, 5, alpha, A, B, C, ldc);
2138-
}
2125+
N_TAIL_ONE(K, M, 5, alpha, A, B, C, ldc);
21392126
} else {
2140-
if (S) {
2141-
N_TAIL_ONE(K, 1, 4, alpha, A, B, C, 16);
2142-
} else {
2143-
N_TAIL_ONE(K, M, 4, alpha, A, B, C, ldc);
2144-
}
2127+
N_TAIL_ONE(K, M, 4, alpha, A, B, C, ldc);
21452128
}
21462129
}
21472130
} else if (N & 2) {
21482131
if (N & 1) {
2149-
if (S) {
2150-
N_TAIL_ONE(K, 1, 3, alpha, A, B, C, 16);
2151-
} else {
2152-
N_TAIL_ONE(K, M, 3, alpha, A, B, C, ldc);
2153-
}
2132+
N_TAIL_ONE(K, M, 3, alpha, A, B, C, ldc);
21542133
} else {
2155-
if (S) {
2156-
N_TAIL_ONE(K, 1, 2, alpha, A, B, C, 16);
2157-
} else {
2158-
N_TAIL_ONE(K, M, 2, alpha, A, B, C, ldc);
2159-
}
2134+
N_TAIL_ONE(K, M, 2, alpha, A, B, C, ldc);
21602135
}
21612136
} else {
2162-
if (S) {
2163-
N_TAIL_ONE(K, 1, 1, alpha, A, B, C, 16);
2164-
} else {
2165-
N_TAIL_ONE(K, M, 1, alpha, A, B, C, ldc);
2166-
}
2137+
N_TAIL_ONE(K, M, 1, alpha, A, B, C, ldc);
21672138
}
21682139
}
21692140

@@ -2345,12 +2316,11 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, F
23452316
}
23462317

23472318
if (m_edge) {
2348-
M_TAIL(K, m_edge, 8, S, alpha, A, B00, C, ldc);
2319+
B = M_TAIL(K, m_edge, 8, S, alpha, A, B00, C, ldc);
23492320
}
23502321

23512322
C01 += 8*ldc;
23522323
C = C01;
2353-
B = B00 + 8*K;
23542324
A = A00;
23552325
}
23562326

0 commit comments

Comments
 (0)