@@ -64,6 +64,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * DECLARE_B_PAIR(): declare the two 512-bit integer vector locals, B_lo and
 * B_hi, that BROADCAST_B_PAIR passes to BROADCAST64 as destinations.
 * The expansion already ends in ';', so a trailing semicolon at the point
 * of use only adds an empty statement.
 */
#define DECLARE_B_PAIR() \
    __m512i B_lo, B_hi;
6666
/*
 * PREFETCH_B_STEP: look-ahead distance used by PREFETCH_B. PREFETCH_B
 * multiplies it by 2 to form the byte displacement of the prefetch address
 * (presumably 2 == sizeof one B element -- confirm against IFLOAT).
 */
#define PREFETCH_B_STEP 32

/*
 * PREFETCH_B(Bx, By): issue a prefetcht0 hint for the byte address
 *
 *     ptr_b<Bx> + n_blksize * (By * 2) + PREFETCH_B_STEP * 2
 *
 *   Bx  literal suffix pasted onto "ptr_b" to pick the pointer variable
 *       (ptr_b0, ptr_b1, ...).
 *   By  integer constant expression. (By * 2) is emitted as the scale of the
 *       base+index*scale addressing mode, so for By != 0 it has to yield a
 *       scale the instruction encoding accepts (2, 4 or 8).
 *
 * By == 0 gets its own base+displacement form because a scale of 0 cannot be
 * encoded; the untaken branch is expected to be discarded by the compiler
 * since the condition is a compile-time constant.
 *
 * ptr_b<Bx> and n_blksize must be in scope where the macro is expanded, and
 * n_blksize must be pointer-width because it is used as the index register.
 *
 * __asm__ is used instead of asm so the macro also builds under strict ISO
 * modes (-std=c99 / -std=c11), and the body is wrapped in do { } while (0)
 * so that "PREFETCH_B(x, y);" is exactly one statement.
 */
#define PREFETCH_B(Bx, By) \
    do { \
        if ((By) == 0) \
            __asm__("prefetcht0 %c1(%0)" \
                    : : "r"(ptr_b##Bx), "n"(PREFETCH_B_STEP * 2)); \
        else \
            __asm__("prefetcht0 %c3(%0, %1, %c2)" \
                    : : "r"(ptr_b##Bx), "r"(n_blksize), \
                        "n"((By) * 2), "n"(PREFETCH_B_STEP * 2)); \
    } while (0)
71+
/*
 * BROADCAST_B_PAIR(PTR_ID, BLK): fill the B_lo / B_hi pair declared by
 * DECLARE_B_PAIR with two BROADCAST64 invocations on ptr_b<PTR_ID>.
 * Both invocations receive n_blksize and BLK unchanged; they differ only in
 * the fourth argument (0 for B_lo, 4 for B_hi).
 * NOTE(review): the exact meaning of that fourth argument is defined by
 * BROADCAST64, which is outside this view -- presumably an element offset.
 *
 * ptr_b<PTR_ID>, n_blksize, B_lo and B_hi must be in scope at the point of
 * use. The expansion already ends in ';'.
 */
#define BROADCAST_B_PAIR(PTR_ID, BLK) \
    BROADCAST64(ptr_b##PTR_ID, n_blksize, BLK, 0, B_lo); \
    BROADCAST64(ptr_b##PTR_ID, n_blksize, BLK, 4, B_hi);
@@ -204,17 +209,19 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT *
204209 k_count = k ;
205210 for (; k_count > 3 ; k_count -= 4 ) {
206211 LOAD_A_PAIR (0 );
212+ _mm_prefetch (ptr_a0 + 128 , _MM_HINT_T0 );
207213 ptr_a0 += 16 * 2 ;
208- BROADCAST_B_PAIR (0 , 0 ); MATMUL_4X (0 , 0 , 0 );
209- BROADCAST_B_PAIR (0 , 1 ); MATMUL_4X (0 , 0 , 1 );
210- BROADCAST_B_PAIR (0 , 2 ); MATMUL_4X (0 , 0 , 2 );
214+ BROADCAST_B_PAIR (0 , 0 ); PREFETCH_B ( 0 , 0 ); MATMUL_4X (0 , 0 , 0 );
215+ BROADCAST_B_PAIR (0 , 1 ); PREFETCH_B ( 0 , 1 ); MATMUL_4X (0 , 0 , 1 );
216+ BROADCAST_B_PAIR (0 , 2 ); PREFETCH_B ( 0 , 2 ); MATMUL_4X (0 , 0 , 2 );
211217 ptr_b0 += 4 * 2 ;
212- BROADCAST_B_PAIR (1 , 0 ); MATMUL_4X (0 , 1 , 0 );
213- BROADCAST_B_PAIR (1 , 1 ); MATMUL_4X (0 , 1 , 1 );
214- BROADCAST_B_PAIR (1 , 2 ); MATMUL_4X (0 , 1 , 2 );
218+ BROADCAST_B_PAIR (1 , 0 ); PREFETCH_B ( 1 , 0 ); MATMUL_4X (0 , 1 , 0 );
219+ BROADCAST_B_PAIR (1 , 1 ); PREFETCH_B ( 1 , 1 ); MATMUL_4X (0 , 1 , 1 );
220+ BROADCAST_B_PAIR (1 , 2 ); PREFETCH_B ( 1 , 2 ); MATMUL_4X (0 , 1 , 2 );
215221 ptr_b1 += 4 * 2 ;
216222
217223 LOAD_A_PAIR (0 );
224+ _mm_prefetch (ptr_a0 + 128 , _MM_HINT_T0 );
218225 ptr_a0 += 16 * 2 ;
219226 BROADCAST_B_PAIR (0 , 0 ); MATMUL_4X (0 , 0 , 0 );
220227 BROADCAST_B_PAIR (0 , 1 ); MATMUL_4X (0 , 0 , 1 );
0 commit comments