|
50 | 50 | "vmovupd (%0),%%ymm0; vmovupd 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\ |
51 | 51 | acc_m4n2_con(0,1,4,5,6,7,0,16,%1) acc_m4n2_con(0,1,8,9,10,11,0,16,%1,%%r12,1) |
/*
 * KERNEL_2_k1m4n4: inline-asm fragment that fills ymm0/ymm1 via
 * "vpermilpd $5" (imm 0b0101: the two doubles inside each 128-bit lane are
 * swapped) and then invokes acc_m4n2_con twice, passing 8,24 where
 * KERNEL_1_k1m4n4 passes 0,16.
 *
 * In this diff view the "-" row is the previous form, which permuted ymm0 and
 * ymm1 in place; the "+" row is its replacement, which permutes directly from
 * memory at -64(%0) and -32(%0).
 *
 * NOTE(review): KERNEL_1_k1m4n4 loads (%0) and 32(%0) and then executes
 * "addq $64,%0", so the negative offsets presumably re-read those same
 * 64 bytes. That only holds if this macro runs right after KERNEL_1_k1m4n4
 * with %0 untouched in between -- confirm at the use site (not visible here).
 * NOTE(review): acc_m4n2_con is defined outside this view; the meaning of its
 * arguments is not established here.
 */
52 | 52 | #define KERNEL_2_k1m4n4 \ |
53 | | - "vpermilpd $5,%%ymm0,%%ymm0; vpermilpd $5,%%ymm1,%%ymm1;"\ |
| 53 | + "vpermilpd $5,-64(%0),%%ymm0; vpermilpd $5,-32(%0),%%ymm1;"\ |
54 | 54 | acc_m4n2_con(0,1,4,5,6,7,8,24,%1) acc_m4n2_con(0,1,8,9,10,11,8,24,%1,%%r12,1) |
/*
 * KERNEL_1_k1m4n6: expands to KERNEL_1_k1m4n4 followed by one more
 * acc_m4n2_con invocation that uses indices 12..15, the 0,16 pair and the
 * trailing arguments %1,%%r12,2.
 * NOTE(review): presumably indices 12..15 name ymm12..ymm15 and the trailing
 * arguments form a (base,index,scale) address -- acc_m4n2_con is defined
 * outside this view, so verify against its definition.
 */
55 | 55 | #define KERNEL_1_k1m4n6 KERNEL_1_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,0,16,%1,%%r12,2) |
/*
 * KERNEL_2_k1m4n6: expands to KERNEL_2_k1m4n4 followed by one more
 * acc_m4n2_con invocation that uses indices 12..15, the 8,24 pair and the
 * trailing arguments %1,%%r12,2.
 * NOTE(review): presumably indices 12..15 name ymm12..ymm15 and the trailing
 * arguments form a (base,index,scale) address -- acc_m4n2_con is defined
 * outside this view, so verify against its definition.
 */
56 | 56 | #define KERNEL_2_k1m4n6 KERNEL_2_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,8,24,%1,%%r12,2) |
|
93 | 93 | "movq $10,%5; movq $84,%%r15;"\ |
94 | 94 | #ndim"4441:\n\t"\ |
95 | 95 | "prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\ |
96 | | - "prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ |
| 96 | + KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ |
97 | 97 | "testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\ |
98 | | - "prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ |
| 98 | + KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ |
99 | 99 | "addq $4,%5; cmpq %5,%%r13; jnb "#ndim"4441b;"\ |
100 | 100 | "movq %2,%3; negq %5; leaq 10(%%r13,%5,1),%5; prefetcht0 (%6); prefetcht0 15(%6);"\ |
101 | 101 | #ndim"4442:\n\t"\ |
|
0 commit comments