Skip to content

Commit cc1b579

Browse files
committed
Reduce the number of vectors in use from 32 to 24 in the last stage of the main block — now fully LMUL=2.
1 parent 22b7950 commit cc1b579

1 file changed

Lines changed: 28 additions & 58 deletions

File tree

kernel/riscv64/dgemm_kernel_8x8_zvl256b.c

Lines changed: 28 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1709,67 +1709,37 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, F
17091709

17101710
FLOAT *C2 = C;
17111711

1712-
vfloat64m2_t c00;
1713-
c00 = __riscv_vle64_v_f64m2(C, 8); C += ldc;
1714-
vfloat64m1_t c0 = __riscv_vget_v_f64m2_f64m1(c00, 0);
1715-
vfloat64m1_t c1 = __riscv_vget_v_f64m2_f64m1(c00, 1);
1716-
c00 = __riscv_vle64_v_f64m2(C, 8); C += ldc;
1717-
vfloat64m1_t c2 = __riscv_vget_v_f64m2_f64m1(c00, 0);
1718-
vfloat64m1_t c3 = __riscv_vget_v_f64m2_f64m1(c00, 1);
1719-
c00 = __riscv_vle64_v_f64m2(C, 8); C += ldc;
1720-
vfloat64m1_t c4 = __riscv_vget_v_f64m2_f64m1(c00, 0);
1721-
vfloat64m1_t c5 = __riscv_vget_v_f64m2_f64m1(c00, 1);
1722-
c00 = __riscv_vle64_v_f64m2(C, 8); C += ldc;
1723-
vfloat64m1_t c6 = __riscv_vget_v_f64m2_f64m1(c00, 0);
1724-
vfloat64m1_t c7 = __riscv_vget_v_f64m2_f64m1(c00, 1);
1725-
c00 = __riscv_vle64_v_f64m2(C, 8); C += ldc;
1726-
vfloat64m1_t c8 = __riscv_vget_v_f64m2_f64m1(c00, 0);
1727-
vfloat64m1_t c9 = __riscv_vget_v_f64m2_f64m1(c00, 1);
1728-
c00 = __riscv_vle64_v_f64m2(C, 8); C += ldc;
1729-
vfloat64m1_t c10 = __riscv_vget_v_f64m2_f64m1(c00, 0);
1730-
vfloat64m1_t c11 = __riscv_vget_v_f64m2_f64m1(c00, 1);
1731-
c00 = __riscv_vle64_v_f64m2(C, 8); C += ldc;
1732-
vfloat64m1_t c12 = __riscv_vget_v_f64m2_f64m1(c00, 0);
1733-
vfloat64m1_t c13 = __riscv_vget_v_f64m2_f64m1(c00, 1);
1734-
c00 = __riscv_vle64_v_f64m2(C, 8);
1735-
vfloat64m1_t c14 = __riscv_vget_v_f64m2_f64m1(c00, 0);
1736-
vfloat64m1_t c15 = __riscv_vget_v_f64m2_f64m1(c00, 1);
1737-
1738-
c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, 4 );
1739-
c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, 4 );
1740-
c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, 4 );
1741-
c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, 4 );
1742-
c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, 4 );
1743-
c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, 4 );
1744-
c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, 4 );
1745-
c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, 4 );
1746-
c8 = __riscv_vfmacc_vf_f64m1( c8, alpha, result8, 4 );
1747-
c9 = __riscv_vfmacc_vf_f64m1( c9, alpha, result9, 4 );
1748-
c10 = __riscv_vfmacc_vf_f64m1( c10, alpha, result10, 4 );
1749-
c11 = __riscv_vfmacc_vf_f64m1( c11, alpha, result11, 4 );
1750-
c12 = __riscv_vfmacc_vf_f64m1( c12, alpha, result12, 4 );
1751-
c13 = __riscv_vfmacc_vf_f64m1( c13, alpha, result13, 4 );
1752-
c14 = __riscv_vfmacc_vf_f64m1( c14, alpha, result14, 4 );
1753-
c15 = __riscv_vfmacc_vf_f64m1( c15, alpha, result15, 4 );
1712+
vfloat64m2_t c01 = __riscv_vle64_v_f64m2(C, 8); C += ldc;
1713+
vfloat64m2_t c23 = __riscv_vle64_v_f64m2(C, 8); C += ldc;
1714+
vfloat64m2_t c45 = __riscv_vle64_v_f64m2(C, 8); C += ldc;
1715+
vfloat64m2_t c67 = __riscv_vle64_v_f64m2(C, 8); C += ldc;
1716+
1717+
c01 = __riscv_vfmacc_vf_f64m2( c01, alpha, result01, 8 );
1718+
c23 = __riscv_vfmacc_vf_f64m2( c23, alpha, result23, 8 );
1719+
c45 = __riscv_vfmacc_vf_f64m2( c45, alpha, result45, 8 );
1720+
c67 = __riscv_vfmacc_vf_f64m2( c67, alpha, result67, 8 );
1721+
1722+
vfloat64m2_t c89 = __riscv_vle64_v_f64m2(C, 8); C += ldc;
1723+
vfloat64m2_t cAB = __riscv_vle64_v_f64m2(C, 8); C += ldc;
1724+
vfloat64m2_t cCD = __riscv_vle64_v_f64m2(C, 8); C += ldc;
1725+
vfloat64m2_t cEF = __riscv_vle64_v_f64m2(C, 8);
1726+
1727+
c89 = __riscv_vfmacc_vf_f64m2( c89, alpha, result89, 8 );
1728+
cAB = __riscv_vfmacc_vf_f64m2( cAB, alpha, resultAB, 8 );
1729+
cCD = __riscv_vfmacc_vf_f64m2( cCD, alpha, resultCD, 8 );
1730+
cEF = __riscv_vfmacc_vf_f64m2( cEF, alpha, resultEF, 8 );
17541731

17551732
C = C2;
17561733

1757-
c00 = __riscv_vcreate_v_f64m1_f64m2(c0, c1);
1758-
__riscv_vse64_v_f64m2(C, c00, 8); C += ldc;
1759-
c00 = __riscv_vcreate_v_f64m1_f64m2(c2, c3);
1760-
__riscv_vse64_v_f64m2(C, c00, 8); C += ldc;
1761-
c00 = __riscv_vcreate_v_f64m1_f64m2(c4, c5);
1762-
__riscv_vse64_v_f64m2(C, c00, 8); C += ldc;
1763-
c00 = __riscv_vcreate_v_f64m1_f64m2(c6, c7);
1764-
__riscv_vse64_v_f64m2(C, c00, 8); C += ldc;
1765-
c00 = __riscv_vcreate_v_f64m1_f64m2(c8, c9);
1766-
__riscv_vse64_v_f64m2(C, c00, 8); C += ldc;
1767-
c00 = __riscv_vcreate_v_f64m1_f64m2(c10, c11);
1768-
__riscv_vse64_v_f64m2(C, c00, 8); C += ldc;
1769-
c00 = __riscv_vcreate_v_f64m1_f64m2(c12, c13);
1770-
__riscv_vse64_v_f64m2(C, c00, 8); C += ldc;
1771-
c00 = __riscv_vcreate_v_f64m1_f64m2(c14, c15);
1772-
__riscv_vse64_v_f64m2(C, c00, 8);
1734+
__riscv_vse64_v_f64m2(C, c01, 8); C += ldc;
1735+
__riscv_vse64_v_f64m2(C, c23, 8); C += ldc;
1736+
__riscv_vse64_v_f64m2(C, c45, 8); C += ldc;
1737+
__riscv_vse64_v_f64m2(C, c67, 8); C += ldc;
1738+
__riscv_vse64_v_f64m2(C, c89, 8); C += ldc;
1739+
__riscv_vse64_v_f64m2(C, cAB, 8); C += ldc;
1740+
__riscv_vse64_v_f64m2(C, cCD, 8); C += ldc;
1741+
__riscv_vse64_v_f64m2(C, cEF, 8);
1742+
17731743
C = C2 + 8;
17741744
}
17751745

0 commit comments

Comments
 (0)