@@ -30,27 +30,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * Precision-dependent aliases for the RISC-V vector types and intrinsics.
 * The 32-bit element (e32 / f32) variants are selected unless DOUBLE is
 * defined, in which case the 64-bit (e64 / f64) variants are used instead.
 *
 * The segment load/store aliases name the tuple-typed intrinsics (suffix
 * x2 / x4 / x8): a strided segment load yields a single FLOAT_VXn_T value
 * and the matching segment store takes that value as one argument, rather
 * than passing each field vector separately.
 */
#if !defined(DOUBLE)
/* No space between the name and '(' -- VSETVL must be function-like. */
#define VSETVL(n)       __riscv_vsetvl_e32m1(n)
#define FLOAT_V_T       vfloat32m1_t
#define FLOAT_VX2_T     vfloat32m1x2_t
#define FLOAT_VX4_T     vfloat32m1x4_t
#define FLOAT_VX8_T     vfloat32m1x8_t
#define VLEV_FLOAT      __riscv_vle32_v_f32m1
#define VLSEV_FLOAT     __riscv_vlse32_v_f32m1
#define VSEV_FLOAT      __riscv_vse32_v_f32m1
#define VLSSEG2_FLOAT   __riscv_vlsseg2e32_v_f32m1x2
#define VSSEG2_FLOAT    __riscv_vsseg2e32_v_f32m1x2
#define VLSSEG4_FLOAT   __riscv_vlsseg4e32_v_f32m1x4
#define VSSEG4_FLOAT    __riscv_vsseg4e32_v_f32m1x4
#define VLSSEG8_FLOAT   __riscv_vlsseg8e32_v_f32m1x8
#define VSSEG8_FLOAT    __riscv_vsseg8e32_v_f32m1x8
#else
#define VSETVL(n)       __riscv_vsetvl_e64m1(n)
#define FLOAT_V_T       vfloat64m1_t
#define FLOAT_VX2_T     vfloat64m1x2_t
#define FLOAT_VX4_T     vfloat64m1x4_t
#define FLOAT_VX8_T     vfloat64m1x8_t
#define VLEV_FLOAT      __riscv_vle64_v_f64m1
#define VLSEV_FLOAT     __riscv_vlse64_v_f64m1
#define VSEV_FLOAT      __riscv_vse64_v_f64m1
#define VLSSEG2_FLOAT   __riscv_vlsseg2e64_v_f64m1x2
#define VSSEG2_FLOAT    __riscv_vsseg2e64_v_f64m1x2
#define VLSSEG4_FLOAT   __riscv_vlsseg4e64_v_f64m1x4
#define VSSEG4_FLOAT    __riscv_vsseg4e64_v_f64m1x4
#define VLSSEG8_FLOAT   __riscv_vlsseg8e64_v_f64m1x8
#define VSSEG8_FLOAT    __riscv_vsseg8e64_v_f64m1x8
#endif
5561
5662int CNAME (BLASLONG m , BLASLONG n , IFLOAT * a , BLASLONG lda , IFLOAT * b )
@@ -62,7 +68,10 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
6268
6369 IFLOAT * boffset , * boffset1 , * boffset2 , * boffset3 , * boffset4 ;
6470
65- FLOAT_V_T v0 , v1 , v2 , v3 , v4 , v5 , v6 , v7 ;
71+ FLOAT_V_T v0 ;
72+ FLOAT_VX2_T vx2 ;
73+ FLOAT_VX4_T vx4 ;
74+ FLOAT_VX8_T vx8 ;
6675
6776 // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda);
6877
@@ -83,8 +92,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
8392 for (i = (n >> 3 ); i > 0 ; i -- ) {
8493 size_t vl = 8 ;
8594
86- VLSSEG8_FLOAT ( & v0 , & v1 , & v2 , & v3 , & v4 , & v5 , & v6 , & v7 , aoffset1 , lda * sizeof (FLOAT ), vl );
87- VSSEG8_FLOAT (boffset1 , v0 , v1 , v2 , v3 , v4 , v5 , v6 , v7 , vl );
95+ vx8 = VLSSEG8_FLOAT ( aoffset1 , lda * sizeof (FLOAT ), vl );
96+ VSSEG8_FLOAT (boffset1 , vx8 , vl );
8897
8998 aoffset1 += 8 ;
9099 boffset1 += m * 8 ;
@@ -93,8 +102,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
93102 if (n & 4 ) {
94103 size_t vl = 8 ;
95104
96- VLSSEG4_FLOAT ( & v0 , & v1 , & v2 , & v3 , aoffset1 , lda * sizeof (FLOAT ), vl );
97- VSSEG4_FLOAT (boffset2 , v0 , v1 , v2 , v3 , vl );
105+ vx4 = VLSSEG4_FLOAT ( aoffset1 , lda * sizeof (FLOAT ), vl );
106+ VSSEG4_FLOAT (boffset2 , vx4 , vl );
98107
99108 aoffset1 += 4 ;
100109 boffset2 += 32 ;
@@ -103,8 +112,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
103112 if (n & 2 ) {
104113 size_t vl = 8 ;
105114
106- VLSSEG2_FLOAT ( & v0 , & v1 , aoffset1 , lda * sizeof (FLOAT ), vl );
107- VSSEG2_FLOAT (boffset3 , v0 , v1 , vl );
115+ vx2 = VLSSEG2_FLOAT ( aoffset1 , lda * sizeof (FLOAT ), vl );
116+ VSSEG2_FLOAT (boffset3 , vx2 , vl );
108117
109118 aoffset1 += 2 ;
110119 boffset3 += 16 ;
@@ -133,8 +142,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
133142 for (i = (n >> 3 ); i > 0 ; i -- ) {
134143 size_t vl = 4 ;
135144
136- VLSSEG8_FLOAT ( & v0 , & v1 , & v2 , & v3 , & v4 , & v5 , & v6 , & v7 , aoffset1 , lda * sizeof (FLOAT ), vl );
137- VSSEG8_FLOAT (boffset1 , v0 , v1 , v2 , v3 , v4 , v5 , v6 , v7 , vl );
145+ vx8 = VLSSEG8_FLOAT ( aoffset1 , lda * sizeof (FLOAT ), vl );
146+ VSSEG8_FLOAT (boffset1 , vx8 , vl );
138147
139148 aoffset1 += 8 ;
140149 boffset1 += m * 8 ;
@@ -143,8 +152,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
143152 if (n & 4 ) {
144153 size_t vl = 4 ;
145154
146- VLSSEG4_FLOAT ( & v0 , & v1 , & v2 , & v3 , aoffset1 , lda * sizeof (FLOAT ), vl );
147- VSSEG4_FLOAT (boffset2 , v0 , v1 , v2 , v3 , vl );
155+ vx4 = VLSSEG4_FLOAT ( aoffset1 , lda * sizeof (FLOAT ), vl );
156+ VSSEG4_FLOAT (boffset2 , vx4 , vl );
148157
149158 aoffset1 += 4 ;
150159 boffset2 += 16 ;
@@ -153,8 +162,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
153162 if (n & 2 ) {
154163 size_t vl = 4 ;
155164
156- VLSSEG2_FLOAT ( & v0 , & v1 , aoffset1 , lda * sizeof (FLOAT ), vl );
157- VSSEG2_FLOAT (boffset3 , v0 , v1 , vl );
165+ vx2 = VLSSEG2_FLOAT ( aoffset1 , lda * sizeof (FLOAT ), vl );
166+ VSSEG2_FLOAT (boffset3 , vx2 , vl );
158167
159168 aoffset1 += 2 ;
160169 boffset3 += 8 ;
@@ -181,8 +190,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
181190 for (i = (n >> 3 ); i > 0 ; i -- ) {
182191 size_t vl = 2 ;
183192
184- VLSSEG8_FLOAT ( & v0 , & v1 , & v2 , & v3 , & v4 , & v5 , & v6 , & v7 , aoffset1 , lda * sizeof (FLOAT ), vl );
185- VSSEG8_FLOAT (boffset1 , v0 , v1 , v2 , v3 , v4 , v5 , v6 , v7 , vl );
193+ vx8 = VLSSEG8_FLOAT ( aoffset1 , lda * sizeof (FLOAT ), vl );
194+ VSSEG8_FLOAT (boffset1 , vx8 , vl );
186195
187196 aoffset1 += 8 ;
188197 boffset1 += m * 8 ;
@@ -191,8 +200,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
191200 if (n & 4 ) {
192201 size_t vl = 2 ;
193202
194- VLSSEG4_FLOAT ( & v0 , & v1 , & v2 , & v3 , aoffset1 , lda * sizeof (FLOAT ), vl );
195- VSSEG4_FLOAT (boffset2 , v0 , v1 , v2 , v3 , vl );
203+ vx4 = VLSSEG4_FLOAT ( aoffset1 , lda * sizeof (FLOAT ), vl );
204+ VSSEG4_FLOAT (boffset2 , vx4 , vl );
196205
197206 aoffset1 += 4 ;
198207 boffset2 += 8 ;
@@ -201,8 +210,8 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
201210 if (n & 2 ) {
202211 size_t vl = 2 ;
203212
204- VLSSEG2_FLOAT ( & v0 , & v1 , aoffset1 , lda * sizeof (FLOAT ), vl );
205- VSSEG2_FLOAT (boffset3 , v0 , v1 , vl );
213+ vx2 = VLSSEG2_FLOAT ( aoffset1 , lda * sizeof (FLOAT ), vl );
214+ VSSEG2_FLOAT (boffset3 , vx2 , vl );
206215
207216 aoffset1 += 2 ;
208217 boffset3 += 4 ;
0 commit comments