Skip to content

Commit 36c12c4

Browse files
committed
loongarch64: Refine copy, swap, nrm2, sum optimization.
1 parent c6996a8 commit 36c12c4

24 files changed

Lines changed: 2159 additions & 2816 deletions

kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,20 +31,20 @@ IDAMAXKERNEL = idamax_lsx.S
3131
ISAMINKERNEL = isamin_lsx.S
3232
IDAMINKERNEL = idamin_lsx.S
3333

34-
SCOPYKERNEL = scopy_lsx.S
35-
DCOPYKERNEL = dcopy_lsx.S
34+
SCOPYKERNEL = copy_lsx.S
35+
DCOPYKERNEL = copy_lsx.S
3636

37-
SSWAPKERNEL = sswap_lsx.S
38-
DSWAPKERNEL = dswap_lsx.S
37+
SSWAPKERNEL = swap_lsx.S
38+
DSWAPKERNEL = swap_lsx.S
3939

4040
SAXPYKERNEL = saxpy_lsx.S
4141
DAXPYKERNEL = daxpy_lsx.S
4242

4343
SAXPBYKERNEL = saxpby_lsx.S
4444
DAXPBYKERNEL = daxpby_lsx.S
4545

46-
SSUMKERNEL = ssum_lsx.S
47-
DSUMKERNEL = dsum_lsx.S
46+
SSUMKERNEL = sum_lsx.S
47+
DSUMKERNEL = sum_lsx.S
4848

4949
SASUMKERNEL = sasum_lsx.S
5050
DASUMKERNEL = dasum_lsx.S

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,20 +31,20 @@ IDAMAXKERNEL = idamax_lasx.S
3131
ISAMINKERNEL = isamin_lasx.S
3232
IDAMINKERNEL = idamin_lasx.S
3333

34-
SCOPYKERNEL = scopy_lasx.S
35-
DCOPYKERNEL = dcopy_lasx.S
34+
SCOPYKERNEL = copy_lasx.S
35+
DCOPYKERNEL = copy_lasx.S
3636

37-
SSWAPKERNEL = sswap_lasx.S
38-
DSWAPKERNEL = dswap_lasx.S
37+
SSWAPKERNEL = swap_lasx.S
38+
DSWAPKERNEL = swap_lasx.S
3939

4040
SAXPYKERNEL = saxpy_lasx.S
4141
DAXPYKERNEL = daxpy_lasx.S
4242

4343
SAXPBYKERNEL = saxpby_lasx.S
4444
DAXPBYKERNEL = daxpby_lasx.S
4545

46-
SSUMKERNEL = ssum_lasx.S
47-
DSUMKERNEL = dsum_lasx.S
46+
SSUMKERNEL = sum_lasx.S
47+
DSUMKERNEL = sum_lasx.S
4848

4949
SASUMKERNEL = sasum_lasx.S
5050
DASUMKERNEL = dasum_lasx.S

kernel/loongarch64/copy_lasx.S

Lines changed: 306 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,306 @@
/*****************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/

#define ASSEMBLER

#include "common.h"

/* LASX COPY kernel (shared by SCOPY and DCOPY):
 *   y[i * inc_y] = x[i * inc_x]   for i in [0, n)
 *
 * Element type is selected at build time: SIZE, BASE_SHIFT and the scalar
 * LD/ST macros come from common.h; DOUBLE gates the 64-bit-element paths.
 *
 * Register roles (argument registers per the defines below):
 *   N    = n (element count); returns immediately when n <= 0
 *   X    = source pointer,      INCX = source stride (elements -> bytes below)
 *   Y    = destination pointer, INCY = destination stride (elements -> bytes)
 *   I    = loop counter: n >> 3 for the 8-element vector loops, then n & 7
 *          for the scalar tails
 */
#define N      $r4
#define X      $r5
#define INCX   $r6
#define Y      $r7
#define INCY   $r8
#define I      $r17
#define TEMP   $r18
#define t1     $r14
#define t2     $r15
#define t3     $r16
#define t4     $r19
#define a1     $f12
#define a2     $f13
#define a3     $f14
#define a4     $f15
#define VX0    $xr12
#define VX1    $xr13

    PROLOGUE
    bge     $r0, N, .L999           // nothing to copy for n <= 0
    li.d    TEMP, 1
    slli.d  TEMP, TEMP, BASE_SHIFT  // TEMP = sizeof(element), the unit stride in bytes
    slli.d  INCX, INCX, BASE_SHIFT  // convert strides from elements to bytes
    slli.d  INCY, INCY, BASE_SHIFT
    srai.d  I, N, 3                 // vector loops consume 8 elements per iteration
    bne     INCX, TEMP, .L20
    bne     INCY, TEMP, .L12        // INCX==1 and INCY!=1
    b       .L11                    // INCX==1 and INCY==1
.L20:
    bne     INCY, TEMP, .L22        // INCX!=1 and INCY!=1
    b       .L21                    // INCX!=1 and INCY==1

/* INCX==1 and INCY==1: straight 256-bit block copy */
.L11:
    bge     $r0, I, .L112
    .align 3

.L111:
    xvld    VX0, X, 0
    addi.d  I, I, -1
    xvst    VX0, Y, 0
#ifdef DOUBLE
    /* 8 doubles span two 32-byte vectors; copy the second half too */
    xvld    VX0, X, 32
    xvst    VX0, Y, 32
#endif
    addi.d  X, X, 8 * SIZE
    addi.d  Y, Y, 8 * SIZE
    blt     $r0, I, .L111
    .align 3

.L112:
    andi    I, N, 7                 // scalar tail: remaining n % 8 elements
    bge     $r0, I, .L999
    .align 3

.L113:
    LD      $f12, X, 0
    addi.d  I, I, -1
    addi.d  X, X, SIZE
    ST      $f12, Y, 0
    addi.d  Y, Y, SIZE
    blt     $r0, I, .L113
    b       .L999
    .align 3

/* INCX==1 and INCY!=1: vector load, element-wise scatter to Y */
.L12:
    bge     $r0, I, .L122
    .align 3

.L121:
#ifdef DOUBLE
    xvld    VX0, X, 0
    xvld    VX1, X, 32
    xvstelm.d VX0, Y, 0, 0
    add.d   Y, Y, INCY
    xvstelm.d VX0, Y, 0, 1
    add.d   Y, Y, INCY
    xvstelm.d VX0, Y, 0, 2
    add.d   Y, Y, INCY
    xvstelm.d VX0, Y, 0, 3
    add.d   Y, Y, INCY
    xvstelm.d VX1, Y, 0, 0
    add.d   Y, Y, INCY
    xvstelm.d VX1, Y, 0, 1
    add.d   Y, Y, INCY
    xvstelm.d VX1, Y, 0, 2
    add.d   Y, Y, INCY
    xvstelm.d VX1, Y, 0, 3
    add.d   Y, Y, INCY
#else
    xvld    VX0, X, 0
    xvstelm.w VX0, Y, 0, 0
    add.d   Y, Y, INCY
    xvstelm.w VX0, Y, 0, 1
    add.d   Y, Y, INCY
    xvstelm.w VX0, Y, 0, 2
    add.d   Y, Y, INCY
    xvstelm.w VX0, Y, 0, 3
    add.d   Y, Y, INCY
    xvstelm.w VX0, Y, 0, 4
    add.d   Y, Y, INCY
    xvstelm.w VX0, Y, 0, 5
    add.d   Y, Y, INCY
    xvstelm.w VX0, Y, 0, 6
    add.d   Y, Y, INCY
    xvstelm.w VX0, Y, 0, 7
    add.d   Y, Y, INCY
#endif
    addi.d  X, X, 8 * SIZE
    addi.d  I, I, -1
    blt     $r0, I, .L121
    .align 3

.L122:
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3

.L123:
    LD      $f12, X, 0
    addi.d  I, I, -1
    addi.d  X, X, SIZE
    ST      $f12, Y, 0
    add.d   Y, Y, INCY
    blt     $r0, I, .L123
    b       .L999
    .align 3

/* INCX!=1 and INCY==1: element-wise gather from X, vector store to Y */
.L21:
    bge     $r0, I, .L212
    .align 3

.L211:
#ifdef DOUBLE
    ld.d    t1, X, 0
    add.d   X, X, INCX
    ld.d    t2, X, 0
    add.d   X, X, INCX
    ld.d    t3, X, 0
    add.d   X, X, INCX
    ld.d    t4, X, 0
    add.d   X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    xvst    VX0, Y, 0
    ld.d    t1, X, 0
    add.d   X, X, INCX
    ld.d    t2, X, 0
    add.d   X, X, INCX
    ld.d    t3, X, 0
    add.d   X, X, INCX
    ld.d    t4, X, 0
    add.d   X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    xvst    VX1, Y, 32
#else
    ld.w    t1, X, 0
    add.d   X, X, INCX
    ld.w    t2, X, 0
    add.d   X, X, INCX
    ld.w    t3, X, 0
    add.d   X, X, INCX
    ld.w    t4, X, 0
    add.d   X, X, INCX
    xvinsgr2vr.w VX0, t1, 0
    xvinsgr2vr.w VX0, t2, 1
    xvinsgr2vr.w VX0, t3, 2
    xvinsgr2vr.w VX0, t4, 3
    ld.w    t1, X, 0
    add.d   X, X, INCX
    ld.w    t2, X, 0
    add.d   X, X, INCX
    ld.w    t3, X, 0
    add.d   X, X, INCX
    ld.w    t4, X, 0
    add.d   X, X, INCX
    xvinsgr2vr.w VX0, t1, 4
    xvinsgr2vr.w VX0, t2, 5
    xvinsgr2vr.w VX0, t3, 6
    xvinsgr2vr.w VX0, t4, 7
    xvst    VX0, Y, 0
#endif
    addi.d  Y, Y, 8 * SIZE
    addi.d  I, I, -1
    blt     $r0, I, .L211
    .align 3

.L212:
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3

.L213:
    LD      $f12, X, 0
    addi.d  I, I, -1
    ST      $f12, Y, 0
    add.d   X, X, INCX
    addi.d  Y, Y, SIZE
    blt     $r0, I, .L213
    b       .L999
    .align 3

/* INCX!=1 and INCY!=1: fully scalar, 8 elements per iteration */
.L22:
    bge     $r0, I, .L223
    .align 3

.L222:
    LD      a1, X, 0
    add.d   X, X, INCX
    LD      a2, X, 0
    add.d   X, X, INCX
    LD      a3, X, 0
    add.d   X, X, INCX
    LD      a4, X, 0
    add.d   X, X, INCX
    ST      a1, Y, 0
    add.d   Y, Y, INCY
    ST      a2, Y, 0
    add.d   Y, Y, INCY
    ST      a3, Y, 0                // BUGFIX: original stored a3/a4 through X,
    add.d   Y, Y, INCY              // corrupting the source; all stores go to Y
    ST      a4, Y, 0
    add.d   Y, Y, INCY
    LD      a1, X, 0
    add.d   X, X, INCX
    LD      a2, X, 0
    add.d   X, X, INCX
    LD      a3, X, 0
    add.d   X, X, INCX
    LD      a4, X, 0
    add.d   X, X, INCX
    ST      a1, Y, 0
    add.d   Y, Y, INCY
    ST      a2, Y, 0
    add.d   Y, Y, INCY
    ST      a3, Y, 0                // BUGFIX: was ST a3, X, 0
    add.d   Y, Y, INCY
    ST      a4, Y, 0                // BUGFIX: was ST a4, X, 0
    add.d   Y, Y, INCY
    addi.d  I, I, -1
    blt     $r0, I, .L222
    .align 3

.L223:
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3

.L224:
    LD      $f12, X, 0
    addi.d  I, I, -1
    ST      $f12, Y, 0
    add.d   X, X, INCX
    add.d   Y, Y, INCY
    blt     $r0, I, .L224
    .align 3

.L999:
    move    $r4, $r0                // return 0 (original returned uninitialized
                                    // $r12; COPY's int result is a success code)
    jirl    $r0, $r1, 0x0
    .align 3

    EPILOGUE

0 commit comments

Comments
 (0)