Skip to content

Commit c6996a8

Browse files
committed
loongarch64: Refine amax,amin,max,min optimization.
1 parent 3b520a5 commit c6996a8

27 files changed

Lines changed: 1881 additions & 2826 deletions

common_loongarch64.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,10 @@ static inline int WhereAmI(void){
120120
#define CMOVT fsel
121121
#define MTC movgr2fr.d
122122
#define FABS fabs.d
/*
 * Precision-neutral names for the scalar min/max instructions, here bound to
 * the `.d` forms (this is the branch that precedes the `#else` of the
 * conditional closed by `#endif // defined(DOUBLE)`).
 * FMINA/FMAXA are the forms the amax/amin kernels pair with a final FABS;
 * presumably they select by magnitude -- confirm against the LoongArch ISA
 * manual.
 */
123+
#define FMIN fmin.d
124+
#define FMINA fmina.d
125+
#define FMAX fmax.d
126+
#define FMAXA fmaxa.d
123127
#define CMPEQ fcmp.ceq.d
124128
#define CMPLE fcmp.cle.d
125129
#define CMPLT fcmp.clt.d
@@ -128,10 +132,18 @@ static inline int WhereAmI(void){
128132
#define XVFSUB xvfsub.d
129133
#define XVFADD xvfadd.d
130134
#define XVFMADD xvfmadd.d
/*
 * Vector min/max aliases for the `xv*` register file, `.d` element forms.
 * amax_lasx.S loads these registers 32 bytes at a time with xvld.
 */
135+
#define XVFMIN xvfmin.d
136+
#define XVFMINA xvfmina.d
137+
#define XVFMAX xvfmax.d
138+
#define XVFMAXA xvfmaxa.d
131139

132140
#define VFSUB vfsub.d
133141
#define VFADD vfadd.d
134142
#define VFMADD vfmadd.d
/*
 * Same four operations for the `v*` instructions, `.d` element forms.
 * NOTE(review): presumably the 128-bit LSX counterparts used by the
 * *_lsx.S kernels -- confirm.
 */
143+
#define VFMIN vfmin.d
144+
#define VFMINA vfmina.d
145+
#define VFMAX vfmax.d
146+
#define VFMAXA vfmaxa.d
135147

136148
#else
137149

@@ -148,6 +160,10 @@ static inline int WhereAmI(void){
148160
#define CMOVT fsel
149161
#define MTC movgr2fr.w
150162
#define FABS fabs.s
/*
 * Precision-neutral names for the scalar min/max instructions, here bound to
 * the `.s` forms (this is the `#else` branch of the conditional closed by
 * `#endif // defined(DOUBLE)`).
 * FMINA/FMAXA are the forms the amax/amin kernels pair with a final FABS;
 * presumably they select by magnitude -- confirm against the LoongArch ISA
 * manual.
 */
163+
#define FMIN fmin.s
164+
#define FMINA fmina.s
165+
#define FMAX fmax.s
166+
#define FMAXA fmaxa.s
151167
#define CMPEQ fcmp.ceq.s
152168
#define CMPLE fcmp.cle.s
153169
#define CMPLT fcmp.clt.s
@@ -156,10 +172,18 @@ static inline int WhereAmI(void){
156172
#define XVFSUB xvfsub.s
157173
#define XVFADD xvfadd.s
158174
#define XVFMADD xvfmadd.s
/*
 * Vector min/max aliases for the `xv*` register file, `.s` element forms.
 * amax_lasx.S loads these registers 32 bytes at a time with xvld.
 */
175+
#define XVFMIN xvfmin.s
176+
#define XVFMINA xvfmina.s
177+
#define XVFMAX xvfmax.s
178+
#define XVFMAXA xvfmaxa.s
159179

160180
#define VFSUB vfsub.s
161181
#define VFADD vfadd.s
162182
#define VFMADD vfmadd.s
/*
 * Same four operations for the `v*` instructions, `.s` element forms.
 * NOTE(review): presumably the 128-bit LSX counterparts used by the
 * *_lsx.S kernels -- confirm.
 */
183+
#define VFMIN vfmin.s
184+
#define VFMINA vfmina.s
185+
#define VFMAX vfmax.s
186+
#define VFMAXA vfmaxa.s
163187

164188
#endif /* defined(DOUBLE) */
165189

kernel/loongarch64/KERNEL.LOONGSON2K1000

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,17 @@ DDOTKERNEL = dot_lsx.S
77
SSCALKERNEL = sscal_lsx.S
88
DSCALKERNEL = dscal_lsx.S
99

# AMAX/AMIN/MAX/MIN: the per-precision sources (s*/d*_lsx.S) are replaced by
# one shared source per operation, assigned to both the S and D kernel
# variables.
10-
SAMAXKERNEL = samax_lsx.S
11-
DAMAXKERNEL = damax_lsx.S
10+
SAMAXKERNEL = amax_lsx.S
11+
DAMAXKERNEL = amax_lsx.S
1212

13-
SAMINKERNEL = samin_lsx.S
14-
DAMINKERNEL = damin_lsx.S
13+
SAMINKERNEL = amin_lsx.S
14+
DAMINKERNEL = amin_lsx.S
1515

16-
SMAXKERNEL = smax_lsx.S
17-
DMAXKERNEL = dmax_lsx.S
16+
SMAXKERNEL = max_lsx.S
17+
DMAXKERNEL = max_lsx.S
1818

19-
SMINKERNEL = smin_lsx.S
20-
DMINKERNEL = dmin_lsx.S
19+
SMINKERNEL = min_lsx.S
20+
DMINKERNEL = min_lsx.S
2121

2222
ISMAXKERNEL = ismax_lsx.S
2323
IDMAXKERNEL = idmax_lsx.S

kernel/loongarch64/KERNEL.LOONGSON3R5

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,17 @@ DDOTKERNEL = dot_lasx.S
77
SSCALKERNEL = sscal_lasx.S
88
DSCALKERNEL = dscal_lasx.S
99

# AMAX/AMIN: the per-precision sources (s*/d*_lasx.S) are replaced by one
# shared *_lasx.S source per operation for both the S and D variables.
10-
SAMAXKERNEL = samax_lasx.S
11-
DAMAXKERNEL = damax_lasx.S
10+
SAMAXKERNEL = amax_lasx.S
11+
DAMAXKERNEL = amax_lasx.S
1212

13-
SAMINKERNEL = samin_lasx.S
14-
DAMINKERNEL = damin_lasx.S
13+
SAMINKERNEL = amin_lasx.S
14+
DAMINKERNEL = amin_lasx.S
1515

# NOTE(review): MAX/MIN below switch from *_lasx.S to *_lsx.S sources, while
# every other kernel listed in this file uses *_lasx.S. Confirm this is
# intentional and not carried over from KERNEL.LOONGSON2K1000.
16-
SMAXKERNEL = smax_lasx.S
17-
DMAXKERNEL = dmax_lasx.S
16+
SMAXKERNEL = max_lsx.S
17+
DMAXKERNEL = max_lsx.S
1818

19-
SMINKERNEL = smin_lasx.S
20-
DMINKERNEL = dmin_lasx.S
19+
SMINKERNEL = min_lsx.S
20+
DMINKERNEL = min_lsx.S
2121

2222
ISMAXKERNEL = ismax_lasx.S
2323
IDMAXKERNEL = idmax_lasx.S

kernel/loongarch64/amax_lasx.S

Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,232 @@
1+
/***************************************************************************
2+
Copyright (c) 2023, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#define ASSEMBLER
29+
30+
#include "common.h"
31+
32+
/* Inputs as the kernel uses them: N is the element count, X the address the
 * elements are loaded from, INCX the stride (scaled to bytes by the kernel).
 * With F_INTERFACE, N and INCX arrive as addresses and are loaded first. */
#define N $r4
33+
#define X $r5
34+
#define INCX $r6
35+
/* I is the loop counter; TEMP holds the byte size of one element
 * (1 << BASE_SHIFT) for the unit-stride test against INCX. */
36+
#define I $r12
37+
#define TEMP $r13
38+
/* VM0 is the running result, VM1/VM2 hold partial results; VX0..VX3 hold
 * freshly loaded data. The scalar tail and the return path read $f0, so the
 * kernel relies on $f0 overlapping the lowest lane of VM0 ($xr0). */
39+
#define VM0 $xr0
40+
#define VM1 $xr1
41+
#define VM2 $xr2
42+
#define VX0 $xr3
43+
#define VX1 $xr4
44+
#define VX2 $xr5
45+
#define VX3 $xr6
46+
/* Integer temporaries used to gather strided elements before they are
 * inserted into a vector register. */
47+
#define t1 $r14
48+
#define t2 $r15
49+
#define t3 $r16
50+
#define t4 $r17
51+
52+
PROLOGUE

/*
 * AMAX kernel (LASX): reduces the N elements of X (stride INCX) with
 * FMAXA/XVFMAXA and returns FABS of the winner in $f0.
 *
 *   N    - element count; N <= 0 returns 0.
 *   X    - address of the first element.
 *   INCX - stride in elements; INCX <= 0 returns 0.
 *
 * Layout:
 *   .L10/.L11/.L12/.L13 - unit stride: 16 elements per iteration, then a
 *                         scalar tail of N & 15 elements.
 *   .L20..L24/.L999     - general stride: 8 gathered elements per iteration,
 *                         then a scalar tail of N & 7 elements.
 */

#ifdef F_INTERFACE
    LDINT   N,     0(N)
    LDINT   INCX,  0(INCX)
#endif

    /* Give the early exits a defined result: .L999 applies FABS to $f0, and
     * without this store $f0 would still hold whatever the caller left in it. */
    MTC     $f0,   $r0
    bge     $r0,   N,    .L999
    bge     $r0,   INCX, .L999

    li.d    TEMP,  1
    slli.d  TEMP,  TEMP, BASE_SHIFT      /* TEMP = bytes per element      */
    slli.d  INCX,  INCX, BASE_SHIFT      /* INCX = stride in bytes        */

    /* Seed the accumulator with x[0] - x[0] in every lane. */
#ifdef DOUBLE
    xvldrepl.d  VM0, X, 0
#else
    xvldrepl.w  VM0, X, 0
#endif
    XVFSUB  VM0,   VM0,  VM0
    bne     INCX,  TEMP, .L20            /* non-unit stride               */

    srai.d  I,     N,    4               /* I = N / 16                    */
    bge     $r0,   I,    .L11
    .align 3

.L10: /* unit stride, 16 elements per iteration */
#ifdef DOUBLE
    xvld    VX0,   X,    0
    xvld    VX1,   X,    32
    xvld    VX2,   X,    64
    xvld    VX3,   X,    96
    addi.d  I,     I,    -1
    addi.d  X,     X,    128
    XVFMAXA VM1,   VX0,  VX1
    XVFMAXA VM2,   VX2,  VX3
    XVFMAXA VM0,   VM0,  VM1
    XVFMAXA VM0,   VM0,  VM2
#else
    xvld    VX0,   X,    0
    xvld    VX1,   X,    32
    addi.d  I,     I,    -1
    addi.d  X,     X,    64
    XVFMAXA VM1,   VX0,  VX1
    XVFMAXA VM0,   VM0,  VM1
#endif
    blt     $r0,   I,    .L10

    /* Horizontal reduction: first within each 128-bit half, then across the
     * two halves, leaving the result in the lowest lane ($f0). */
#ifdef DOUBLE
    xvrepl128vei.d  VX0, VM0, 0
    xvrepl128vei.d  VX1, VM0, 1
    XVFMAXA VM0,   VX0,  VX1
#else
    xvrepl128vei.w  VX0, VM0, 0
    xvrepl128vei.w  VX1, VM0, 1
    xvrepl128vei.w  VX2, VM0, 2
    xvrepl128vei.w  VX3, VM0, 3
    XVFMAXA VM1,   VX0,  VX1
    XVFMAXA VM2,   VX2,  VX3
    XVFMAXA VM0,   VM1,  VM2
#endif
    xvpermi.q   VM1, VM0, 0x1
    XVFMAXA VM0,   VM0,  VM1
    .align 3

.L11: /* unit stride, remaining N & 15 elements */
    andi    I,     N,    0x0f
    bge     $r0,   I,    .L13
    .align 3

.L12: /* scalar tail, one element per iteration */
    LD      $f1,   X,    0
    addi.d  I,     I,    -1
    addi.d  X,     X,    SIZE
    FMAXA   $f0,   $f0,  $f1
    bnez    I,     .L12
    .align 3

.L13: /* unit-stride return */
    FABS    $f0,   $f0
    jirl    $r0,   $r1,  0x0
    .align 3

.L20: /* INCX != 1 */
    srai.d  I,     N,    3               /* I = N / 8                     */
    bge     $r0,   I,    .L23
    .align 3

.L21: /* general stride, 8 gathered elements per iteration */
#ifdef DOUBLE
    ld.d    t1,    X,    0
    add.d   X,     X,    INCX
    ld.d    t2,    X,    0
    add.d   X,     X,    INCX
    ld.d    t3,    X,    0
    add.d   X,     X,    INCX
    ld.d    t4,    X,    0
    add.d   X,     X,    INCX
    xvinsgr2vr.d    VX0, t1, 0
    xvinsgr2vr.d    VX0, t2, 1
    xvinsgr2vr.d    VX0, t3, 2
    xvinsgr2vr.d    VX0, t4, 3
    ld.d    t1,    X,    0
    add.d   X,     X,    INCX
    ld.d    t2,    X,    0
    add.d   X,     X,    INCX
    ld.d    t3,    X,    0
    add.d   X,     X,    INCX
    ld.d    t4,    X,    0
    add.d   X,     X,    INCX
    xvinsgr2vr.d    VX1, t1, 0
    xvinsgr2vr.d    VX1, t2, 1
    xvinsgr2vr.d    VX1, t3, 2
    xvinsgr2vr.d    VX1, t4, 3
    XVFMAXA VM1,   VX0,  VX1
    XVFMAXA VM0,   VM0,  VM1
#else
    ld.w    t1,    X,    0
    add.d   X,     X,    INCX
    ld.w    t2,    X,    0
    add.d   X,     X,    INCX
    ld.w    t3,    X,    0
    add.d   X,     X,    INCX
    ld.w    t4,    X,    0
    add.d   X,     X,    INCX
    xvinsgr2vr.w    VM1, t1, 0
    xvinsgr2vr.w    VM1, t2, 1
    xvinsgr2vr.w    VM1, t3, 2
    xvinsgr2vr.w    VM1, t4, 3
    ld.w    t1,    X,    0
    add.d   X,     X,    INCX
    ld.w    t2,    X,    0
    add.d   X,     X,    INCX
    ld.w    t3,    X,    0
    add.d   X,     X,    INCX
    ld.w    t4,    X,    0
    add.d   X,     X,    INCX
    xvinsgr2vr.w    VM1, t1, 4
    xvinsgr2vr.w    VM1, t2, 5
    xvinsgr2vr.w    VM1, t3, 6
    xvinsgr2vr.w    VM1, t4, 7
    XVFMAXA VM0,   VM0,  VM1
#endif
    addi.d  I,     I,    -1
    blt     $r0,   I,    .L21
    .align 3

.L22: /* horizontal reduction, same scheme as after .L10 */
#ifdef DOUBLE
    xvrepl128vei.d  VX0, VM0, 0
    xvrepl128vei.d  VX1, VM0, 1
    XVFMAXA VM0,   VX0,  VX1
#else
    xvrepl128vei.w  VX0, VM0, 0
    xvrepl128vei.w  VX1, VM0, 1
    xvrepl128vei.w  VX2, VM0, 2
    xvrepl128vei.w  VX3, VM0, 3
    XVFMAXA VM1,   VX0,  VX1
    XVFMAXA VM2,   VX2,  VX3
    XVFMAXA VM0,   VM1,  VM2
#endif
    xvpermi.q   VM1, VM0, 0x1
    XVFMAXA VM0,   VM0,  VM1
    .align 3

.L23: /* general stride, remaining N & 7 elements */
    andi    I,     N,    7
    bge     $r0,   I,    .L999
    .align 3

.L24: /* scalar tail, one element per iteration */
    LD      $f1,   X,    0
    addi.d  I,     I,    -1
    add.d   X,     X,    INCX
    FMAXA   $f0,   $f0,  $f1
    bnez    I,     .L24
    .align 3

.L999: /* general-stride return and early exit */
    FABS    $f0,   $f0
    jirl    $r0,   $r1,  0x0

EPILOGUE

0 commit comments

Comments
 (0)