@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2828#define ASSEMBLER
2929
3030#include "common.h"
31+ #include "loongarch64_asm.S"
3132
3233/* Param */
3334#define M $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5758#define T2 $r28
5859#define T3 $r29
5960#define T4 $r30
61+ #define T5 $r17
62+ #define T6 $r16
6063
6164/* LSX vectors */
6265#define U0 $xr31
@@ -87,75 +90,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
8790#define a8 $f8
8891#define a9 $f9
8992
90-
91- PROLOGUE
92-
93- LDARG BUFFER, $sp , 0
94-
95- addi .d $sp , $sp , -88
96-
97- SDARG $r23, $sp , 0
98- SDARG $r24, $sp , 8
99- SDARG $r25, $sp , 16
100- SDARG $r26, $sp , 32
101- SDARG $r27, $sp , 40
102- SDARG $r28, $sp , 48
103- SDARG $r29, $sp , 56
104- SDARG $r30, $sp , 64
105- SDARG $r31, $sp , 72
106- ST ALPHA, $sp , 80
107-
108- xvldrepl.w VALPHA, $sp , 80
109-
110- slli.d LDA, LDA, BASE_SHIFT
111- slli.d INCX, INCX, BASE_SHIFT
112- slli.d INCY, INCY, BASE_SHIFT
113-
114- bge $r0, M, .L999
115- bge $r0, N, .L999
116-
117- move J, $r0
118- move JY, $r0
119- move JX, $r0
120- move AO1, A
121-
122- beq J , N, .L999
123-
124- .L01:
125- MTC a2 , $r0 //temp2
126- fldx.s a6 , X, JX
127- fmul .s a3, ALPHA, a6 //temp1
128- xvreplve0.w U3, U3
129- xvreplve0.w U2, U2
130-
131- mul.w T0, J, LDA
132- slli.d T1, J, BASE_SHIFT
133- add.w T0, T0, T1
134- fldx.s a6 , AO1, T0
135- fldx.s a4 , Y, JY
136- fmadd.s a4 , a3 , a6 , a4
137- fstx.s a4 , Y, JY
138-
139- move IY, JY
140- move IX, JX
141- addi .d II, J, 1
142- move I, II
143- slli.d II, II, BASE_SHIFT
144-
145- sub .d T0, M, J
146- addi .d T0, T0, -1
147- srai.d T0, T0, 3
148- add .d T0, T0, J
149- addi .d T0, T0, 1
150- beq I , T0, .L03
151- bge I , T0, .L03
152-
153- mul.w T1, J, LDA
154- add .d T1, T1, II
155-
156- .L02: /* /8 */
157- xvldx U1, AO1, T1
158-
// LOAD_Y_8: gather eight consecutive logical elements of Y into vector U4.
// Inputs : Y (base), IY (byte offset of the element before the first one
//          loaded), INCY (stride, already scaled to bytes by the kernel
//          body), T5 (INCY - 1 taken before scaling, so T5 == 0 means the
//          elements are contiguous).
// Output : U4 holds the eight values.
// Clobbers: T2 and scalar/vector temporaries on the strided path, T3 on the
//          contiguous path. T3 is left holding the byte offset IY + INCY and
//          is consumed later by the contiguous path of STORE_Y_8.
// NOTE(review): the labels below are fixed names, not generated per
// expansion, so expanding this macro twice in one file would define the
// same label twice. Only one expansion is visible in this view; confirm
// for the rest of the file.
93+ .macro LOAD_Y_8
// Unit stride: skip the scalar gather and use one 256-bit load instead.
94+ beqz T5, .L01_Y_0
// Strided path: walk Y in steps of INCY bytes starting at IY + INCY and
// load one float at a time (the remaining loads are elided by the diff
// hunk below).
15995 add .d T2, IY, INCY
16096 fldx.s $f4 , Y, T2
16197 add .d T2, T2, INCY
@@ -180,11 +116,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Pack the scalars held in $vr9..$vr11 into lanes 1..3 of $vr8.
180116 vextrins.w $vr8, $vr9, 0x10
181117 vextrins.w $vr8, $vr10, 0x20
182118 vextrins.w $vr8, $vr11, 0x30
183- xvpermi.q U4, U8, 0x02
184-
185- xvfmadd.s U4, U3, U1, U4
186-
187- xvpermi.d U8, U4, 0xee
// Merge the 128-bit group assembled in U8 into U4 to form the full
// 8-element vector (presumably U4/U8 alias $xr4/$xr8; the defines are not
// visible here - TODO confirm).
119+ xvpermi.q U4, U8, 0x02
120+ b .L01_Y_1
// Contiguous path: a single 256-bit load from Y + IY + INCY.
121+ .L01_Y_0:
122+ add .d T3, IY, INCY
123+ xvldx U4, Y, T3
124+ .L01_Y_1:
125+ .endm
126+
// STORE_Y_8: write the eight values held in U4 back to Y.
// Inputs : U4 (values), Y (base), INCY (byte stride), T5 (zero when Y is
//          contiguous), and on the contiguous path T3, which must still
//          hold the byte offset computed by the contiguous path of
//          LOAD_Y_8.
// NOTE(review): LOAD_X_8 also writes T3, so this macro has to be expanded
// after LOAD_Y_8 and before LOAD_X_8, as the kernel loop does.
// NOTE(review): the labels are fixed names; a second expansion in the same
// file would redefine them. Confirm against the part of the file that is
// not visible here.
127+ .macro STORE_Y_8
// Unit stride: go straight to the single vector store.
128+ beqz T5, .L01_Y_2
// Strided path: copy the upper 128 bits of U4 into U8, then spread the
// individual lanes into separate registers so each can be stored with a
// scalar fstx.s.
129+ xvpermi.d U8, U4, 0xee
188130 vextrins.w $vr5, $vr4, 0x01
189131 vextrins.w $vr6, $vr4, 0x02
190132 vextrins.w $vr7, $vr4, 0x03
@@ -209,10 +151,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Tail of the scalar scatter: T2 advances by INCY bytes between stores
// (the earlier stores are elided by the diff hunk above).
209151 fstx.s $f10 , Y, T2
210152 add .d T2, T2, INCY
211153 fstx.s $f11 , Y, T2
212-
213- slli.d T2, INCY, 3
214- add .d IY, IY, T2
215-
154+ b .L01_Y_3
// Contiguous path: one 256-bit store to Y + T3.
155+ .L01_Y_2:
156+ xvstx U4, Y, T3
157+ .L01_Y_3:
158+ .endm
159+
// LOAD_X_8: gather eight consecutive logical elements of X into vector U4.
// Inputs : X (base), IX (byte offset of the element before the first one
//          loaded), INCX (stride, already scaled to bytes by the kernel
//          body), T6 (INCX - 1 taken before scaling, so T6 == 0 means the
//          elements are contiguous).
// Output : U4 holds the eight values, overwriting whatever Y data it held.
// Clobbers: T2 and scalar/vector temporaries on the strided path, T3 on the
//          contiguous path.
// NOTE(review): the labels are fixed names; a second expansion in the same
// file would redefine them. Confirm against the part of the file that is
// not visible here.
160+ .macro LOAD_X_8
// Unit stride: skip the scalar gather and use one 256-bit load instead.
161+ beqz T6, .L01_X_0
// Strided path: walk X in steps of INCX bytes starting at IX + INCX and
// load one float at a time (the remaining loads are elided by the diff
// hunk below).
216162 add .d T2, IX, INCX
217163 fldx.s $f4 , X, T2
218164 add .d T2, T2, INCX
@@ -238,39 +184,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Finish packing scalars into $vr8, then merge that 128-bit group into U4.
238184 vextrins.w $vr8, $vr10, 0x20
239185 vextrins.w $vr8, $vr11, 0x30
240186 xvpermi.q U4, U8, 0x02
187+ b .L01_X_1
// Contiguous path: a single 256-bit load from X + IX + INCX.
188+ .L01_X_0:
189+ add .d T3, IX, INCX
190+ xvldx U4, X, T3
191+ .L01_X_1:
192+ .endm
193+
// Kernel entry. For each column J the visible code updates the diagonal
// term of Y, then runs an 8-wide loop (.L02) over the rows below the
// diagonal that both updates Y and accumulates a dot product in U2.
// The remainder handling that starts at .L03 is elided from this view.
194+ PROLOGUE
241195
242- xvand.v $xr12 , $xr2, $xr2
// Reserve an 88-byte frame: nine saved registers plus one slot for ALPHA.
// NOTE(review): 88 is not a multiple of 16, while the LoongArch psABI asks
// for a 16-byte aligned stack pointer. No call is visible in this body, so
// this may be harmless - confirm.
196+ addi .d $ sp , $sp , -88
243197
244- xvfmadd.s U2, U1, U4, U2
245- xvfsub.s U2, U2, $xr12
// Save $r23..$r31. The slot at offset 24 is left unused.
// T5/T6 are defined as $r17/$r16 and are not saved here (temporary
// registers in the LoongArch calling convention).
198+ SDARG $r23, $sp , 0
199+ SDARG $r24, $sp , 8
200+ SDARG $r25, $sp , 16
201+ SDARG $r26, $sp , 32
202+ SDARG $r27, $sp , 40
203+ SDARG $r28, $sp , 48
204+ SDARG $r29, $sp , 56
205+ SDARG $r30, $sp , 64
206+ SDARG $r31, $sp , 72
// Spill ALPHA so it can be broadcast from memory into every lane of VALPHA.
207+ ST ALPHA, $sp , 80
246208
247- xvpickve.w U4, U2, 0x01
248- xvpickve.w U5, U2, 0x02
249- xvpickve.w U6, U2, 0x03
250- xvpickve.w U7, U2, 0x04
251- xvpickve.w U8, U2, 0x05
252- xvpickve.w U9, U2, 0x06
253- xvpickve.w U10, U2, 0x07
209+ xvldrepl.w VALPHA, $sp , 80
254210
255- fadd .s $f2, $f2, $f4
256- fadd .s $f2, $f2, $f5
257- fadd .s $f2, $f2, $f6
258- fadd .s $f2, $f2, $f7
259- fadd .s $f2, $f2, $f8
260- fadd .s $f2, $f2, $f9
261- fadd .s $f2, $f2, $f10
262- fadd .s $f2, $f2, $f12
// Contiguity flags, taken BEFORE the strides are scaled: T5 == 0 iff
// INCY == 1 and T6 == 0 iff INCX == 1. The LOAD/STORE macros test them.
211+ addi .d T5, INCY, -1
212+ addi .d T6, INCX, -1
// Scale element counts by BASE_SHIFT (presumably log2 of the element size,
// turning them into byte strides - TODO confirm in common.h).
213+ slli.d LDA, LDA, BASE_SHIFT
214+ slli.d INCX, INCX, BASE_SHIFT
215+ slli.d INCY, INCY, BASE_SHIFT
263216
264- xvreplve0.d U2, U2
// Nothing to do when M <= 0 or N <= 0.
217+ bge $r0, M, .L999
218+ bge $r0, N, .L999
219+
// Column index and its byte offsets into Y and X start at zero.
220+ move J, $r0
221+ move JY, $r0
222+ move JX, $r0
223+ move AO1, A
265224
266- slli.d T2, INCX, 3
267- add .d IX, IX, T2
// NOTE(review): J is zero here and N <= 0 already exited above, so this
// branch looks unreachable.
225+ beq J , N, .L999
226+
227+ .L01:
// Per-column setup: clear the vector accumulator U2, then form
// temp1 = ALPHA * X[JX] in a3.
228+ xvxor.v U2, U2, U2
229+ fldx.s a6 , X, JX
230+ fmul .s a3, ALPHA, a6 //temp1
// Broadcast lane 0 of U3 to all lanes (presumably a3 is the low lane of
// U3; the defines are not visible here - TODO confirm).
231+ xvreplve0.w U3, U3
232+
// T0 = J * LDA + (J << BASE_SHIFT): byte offset of the diagonal element of
// column J. NOTE(review): mul.w and add.w work on 32-bit values; confirm
// the offset cannot exceed 32 bits for large matrices.
233+ mul.w T0, J, LDA
234+ slli.d T1, J, BASE_SHIFT
235+ add.w T0, T0, T1
// Y[JY] += temp1 * A[diagonal]
236+ fldx.s a6 , AO1, T0
237+ fldx.s a4 , Y, JY
238+ fmadd.s a4 , a3 , a6 , a4
239+ fstx.s a4 , Y, JY
240+
// Row cursors start at the column position; I counts loop trips from J + 1
// and II is the same start index converted to a byte offset.
241+ move IY, JY
242+ move IX, JX
243+ addi .d II, J, 1
244+ move I, II
245+ slli.d II, II, BASE_SHIFT
246+
// T0 = ((M - J - 1) >> 3) + J + 1: loop bound for I, giving
// (M - J - 1) / 8 trips of the 8-wide loop.
247+ sub .d T0, M, J
248+ addi .d T0, T0, -1
249+ srai.d T0, T0, 3
250+ add .d T0, T0, J
251+ addi .d T0, T0, 1
// Skip the vector loop when there is no full group of eight. The bge
// already covers the equal case handled by the beq.
252+ beq I , T0, .L03
253+ bge I , T0, .L03
254+
// T1 = J * LDA + II: byte offset of the first element below the diagonal.
255+ mul.w T1, J, LDA
256+ add .d T1, T1, II
257+
258+ .L02: /* /8 */
// Load eight elements of the current column of A.
259+ xvldx U1, AO1, T1
260+
261+ LOAD_Y_8
262+
// y[i..i+7] += temp1 * a[i..i+7]
263+ xvfmadd.s U4, U3, U1, U4
264+
265+ STORE_Y_8
266+
// IY += 8 * INCY (alsl.d computes (INCY << 3) + IY).
267+ alsl.d IY, INCY, IY, 3
268+
269+ LOAD_X_8
270+
// U2 += a[i..i+7] * x[i..i+7], lane by lane (the temp2 partial sums).
271+ xvfmadd.s U2, U1, U4, U2
272+
// IX += 8 * INCX.
273+ alsl.d IX, INCX, IX, 3
268274
// Advance the byte offsets by 32 and the trip counter by one.
269275 addi .d II, II, 32
270276 addi .d T1, T1, 32
271277 addi .d I, I, 1
272278 blt I , T0, .L02
273279
// GACC comes from the included loongarch64_asm.S; presumably it reduces the
// lanes of U2 into U4 - TODO confirm. The scalar result is then copied from
// $f4 to $f2.
280+ //Acc U2
281+ GACC xvf, s, U4, U2
282+ fmov.d $f2 , $f4
283+
// Remainder handling begins here; most of it is elided by the diff hunk.
274284.L03: /* &4 */
275285 sub .d T0, M, J
276286 addi .d T0, T0, -1
@@ -433,4 +443,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Release the 88-byte frame and return through $r1. The register restores
// are not visible in this view.
433443 addi .d $sp , $sp , 88
434444 jirl $r0, $r1, 0x0
435445
436- EPILOGUE
446+ EPILOGUE
0 commit comments