#define ASSEMBLER
#include "common.h"
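
/* Double-precision sum of absolute values (asum) kernel using 256-bit LASX
 * vector instructions.  The vector loops handle eight doubles per
 * iteration; |x| is formed by comparing each lane against zero and
 * selecting between x and -x, the four accumulator lanes are then folded
 * together, and any leftover elements are added scalarly. */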
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define t1 $r15
#define t2 $r12
#define t3 $r13
#define t4 $r14
#define VX0 $xr12
#define VX1 $xr13
#define VX2 $xr14
#define VX3 $xr15
#define VT0 $xr23
#define VT1 $xr22
#define res1 $xr16
#define res2 $xr17
#define res0 $xr18
#define neg1 $xr19

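/* Arguments: N = element count, X = source pointer, INCX = stride in
 * elements.  res1 accumulates partial sums, res0 stays zero for the
 * compares, and neg1 holds -1.0 in every lane. */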
    PROLOGUE
    xvxor.v res1, res1, res1
    xvxor.v res2, res2, res2
    xvxor.v res0, res0, res0
    bge $r0, N, .L999
    bge $r0, INCX, .L999
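    /* Materialize -1.0 in every lane of neg1: replicate the integer -1
     * into the vector register, then convert it to double. */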
    li.d t1, -1
    xvreplgr2vr.d neg1, t1
    xvffint.d.l neg1, neg1
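    /* TEMP holds the element size in bytes, INCX becomes a byte stride,
     * and I counts the 8-element vector iterations.  A non-unit stride
     * takes the gather path at .L20; fewer than eight elements fall
     * through to the scalar tail at .L13. */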
    li.d TEMP, SIZE
    slli.d INCX, INCX, BASE_SHIFT
    srai.d I, N, 3
    bne INCX, TEMP, .L20
    bge $r0, I, .L13
    .align 3

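/* Unit-stride main loop: load two vectors of four doubles each, take the
 * absolute value of every lane (select -x where x < 0), and accumulate
 * into res1. */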
.L11:
    xvld VX0, X, 0 * SIZE
    xvld VX1, X, 4 * SIZE
    xvfmul.d VX2, neg1, VX0
    xvfmul.d VX3, neg1, VX1
    xvfcmp.clt.d VT0, VX0, res0
    xvfcmp.clt.d VT1, VX1, res0
    xvbitsel.v VX0, VX0, VX2, VT0
    xvbitsel.v VX1, VX1, VX3, VT1
    xvfadd.d res2, VX0, VX1
    xvfadd.d res1, res1, res2
    addi.d X, X, 8 * SIZE
    addi.d I, I, -1
    blt $r0, I, .L11
    .align 3

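/* Horizontal reduction: fold the four lanes of res1 into lane 0, which is
 * the same storage as the scalar register $f16. */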
.L12:
    xvpickve.d VX1, res1, 1
    xvpickve.d VX2, res1, 2
    xvpickve.d VX3, res1, 3
    xvfadd.d res1, VX1, res1
    xvfadd.d res1, VX2, res1
    xvfadd.d res1, VX3, res1
    .align 3

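/* Scalar tail for the unit-stride path: add the remaining N % 8 elements
 * one at a time into $f16. */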
.L13:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L14:
    fld.d $f12, X, 0 * SIZE
    fabs.d $f12, $f12
    fadd.d $f16, $f12, $f16
    addi.d I, I, -1
    addi.d X, X, SIZE
    blt $r0, I, .L14
    b .L999
    .align 3

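/* Strided path (INCX != 1 element): gather eight elements with scalar
 * loads, insert them into two vectors, and reuse the same absolute-value
 * and accumulate sequence as the unit-stride loop. */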
.L20:
    bge $r0, I, .L23
    .align 3

.L21:
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    xvfmul.d VX2, neg1, VX0
    xvfmul.d VX3, neg1, VX1
    xvfcmp.clt.d VT0, VX0, res0
    xvfcmp.clt.d VT1, VX1, res0
    xvbitsel.v VX0, VX0, VX2, VT0
    xvbitsel.v VX1, VX1, VX3, VT1
    xvfadd.d res2, VX0, VX1
    xvfadd.d res1, res1, res2
    addi.d I, I, -1
    blt $r0, I, .L21
    .align 3

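/* Lane reduction for the strided path, identical to .L12. */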
.L22:
    xvpickve.d VX1, res1, 1
    xvpickve.d VX2, res1, 2
    xvpickve.d VX3, res1, 3
    xvfadd.d res1, VX1, res1
    xvfadd.d res1, VX2, res1
    xvfadd.d res1, VX3, res1
    .align 3

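/* Scalar tail for the strided path: step X by the byte stride instead of
 * by one element. */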
.L23:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L24:
    fld.d $f12, X, 0 * SIZE
    fabs.d $f12, $f12
    fadd.d $f16, $f12, $f16
    addi.d I, I, -1
    add.d X, X, INCX
    blt $r0, I, .L24
    .align 3

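/* Epilogue: the final sum is in $f16 (the low lane of res1); move it to
 * the return register $f0 and return. */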
.L999:
    fmov.d $f0, $f16
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE