#define ASSEMBLER

#include "common.h"

/*
 * Register aliases used by the kernel in this file.
 * Comments are written with C block-comment syntax so that the preprocessor
 * removes them before the alias is substituted into an instruction.
 */

/* Incoming arguments. */
#define N      $r4      /* element count; $r4 also carries the result on return */
#define X      $r5      /* address of the current element */
#define INCX   $r6      /* stride; converted from elements to bytes at entry */

/* Integer scratch registers. */
#define I      $r12     /* loop counter */
#define t1     $r13     /* t1..t4: raw 64-bit element loads on the strided path */
#define t2     $r15
#define t3     $r18
#define t4     $r16
#define i0     $r17     /* result index (copied to $r4 on exit) */
#define i1     $r14     /* running index in the scalar tail loop */
#define TEMP   $r19     /* general scratch / read-ahead pointer */

/* Vector scratch registers (the low 64 bits of $xrN are addressed as $fN). */
#define x1     $xr9     /* x1..x4: lanes 0..3 of VM0 during the final reduction */
#define x2     $xr10
#define x3     $xr11
#define x4     $xr12
#define VX0    $xr13    /* first four elements of the current group of eight */
#define VX1    $xr14    /* last four elements of the current group of eight */
#define VM0    $xr15    /* running per-lane maxima */
#define VM1    $xr16    /* per-lane maxima of the current group */
#define VINC4  $xr17    /* constant 4 in every lane (reused as scratch later) */
#define VINC8  $xr18    /* constant 8 in every lane (reused as scratch later) */
#define VI0    $xr20    /* indices belonging to VM0 */
#define VI1    $xr21    /* indices belonging to VX0 */
#define VI2    $xr22    /* indices belonging to VX1 / VM1 */
#define VI3    $xr8     /* lane 2 of VI0 during the final reduction */
#define VI4    $xr19    /* lane 3 of VI0 during the final reduction */
#define VT0    $xr23    /* comparison mask */

/*
 * Index of the maximum element of a vector of doubles (LoongArch LASX).
 *
 * In:    N    ($r4)  number of elements
 *        X    ($r5)  address of the first element
 *        INCX ($r6)  stride between consecutive elements, in elements
 * Out:   $r4         1-based index of the largest element, or 0 when
 *                    N <= 0 or INCX <= 0.  When the maximum occurs more
 *                    than once, the smallest index is returned.
 * Clobb: I, t1-t4, i0, i1, TEMP, X, INCX, $xr8-$xr23, $fcc0
 *
 * All arithmetic uses the .d (64-bit) instruction forms; this presumably
 * relies on common.h defining SIZE as 8 and BASE_SHIFT as 3 -- TODO confirm
 * against the build configuration.
 *
 * Structure:
 *   .L10 / .L24   main loop, 8 elements per iteration (unit / general stride)
 *   .L25 - .L29   horizontal reduction of the 4 lanes, with tie-breaking
 *   .L21 / .L22   scalar loop over the remaining N % 8 elements
 */
    PROLOGUE

    li.d            i0, 0                   // result for the early exits
    bge             $r0, N, .L999           // N <= 0
    bge             $r0, INCX, .L999        // INCX <= 0
    li.d            TEMP, 1
    slli.d          TEMP, TEMP, BASE_SHIFT  // TEMP = size of one element in bytes
    slli.d          INCX, INCX, BASE_SHIFT  // INCX is a byte stride from here on
    bne             INCX, TEMP, .L20        // general-stride path

    // ---- unit stride --------------------------------------------------
    addi.d          i0, i0, 1               // i0 = 1: x[0] is the initial maximum
    srai.d          I, N, 3                 // I = number of full groups of 8
    fld.d           $f15, X, 0              // lane 0 of VM0 = x[0]; the scalar tail
                                            // reads only this lane, so a short
                                            // vector is never read past its end
    bge             $r0, I, .L21            // N < 8: scalar loop only
    xvld            VM0, X, 0               // running maxima = x[0..3]
    slli.d          i0, i0, 2               // 4
    xvreplgr2vr.d   VINC4, i0
    slli.d          i0, i0, 1               // 8
    xvreplgr2vr.d   VINC8, i0
    addi.d          i0, i0, -15             // -7
    // VI1 = {-7,-6,-5,-4}: biased by -8 so that the first "VI1 += 8" in the
    // loop produces the indices {1,2,3,4} of the first four elements.
    xvinsgr2vr.d    VI1, i0, 0
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI1, i0, 1
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI1, i0, 2
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI1, i0, 3
    addi.d          i0, i0, 5               // 1
    // VI0 = {1,2,3,4}: indices of the initial maxima held in VM0.
    xvinsgr2vr.d    VI0, i0, 0
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI0, i0, 1
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI0, i0, 2
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI0, i0, 3
    .align 3

.L10:                                       // 8 contiguous elements per iteration
    xvld            VX0, X, 0 * SIZE
    xvadd.d         VI1, VI1, VINC8         // indices of VX0
    xvld            VX1, X, 4 * SIZE
    xvadd.d         VI2, VI1, VINC4         // indices of VX1
    xvfcmp.clt.d    VT0, VX0, VX1
    addi.d          I, I, -1
    xvbitsel.v      VM1, VX0, VX1, VT0      // VM1 = per-lane max(VX0, VX1)
    xvbitsel.v      VI2, VI1, VI2, VT0      // VI2 = matching indices
    xvfcmp.clt.d    VT0, VM0, VM1           // strict "<": an earlier element wins ties
    addi.d          X, X, 8 * SIZE
    xvbitsel.v      VM0, VM0, VM1, VT0
    xvbitsel.v      VI0, VI0, VI2, VT0
    blt             $r0, I, .L10
    .align 3

.L15:
    b               .L25                    // lane reduction is shared by both paths
    .align 3

.L20:                                       // ---- general stride (INCX != 1) ----
    move            TEMP, X                 // TEMP reads ahead; X stays at x[0]
    addi.d          i0, i0, 1               // i0 = 1: x[0] is the initial maximum
    ld.d            t1, TEMP, 0 * SIZE
    add.d           TEMP, TEMP, INCX
    xvinsgr2vr.d    VM0, t1, 0              // lane 0 of VM0 = x[0]
    srai.d          I, N, 3                 // I = number of full groups of 8
    bge             $r0, I, .L21            // N < 8: scalar loop only
    ld.d            t2, TEMP, 0 * SIZE
    add.d           TEMP, TEMP, INCX
    ld.d            t3, TEMP, 0 * SIZE
    add.d           TEMP, TEMP, INCX
    ld.d            t4, TEMP, 0 * SIZE
    add.d           TEMP, TEMP, INCX
    xvinsgr2vr.d    VM0, t2, 1              // running maxima = first four elements
    xvinsgr2vr.d    VM0, t3, 2
    xvinsgr2vr.d    VM0, t4, 3
    slli.d          i0, i0, 2               // 4
    xvreplgr2vr.d   VINC4, i0
    slli.d          i0, i0, 1               // 8
    xvreplgr2vr.d   VINC8, i0
    addi.d          i0, i0, -15             // -7
    // VI1 = {-7,-6,-5,-4}: biased by -8, see the first "VI1 += 8" in .L24.
    xvinsgr2vr.d    VI1, i0, 0
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI1, i0, 1
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI1, i0, 2
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI1, i0, 3
    addi.d          i0, i0, 5               // 1
    // VI0 = {1,2,3,4}: indices of the initial maxima held in VM0.
    xvinsgr2vr.d    VI0, i0, 0
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI0, i0, 1
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI0, i0, 2
    addi.d          i0, i0, 1
    xvinsgr2vr.d    VI0, i0, 3
    .align 3

.L24:                                       // 8 strided elements per iteration
    ld.d            t1, X, 0 * SIZE
    add.d           X, X, INCX
    ld.d            t2, X, 0 * SIZE
    add.d           X, X, INCX
    ld.d            t3, X, 0 * SIZE
    add.d           X, X, INCX
    ld.d            t4, X, 0 * SIZE
    add.d           X, X, INCX
    xvinsgr2vr.d    VX0, t1, 0
    xvinsgr2vr.d    VX0, t2, 1
    xvinsgr2vr.d    VX0, t3, 2
    xvinsgr2vr.d    VX0, t4, 3
    xvadd.d         VI1, VI1, VINC8         // indices of VX0
    ld.d            t1, X, 0 * SIZE
    add.d           X, X, INCX
    ld.d            t2, X, 0 * SIZE
    add.d           X, X, INCX
    ld.d            t3, X, 0 * SIZE
    add.d           X, X, INCX
    ld.d            t4, X, 0 * SIZE
    add.d           X, X, INCX
    xvinsgr2vr.d    VX1, t1, 0
    xvinsgr2vr.d    VX1, t2, 1
    xvinsgr2vr.d    VX1, t3, 2
    xvinsgr2vr.d    VX1, t4, 3
    xvadd.d         VI2, VI1, VINC4         // indices of VX1
    xvfcmp.clt.d    VT0, VX0, VX1
    addi.d          I, I, -1
    xvbitsel.v      VM1, VX0, VX1, VT0      // VM1 = per-lane max(VX0, VX1)
    xvbitsel.v      VI2, VI1, VI2, VT0      // VI2 = matching indices
    xvfcmp.clt.d    VT0, VM0, VM1           // strict "<": an earlier element wins ties
    xvbitsel.v      VM0, VM0, VM1, VT0
    xvbitsel.v      VI0, VI0, VI2, VT0
    blt             $r0, I, .L24
    .align 3

.L25:                                       // ---- reduce the four lanes ----
    xvpickve.d      VI1, VI0, 0             // per-lane indices -> VI1..VI4
    xvpickve.d      VI2, VI0, 1
    xvpickve.d      VI3, VI0, 2
    xvpickve.d      VI4, VI0, 3
    xvpickve.d      x1, VM0, 0              // per-lane maxima -> x1..x4
    xvpickve.d      x2, VM0, 1
    xvpickve.d      x3, VM0, 2
    xvpickve.d      x4, VM0, 3
    xvfcmp.clt.d    VT0, x1, x2
    xvbitsel.v      VM1, x1, x2, VT0        // VM1 = max(lane 0, lane 1)
    xvbitsel.v      VINC4, VI1, VI2, VT0    // VINC4 reused: index of VM1
    xvfcmp.clt.d    VT0, x3, x4
    xvbitsel.v      VM0, x3, x4, VT0        // VM0 = max(lane 2, lane 3)
    xvbitsel.v      VINC8, VI3, VI4, VT0    // VINC8 reused: index of VM0
    xvfcmp.clt.d    VT0, VM0, VM1
    xvbitsel.v      VM0, VM0, VM1, VT0      // $f15 = overall maximum
    xvbitsel.v      VI0, VINC8, VINC4, VT0  // $f20 = one index attaining it

    // Tie-break: when several lanes hold the maximum, keep the smallest index.
    // $f15 is the maximum and $f9..$f12 are the lane values x1..x4, so each
    // step is a plain scalar equality test followed by VI0 = min(VI0, VIk).
    // (A vector compare mask is all-ones or zero and must not be tested with
    // a floating-point equality against 1.0: that test can never succeed.)
    fcmp.ceq.d      $fcc0, $f15, $f9
    bceqz           $fcc0, .L26
    xvfcmp.clt.d    VT0, VI1, VI0
    xvbitsel.v      VI0, VI0, VI1, VT0
    .align 3

.L26:
    fcmp.ceq.d      $fcc0, $f15, $f10
    bceqz           $fcc0, .L27
    xvfcmp.clt.d    VT0, VI2, VI0
    xvbitsel.v      VI0, VI0, VI2, VT0
    .align 3

.L27:
    fcmp.ceq.d      $fcc0, $f15, $f11
    bceqz           $fcc0, .L28
    xvfcmp.clt.d    VT0, VI3, VI0
    xvbitsel.v      VI0, VI0, VI3, VT0
    .align 3

.L28:
    fcmp.ceq.d      $fcc0, $f15, $f12
    bceqz           $fcc0, .L29
    xvfcmp.clt.d    VT0, VI4, VI0
    xvbitsel.v      VI0, VI0, VI4, VT0
    .align 3

.L29:
    movfr2gr.d      i0, $f20                // i0 = index of the maximum so far
    .align 3

.L21:                                       // ---- scalar tail: N % 8 elements ----
    andi            I, N, 7
    bge             $r0, I, .L999           // nothing left
    srai.d          i1, N, 3
    slli.d          i1, i1, 3
    addi.d          i1, i1, 1               // i1 = 1-based index of the element at X
    movgr2fr.d      $f21, i1                // candidate index, kept as raw bits
    movgr2fr.d      $f20, i0                // best index so far, kept as raw bits
    .align 3

.L22:
    fld.d           $f9, X, 0
    addi.d          I, I, -1
    fcmp.clt.d      $fcc0, $f15, $f9        // strict "<": an earlier element wins ties
    add.d           X, X, INCX
    fsel            $f15, $f15, $f9, $fcc0  // new maximum
    fsel            $f20, $f20, $f21, $fcc0 // and its index
    addi.d          i1, i1, 1
    movgr2fr.d      $f21, i1
    blt             $r0, I, .L22
    movfr2gr.d      i0, $f20
    .align 3

.L999:
    move            $r4, i0                 // return value
    jirl            $r0, $r1, 0x0
    .align 3

    EPILOGUE