1+ #define ASSEMBLER
2+
3+ #include "common.h"
4+
// ---------------------------------------------------------------------
// Register aliases used by the kernel below.
//
// The kernel also names some $fN registers directly. $fN overlaps the
// low 64-bit lane of $xrN, so renumbering a vector alias here requires
// updating those direct $fN uses as well.
// ---------------------------------------------------------------------

// Incoming arguments
#define N       $r4     // element count; split into N >> 3 vector rounds and N & 7 tail
#define X       $r5     // address of the next element to load
#define INCX    $r6     // stride; shifted left by BASE_SHIFT on entry, then used as a byte step

// Integer scratch
#define I       $r12    // loop counter (vector rounds, then tail elements)
#define TEMP    $r19    // scratch: unit-stride constant, strided preload pointer
#define i0      $r17    // result index; copied to $r4 on return
#define i1      $r14    // index of the element being visited in the scalar tail
#define t1      $r13    // t1..t4: integer staging for strided element loads
#define t2      $r15
#define t3      $r18
#define t4      $r16

// Data vectors
#define VX0     $xr13   // first four elements of the current 8-element round
#define VX1     $xr14   // last four elements of the current 8-element round
#define VM0     $xr15   // running per-lane minimum
#define VM1     $xr16   // per-lane minimum of VX0 and VX1
#define x1      $xr9    // x1..x4: lanes 0..3 of VM0 extracted for the final reduction
#define x2      $xr10
#define x3      $xr11
#define x4      $xr12

// Index vectors
#define VINC4   $xr17   // broadcast 4 (reused as scratch during the reduction)
#define VINC8   $xr18   // broadcast 8 (reused as scratch during the reduction)
#define VI0     $xr20   // per-lane index belonging to VM0
#define VI1     $xr21   // indices of VX0 (loop) / lane 0 of VI0 (reduction)
#define VI2     $xr22   // indices of VX1 (loop) / lane 1 of VI0 (reduction)
#define VI3     $xr8    // lane 2 of VI0 (reduction)
#define VI4     $xr19   // lane 3 of VI0 (reduction)

// Compare mask
#define VT0     $xr23
32+
/*
 * Index of the minimum element of a strided vector of doubles (LASX).
 *
 * In:   N    ($r4)  number of elements
 *       X    ($r5)  address of the first element
 *       INCX ($r6)  stride in elements
 * Out:  $r4         1-based index of the first occurrence of the minimum,
 *                   or 0 when N <= 0 or INCX <= 0
 *
 * Layout:
 *   .L10 / .L24  main loop, 8 elements per round (unit / non-unit stride)
 *   .L25-.L29    reduction of the 4 lanes to one (value, index) pair
 *   .L21-.L22    scalar tail for the remaining N & 7 elements
 *
 * Every "less than" compare is strict, so within a lane and in the scalar
 * tail an earlier element wins a tie. Ties between lanes are resolved in
 * .L25-.L28 by keeping the smallest index among the lanes that hold the
 * minimum.
 *
 * NOTE(review): the code uses .d (64-bit) loads and compares throughout, so
 * BASE_SHIFT / SIZE from common.h are presumably 3 / 8 here - confirm.
 */
PROLOGUE

	li.d		i0, 0				// result if we bail out early
	bge		$r0, N, .L999			// N <= 0
	bge		$r0, INCX, .L999		// INCX <= 0
	li.d		TEMP, 1
	slli.d		TEMP, TEMP, BASE_SHIFT		// byte size of one element
	slli.d		INCX, INCX, BASE_SHIFT		// stride in bytes from here on
	bne		INCX, TEMP, .L20

	// ---- unit stride ------------------------------------------------
	// Seed the scalar minimum with x[0] only. The 32-byte vector load is
	// issued after the N >= 8 check so short inputs never read past the
	// end of the array.
	fld.d		$f15, X, 0
	addi.d		i0, i0, 1			// index of x[0]
	srai.d		I, N, 3				// number of 8-element rounds
	bge		$r0, I, .L21
	xvld		VM0, X, 0			// per-lane minimum = x[0..3]

	slli.d		i0, i0, 2			// 4
	xvreplgr2vr.d	VINC4, i0
	slli.d		i0, i0, 1			// 8
	xvreplgr2vr.d	VINC8, i0
	addi.d		i0, i0, -15			// -7: VI1 is bumped by 8 before first use
	xvinsgr2vr.d	VI1, i0, 0
	addi.d		i0, i0, 1
	xvinsgr2vr.d	VI1, i0, 1
	addi.d		i0, i0, 1
	xvinsgr2vr.d	VI1, i0, 2
	addi.d		i0, i0, 1
	xvinsgr2vr.d	VI1, i0, 3			// VI1 = {-7, -6, -5, -4}
	addi.d		i0, i0, 5
	xvinsgr2vr.d	VI0, i0, 0			// 1
	addi.d		i0, i0, 1
	xvinsgr2vr.d	VI0, i0, 1			// 2
	addi.d		i0, i0, 1
	xvinsgr2vr.d	VI0, i0, 2			// 3
	addi.d		i0, i0, 1
	xvinsgr2vr.d	VI0, i0, 3			// 4: VI0 = indices of the preloaded VM0
	.align 3

.L10:
	xvld		VX0, X, 0 * SIZE
	xvadd.d		VI1, VI1, VINC8			// indices of VX0
	xvld		VX1, X, 4 * SIZE
	xvadd.d		VI2, VI1, VINC4			// indices of VX1
	xvfcmp.clt.d	VT0, VX1, VX0
	addi.d		I, I, -1
	xvbitsel.v	VM1, VX0, VX1, VT0		// VM1 = min(VX0, VX1), tie keeps VX0
	xvbitsel.v	VI2, VI1, VI2, VT0		// matching index
	xvfcmp.clt.d	VT0, VM1, VM0
	addi.d		X, X, 8 * SIZE
	xvbitsel.v	VM0, VM0, VM1, VT0		// update running minimum, tie keeps old
	xvbitsel.v	VI0, VI0, VI2, VT0
	blt		$r0, I, .L10
	.align 3

.L15:
	b		.L25				// lane reduction is shared by both strides
	.align 3

.L20:	// ---- INCX != 1 ----------------------------------------------------
	move		TEMP, X				// preload through TEMP; X stays on x[0]
	addi.d		i0, i0, 1			// index of x[0]
	ld.d		t1, TEMP, 0 * SIZE
	add.d		TEMP, TEMP, INCX
	xvinsgr2vr.d	VM0, t1, 0			// $f15 = x[0], the seed for the scalar tail
	srai.d		I, N, 3
	bge		$r0, I, .L21
	ld.d		t2, TEMP, 0 * SIZE
	add.d		TEMP, TEMP, INCX
	ld.d		t3, TEMP, 0 * SIZE
	add.d		TEMP, TEMP, INCX
	ld.d		t4, TEMP, 0 * SIZE
	add.d		TEMP, TEMP, INCX
	xvinsgr2vr.d	VM0, t2, 1
	xvinsgr2vr.d	VM0, t3, 2
	xvinsgr2vr.d	VM0, t4, 3			// per-lane minimum = first four elements

	slli.d		i0, i0, 2			// 4
	xvreplgr2vr.d	VINC4, i0
	slli.d		i0, i0, 1			// 8
	xvreplgr2vr.d	VINC8, i0
	addi.d		i0, i0, -15			// -7: VI1 is bumped by 8 before first use
	xvinsgr2vr.d	VI1, i0, 0
	addi.d		i0, i0, 1
	xvinsgr2vr.d	VI1, i0, 1
	addi.d		i0, i0, 1
	xvinsgr2vr.d	VI1, i0, 2
	addi.d		i0, i0, 1
	xvinsgr2vr.d	VI1, i0, 3			// VI1 = {-7, -6, -5, -4}
	addi.d		i0, i0, 5
	xvinsgr2vr.d	VI0, i0, 0			// 1
	addi.d		i0, i0, 1
	xvinsgr2vr.d	VI0, i0, 1			// 2
	addi.d		i0, i0, 1
	xvinsgr2vr.d	VI0, i0, 2			// 3
	addi.d		i0, i0, 1
	xvinsgr2vr.d	VI0, i0, 3			// 4: VI0 = indices of the preloaded VM0
	.align 3

.L24:
	ld.d		t1, X, 0 * SIZE
	add.d		X, X, INCX
	ld.d		t2, X, 0 * SIZE
	add.d		X, X, INCX
	ld.d		t3, X, 0 * SIZE
	add.d		X, X, INCX
	ld.d		t4, X, 0 * SIZE
	add.d		X, X, INCX
	xvinsgr2vr.d	VX0, t1, 0
	xvinsgr2vr.d	VX0, t2, 1
	xvinsgr2vr.d	VX0, t3, 2
	xvinsgr2vr.d	VX0, t4, 3
	xvadd.d		VI1, VI1, VINC8			// indices of VX0
	ld.d		t1, X, 0 * SIZE
	add.d		X, X, INCX
	ld.d		t2, X, 0 * SIZE
	add.d		X, X, INCX
	ld.d		t3, X, 0 * SIZE
	add.d		X, X, INCX
	ld.d		t4, X, 0 * SIZE
	add.d		X, X, INCX
	xvinsgr2vr.d	VX1, t1, 0
	xvinsgr2vr.d	VX1, t2, 1
	xvinsgr2vr.d	VX1, t3, 2
	xvinsgr2vr.d	VX1, t4, 3
	xvadd.d		VI2, VI1, VINC4			// indices of VX1
	xvfcmp.clt.d	VT0, VX1, VX0
	addi.d		I, I, -1
	xvbitsel.v	VM1, VX0, VX1, VT0		// VM1 = min(VX0, VX1), tie keeps VX0
	xvbitsel.v	VI2, VI1, VI2, VT0		// matching index
	xvfcmp.clt.d	VT0, VM1, VM0
	xvbitsel.v	VM0, VM0, VM1, VT0		// update running minimum, tie keeps old
	xvbitsel.v	VI0, VI0, VI2, VT0
	blt		$r0, I, .L24
	.align 3

.L25:	// ---- reduce the four lanes to one (value, index) pair -------------
	xvpickve.d	VI1, VI0, 0			// per-lane indices -> lane 0 of VI1..VI4
	xvpickve.d	VI2, VI0, 1
	xvpickve.d	VI3, VI0, 2
	xvpickve.d	VI4, VI0, 3
	xvpickve.d	x1, VM0, 0			// per-lane minima -> lane 0 of x1..x4
	xvpickve.d	x2, VM0, 1
	xvpickve.d	x3, VM0, 2
	xvpickve.d	x4, VM0, 3
	xvfcmp.clt.d	VT0, x2, x1
	xvbitsel.v	VM1, x1, x2, VT0		// min(x1, x2)
	xvbitsel.v	VINC4, VI1, VI2, VT0		// VINC4 reused as scratch index
	xvfcmp.clt.d	VT0, x4, x3
	xvbitsel.v	VM0, x3, x4, VT0		// min(x3, x4)
	xvbitsel.v	VINC8, VI3, VI4, VT0		// VINC8 reused as scratch index
	xvfcmp.clt.d	VT0, VM1, VM0
	xvbitsel.v	VM0, VM0, VM1, VT0		// lane 0 of VM0 ($f15) = overall minimum
	xvbitsel.v	VI0, VINC8, VINC4, VT0		// lane 0 of VI0 ($f20) = one index holding it

	// When several lanes hold the minimum, keep the smallest index.
	// The values are compared directly ($f15 against the low lane of
	// x1..x4); indices are integers, so they use the integer compare.
	fcmp.ceq.d	$fcc0, $f15, $f9		// lane 0 holds the minimum?
	bceqz		$fcc0, .L26
	xvslt.d		VT0, VI1, VI0
	xvbitsel.v	VI0, VI0, VI1, VT0
	.align 3

.L26:
	fcmp.ceq.d	$fcc0, $f15, $f10		// lane 1 holds the minimum?
	bceqz		$fcc0, .L27
	xvslt.d		VT0, VI2, VI0
	xvbitsel.v	VI0, VI0, VI2, VT0
	.align 3

.L27:
	fcmp.ceq.d	$fcc0, $f15, $f11		// lane 2 holds the minimum?
	bceqz		$fcc0, .L28
	xvslt.d		VT0, VI3, VI0
	xvbitsel.v	VI0, VI0, VI3, VT0
	.align 3

.L28:
	fcmp.ceq.d	$fcc0, $f15, $f12		// lane 3 holds the minimum?
	bceqz		$fcc0, .L29
	xvslt.d		VT0, VI4, VI0
	xvbitsel.v	VI0, VI0, VI4, VT0
	.align 3

.L29:
	movfr2gr.d	i0, $f20			// result of the vector part
	.align 3

.L21:	// ---- scalar tail: the remaining N & 7 elements --------------------
	// On entry $f15 holds the minimum so far and i0 its index.
	andi		I, N, 7
	bge		$r0, I, .L999
	srai.d		i1, N, 3
	slli.d		i1, i1, 3
	addi.d		i1, i1, 1			// index of the first tail element
	movgr2fr.d	$f21, i1
	movgr2fr.d	$f20, i0
	.align 3

.L22:
	fld.d		$f9, X, 0
	addi.d		I, I, -1
	fcmp.clt.d	$fcc0, $f9, $f15		// strictly smaller only: ties keep the earlier index
	add.d		X, X, INCX
	fsel		$f15, $f15, $f9, $fcc0		// new minimum value
	fsel		$f20, $f20, $f21, $fcc0		// and its index
	addi.d		i1, i1, 1
	movgr2fr.d	$f21, i1
	blt		$r0, I, .L22
	movfr2gr.d	i0, $f20
	.align 3

.L999:
	move		$r4, i0				// return value
	jirl		$r0, $r1, 0x0
	.align 3

EPILOGUE
0 commit comments