@@ -72,12 +72,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Diff hunk: DOUBLE-precision setup in front of the path taken when INCX == TEMP.
// Lines tagged '-' are the text removed by the change, '+' the text added.
7272 FABS a1, a1
7373 ADD s1, a1, a0
7474#ifdef DOUBLE
75- xvreplve0.d VM0, VM0
// VI3 = all zero bits (a register XORed with itself); VI4 = -1 broadcast to
// every lane and then converted from integer to floating point.
7675 xvxor.v VI3, VI3, VI3 // 0
7776 li.d I, -1
7877 xvreplgr2vr.d VI4, I
7978 xvffint.d.l VI4, VI4 // -1
// Any INCX different from TEMP is handled at .L20.
8079 bne INCX, TEMP, .L20
80+ // Init VM0
// Seed VM0 from the 8 doubles at X: split them into even-lane (x1) and
// odd-lane (x2) vectors, replace every lane that compares < 0 (VI3) by its
// product with -1 (VI4), and add the two vectors.
// presumably even/odd lanes are the real/imaginary parts - confirm.
// NOTE(review): the xvreplve0.d result below is overwritten by the final
// xvfadd.d without being read in between - looks redundant, confirm.
// NOTE(review): no length check is visible in this hunk before the two
// 4-element loads - confirm N is large enough when this path is reached.
81+ xvreplve0.d VM0, VM0
82+ xvld VX0, X, 0 * SIZE
83+ xvld VX1, X, 4 * SIZE
84+ xvpickev.d x1, VX1, VX0
85+ xvpickod.d x2, VX1, VX0
86+ xvfmul.d x3, VI4, x1
87+ xvfmul.d x4, VI4, x2
88+ xvfcmp.clt.d VT0, x1, VI3
89+ xvfcmp.clt.d VINC8, x2, VI3
90+ xvbitsel.v x1, x1, x3, VT0
91+ xvbitsel.v x2, x2, x4, VINC8
92+ xvfadd.d VM0, x1, x2
93+
// I = N >> 2; branch to .L21 when I <= 0.
8194 addi.d i0, i0, 1
8295 srai.d I, N, 2
8396 bge $r0, I, .L21
@@ -100,12 +113,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Diff hunk: end of the DOUBLE index setup, then the single-precision (#else)
// setup in front of the path taken when INCX == TEMP.
// Lines tagged '-' are the text removed by the change, '+' the text added.
100113 addi.d i0, i0, 2
101114 xvinsgr2vr.d VI0, i0, 3 //4
102115#else
103- xvreplve0.w VM0, VM0
// VI3 = all zero bits; VI4 = -1 broadcast to every 32-bit lane and then
// converted from integer to single-precision floating point.
104116 xvxor.v VI3, VI3, VI3 // 0
105117 li.w I, -1
106118 xvreplgr2vr.w VI4, I
107119 xvffint.s.w VI4, VI4 // -1
// Any INCX different from TEMP is handled at .L20.
108120 bne INCX, TEMP, .L20
121+ // Init VM0
// Seed VM0 from the 16 floats at X: split them into even-lane (x1) and
// odd-lane (x2) vectors, replace every lane that compares < 0 (VI3) by its
// product with -1 (VI4), and add the two vectors.
// presumably even/odd lanes are the real/imaginary parts - confirm.
// NOTE(review): no length check is visible in this hunk before the two
// 8-element loads - confirm N is large enough when this path is reached.
122+ xvld VX0, X, 0 * SIZE
123+ xvld VX1, X, 8 * SIZE
124+ xvpickev.w x1, VX1, VX0
125+ xvpickod.w x2, VX1, VX0
126+ xvfmul.s x3, VI4, x1
127+ xvfmul.s x4, VI4, x2
128+ xvfcmp.clt.s VT0, x1, VI3
129+ xvfcmp.clt.s VINC4, x2, VI3
130+ xvbitsel.v x1, x1, x3, VT0
131+ xvbitsel.v x2, x2, x4, VINC4
132+ xvfadd.s VM0, x1, x2
133+
// I = N >> 3; branch to .L21 when I <= 0.
109134 addi.w i0, i0, 1
110135 srai.d I, N, 3
111136 bge $r0, I, .L21
@@ -160,6 +185,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Diff hunk: end of the DOUBLE branch of a loop body, start of the #else branch.
// x2 lanes that compare < 0 (VI3) are replaced by x4, x1 lanes by x3.
160185 xvfcmp.clt.d VINC8, x2, VI3
161186 xvbitsel.v x1, x1, x3, VT0
162187 xvbitsel.v x2, x2, x4, VINC8
// Added line: the DOUBLE branch now advances X itself, by 8 * SIZE.
188+ addi.d X, X, 8 * SIZE
163189#else
// Single-precision branch: bump the index vector VI1 by VINC8 and load the
// vector that sits 8 elements past X.
164190 xvadd.w VI1, VI1, VINC8
165191 xvld VX1, X, 8 * SIZE
@@ -172,11 +198,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Diff hunk: tail of a loop body that branches back to .L10 while I > 0.
// Lines tagged '-' are the text removed by the change, '+' the text added.
172198 xvfcmp.clt.s VINC4, x2, VI3
173199 xvbitsel.v x1, x1, x3, VT0
174200 xvbitsel.v x2, x2, x4, VINC4
// Added line: the single-precision branch advances X by 16 * SIZE. The
// precision-independent 8 * SIZE advance after the #endif is the '-' line.
201+ addi.d X, X, 16 * SIZE
175202#endif
// x1 = x1 + x2, x3 = min(VM0, x1). VT0 marks lanes where that minimum
// equals the old VM0; in those lanes VM0 and VI0 are kept, in the others
// they take x3 and the current index vector VI1.
176203 XVFADD x1, x1, x2
177204 XVFMIN x3, VM0, x1
178205 XVCMPEQ VT0, x3, VM0
179- addi.d X, X, 8 * SIZE
180206 xvbitsel.v VM0, x3, VM0, VT0
181207 xvbitsel.v VI0, VI1, VI0, VT0
182208 blt $r0, I, .L10
@@ -214,13 +240,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Diff hunk: single-precision reduction over VM0 lanes 1..3 and x1 (the
// instruction that fills x1 is outside this hunk).
// Each step is a compare followed by two xvbitsel.v; xvbitsel.v takes its
// third source operand where the mask VT0 is set, otherwise its second.
// '+' compares: the mask is set when the second candidate is smaller, so
// the smaller value and its index are selected, ties keeping the first.
// '-' compares (removed): the mask was set when the first candidate was
// smaller, so the same selects picked the larger value.
214240 xvpickve.w x2, VM0, 1
215241 xvpickve.w x3, VM0, 2
216242 xvpickve.w x4, VM0, 3
217- xvfcmp.clt.s VT0, x1, x2
243+ xvfcmp.clt.s VT0, x2, x1
218244 xvbitsel.v VM1, x1, x2, VT0
219245 xvbitsel.v VINC4, VI1, VI2, VT0
220- xvfcmp.clt.s VT0, x3, x4
246+ xvfcmp.clt.s VT0, x4, x3
221247 xvbitsel.v VM0, x3, x4, VT0
222248 xvbitsel.v VINC8, VI3, VI4, VT0
// Final step: combine the two pair results (VM0 with VM1, VINC8 with VINC4).
223- xvfcmp.clt.s VT0, VM0, VM1
249+ xvfcmp.clt.s VT0, VM1, VM0
224250 xvbitsel.v VM0, VM0, VM1, VT0
225251 xvbitsel.v VI0, VINC8, VINC4, VT0
226252#endif
@@ -233,28 +259,105 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
233259
// Strided entry point (reached from the "bne INCX, TEMP, .L20" branches).
// Lines tagged '-' are the text removed by the change, '+' the text added.
234260.L20: // INCX!=1
235261#ifdef DOUBLE
262+ // Init VM0
// Gather four element pairs without modifying X: i1 walks X + k * INCX, the
// value at offset 0 of each pair goes to lane k of x1 and the value at
// offset 1 * SIZE to lane k of x2.
// presumably these are the real/imaginary parts - confirm.
// NOTE(review): no length check is visible before these four pair loads -
// confirm N is large enough when this path is reached.
263+ ld.d t1, X, 0 * SIZE
264+ ld.d t2, X, 1 * SIZE
265+ add .d i1, X, INCX
266+ ld.d t3, i1, 0 * SIZE
267+ ld.d t4, i1, 1 * SIZE
268+ add .d i1, i1, INCX
269+ xvinsgr2vr.d x1, t1, 0
270+ xvinsgr2vr.d x2, t2, 0
271+ xvinsgr2vr.d x1, t3, 1
272+ xvinsgr2vr.d x2, t4, 1
273+ ld.d t1, i1, 0 * SIZE
274+ ld.d t2, i1, 1 * SIZE
275+ add .d i1, i1, INCX
276+ ld.d t3, i1, 0 * SIZE
277+ ld.d t4, i1, 1 * SIZE
278+ xvinsgr2vr.d x1, t1, 2
279+ xvinsgr2vr.d x2, t2, 2
280+ xvinsgr2vr.d x1, t3, 3
281+ xvinsgr2vr.d x2, t4, 3
// Replace every lane that compares < 0 (VI3) by its product with VI4, then
// VM0 = x1 + x2. VINC8 is only a scratch mask here.
282+ xvfmul.d x3, VI4, x1
283+ xvfmul.d x4, VI4, x2
284+ xvfcmp.clt.d VT0, x1, VI3
285+ xvfcmp.clt.d VINC8, x2, VI3
286+ xvbitsel.v x1, x1, x3, VT0
287+ xvbitsel.v x2, x2, x4, VINC8
288+ xvfadd.d VM0, x1, x2
289+
// I = N >> 2; branch to .L21 when I <= 0.
236290 addi.d i0, i0, 1
237291 srai.d I, N, 2
238292 bge $r0, I, .L21
// Index vectors: VINC4 = broadcast of i0 << 2; VI1 and VI0 are then filled
// lane by lane from i0. With the '+' lines i0 grows by 1 per lane, so lanes
// hold consecutive values, matching the lane-by-lane gather above. The '-'
// lines stepped +2, -1, +2, i.e. lane order a, a+2, a+1, a+3.
// NOTE(review): the //1..//4 comments hold only if i0 is 0 on entry to this
// hunk - confirm.
239293 slli.d i0, i0, 2 //4
240294 xvreplgr2vr.d VINC4, i0
241295 addi.d i0, i0, -7
242296 xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
243- addi.d i0, i0, 2
297+ addi.d i0, i0, 1
244298 xvinsgr2vr.d VI1, i0, 1
245- addi.d i0, i0, - 1
299+ addi.d i0, i0, 1
246300 xvinsgr2vr.d VI1, i0, 2
247- addi.d i0, i0, 2
301+ addi.d i0, i0, 1
248302 xvinsgr2vr.d VI1, i0, 3
249303 addi.d i0, i0, 1
250304 xvinsgr2vr.d VI0, i0, 0 //1
251- addi.d i0, i0, 2
252- xvinsgr2vr.d VI0, i0, 1 //3
253- addi.d i0, i0, - 1
254- xvinsgr2vr.d VI0, i0, 2 //2
255- addi.d i0, i0, 2
305+ addi.d i0, i0, 1
306+ xvinsgr2vr.d VI0, i0, 1 //2
307+ addi.d i0, i0, 1
308+ xvinsgr2vr.d VI0, i0, 2 //3
309+ addi.d i0, i0, 1
256310 xvinsgr2vr.d VI0, i0, 3 //4
// Single-precision (#else) half of the strided setup; every line tagged '+'
// is added by the change.
257311#else
312+ // Init VM0
// Gather eight element pairs without modifying X: i1 walks X + k * INCX, the
// value at offset 0 of each pair goes to lane k of x1 and the value at
// offset 1 * SIZE to lane k of x2.
// presumably these are the real/imaginary parts - confirm.
// NOTE(review): no length check is visible before these eight pair loads -
// confirm N is large enough when this path is reached.
313+ ld.w t1, X, 0 * SIZE
314+ ld.w t2, X, 1 * SIZE
315+ add .d i1, X, INCX
316+ ld.w t3, i1, 0 * SIZE
317+ ld.w t4, i1, 1 * SIZE
318+ add .d i1, i1, INCX
319+ xvinsgr2vr.w x1, t1, 0
320+ xvinsgr2vr.w x2, t2, 0
321+ xvinsgr2vr.w x1, t3, 1
322+ xvinsgr2vr.w x2, t4, 1
323+ ld.w t1, i1, 0 * SIZE
324+ ld.w t2, i1, 1 * SIZE
325+ add .d i1, i1, INCX
326+ ld.w t3, i1, 0 * SIZE
327+ ld.w t4, i1, 1 * SIZE
328+ add .d i1, i1, INCX
329+ xvinsgr2vr.w x1, t1, 2
330+ xvinsgr2vr.w x2, t2, 2
331+ xvinsgr2vr.w x1, t3, 3
332+ xvinsgr2vr.w x2, t4, 3
333+ ld.w t1, i1, 0 * SIZE
334+ ld.w t2, i1, 1 * SIZE
335+ add .d i1, i1, INCX
336+ ld.w t3, i1, 0 * SIZE
337+ ld.w t4, i1, 1 * SIZE
338+ add .d i1, i1, INCX
339+ xvinsgr2vr.w x1, t1, 4
340+ xvinsgr2vr.w x2, t2, 4
341+ xvinsgr2vr.w x1, t3, 5
342+ xvinsgr2vr.w x2, t4, 5
343+ ld.w t1, i1, 0 * SIZE
344+ ld.w t2, i1, 1 * SIZE
345+ add .d i1, i1, INCX
346+ ld.w t3, i1, 0 * SIZE
347+ ld.w t4, i1, 1 * SIZE
348+ add .d i1, i1, INCX
349+ xvinsgr2vr.w x1, t1, 6
350+ xvinsgr2vr.w x2, t2, 6
351+ xvinsgr2vr.w x1, t3, 7
352+ xvinsgr2vr.w x2, t4, 7
// Replace every lane that compares < 0 (VI3) by its product with VI4, then
// VM0 = x1 + x2. VINC8 is only a scratch mask here.
353+ xvfmul.s x3, VI4, x1
354+ xvfmul.s x4, VI4, x2
355+ xvfcmp.clt.s VT0, x1, VI3
356+ xvfcmp.clt.s VINC8, x2, VI3
357+ xvbitsel.v x1, x1, x3, VT0
358+ xvbitsel.v x2, x2, x4, VINC8
359+ xvfadd.s VM0, x1, x2
360+
// I = N >> 3; branch to .L21 when I <= 0.
258361 addi.w i0, i0, 1
259362 srai.d I, N, 3
260363 bge $r0, I, .L21
@@ -264,31 +367,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Diff hunk: single-precision index vectors for the strided path, filled lane
// by lane from i0. Lines tagged '-' are removed, '+' added.
// With the '+' lines i0 grows by 1 before every lane after the first, so VI1
// and VI0 hold consecutive values. The '-' lines used +3 / -3 steps, giving
// the lane order shown in their comments (1, 2, 5, 6, 3, 4, 7, 8 for VI0).
264367 xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
265368 addi.w i0, i0, 1
266369 xvinsgr2vr.w VI1, i0, 1
267- addi.w i0, i0, 3
370+ addi.w i0, i0, 1
268371 xvinsgr2vr.w VI1, i0, 2
269372 addi.w i0, i0, 1
270373 xvinsgr2vr.w VI1, i0, 3
271- addi.w i0, i0, -3
374+ addi.w i0, i0, 1
272375 xvinsgr2vr.w VI1, i0, 4
273376 addi.w i0, i0, 1
274377 xvinsgr2vr.w VI1, i0, 5
275- addi.w i0, i0, 3
378+ addi.w i0, i0, 1
276379 xvinsgr2vr.w VI1, i0, 6
277380 addi.w i0, i0, 1
278381 xvinsgr2vr.w VI1, i0, 7
// VI0 continues from the value left in i0 by the VI1 fill.
279382 addi.w i0, i0, 1
280383 xvinsgr2vr.w VI0, i0, 0 //1
281384 addi.w i0, i0, 1
282385 xvinsgr2vr.w VI0, i0, 1 //2
283- addi.w i0, i0, 3
284- xvinsgr2vr.w VI0, i0, 2 //5
285386 addi.w i0, i0, 1
286- xvinsgr2vr.w VI0, i0, 3 //6
287- addi.w i0, i0, -3
288- xvinsgr2vr.w VI0, i0, 4 //3
387+ xvinsgr2vr.w VI0, i0, 2 //3
388+ addi.w i0, i0, 1
389+ xvinsgr2vr.w VI0, i0, 3 //4
390+ addi.w i0, i0, 1
391+ xvinsgr2vr.w VI0, i0, 4 //5
392+ addi.w i0, i0, 1
393+ xvinsgr2vr.w VI0, i0, 5 //6
289394 addi.w i0, i0, 1
290- xvinsgr2vr.w VI0, i0, 5 //4
291- addi.w i0, i0, 3
292395 xvinsgr2vr.w VI0, i0, 6 //7
293396 addi.w i0, i0, 1
294397 xvinsgr2vr.w VI0, i0, 7 //8
@@ -350,7 +453,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Diff hunk: middle of a single-precision strided gather into x1 / x2.
350453 xvinsgr2vr.w x2, t2, 4
351454 xvinsgr2vr.w x1, t3, 5
352455 xvinsgr2vr.w x2, t4, 5
// The '-' line removes an index bump (VI1 += VINC8) from this position.
// NOTE(review): whether VI1 is bumped elsewhere in the loop is not visible
// in this hunk - confirm.
353- xvadd.w VI1, VI1, VINC8
// Load the next pair at X, then step X by INCX.
354456 ld.w t1, X, 0 * SIZE
355457 ld.w t2, X, 1 * SIZE
356458 add .d X, X, INCX
@@ -361,8 +463,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Diff hunk: last lane inserts of a single-precision strided gather.
361463 xvinsgr2vr.w x2, t2, 6
362464 xvinsgr2vr.w x1, t3, 7
363465 xvinsgr2vr.w x2, t4, 7
// The two '-' lines rewrote x1 and x2 from VX1 / VX0 immediately after the
// lane inserts above had filled them; the change removes that overwrite.
364- xvpickev.w x1, VX1, VX0
365- xvpickod.w x2, VX1, VX0
366466#endif
// Count down I and start the sign flip: x3 = VI4 * x1.
367467 addi.d I, I, -1
368468 XVFMUL x3, VI4, x1
@@ -410,13 +510,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Diff hunk: single-precision reduction over VM0 lanes 1..3 and x1 (the
// instruction that fills x1 is outside this hunk).
// Each step is a compare followed by xvbitsel.v, which takes its third
// source operand where the mask VT0 is set, otherwise its second.
// '+' compares: the mask is set when the second candidate is smaller, so
// the smaller value and its index are selected, ties keeping the first.
// '-' compares (removed): the mask was set when the first candidate was
// smaller, so the same selects picked the larger value.
410510 xvpickve.w x2, VM0, 1
411511 xvpickve.w x3, VM0, 2
412512 xvpickve.w x4, VM0, 3
413- xvfcmp.clt.s VT0, x1, x2
513+ xvfcmp.clt.s VT0, x2, x1
414514 xvbitsel.v VM1, x1, x2, VT0
415515 xvbitsel.v VINC4, VI1, VI2, VT0
416- xvfcmp.clt.s VT0, x3, x4
516+ xvfcmp.clt.s VT0, x4, x3
417517 xvbitsel.v VM0, x3, x4, VT0
418518 xvbitsel.v VINC8, VI3, VI4, VT0
419- xvfcmp.clt.s VT0, VM0, VM1
519+ xvfcmp.clt.s VT0, VM1, VM0
420520 xvbitsel.v VM0, VM0, VM1, VT0
421521#endif
// Index select after the #endif: VINC4 where VT0 is set, else VINC8.
422522 xvbitsel.v VI0, VINC8, VINC4, VT0
@@ -475,13 +575,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Diff hunk: single-precision reduction over VM0 lanes 5..7 and x1 (the
// instruction that fills x1 is outside this hunk).
// Each step is a compare followed by two xvbitsel.v; xvbitsel.v takes its
// third source operand where the mask VT0 is set, otherwise its second.
// '+' compares: the mask is set when the second candidate is smaller, so
// the smaller value and its index are selected, ties keeping the first.
// '-' compares (removed): the mask was set when the first candidate was
// smaller, so the same selects picked the larger value.
475575 xvpickve.w x2, VM0, 5
476576 xvpickve.w x3, VM0, 6
477577 xvpickve.w x4, VM0, 7
478- xvfcmp.clt.s VT0, x1, x2
578+ xvfcmp.clt.s VT0, x2, x1
479579 xvbitsel.v x1, x1, x2, VT0
480580 xvbitsel.v VINC4, VI1, VI2, VT0
481- xvfcmp.clt.s VT0, x3, x4
581+ xvfcmp.clt.s VT0, x4, x3
482582 xvbitsel.v VM0, x3, x4, VT0
483583 xvbitsel.v VINC8, VI3, VI4, VT0
484- xvfcmp.clt.s VT0, VM0, x1
584+ xvfcmp.clt.s VT0, x1, VM0
485585 xvbitsel.v VM0, VM0, x1, VT0
486586 xvbitsel.v VI0, VINC8, VINC4, VT0
// Equality test of $f15 against $f9; the consumer of $fcc0 is outside this hunk.
487587 fcmp.ceq.d $fcc0, $f15, $f9
@@ -512,7 +612,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Diff hunk: scalar compare-and-select at .L292.
512612 .align 3
513613

514614.L292:
// '+' compare: $fcc0 is set when $f13 < $f15. The two fsel then move $f13
// into $f15 and $f16 into $f20 when the flag is set, so $f15 keeps the
// smaller value and $f20 the value paired with it.
// '-' compare (removed): the flag was set when $f15 < $f13, which made the
// same fsel pair keep the larger value.
515- fcmp.clt.s $fcc0, $f15 , $f13
615+ fcmp.clt.s $fcc0, $f13 , $f15
516616 fsel $f15, $f15, $f13, $fcc0
517617 fsel $f20, $f20, $f16, $fcc0
// Move the selected $f20 value into the general-purpose register i0.
518618 movfr2gr.s i0, $f20
0 commit comments