@@ -70,18 +70,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
7070 LD a1, X, 1 * SIZE
7171 FABS a0, a0
7272 FABS a1, a1
73- ADD s1, a1, a0
74- vreplvei.w VM0, VM0, 0
73+ ADD s1, a1, a0 // Initialization value
7574 vxor.v VI3, VI3, VI3 // 0
7675#ifdef DOUBLE
7776 li.d I, -1
7877 vreplgr2vr.d VI4, I
7978 vffint.d.l VI4, VI4 // -1
80- bne INCX, TEMP, .L20
79+ bne INCX, TEMP, .L20 // incx != 1
80+
81+ // Init Index
8182 addi.d i0, i0, 1
82- srai.d I, N, 2
83- bge $r0, I, .L21
84- slli.d i0, i0, 1 //2
83+ slli.d i0, i0, 1 // 2
8584 vreplgr2vr.d VINC4, i0
8685 addi.d i0, i0, -3
8786 vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
@@ -91,14 +90,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
9190 vinsgr2vr.d VI0, i0, 0 //1
9291 addi.d i0, i0, 1
9392 vinsgr2vr.d VI0, i0, 1 //2
93+
94+ srai.d I, N, 2
95+ bge $r0, I, .L21
96+
97+	// Init VM0
98+	vld VX0, X, 0 * SIZE
99+	vld VX1, X, 2 * SIZE
100+	vpickev.d x1, VX1, VX0          // x1 = even lanes = real parts of elements 0..1
101+	vpickod.d x2, VX1, VX0          // x2 = odd lanes  = imag parts of elements 0..1
102+	vfmul.d x3, VI4, x1             // x3 = -x1 (VI4 holds -1.0 per lane)
103+	vfmul.d x4, VI4, x2             // x4 = -x2
104+	vfcmp.clt.d VT0, x1, VI3        // mask: x1 < 0 (VI3 is all zeros)
105+	vfcmp.clt.d VINC8, x2, VI3      // mask: x2 < 0
106+	vbitsel.v x1, x1, x3, VT0       // x1 = |re| (pick -x1 where negative)
107+	vbitsel.v x2, x2, x4, VINC8     // x2 = |im|
108+	vfadd.d VM0, x1, x2             // VM0 = |re| + |im| per lane (seed for the reduction)
94109#else
95110 li.w I, -1
96111 vreplgr2vr.w VI4, I
97112 vffint.s.w VI4, VI4 // -1
98- bne INCX, TEMP, .L20
113+ bne INCX, TEMP, .L20 // incx != 1
114+
115+ // Init Index
99116 addi.w i0, i0, 1
100- srai.d I, N, 2
101- bge $r0, I, .L21
102117 slli.w i0, i0, 2 //4
103118 vreplgr2vr.w VINC4, i0
104119 addi.w i0, i0, -7
@@ -117,6 +132,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
117132 vinsgr2vr.w VI0, i0, 2 //3
118133 addi.w i0, i0, 1
119134 vinsgr2vr.w VI0, i0, 3 //4
135+
136+ srai.d I, N, 2
137+ bge $r0, I, .L21
138+
139+	// Init VM0
140+	vld VX0, X, 0 * SIZE
141+	vld VX1, X, 4 * SIZE
142+	vpickev.w x1, VX1, VX0          // x1 = even lanes = real parts of elements 0..3
143+	vpickod.w x2, VX1, VX0          // x2 = odd lanes  = imag parts of elements 0..3
144+	vfmul.s x3, VI4, x1             // x3 = -x1 (VI4 holds -1.0 per lane)
145+	vfmul.s x4, VI4, x2             // x4 = -x2
146+	vfcmp.clt.s VT0, x1, VI3        // mask: x1 < 0 (VI3 is all zeros)
147+	vfcmp.clt.s VINC8, x2, VI3      // mask: x2 < 0
148+	vbitsel.v x1, x1, x3, VT0       // x1 = |re| (pick -x1 where negative)
149+	vbitsel.v x2, x2, x4, VINC8     // x2 = |im|
150+	vfadd.s VM0, x1, x2             // VM0 = |re| + |im| per lane (seed for the reduction)
120151#endif
121152 .align 3
122153
@@ -139,6 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
139170 vfcmp.ceq.d VT0, x3, VM0
140171 vbitsel.v VM0, x3, VM0, VT0
141172 vbitsel.v VI0, VI1, VI0, VT0
173+
142174 vld VX0, X, 4 * SIZE
143175 vadd.d VI1, VI1, VINC4
144176 vld VX1, X, 6 * SIZE
@@ -206,9 +238,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
206238.L20: // INCX!=1
207239#ifdef DOUBLE
208240 addi.d i0, i0, 1
209- srai.d I, N, 2
210- bge $r0, I, .L21
211- slli.d i0, i0, 1 //2
241+ // Init index
242+ slli.d i0, i0, 1 //2
212243 vreplgr2vr.d VINC4, i0
213244 addi.d i0, i0, -3
214245 vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
@@ -218,10 +249,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
218249 vinsgr2vr.d VI0, i0, 0 //1
219250 addi.d i0, i0, 1
220251 vinsgr2vr.d VI0, i0, 1 //2
252+
253+ srai.d I, N, 2
254+ bge $r0, I, .L21 // N < 4
255+
256+	// Init VM0 (strided DOUBLE path: gather 2 complex elements via INCX)
257+	ld.d t1, X, 0 * SIZE            // re0
258+	ld.d t2, X, 1 * SIZE            // im0
259+	add.d i1, X, INCX               // FIX: was "add .d" (stray space = assembler error); i1 -> element 1 (INCX presumably pre-scaled by SIZE — confirm in preamble)
260+	ld.d t3, i1, 0 * SIZE           // re1
261+	ld.d t4, i1, 1 * SIZE           // im1
262+	add.d i1, i1, INCX              // FIX: was "add .d"; advance past the gathered pair
263+	vinsgr2vr.d x1, t1, 0
264+	vinsgr2vr.d x2, t2, 0
265+	vinsgr2vr.d x1, t3, 1           // x1 = {re0, re1}
266+	vinsgr2vr.d x2, t4, 1           // x2 = {im0, im1}
267+	vfmul.d x3, VI4, x1             // x3 = -x1 (VI4 holds -1.0 per lane)
268+	vfmul.d x4, VI4, x2             // x4 = -x2
269+	vfcmp.clt.d VT0, x1, VI3        // mask: x1 < 0 (VI3 is all zeros)
270+	vfcmp.clt.d VINC8, x2, VI3      // mask: x2 < 0
271+	vbitsel.v x1, x1, x3, VT0       // x1 = |re|
272+	vbitsel.v x2, x2, x4, VINC8     // x2 = |im|
273+	vfadd.d VM0, x1, x2             // VM0 = |re| + |im| per lane (seed for the reduction)
221274#else
222275 addi.w i0, i0, 1
223- srai.d I, N, 2
224- bge $r0, I, .L21
276+
277+ // Init index
225278 slli.w i0, i0, 2 //4
226279 vreplgr2vr.w VINC4, i0
227280 addi.w i0, i0, -7
@@ -240,6 +293,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
240293 vinsgr2vr.w VI0, i0, 2 //3
241294 addi.w i0, i0, 1
242295 vinsgr2vr.w VI0, i0, 3 //4
296+
297+ srai.d I, N, 2
298+ bge $r0, I, .L21 // N < 4
299+
300+	// Init VM0 (strided float path: gather 4 complex elements via INCX)
301+	ld.w t1, X, 0 * SIZE            // re0
302+	ld.w t2, X, 1 * SIZE            // im0
303+	add.d i1, X, INCX               // FIX: was "add .d" (stray space = assembler error)
304+	ld.w t3, i1, 0 * SIZE           // re1
305+	ld.w t4, i1, 1 * SIZE           // im1
306+	add.d i1, i1, INCX              // FIX: was "add .d"
307+	vinsgr2vr.w x1, t1, 0
308+	vinsgr2vr.w x2, t2, 0
309+	vinsgr2vr.w x1, t3, 1
310+	vinsgr2vr.w x2, t4, 1
311+	ld.w t1, i1, 0 * SIZE           // re2
312+	ld.w t2, i1, 1 * SIZE           // im2
313+	add.d i1, i1, INCX              // FIX: was "add .d"
314+	ld.w t3, i1, 0 * SIZE           // re3
315+	ld.w t4, i1, 1 * SIZE           // im3
316+	add.d i1, i1, INCX              // FIX: was "add .d"
317+	vinsgr2vr.w x1, t1, 2
318+	vinsgr2vr.w x2, t2, 2
319+	vinsgr2vr.w x1, t3, 3           // x1 = {re0..re3}
320+	vinsgr2vr.w x2, t4, 3           // x2 = {im0..im3}
321+	vfcmp.clt.s VT0, x1, VI3        // NOTE(review): x3/x4 are never computed in this path
322+	vfcmp.clt.s VINC8, x2, VI3
323+	vbitsel.v x1, x1, x3, VT0       // BUG? selects stale x3/x4 — every parallel path has "vfmul.s x3, VI4, x1" / "vfmul.s x4, VI4, x2" first; confirm and add them
324+	vbitsel.v x2, x2, x4, VINC8
325+	vfadd.s VM0, x1, x2             // intended: VM0 = |re| + |im| per lane
243326#endif
244327 .align 3
245328
@@ -300,8 +383,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
300383 vinsgr2vr.w x2, t2, 2
301384 vinsgr2vr.w x1, t3, 3
302385 vinsgr2vr.w x2, t4, 3
303- vpickev.w x1, VX1, VX0
304- vpickod.w x2, VX1, VX0
305386#endif
306387 addi.d I, I, -1
307388 VFMUL x3, VI4, x1
@@ -358,12 +439,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
358439#ifdef DOUBLE
359440 vfmina.d VM0, x1, x2
360441 vfcmp.ceq.d VT0, x1, VM0
442+ vbitsel.v VI0, VI2, VI1, VT0
361443#else
362444 fcmp.ceq.d $fcc0, $f15, $f10
363445 bceqz $fcc0, .L27
364446 vfcmp.clt.s VT0, VI2, VI0
365- #endif
366447 vbitsel.v VI0, VI0, VI2, VT0
448+ #endif
367449 .align 3
368450
369451.L27:
0 commit comments