@@ -47,6 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4747#define VX4 $xr21
4848#define res1 $xr19
4949#define res2 $xr20
50+ #define RCP $f2
51+ #define VALPHA $xr3
5052
5153 PROLOGUE
5254
@@ -55,10 +57,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5557 LDINT INCX, 0 (INCX)
5658#endif
5759
/*
 * Entry / scale setup.
 *
 * Both early exits below branch to .L999, which computes
 * sqrt(res) * $f0.  The accumulators and $f0 must therefore hold zero
 * before those branches are taken, otherwise N <= 0 or INCX == 0 returns
 * whatever happened to be in the registers.
 * NOTE(review): this relies on `res` being lane 0 of res1 ($f19); the
 * compare against $f19 below relies on the same aliasing - confirm against
 * the register #defines above this hunk.
 */
    xvxor.v     res1, res1, res1
    xvxor.v     res2, res2, res2
    movgr2fr.w  $f0, $r0                /* $f0 = +0.0f for the early exits */
    bge         $r0, N, .L999           /* N <= 0    -> return 0 */
    beq         $r0, INCX, .L999        /* INCX == 0 -> return 0 */

    /* The helper is an ordinary call: keep $ra and our arguments on the
       stack across it (32 bytes keeps $sp 16-byte aligned). */
    addi.d      $sp, $sp, -32
    st.d        $ra, $sp, 0
    st.d        N, $sp, 8
    st.d        X, $sp, 16
    st.d        INCX, $sp, 24
#ifdef DYNAMIC_ARCH
    bl          camax_k_LA264
#else
    bl          camax_k
#endif
    ld.d        $ra, $sp, 0
    ld.d        N, $sp, 8
    ld.d        X, $sp, 16
    ld.d        INCX, $sp, 24
    addi.d      $sp, $sp, 32

    /* The callee may have used the vector registers: clear the
       accumulators again before the main loops. */
    xvxor.v     res1, res1, res1
    xvxor.v     res2, res2, res2

    /* $f0 now holds the helper's result (the scale).  If it is zero, skip
       everything; .L999 then yields sqrt(0) * 0 = 0.  Testing before the
       reciprocal avoids computing 1/0. */
    fcmp.ceq.s  $fcc0, $f0, $f19
    bcnez       $fcc0, .L999

    /* RCP = 1 / scale, broadcast to all eight single-precision lanes of
       VALPHA: first across the low 128 bits, then the low doubleword
       across the full 256-bit register. */
    frecip.s    RCP, $f0
    vreplvei.w  $vr3, $vr2, 0
    xvpermi.d   VALPHA, $xr3, 0x00
86+
6287 li.d TEMP, SIZE
6388 slli.d INCX, INCX, ZBASE_SHIFT
6489 srai.d I, N, 2
@@ -67,13 +92,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
6792 .align 3
6893
.L10:
    /*
     * Contiguous main loop.  I was initialised to N >> 2, i.e. one count
     * per four complex elements = eight floats = exactly one 256-bit load.
     * Consuming 16 floats per count (two loads, X += 16 * SIZE) would walk
     * twice as far as the data extends, so each pass handles a single
     * vector and advances by 8 * SIZE, matching the strided loop.
     */
    xvld        VX0, X, 0 * SIZE
    addi.d      I, I, -1
    addi.d      X, X, 8 * SIZE
    xvfmul.s    VX0, VX0, VALPHA        /* scale by 1/max before squaring */
    xvfmadd.s   res1, VX0, VX0, res1    /* res1 += VX0 * VX0, lane-wise */
    blt         $r0, I, .L10
78106 .align 3
79107 b .L996
@@ -103,22 +131,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
103131 xvinsgr2vr.w VX0, t3, 6
104132 xvinsgr2vr.w VX0, t4, 7
105133 add .d X, X, INCX
106- xvfcvtl.d.s VX1, VX0
107- xvfcvth.d.s VX2, VX0
108- xvfmadd.d res1, VX1, VX1, res1
109- xvfmadd.d res2, VX2, VX2, res2
134+ xvfmul.s VX0, VX0, VALPHA
135+ xvfmadd.s res2, VX0, VX0, res2
110136 addi.d I, I, -1
111137 blt $r0, I, .L21
112138 b .L996
113139
.L996:
    /*
     * Horizontal reduction of the eight single-precision partial sums.
     * Step 1: merge the two accumulators.
     * Step 2: add the upper 128 bits onto the lower 128 bits (0x4e swaps
     *         the two 128-bit halves).
     */
    xvfadd.s    res1, res1, res2
    xvpermi.d   VX1, res1, 0x4e
    xvfadd.s    res1, res1, VX1

    /*
     * Step 3: broadcast lanes 3, 2 and 1 of the folded sum into scratch
     * registers.  All three must be captured before any of the adds below,
     * because those adds overwrite res1.
     * NOTE(review): $vr17/$vr18/$vr21 are used here as the low halves of
     * VX2/VX3/VX4; only VX4 = $xr21 is visible in this view - confirm the
     * VX2/VX3 definitions.
     */
    vreplvei.w  $vr21, $vr19, 3
    vreplvei.w  $vr18, $vr19, 2
    vreplvei.w  $vr17, $vr19, 1

    /* Step 4: lane 0 of res1 ends up holding the full sum. */
    xvfadd.s    res1, VX2, res1
    xvfadd.s    res1, VX3, res1
    xvfadd.s    res1, VX4, res1
122150 .align 3
123151
124152.L997:
@@ -130,18 +158,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
130158 fld .s a1, X, 0 * SIZE
131159 fld .s a2, X, 1 * SIZE
132160 addi.d I, I, -1
133- fcvt.d. s a1, a1
134- fcvt.d. s a2, a2
135- fmadd.d res, a1, a1, res
136- fmadd.d res, a2, a2, res
161+ fmul . s a1, a1, RCP
162+ fmul . s a2, a2, RCP
163+ fmadd.s res, a1, a1, res
164+ fmadd.s res, a2, a2, res
137165 add .d X, X, INCX
138166 blt $r0, I, .L998
139167 .align 3
140168
.L999:
    /*
     * Common exit: result = sqrt(res) * $f0.
     * $f0 is expected to still hold the scale obtained at entry; the
     * elements were multiplied by RCP before being squared, so the scale
     * is multiplied back in here.
     */
    move        $r4, $r17               /* NOTE(review): purpose not evident from this view */
    fsqrt.s     res, res
    fmul.s      $f0, res, $f0           /* float result is returned in $f0 */
    jirl        $r0, $r1, 0x0           /* return to caller */
146174
147175 EPILOGUE
0 commit comments