@@ -43,45 +43,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4343#define t2 $r13
4444#define t3 $r14
4545#define t4 $r15
46-
47- /* Don't change following FR unless you know the effects. */
4846#define VX0 $xr15
4947#define VX1 $xr16
5048#define VX2 $xr17
5149#define VX3 $xr18
5250#define VX4 $xr21
51+ #define VX5 $xr22
52+ /* Don't change following FR unless you know the effects. */
5353#define res1 $xr19
5454#define res2 $xr20
55+ #define RCP $f2
56+ #define VALPHA $xr3
57+
58+ // The optimization for snrm2 cannot simply involve
59+ // extending the data type from float to double and
60+ // then summing the squares of the data. LAPACK tests
61+ // have shown that this approach can still lead to data overflow.
62+ // Instead, we need to find the maximum absolute value in the entire
63+ // array and divide each data element by this maximum value before
64+ // performing the calculation. This approach can avoid overflow (and does not require extending the data type).
5565
5666 PROLOGUE
5767
5868#ifdef F_INTERFACE
5969	 LDINT	N, 0(N)
6070	 LDINT	INCX, 0(INCX)
6171#endif
72+ bge $r0, N, .L999
73+ beq $r0, INCX, .L999
6274
75+ addi.d $sp, $sp, -32
76+ st.d $ra, $sp, 0
77+ st.d N, $sp, 8
78+ st.d X, $sp, 16
79+ st.d INCX, $sp, 24
80+ #ifdef DYNAMIC_ARCH
81+ bl samax_k_LA264
82+ #else
83+ bl samax_k
84+ #endif
85+ ld.d $ra, $sp, 0
86+ ld.d N, $sp, 8
87+ ld.d X, $sp, 16
88+ ld.d INCX, $sp, 24
89+ addi.d $sp, $sp, 32
90+
91+ frecip.s RCP, $f0
92+ vreplvei.w $vr3, $vr2, 0
93+ xvpermi.d VALPHA, $xr3,0x00
6394 xvxor.v res1, res1, res1
6495 xvxor.v res2, res2, res2
65- bge $r0, N, .L999
66-	 beq $r0, INCX, .L999
96+ fcmp.ceq.s $fcc0, $f0, $f19
97+	 bcnez $fcc0, .L999
6798 li.d TEMP, SIZE
6899 slli.d INCX, INCX, BASE_SHIFT
69- srai.d I, N, 3
100+ srai.d I, N, 4
70101 bne INCX, TEMP, .L20
71- bge $r0, I, .L997
102+ bge $r0, I, .L997
72103 .align 3
73104
74105.L10:
75- xvld VX0, X, 0
76- xvfcvtl.d.s VX1, VX0
77- xvfcvth.d.s VX2, VX0
78- xvfmadd.d res1, VX1, VX1, res1
79- xvfmadd.d res2, VX2, VX2, res2
106+ xvld VX0, X, 0
107+ xvld VX5, X, 8 * SIZE
80108 addi.d I, I, -1
81- addi.d X, X, 8 * SIZE
109+ addi.d X, X, 16 * SIZE
110+
111+ xvfmul.s VX0, VX0, VALPHA
112+ xvfmul.s VX5, VX5, VALPHA
113+
114+ xvfmadd.s res1, VX0, VX0, res1
115+ xvfmadd.s res2, VX5, VX5, res2
82116 blt $r0, I, .L10
83- .align 3
84117 b .L996
118+ .align 3
85119
86120.L20:
87121 bge $r0, I, .L997
@@ -107,47 +141,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
107141 ld.w t3, X, 0
108142	 add.d X, X, INCX
109143	 ld.w t4, X, 0
144+	 add.d X, X, INCX
110145 xvinsgr2vr.w VX0, t1, 4
111146 xvinsgr2vr.w VX0, t2, 5
112147 xvinsgr2vr.w VX0, t3, 6
113148 xvinsgr2vr.w VX0, t4, 7
149+ xvfmul.s VX0, VX0, VALPHA
150+ xvfmadd.s res1, VX0, VX0, res1
151+
152+	 ld.w t1, X, 0
153+	 add.d X, X, INCX
154+	 ld.w t2, X, 0
114155	 add.d X, X, INCX
115-	 xvfcvtl.d.s VX1, VX0
116-	 xvfcvth.d.s VX2, VX0
117-	 xvfmadd.d res1, VX1, VX1, res1
118-	 xvfmadd.d res2, VX2, VX2, res2
156+	 ld.w t3, X, 0
157+	 add.d X, X, INCX
158+	 ld.w t4, X, 0
159+	 add.d X, X, INCX
160+	 xvinsgr2vr.w VX0, t1, 0
161+	 xvinsgr2vr.w VX0, t2, 1
162+	 xvinsgr2vr.w VX0, t3, 2
163+	 xvinsgr2vr.w VX0, t4, 3
164+	 ld.w t1, X, 0
165+	 add.d X, X, INCX
166+	 ld.w t2, X, 0
167+	 add.d X, X, INCX
168+	 ld.w t3, X, 0
169+	 add.d X, X, INCX
170+	 ld.w t4, X, 0
171+	 add.d X, X, INCX
172+ xvinsgr2vr.w VX0, t1, 4
173+ xvinsgr2vr.w VX0, t2, 5
174+ xvinsgr2vr.w VX0, t3, 6
175+ xvinsgr2vr.w VX0, t4, 7
176+ xvfmul.s VX0, VX0, VALPHA
177+ xvfmadd.s res2, VX0, VX0, res2
119178 addi.d I, I, -1
120179 blt $r0, I, .L21
121- b .L996
180+ .align 3
122181
123182.L996:
124- xvfadd.d res1, res1, res2
125- xvpickve.d VX1, res1, 1
126- xvpickve.d VX2, res1, 2
127- xvpickve.d VX3, res1, 3
128-	 fadd.d $f19, $f19, $f16
129-	 fadd.d $f19, $f19, $f17
130-	 fadd.d $f19, $f19, $f18
183+ xvfadd.s res1, res1, res2
184+ xvpermi.d VX1, res1, 0x4e
185+ xvfadd.s res1, res1, VX1
186+ vreplvei.w $vr16, $vr19, 1
187+ vreplvei.w $vr17, $vr19, 2
188+ vreplvei.w $vr18, $vr19, 3
189+ xvfadd.s res1, VX1, res1
190+ xvfadd.s res1, VX2, res1
191+ xvfadd.s res1, VX3, res1
131192 .align 3
132193
133194.L997:
134- andi I, N, 7
195+ andi I, N, 15
135196 bge $r0, I, .L999
136197 .align 3
137198
138199.L998:
139200	 fld.s $f15, X, 0
140-	 add.d X, X, INCX
141-	 addi.d I, I, -1
142-	 fcvt.d.s $f15, $f15
143-	 fmadd.d $f19, $f15, $f15, $f19
201+	 addi.d I, I, -1
202+	 fmul.s $f15, $f15, RCP
203+	 fmadd.s $f19, $f15, $f15, $f19
204+	 add.d X, X, INCX
144205 blt $r0, I, .L998
145206 .align 3
146207
147208.L999:
148-	 fsqrt.d $f19, $f19
209+	 fsqrt.s $f19, $f19
210+	 fmul.s $f0, $f19, $f0
149211 move $r4, $r17
150- fcvt.s.d $f0, $f19
151212 jirl $r0, $r1, 0x0
213+ .align 3
152214
153215 EPILOGUE
0 commit comments