@@ -39,10 +39,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
3939 FLOAT x0 , x1 , x2 , x3 , y0 , y1 , y2 , y3 ;
4040 v4f32 vx0 , vx1 , vx2 , vx3 , vx4 , vx5 , vx6 , vx7 ;
4141 v4f32 vy0 , vy1 , vy2 , vy3 , vy4 , vy5 , vy6 , vy7 ;
42+ #if defined(DSDOT )
43+ v2f64 dvx0 , dvx1 , dvx2 , dvx3 , dvx4 , dvx5 , dvx6 , dvx7 ;
44+ v2f64 dvy0 , dvy1 , dvy2 , dvy3 , dvy4 , dvy5 , dvy6 , dvy7 ;
45+ v2f64 dot0 = {0 , 0 };
46+ v2f64 dot1 = {0 , 0 };
47+ v2f64 dot2 = {0 , 0 };
48+ v2f64 dot3 = {0 , 0 };
49+ #else
4250 v4f32 dot0 = {0 , 0 , 0 , 0 };
4351 v4f32 dot1 = {0 , 0 , 0 , 0 };
4452 v4f32 dot2 = {0 , 0 , 0 , 0 };
4553 v4f32 dot3 = {0 , 0 , 0 , 0 };
54+ #endif
4655
4756 if (n < 1 ) return (dot );
4857
@@ -83,6 +92,61 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
8392 x_pref += 32 ;
8493 y_pref += 32 ;
8594
95+ #if defined(DSDOT )
96+ /* Extend single precision to double precision */
97+ dvy0 = __msa_fexupr_d (vy0 );
98+ dvy1 = __msa_fexupr_d (vy1 );
99+ dvy2 = __msa_fexupr_d (vy2 );
100+ dvy3 = __msa_fexupr_d (vy3 );
101+ dvy4 = __msa_fexupr_d (vy4 );
102+ dvy5 = __msa_fexupr_d (vy5 );
103+ dvy6 = __msa_fexupr_d (vy6 );
104+ dvy7 = __msa_fexupr_d (vy7 );
105+
106+ vy0 = (v4f32 )__msa_fexupl_d (vy0 );
107+ vy1 = (v4f32 )__msa_fexupl_d (vy1 );
108+ vy2 = (v4f32 )__msa_fexupl_d (vy2 );
109+ vy3 = (v4f32 )__msa_fexupl_d (vy3 );
110+ vy4 = (v4f32 )__msa_fexupl_d (vy4 );
111+ vy5 = (v4f32 )__msa_fexupl_d (vy5 );
112+ vy6 = (v4f32 )__msa_fexupl_d (vy6 );
113+ vy7 = (v4f32 )__msa_fexupl_d (vy7 );
114+
115+ dvx0 = __msa_fexupr_d (vx0 );
116+ dvx1 = __msa_fexupr_d (vx1 );
117+ dvx2 = __msa_fexupr_d (vx2 );
118+ dvx3 = __msa_fexupr_d (vx3 );
119+ dvx4 = __msa_fexupr_d (vx4 );
120+ dvx5 = __msa_fexupr_d (vx5 );
121+ dvx6 = __msa_fexupr_d (vx6 );
122+ dvx7 = __msa_fexupr_d (vx7 );
123+
124+ vx0 = (v4f32 )__msa_fexupl_d (vx0 );
125+ vx1 = (v4f32 )__msa_fexupl_d (vx1 );
126+ vx2 = (v4f32 )__msa_fexupl_d (vx2 );
127+ vx3 = (v4f32 )__msa_fexupl_d (vx3 );
128+ vx4 = (v4f32 )__msa_fexupl_d (vx4 );
129+ vx5 = (v4f32 )__msa_fexupl_d (vx5 );
130+ vx6 = (v4f32 )__msa_fexupl_d (vx6 );
131+ vx7 = (v4f32 )__msa_fexupl_d (vx7 );
132+
133+ dot0 += (dvy0 * dvx0 );
134+ dot1 += (dvy1 * dvx1 );
135+ dot2 += (dvy2 * dvx2 );
136+ dot3 += (dvy3 * dvx3 );
137+ dot0 += (dvy4 * dvx4 );
138+ dot1 += (dvy5 * dvx5 );
139+ dot2 += (dvy6 * dvx6 );
140+ dot3 += (dvy7 * dvx7 );
141+ dot0 += ((v2f64 )vy0 * (v2f64 )vx0 );
142+ dot1 += ((v2f64 )vy1 * (v2f64 )vx1 );
143+ dot2 += ((v2f64 )vy2 * (v2f64 )vx2 );
144+ dot3 += ((v2f64 )vy3 * (v2f64 )vx3 );
145+ dot0 += ((v2f64 )vy4 * (v2f64 )vx4 );
146+ dot1 += ((v2f64 )vy5 * (v2f64 )vx5 );
147+ dot2 += ((v2f64 )vy6 * (v2f64 )vx6 );
148+ dot3 += ((v2f64 )vy7 * (v2f64 )vx7 );
149+ #else
86150 dot0 += (vy0 * vx0 );
87151 dot1 += (vy1 * vx1 );
88152 dot2 += (vy2 * vx2 );
@@ -91,6 +155,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
91155 dot1 += (vy5 * vx5 );
92156 dot2 += (vy6 * vx6 );
93157 dot3 += (vy7 * vx7 );
158+ #endif
94159 }
95160
96161 if (n & 31 )
@@ -100,53 +165,123 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
100165 LD_SP4_INC (x , 4 , vx0 , vx1 , vx2 , vx3 );
101166 LD_SP4_INC (y , 4 , vy0 , vy1 , vy2 , vy3 );
102167
168+ #if defined(DSDOT )
169+ dvy0 = __msa_fexupr_d (vy0 );
170+ dvy1 = __msa_fexupr_d (vy1 );
171+ dvy2 = __msa_fexupr_d (vy2 );
172+ dvy3 = __msa_fexupr_d (vy3 );
173+
174+ vy0 = (v4f32 )__msa_fexupl_d (vy0 );
175+ vy1 = (v4f32 )__msa_fexupl_d (vy1 );
176+ vy2 = (v4f32 )__msa_fexupl_d (vy2 );
177+ vy3 = (v4f32 )__msa_fexupl_d (vy3 );
178+
179+ dvx0 = __msa_fexupr_d (vx0 );
180+ dvx1 = __msa_fexupr_d (vx1 );
181+ dvx2 = __msa_fexupr_d (vx2 );
182+ dvx3 = __msa_fexupr_d (vx3 );
183+
184+ vx0 = (v4f32 )__msa_fexupl_d (vx0 );
185+ vx1 = (v4f32 )__msa_fexupl_d (vx1 );
186+ vx2 = (v4f32 )__msa_fexupl_d (vx2 );
187+ vx3 = (v4f32 )__msa_fexupl_d (vx3 );
188+
189+ dot0 += (dvy0 * dvx0 );
190+ dot1 += (dvy1 * dvx1 );
191+ dot2 += (dvy2 * dvx2 );
192+ dot3 += (dvy3 * dvx3 );
193+ dot0 += ((v2f64 )vy0 * (v2f64 )vx0 );
194+ dot1 += ((v2f64 )vy1 * (v2f64 )vx1 );
195+ dot2 += ((v2f64 )vy2 * (v2f64 )vx2 );
196+ dot3 += ((v2f64 )vy3 * (v2f64 )vx3 );
197+ #else
103198 dot0 += (vy0 * vx0 );
104199 dot1 += (vy1 * vx1 );
105200 dot2 += (vy2 * vx2 );
106201 dot3 += (vy3 * vx3 );
202+ #endif
107203 }
108204
109205 if (n & 8 )
110206 {
111207 LD_SP2_INC (x , 4 , vx0 , vx1 );
112208 LD_SP2_INC (y , 4 , vy0 , vy1 );
113209
210+ #if defined(DSDOT )
211+ dvy0 = __msa_fexupr_d (vy0 );
212+ dvy1 = __msa_fexupr_d (vy1 );
213+
214+ vy0 = (v4f32 )__msa_fexupl_d (vy0 );
215+ vy1 = (v4f32 )__msa_fexupl_d (vy1 );
216+
217+ dvx0 = __msa_fexupr_d (vx0 );
218+ dvx1 = __msa_fexupr_d (vx1 );
219+
220+ vx0 = (v4f32 )__msa_fexupl_d (vx0 );
221+ vx1 = (v4f32 )__msa_fexupl_d (vx1 );
222+
223+ dot0 += (dvy0 * dvx0 );
224+ dot1 += (dvy1 * dvx1 );
225+ dot0 += ((v2f64 )vy0 * (v2f64 )vx0 );
226+ dot1 += ((v2f64 )vy1 * (v2f64 )vx1 );
227+ #else
114228 dot0 += (vy0 * vx0 );
115229 dot1 += (vy1 * vx1 );
230+ #endif
116231 }
117232
118233 if (n & 4 )
119234 {
120235 vx0 = LD_SP (x ); x += 4 ;
121236 vy0 = LD_SP (y ); y += 4 ;
122237
238+ #if defined(DSDOT )
239+ dvy0 = __msa_fexupr_d (vy0 );
240+ vy0 = (v4f32 )__msa_fexupl_d (vy0 );
241+ dvx0 = __msa_fexupr_d (vx0 );
242+ vx0 = (v4f32 )__msa_fexupl_d (vx0 );
243+ dot0 += (dvy0 * dvx0 );
244+ dot0 += ((v2f64 )vy0 * (v2f64 )vx0 );
245+ #else
123246 dot0 += (vy0 * vx0 );
247+ #endif
124248 }
125249
126250 if (n & 2 )
127251 {
128252 LD_GP2_INC (x , 1 , x0 , x1 );
129253 LD_GP2_INC (y , 1 , y0 , y1 );
130254
255+ #if defined(DSDOT )
256+ dot += ((double )y0 * (double )x0 );
257+ dot += ((double )y1 * (double )x1 );
258+ #else
131259 dot += (y0 * x0 );
132260 dot += (y1 * x1 );
261+ #endif
133262 }
134263
135264 if (n & 1 )
136265 {
137266 x0 = * x ;
138267 y0 = * y ;
139268
269+ #if defined(DSDOT )
270+ dot += ((double )y0 * (double )x0 );
271+ #else
140272 dot += (y0 * x0 );
273+ #endif
141274 }
142275 }
143276
144277 dot0 += dot1 + dot2 + dot3 ;
145278
146279 dot += dot0 [0 ];
147280 dot += dot0 [1 ];
281+ #if !defined(DSDOT )
148282 dot += dot0 [2 ];
149283 dot += dot0 [3 ];
284+ #endif
150285 }
151286 else
152287 {
@@ -155,27 +290,43 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
155290 LD_GP4_INC (x , inc_x , x0 , x1 , x2 , x3 );
156291 LD_GP4_INC (y , inc_y , y0 , y1 , y2 , y3 );
157292
293+ #if defined(DSDOT )
294+ dot += ((double )y0 * (double )x0 );
295+ dot += ((double )y1 * (double )x1 );
296+ dot += ((double )y2 * (double )x2 );
297+ dot += ((double )y3 * (double )x3 );
298+ #else
158299 dot += (y0 * x0 );
159300 dot += (y1 * x1 );
160301 dot += (y2 * x2 );
161302 dot += (y3 * x3 );
303+ #endif
162304 }
163305
164306 if (n & 2 )
165307 {
166308 LD_GP2_INC (x , inc_x , x0 , x1 );
167309 LD_GP2_INC (y , inc_y , y0 , y1 );
168310
311+ #if defined(DSDOT )
312+ dot += ((double )y0 * (double )x0 );
313+ dot += ((double )y1 * (double )x1 );
314+ #else
169315 dot += (y0 * x0 );
170316 dot += (y1 * x1 );
317+ #endif
171318 }
172319
173320 if (n & 1 )
174321 {
175322 x0 = * x ;
176323 y0 = * y ;
177324
325+ #if defined(DSDOT )
326+ dot += ((double )y0 * (double )x0 );
327+ #else
178328 dot += (y0 * x0 );
329+ #endif
179330 }
180331 }
181332
0 commit comments