@@ -25,67 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
2525USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2626*****************************************************************************/
2727
28+ /*
29+ * Avoid contraction of floating point operations, specifically fused
30+ * multiply-add, because they can cause unexpected results in complex
31+ * multiplication.
32+ */
33+ #if defined(__GNUC__ ) && !defined(__clang__ )
34+ #pragma GCC optimize ("fp-contract=off")
35+ #endif
36+
37+ #if defined(__clang__ )
38+ #pragma clang fp contract(off)
39+ #endif
40+
2841#include "common.h"
42+ #include "vector-common.h"
2943
30- static void cscal_kernel_16 (BLASLONG n , FLOAT * alpha , FLOAT * x ) {
31- __asm__("vlrepf %%v0,0(%[alpha])\n\t"
32- "vlef %%v1,4(%[alpha]),0\n\t"
33- "vlef %%v1,4(%[alpha]),2\n\t"
34- "vflcsb %%v1,%%v1\n\t"
35- "vlef %%v1,4(%[alpha]),1\n\t"
36- "vlef %%v1,4(%[alpha]),3\n\t"
37- "srlg %[n],%[n],4\n\t"
38- "xgr %%r1,%%r1\n\t"
39- "0:\n\t"
40- "pfd 2, 1024(%%r1,%[x])\n\t"
41- "vl %%v16,0(%%r1,%[x])\n\t"
42- "vl %%v17,16(%%r1,%[x])\n\t"
43- "vl %%v18,32(%%r1,%[x])\n\t"
44- "vl %%v19,48(%%r1,%[x])\n\t"
45- "vl %%v20,64(%%r1,%[x])\n\t"
46- "vl %%v21,80(%%r1,%[x])\n\t"
47- "vl %%v22,96(%%r1,%[x])\n\t"
48- "vl %%v23,112(%%r1,%[x])\n\t"
49- "verllg %%v24,%%v16,32\n\t"
50- "verllg %%v25,%%v17,32\n\t"
51- "verllg %%v26,%%v18,32\n\t"
52- "verllg %%v27,%%v19,32\n\t"
53- "verllg %%v28,%%v20,32\n\t"
54- "verllg %%v29,%%v21,32\n\t"
55- "verllg %%v30,%%v22,32\n\t"
56- "verllg %%v31,%%v23,32\n\t"
57- "vfmsb %%v16,%%v16,%%v0\n\t"
58- "vfmsb %%v17,%%v17,%%v0\n\t"
59- "vfmsb %%v18,%%v18,%%v0\n\t"
60- "vfmsb %%v19,%%v19,%%v0\n\t"
61- "vfmsb %%v20,%%v20,%%v0\n\t"
62- "vfmsb %%v21,%%v21,%%v0\n\t"
63- "vfmsb %%v22,%%v22,%%v0\n\t"
64- "vfmsb %%v23,%%v23,%%v0\n\t"
65- "vfmasb %%v16,%%v24,%%v1,%%v16\n\t"
66- "vfmasb %%v17,%%v25,%%v1,%%v17\n\t"
67- "vfmasb %%v18,%%v26,%%v1,%%v18\n\t"
68- "vfmasb %%v19,%%v27,%%v1,%%v19\n\t"
69- "vfmasb %%v20,%%v28,%%v1,%%v20\n\t"
70- "vfmasb %%v21,%%v29,%%v1,%%v21\n\t"
71- "vfmasb %%v22,%%v30,%%v1,%%v22\n\t"
72- "vfmasb %%v23,%%v31,%%v1,%%v23\n\t"
73- "vst %%v16,0(%%r1,%[x])\n\t"
74- "vst %%v17,16(%%r1,%[x])\n\t"
75- "vst %%v18,32(%%r1,%[x])\n\t"
76- "vst %%v19,48(%%r1,%[x])\n\t"
77- "vst %%v20,64(%%r1,%[x])\n\t"
78- "vst %%v21,80(%%r1,%[x])\n\t"
79- "vst %%v22,96(%%r1,%[x])\n\t"
80- "vst %%v23,112(%%r1,%[x])\n\t"
81- "agfi %%r1,128\n\t"
82- "brctg %[n],0b"
83- : "+m" (* (FLOAT (* )[n * 2 ]) x ),[n ] "+&r" (n )
84- : [x ] "a" (x ), "m" (* (const FLOAT (* )[2 ]) alpha ),
85- [alpha ] "a" (alpha )
86- : "cc" , "r1" , "v0" , "v1" , "v16" , "v17" , "v18" , "v19" , "v20" , "v21" ,
87- "v22" , "v23" , "v24" , "v25" , "v26" , "v27" , "v28" , "v29" , "v30" ,
88- "v31" );
44+ static void cscal_kernel_16 (BLASLONG n , FLOAT da_r , FLOAT da_i , FLOAT * x ) {
45+ vector_float da_r_vec = vec_splats (da_r );
46+ vector_float da_i_vec = { - da_i , da_i , - da_i , da_i };
47+
48+ vector_float * x_vec_ptr = (vector_float * )x ;
49+
50+ #pragma GCC unroll 16
51+ for (size_t i = 0 ; i < n /2 ; i ++ ) {
52+ vector_float x_vec = vec_load_hinted (x + i * VLEN_FLOATS );
53+ vector_float x_swapped = {x_vec [1 ], x_vec [0 ], x_vec [3 ], x_vec [2 ]};
54+
55+ x_vec_ptr [i ] = x_vec * da_r_vec + x_swapped * da_i_vec ;
56+ }
8957}
9058
9159static void cscal_kernel_16_zero_r (BLASLONG n , FLOAT * alpha , FLOAT * x ) {
@@ -199,14 +167,12 @@ static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) {
199167 : "cc" , "r1" , "v0" );
200168}
201169
202- static void cscal_kernel_inc_8 (BLASLONG n , FLOAT * alpha , FLOAT * x ,
170+ static void cscal_kernel_inc_8 (BLASLONG n , FLOAT da_r , FLOAT da_i , FLOAT * x ,
203171 BLASLONG inc_x ) {
204172 BLASLONG i ;
205173 BLASLONG inc_x2 = 2 * inc_x ;
206174 BLASLONG inc_x3 = inc_x2 + inc_x ;
207175 FLOAT t0 , t1 , t2 , t3 ;
208- FLOAT da_r = alpha [0 ];
209- FLOAT da_i = alpha [1 ];
210176
211177 for (i = 0 ; i < n ; i += 4 ) {
212178 t0 = da_r * x [0 ] - da_i * x [1 ];
@@ -324,9 +290,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
324290
325291 BLASLONG n1 = n & -8 ;
326292 if (n1 > 0 ) {
327- alpha [0 ] = da_r ;
328- alpha [1 ] = da_i ;
329- cscal_kernel_inc_8 (n1 , alpha , x , inc_x );
293+ cscal_kernel_inc_8 (n1 , da_r , da_i , x , inc_x );
330294 j = n1 ;
331295 i = n1 * inc_x ;
332296 }
@@ -362,7 +326,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
362326 else if (da_i == 0 )
363327 cscal_kernel_16_zero_i (n1 , alpha , x );
364328 else
365- cscal_kernel_16 (n1 , alpha , x );
329+ cscal_kernel_16 (n1 , da_r , da_i , x );
366330
367331 i = n1 << 1 ;
368332 j = n1 ;
0 commit comments