@@ -35,327 +35,72 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3535
3636
3737#if defined(HAVE_KERNEL4x8_ASM )
/*
 * NOTE: this span is a diff hunk, not plain C.  Lines whose gutter number is
 * followed by '-' are the removed inline-assembly body of dgemv_kernel_4x8;
 * lines whose gutter number is followed by '+' are its replacement, written
 * with __vector_pair loads and vec_madd.  Lines carrying two fused numbers are
 * unchanged context shared by both versions.
 */
38- static void dgemv_kernel_4x8 (BLASLONG n , BLASLONG lda , double * ap , double * x , double * y , double alpha ) {
39-
38+ typedef __vector unsigned char vec_t ;
/*
 * dgemv_kernel_4x8 (replacement version): accumulates eight dot products of x
 * with eight columns of ap and adds each one, scaled by alpha, into y[0..7].
 *
 *   n     - number of elements taken from x and from each column.  The loop
 *           below advances four elements per pass and stops at i >= n/2.
 *           NOTE(review): this presumes n is a multiple of 4; other values are
 *           not handled here - confirm against callers.
 *   lda   - distance, in elements, between the starts of consecutive columns.
 *   ap    - start of column 0; columns 1..7 start at ap + k * lda.
 *   x     - input vector.
 *   y     - eight outputs, updated in place (+=).
 *   alpha - scalar multiplied into each dot product before it is added to y.
 *
 * FLOAT is presumably double in this file (the removed signature used double
 * for the same parameters and the accumulators are __vector double).
 */
39+ static void dgemv_kernel_4x8 (BLASLONG n , BLASLONG lda , FLOAT * ap , FLOAT * x , FLOAT * y , FLOAT alpha ) {
40+ BLASLONG i ;
4041 FLOAT * a0 , * a1 , * a2 , * a3 , * a4 , * a5 , * a6 , * a7 ;
41- BLASLONG off2 ;
42- BLASLONG tempR ;
43- __asm__(
44-
45- "sldi %[temp],%[off], 4 \n\t" // lda * sizeof (double) *2
46- "sldi %[off], %[off], 3 \n\t" // lda * sizeof (double)
47- "xxlxor 34,34,34 \n\t"
48- "xxlxor 35,34,34 \n\t"
49- "add %[a2], %[a0], %[temp] \n\t"
50- "add %[a1], %[a0], %[off] \n\t"
51- "xxlxor 4,34,34 \n\t"
52- "xxlxor 5,34,34 \n\t"
53- "xxlxor 6,34,34 \n\t"
54- "xxlxor 7,34,34 \n\t"
55- "add %[a3], %[a2], %[off] \n\t"
56- "add %[a4], %[a2], %[temp] \n\t"
57-
58- "xxlxor 8,34,34 \n\t"
59- "xxlxor 9,34,34 \n\t"
60- "add %[a5], %[a3], %[temp] \n\t"
61- "li %[off],0 \n\t"
62- "li %[off2],16 \n\t"
63-
64- "add %[a6], %[a4], %[temp] \n\t"
65- "add %[a7], %[a5], %[temp] \n\t"
66-
67-
68-
69-
70- "lxvp 32, 0(%[x]) \n\t"
71- "lxvp 36, 0(%[a0]) \n\t"
72- "lxvp 38, 0(%[a1]) \n\t"
73- "lxvp 40, 0(%[a2]) \n\t"
74- "lxvp 42, 0(%[a3]) \n\t"
75- "lxvp 44, 0(%[a4]) \n\t"
76- "lxvp 46, 0(%[a5]) \n\t"
77- "lxvp 48, 0(%[a6]) \n\t"
78- "lxvp 50, 0(%[a7]) \n\t"
79- #if defined(PREFETCH )
80- "li %[temp],896 \n\t"
81- #endif
82- "addic. %[n],%[n],-4 \n\t"
83-
84- "li %[off],32 \n\t"
85-
86-
87- "ble- two%= \n\t"
88-
89- //--------------------------------------------------
90- ".align 5 \n\t"
91- "one%=: \n\t"
92- "xvmaddadp 34,36,32 \n\t"
93- "xvmaddadp 35,38,32 \n\t"
94- "addi %[off2], %[off2],32 \n\t"
95- "xvmaddadp 4,40,32 \n\t"
96- "xvmaddadp 5,42,32 \n\t"
97- "xvmaddadp 6,44,32 \n\t"
98- "xvmaddadp 7,46,32 \n\t"
99- "xvmaddadp 8,48,32 \n\t"
100- "xvmaddadp 9,50,32 \n\t"
101-
102- "xvmaddadp 34,37,33 \n\t"
103- "xvmaddadp 35,39,33 \n\t"
104- "lxvp 36, 32(%[a0]) \n\t"
105- "lxvp 38, 32(%[a1]) \n\t"
106- "xvmaddadp 4,41,33 \n\t"
107- "xvmaddadp 5,43,33 \n\t"
108- "addi %[off], %[off],32 \n\t"
109- "lxvp 40, 32(%[a2]) \n\t"
110- "lxvp 42, 32(%[a3]) \n\t"
111- "xvmaddadp 6,45,33 \n\t"
112- "xvmaddadp 7,47,33 \n\t"
113- "lxvp 44, 32(%[a4]) \n\t"
114- "lxvp 46, 32(%[a5]) \n\t"
115- "xvmaddadp 8,49,33 \n\t"
116- "xvmaddadp 9,51,33 \n\t"
117-
118- "addic. %[n],%[n],-4 \n\t"
119- "lxvp 48, 32(%[a6]) \n\t"
120- "lxvp 50, 32(%[a7]) \n\t"
121- "lxvp 32, 32(%[x]) \n\t"
122- "ble- two%= \n\t"
123- "xvmaddadp 34,36,32 \n\t"
124- "xvmaddadp 35,38,32 \n\t"
125- "addi %[off2], %[off2],32 \n\t"
126- "xvmaddadp 4,40,32 \n\t"
127- "xvmaddadp 5,42,32 \n\t"
128- "xvmaddadp 6,44,32 \n\t"
129- "xvmaddadp 7,46,32 \n\t"
130- "xvmaddadp 8,48,32 \n\t"
131- "xvmaddadp 9,50,32 \n\t"
132-
133- "xvmaddadp 34,37,33 \n\t"
134- "xvmaddadp 35,39,33 \n\t"
135- "lxvp 36, 64(%[a0]) \n\t"
136- "lxvp 38, 64(%[a1]) \n\t"
137- "xvmaddadp 4,41,33 \n\t"
138- "xvmaddadp 5,43,33 \n\t"
139- "addi %[off], %[off],32 \n\t"
140- "lxvp 40, 64(%[a2]) \n\t"
141- "lxvp 42, 64(%[a3]) \n\t"
142- "xvmaddadp 6,45,33 \n\t"
143- "xvmaddadp 7,47,33 \n\t"
144- "lxvp 44, 64(%[a4]) \n\t"
145- "lxvp 46, 64(%[a5]) \n\t"
146- "xvmaddadp 8,49,33 \n\t"
147- "xvmaddadp 9,51,33 \n\t"
148-
149- "addic. %[n],%[n],-4 \n\t"
150- "lxvp 48, 64(%[a6]) \n\t"
151- "lxvp 50, 64(%[a7]) \n\t"
152- "lxvp 32, 64(%[x]) \n\t"
153- "ble- two%= \n\t"
154- "xvmaddadp 34,36,32 \n\t"
155- "xvmaddadp 35,38,32 \n\t"
156- #if defined(PREFETCH )
157- "addi %[temp],%[temp],128 \n\t"
158- #endif
159- "addi %[off2], %[off2],32 \n\t"
160- "xvmaddadp 4,40,32 \n\t"
161- "xvmaddadp 5,42,32 \n\t"
162- "xvmaddadp 6,44,32 \n\t"
163- "xvmaddadp 7,46,32 \n\t"
164- "xvmaddadp 8,48,32 \n\t"
165- "xvmaddadp 9,50,32 \n\t"
166- #if defined(PREFETCH )
167- "dcbt %[temp],%[a0] \n\t"
168- #endif
169-
170- "xvmaddadp 34,37,33 \n\t"
171- "xvmaddadp 35,39,33 \n\t"
172- "lxvp 36, 96(%[a0]) \n\t"
173- "lxvp 38, 96(%[a1]) \n\t"
174- "xvmaddadp 4,41,33 \n\t"
175- "xvmaddadp 5,43,33 \n\t"
176- #if defined(PREFETCH )
177- "dcbt %[temp],%[a1] \n\t"
178- #endif
179- "lxvp 40, 96(%[a2]) \n\t"
180- "lxvp 42, 96(%[a3]) \n\t"
181- "addi %[off], %[off],32 \n\t"
182- "xvmaddadp 6,45,33 \n\t"
183- "xvmaddadp 7,47,33 \n\t"
184- "lxvp 44, 96(%[a4]) \n\t"
185- "lxvp 46, 96(%[a5]) \n\t"
186- "xvmaddadp 8,49,33 \n\t"
187- "xvmaddadp 9,51,33 \n\t"
188- #if defined(PREFETCH )
189- "dcbt %[temp],%[a3] \n\t"
190- #endif
191- "lxvp 48, 96(%[a6]) \n\t"
192- "lxvp 50, 96(%[a7]) \n\t"
193- "lxvp 32, 96(%[x]) \n\t"
194-
195- "addic. %[n],%[n],-4 \n\t"
196- "ble- two%= \n\t"
197-
198- "addi %[off2], %[off2],32 \n\t"
199- #if defined(PREFETCH )
200- "dcbt %[temp],%[a2] \n\t"
201- #endif
202- "xvmaddadp 34,36,32 \n\t"
203- "xvmaddadp 35,38,32 \n\t"
204- "xvmaddadp 4,40,32 \n\t"
205- "xvmaddadp 5,42,32 \n\t"
206- #if defined(PREFETCH )
207- "dcbt %[temp],%[a4] \n\t"
208- #endif
209- "xvmaddadp 6,44,32 \n\t"
210- "xvmaddadp 7,46,32 \n\t"
211- "xvmaddadp 8,48,32 \n\t"
212- "xvmaddadp 9,50,32 \n\t"
213-
214- #if defined(PREFETCH )
215- "dcbt %[temp],%[a5] \n\t"
216- #endif
217- "xvmaddadp 34,37,33 \n\t"
218- "xvmaddadp 35,39,33 \n\t"
219- "lxvp 36, 128(%[a0]) \n\t"
220- "lxvp 38, 128(%[a1]) \n\t"
221- "xvmaddadp 4,41,33 \n\t"
222- "xvmaddadp 5,43,33 \n\t"
223- "addi %[off], %[off],32 \n\t"
224- "lxvp 40, 128(%[a2]) \n\t"
225- "lxvp 42, 128(%[a3]) \n\t"
226- #if defined(PREFETCH )
227- "dcbt %[temp],%[a6] \n\t"
228- #endif
229- "xvmaddadp 6,45,33 \n\t"
230- "xvmaddadp 7,47,33 \n\t"
231- "lxvp 44, 128(%[a4]) \n\t"
232- "lxvp 46, 128(%[a5]) \n\t"
233- "xvmaddadp 8,49,33 \n\t"
234- "xvmaddadp 9,51,33 \n\t"
235-
236- #if defined(PREFETCH )
237- "dcbt %[temp],%[a7] \n\t"
238- #endif
239- "addic. %[n],%[n],-4 \n\t"
240- "lxvp 48, 128(%[a6]) \n\t"
241- "lxvp 50, 128(%[a7]) \n\t"
242- "lxvp 32, 128(%[x]) \n\t"
243- #if defined(PREFETCH )
244- "dcbt %[temp],%[x] \n\t"
245- #endif
246- "addi %[a0], %[a0], 128 \n\t"
247- "addi %[a1], %[a1], 128 \n\t"
248- "addi %[a2], %[a2], 128 \n\t"
249- "addi %[a3], %[a3], 128 \n\t"
250- "addi %[a4], %[a4], 128 \n\t"
251- "addi %[a5], %[a5], 128 \n\t"
252- "addi %[a6], %[a6], 128 \n\t"
253- "addi %[a7], %[a7], 128 \n\t"
254- "addi %[x], %[x], 128 \n\t"
255- "bgt+ one%= \n\t"
256- ".align 5 \n\t"
257- "two%=: \n\t"
258- //--------------------------------------------
259-
260- "xvmaddadp 34,36,32 \n\t"
261- "xvmaddadp 35,38,32 \n\t"
262- "xvmaddadp 4,40,32 \n\t"
263- "xvmaddadp 5,42,32 \n\t"
264- "xvmaddadp 6,44,32 \n\t"
265- "xvmaddadp 7,46,32 \n\t"
266- "xvmaddadp 8,48,32 \n\t"
267- "xvmaddadp 9,50,32 \n\t"
268- XXSPLTD_S (36 ,%x [alpha ],0 )
269- "xvmaddadp 34,37,33 \n\t"
270- "xvmaddadp 35,39,33 \n\t"
271- "xvmaddadp 4,41,33 \n\t"
272- "xvmaddadp 5,43,33 \n\t"
273- "xvmaddadp 6,45,33 \n\t"
274- "xvmaddadp 7,47,33 \n\t"
275- "xvmaddadp 8,49,33 \n\t"
276- "xvmaddadp 9,51,33 \n\t"
277-
278- "lxvp 38, 0(%[y]) \n\t"
279- "lxvp 40, 32(%[y]) \n\t"
280-
281-
282- #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ )
283- XXMRGHD_S (42 ,34 ,35 )
284- XXMRGLD_S (43 ,34 ,35 )
285-
286- XXMRGHD_S (44 ,4 ,5 )
287- XXMRGLD_S (45 ,4 ,5 )
288- #else
289- XXMRGLD_S (42 ,35 ,34 )
290- XXMRGHD_S (43 ,35 ,34 )
291-
292- XXMRGLD_S (44 ,5 ,4 )
293- XXMRGHD_S (45 ,5 ,4 )
294- #endif
295-
296- "xvadddp 42,42,43 \n\t"
297-
298- #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ )
299- XXMRGHD_S (46 ,6 ,7 )
300- XXMRGLD_S (47 ,6 ,7 )
301- #else
302- XXMRGLD_S (46 ,7 ,6 )
303- XXMRGHD_S (47 ,7 ,6 )
304- #endif
305- "xvadddp 44,44,45 \n\t"
306-
307- #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ )
308- XXMRGHD_S (48 ,8 ,9 )
309- XXMRGLD_S (49 ,8 ,9 )
310- #else
311- XXMRGLD_S (48 ,9 ,8 )
312- XXMRGHD_S (49 ,9 ,8 )
313- #endif
314- "xvadddp 46,46,47 \n\t"
315- #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ )
316- "xvmaddadp 38,42,36 \n\t"
317- "xvmaddadp 39,44,36 \n\t"
318- #else
319- "xvmaddadp 39,42,36 \n\t"
320- "xvmaddadp 38,44,36 \n\t"
321- #endif
322- "xvadddp 48,48,49 \n\t"
323- #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ )
324- "xvmaddadp 41,48,36 \n\t"
325- #else
326- "xvmaddadp 41,46,36 \n\t"
327- #endif
328- "stxvp 38, 0(%[y]) \n\t"
329- #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ )
330- "xvmaddadp 40,46,36 \n\t"
331- #else
332- "xvmaddadp 40,48,36 \n\t"
333- #endif
334- "stxvp 40, 32(%[y]) \n\t"
335-
336- : [memy ] "+ m " (*(double (*)[8])y),
337- [n ] "+ & r " (n),
338- [a0 ] "= b " (a0),
339- [a1 ] "= & b " (a1),
340- [a2 ] "= & b " (a2),
341- [a3 ] "= & b " (a3),
342- [a4 ] "= & b " (a4),
343- [a5 ] "= & b " (a5),
344- [a6 ] "= & b " (a6),
345- [a7 ] "= & b " (a7),
346- [off ] "+ & b " (lda),
347- [off2 ]"= & b " (off2),
348- [temp ] "= & b " (tempR)
349- : [memx ] "m" (* (const double (* )[n ])x ),
350- [mem_ap ] "m" (* (const double (* )[n * 8 ]) ap ),
351- [alpha ] "d" (alpha ),
352- "[a0]" (ap ),
353- [x ] "b" (x ),
354- [y ] "b" (y )
355- : "cc" ,"vs4" ,"vs5" ,"vs6" ,"vs7" ,"vs8" ,"vs9" ,"vs32" ,"vs33" ,"vs34" ,"vs35" , "vs36" , "vs37" , "vs38" , "vs39" ,
356- "vs40" , "vs41" , "vs42" , "vs43" , "vs44" , "vs45" , "vs46" , "vs47" , "vs48" , "vs49" , "vs50" , "vs51"
357- );
358- return ;
/* vx / vp hold the pair most recently loaded from x and from one column;
 * res / res1 receive the two halves of vx / vp after disassembly. */
42+ __vector_pair vx , vp ;
43+ vec_t res [2 ],res1 [2 ];
/* One two-lane accumulator per column, all starting at zero. */
44+ register __vector double temp0 = {0 , 0 };
45+ register __vector double temp1 = {0 , 0 };
46+ register __vector double temp2 = {0 , 0 };
47+ register __vector double temp3 = {0 , 0 };
48+ register __vector double temp4 = {0 , 0 };
49+ register __vector double temp5 = {0 , 0 };
50+ register __vector double temp6 = {0 , 0 };
51+ register __vector double temp7 = {0 , 0 };
/* Column base pointers: each one is lda elements past the previous. */
52+ a0 = ap ;
53+ a1 = ap + lda ;
54+ a2 = a1 + lda ;
55+ a3 = a2 + lda ;
56+ a4 = a3 + lda ;
57+ a5 = a4 + lda ;
58+ a6 = a5 + lda ;
59+ a7 = a6 + lda ;
/* i steps by 2 and is doubled when indexing, so successive passes start at
 * element offsets 0, 4, 8, ...; the loop ends once i reaches n/2. */
60+ for (i = 0 ; i < n /2 ; i += 2 ) {
61+ vp = * ((__vector_pair * )((void * )& a0 [i * 2 ]));
62+ vx = * ((__vector_pair * )((void * )& x [i * 2 ]));
/* The x pair is split once into res[0], res[1] and reused for every column;
 * only res1 is refreshed as each column's pair is loaded. */
63+ __builtin_vsx_disassemble_pair (res , & vx );
64+ __builtin_vsx_disassemble_pair (res1 , & vp );
65+ temp0 = vec_madd ((__vector double )res [0 ], (__vector double )res1 [0 ], temp0 );
66+ temp0 = vec_madd ((__vector double )res [1 ], (__vector double )res1 [1 ], temp0 );
67+ vp = * ((__vector_pair * )((void * )& a1 [i * 2 ]));
68+ __builtin_vsx_disassemble_pair (res1 , & vp );
69+ temp1 = vec_madd ((__vector double )res [0 ], (__vector double )res1 [0 ], temp1 );
70+ temp1 = vec_madd ((__vector double )res [1 ], (__vector double )res1 [1 ], temp1 );
71+ vp = * ((__vector_pair * )((void * )& a2 [i * 2 ]));
72+ __builtin_vsx_disassemble_pair (res1 , & vp );
73+ temp2 = vec_madd ((__vector double )res [0 ], (__vector double )res1 [0 ], temp2 );
74+ temp2 = vec_madd ((__vector double )res [1 ], (__vector double )res1 [1 ], temp2 );
75+ vp = * ((__vector_pair * )((void * )& a3 [i * 2 ]));
76+ __builtin_vsx_disassemble_pair (res1 , & vp );
77+ temp3 = vec_madd ((__vector double )res [0 ], (__vector double )res1 [0 ], temp3 );
78+ temp3 = vec_madd ((__vector double )res [1 ], (__vector double )res1 [1 ], temp3 );
79+ vp = * ((__vector_pair * )((void * )& a4 [i * 2 ]));
80+ __builtin_vsx_disassemble_pair (res1 , & vp );
81+ temp4 = vec_madd ((__vector double )res [0 ], (__vector double )res1 [0 ], temp4 );
82+ temp4 = vec_madd ((__vector double )res [1 ], (__vector double )res1 [1 ], temp4 );
83+ vp = * ((__vector_pair * )((void * )& a5 [i * 2 ]));
84+ __builtin_vsx_disassemble_pair (res1 , & vp );
85+ temp5 = vec_madd ((__vector double )res [0 ], (__vector double )res1 [0 ], temp5 );
86+ temp5 = vec_madd ((__vector double )res [1 ], (__vector double )res1 [1 ], temp5 );
87+ vp = * ((__vector_pair * )((void * )& a6 [i * 2 ]));
88+ __builtin_vsx_disassemble_pair (res1 , & vp );
89+ temp6 = vec_madd ((__vector double )res [0 ], (__vector double )res1 [0 ], temp6 );
90+ temp6 = vec_madd ((__vector double )res [1 ], (__vector double )res1 [1 ], temp6 );
91+ vp = * ((__vector_pair * )((void * )& a7 [i * 2 ]));
92+ __builtin_vsx_disassemble_pair (res1 , & vp );
93+ temp7 = vec_madd ((__vector double )res [0 ], (__vector double )res1 [0 ], temp7 );
94+ temp7 = vec_madd ((__vector double )res [1 ], (__vector double )res1 [1 ], temp7 );
95+ }
/* Reduce each accumulator by summing its two lanes, scale by alpha, and add
 * the result to the matching element of y. */
96+ y [0 ] += alpha * (temp0 [0 ] + temp0 [1 ]);
97+ y [1 ] += alpha * (temp1 [0 ] + temp1 [1 ]);
98+ y [2 ] += alpha * (temp2 [0 ] + temp2 [1 ]);
99+ y [3 ] += alpha * (temp3 [0 ] + temp3 [1 ]);
100+ y [4 ] += alpha * (temp4 [0 ] + temp4 [1 ]);
101+ y [5 ] += alpha * (temp5 [0 ] + temp5 [1 ]);
102+ y [6 ] += alpha * (temp6 [0 ] + temp6 [1 ]);
103+ y [7 ] += alpha * (temp7 [0 ] + temp7 [1 ]);
359104}
360105#else
361106static void dgemv_kernel_4x8 (BLASLONG n , BLASLONG lda , FLOAT * ap , FLOAT * x , FLOAT * y , FLOAT alpha ) {
0 commit comments