|
1 | | -#define ASSEMBLER |
| 1 | +/*************************************************************************** |
| 2 | +Copyright (c) 2023, The OpenBLAS Project |
| 3 | +All rights reserved. |
| 4 | +Redistribution and use in source and binary forms, with or without |
| 5 | +modification, are permitted provided that the following conditions are |
| 6 | +met: |
| 7 | +1. Redistributions of source code must retain the above copyright |
| 8 | +notice, this list of conditions and the following disclaimer. |
| 9 | +2. Redistributions in binary form must reproduce the above copyright |
| 10 | +notice, this list of conditions and the following disclaimer in |
| 11 | +the documentation and/or other materials provided with the |
| 12 | +distribution. |
| 13 | +3. Neither the name of the OpenBLAS project nor the names of |
| 14 | +its contributors may be used to endorse or promote products |
| 15 | +derived from this software without specific prior written permission. |
| 16 | +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 17 | +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 18 | +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 19 | +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE |
| 20 | +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 21 | +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
| 22 | +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| 23 | +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| 24 | +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE |
| 25 | +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 26 | +*****************************************************************************/ |
2 | 27 |
|
| 28 | +#define ASSEMBLER |
3 | 29 | #include "common.h" |
| 30 | + |
4 | 31 | #define N $r4 |
5 | 32 | #define XX $r5 |
6 | 33 | #define YY $r6 |
|
35 | 62 | bge $r0, N, .L999 |
36 | 63 | li.d TEMP, 1 |
37 | 64 | movgr2fr.d a1, $r0 |
38 | | - ffint.d.l a1, a1 |
| 65 | + FFINT a1, a1 |
39 | 66 | movgr2fr.d a2, TEMP |
40 | | - ffint.d.l a2, a2 |
41 | | - fcmp.ceq.d $fcc0, ALPHA, a1 |
| 67 | + FFINT a2, a2 |
| 68 | + CMPEQ $fcc0, ALPHA, a1 |
42 | 69 | bcnez $fcc0, .L999 |
43 | 70 | slli.d TEMP, TEMP, BASE_SHIFT |
44 | 71 | slli.d INCX, INCX, BASE_SHIFT |
45 | 72 | slli.d INCY, INCY, BASE_SHIFT |
46 | | - movfr2gr.d t1, ALPHA |
| 73 | + MTG t1, ALPHA |
| 74 | +#ifdef DOUBLE |
47 | 75 | xvreplgr2vr.d VXA, t1 |
| 76 | +#else |
| 77 | + xvreplgr2vr.w VXA, t1 |
| 78 | +#endif |
48 | 79 |
|
49 | 80 | srai.d I, N, 3 |
50 | 81 | bne INCX, TEMP, .L20 |
|
56 | 87 |
|
57 | 88 | .L11: |
58 | 89 | bge $r0, I, .L113 |
59 | | - fcmp.ceq.d $fcc0, ALPHA, a2 |
| 90 | + CMPEQ $fcc0, ALPHA, a2 |
60 | 91 | bceqz $fcc0, .L112 |
61 | 92 | .align 3 |
62 | 93 |
|
63 | 94 | .L111: |
| 95 | +#ifdef DOUBLE |
64 | 96 | xvld VX0, X, 0 * SIZE |
65 | 97 | xvld VX2, Y, 0 * SIZE |
66 | 98 | xvld VX1, X, 4 * SIZE |
|
70 | 102 | addi.d I, I, -1 |
71 | 103 | xvst VX2, Y, 0 * SIZE |
72 | 104 | xvst VX3, Y, 4 * SIZE |
| 105 | +#else |
| 106 | + xvld VX0, X, 0 * SIZE |
| 107 | + xvld VX2, Y, 0 * SIZE |
| 108 | + addi.d I, I, -1 |
| 109 | + xvfadd.s VX2, VX0, VX2 |
| 110 | + xvst VX2, Y, 0 * SIZE |
| 111 | +#endif |
73 | 112 | addi.d X, X, 8 * SIZE |
74 | 113 | addi.d Y, Y, 8 * SIZE |
75 | 114 | blt $r0, I, .L111 |
76 | 115 | b .L113 |
77 | 116 | .align 3 |
78 | 117 |
|
79 | 118 | .L112: |
| 119 | +#ifdef DOUBLE |
80 | 120 | xvld VX0, X, 0 * SIZE |
81 | 121 | xvld VX2, Y, 0 * SIZE |
82 | 122 | xvld VX1, X, 4 * SIZE |
|
86 | 126 | addi.d I, I, -1 |
87 | 127 | xvst VX2, Y, 0 * SIZE |
88 | 128 | xvst VX3, Y, 4 * SIZE |
| 129 | +#else |
| 130 | + xvld VX0, X, 0 * SIZE |
| 131 | + xvld VX2, Y, 0 * SIZE |
| 132 | + addi.d I, I, -1 |
| 133 | + xvfmadd.s VX2, VX0, VXA, VX2 |
| 134 | + xvst VX2, Y, 0 * SIZE |
| 135 | +#endif |
89 | 136 | addi.d X, X, 8 * SIZE |
90 | 137 | addi.d Y, Y, 8 * SIZE |
91 | 138 | blt $r0, I, .L112 |
|
97 | 144 | .align 3 |
98 | 145 |
|
99 | 146 | .L114: |
100 | | - fld.d $f12, X, 0 * SIZE |
101 | | - fld.d $f14, Y, 0 * SIZE |
| 147 | + LD $f12, X, 0 * SIZE |
| 148 | + LD $f14, Y, 0 * SIZE |
102 | 149 | addi.d I, I, -1 |
103 | | - fmadd.d $f14, $f12, $f0, $f14 |
104 | | - fst.d $f14, Y, 0 * SIZE |
| 150 | + MADD $f14, $f12, $f0, $f14 |
| 151 | + ST $f14, Y, 0 * SIZE |
105 | 152 | addi.d X, X, SIZE |
106 | 153 | addi.d Y, Y, SIZE |
107 | 154 | blt $r0, I, .L114 |
|
114 | 161 | .align 3 |
115 | 162 |
|
116 | 163 | .L121: |
| 164 | +#ifdef DOUBLE |
117 | 165 | xvld VX0, X, 0 * SIZE |
118 | 166 | ld.d t1, Y, 0 * SIZE |
119 | 167 | add.d Y, Y, INCY |
|
158 | 206 | xvstelm.d VX3, YY, 0, 2 |
159 | 207 | add.d YY, YY, INCY |
160 | 208 | xvstelm.d VX3, YY, 0, 3 |
| 209 | +#else |
| 210 | + xvld VX0, X, 0 * SIZE |
| 211 | + ld.w t1, Y, 0 * SIZE |
| 212 | + add.d Y, Y, INCY |
| 213 | + ld.w t2, Y, 0 * SIZE |
| 214 | + add.d Y, Y, INCY |
| 215 | + ld.w t3, Y, 0 * SIZE |
| 216 | + add.d Y, Y, INCY |
| 217 | + ld.w t4, Y, 0 * SIZE |
| 218 | + xvinsgr2vr.w VX2, t1, 0 |
| 219 | + xvinsgr2vr.w VX2, t2, 1 |
| 220 | + xvinsgr2vr.w VX2, t3, 2 |
| 221 | + xvinsgr2vr.w VX2, t4, 3 |
| 222 | + add.d Y, Y, INCY |
| 223 | + ld.w t1, Y, 0 * SIZE |
| 224 | + add.d Y, Y, INCY |
| 225 | + ld.w t2, Y, 0 * SIZE |
| 226 | + add.d Y, Y, INCY |
| 227 | + ld.w t3, Y, 0 * SIZE |
| 228 | + add.d Y, Y, INCY |
| 229 | + ld.w t4, Y, 0 * SIZE |
| 230 | + xvinsgr2vr.w VX2, t1, 4 |
| 231 | + xvinsgr2vr.w VX2, t2, 5 |
| 232 | + xvinsgr2vr.w VX2, t3, 6 |
| 233 | + xvinsgr2vr.w VX2, t4, 7 |
| 234 | + add.d Y, Y, INCY |
| 235 | + xvfmadd.s VX2, VX0, VXA, VX2 |
| 236 | + addi.d I, I, -1 |
| 237 | + xvstelm.w VX2, YY, 0, 0 |
| 238 | + add.d YY, YY, INCY |
| 239 | + xvstelm.w VX2, YY, 0, 1 |
| 240 | + add.d YY, YY, INCY |
| 241 | + xvstelm.w VX2, YY, 0, 2 |
| 242 | + add.d YY, YY, INCY |
| 243 | + xvstelm.w VX2, YY, 0, 3 |
| 244 | + add.d YY, YY, INCY |
| 245 | + xvstelm.w VX2, YY, 0, 4 |
| 246 | + add.d YY, YY, INCY |
| 247 | + xvstelm.w VX2, YY, 0, 5 |
| 248 | + add.d YY, YY, INCY |
| 249 | + xvstelm.w VX2, YY, 0, 6 |
| 250 | + add.d YY, YY, INCY |
| 251 | + xvstelm.w VX2, YY, 0, 7 |
| 252 | +#endif |
161 | 253 | add.d YY, YY, INCY |
162 | 254 | addi.d X, X, 8 * SIZE |
163 | 255 | blt $r0, I, .L121 |
|
169 | 261 | .align 3 |
170 | 262 |
|
171 | 263 | .L123: |
172 | | - fld.d $f12, X, 0 * SIZE |
173 | | - fld.d $f14, Y, 0 * SIZE |
| 264 | + LD $f12, X, 0 * SIZE |
| 265 | + LD $f14, Y, 0 * SIZE |
174 | 266 | addi.d I, I, -1 |
175 | | - fmadd.d $f14, $f12, $f0, $f14 |
176 | | - fst.d $f14, Y, 0 * SIZE |
| 267 | + MADD $f14, $f12, $f0, $f14 |
| 268 | + ST $f14, Y, 0 * SIZE |
177 | 269 | addi.d X, X, SIZE |
178 | 270 | add.d Y, Y, INCY |
179 | 271 | blt $r0, I, .L123 |
|
185 | 277 | .align 3 |
186 | 278 |
|
187 | 279 | .L211: |
| 280 | +#ifdef DOUBLE |
188 | 281 | xvld VX2, Y, 0 * SIZE |
189 | 282 | ld.d t1, X, 0 * SIZE |
190 | 283 | add.d X, X, INCX |
|
217 | 310 | addi.d I, I, -1 |
218 | 311 | xvst VX3, Y, 4 * SIZE |
219 | 312 | addi.d Y, Y, 8 * SIZE |
| 313 | +#else |
| 314 | + xvld VX2, Y, 0 * SIZE |
| 315 | + ld.w t1, X, 0 * SIZE |
| 316 | + add.d X, X, INCX |
| 317 | + ld.w t2, X, 0 * SIZE |
| 318 | + add.d X, X, INCX |
| 319 | + ld.w t3, X, 0 * SIZE |
| 320 | + add.d X, X, INCX |
| 321 | + ld.w t4, X, 0 * SIZE |
| 322 | + xvinsgr2vr.w VX0, t1, 0 |
| 323 | + xvinsgr2vr.w VX0, t2, 1 |
| 324 | + xvinsgr2vr.w VX0, t3, 2 |
| 325 | + xvinsgr2vr.w VX0, t4, 3 |
| 326 | + add.d X, X, INCX |
| 327 | + ld.w t1, X, 0 * SIZE |
| 328 | + add.d X, X, INCX |
| 329 | + ld.w t2, X, 0 * SIZE |
| 330 | + add.d X, X, INCX |
| 331 | + ld.w t3, X, 0 * SIZE |
| 332 | + add.d X, X, INCX |
| 333 | + ld.w t4, X, 0 * SIZE |
| 334 | + add.d X, X, INCX |
| 335 | + xvinsgr2vr.w VX0, t1, 4 |
| 336 | + xvinsgr2vr.w VX0, t2, 5 |
| 337 | + xvinsgr2vr.w VX0, t3, 6 |
| 338 | + xvinsgr2vr.w VX0, t4, 7 |
| 339 | + xvfmadd.s VX2, VX0, VXA, VX2 |
| 340 | + addi.d I, I, -1 |
| 341 | + xvst VX2, Y, 0 * SIZE |
| 342 | + addi.d Y, Y, 8 * SIZE |
| 343 | +#endif |
220 | 344 | blt $r0, I, .L211 |
221 | 345 | .align 3 |
222 | 346 |
|
|
226 | 350 | .align 3 |
227 | 351 |
|
228 | 352 | .L213: |
229 | | - fld.d $f12, X, 0 * SIZE |
230 | | - fld.d $f14, Y, 0 * SIZE |
| 353 | + LD $f12, X, 0 * SIZE |
| 354 | + LD $f14, Y, 0 * SIZE |
231 | 355 | addi.d I, I, -1 |
232 | | - fmadd.d $f14, $f12, $f0, $f14 |
233 | | - fst.d $f14, Y, 0 * SIZE |
| 356 | + MADD $f14, $f12, $f0, $f14 |
| 357 | + ST $f14, Y, 0 * SIZE |
234 | 358 | add.d X, X, INCX |
235 | 359 | addi.d Y, Y, SIZE |
236 | 360 | blt $r0, I, .L213 |
|
243 | 367 | .align 3 |
244 | 368 |
|
245 | 369 | .L222: |
| 370 | +#ifdef DOUBLE |
246 | 371 | ld.d t1, X, 0 * SIZE |
247 | 372 | add.d X, X, INCX |
248 | 373 | ld.d t2, X, 0 * SIZE |
|
309 | 434 | xvstelm.d VX3, YY, 0, 2 |
310 | 435 | add.d YY, YY, INCY |
311 | 436 | xvstelm.d VX3, YY, 0, 3 |
| 437 | +#else |
| 438 | + ld.w t1, X, 0 * SIZE |
| 439 | + add.d X, X, INCX |
| 440 | + ld.w t2, X, 0 * SIZE |
| 441 | + add.d X, X, INCX |
| 442 | + ld.w t3, X, 0 * SIZE |
| 443 | + add.d X, X, INCX |
| 444 | + ld.w t4, X, 0 * SIZE |
| 445 | + add.d X, X, INCX |
| 446 | + xvinsgr2vr.w VX0, t1, 0 |
| 447 | + xvinsgr2vr.w VX0, t2, 1 |
| 448 | + xvinsgr2vr.w VX0, t3, 2 |
| 449 | + xvinsgr2vr.w VX0, t4, 3 |
| 450 | + ld.w t1, Y, 0 * SIZE |
| 451 | + add.d Y, Y, INCY |
| 452 | + ld.w t2, Y, 0 * SIZE |
| 453 | + add.d Y, Y, INCY |
| 454 | + ld.w t3, Y, 0 * SIZE |
| 455 | + add.d Y, Y, INCY |
| 456 | + ld.w t4, Y, 0 * SIZE |
| 457 | + add.d Y, Y, INCY |
| 458 | + xvinsgr2vr.w VX2, t1, 0 |
| 459 | + xvinsgr2vr.w VX2, t2, 1 |
| 460 | + xvinsgr2vr.w VX2, t3, 2 |
| 461 | + xvinsgr2vr.w VX2, t4, 3 |
| 462 | + ld.w t1, X, 0 * SIZE |
| 463 | + add.d X, X, INCX |
| 464 | + ld.w t2, X, 0 * SIZE |
| 465 | + add.d X, X, INCX |
| 466 | + ld.w t3, X, 0 * SIZE |
| 467 | + add.d X, X, INCX |
| 468 | + ld.w t4, X, 0 * SIZE |
| 469 | + add.d X, X, INCX |
| 470 | + xvinsgr2vr.w VX0, t1, 4 |
| 471 | + xvinsgr2vr.w VX0, t2, 5 |
| 472 | + xvinsgr2vr.w VX0, t3, 6 |
| 473 | + xvinsgr2vr.w VX0, t4, 7 |
| 474 | + ld.w t1, Y, 0 * SIZE |
| 475 | + add.d Y, Y, INCY |
| 476 | + ld.w t2, Y, 0 * SIZE |
| 477 | + add.d Y, Y, INCY |
| 478 | + ld.w t3, Y, 0 * SIZE |
| 479 | + add.d Y, Y, INCY |
| 480 | + ld.w t4, Y, 0 * SIZE |
| 481 | + xvinsgr2vr.w VX2, t1, 4 |
| 482 | + xvinsgr2vr.w VX2, t2, 5 |
| 483 | + xvinsgr2vr.w VX2, t3, 6 |
| 484 | + xvinsgr2vr.w VX2, t4, 7 |
| 485 | + add.d Y, Y, INCY |
| 486 | + xvfmadd.s VX2, VX0, VXA, VX2 |
| 487 | + addi.d I, I, -1 |
| 488 | + xvstelm.w VX2, YY, 0, 0 |
| 489 | + add.d YY, YY, INCY |
| 490 | + xvstelm.w VX2, YY, 0, 1 |
| 491 | + add.d YY, YY, INCY |
| 492 | + xvstelm.w VX2, YY, 0, 2 |
| 493 | + add.d YY, YY, INCY |
| 494 | + xvstelm.w VX2, YY, 0, 3 |
| 495 | + add.d YY, YY, INCY |
| 496 | + xvstelm.w VX2, YY, 0, 4 |
| 497 | + add.d YY, YY, INCY |
| 498 | + xvstelm.w VX2, YY, 0, 5 |
| 499 | + add.d YY, YY, INCY |
| 500 | + xvstelm.w VX2, YY, 0, 6 |
| 501 | + add.d YY, YY, INCY |
| 502 | + xvstelm.w VX2, YY, 0, 7 |
| 503 | +#endif |
312 | 504 | add.d YY, YY, INCY |
313 | 505 | blt $r0, I, .L222 |
314 | 506 | .align 3 |
|
319 | 511 | .align 3 |
320 | 512 |
|
321 | 513 | .L224: |
322 | | - fld.d $f12, X, 0 * SIZE |
323 | | - fld.d $f14, Y, 0 * SIZE |
| 514 | + LD $f12, X, 0 * SIZE |
| 515 | + LD $f14, Y, 0 * SIZE |
324 | 516 | addi.d I, I, -1 |
325 | | - fmadd.d $f14, $f12, $f0, $f14 |
326 | | - fst.d $f14, Y, 0 * SIZE |
| 517 | + MADD $f14, $f12, $f0, $f14 |
| 518 | + ST $f14, Y, 0 * SIZE |
327 | 519 | add.d X, X, INCX |
328 | 520 | add.d Y, Y, INCY |
329 | 521 | blt $r0, I, .L224 |
330 | | - b .L999 |
331 | 522 | .align 3 |
332 | 523 |
|
333 | 524 | .L999: |
|
0 commit comments