|
| 1 | +/*************************************************************************** |
| 2 | +Copyright (c) 2025, The OpenBLAS Project |
| 3 | +All rights reserved. |
| 4 | +Redistribution and use in source and binary forms, with or without |
| 5 | +modification, are permitted provided that the following conditions are |
| 6 | +met: |
| 7 | +
|
| 8 | + 1. Redistributions of source code must retain the above copyright |
| 9 | + notice, this list of conditions and the following disclaimer. |
| 10 | +
|
| 11 | + 2. Redistributions in binary form must reproduce the above copyright |
| 12 | + notice, this list of conditions and the following disclaimer in |
| 13 | + the documentation and/or other materials provided with the |
| 14 | + distribution. |
| 15 | + 3. Neither the name of the OpenBLAS project nor the names of |
| 16 | + its contributors may be used to endorse or promote products |
| 17 | + derived from this software without specific prior written |
| 18 | + permission. |
| 19 | +
|
| 20 | +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| 21 | +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 22 | +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 23 | +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
| 24 | +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 25 | +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
| 26 | +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| 27 | +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| 28 | +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE |
| 29 | +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 30 | +*****************************************************************************/ |
| 31 | + |
| 32 | +#include "symv_microk_asimd_4x4.c" |
| 33 | + |
| 34 | +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, |
| 35 | + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) |
| 36 | +{ |
| 37 | + BLASLONG i, j; |
| 38 | + FLOAT temp1, temp2; |
| 39 | + FLOAT tmp1[4]; |
| 40 | + FLOAT tmp2[4]; |
| 41 | + FLOAT *a0, *a1, *a2, *a3; |
| 42 | + FLOAT x0, x1, x2, x3; |
| 43 | + FLOAT *X = x; |
| 44 | + FLOAT *Y = y; |
| 45 | + |
| 46 | + if (inc_y != 1) { |
| 47 | + Y = buffer; |
| 48 | + COPY_K(m, y, inc_y, Y, 1); |
| 49 | + } |
| 50 | + if (inc_x != 1) { |
| 51 | + if (inc_y != 1) { |
| 52 | + X = Y + m; |
| 53 | + } else { |
| 54 | + X = buffer; |
| 55 | + } |
| 56 | + COPY_K(m, x, inc_x, X, 1); |
| 57 | + } |
| 58 | + |
| 59 | + BLASLONG offset1 = (offset / 4) * 4; |
| 60 | + for (j = 0; j < offset1; j+=4) { |
| 61 | + a0 = &a[j*lda]; |
| 62 | + a1 = a0 + lda; |
| 63 | + a2 = a1 + lda; |
| 64 | + a3 = a2 + lda; |
| 65 | + x0 = X[j]; |
| 66 | + x1 = X[j+1]; |
| 67 | + x2 = X[j+2]; |
| 68 | + x3 = X[j+3]; |
| 69 | + tmp2[0] = a0[j ]*x0 + a0[j+1]*x1 + a0[j+2]*x2 + a0[j+3]*x3; |
| 70 | + tmp2[1] = a0[j+1]*x0 + a1[j+1]*x1 + a1[j+2]*x2 + a1[j+3]*x3; |
| 71 | + tmp2[2] = a0[j+2]*x0 + a1[j+2]*x1 + a2[j+2]*x2 + a2[j+3]*x3; |
| 72 | + tmp2[3] = a0[j+3]*x0 + a1[j+3]*x1 + a2[j+3]*x2 + a3[j+3]*x3; |
| 73 | + tmp1[0] = alpha * x0; |
| 74 | + tmp1[1] = alpha * x1; |
| 75 | + tmp1[2] = alpha * x2; |
| 76 | + tmp1[3] = alpha * x3; |
| 77 | + |
| 78 | + BLASLONG m2 = (m/4)*4; |
| 79 | + if (m2 > j+4) |
| 80 | + symv_kernel_4x4(j+4, m2, a0, a1, a2, a3, X, Y, tmp1, tmp2); |
| 81 | + |
| 82 | + for (i = m2; i < m; i++) { |
| 83 | + Y[i] += tmp1[0] * a0[i]; |
| 84 | + tmp2[0] += a0[i] * X[i]; |
| 85 | + Y[i] += tmp1[1] * a1[i]; |
| 86 | + tmp2[1] += a1[i] * X[i]; |
| 87 | + Y[i] += tmp1[2] * a2[i]; |
| 88 | + tmp2[2] += a2[i] * X[i]; |
| 89 | + Y[i] += tmp1[3] * a3[i]; |
| 90 | + tmp2[3] += a3[i] * X[i]; |
| 91 | + } |
| 92 | + Y[j] += alpha * tmp2[0]; |
| 93 | + Y[j+1] += alpha * tmp2[1]; |
| 94 | + Y[j+2] += alpha * tmp2[2]; |
| 95 | + Y[j+3] += alpha * tmp2[3]; |
| 96 | + } |
| 97 | + |
| 98 | + for (j = offset1; j < offset; j++) { |
| 99 | + temp1 = alpha * X[j]; |
| 100 | + temp2 = 0.0; |
| 101 | + Y[j] += temp1 * a[j*lda+j]; |
| 102 | + for (i = j+1; i < m; i++) { |
| 103 | + Y[i] += temp1 * a[j*lda+i]; |
| 104 | + temp2 += a[j*lda+i] * X[i]; |
| 105 | + } |
| 106 | + Y[j] += alpha * temp2; |
| 107 | + } |
| 108 | + |
| 109 | + if (inc_y != 1) { |
| 110 | + COPY_K(m, Y, 1, y, inc_y); |
| 111 | + } |
| 112 | + return(0); |
| 113 | +} |
0 commit comments