@@ -51,6 +51,8 @@ PROLOGUE
5151 LDINT INCX, 0 (INCX)
5252 LDINT INCY, 0 (INCY)
5353#endif
54+
55+ /* init $f8 and $f9 to zero */
5456 SUB s1, s1, s1
5557 SUB s2, s2, s2
5658 slli.d INCX, INCX, BASE_SHIFT
@@ -59,25 +61,33 @@ PROLOGUE
5961 bge $r0, N, .L999
6062 bne INCX, TEMP, .L20 /* inc_x=1 */
6163 bne INCY, TEMP, .L20 /* inc_y=1 */
62- #ifdef DOUBLE
63- srai.d I, N, 4
64- #else
65- srai.d I, N, 5
66- #endif
64+
65+ /* !((inc_x == 1) && (inc_y == 1)) */
6766
6867 /* init $xr8 and $xr9 to zero */
6968#ifdef DOUBLE
7069 xvldrepl.d $xr0, X, 0
7170#else
7271 xvldrepl.w $xr0, X, 0
7372#endif
73+ #ifdef DSDOT
74+ xvfcvtl.d.s $xr0, $xr0
75+ xvfsub.d $xr8, $xr0, $xr0
76+ xvfsub.d $xr9, $xr0, $xr0
77+ #else
7478 XVFSUB $xr8, $xr0, $xr0
7579 XVFSUB $xr9, $xr0, $xr0
80+ #endif
7681
77- /* !((inc_x == 1) && (inc_y == 1)) */
78- bge $r0, I, .L12 /* <32 */
82+ #ifdef DOUBLE
83+ srai.d I, N, 4
84+ #else
85+ srai.d I, N, 5
86+ #endif
87+ bge $r0, I, .L12 /* FLOAT: <32 ; DOUBLE: <16 */
88+ .align 3
7989.L11:
80- /* case 32~ */
90+ /* FLOAT: 32~ ; DOUBLE: 16 ~ */
8191 xvld $xr0, X, 0
8292 xvld $xr1, X, 32
8393 xvld $xr2, X, 64
@@ -89,11 +99,39 @@ PROLOGUE
8999 addi.w I, I, -1
90100 addi.d X, X, 128
91101 addi.d Y, Y, 128
102+ #ifdef DSDOT
103+ xvfcvtl.d.s $xr10, $xr0
104+ xvfcvtl.d.s $xr11, $xr4
105+ xvfcvth.d.s $xr12, $xr0
106+ xvfcvth.d.s $xr13, $xr4
107+ xvfmadd.d $xr8, $xr10, $xr12, $xr8
108+ xvfmadd.d $xr9, $xr11, $xr13, $xr9
109+ xvfcvtl.d.s $xr10, $xr1
110+ xvfcvtl.d.s $xr11, $xr5
111+ xvfcvth.d.s $xr12, $xr1
112+ xvfcvth.d.s $xr13, $xr5
113+ xvfmadd.d $xr8, $xr10, $xr12, $xr8
114+ xvfmadd.d $xr9, $xr11, $xr13, $xr9
115+ xvfcvtl.d.s $xr10, $xr2
116+ xvfcvtl.d.s $xr11, $xr6
117+ xvfcvth.d.s $xr12, $xr2
118+ xvfcvth.d.s $xr13, $xr6
119+ xvfmadd.d $xr8, $xr10, $xr12, $xr8
120+ xvfmadd.d $xr9, $xr11, $xr13, $xr9
121+ xvfcvtl.d.s $xr10, $xr3
122+ xvfcvtl.d.s $xr11, $xr7
123+ xvfcvth.d.s $xr12, $xr3
124+ xvfcvth.d.s $xr13, $xr7
125+ xvfmadd.d $xr8, $xr10, $xr12, $xr8
126+ xvfmadd.d $xr9, $xr11, $xr13, $xr9
127+ #else
92128 XVFMADD $xr8, $xr0, $xr4, $xr8
93129 XVFMADD $xr9, $xr1, $xr5, $xr9
94130 XVFMADD $xr8, $xr2, $xr6, $xr8
95131 XVFMADD $xr9, $xr3, $xr7, $xr9
132+ #endif
96133 bnez I, .L11
134+ .align 3
97135.L12:
98136#ifdef DOUBLE
99137 andi I, N, 0xf
@@ -102,18 +140,37 @@ PROLOGUE
102140 andi I, N, 0x1f
103141 srai.d I, I, 3
104142#endif
105- bge $r0, I, .L14 /* <8 */
143+ bge $r0, I, .L14 /* DOUBLE: <4 ; FLOAT: <8 */
144+ .align 3
106145.L13:
107- /* case 8~31 */
146+ /* FLOAT: 8~31 ; DOUBLE: 4~15 */
108147 xvld $xr0, X, 0
109148 xvld $xr4, Y, 0
110149 addi.w I, I, -1
111150 addi.d X, X, 32
112151 addi.d Y, Y, 32
152+ #ifdef DSDOT
153+ xvfcvtl.d.s $xr10, $xr0
154+ xvfcvtl.d.s $xr11, $xr4
155+ xvfcvth.d.s $xr12, $xr0
156+ xvfcvth.d.s $xr13, $xr4
157+ xvfmadd.d $xr8, $xr10, $xr12, $xr8
158+ xvfmadd.d $xr9, $xr11, $xr13, $xr9
159+ #else
113160 XVFMADD $xr8, $xr0, $xr4, $xr8
161+ #endif
114162 bnez I, .L13
163+ .align 3
115164.L14:
116165 /* store dot in s1 $f8 */
166+ #ifdef DSDOT
167+ xvfadd.d $xr8, $xr8, $xr9
168+ fsub.s s2, s2, s2 /* set s2 to 0.0 */
169+ xvpermi.q $xr0, $xr8, 0x1
170+ vfadd.d $vr8, $vr8, $vr0
171+ vpackod.d $vr0, $vr8, $vr8
172+ vfadd.d $vr8, $vr8, $vr0
173+ #else
117174 XVFADD $xr8, $xr8, $xr9
118175 SUB s2, s2, s2 /* set s2 to 0.0 */
119176 xvpermi.q $xr0, $xr8, 0x1
@@ -125,7 +182,9 @@ PROLOGUE
125182 VFADD $vr8, $vr8, $vr0
126183 vpackod.w $vr0, $vr8, $vr8
127184 VFADD $vr8, $vr8, $vr0
128- #endif
185+ #endif /* defined DOUBLE */
186+ #endif /* defined DSDOT */
187+ .align 3
129188.L15:
130189#ifdef DOUBLE
131190 andi I, N, 0x3
@@ -135,7 +194,7 @@ PROLOGUE
135194 bge $r0, I, .L999 /* =0 */
136195 .align 3
137196.L16:
138- /* case 1~7 */
197+ /* FLOAT: 1~7 ; DOUBLE: 1~3 */
139198 LD a1, X, 0
140199 LD b1, Y, 0
141200#ifdef DSDOT