2525 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
2626 * POSSIBILITY OF SUCH DAMAGE.
2727 * *****************************************************************************/
28+ #include <arm_neon.h>
2829
2930#include "common.h"
3031
@@ -34,6 +35,9 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
3435 a_offset = a ;
3536 b_offset = b ;
3637
38+ uint16x8_t v0 , v1 , v2 , v3 , v4 , v5 , v6 , v7 ;
39+ uint16x4_t v0_h , v1_h , v2_h , v3_h , v4_h , v5_h , v6_h , v7_h ;
40+
3741 for (BLASLONG j = 0 ; j < n / 8 ; j ++ ) {
3842 a_offset0 = a_offset ;
3943 a_offset1 = a_offset0 + lda ;
@@ -42,12 +46,29 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
4246 a_offset += 8 ;
4347
4448 for (BLASLONG i = 0 ; i < m / 4 ; i ++ ) {
45- for (BLASLONG line = 0 ; line < 8 ; line ++ ) {
46- b_offset [line * 4 ] = a_offset0 [line ];
47- b_offset [line * 4 + 1 ] = a_offset1 [line ];
48- b_offset [line * 4 + 2 ] = a_offset2 [line ];
49- b_offset [line * 4 + 3 ] = a_offset3 [line ];
50- }
49+ v0 = vld1q_u16 (a_offset0 );
50+ v1 = vld1q_u16 (a_offset1 );
51+ v2 = vld1q_u16 (a_offset2 );
52+ v3 = vld1q_u16 (a_offset3 );
53+
54+ v4 = vtrn1q_u16 (v0 , v1 );
55+ v5 = vtrn2q_u16 (v0 , v1 );
56+ v6 = vtrn1q_u16 (v2 , v3 );
57+ v7 = vtrn2q_u16 (v2 , v3 );
58+
59+ v0 = (uint16x8_t )vtrn1q_u32 ((uint32x4_t )v4 , (uint32x4_t )v6 );
60+ v1 = (uint16x8_t )vtrn1q_u32 ((uint32x4_t )v5 , (uint32x4_t )v7 );
61+ v2 = (uint16x8_t )vtrn2q_u32 ((uint32x4_t )v4 , (uint32x4_t )v6 );
62+ v3 = (uint16x8_t )vtrn2q_u32 ((uint32x4_t )v5 , (uint32x4_t )v7 );
63+
64+ vst1_u16 (b_offset , vget_low_u16 (v0 ));
65+ vst1_u16 (b_offset + 4 , vget_low_u16 (v1 ));
66+ vst1_u16 (b_offset + 8 , vget_low_u16 (v2 ));
67+ vst1_u16 (b_offset + 12 , vget_low_u16 (v3 ));
68+ vst1_u16 (b_offset + 16 , vget_high_u16 (v0 ));
69+ vst1_u16 (b_offset + 20 , vget_high_u16 (v1 ));
70+ vst1_u16 (b_offset + 24 , vget_high_u16 (v2 ));
71+ vst1_u16 (b_offset + 28 , vget_high_u16 (v3 ));
5172
5273 b_offset += 32 ;
5374 a_offset0 += 4 * lda ;
@@ -76,12 +97,25 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
7697 a_offset += 4 ;
7798
7899 for (BLASLONG i = 0 ; i < m / 4 ; i ++ ) {
79- for (BLASLONG line = 0 ; line < 4 ; line ++ ) {
80- b_offset [line * 4 ] = a_offset0 [line ];
81- b_offset [line * 4 + 1 ] = a_offset1 [line ];
82- b_offset [line * 4 + 2 ] = a_offset2 [line ];
83- b_offset [line * 4 + 3 ] = a_offset3 [line ];
84- }
100+ v0_h = vld1_u16 (a_offset0 );
101+ v1_h = vld1_u16 (a_offset1 );
102+ v2_h = vld1_u16 (a_offset2 );
103+ v3_h = vld1_u16 (a_offset3 );
104+
105+ v4_h = vtrn1_u16 (v0_h , v1_h );
106+ v5_h = vtrn2_u16 (v0_h , v1_h );
107+ v6_h = vtrn1_u16 (v2_h , v3_h );
108+ v7_h = vtrn2_u16 (v2_h , v3_h );
109+
110+ v0_h = (uint16x4_t )vtrn1_u32 ((uint32x2_t )v4_h , (uint32x2_t )v6_h );
111+ v1_h = (uint16x4_t )vtrn1_u32 ((uint32x2_t )v5_h , (uint32x2_t )v7_h );
112+ v2_h = (uint16x4_t )vtrn2_u32 ((uint32x2_t )v4_h , (uint32x2_t )v6_h );
113+ v3_h = (uint16x4_t )vtrn2_u32 ((uint32x2_t )v5_h , (uint32x2_t )v7_h );
114+
115+ vst1_u16 (b_offset , v0_h );
116+ vst1_u16 (b_offset + 4 , v1_h );
117+ vst1_u16 (b_offset + 8 , v2_h );
118+ vst1_u16 (b_offset + 12 , v3_h );
85119
86120 b_offset += 16 ;
87121 a_offset0 += 4 * lda ;
0 commit comments