@@ -2298,10 +2298,9 @@ void av1_lowbd_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *output,
22982298 int ud_flip , lr_flip ;
22992299
23002300 get_flip_cfg (tx_type , & ud_flip , & lr_flip );
2301- const int16x4_t v_shifts = vget_low_s16 (vmovl_s8 (vld1_s8 (& shift [0 ])));
2302- const int16x8_t v_shift0 = vdupq_lane_s16 (v_shifts , 0 );
2303- const int16x8_t v_shift1 = vdupq_lane_s16 (v_shifts , 1 );
2304- const int16x8_t v_shift2 = vdupq_lane_s16 (v_shifts , 2 );
2301+ const int16x8_t v_shift0 = vdupq_n_s16 (shift [0 ]);
2302+ const int16x8_t v_shift1 = vdupq_n_s16 (shift [1 ]);
2303+ const int16x8_t v_shift2 = vdupq_n_s16 (shift [2 ]);
23052304 if (ud_flip ) {
23062305 load_buffer_16bit_to_16bit_w4_flip (input , stride , buf0 , height );
23072306 } else {
@@ -2342,10 +2341,9 @@ void av1_lowbd_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *output,
23422341 int ud_flip , lr_flip ;
23432342
23442343 get_flip_cfg (tx_type , & ud_flip , & lr_flip );
2345- const int16x4_t v_shifts = vget_low_s16 (vmovl_s8 (vld1_s8 (& shift [0 ])));
2346- const int16x8_t v_shift0 = vdupq_lane_s16 (v_shifts , 0 );
2347- const int16x8_t v_shift1 = vdupq_lane_s16 (v_shifts , 1 );
2348- const int16x8_t v_shift2 = vdupq_lane_s16 (v_shifts , 2 );
2344+ const int16x8_t v_shift0 = vdupq_n_s16 (shift [0 ]);
2345+ const int16x8_t v_shift1 = vdupq_n_s16 (shift [1 ]);
2346+ const int16x8_t v_shift2 = vdupq_n_s16 (shift [2 ]);
23492347 if (ud_flip ) {
23502348 load_buffer_16bit_to_16bit_w4_flip (input , stride , buf0 , height );
23512349 } else {
@@ -2384,10 +2382,9 @@ void av1_lowbd_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *output,
23842382 int ud_flip , lr_flip ;
23852383
23862384 get_flip_cfg (tx_type , & ud_flip , & lr_flip );
2387- const int16x4_t v_shifts = vget_low_s16 (vmovl_s8 (vld1_s8 (& shift [0 ])));
2388- const int16x8_t v_shift0 = vdupq_lane_s16 (v_shifts , 0 );
2389- const int16x8_t v_shift1 = vdupq_lane_s16 (v_shifts , 1 );
2390- const int16x8_t v_shift2 = vdupq_lane_s16 (v_shifts , 2 );
2385+ const int16x8_t v_shift0 = vdupq_n_s16 (shift [0 ]);
2386+ const int16x8_t v_shift1 = vdupq_n_s16 (shift [1 ]);
2387+ const int16x8_t v_shift2 = vdupq_n_s16 (shift [2 ]);
23912388 if (ud_flip ) {
23922389 load_buffer_16bit_to_16bit_w4_flip (input , stride , buf0 , height );
23932390 } else {
@@ -2430,10 +2427,9 @@ void av1_lowbd_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *output,
24302427 int ud_flip , lr_flip ;
24312428
24322429 get_flip_cfg (tx_type , & ud_flip , & lr_flip );
2433- const int16x4_t v_shifts = vget_low_s16 (vmovl_s8 (vld1_s8 (& shift [0 ])));
2434- const int16x8_t v_shift0 = vdupq_lane_s16 (v_shifts , 0 );
2435- const int16x8_t v_shift1 = vdupq_lane_s16 (v_shifts , 1 );
2436- const int16x8_t v_shift2 = vdupq_lane_s16 (v_shifts , 2 );
2430+ const int16x8_t v_shift0 = vdupq_n_s16 (shift [0 ]);
2431+ const int16x8_t v_shift1 = vdupq_n_s16 (shift [1 ]);
2432+ const int16x8_t v_shift2 = vdupq_n_s16 (shift [2 ]);
24372433 if (ud_flip )
24382434 load_buffer_16bit_to_16bit_flip (input , stride , buf0 , height );
24392435 else
@@ -2471,10 +2467,9 @@ void av1_lowbd_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *output,
24712467 int ud_flip , lr_flip ;
24722468
24732469 get_flip_cfg (tx_type , & ud_flip , & lr_flip );
2474- const int16x4_t v_shifts = vget_low_s16 (vmovl_s8 (vld1_s8 (& shift [0 ])));
2475- const int16x8_t v_shift0 = vdupq_lane_s16 (v_shifts , 0 );
2476- const int16x8_t v_shift1 = vdupq_lane_s16 (v_shifts , 1 );
2477- const int16x8_t v_shift2 = vdupq_lane_s16 (v_shifts , 2 );
2470+ const int16x8_t v_shift0 = vdupq_n_s16 (shift [0 ]);
2471+ const int16x8_t v_shift1 = vdupq_n_s16 (shift [1 ]);
2472+ const int16x8_t v_shift2 = vdupq_n_s16 (shift [2 ]);
24782473 if (ud_flip )
24792474 load_buffer_16bit_to_16bit_flip (input , stride , buf0 , height );
24802475 else
@@ -2512,10 +2507,9 @@ void av1_lowbd_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *output,
25122507 int ud_flip , lr_flip ;
25132508
25142509 get_flip_cfg (tx_type , & ud_flip , & lr_flip );
2515- const int16x4_t v_shifts = vget_low_s16 (vmovl_s8 (vld1_s8 (& shift [0 ])));
2516- const int16x8_t v_shift0 = vdupq_lane_s16 (v_shifts , 0 );
2517- const int16x8_t v_shift1 = vdupq_lane_s16 (v_shifts , 1 );
2518- const int16x8_t v_shift2 = vdupq_lane_s16 (v_shifts , 2 );
2510+ const int16x8_t v_shift0 = vdupq_n_s16 (shift [0 ]);
2511+ const int16x8_t v_shift1 = vdupq_n_s16 (shift [1 ]);
2512+ const int16x8_t v_shift2 = vdupq_n_s16 (shift [2 ]);
25192513 if (ud_flip ) {
25202514 load_buffer_16bit_to_16bit_flip (input , stride , buf0 , height );
25212515 } else {
@@ -2558,10 +2552,9 @@ void av1_lowbd_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *output,
25582552 int ud_flip , lr_flip ;
25592553
25602554 get_flip_cfg (tx_type , & ud_flip , & lr_flip );
2561- const int16x4_t v_shifts = vget_low_s16 (vmovl_s8 (vld1_s8 (& shift [0 ])));
2562- const int16x8_t v_shift0 = vdupq_lane_s16 (v_shifts , 0 );
2563- const int16x8_t v_shift1 = vdupq_lane_s16 (v_shifts , 1 );
2564- const int16x8_t v_shift2 = vdupq_lane_s16 (v_shifts , 2 );
2555+ const int16x8_t v_shift0 = vdupq_n_s16 (shift [0 ]);
2556+ const int16x8_t v_shift1 = vdupq_n_s16 (shift [1 ]);
2557+ const int16x8_t v_shift2 = vdupq_n_s16 (shift [2 ]);
25652558 if (ud_flip ) {
25662559 load_buffer_16bit_to_16bit_flip (input , stride , buf0 , height );
25672560 } else {
@@ -2607,10 +2600,9 @@ void av1_lowbd_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *output,
26072600 int ud_flip , lr_flip ;
26082601
26092602 get_flip_cfg (tx_type , & ud_flip , & lr_flip );
2610- const int16x4_t v_shifts = vget_low_s16 (vmovl_s8 (vld1_s8 (& shift [0 ])));
2611- const int16x8_t v_shift0 = vdupq_lane_s16 (v_shifts , 0 );
2612- const int16x8_t v_shift1 = vdupq_lane_s16 (v_shifts , 1 );
2613- const int16x8_t v_shift2 = vdupq_lane_s16 (v_shifts , 2 );
2603+ const int16x8_t v_shift0 = vdupq_n_s16 (shift [0 ]);
2604+ const int16x8_t v_shift1 = vdupq_n_s16 (shift [1 ]);
2605+ const int16x8_t v_shift2 = vdupq_n_s16 (shift [2 ]);
26142606 for (int i = 0 ; i < 2 ; i ++ ) {
26152607 if (ud_flip ) {
26162608 load_buffer_16bit_to_16bit_flip (input + 8 * i , stride , buf0 , height );
@@ -2654,10 +2646,9 @@ void av1_lowbd_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *output,
26542646 int ud_flip , lr_flip ;
26552647
26562648 get_flip_cfg (tx_type , & ud_flip , & lr_flip );
2657- const int16x4_t v_shifts = vget_low_s16 (vmovl_s8 (vld1_s8 (& shift [0 ])));
2658- const int16x8_t v_shift0 = vdupq_lane_s16 (v_shifts , 0 );
2659- const int16x8_t v_shift1 = vdupq_lane_s16 (v_shifts , 1 );
2660- const int16x8_t v_shift2 = vdupq_lane_s16 (v_shifts , 2 );
2649+ const int16x8_t v_shift0 = vdupq_n_s16 (shift [0 ]);
2650+ const int16x8_t v_shift1 = vdupq_n_s16 (shift [1 ]);
2651+ const int16x8_t v_shift2 = vdupq_n_s16 (shift [2 ]);
26612652 for (int i = 0 ; i < 2 ; i ++ ) {
26622653 if (ud_flip ) {
26632654 load_buffer_16bit_to_16bit_flip (input + 8 * i , stride , buf0 , height );
@@ -2700,10 +2691,9 @@ void av1_lowbd_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *output,
27002691 int ud_flip , lr_flip ;
27012692
27022693 get_flip_cfg (tx_type , & ud_flip , & lr_flip );
2703- const int16x4_t v_shifts = vget_low_s16 (vmovl_s8 (vld1_s8 (& shift [0 ])));
2704- const int16x8_t v_shift0 = vdupq_lane_s16 (v_shifts , 0 );
2705- const int16x8_t v_shift1 = vdupq_lane_s16 (v_shifts , 1 );
2706- const int16x8_t v_shift2 = vdupq_lane_s16 (v_shifts , 2 );
2694+ const int16x8_t v_shift0 = vdupq_n_s16 (shift [0 ]);
2695+ const int16x8_t v_shift1 = vdupq_n_s16 (shift [1 ]);
2696+ const int16x8_t v_shift2 = vdupq_n_s16 (shift [2 ]);
27072697
27082698 for (int i = 0 ; i < 2 ; i ++ ) {
27092699 if (ud_flip ) {
@@ -2753,10 +2743,9 @@ void av1_lowbd_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *output,
27532743 if (col_txfm != NULL && row_txfm != NULL ) {
27542744 int ud_flip , lr_flip ;
27552745 get_flip_cfg (tx_type , & ud_flip , & lr_flip );
2756- const int16x4_t v_shifts = vget_low_s16 (vmovl_s8 (vld1_s8 (& shift [0 ])));
2757- const int16x8_t v_shift0 = vdupq_lane_s16 (v_shifts , 0 );
2758- const int16x8_t v_shift1 = vdupq_lane_s16 (v_shifts , 1 );
2759- const int16x8_t v_shift2 = vdupq_lane_s16 (v_shifts , 2 );
2746+ const int16x8_t v_shift0 = vdupq_n_s16 (shift [0 ]);
2747+ const int16x8_t v_shift1 = vdupq_n_s16 (shift [1 ]);
2748+ const int16x8_t v_shift2 = vdupq_n_s16 (shift [2 ]);
27602749
27612750 for (int i = 0 ; i < 2 ; i ++ ) {
27622751 if (ud_flip ) {
@@ -2812,10 +2801,9 @@ void av1_lowbd_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *output,
28122801 if (col_txfm != NULL && row_txfm != NULL ) {
28132802 int ud_flip , lr_flip ;
28142803 get_flip_cfg (tx_type , & ud_flip , & lr_flip );
2815- const int16x4_t v_shifts = vget_low_s16 (vmovl_s8 (vld1_s8 (& shift [0 ])));
2816- const int16x8_t v_shift0 = vdupq_lane_s16 (v_shifts , 0 );
2817- const int16x8_t v_shift1 = vdupq_lane_s16 (v_shifts , 1 );
2818- const int16x8_t v_shift2 = vdupq_lane_s16 (v_shifts , 2 );
2804+ const int16x8_t v_shift0 = vdupq_n_s16 (shift [0 ]);
2805+ const int16x8_t v_shift1 = vdupq_n_s16 (shift [1 ]);
2806+ const int16x8_t v_shift2 = vdupq_n_s16 (shift [2 ]);
28192807
28202808 for (int i = 0 ; i < 4 ; i ++ ) {
28212809 if (ud_flip ) {
@@ -2872,10 +2860,9 @@ void av1_lowbd_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *output,
28722860 const transform_1d_lbd_neon row_txfm = row_txfm8x32_arr [tx_type ];
28732861
28742862 if (col_txfm != NULL && row_txfm != NULL ) {
2875- const int16x4_t v_shifts = vget_low_s16 (vmovl_s8 (vld1_s8 (& shift [0 ])));
2876- const int16x8_t v_shift0 = vdupq_lane_s16 (v_shifts , 0 );
2877- const int16x8_t v_shift1 = vdupq_lane_s16 (v_shifts , 1 );
2878- const int16x8_t v_shift2 = vdupq_lane_s16 (v_shifts , 2 );
2863+ const int16x8_t v_shift0 = vdupq_n_s16 (shift [0 ]);
2864+ const int16x8_t v_shift1 = vdupq_n_s16 (shift [1 ]);
2865+ const int16x8_t v_shift2 = vdupq_n_s16 (shift [2 ]);
28792866 int ud_flip , lr_flip ;
28802867 get_flip_cfg (tx_type , & ud_flip , & lr_flip );
28812868
0 commit comments