1 /*
2  * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <arm_neon.h>
13 #include <assert.h>
14 
15 #include "av1/common/av1_inv_txfm1d_cfg.h"
16 #include "av1/common/idct.h"
17 #include "config/aom_config.h"
18 #include "config/av1_rtcd.h"
19 
20 #if AOM_ARCH_AARCH64
21 #define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3)         \
22   do {                                                        \
23     int32x4x2_t swap_low = vtrnq_s32(x0, x1);                 \
24     int32x4x2_t swap_high = vtrnq_s32(x2, x3);                \
25     y0 = vreinterpretq_s32_s64(                               \
26         vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[0]),    \
27                    vreinterpretq_s64_s32(swap_high.val[0]))); \
28     y1 = vreinterpretq_s32_s64(                               \
29         vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[1]),    \
30                    vreinterpretq_s64_s32(swap_high.val[1]))); \
31     y2 = vreinterpretq_s32_s64(                               \
32         vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[0]),    \
33                    vreinterpretq_s64_s32(swap_high.val[0]))); \
34     y3 = vreinterpretq_s32_s64(                               \
35         vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[1]),    \
36                    vreinterpretq_s64_s32(swap_high.val[1]))); \
37   } while (0)
38 #else
39 #define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3)                    \
40   do {                                                                   \
41     int32x4x2_t swap_low = vtrnq_s32(x0, x1);                            \
42     int32x4x2_t swap_high = vtrnq_s32(x2, x3);                           \
43     y0 = vextq_s32(vextq_s32(swap_low.val[0], swap_low.val[0], 2),       \
44                    swap_high.val[0], 2);                                 \
45     y1 = vextq_s32(vextq_s32(swap_low.val[1], swap_low.val[1], 2),       \
46                    swap_high.val[1], 2);                                 \
47     y2 = vextq_s32(swap_low.val[0],                                      \
48                    vextq_s32(swap_high.val[0], swap_high.val[0], 2), 2); \
49     y3 = vextq_s32(swap_low.val[1],                                      \
50                    vextq_s32(swap_high.val[1], swap_high.val[1], 2), 2); \
51   } while (0)
52 #endif  // AOM_ARCH_AARCH64
53 
54 static inline void transpose_4x4(const int32x4_t *in, int32x4_t *out) {
55   TRANSPOSE_4X4(in[0], in[1], in[2], in[3], out[0], out[1], out[2], out[3]);
56 }
57 
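// Transpose an 8x8 block held in 16 int32x4_t vectors, where in[2 * i] and
// in[2 * i + 1] are the left and right halves of row i. The transpose is
// carried out as four 4x4 sub-block transposes.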
58 static inline void transpose_8x8(const int32x4_t *in, int32x4_t *out) {
59   TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]);
60   TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]);
61   TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]);
62   TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13],
63                 out[15]);
64 }
65 
66 static inline void round_shift_array_32_neon(int32x4_t *input,
67                                              int32x4_t *output, const int size,
68                                              const int bit) {
69   const int32x4_t v_bit = vdupq_n_s32(-bit);
70   for (int i = 0; i < size; i++) {
71     output[i] = vrshlq_s32(input[i], v_bit);
72   }
73 }
74 
75 static inline void round_shift_rect_array_32_neon(int32x4_t *input,
76                                                   int32x4_t *output,
77                                                   const int size) {
78   for (int i = 0; i < size; i++) {
79     const int32x4_t r0 = vmulq_n_s32(input[i], NewInvSqrt2);
80     output[i] = vrshrq_n_s32(r0, NewSqrt2Bits);
81   }
82 }
83 
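// Half-butterfly helpers: half_btf_neon_r returns (*w0 * *n0 + *w1 * *n1 +
// *rnding) shifted by *v_bit (which holds -bit, i.e. an arithmetic right
// shift). The mode01/mode10/mode11 variants negate one or both products.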
84 static inline int32x4_t half_btf_neon_r(const int32_t *n0, const int32x4_t *w0,
85                                         const int32_t *n1, const int32x4_t *w1,
86                                         const int32x4_t *v_bit,
87                                         const int32x4_t *rnding) {
88   int32x4_t x;
89   x = vmlaq_n_s32(*rnding, *w0, *n0);
90   x = vmlaq_n_s32(x, *w1, *n1);
91   x = vshlq_s32(x, *v_bit);
92   return x;
93 }
94 
95 static inline int32x4_t half_btf_neon_mode11_r(
96     const int32_t *n0, const int32x4_t *w0, const int32_t *n1,
97     const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) {
98   int32x4_t x;
99   x = vmlaq_n_s32(*rnding, *w0, -*n0);
100   x = vmlaq_n_s32(x, *w1, -*n1);
101   x = vshlq_s32(x, *v_bit);
102   return x;
103 }
104 
105 static inline int32x4_t half_btf_neon_mode01_r(
106     const int32_t *n0, const int32x4_t *w0, const int32_t *n1,
107     const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) {
108   int32x4_t x;
109   x = vmlaq_n_s32(*rnding, *w0, *n0);
110   x = vmlsq_n_s32(x, *w1, *n1);
111   x = vshlq_s32(x, *v_bit);
112   return x;
113 }
114 
115 static inline int32x4_t half_btf_neon_mode10_r(
116     const int32_t *n0, const int32x4_t *w0, const int32_t *n1,
117     const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) {
118   int32x4_t x;
119   x = vmlaq_n_s32(*rnding, *w1, *n1);
120   x = vmlsq_n_s32(x, *w0, *n0);
121   x = vshlq_s32(x, *v_bit);
122   return x;
123 }
124 
125 static inline int32x4_t half_btf_0_neon_r(const int32_t *n0,
126                                           const int32x4_t *w0,
127                                           const int32x4_t *v_bit,
128                                           const int32x4_t *rnding) {
129   int32x4_t x;
130   x = vmlaq_n_s32(*rnding, *w0, *n0);
131   x = vshlq_s32(x, *v_bit);
132   return x;
133 }
134 
135 static inline int32x4_t half_btf_0_m_neon_r(const int32_t *n0,
136                                             const int32x4_t *w0,
137                                             const int32x4_t *v_bit,
138                                             const int32x4_t *rnding) {
139   int32x4_t x;
140   x = vmlaq_n_s32(*rnding, *w0, -*n0);
141   x = vshlq_s32(x, *v_bit);
142   return x;
143 }
144 
145 static inline void flip_buf_neon(int32x4_t *in, int32x4_t *out, int size) {
146   for (int i = 0; i < size; ++i) {
147     out[size - i - 1] = in[i];
148   }
149 }
150 
151 typedef void (*fwd_transform_1d_neon)(int32x4_t *in, int32x4_t *out, int bit,
152                                       const int num_cols);
153 
154 typedef void (*transform_1d_neon)(int32x4_t *in, int32x4_t *out, int32_t bit,
155                                   int32_t do_cols, int32_t bd,
156                                   int32_t out_shift);
157 
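// Clamp every 16-bit lane of *u to the range [*min, *max]; lanes are compared
// as signed 16-bit values.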
158 static inline uint16x8_t highbd_clamp_u16(uint16x8_t *u, const uint16x8_t *min,
159                                           const uint16x8_t *max) {
160   int16x8_t clamped;
161   clamped = vminq_s16(vreinterpretq_s16_u16(*u), vreinterpretq_s16_u16(*max));
162   clamped = vmaxq_s16(clamped, vreinterpretq_s16_u16(*min));
163   return vreinterpretq_u16_s16(clamped);
164 }
165 
166 static inline void round_shift_4x4(int32x4_t *in, int shift) {
167   if (shift != 0) {
168     const int32x4_t v_shift = vdupq_n_s32(-shift);
169     in[0] = vrshlq_s32(in[0], v_shift);
170     in[1] = vrshlq_s32(in[1], v_shift);
171     in[2] = vrshlq_s32(in[2], v_shift);
172     in[3] = vrshlq_s32(in[3], v_shift);
173   }
174 }
175 
176 static void round_shift_8x8(int32x4_t *in, int shift) {
177   assert(shift != 0);
178   const int32x4_t v_shift = vdupq_n_s32(-shift);
179   in[0] = vrshlq_s32(in[0], v_shift);
180   in[1] = vrshlq_s32(in[1], v_shift);
181   in[2] = vrshlq_s32(in[2], v_shift);
182   in[3] = vrshlq_s32(in[3], v_shift);
183   in[4] = vrshlq_s32(in[4], v_shift);
184   in[5] = vrshlq_s32(in[5], v_shift);
185   in[6] = vrshlq_s32(in[6], v_shift);
186   in[7] = vrshlq_s32(in[7], v_shift);
187   in[8] = vrshlq_s32(in[8], v_shift);
188   in[9] = vrshlq_s32(in[9], v_shift);
189   in[10] = vrshlq_s32(in[10], v_shift);
190   in[11] = vrshlq_s32(in[11], v_shift);
191   in[12] = vrshlq_s32(in[12], v_shift);
192   in[13] = vrshlq_s32(in[13], v_shift);
193   in[14] = vrshlq_s32(in[14], v_shift);
194   in[15] = vrshlq_s32(in[15], v_shift);
195 }
196 
197 static void highbd_clamp_s32_neon(int32x4_t *in, int32x4_t *out,
198                                   const int32x4_t *clamp_lo,
199                                   const int32x4_t *clamp_hi, int size) {
200   int32x4_t a0, a1;
201   for (int i = 0; i < size; i += 4) {
202     a0 = vmaxq_s32(in[i], *clamp_lo);
203     out[i] = vminq_s32(a0, *clamp_hi);
204 
205     a1 = vmaxq_s32(in[i + 1], *clamp_lo);
206     out[i + 1] = vminq_s32(a1, *clamp_hi);
207 
208     a0 = vmaxq_s32(in[i + 2], *clamp_lo);
209     out[i + 2] = vminq_s32(a0, *clamp_hi);
210 
211     a1 = vmaxq_s32(in[i + 3], *clamp_lo);
212     out[i + 3] = vminq_s32(a1, *clamp_hi);
213   }
214 }
215 
216 static inline uint16x8_t highbd_get_recon_8x8_neon(const uint16x8_t pred,
217                                                    int32x4_t res0,
218                                                    int32x4_t res1,
219                                                    const int bd) {
220   const uint16x8_t v_zero = vdupq_n_u16(0);
221   int32x4_t min_clip_val = vreinterpretq_s32_u16(v_zero);
222   int32x4_t max_clip_val = vdupq_n_s32((1 << bd) - 1);
223   uint16x8x2_t x;
224   x.val[0] = vreinterpretq_u16_s32(
225       vaddw_s16(res0, vreinterpret_s16_u16(vget_low_u16(pred))));
226   x.val[1] = vreinterpretq_u16_s32(
227       vaddw_s16(res1, vreinterpret_s16_u16(vget_high_u16(pred))));
228   x.val[0] = vreinterpretq_u16_s32(
229       vmaxq_s32(vreinterpretq_s32_u16(x.val[0]), min_clip_val));
230   x.val[0] = vreinterpretq_u16_s32(
231       vminq_s32(vreinterpretq_s32_u16(x.val[0]), max_clip_val));
232   x.val[1] = vreinterpretq_u16_s32(
233       vmaxq_s32(vreinterpretq_s32_u16(x.val[1]), min_clip_val));
234   x.val[1] = vreinterpretq_u16_s32(
235       vminq_s32(vreinterpretq_s32_u16(x.val[1]), max_clip_val));
236   uint16x8_t res = vcombine_u16(vqmovn_u32(vreinterpretq_u32_u16(x.val[0])),
237                                 vqmovn_u32(vreinterpretq_u32_u16(x.val[1])));
238   return res;
239 }
240 
241 static inline uint16x4_t highbd_get_recon_4xn_neon(uint16x4_t pred,
242                                                    int32x4_t res0,
243                                                    const int bd) {
244   uint16x4_t x0_ = vreinterpret_u16_s16(
245       vmovn_s32(vaddw_s16(res0, vreinterpret_s16_u16(pred))));
246   uint16x8_t x0 = vcombine_u16(x0_, x0_);
247   const uint16x8_t vmin = vdupq_n_u16(0);
248   const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1);
249   x0 = highbd_clamp_u16(&x0, &vmin, &vmax);
250   return vget_low_u16(x0);
251 }
252 
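// Add 4-wide residual vectors to the prediction already in 'output', clamp to
// [0, (1 << bd) - 1] and store back. flipud reverses the order in which the
// residual rows in 'in' are consumed.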
253 static inline void highbd_write_buffer_4xn_neon(int32x4_t *in, uint16_t *output,
254                                                 int stride, int flipud,
255                                                 int height, const int bd) {
256   int j = flipud ? (height - 1) : 0;
257   const int step = flipud ? -1 : 1;
258   for (int i = 0; i < height; ++i, j += step) {
259     uint16x4_t v = vld1_u16(output + i * stride);
260     uint16x4_t u = highbd_get_recon_4xn_neon(v, in[j], bd);
261 
262     vst1_u16(output + i * stride, u);
263   }
264 }
265 
266 static inline void highbd_write_buffer_8xn_neon(int32x4_t *in, uint16_t *output,
267                                                 int stride, int flipud,
268                                                 int height, const int bd) {
269   int j = flipud ? (height - 1) : 0;
270   const int step = flipud ? -1 : 1;
271   for (int i = 0; i < height; ++i, j += step) {
272     uint16x8_t v = vld1q_u16(output + i * stride);
273     uint16x8_t u = highbd_get_recon_8x8_neon(v, in[j], in[j + height], bd);
274 
275     vst1q_u16(output + i * stride, u);
276   }
277 }
278 
279 static inline void load_buffer_32bit_input(const int32_t *in, int stride,
280                                            int32x4_t *out, int out_size) {
281   for (int i = 0; i < out_size; ++i) {
282     out[i] = vld1q_s32(in + i * stride);
283   }
284 }
285 
286 static inline void load_buffer_4x4(const int32_t *coeff, int32x4_t *in) {
287   in[0] = vld1q_s32(coeff + 0);
288   in[1] = vld1q_s32(coeff + 4);
289   in[2] = vld1q_s32(coeff + 8);
290   in[3] = vld1q_s32(coeff + 12);
291 }
292 
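// Butterfly: *out0 = in0 + in1 and *out1 = in0 - in1, with both results
// clamped to [*clamp_lo, *clamp_hi].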
293 static void addsub_neon(const int32x4_t in0, const int32x4_t in1,
294                         int32x4_t *out0, int32x4_t *out1,
295                         const int32x4_t *clamp_lo, const int32x4_t *clamp_hi) {
296   int32x4_t a0 = vaddq_s32(in0, in1);
297   int32x4_t a1 = vsubq_s32(in0, in1);
298 
299   a0 = vmaxq_s32(a0, *clamp_lo);
300   a0 = vminq_s32(a0, *clamp_hi);
301   a1 = vmaxq_s32(a1, *clamp_lo);
302   a1 = vminq_s32(a1, *clamp_hi);
303 
304   *out0 = a0;
305   *out1 = a1;
306 }
307 
308 static void shift_and_clamp_neon(int32x4_t *in0, int32x4_t *in1,
309                                  const int32x4_t *clamp_lo,
310                                  const int32x4_t *clamp_hi,
311                                  const int32x4_t *v_shift) {
312   int32x4_t in0_w_offset = vrshlq_s32(*in0, *v_shift);
313   int32x4_t in1_w_offset = vrshlq_s32(*in1, *v_shift);
314 
315   in0_w_offset = vmaxq_s32(in0_w_offset, *clamp_lo);
316   in0_w_offset = vminq_s32(in0_w_offset, *clamp_hi);
317   in1_w_offset = vmaxq_s32(in1_w_offset, *clamp_lo);
318   in1_w_offset = vminq_s32(in1_w_offset, *clamp_hi);
319 
320   *in0 = in0_w_offset;
321   *in1 = in1_w_offset;
322 }
323 
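// idct32_stage[4-9]_neon: helpers that apply one butterfly stage of the
// 32-point inverse DCT to the 32-entry bf1[] working array; stage 9 also
// writes the final outputs with optional rounding and clamping.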
324 static inline void idct32_stage4_neon(int32x4_t *bf1, const int32_t *cospi,
325                                       const int32x4_t *v_bit,
326                                       const int32x4_t *rnding) {
327   int32x4_t temp1, temp2;
328   temp1 = half_btf_neon_mode10_r(&cospi[8], &bf1[17], &cospi[56], &bf1[30],
329                                  v_bit, rnding);
330   bf1[30] =
331       half_btf_neon_r(&cospi[56], &bf1[17], &cospi[8], &bf1[30], v_bit, rnding);
332   bf1[17] = temp1;
333 
334   temp2 = half_btf_neon_mode11_r(&cospi[56], &bf1[18], &cospi[8], &bf1[29],
335                                  v_bit, rnding);
336   bf1[29] = half_btf_neon_mode10_r(&cospi[8], &bf1[18], &cospi[56], &bf1[29],
337                                    v_bit, rnding);
338   bf1[18] = temp2;
339 
340   temp1 = half_btf_neon_mode10_r(&cospi[40], &bf1[21], &cospi[24], &bf1[26],
341                                  v_bit, rnding);
342   bf1[26] = half_btf_neon_r(&cospi[24], &bf1[21], &cospi[40], &bf1[26], v_bit,
343                             rnding);
344   bf1[21] = temp1;
345 
346   temp2 = half_btf_neon_mode11_r(&cospi[24], &bf1[22], &cospi[40], &bf1[25],
347                                  v_bit, rnding);
348   bf1[25] = half_btf_neon_mode10_r(&cospi[40], &bf1[22], &cospi[24], &bf1[25],
349                                    v_bit, rnding);
350   bf1[22] = temp2;
351 }
352 
353 static inline void idct32_stage5_neon(int32x4_t *bf1, const int32_t *cospi,
354                                       const int32x4_t *clamp_lo,
355                                       const int32x4_t *clamp_hi,
356                                       const int32x4_t *v_bit,
357                                       const int32x4_t *rnding) {
358   int32x4_t temp1, temp2;
359   temp1 = half_btf_neon_mode10_r(&cospi[16], &bf1[9], &cospi[48], &bf1[14],
360                                  v_bit, rnding);
361   bf1[14] =
362       half_btf_neon_r(&cospi[48], &bf1[9], &cospi[16], &bf1[14], v_bit, rnding);
363   bf1[9] = temp1;
364 
365   temp2 = half_btf_neon_mode11_r(&cospi[48], &bf1[10], &cospi[16], &bf1[13],
366                                  v_bit, rnding);
367   bf1[13] = half_btf_neon_mode10_r(&cospi[16], &bf1[10], &cospi[48], &bf1[13],
368                                    v_bit, rnding);
369   bf1[10] = temp2;
370 
371   addsub_neon(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
372   addsub_neon(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
373   addsub_neon(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
374   addsub_neon(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
375   addsub_neon(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
376   addsub_neon(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
377   addsub_neon(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
378   addsub_neon(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
379 }
380 
381 static inline void idct32_stage6_neon(int32x4_t *bf1, const int32_t *cospi,
382                                       const int32x4_t *clamp_lo,
383                                       const int32x4_t *clamp_hi,
384                                       const int32x4_t *v_bit,
385                                       const int32x4_t *rnding) {
386   int32x4_t temp1, temp2;
387   temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6],
388                                  v_bit, rnding);
389   bf1[6] =
390       half_btf_neon_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], v_bit, rnding);
391   bf1[5] = temp1;
392 
393   addsub_neon(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
394   addsub_neon(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
395   addsub_neon(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
396   addsub_neon(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
397 
398   temp1 = half_btf_neon_mode10_r(&cospi[16], &bf1[18], &cospi[48], &bf1[29],
399                                  v_bit, rnding);
400   bf1[29] = half_btf_neon_r(&cospi[48], &bf1[18], &cospi[16], &bf1[29], v_bit,
401                             rnding);
402   bf1[18] = temp1;
403   temp2 = half_btf_neon_mode10_r(&cospi[16], &bf1[19], &cospi[48], &bf1[28],
404                                  v_bit, rnding);
405   bf1[28] = half_btf_neon_r(&cospi[48], &bf1[19], &cospi[16], &bf1[28], v_bit,
406                             rnding);
407   bf1[19] = temp2;
408   temp1 = half_btf_neon_mode11_r(&cospi[48], &bf1[20], &cospi[16], &bf1[27],
409                                  v_bit, rnding);
410   bf1[27] = half_btf_neon_mode10_r(&cospi[16], &bf1[20], &cospi[48], &bf1[27],
411                                    v_bit, rnding);
412   bf1[20] = temp1;
413   temp2 = half_btf_neon_mode11_r(&cospi[48], &bf1[21], &cospi[16], &bf1[26],
414                                  v_bit, rnding);
415   bf1[26] = half_btf_neon_mode10_r(&cospi[16], &bf1[21], &cospi[48], &bf1[26],
416                                    v_bit, rnding);
417   bf1[21] = temp2;
418 }
419 
420 static inline void idct32_stage7_neon(int32x4_t *bf1, const int32_t *cospi,
421                                       const int32x4_t *clamp_lo,
422                                       const int32x4_t *clamp_hi,
423                                       const int32x4_t *v_bit,
424                                       const int32x4_t *rnding) {
425   int32x4_t temp1, temp2;
426   addsub_neon(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
427   addsub_neon(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
428   addsub_neon(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
429   addsub_neon(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
430   temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[10], &cospi[32], &bf1[13],
431                                  v_bit, rnding);
432   bf1[13] = half_btf_neon_r(&cospi[32], &bf1[10], &cospi[32], &bf1[13], v_bit,
433                             rnding);
434   bf1[10] = temp1;
435   temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[11], &cospi[32], &bf1[12],
436                                  v_bit, rnding);
437   bf1[12] = half_btf_neon_r(&cospi[32], &bf1[11], &cospi[32], &bf1[12], v_bit,
438                             rnding);
439   bf1[11] = temp2;
440 
441   addsub_neon(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
442   addsub_neon(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
443   addsub_neon(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
444   addsub_neon(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
445   addsub_neon(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
446   addsub_neon(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
447   addsub_neon(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
448   addsub_neon(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
449 }
450 
451 static inline void idct32_stage8_neon(int32x4_t *bf1, const int32_t *cospi,
452                                       const int32x4_t *clamp_lo,
453                                       const int32x4_t *clamp_hi,
454                                       const int32x4_t *v_bit,
455                                       const int32x4_t *rnding) {
456   int32x4_t temp1, temp2;
457   addsub_neon(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
458   addsub_neon(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
459   addsub_neon(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
460   addsub_neon(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
461   addsub_neon(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
462   addsub_neon(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
463   addsub_neon(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
464   addsub_neon(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
465   temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27],
466                                  v_bit, rnding);
467   bf1[27] = half_btf_neon_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27], v_bit,
468                             rnding);
469   bf1[20] = temp1;
470   temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26],
471                                  v_bit, rnding);
472   bf1[26] = half_btf_neon_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26], v_bit,
473                             rnding);
474   bf1[21] = temp2;
475   temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25],
476                                  v_bit, rnding);
477   bf1[25] = half_btf_neon_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25], v_bit,
478                             rnding);
479   bf1[22] = temp1;
480   temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24],
481                                  v_bit, rnding);
482   bf1[24] = half_btf_neon_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24], v_bit,
483                             rnding);
484   bf1[23] = temp2;
485 }
486 
487 static inline void idct32_stage9_neon(int32x4_t *bf1, int32x4_t *out,
488                                       const int do_cols, const int bd,
489                                       const int out_shift,
490                                       const int32x4_t *clamp_lo,
491                                       const int32x4_t *clamp_hi) {
492   addsub_neon(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi);
493   addsub_neon(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi);
494   addsub_neon(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi);
495   addsub_neon(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi);
496   addsub_neon(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi);
497   addsub_neon(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi);
498   addsub_neon(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi);
499   addsub_neon(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi);
500   addsub_neon(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi);
501   addsub_neon(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi);
502   addsub_neon(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi);
503   addsub_neon(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi);
504   addsub_neon(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi);
505   addsub_neon(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi);
506   addsub_neon(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi);
507   addsub_neon(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi);
508 
509   if (!do_cols) {
510     const int log_range_out = AOMMAX(16, bd + 6);
511     const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
512     const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
513     for (int i = 0; i < 32; i += 8) {
514       round_shift_4x4(out + i, out_shift);
515       round_shift_4x4(out + i + 4, out_shift);
516     }
517     highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32);
518   }
519 }
520 
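// Compute (*offset + in0) and (*offset - in1), shift both by *v_shift and
// clamp to [*clamp_lo, *clamp_hi]. Callers pass the rounding term
// ((1 << out_shift) >> 1) and -out_shift to get a rounding right shift of
// in0 and -in1.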
521 static void neg_shift_neon(const int32x4_t *in0, const int32x4_t *in1,
522                            int32x4_t *out0, int32x4_t *out1,
523                            const int32x4_t *clamp_lo, const int32x4_t *clamp_hi,
524                            const int32x4_t *v_shift, int32x4_t *offset) {
525   int32x4_t a0 = vaddq_s32(*offset, *in0);
526   int32x4_t a1 = vsubq_s32(*offset, *in1);
527 
528   a0 = vshlq_s32(a0, *v_shift);
529   a1 = vshlq_s32(a1, *v_shift);
530 
531   a0 = vmaxq_s32(a0, *clamp_lo);
532   a0 = vminq_s32(a0, *clamp_hi);
533   a1 = vmaxq_s32(a1, *clamp_lo);
534   a1 = vminq_s32(a1, *clamp_hi);
535 
536   *out0 = a0;
537   *out1 = a1;
538 }
539 
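// 4-point inverse DCT, processing four independent length-4 inputs in
// parallel (one per vector lane). When do_cols is 0 the results are
// additionally round-shifted by out_shift and re-clamped.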
540 static void idct4x4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
541                          int bd, int out_shift) {
542   const int32_t *cospi = cospi_arr(bit);
543   int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
544   int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
545   int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
546   int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
547 
548   int32x4_t u0, u1, u2, u3;
549   int32x4_t v0, v1, v2, v3, x, y;
550 
551   // Stage 0-1-2
552 
553   u0 = in[0];
554   u1 = in[1];
555   u2 = in[2];
556   u3 = in[3];
557 
558   const int32x4_t v_bit = vdupq_n_s32(-bit);
559 
560   x = vmlaq_n_s32(rnding, u0, cospi[32]);
561   y = vmulq_n_s32(u2, cospi[32]);
562   v0 = vaddq_s32(x, y);
563   v0 = vshlq_s32(v0, v_bit);
564 
565   v1 = vsubq_s32(x, y);
566   v1 = vshlq_s32(v1, v_bit);
567 
568   x = vmlaq_n_s32(rnding, u1, cospi[48]);
569   v2 = vmlsq_n_s32(x, u3, cospi[16]);
570   v2 = vshlq_s32(v2, v_bit);
571 
572   x = vmlaq_n_s32(rnding, u1, cospi[16]);
573   v3 = vmlaq_n_s32(x, u3, cospi[48]);
574   v3 = vshlq_s32(v3, v_bit);
575   // Stage 3
576   addsub_neon(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi);
577   addsub_neon(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi);
578 
579   if (!do_cols) {
580     log_range = AOMMAX(16, bd + 6);
581     clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
582     clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
583     const int32x4_t v_shift = vdupq_n_s32(-out_shift);
584     shift_and_clamp_neon(out + 0, out + 3, &clamp_lo, &clamp_hi, &v_shift);
585     shift_and_clamp_neon(out + 1, out + 2, &clamp_lo, &clamp_hi, &v_shift);
586   }
587 }
588 
589 static void iadst4x4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
590                           int bd, int out_shift) {
591   const int32_t *sinpi = sinpi_arr(bit);
592   const int32x4_t zero = vdupq_n_s32(0);
593   int64x2_t rnding = vdupq_n_s64(1ll << (bit + 4 - 1));
594   const int32x2_t mul = vdup_n_s32(1 << 4);
595   int32x4_t t;
596   int32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
597   int32x4_t x0, x1, x2, x3;
598   int32x4_t u0, u1, u2, u3;
599 
600   x0 = in[0];
601   x1 = in[1];
602   x2 = in[2];
603   x3 = in[3];
604 
605   s0 = vmulq_n_s32(x0, sinpi[1]);
606   s1 = vmulq_n_s32(x0, sinpi[2]);
607   s2 = vmulq_n_s32(x1, sinpi[3]);
608   s3 = vmulq_n_s32(x2, sinpi[4]);
609   s4 = vmulq_n_s32(x2, sinpi[1]);
610   s5 = vmulq_n_s32(x3, sinpi[2]);
611   s6 = vmulq_n_s32(x3, sinpi[4]);
612   t = vsubq_s32(x0, x2);
613   s7 = vaddq_s32(t, x3);
614 
615   t = vaddq_s32(s0, s3);
616   s0 = vaddq_s32(t, s5);
617   t = vsubq_s32(s1, s4);
618   s1 = vsubq_s32(t, s6);
619   s3 = s2;
620   s2 = vmulq_n_s32(s7, sinpi[3]);
621 
622   u0 = vaddq_s32(s0, s3);
623   u1 = vaddq_s32(s1, s3);
624   u2 = s2;
625   t = vaddq_s32(s0, s1);
626   u3 = vsubq_s32(t, s3);
627 
628   // u0
629   int32x4x2_t u0x;
630   u0x.val[0] = vreinterpretq_s32_s64(
631       vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u0)), mul));
632   u0x.val[0] = vreinterpretq_s32_s64(
633       vaddq_s64(vreinterpretq_s64_s32(u0x.val[0]), rnding));
634 
635   u0 = vextq_s32(u0, zero, 1);
636   u0x.val[1] = vreinterpretq_s32_s64(
637       vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u0)), mul));
638   u0x.val[1] = vreinterpretq_s32_s64(
639       vaddq_s64(vreinterpretq_s64_s32(u0x.val[1]), rnding));
640 
641   u0x.val[0] = vreinterpretq_s32_s16(vextq_s16(
642       vreinterpretq_s16_s32(u0x.val[0]), vreinterpretq_s16_s32(zero), 1));
643   u0x.val[1] = vreinterpretq_s32_s16(vextq_s16(
644       vreinterpretq_s16_s32(u0x.val[1]), vreinterpretq_s16_s32(zero), 1));
645 
646   u0x = vzipq_s32(u0x.val[0], u0x.val[1]);
647 #if AOM_ARCH_AARCH64
648   u0 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u0x.val[0]),
649                                         vreinterpretq_s64_s32(u0x.val[1])));
650 #else
651   u0 = vcombine_s32(vget_low_s32(u0x.val[0]), vget_low_s32(u0x.val[1]));
652 #endif  // AOM_ARCH_AARCH64
653   // u1
654   int32x4x2_t u1x;
655   u1x.val[0] = vreinterpretq_s32_s64(
656       vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u1)), mul));
657   u1x.val[0] = vreinterpretq_s32_s64(
658       vaddq_s64(vreinterpretq_s64_s32(u1x.val[0]), rnding));
659 
660   u1 = vextq_s32(u1, zero, 1);
661   u1x.val[1] = vreinterpretq_s32_s64(
662       vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u1)), mul));
663   u1x.val[1] = vreinterpretq_s32_s64(
664       vaddq_s64(vreinterpretq_s64_s32(u1x.val[1]), rnding));
665 
666   u1x.val[0] = vreinterpretq_s32_s16(vextq_s16(
667       vreinterpretq_s16_s32(u1x.val[0]), vreinterpretq_s16_s32(zero), 1));
668   u1x.val[1] = vreinterpretq_s32_s16(vextq_s16(
669       vreinterpretq_s16_s32(u1x.val[1]), vreinterpretq_s16_s32(zero), 1));
670 
671   u1x = vzipq_s32(u1x.val[0], u1x.val[1]);
672 #if AOM_ARCH_AARCH64
673   u1 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u1x.val[0]),
674                                         vreinterpretq_s64_s32(u1x.val[1])));
675 #else
676   u1 = vcombine_s32(vget_low_s32(u1x.val[0]), vget_low_s32(u1x.val[1]));
677 #endif  // AOM_ARCH_AARCH64
678 
679   // u2
680   int32x4x2_t u2x;
681   u2x.val[0] = vreinterpretq_s32_s64(
682       vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u2)), mul));
683   u2x.val[0] = vreinterpretq_s32_s64(
684       vaddq_s64(vreinterpretq_s64_s32(u2x.val[0]), rnding));
685 
686   u2 = vextq_s32(u2, zero, 1);
687   u2x.val[1] = vreinterpretq_s32_s64(
688       vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u2)), mul));
689   u2x.val[1] = vreinterpretq_s32_s64(
690       vaddq_s64(vreinterpretq_s64_s32(u2x.val[1]), rnding));
691 
692   u2x.val[0] = vreinterpretq_s32_s16(vextq_s16(
693       vreinterpretq_s16_s32(u2x.val[0]), vreinterpretq_s16_s32(zero), 1));
694   u2x.val[1] = vreinterpretq_s32_s16(vextq_s16(
695       vreinterpretq_s16_s32(u2x.val[1]), vreinterpretq_s16_s32(zero), 1));
696 
697   u2x = vzipq_s32(u2x.val[0], u2x.val[1]);
698 #if AOM_ARCH_AARCH64
699   u2 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u2x.val[0]),
700                                         vreinterpretq_s64_s32(u2x.val[1])));
701 #else
702   u2 = vcombine_s32(vget_low_s32(u2x.val[0]), vget_low_s32(u2x.val[1]));
703 #endif  // AOM_ARCH_AARCH64
704 
705   // u3
706   int32x4x2_t u3x;
707   u3x.val[0] = vreinterpretq_s32_s64(
708       vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u3)), mul));
709   u3x.val[0] = vreinterpretq_s32_s64(
710       vaddq_s64(vreinterpretq_s64_s32(u3x.val[0]), rnding));
711 
712   u3 = vextq_s32(u3, zero, 1);
713   u3x.val[1] = vreinterpretq_s32_s64(
714       vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u3)), mul));
715   u3x.val[1] = vreinterpretq_s32_s64(
716       vaddq_s64(vreinterpretq_s64_s32(u3x.val[1]), rnding));
717 
718   u3x.val[0] = vreinterpretq_s32_s16(vextq_s16(
719       vreinterpretq_s16_s32(u3x.val[0]), vreinterpretq_s16_s32(zero), 1));
720   u3x.val[1] = vreinterpretq_s32_s16(vextq_s16(
721       vreinterpretq_s16_s32(u3x.val[1]), vreinterpretq_s16_s32(zero), 1));
722 
723   u3x = vzipq_s32(u3x.val[0], u3x.val[1]);
724 #if AOM_ARCH_AARCH64
725   u3 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u3x.val[0]),
726                                         vreinterpretq_s64_s32(u3x.val[1])));
727 #else
728   u3 = vcombine_s32(vget_low_s32(u3x.val[0]), vget_low_s32(u3x.val[1]));
729 #endif  // AOM_ARCH_AARCH64
730 
731   out[0] = u0;
732   out[1] = u1;
733   out[2] = u2;
734   out[3] = u3;
735 
736   if (!do_cols) {
737     const int log_range = AOMMAX(16, bd + 6);
738     const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
739     const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
740     round_shift_4x4(out, out_shift);
741     highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 4);
742   }
743 }
744 
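// Round-shift the 4x4 residual, optionally mirror it horizontally (fliplr)
// and/or vertically (flipud), add it to the prediction in 'output' and clamp
// the reconstruction to [0, (1 << bd) - 1].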
745 static void write_buffer_4x4(int32x4_t *in, uint16_t *output, int stride,
746                              int fliplr, int flipud, int shift, int bd) {
747   uint32x4_t u0, u1, u2, u3;
748   uint16x4_t v0, v1, v2, v3;
749   round_shift_4x4(in, shift);
750 
751   v0 = vld1_u16(output + 0 * stride);
752   v1 = vld1_u16(output + 1 * stride);
753   v2 = vld1_u16(output + 2 * stride);
754   v3 = vld1_u16(output + 3 * stride);
755 
756   if (fliplr) {
757     u0 = vrev64q_u32(vreinterpretq_u32_s32(in[0]));
758     in[0] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2));
759     u0 = vrev64q_u32(vreinterpretq_u32_s32(in[1]));
760     in[1] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2));
761     u0 = vrev64q_u32(vreinterpretq_u32_s32(in[2]));
762     in[2] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2));
763     u0 = vrev64q_u32(vreinterpretq_u32_s32(in[3]));
764     in[3] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2));
765   }
766 
767   if (flipud) {
768     u0 = vaddw_u16(vreinterpretq_u32_s32(in[3]), v0);
769     u1 = vaddw_u16(vreinterpretq_u32_s32(in[2]), v1);
770     u2 = vaddw_u16(vreinterpretq_u32_s32(in[1]), v2);
771     u3 = vaddw_u16(vreinterpretq_u32_s32(in[0]), v3);
772   } else {
773     u0 = vaddw_u16(vreinterpretq_u32_s32(in[0]), v0);
774     u1 = vaddw_u16(vreinterpretq_u32_s32(in[1]), v1);
775     u2 = vaddw_u16(vreinterpretq_u32_s32(in[2]), v2);
776     u3 = vaddw_u16(vreinterpretq_u32_s32(in[3]), v3);
777   }
778 
779   uint16x8_t u4 = vcombine_u16(vqmovn_u32(u0), vqmovn_u32(u1));
780   uint16x8_t u5 = vcombine_u16(vqmovn_u32(u2), vqmovn_u32(u3));
781   const uint16x8_t vmin = vdupq_n_u16(0);
782   const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1);
783   u4 = highbd_clamp_u16(&u4, &vmin, &vmax);
784   u5 = highbd_clamp_u16(&u5, &vmin, &vmax);
785 
786   vst1_u16(output + 0 * stride, vget_low_u16(u4));
787   vst1_u16(output + 1 * stride, vget_high_u16(u4));
788   vst1_u16(output + 2 * stride, vget_low_u16(u5));
789   vst1_u16(output + 3 * stride, vget_high_u16(u5));
790 }
791 
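// Identity transform for 4-wide rows: every coefficient is scaled by
// NewSqrt2 / 2^NewSqrt2Bits with rounding, using 64-bit intermediates.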
792 static void iidentity4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
793                             int bd, int out_shift) {
794   (void)bit;
795   int32x4_t zero = vdupq_n_s32(0);
796   int32x2_t fact = vdup_n_s32(NewSqrt2);
797   int32x4x2_t a0;
798   const int64x2_t rnding = vdupq_n_s64(1 << (NewSqrt2Bits - 1));
799 
800   for (int i = 0; i < 4; i++) {
801     a0.val[0] = vreinterpretq_s32_s64(
802         vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(in[i])), fact));
803     a0.val[0] = vreinterpretq_s32_s64(
804         vshrq_n_s64(vreinterpretq_s64_s32(a0.val[0]), NewSqrt2Bits));
805     a0.val[1] = vextq_s32(in[i], zero, 1);
806     a0.val[1] = vreinterpretq_s32_s64(
807         vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(a0.val[1])), fact));
808     a0.val[1] = vreinterpretq_s32_s64(
809         vshrq_n_s64(vreinterpretq_s64_s32(a0.val[1]), NewSqrt2Bits));
810 
811     a0 = vzipq_s32(a0.val[0], a0.val[1]);
812 #if AOM_ARCH_AARCH64
813     out[i] = vreinterpretq_s32_s64(vzip1q_s64(
814         vreinterpretq_s64_s32(a0.val[0]), vreinterpretq_s64_s32(a0.val[1])));
815 #else
816     out[i] = vextq_s32(vextq_s32(a0.val[0], a0.val[0], 2), a0.val[1], 2);
817 #endif
818   }
819   if (!do_cols) {
820     const int log_range = AOMMAX(16, bd + 6);
821     const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
822     const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
823     round_shift_4x4(out, out_shift);
824     highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 4);
825   }
826 }
827 
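// 4x4 2D inverse transform plus reconstruction: first 1D pass, transpose,
// second 1D pass, then write_buffer_4x4 adds the residual to 'output'. The
// flip arguments of write_buffer_4x4 implement the FLIPADST variants.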
828 void av1_inv_txfm2d_add_4x4_neon(const int32_t *input, uint16_t *output,
829                                  int stride, TX_TYPE tx_type, int bd) {
830   int32x4_t in[4];
831 
832   const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4];
833 
834   switch (tx_type) {
835     case DCT_DCT:
836       load_buffer_4x4(input, in);
837       idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
838       transpose_4x4(in, in);
839       idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
840       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
841       break;
842     case ADST_DCT:
843       load_buffer_4x4(input, in);
844       idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
845       transpose_4x4(in, in);
846       iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
847       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
848       break;
849     case DCT_ADST:
850       load_buffer_4x4(input, in);
851       iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
852       transpose_4x4(in, in);
853       idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
854       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
855       break;
856     case ADST_ADST:
857       load_buffer_4x4(input, in);
858       iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
859       transpose_4x4(in, in);
860       iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
861       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
862       break;
863     case FLIPADST_DCT:
864       load_buffer_4x4(input, in);
865       idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
866       transpose_4x4(in, in);
867       iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
868       write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
869       break;
870     case DCT_FLIPADST:
871       load_buffer_4x4(input, in);
872       iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
873       transpose_4x4(in, in);
874       idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
875       write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
876       break;
877     case FLIPADST_FLIPADST:
878       load_buffer_4x4(input, in);
879       iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
880       transpose_4x4(in, in);
881       iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
882       write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
883       break;
884     case ADST_FLIPADST:
885       load_buffer_4x4(input, in);
886       iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
887       transpose_4x4(in, in);
888       iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
889       write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
890       break;
891     case FLIPADST_ADST:
892       load_buffer_4x4(input, in);
893       iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
894       transpose_4x4(in, in);
895       iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
896       write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
897       break;
898     case IDTX:
899       load_buffer_4x4(input, in);
900       iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0);
901       transpose_4x4(in, in);
902       iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0);
903       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
904       break;
905     case V_DCT:
906       load_buffer_4x4(input, in);
907       iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0);
908       transpose_4x4(in, in);
909       idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
910       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
911       break;
912     case H_DCT:
913       load_buffer_4x4(input, in);
914       idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
915       transpose_4x4(in, in);
916       iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0);
917       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
918       break;
919     case V_ADST:
920       load_buffer_4x4(input, in);
921       iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0);
922       transpose_4x4(in, in);
923       iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
924       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
925       break;
926     case H_ADST:
927       load_buffer_4x4(input, in);
928       iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
929       transpose_4x4(in, in);
930       iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0);
931       write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
932       break;
933     case V_FLIPADST:
934       load_buffer_4x4(input, in);
935       iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0);
936       transpose_4x4(in, in);
937       iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
938       write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
939       break;
940     case H_FLIPADST:
941       load_buffer_4x4(input, in);
942       iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
943       transpose_4x4(in, in);
944       iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0);
945       write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
946       break;
947     default: assert(0);
948   }
949 }
950 
951 // 8x8
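// Load an 8x8 block of 32-bit coefficients into 16 int32x4_t vectors, two
// vectors per row.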
952 static void load_buffer_8x8(const int32_t *coeff, int32x4_t *in) {
953   in[0] = vld1q_s32(coeff + 0);
954   in[1] = vld1q_s32(coeff + 4);
955   in[2] = vld1q_s32(coeff + 8);
956   in[3] = vld1q_s32(coeff + 12);
957   in[4] = vld1q_s32(coeff + 16);
958   in[5] = vld1q_s32(coeff + 20);
959   in[6] = vld1q_s32(coeff + 24);
960   in[7] = vld1q_s32(coeff + 28);
961   in[8] = vld1q_s32(coeff + 32);
962   in[9] = vld1q_s32(coeff + 36);
963   in[10] = vld1q_s32(coeff + 40);
964   in[11] = vld1q_s32(coeff + 44);
965   in[12] = vld1q_s32(coeff + 48);
966   in[13] = vld1q_s32(coeff + 52);
967   in[14] = vld1q_s32(coeff + 56);
968   in[15] = vld1q_s32(coeff + 60);
969 }
970 
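// 8-point inverse DCT; when do_cols is 0 the outputs are round-shifted by
// out_shift and clamped to the intermediate range before the column pass.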
971 static void idct8x8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
972                          int bd, int out_shift) {
973   const int32_t *cospi = cospi_arr(bit);
974   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
975   const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
976   const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
977   int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
978   int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;
979   int32x4_t x, y;
980   int col;
981   const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
982   const int32x4_t v_bit = vdupq_n_s32(-bit);
983   // Note:
984   //  Even column: 0, 2, ..., 14
985   //  Odd column: 1, 3, ..., 15
986   //  one even column plus one odd column constructs one row (8 coeffs)
987   //  total we have 8 rows (8x8).
988   for (col = 0; col < 2; ++col) {
989     // stage 0
990     // stage 1
991     // stage 2
992     u0 = in[0 * 2 + col];
993     u1 = in[4 * 2 + col];
994     u2 = in[2 * 2 + col];
995     u3 = in[6 * 2 + col];
996 
997     x = vmulq_n_s32(in[1 * 2 + col], cospi[56]);
998     u4 = vmlaq_n_s32(x, in[7 * 2 + col], -cospi[8]);
999     u4 = vaddq_s32(u4, rnding);
1000     u4 = vshlq_s32(u4, v_bit);
1001 
1002     x = vmulq_n_s32(in[1 * 2 + col], cospi[8]);
1003     u7 = vmlaq_n_s32(x, in[7 * 2 + col], cospi[56]);
1004     u7 = vaddq_s32(u7, rnding);
1005     u7 = vshlq_s32(u7, v_bit);
1006 
1007     x = vmulq_n_s32(in[5 * 2 + col], cospi[24]);
1008     u5 = vmlaq_n_s32(x, in[3 * 2 + col], -cospi[40]);
1009     u5 = vaddq_s32(u5, rnding);
1010     u5 = vshlq_s32(u5, v_bit);
1011 
1012     x = vmulq_n_s32(in[5 * 2 + col], cospi[40]);
1013     u6 = vmlaq_n_s32(x, in[3 * 2 + col], cospi[24]);
1014     u6 = vaddq_s32(u6, rnding);
1015     u6 = vshlq_s32(u6, v_bit);
1016 
1017     // stage 3
1018     x = vmulq_n_s32(u0, cospi[32]);
1019     y = vmulq_n_s32(u1, cospi[32]);
1020     v0 = vaddq_s32(x, y);
1021     v0 = vaddq_s32(v0, rnding);
1022     v0 = vshlq_s32(v0, v_bit);
1023 
1024     v1 = vsubq_s32(x, y);
1025     v1 = vaddq_s32(v1, rnding);
1026     v1 = vshlq_s32(v1, v_bit);
1027 
1028     x = vmulq_n_s32(u2, cospi[48]);
1029     v2 = vmlaq_n_s32(x, u3, -cospi[16]);
1030     v2 = vaddq_s32(v2, rnding);
1031     v2 = vshlq_s32(v2, v_bit);
1032 
1033     x = vmulq_n_s32(u2, cospi[16]);
1034     v3 = vmlaq_n_s32(x, u3, cospi[48]);
1035     v3 = vaddq_s32(v3, rnding);
1036     v3 = vshlq_s32(v3, v_bit);
1037 
1038     addsub_neon(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
1039     addsub_neon(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
1040 
1041     // stage 4
1042     addsub_neon(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
1043     addsub_neon(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
1044     u4 = v4;
1045     u7 = v7;
1046 
1047     x = vmulq_n_s32(v5, cospi[32]);
1048     y = vmulq_n_s32(v6, cospi[32]);
1049     u6 = vaddq_s32(y, x);
1050     u6 = vaddq_s32(u6, rnding);
1051     u6 = vshlq_s32(u6, v_bit);
1052 
1053     u5 = vsubq_s32(y, x);
1054     u5 = vaddq_s32(u5, rnding);
1055     u5 = vshlq_s32(u5, v_bit);
1056 
1057     // stage 5
1058     addsub_neon(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, &clamp_lo,
1059                 &clamp_hi);
1060     addsub_neon(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, &clamp_lo,
1061                 &clamp_hi);
1062     addsub_neon(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, &clamp_lo,
1063                 &clamp_hi);
1064     addsub_neon(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, &clamp_lo,
1065                 &clamp_hi);
1066   }
1067 
1068   if (!do_cols) {
1069     const int log_range_out = AOMMAX(16, bd + 6);
1070     const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
1071     const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
1072     round_shift_8x8(out, out_shift);
1073     highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16);
1074   }
1075 }
1076 
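// 8-point inverse ADST, computed first for the even-indexed in[] vectors
// (left four columns) and then for the odd-indexed ones (right four columns).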
1077 static void iadst8x8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
1078                           int bd, int out_shift) {
1079   const int32_t *cospi = cospi_arr(bit);
1080   const int32x4_t kZero = vdupq_n_s32(0);
1081   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1082   const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
1083   const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
1084   int32x4_t u[8], v[8], x;
1085   const int32x4_t v_bit = vdupq_n_s32(-bit);
1086   const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
1087   // stage 0-1-2
1088   // (1)
1089   u[0] = vmlaq_n_s32(rnding, in[14], cospi[4]);
1090   u[0] = vmlaq_n_s32(u[0], in[0], cospi[60]);
1091   u[0] = vshlq_s32(u[0], v_bit);
1092 
1093   u[1] = vmlaq_n_s32(rnding, in[14], cospi[60]);
1094   u[1] = vmlsq_n_s32(u[1], in[0], cospi[4]);
1095   u[1] = vshlq_s32(u[1], v_bit);
1096 
1097   // (2)
1098   u[2] = vmlaq_n_s32(rnding, in[10], cospi[20]);
1099   u[2] = vmlaq_n_s32(u[2], in[4], cospi[44]);
1100   u[2] = vshlq_s32(u[2], v_bit);
1101 
1102   u[3] = vmlaq_n_s32(rnding, in[10], cospi[44]);
1103   u[3] = vmlsq_n_s32(u[3], in[4], cospi[20]);
1104   u[3] = vshlq_s32(u[3], v_bit);
1105 
1106   // (3)
1107   u[4] = vmlaq_n_s32(rnding, in[6], cospi[36]);
1108   u[4] = vmlaq_n_s32(u[4], in[8], cospi[28]);
1109   u[4] = vshlq_s32(u[4], v_bit);
1110 
1111   u[5] = vmlaq_n_s32(rnding, in[6], cospi[28]);
1112   u[5] = vmlsq_n_s32(u[5], in[8], cospi[36]);
1113   u[5] = vshlq_s32(u[5], v_bit);
1114 
1115   // (4)
1116   u[6] = vmlaq_n_s32(rnding, in[2], cospi[52]);
1117   u[6] = vmlaq_n_s32(u[6], in[12], cospi[12]);
1118   u[6] = vshlq_s32(u[6], v_bit);
1119 
1120   u[7] = vmlaq_n_s32(rnding, in[2], cospi[12]);
1121   u[7] = vmlsq_n_s32(u[7], in[12], cospi[52]);
1122   u[7] = vshlq_s32(u[7], v_bit);
1123 
1124   // stage 3
1125   addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
1126   addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
1127   addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
1128   addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
1129 
1130   // stage 4
1131   u[0] = v[0];
1132   u[1] = v[1];
1133   u[2] = v[2];
1134   u[3] = v[3];
1135 
1136   u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]);
1137   u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]);
1138   u[4] = vshlq_s32(u[4], v_bit);
1139 
1140   u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]);
1141   u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]);
1142   u[5] = vshlq_s32(u[5], v_bit);
1143 
1144   u[6] = vmlaq_n_s32(rnding, v[7], cospi[16]);
1145   u[6] = vmlsq_n_s32(u[6], v[6], cospi[48]);
1146   u[6] = vshlq_s32(u[6], v_bit);
1147 
1148   u[7] = vmlaq_n_s32(rnding, v[7], cospi[48]);
1149   u[7] = vmlaq_n_s32(u[7], v[6], cospi[16]);
1150   u[7] = vshlq_s32(u[7], v_bit);
1151 
1152   // stage 5
1153   addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
1154   addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
1155   addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
1156   addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
1157 
1158   // stage 6
1159   u[0] = v[0];
1160   u[1] = v[1];
1161   u[4] = v[4];
1162   u[5] = v[5];
1163 
1164   v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]);
1165   x = vmulq_n_s32(v[3], cospi[32]);
1166   u[2] = vaddq_s32(v[0], x);
1167   u[2] = vshlq_s32(u[2], v_bit);
1168 
1169   u[3] = vsubq_s32(v[0], x);
1170   u[3] = vshlq_s32(u[3], v_bit);
1171 
1172   v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]);
1173   x = vmulq_n_s32(v[7], cospi[32]);
1174   u[6] = vaddq_s32(v[0], x);
1175   u[6] = vshlq_s32(u[6], v_bit);
1176 
1177   u[7] = vsubq_s32(v[0], x);
1178   u[7] = vshlq_s32(u[7], v_bit);
1179 
1180   // stage 7
1181   if (do_cols) {
1182     out[0] = u[0];
1183     out[2] = vsubq_s32(kZero, u[4]);
1184     out[4] = u[6];
1185     out[6] = vsubq_s32(kZero, u[2]);
1186     out[8] = u[3];
1187     out[10] = vsubq_s32(kZero, u[7]);
1188     out[12] = u[5];
1189     out[14] = vsubq_s32(kZero, u[1]);
1190   } else {
1191     const int log_range_out = AOMMAX(16, bd + 6);
1192     const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
1193     const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
1194     const int32x4_t v_shift = vdupq_n_s32(-out_shift);
1195     int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
1196     neg_shift_neon(&u[0], &u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out,
1197                    &v_shift, &offset);
1198     neg_shift_neon(&u[6], &u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out,
1199                    &v_shift, &offset);
1200     neg_shift_neon(&u[3], &u[7], out + 8, out + 10, &clamp_lo_out,
1201                    &clamp_hi_out, &v_shift, &offset);
1202     neg_shift_neon(&u[5], &u[1], out + 12, out + 14, &clamp_lo_out,
1203                    &clamp_hi_out, &v_shift, &offset);
1204   }
1205 
1206   // Odd 8 points: 1, 3, ..., 15
1207   // stage 0
1208   // stage 1
1209   // stage 2
1210   // (1)
1211   u[0] = vmlaq_n_s32(rnding, in[15], cospi[4]);
1212   u[0] = vmlaq_n_s32(u[0], in[1], cospi[60]);
1213   u[0] = vshlq_s32(u[0], v_bit);
1214 
1215   u[1] = vmlaq_n_s32(rnding, in[15], cospi[60]);
1216   u[1] = vmlsq_n_s32(u[1], in[1], cospi[4]);
1217   u[1] = vshlq_s32(u[1], v_bit);
1218 
1219   // (2)
1220   u[2] = vmlaq_n_s32(rnding, in[11], cospi[20]);
1221   u[2] = vmlaq_n_s32(u[2], in[5], cospi[44]);
1222   u[2] = vshlq_s32(u[2], v_bit);
1223 
1224   u[3] = vmlaq_n_s32(rnding, in[11], cospi[44]);
1225   u[3] = vmlsq_n_s32(u[3], in[5], cospi[20]);
1226   u[3] = vshlq_s32(u[3], v_bit);
1227 
1228   // (3)
1229   u[4] = vmlaq_n_s32(rnding, in[7], cospi[36]);
1230   u[4] = vmlaq_n_s32(u[4], in[9], cospi[28]);
1231   u[4] = vshlq_s32(u[4], v_bit);
1232 
1233   u[5] = vmlaq_n_s32(rnding, in[7], cospi[28]);
1234   u[5] = vmlsq_n_s32(u[5], in[9], cospi[36]);
1235   u[5] = vshlq_s32(u[5], v_bit);
1236 
1237   // (4)
1238   u[6] = vmlaq_n_s32(rnding, in[3], cospi[52]);
1239   u[6] = vmlaq_n_s32(u[6], in[13], cospi[12]);
1240   u[6] = vshlq_s32(u[6], v_bit);
1241 
1242   u[7] = vmlaq_n_s32(rnding, in[3], cospi[12]);
1243   u[7] = vmlsq_n_s32(u[7], in[13], cospi[52]);
1244   u[7] = vshlq_s32(u[7], v_bit);
1245 
1246   // stage 3
1247   addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
1248   addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
1249   addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
1250   addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
1251 
1252   // stage 4
1253   u[0] = v[0];
1254   u[1] = v[1];
1255   u[2] = v[2];
1256   u[3] = v[3];
1257 
1258   u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]);
1259   u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]);
1260   u[4] = vshlq_s32(u[4], v_bit);
1261 
1262   u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]);
1263   u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]);
1264   u[5] = vshlq_s32(u[5], v_bit);
1265 
1266   u[6] = vmlaq_n_s32(rnding, v[7], cospi[16]);
1267   u[6] = vmlsq_n_s32(u[6], v[6], cospi[48]);
1268   u[6] = vshlq_s32(u[6], v_bit);
1269 
1270   u[7] = vmlaq_n_s32(rnding, v[6], cospi[16]);
1271   u[7] = vmlaq_n_s32(u[7], v[7], cospi[48]);
1272   u[7] = vshlq_s32(u[7], v_bit);
1273 
1274   // stage 5
1275   addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
1276   addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
1277   addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
1278   addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
1279 
1280   // stage 6
1281   u[0] = v[0];
1282   u[1] = v[1];
1283   u[4] = v[4];
1284   u[5] = v[5];
1285 
1286   v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]);
1287   x = vmulq_n_s32(v[3], cospi[32]);
1288   u[2] = vaddq_s32(v[0], x);
1289   u[2] = vshlq_s32(u[2], v_bit);
1290 
1291   u[3] = vsubq_s32(v[0], x);
1292   u[3] = vshlq_s32(u[3], v_bit);
1293 
1294   v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]);
1295   x = vmulq_n_s32(v[7], cospi[32]);
1296   u[6] = vaddq_s32(v[0], x);
1297   u[6] = vshlq_s32(u[6], v_bit);
1298 
1299   u[7] = vsubq_s32(v[0], x);
1300   u[7] = vshlq_s32(u[7], v_bit);
1301 
1302   // stage 7
1303   if (do_cols) {
1304     out[1] = u[0];
1305     out[3] = vsubq_s32(kZero, u[4]);
1306     out[5] = u[6];
1307     out[7] = vsubq_s32(kZero, u[2]);
1308     out[9] = u[3];
1309     out[11] = vsubq_s32(kZero, u[7]);
1310     out[13] = u[5];
1311     out[15] = vsubq_s32(kZero, u[1]);
1312   } else {
1313     const int log_range_out = AOMMAX(16, bd + 6);
1314     const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
1315     const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
1316     const int32x4_t v_shift = vdupq_n_s32(-out_shift);
1317     int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
1318     neg_shift_neon(&u[0], &u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out,
1319                    &v_shift, &offset);
1320     neg_shift_neon(&u[6], &u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out,
1321                    &v_shift, &offset);
1322     neg_shift_neon(&u[3], &u[7], out + 9, out + 11, &clamp_lo_out,
1323                    &clamp_hi_out, &v_shift, &offset);
1324     neg_shift_neon(&u[5], &u[1], out + 13, out + 15, &clamp_lo_out,
1325                    &clamp_hi_out, &v_shift, &offset);
1326   }
1327 }
1328 
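// 8-point identity transform (IDTX): each input coefficient is simply doubled.
// For the row pass (!do_cols) the result is additionally round-shifted by
// out_shift and clamped to the intermediate range derived from bd.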
1329 static void iidentity8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
1330                             int bd, int out_shift) {
1331   (void)bit;
1332   out[0] = vaddq_s32(in[0], in[0]);
1333   out[1] = vaddq_s32(in[1], in[1]);
1334   out[2] = vaddq_s32(in[2], in[2]);
1335   out[3] = vaddq_s32(in[3], in[3]);
1336   out[4] = vaddq_s32(in[4], in[4]);
1337   out[5] = vaddq_s32(in[5], in[5]);
1338   out[6] = vaddq_s32(in[6], in[6]);
1339   out[7] = vaddq_s32(in[7], in[7]);
1340 
1341   if (!do_cols) {
1342     const int log_range = AOMMAX(16, bd + 6);
1343     const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
1344     const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
1345     round_shift_4x4(out, out_shift);
1346     round_shift_4x4(out + 4, out_shift);
1347     highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 8);
1348   }
1349 }
1350 
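// Reconstructs one row of 8 pixels: adds the residual pair (res_lo, res_hi) to
// the prediction row, optionally reversing the row when fliplr is set, and
// clamps the sums to the valid pixel range [0, (1 << bd) - 1].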
1351 static uint16x8_t get_recon_8x8(const uint16x8_t pred, int32x4_t res_lo,
1352                                 int32x4_t res_hi, int fliplr, int bd) {
1353   uint16x8x2_t x;
1354 
1355   if (fliplr) {
1356     res_lo = vrev64q_s32(res_lo);
1357     res_lo = vextq_s32(res_lo, res_lo, 2);
1358     res_hi = vrev64q_s32(res_hi);
1359     res_hi = vextq_s32(res_hi, res_hi, 2);
1360     x.val[0] = vreinterpretq_u16_s32(
1361         vaddw_s16(res_hi, vreinterpret_s16_u16(vget_low_u16(pred))));
1362     x.val[1] = vreinterpretq_u16_s32(
1363         vaddw_s16(res_lo, vreinterpret_s16_u16(vget_high_u16(pred))));
1364 
1365   } else {
1366     x.val[0] = vreinterpretq_u16_s32(
1367         vaddw_s16(res_lo, vreinterpret_s16_u16(vget_low_u16(pred))));
1368     x.val[1] = vreinterpretq_u16_s32(
1369         vaddw_s16(res_hi, vreinterpret_s16_u16(vget_high_u16(pred))));
1370   }
1371 
1372   uint16x8_t x2 = vcombine_u16(vqmovn_u32(vreinterpretq_u32_u16(x.val[0])),
1373                                vqmovn_u32(vreinterpretq_u32_u16(x.val[1])));
1374   const uint16x8_t vmin = vdupq_n_u16(0);
1375   const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1);
1376   return highbd_clamp_u16(&x2, &vmin, &vmax);
1377 }
1378 
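// Round-shifts the 8x8 residual held in 'in', adds it to the 8x8 prediction
// block at 'output' (honoring the fliplr/flipud flags), and stores the
// reconstructed rows back to 'output'.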
1379 static void write_buffer_8x8(int32x4_t *in, uint16_t *output, int stride,
1380                              int fliplr, int flipud, int shift, int bd) {
1381   uint16x8_t u0, u1, u2, u3, u4, u5, u6, u7;
1382   uint16x8_t v0, v1, v2, v3, v4, v5, v6, v7;
1383   round_shift_8x8(in, shift);
1384 
1385   v0 = vld1q_u16(output + 0 * stride);
1386   v1 = vld1q_u16(output + 1 * stride);
1387   v2 = vld1q_u16(output + 2 * stride);
1388   v3 = vld1q_u16(output + 3 * stride);
1389   v4 = vld1q_u16(output + 4 * stride);
1390   v5 = vld1q_u16(output + 5 * stride);
1391   v6 = vld1q_u16(output + 6 * stride);
1392   v7 = vld1q_u16(output + 7 * stride);
1393 
1394   if (flipud) {
1395     u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd);
1396     u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd);
1397     u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd);
1398     u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd);
1399     u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd);
1400     u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd);
1401     u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd);
1402     u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd);
1403   } else {
1404     u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd);
1405     u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd);
1406     u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd);
1407     u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd);
1408     u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd);
1409     u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd);
1410     u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd);
1411     u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd);
1412   }
1413 
1414   vst1q_u16(output + 0 * stride, u0);
1415   vst1q_u16(output + 1 * stride, u1);
1416   vst1q_u16(output + 2 * stride, u2);
1417   vst1q_u16(output + 3 * stride, u3);
1418   vst1q_u16(output + 4 * stride, u4);
1419   vst1q_u16(output + 5 * stride, u5);
1420   vst1q_u16(output + 6 * stride, u6);
1421   vst1q_u16(output + 7 * stride, u7);
1422 }
1423 
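// 2D inverse transform + reconstruction for 8x8 blocks: row transform,
// transpose, column transform, then addition into the prediction buffer. The
// FLIPADST variants reuse the ADST kernels and apply the flips at write-back.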
1424 void av1_inv_txfm2d_add_8x8_neon(const int32_t *input, uint16_t *output,
1425                                  int stride, TX_TYPE tx_type, int bd) {
1426   int32x4_t in[16], out[16];
1427   const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8];
1428 
1429   switch (tx_type) {
1430     case DCT_DCT:
1431       load_buffer_8x8(input, in);
1432       idct8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1433       transpose_8x8(out, in);
1434       idct8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
1435       write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
1436       break;
1437     case DCT_ADST:
1438       load_buffer_8x8(input, in);
1439       iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1440       transpose_8x8(out, in);
1441       idct8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
1442       write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
1443       break;
1444     case ADST_DCT:
1445       load_buffer_8x8(input, in);
1446       idct8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1447       transpose_8x8(out, in);
1448       iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
1449       write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
1450       break;
1451     case ADST_ADST:
1452       load_buffer_8x8(input, in);
1453       iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1454       transpose_8x8(out, in);
1455       iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
1456       write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
1457       break;
1458     case FLIPADST_DCT:
1459       load_buffer_8x8(input, in);
1460       idct8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1461       transpose_8x8(out, in);
1462       iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
1463       write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd);
1464       break;
1465     case DCT_FLIPADST:
1466       load_buffer_8x8(input, in);
1467       iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1468       transpose_8x8(out, in);
1469       idct8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
1470       write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd);
1471       break;
1472     case ADST_FLIPADST:
1473       load_buffer_8x8(input, in);
1474       iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1475       transpose_8x8(out, in);
1476       iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
1477       write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd);
1478       break;
1479     case FLIPADST_FLIPADST:
1480       load_buffer_8x8(input, in);
1481       iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1482       transpose_8x8(out, in);
1483       iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
1484       write_buffer_8x8(out, output, stride, 1, 1, -shift[1], bd);
1485       break;
1486     case FLIPADST_ADST:
1487       load_buffer_8x8(input, in);
1488       iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1489       transpose_8x8(out, in);
1490       iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
1491       write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd);
1492       break;
1493     default: assert(0);
1494   }
1495 }
1496 
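// DC-only fast path of the 8-point inverse DCT: with only in[0] non-zero, a
// single multiply by cospi[32] yields the value replicated to all eight
// outputs (row passes also apply the output shift before clamping).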
1497 static void idct8x8_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
1498                               int do_cols, int bd, int out_shift) {
1499   const int32_t *cospi = cospi_arr(bit);
1500   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1501   int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
1502   int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
1503   int32x4_t x;
1504   const int32x4_t v_bit = vdupq_n_s32(-bit);
1505   const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
1506   // stage 0-1-2-3
1507   x = vmulq_n_s32(in[0], cospi[32]);
1508   x = vaddq_s32(vshlq_s32(x, v_bit), rnding);
1509 
1510   // stage 4-5
1511   if (!do_cols) {
1512     const int log_range_out = AOMMAX(16, bd + 6);
1513     clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1)));
1514     clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
1515 
1516     int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
1517     x = vaddq_s32(x, offset);
1518     x = vshlq_s32(x, vdupq_n_s32(-out_shift));
1519   }
1520 
1521   x = vmaxq_s32(x, clamp_lo);
1522   x = vminq_s32(x, clamp_hi);
1523   out[0] = x;
1524   out[1] = x;
1525   out[2] = x;
1526   out[3] = x;
1527   out[4] = x;
1528   out[5] = x;
1529   out[6] = x;
1530   out[7] = x;
1531 }
1532 
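// Full 8-point inverse DCT applied to four columns packed in int32x4_t lanes,
// following the scalar idct8 butterfly stages with clamping after each
// add/sub stage.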
1533 static void idct8x8_new_neon(int32x4_t *in, int32x4_t *out, int bit,
1534                              int do_cols, int bd, int out_shift) {
1535   const int32_t *cospi = cospi_arr(bit);
1536   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1537   const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
1538   const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
1539   int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
1540   int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;
1541   int32x4_t x, y;
1542   const int32x4_t v_bit = vdupq_n_s32(-bit);
1543   const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
1544 
1545   // stage 0
1546   // stage 1
1547   // stage 2
1548   u0 = in[0];
1549   u1 = in[4];
1550   u2 = in[2];
1551   u3 = in[6];
1552 
1553   x = vmlaq_n_s32(rnding, in[1], cospi[56]);
1554   u4 = vmlaq_n_s32(x, in[7], -cospi[8]);
1555   u4 = vshlq_s32(u4, v_bit);
1556 
1557   x = vmlaq_n_s32(rnding, in[1], cospi[8]);
1558   u7 = vmlaq_n_s32(x, in[7], cospi[56]);
1559   u7 = vshlq_s32(u7, v_bit);
1560 
1561   x = vmlaq_n_s32(rnding, in[5], cospi[24]);
1562   u5 = vmlaq_n_s32(x, in[3], -cospi[40]);
1563   u5 = vshlq_s32(u5, v_bit);
1564 
1565   x = vmlaq_n_s32(rnding, in[5], cospi[40]);
1566   u6 = vmlaq_n_s32(x, in[3], cospi[24]);
1567   u6 = vshlq_s32(u6, v_bit);
1568 
1569   // stage 3
1570   x = vmlaq_n_s32(rnding, u0, cospi[32]);
1571   y = vmulq_n_s32(u1, cospi[32]);
1572   v0 = vaddq_s32(x, y);
1573   v0 = vshlq_s32(v0, v_bit);
1574 
1575   v1 = vsubq_s32(x, y);
1576   v1 = vshlq_s32(v1, v_bit);
1577 
1578   x = vmlaq_n_s32(rnding, u2, cospi[48]);
1579   v2 = vmlaq_n_s32(x, u3, -cospi[16]);
1580   v2 = vshlq_s32(v2, v_bit);
1581 
1582   x = vmlaq_n_s32(rnding, u2, cospi[16]);
1583   v3 = vmlaq_n_s32(x, u3, cospi[48]);
1584   v3 = vshlq_s32(v3, v_bit);
1585 
1586   addsub_neon(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
1587   addsub_neon(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
1588 
1589   // stage 4
1590   addsub_neon(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
1591   addsub_neon(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
1592   u4 = v4;
1593   u7 = v7;
1594 
1595   x = vmulq_n_s32(v5, cospi[32]);
1596   y = vmlaq_n_s32(rnding, v6, cospi[32]);
1597   u6 = vaddq_s32(y, x);
1598   u6 = vshlq_s32(u6, v_bit);
1599 
1600   u5 = vsubq_s32(y, x);
1601   u5 = vshlq_s32(u5, v_bit);
1602 
1603   // stage 5
1604   addsub_neon(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
1605   addsub_neon(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
1606   addsub_neon(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
1607   addsub_neon(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);
1608 
1609   if (!do_cols) {
1610     const int log_range_out = AOMMAX(16, bd + 6);
1611     const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
1612     const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
1613     round_shift_4x4(out, out_shift);
1614     round_shift_4x4(out + 4, out_shift);
1615     highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 8);
1616   }
1617 }
1618 
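// 8-point inverse ADST fast path where only in[0] is non-zero; the butterfly
// network collapses to a few multiplies before the final output permutation
// and sign changes.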
1619 static void iadst8x8_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
1620                                int do_cols, int bd, int out_shift) {
1621   const int32_t *cospi = cospi_arr(bit);
1622   int32x4_t u[8], x;
1623   const int32x4_t v_bit = vdupq_n_s32(-bit);
1624   const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
1625   // stage 0-2
1626 
1627   u[0] = vmlaq_n_s32(rnding, in[0], cospi[60]);
1628   u[0] = vshlq_s32(u[0], v_bit);
1629 
1630   u[1] = vmlaq_n_s32(rnding, in[0], cospi[4]);
1631   u[1] = vshlq_s32(vnegq_s32(u[1]), v_bit);
1632 
1633   // stage 3-4
1634   int32x4_t temp1, temp2;
1635   temp1 = vmlaq_n_s32(rnding, u[0], cospi[16]);
1636   temp1 = vmlaq_n_s32(temp1, u[1], cospi[48]);
1637   temp1 = vshlq_s32(temp1, v_bit);
1638   u[4] = temp1;
1639 
1640   temp2 = vmlaq_n_s32(rnding, u[0], cospi[48]);
1641   u[5] = vmlsq_n_s32(temp2, u[1], cospi[16]);
1642   u[5] = vshlq_s32(u[5], v_bit);
1643 
1644   // stage 5-6
1645   temp1 = vmlaq_n_s32(rnding, u[0], cospi[32]);
1646   x = vmulq_n_s32(u[1], cospi[32]);
1647   u[2] = vaddq_s32(temp1, x);
1648   u[2] = vshlq_s32(u[2], v_bit);
1649 
1650   u[3] = vsubq_s32(temp1, x);
1651   u[3] = vshlq_s32(u[3], v_bit);
1652 
1653   temp1 = vmlaq_n_s32(rnding, u[4], cospi[32]);
1654   x = vmulq_n_s32(u[5], cospi[32]);
1655   u[6] = vaddq_s32(temp1, x);
1656   u[6] = vshlq_s32(u[6], v_bit);
1657 
1658   u[7] = vsubq_s32(temp1, x);
1659   u[7] = vshlq_s32(u[7], v_bit);
1660 
1661   // stage 7
1662   if (do_cols) {
1663     out[0] = u[0];
1664     out[1] = vnegq_s32(u[4]);
1665     out[2] = u[6];
1666     out[3] = vnegq_s32(u[2]);
1667     out[4] = u[3];
1668     out[5] = vnegq_s32(u[7]);
1669     out[6] = u[5];
1670     out[7] = vnegq_s32(u[1]);
1671   } else {
1672     const int log_range_out = AOMMAX(16, bd + 6);
1673     const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
1674     const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
1675     const int32x4_t v_shift = vdupq_n_s32(-out_shift);
1676     int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
1677     neg_shift_neon(&u[0], &u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
1678                    &v_shift, &offset);
1679     neg_shift_neon(&u[6], &u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
1680                    &v_shift, &offset);
1681     neg_shift_neon(&u[3], &u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
1682                    &v_shift, &offset);
1683     neg_shift_neon(&u[5], &u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
1684                    &v_shift, &offset);
1685   }
1686 }
1687 
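// Full 8-point inverse ADST applied to four columns packed in int32x4_t lanes,
// mirroring the scalar iadst8 stages with clamping after each add/sub stage.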
1688 static void iadst8x8_new_neon(int32x4_t *in, int32x4_t *out, int bit,
1689                               int do_cols, int bd, int out_shift) {
1690   const int32_t *cospi = cospi_arr(bit);
1692   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1693   const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
1694   const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
1695   int32x4_t u[8], v[8], x;
1696   const int32x4_t v_bit = vdupq_n_s32(-bit);
1697   const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
1698   // stage 0-2
1699 
1700   u[0] = vmlaq_n_s32(rnding, in[7], cospi[4]);
1701   u[0] = vmlaq_n_s32(u[0], in[0], cospi[60]);
1702   u[0] = vshlq_s32(u[0], v_bit);
1703 
1704   u[1] = vmlaq_n_s32(rnding, in[7], cospi[60]);
1705   u[1] = vmlsq_n_s32(u[1], in[0], cospi[4]);
1706   u[1] = vshlq_s32(u[1], v_bit);
1707 
1708   // (2)
1709   u[2] = vmlaq_n_s32(rnding, in[5], cospi[20]);
1710   u[2] = vmlaq_n_s32(u[2], in[2], cospi[44]);
1711   u[2] = vshlq_s32(u[2], v_bit);
1712 
1713   u[3] = vmlaq_n_s32(rnding, in[5], cospi[44]);
1714   u[3] = vmlsq_n_s32(u[3], in[2], cospi[20]);
1715   u[3] = vshlq_s32(u[3], v_bit);
1716 
1717   // (3)
1718   u[4] = vmlaq_n_s32(rnding, in[3], cospi[36]);
1719   u[4] = vmlaq_n_s32(u[4], in[4], cospi[28]);
1720   u[4] = vshlq_s32(u[4], v_bit);
1721 
1722   u[5] = vmlaq_n_s32(rnding, in[3], cospi[28]);
1723   u[5] = vmlsq_n_s32(u[5], in[4], cospi[36]);
1724   u[5] = vshlq_s32(u[5], v_bit);
1725 
1726   // (4)
1727   u[6] = vmulq_n_s32(in[1], cospi[52]);
1728   u[6] = vmlaq_n_s32(u[6], in[6], cospi[12]);
1729   u[6] = vaddq_s32(u[6], rnding);
1730   u[6] = vshlq_s32(u[6], v_bit);
1731 
1732   u[7] = vmulq_n_s32(in[1], cospi[12]);
1733   u[7] = vmlsq_n_s32(u[7], in[6], cospi[52]);
1734   u[7] = vaddq_s32(u[7], rnding);
1735   u[7] = vshlq_s32(u[7], v_bit);
1736 
1737   // stage 3
1738   addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
1739   addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
1740   addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
1741   addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
1742 
1743   // stage 4
1744   u[0] = v[0];
1745   u[1] = v[1];
1746   u[2] = v[2];
1747   u[3] = v[3];
1748 
1749   u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]);
1750   u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]);
1751   u[4] = vshlq_s32(u[4], v_bit);
1752 
1753   u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]);
1754   u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]);
1755   u[5] = vshlq_s32(u[5], v_bit);
1756 
1757   u[6] = vmlsq_n_s32(rnding, v[6], cospi[48]);
1758   u[6] = vmlaq_n_s32(u[6], v[7], cospi[16]);
1759   u[6] = vshlq_s32(u[6], v_bit);
1760 
1761   u[7] = vmlaq_n_s32(rnding, v[6], cospi[16]);
1762   u[7] = vmlaq_n_s32(u[7], v[7], cospi[48]);
1763   u[7] = vshlq_s32(u[7], v_bit);
1764 
1765   // stage 5
1766   addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
1767   addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
1768   addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
1769   addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
1770 
1771   // stage 6
1772   u[0] = v[0];
1773   u[1] = v[1];
1774   u[4] = v[4];
1775   u[5] = v[5];
1776 
1777   v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]);
1778   x = vmulq_n_s32(v[3], cospi[32]);
1779   u[2] = vaddq_s32(v[0], x);
1780   u[2] = vshlq_s32(u[2], v_bit);
1781 
1782   u[3] = vsubq_s32(v[0], x);
1783   u[3] = vshlq_s32(u[3], v_bit);
1784 
1785   v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]);
1786   x = vmulq_n_s32(v[7], cospi[32]);
1787   u[6] = vaddq_s32(v[0], x);
1788   u[6] = vshlq_s32(u[6], v_bit);
1789 
1790   u[7] = vsubq_s32(v[0], x);
1791   u[7] = vshlq_s32(u[7], v_bit);
1792 
1793   // stage 7
1794   if (do_cols) {
1795     out[0] = u[0];
1796     out[1] = vnegq_s32(u[4]);
1797     out[2] = u[6];
1798     out[3] = vnegq_s32(u[2]);
1799     out[4] = u[3];
1800     out[5] = vnegq_s32(u[7]);
1801     out[6] = u[5];
1802     out[7] = vnegq_s32(u[1]);
1803   } else {
1804     const int log_range_out = AOMMAX(16, bd + 6);
1805     const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
1806     const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
1807     const int32x4_t v_shift = vdupq_n_s32(-out_shift);
1808     int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
1809     neg_shift_neon(&u[0], &u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
1810                    &v_shift, &offset);
1811     neg_shift_neon(&u[6], &u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
1812                    &v_shift, &offset);
1813     neg_shift_neon(&u[3], &u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
1814                    &v_shift, &offset);
1815     neg_shift_neon(&u[5], &u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
1816                    &v_shift, &offset);
1817   }
1818 }
1819 
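// DC-only fast path of the 16-point inverse DCT: in[0] * cospi[32] is computed
// once and replicated to all sixteen outputs.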
1820 static void idct16x16_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
1821                                 int do_cols, int bd, int out_shift) {
1822   const int32_t *cospi = cospi_arr(bit);
1823   int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1824   int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
1825   int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
1826   const int32x4_t v_bit = vdupq_n_s32(-bit);
1827   const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
1828   // stage 0-4
1829   in[0] = vmlaq_n_s32(rnding, in[0], cospi[32]);
1830   in[0] = vshlq_s32(in[0], v_bit);
1831 
1832   // stage 5-7
1833   if (!do_cols) {
1834     log_range = AOMMAX(16, bd + 6);
1835     clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
1836     clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
1837     if (out_shift != 0) {
1838       int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
1839       in[0] = vaddq_s32(in[0], offset);
1840       in[0] = vshlq_s32(in[0], vdupq_n_s32(-out_shift));
1841     }
1842   }
1843 
1844   in[0] = vmaxq_s32(in[0], clamp_lo);
1845   in[0] = vminq_s32(in[0], clamp_hi);
1846   out[0] = in[0];
1847   out[1] = in[0];
1848   out[2] = in[0];
1849   out[3] = in[0];
1850   out[4] = in[0];
1851   out[5] = in[0];
1852   out[6] = in[0];
1853   out[7] = in[0];
1854   out[8] = in[0];
1855   out[9] = in[0];
1856   out[10] = in[0];
1857   out[11] = in[0];
1858   out[12] = in[0];
1859   out[13] = in[0];
1860   out[14] = in[0];
1861   out[15] = in[0];
1862 }
1863 
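// 16-point inverse DCT specialized for columns whose only non-zero
// coefficients are the first eight, so stages 2-3 reduce to single-input
// half-butterflies.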
1864 static void idct16x16_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
1865                                 int do_cols, int bd, int out_shift) {
1866   const int32_t *cospi = cospi_arr(bit);
1867   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1868   const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
1869   const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
1870   const int32x4_t v_bit = vdupq_n_s32(-bit);
1871   const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
1872   int32x4_t u[16], x, y;
1873   // stage 0-1
1874   u[0] = in[0];
1875   u[2] = in[4];
1876   u[4] = in[2];
1877   u[6] = in[6];
1878   u[8] = in[1];
1879   u[10] = in[5];
1880   u[12] = in[3];
1881   u[14] = in[7];
1882 
1883   // stage 2
1884   u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);
1885   u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);
1886 
1887   u[9] = half_btf_0_m_neon_r(&cospi[36], &u[14], &v_bit, &rnding);
1888   u[14] = half_btf_0_neon_r(&cospi[28], &u[14], &v_bit, &rnding);
1889 
1890   u[13] = half_btf_0_neon_r(&cospi[20], &u[10], &v_bit, &rnding);
1891   u[10] = half_btf_0_neon_r(&cospi[44], &u[10], &v_bit, &rnding);
1892 
1893   u[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding);
1894   u[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding);
1895 
1896   // stage 3
1897   u[7] = half_btf_0_neon_r(&cospi[8], &u[4], &v_bit, &rnding);
1898   u[4] = half_btf_0_neon_r(&cospi[56], &u[4], &v_bit, &rnding);
1899   u[5] = half_btf_0_m_neon_r(&cospi[40], &u[6], &v_bit, &rnding);
1900   u[6] = half_btf_0_neon_r(&cospi[24], &u[6], &v_bit, &rnding);
1901 
1902   addsub_neon(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
1903   addsub_neon(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
1904   addsub_neon(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
1905   addsub_neon(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
1906 
1907   // stage 4
1908   x = vmlaq_n_s32(rnding, u[0], cospi[32]);
1909   u[0] = vshlq_s32(x, v_bit);
1910   u[1] = u[0];
1911 
1912   u[3] = half_btf_0_neon_r(&cospi[16], &u[2], &v_bit, &rnding);
1913   u[2] = half_btf_0_neon_r(&cospi[48], &u[2], &v_bit, &rnding);
1914 
1915   addsub_neon(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
1916   addsub_neon(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
1917 
1918   x = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
1919                              &rnding);
1920   u[14] =
1921       half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
1922   u[9] = x;
1923   y = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13], &v_bit,
1924                              &rnding);
1925   u[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13], &v_bit,
1926                                  &rnding);
1927   u[10] = y;
1928 
1929   // stage 5
1930   addsub_neon(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
1931   addsub_neon(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
1932 
1933   x = vmulq_n_s32(u[5], cospi[32]);
1934   y = vmlaq_n_s32(rnding, u[6], cospi[32]);
1935   u[5] = vsubq_s32(y, x);
1936   u[5] = vshlq_s32(u[5], v_bit);
1937 
1938   u[6] = vaddq_s32(y, x);
1939   u[6] = vshlq_s32(u[6], v_bit);
1940 
1941   addsub_neon(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
1942   addsub_neon(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
1943   addsub_neon(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
1944   addsub_neon(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
1945 
1946   // stage 6
1947   addsub_neon(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
1948   addsub_neon(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
1949   addsub_neon(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
1950   addsub_neon(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
1951 
1952   x = vmulq_n_s32(u[10], cospi[32]);
1953   y = vmlaq_n_s32(rnding, u[13], cospi[32]);
1954   u[10] = vsubq_s32(y, x);
1955   u[10] = vshlq_s32(u[10], v_bit);
1956 
1957   u[13] = vaddq_s32(x, y);
1958   u[13] = vshlq_s32(u[13], v_bit);
1959 
1960   x = vmulq_n_s32(u[11], cospi[32]);
1961   y = vmlaq_n_s32(rnding, u[12], cospi[32]);
1962   u[11] = vsubq_s32(y, x);
1963   u[11] = vshlq_s32(u[11], v_bit);
1964 
1965   u[12] = vaddq_s32(x, y);
1966   u[12] = vshlq_s32(u[12], v_bit);
1967   // stage 7
1968   addsub_neon(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
1969   addsub_neon(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
1970   addsub_neon(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
1971   addsub_neon(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
1972   addsub_neon(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
1973   addsub_neon(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
1974   addsub_neon(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
1975   addsub_neon(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
1976 
1977   if (!do_cols) {
1978     const int log_range_out = AOMMAX(16, bd + 6);
1979     const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
1980     const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
1981     round_shift_8x8(out, out_shift);
1982     highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16);
1983   }
1984 }
1985 
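// 16-point inverse ADST fast path with only in[0] non-zero; intermediate
// values are propagated by copying instead of full butterflies.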
1986 static void iadst16x16_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
1987                                  int do_cols, int bd, int out_shift) {
1988   const int32_t *cospi = cospi_arr(bit);
1989   int32x4_t v[16], x, y, temp1, temp2;
1990   const int32x4_t v_bit = vdupq_n_s32(-bit);
1991   const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
1992   // stage 0
1993   // stage 1
1994   // stage 2
1995   v[0] = vmlaq_n_s32(rnding, in[0], cospi[62]);
1996   v[0] = vshlq_s32(v[0], v_bit);
1997 
1998   v[1] = vmlsq_n_s32(rnding, in[0], cospi[2]);
1999   v[1] = vshlq_s32(v[1], v_bit);
2000 
2001   // stage 3
2002   v[8] = v[0];
2003   v[9] = v[1];
2004 
2005   // stage 4
2006   temp1 = vmlaq_n_s32(rnding, v[8], cospi[8]);
2007   temp1 = vmlaq_n_s32(temp1, v[9], cospi[56]);
2008   temp1 = vshlq_s32(temp1, v_bit);
2009 
2010   temp2 = vmlaq_n_s32(rnding, v[8], cospi[56]);
2011   temp2 = vmlsq_n_s32(temp2, v[9], cospi[8]);
2012   temp2 = vshlq_s32(temp2, v_bit);
2013   v[8] = temp1;
2014   v[9] = temp2;
2015 
2016   // stage 5
2017   v[4] = v[0];
2018   v[5] = v[1];
2019   v[12] = v[8];
2020   v[13] = v[9];
2021 
2022   // stage 6
2023   temp1 = vmlaq_n_s32(rnding, v[4], cospi[16]);
2024   temp1 = vmlaq_n_s32(temp1, v[5], cospi[48]);
2025   temp1 = vshlq_s32(temp1, v_bit);
2026 
2027   temp2 = vmlaq_n_s32(rnding, v[4], cospi[48]);
2028   temp2 = vmlsq_n_s32(temp2, v[5], cospi[16]);
2029   temp2 = vshlq_s32(temp2, v_bit);
2030   v[4] = temp1;
2031   v[5] = temp2;
2032 
2033   temp1 = vmlaq_n_s32(rnding, v[12], cospi[16]);
2034   temp1 = vmlaq_n_s32(temp1, v[13], cospi[48]);
2035   temp1 = vshlq_s32(temp1, v_bit);
2036 
2037   temp2 = vmlaq_n_s32(rnding, v[12], cospi[48]);
2038   temp2 = vmlsq_n_s32(temp2, v[13], cospi[16]);
2039   temp2 = vshlq_s32(temp2, v_bit);
2040   v[12] = temp1;
2041   v[13] = temp2;
2042 
2043   // stage 7
2044   v[2] = v[0];
2045   v[3] = v[1];
2046   v[6] = v[4];
2047   v[7] = v[5];
2048   v[10] = v[8];
2049   v[11] = v[9];
2050   v[14] = v[12];
2051   v[15] = v[13];
2052 
2053   // stage 8
2054   y = vmlaq_n_s32(rnding, v[2], cospi[32]);
2055   x = vmulq_n_s32(v[3], cospi[32]);
2056   v[2] = vaddq_s32(y, x);
2057   v[2] = vshlq_s32(v[2], v_bit);
2058 
2059   v[3] = vsubq_s32(y, x);
2060   v[3] = vshlq_s32(v[3], v_bit);
2061 
2062   y = vmlaq_n_s32(rnding, v[6], cospi[32]);
2063   x = vmulq_n_s32(v[7], cospi[32]);
2064   v[6] = vaddq_s32(y, x);
2065   v[6] = vshlq_s32(v[6], v_bit);
2066 
2067   v[7] = vsubq_s32(y, x);
2068   v[7] = vshlq_s32(v[7], v_bit);
2069 
2070   y = vmlaq_n_s32(rnding, v[10], cospi[32]);
2071   x = vmulq_n_s32(v[11], cospi[32]);
2072   v[10] = vaddq_s32(y, x);
2073   v[10] = vshlq_s32(v[10], v_bit);
2074 
2075   v[11] = vsubq_s32(y, x);
2076   v[11] = vshlq_s32(v[11], v_bit);
2077 
2078   y = vmlaq_n_s32(rnding, v[14], cospi[32]);
2079   x = vmulq_n_s32(v[15], cospi[32]);
2080   v[14] = vaddq_s32(y, x);
2081   v[14] = vshlq_s32(v[14], v_bit);
2082 
2083   v[15] = vsubq_s32(y, x);
2084   v[15] = vshlq_s32(v[15], v_bit);
2085 
2086   // stage 9
2087   if (do_cols) {
2088     out[0] = v[0];
2089     out[1] = vnegq_s32(v[8]);
2090     out[2] = v[12];
2091     out[3] = vnegq_s32(v[4]);
2092     out[4] = v[6];
2093     out[5] = vnegq_s32(v[14]);
2094     out[6] = v[10];
2095     out[7] = vnegq_s32(v[2]);
2096     out[8] = v[3];
2097     out[9] = vnegq_s32(v[11]);
2098     out[10] = v[15];
2099     out[11] = vnegq_s32(v[7]);
2100     out[12] = v[5];
2101     out[13] = vnegq_s32(v[13]);
2102     out[14] = v[9];
2103     out[15] = vnegq_s32(v[1]);
2104   } else {
2105     const int log_range_out = AOMMAX(16, bd + 6);
2106     const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
2107     const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
2108     const int32x4_t v_shift = vdupq_n_s32(-out_shift);
2109     int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
2110     neg_shift_neon(&v[0], &v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2111                    &v_shift, &offset);
2112     neg_shift_neon(&v[12], &v[4], out + 2, out + 3, &clamp_lo_out,
2113                    &clamp_hi_out, &v_shift, &offset);
2114     neg_shift_neon(&v[6], &v[14], out + 4, out + 5, &clamp_lo_out,
2115                    &clamp_hi_out, &v_shift, &offset);
2116     neg_shift_neon(&v[10], &v[2], out + 6, out + 7, &clamp_lo_out,
2117                    &clamp_hi_out, &v_shift, &offset);
2118     neg_shift_neon(&v[3], &v[11], out + 8, out + 9, &clamp_lo_out,
2119                    &clamp_hi_out, &v_shift, &offset);
2120     neg_shift_neon(&v[15], &v[7], out + 10, out + 11, &clamp_lo_out,
2121                    &clamp_hi_out, &v_shift, &offset);
2122     neg_shift_neon(&v[5], &v[13], out + 12, out + 13, &clamp_lo_out,
2123                    &clamp_hi_out, &v_shift, &offset);
2124     neg_shift_neon(&v[9], &v[1], out + 14, out + 15, &clamp_lo_out,
2125                    &clamp_hi_out, &v_shift, &offset);
2126   }
2127 }
2128 
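// 16-point inverse ADST for columns with at most eight non-zero input
// coefficients (in[0]..in[7]); stage 2 therefore reduces to one multiply per
// output.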
2129 static void iadst16x16_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
2130                                  int do_cols, int bd, int out_shift) {
2131   const int32_t *cospi = cospi_arr(bit);
2132   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2133   const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
2134   const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
2135   int32x4_t zero = vdupq_n_s32(0);
2136   int32x4_t u[16], x, y;
2137   const int32x4_t v_bit = vdupq_n_s32(-bit);
2138   const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
2139   // stage 0-2
2140   u[0] = vmlaq_n_s32(rnding, in[0], cospi[62]);
2141   u[0] = vshlq_s32(u[0], v_bit);
2142 
2143   u[1] = vmlsq_n_s32(rnding, in[0], cospi[2]);
2144   u[1] = vshlq_s32(u[1], v_bit);
2145 
2146   u[2] = vmlaq_n_s32(rnding, in[2], cospi[54]);
2147   u[2] = vshlq_s32(u[2], v_bit);
2148 
2149   u[3] = vmlsq_n_s32(rnding, in[2], cospi[10]);
2150   u[3] = vshlq_s32(u[3], v_bit);
2151 
2152   u[4] = vmlaq_n_s32(rnding, in[4], cospi[46]);
2153   u[4] = vshlq_s32(u[4], v_bit);
2154 
2155   u[5] = vmlsq_n_s32(rnding, in[4], cospi[18]);
2156   u[5] = vshlq_s32(u[5], v_bit);
2157 
2158   u[6] = vmlaq_n_s32(rnding, in[6], cospi[38]);
2159   u[6] = vshlq_s32(u[6], v_bit);
2160 
2161   u[7] = vmlsq_n_s32(rnding, in[6], cospi[26]);
2162   u[7] = vshlq_s32(u[7], v_bit);
2163 
2164   u[8] = vmlaq_n_s32(rnding, in[7], cospi[34]);
2165   u[8] = vshlq_s32(u[8], v_bit);
2166 
2167   u[9] = vmlaq_n_s32(rnding, in[7], cospi[30]);
2168   u[9] = vshlq_s32(u[9], v_bit);
2169 
2170   u[10] = vmlaq_n_s32(rnding, in[5], cospi[42]);
2171   u[10] = vshlq_s32(u[10], v_bit);
2172 
2173   u[11] = vmlaq_n_s32(rnding, in[5], cospi[22]);
2174   u[11] = vshlq_s32(u[11], v_bit);
2175 
2176   u[12] = vmlaq_n_s32(rnding, in[3], cospi[50]);
2177   u[12] = vshlq_s32(u[12], v_bit);
2178 
2179   u[13] = vmlaq_n_s32(rnding, in[3], cospi[14]);
2180   u[13] = vshlq_s32(u[13], v_bit);
2181 
2182   u[14] = vmlaq_n_s32(rnding, in[1], cospi[58]);
2183   u[14] = vshlq_s32(u[14], v_bit);
2184 
2185   u[15] = vmlaq_n_s32(rnding, in[1], cospi[6]);
2186   u[15] = vshlq_s32(u[15], v_bit);
2187 
2188   // stage 3
2189   addsub_neon(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
2190   addsub_neon(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
2191   addsub_neon(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
2192   addsub_neon(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
2193   addsub_neon(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
2194   addsub_neon(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
2195   addsub_neon(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
2196   addsub_neon(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
2197 
2198   // stage 4
2199   y = vmlaq_n_s32(rnding, u[8], cospi[56]);
2200   u[8] = vmlaq_n_s32(rnding, u[8], cospi[8]);
2201   u[8] = vmlaq_n_s32(u[8], u[9], cospi[56]);
2202   u[8] = vshlq_s32(u[8], v_bit);
2203 
2204   u[9] = vmlsq_n_s32(y, u[9], cospi[8]);
2205   u[9] = vshlq_s32(u[9], v_bit);
2206 
2207   y = vmlaq_n_s32(rnding, u[10], cospi[24]);
2208   u[10] = vmlaq_n_s32(rnding, u[10], cospi[40]);
2209   u[10] = vmlaq_n_s32(u[10], u[11], cospi[24]);
2210   u[10] = vshlq_s32(u[10], v_bit);
2211 
2212   u[11] = vmlsq_n_s32(y, u[11], cospi[40]);
2213   u[11] = vshlq_s32(u[11], v_bit);
2214 
2215   y = vmlaq_n_s32(rnding, u[12], cospi[8]);
2216   u[12] = vmlsq_n_s32(rnding, u[12], cospi[56]);
2217   u[12] = vmlaq_n_s32(u[12], u[13], cospi[8]);
2218   u[12] = vshlq_s32(u[12], v_bit);
2219 
2220   u[13] = vmlaq_n_s32(y, u[13], cospi[56]);
2221   u[13] = vshlq_s32(u[13], v_bit);
2222 
2223   y = vmlaq_n_s32(rnding, u[14], cospi[40]);
2224   u[14] = vmlsq_n_s32(rnding, u[14], cospi[24]);
2225   u[14] = vmlaq_n_s32(u[14], u[15], cospi[40]);
2226   u[14] = vshlq_s32(u[14], v_bit);
2227 
2228   u[15] = vmlaq_n_s32(y, u[15], cospi[24]);
2229   u[15] = vshlq_s32(u[15], v_bit);
2230 
2231   // stage 5
2232   addsub_neon(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
2233   addsub_neon(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
2234   addsub_neon(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
2235   addsub_neon(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
2236   addsub_neon(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
2237   addsub_neon(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
2238   addsub_neon(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
2239   addsub_neon(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
2240 
2241   // stage 6
2242   y = vmlaq_n_s32(rnding, u[4], cospi[48]);
2243   u[4] = vmlaq_n_s32(rnding, u[4], cospi[16]);
2244   u[4] = vmlaq_n_s32(u[4], u[5], cospi[48]);
2245   u[4] = vshlq_s32(u[4], v_bit);
2246 
2247   u[5] = vmlsq_n_s32(y, u[5], cospi[16]);
2248   u[5] = vshlq_s32(u[5], v_bit);
2249 
2250   y = vmlaq_n_s32(rnding, u[6], cospi[16]);
2251   u[6] = vmlsq_n_s32(rnding, u[6], cospi[48]);
2252   u[6] = vmlaq_n_s32(u[6], u[7], cospi[16]);
2253   u[6] = vshlq_s32(u[6], v_bit);
2254 
2255   u[7] = vmlaq_n_s32(y, u[7], cospi[48]);
2256   u[7] = vshlq_s32(u[7], v_bit);
2257 
2258   y = vmlaq_n_s32(rnding, u[12], cospi[48]);
2259   u[12] = vmlaq_n_s32(rnding, u[12], cospi[16]);
2260   u[12] = vmlaq_n_s32(u[12], u[13], cospi[48]);
2261   u[12] = vshlq_s32(u[12], v_bit);
2262 
2263   u[13] = vmlsq_n_s32(y, u[13], cospi[16]);
2264   u[13] = vshlq_s32(u[13], v_bit);
2265 
2266   y = vmlaq_n_s32(rnding, u[14], cospi[16]);
2267   u[14] = vmlsq_n_s32(rnding, u[14], cospi[48]);
2268   u[14] = vmlaq_n_s32(u[14], u[15], cospi[16]);
2269   u[14] = vshlq_s32(u[14], v_bit);
2270 
2271   u[15] = vmlaq_n_s32(y, u[15], cospi[48]);
2272   u[15] = vshlq_s32(u[15], v_bit);
2273 
2274   // stage 7
2275   addsub_neon(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
2276   addsub_neon(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
2277   addsub_neon(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
2278   addsub_neon(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
2279   addsub_neon(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
2280   addsub_neon(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
2281   addsub_neon(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
2282   addsub_neon(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
2283 
2284   // stage 8
2285   y = vmlaq_n_s32(rnding, u[2], cospi[32]);
2286   x = vmulq_n_s32(u[3], cospi[32]);
2287   u[2] = vaddq_s32(y, x);
2288   u[2] = vshlq_s32(u[2], v_bit);
2289 
2290   u[3] = vsubq_s32(y, x);
2291   u[3] = vshlq_s32(u[3], v_bit);
2292   y = vmlaq_n_s32(rnding, u[6], cospi[32]);
2293   x = vmulq_n_s32(u[7], cospi[32]);
2294   u[6] = vaddq_s32(y, x);
2295   u[6] = vshlq_s32(u[6], v_bit);
2296 
2297   u[7] = vsubq_s32(y, x);
2298   u[7] = vshlq_s32(u[7], v_bit);
2299 
2300   y = vmlaq_n_s32(rnding, u[10], cospi[32]);
2301   x = vmulq_n_s32(u[11], cospi[32]);
2302   u[10] = vaddq_s32(y, x);
2303   u[10] = vshlq_s32(u[10], v_bit);
2304 
2305   u[11] = vsubq_s32(y, x);
2306   u[11] = vshlq_s32(u[11], v_bit);
2307 
2308   y = vmlaq_n_s32(rnding, u[14], cospi[32]);
2309   x = vmulq_n_s32(u[15], cospi[32]);
2310   u[14] = vaddq_s32(y, x);
2311   u[14] = vshlq_s32(u[14], v_bit);
2312 
2313   u[15] = vsubq_s32(y, x);
2314   u[15] = vshlq_s32(u[15], v_bit);
2315 
2316   // stage 9
2317   if (do_cols) {
2318     out[0] = u[0];
2319     out[1] = vsubq_s32(zero, u[8]);
2320     out[2] = u[12];
2321     out[3] = vsubq_s32(zero, u[4]);
2322     out[4] = u[6];
2323     out[5] = vsubq_s32(zero, u[14]);
2324     out[6] = u[10];
2325     out[7] = vsubq_s32(zero, u[2]);
2326     out[8] = u[3];
2327     out[9] = vsubq_s32(zero, u[11]);
2328     out[10] = u[15];
2329     out[11] = vsubq_s32(zero, u[7]);
2330     out[12] = u[5];
2331     out[13] = vsubq_s32(zero, u[13]);
2332     out[14] = u[9];
2333     out[15] = vsubq_s32(zero, u[1]);
2334   } else {
2335     const int log_range_out = AOMMAX(16, bd + 6);
2336     const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
2337     const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
2338     const int32x4_t v_shift = vdupq_n_s32(-out_shift);
2339     int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
2340     neg_shift_neon(&u[0], &u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2341                    &v_shift, &offset);
2342     neg_shift_neon(&u[12], &u[4], out + 2, out + 3, &clamp_lo_out,
2343                    &clamp_hi_out, &v_shift, &offset);
2344     neg_shift_neon(&u[6], &u[14], out + 4, out + 5, &clamp_lo_out,
2345                    &clamp_hi_out, &v_shift, &offset);
2346     neg_shift_neon(&u[10], &u[2], out + 6, out + 7, &clamp_lo_out,
2347                    &clamp_hi_out, &v_shift, &offset);
2348     neg_shift_neon(&u[3], &u[11], out + 8, out + 9, &clamp_lo_out,
2349                    &clamp_hi_out, &v_shift, &offset);
2350     neg_shift_neon(&u[15], &u[7], out + 10, out + 11, &clamp_lo_out,
2351                    &clamp_hi_out, &v_shift, &offset);
2352     neg_shift_neon(&u[5], &u[13], out + 12, out + 13, &clamp_lo_out,
2353                    &clamp_hi_out, &v_shift, &offset);
2354     neg_shift_neon(&u[9], &u[1], out + 14, out + 15, &clamp_lo_out,
2355                    &clamp_hi_out, &v_shift, &offset);
2356   }
2357 }
2358 
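// Full 16-point inverse DCT applied to four columns packed in int32x4_t lanes,
// using the half-butterfly helpers and clamping after each add/sub stage.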
2359 static void idct16x16_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
2360                            int bd, int out_shift) {
2361   const int32_t *cospi = cospi_arr(bit);
2362   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2363   const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
2364   const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
2365   int32x4_t u[16], v[16], x, y;
2366   const int32x4_t v_bit = vdupq_n_s32(-bit);
2367   const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
2368 
2369   {
2370     // stage 0-1
2371     u[0] = in[0];
2372     u[1] = in[8];
2373     u[2] = in[4];
2374     u[3] = in[12];
2375     u[4] = in[2];
2376     u[5] = in[10];
2377     u[6] = in[6];
2378     u[7] = in[14];
2379     u[8] = in[1];
2380     u[9] = in[9];
2381     u[10] = in[5];
2382     u[11] = in[13];
2383     u[12] = in[3];
2384     u[13] = in[11];
2385     u[14] = in[7];
2386     u[15] = in[15];
2387 
2388     // stage 2
2389     v[0] = u[0];
2390     v[1] = u[1];
2391     v[2] = u[2];
2392     v[3] = u[3];
2393     v[4] = u[4];
2394     v[5] = u[5];
2395     v[6] = u[6];
2396     v[7] = u[7];
2397 
2398     v[8] = half_btf_neon_mode01_r(&cospi[60], &u[8], &cospi[4], &u[15], &v_bit,
2399                                   &rnding);
2400     v[9] = half_btf_neon_mode01_r(&cospi[28], &u[9], &cospi[36], &u[14], &v_bit,
2401                                   &rnding);
2402     v[10] = half_btf_neon_mode01_r(&cospi[44], &u[10], &cospi[20], &u[13],
2403                                    &v_bit, &rnding);
2404     v[11] = half_btf_neon_mode01_r(&cospi[12], &u[11], &cospi[52], &u[12],
2405                                    &v_bit, &rnding);
2406     v[12] = half_btf_neon_r(&cospi[52], &u[11], &cospi[12], &u[12], &v_bit,
2407                             &rnding);
2408     v[13] = half_btf_neon_r(&cospi[20], &u[10], &cospi[44], &u[13], &v_bit,
2409                             &rnding);
2410     v[14] =
2411         half_btf_neon_r(&cospi[36], &u[9], &cospi[28], &u[14], &v_bit, &rnding);
2412     v[15] =
2413         half_btf_neon_r(&cospi[4], &u[8], &cospi[60], &u[15], &v_bit, &rnding);
2414 
2415     // stage 3
2416     u[0] = v[0];
2417     u[1] = v[1];
2418     u[2] = v[2];
2419     u[3] = v[3];
2420     u[4] = half_btf_neon_mode01_r(&cospi[56], &v[4], &cospi[8], &v[7], &v_bit,
2421                                   &rnding);
2422     u[5] = half_btf_neon_mode01_r(&cospi[24], &v[5], &cospi[40], &v[6], &v_bit,
2423                                   &rnding);
2424     u[6] =
2425         half_btf_neon_r(&cospi[40], &v[5], &cospi[24], &v[6], &v_bit, &rnding);
2426     u[7] =
2427         half_btf_neon_r(&cospi[8], &v[4], &cospi[56], &v[7], &v_bit, &rnding);
2428     addsub_neon(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
2429     addsub_neon(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
2430     addsub_neon(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
2431     addsub_neon(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
2432 
2433     // stage 4
2434     x = vmlaq_n_s32(rnding, u[0], cospi[32]);
2435     y = vmulq_n_s32(u[1], cospi[32]);
2436     v[0] = vaddq_s32(x, y);
2437     v[0] = vshlq_s32(v[0], v_bit);
2438 
2439     v[1] = vsubq_s32(x, y);
2440     v[1] = vshlq_s32(v[1], v_bit);
2441 
2442     v[2] = half_btf_neon_mode01_r(&cospi[48], &u[2], &cospi[16], &u[3], &v_bit,
2443                                   &rnding);
2444     v[3] =
2445         half_btf_neon_r(&cospi[16], &u[2], &cospi[48], &u[3], &v_bit, &rnding);
2446     addsub_neon(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
2447     addsub_neon(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
2448     v[8] = u[8];
2449     v[9] = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
2450                                   &rnding);
2451     v[10] = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13],
2452                                    &v_bit, &rnding);
2453     v[11] = u[11];
2454     v[12] = u[12];
2455     v[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13],
2456                                    &v_bit, &rnding);
2457     v[14] =
2458         half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
2459     v[15] = u[15];
2460 
2461     // stage 5
2462     addsub_neon(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
2463     addsub_neon(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
2464     u[4] = v[4];
2465 
2466     x = vmulq_n_s32(v[5], cospi[32]);
2467     y = vmlaq_n_s32(rnding, v[6], cospi[32]);
2468     u[5] = vsubq_s32(y, x);
2469     u[5] = vshlq_s32(u[5], v_bit);
2470 
2471     u[6] = vaddq_s32(y, x);
2472     u[6] = vshlq_s32(u[6], v_bit);
2473 
2474     u[7] = v[7];
2475     addsub_neon(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
2476     addsub_neon(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
2477     addsub_neon(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
2478     addsub_neon(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
2479 
2480     // stage 6
2481     addsub_neon(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
2482     addsub_neon(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
2483     addsub_neon(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
2484     addsub_neon(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
2485     v[8] = u[8];
2486     v[9] = u[9];
2487 
2488     x = vmulq_n_s32(u[10], cospi[32]);
2489     y = vmlaq_n_s32(rnding, u[13], cospi[32]);
2490     v[10] = vsubq_s32(y, x);
2491     v[10] = vshlq_s32(v[10], v_bit);
2492 
2493     v[13] = vaddq_s32(x, y);
2494     v[13] = vshlq_s32(v[13], v_bit);
2495 
2496     x = vmulq_n_s32(u[11], cospi[32]);
2497     y = vmlaq_n_s32(rnding, u[12], cospi[32]);
2498     v[11] = vsubq_s32(y, x);
2499     v[11] = vshlq_s32(v[11], v_bit);
2500 
2501     v[12] = vaddq_s32(x, y);
2502     v[12] = vshlq_s32(v[12], v_bit);
2503 
2504     v[14] = u[14];
2505     v[15] = u[15];
2506 
2507     // stage 7
2508     addsub_neon(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
2509     addsub_neon(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
2510     addsub_neon(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
2511     addsub_neon(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
2512     addsub_neon(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
2513     addsub_neon(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
2514     addsub_neon(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
2515     addsub_neon(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
2516 
2517     if (!do_cols) {
2518       const int log_range_out = AOMMAX(16, bd + 6);
2519       const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
2520       const int32x4_t clamp_hi_out =
2521           vdupq_n_s32((1 << (log_range_out - 1)) - 1);
2522       round_shift_8x8(out, out_shift);
2523       highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16);
2524     }
2525   }
2526 }
2527 
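// Full 16-point inverse ADST applied to four columns packed in int32x4_t
// lanes; stage 9 applies the alternating sign/permutation of the ADST output
// order.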
2528 static void iadst16x16_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
2529                             int bd, int out_shift) {
2530   const int32_t *cospi = cospi_arr(bit);
2531   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2532   const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
2533   const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
2534   const int32x4_t zero = vdupq_n_s32(0);
2535   const int32x4_t v_bit = vdupq_n_s32(-bit);
2536   const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
2537   int32x4_t u[16], v[16], x, y;
2538   // Calculate columns 0, 1, 2, 3
2539   // stage 0
2540   // stage 1
2541   // stage 2
2542   v[0] = vmlaq_n_s32(rnding, in[15], cospi[2]);
2543   v[0] = vmlaq_n_s32(v[0], in[0], cospi[62]);
2544   v[0] = vshlq_s32(v[0], v_bit);
2545 
2546   v[1] = vmlaq_n_s32(rnding, in[15], cospi[62]);
2547   v[1] = vmlsq_n_s32(v[1], in[0], cospi[2]);
2548   v[1] = vshlq_s32(v[1], v_bit);
2549 
2550   v[2] = vmlaq_n_s32(rnding, in[13], cospi[10]);
2551   v[2] = vmlaq_n_s32(v[2], in[2], cospi[54]);
2552   v[2] = vshlq_s32(v[2], v_bit);
2553 
2554   v[3] = vmlaq_n_s32(rnding, in[13], cospi[54]);
2555   v[3] = vmlsq_n_s32(v[3], in[2], cospi[10]);
2556   v[3] = vshlq_s32(v[3], v_bit);
2557 
2558   v[4] = vmlaq_n_s32(rnding, in[11], cospi[18]);
2559   v[4] = vmlaq_n_s32(v[4], in[4], cospi[46]);
2560   v[4] = vshlq_s32(v[4], v_bit);
2561 
2562   v[5] = vmlaq_n_s32(rnding, in[11], cospi[46]);
2563   v[5] = vmlsq_n_s32(v[5], in[4], cospi[18]);
2564   v[5] = vshlq_s32(v[5], v_bit);
2565 
2566   v[6] = vmlaq_n_s32(rnding, in[9], cospi[26]);
2567   v[6] = vmlaq_n_s32(v[6], in[6], cospi[38]);
2568   v[6] = vshlq_s32(v[6], v_bit);
2569 
2570   v[7] = vmlaq_n_s32(rnding, in[9], cospi[38]);
2571   v[7] = vmlsq_n_s32(v[7], in[6], cospi[26]);
2572   v[7] = vshlq_s32(v[7], v_bit);
2573 
2574   v[8] = vmlaq_n_s32(rnding, in[7], cospi[34]);
2575   v[8] = vmlaq_n_s32(v[8], in[8], cospi[30]);
2576   v[8] = vshlq_s32(v[8], v_bit);
2577 
2578   v[9] = vmlaq_n_s32(rnding, in[7], cospi[30]);
2579   v[9] = vmlsq_n_s32(v[9], in[8], cospi[34]);
2580   v[9] = vshlq_s32(v[9], v_bit);
2581 
2582   v[10] = vmlaq_n_s32(rnding, in[5], cospi[42]);
2583   v[10] = vmlaq_n_s32(v[10], in[10], cospi[22]);
2584   v[10] = vshlq_s32(v[10], v_bit);
2585 
2586   v[11] = vmlaq_n_s32(rnding, in[5], cospi[22]);
2587   v[11] = vmlsq_n_s32(v[11], in[10], cospi[42]);
2588   v[11] = vshlq_s32(v[11], v_bit);
2589 
2590   v[12] = vmlaq_n_s32(rnding, in[3], cospi[50]);
2591   v[12] = vmlaq_n_s32(v[12], in[12], cospi[14]);
2592   v[12] = vshlq_s32(v[12], v_bit);
2593 
2594   v[13] = vmlaq_n_s32(rnding, in[3], cospi[14]);
2595   v[13] = vmlsq_n_s32(v[13], in[12], cospi[50]);
2596   v[13] = vshlq_s32(v[13], v_bit);
2597 
2598   v[14] = vmlaq_n_s32(rnding, in[1], cospi[58]);
2599   v[14] = vmlaq_n_s32(v[14], in[14], cospi[6]);
2600   v[14] = vshlq_s32(v[14], v_bit);
2601 
2602   v[15] = vmlaq_n_s32(rnding, in[1], cospi[6]);
2603   v[15] = vmlsq_n_s32(v[15], in[14], cospi[58]);
2604   v[15] = vshlq_s32(v[15], v_bit);
2605 
2606   // stage 3
2607   addsub_neon(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
2608   addsub_neon(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
2609   addsub_neon(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
2610   addsub_neon(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
2611   addsub_neon(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
2612   addsub_neon(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
2613   addsub_neon(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
2614   addsub_neon(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
2615 
2616   // stage 4
2617   v[0] = u[0];
2618   v[1] = u[1];
2619   v[2] = u[2];
2620   v[3] = u[3];
2621   v[4] = u[4];
2622   v[5] = u[5];
2623   v[6] = u[6];
2624   v[7] = u[7];
2625 
2626   v[8] = vmlaq_n_s32(rnding, u[8], cospi[8]);
2627   v[8] = vmlaq_n_s32(v[8], u[9], cospi[56]);
2628   v[8] = vshlq_s32(v[8], v_bit);
2629 
2630   v[9] = vmlaq_n_s32(rnding, u[8], cospi[56]);
2631   v[9] = vmlsq_n_s32(v[9], u[9], cospi[8]);
2632   v[9] = vshlq_s32(v[9], v_bit);
2633 
2634   v[10] = vmlaq_n_s32(rnding, u[10], cospi[40]);
2635   v[10] = vmlaq_n_s32(v[10], u[11], cospi[24]);
2636   v[10] = vshlq_s32(v[10], v_bit);
2637 
2638   v[11] = vmlaq_n_s32(rnding, u[10], cospi[24]);
2639   v[11] = vmlsq_n_s32(v[11], u[11], cospi[40]);
2640   v[11] = vshlq_s32(v[11], v_bit);
2641 
2642   v[12] = vmlaq_n_s32(rnding, u[12], -cospi[56]);
2643   v[12] = vmlaq_n_s32(v[12], u[13], cospi[8]);
2644   v[12] = vshlq_s32(v[12], v_bit);
2645 
2646   v[13] = vmlaq_n_s32(rnding, u[12], cospi[8]);
2647   v[13] = vmlsq_n_s32(v[13], u[13], -cospi[56]);
2648   v[13] = vshlq_s32(v[13], v_bit);
2649 
2650   v[14] = vmlaq_n_s32(rnding, u[14], -cospi[24]);
2651   v[14] = vmlaq_n_s32(v[14], u[15], cospi[40]);
2652   v[14] = vshlq_s32(v[14], v_bit);
2653 
2654   v[15] = vmlaq_n_s32(rnding, u[14], cospi[40]);
2655   v[15] = vmlsq_n_s32(v[15], u[15], -cospi[24]);
2656   v[15] = vshlq_s32(v[15], v_bit);
2657 
2658   // stage 5
2659   addsub_neon(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
2660   addsub_neon(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
2661   addsub_neon(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
2662   addsub_neon(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
2663   addsub_neon(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
2664   addsub_neon(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
2665   addsub_neon(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
2666   addsub_neon(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
2667 
2668   // stage 6
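  // v[0..3] and v[8..11] pass through; the (4,5), (6,7), (12,13) and (14,15)
  // pairs are rotated by cospi[16]/cospi[48].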
2669   v[0] = u[0];
2670   v[1] = u[1];
2671   v[2] = u[2];
2672   v[3] = u[3];
2673 
2674   v[4] = vmlaq_n_s32(rnding, u[4], cospi[16]);
2675   v[4] = vmlaq_n_s32(v[4], u[5], cospi[48]);
2676   v[4] = vshlq_s32(v[4], v_bit);
2677 
2678   v[5] = vmlaq_n_s32(rnding, u[4], cospi[48]);
2679   v[5] = vmlsq_n_s32(v[5], u[5], cospi[16]);
2680   v[5] = vshlq_s32(v[5], v_bit);
2681 
2682   v[6] = vmlaq_n_s32(rnding, u[6], -cospi[48]);
2683   v[6] = vmlaq_n_s32(v[6], u[7], cospi[16]);
2684   v[6] = vshlq_s32(v[6], v_bit);
2685 
2686   v[7] = vmlaq_n_s32(rnding, u[6], cospi[16]);
2687   v[7] = vmlsq_n_s32(v[7], u[7], -cospi[48]);
2688   v[7] = vshlq_s32(v[7], v_bit);
2689 
2690   v[8] = u[8];
2691   v[9] = u[9];
2692   v[10] = u[10];
2693   v[11] = u[11];
2694 
2695   v[12] = vmlaq_n_s32(rnding, u[12], cospi[16]);
2696   v[12] = vmlaq_n_s32(v[12], u[13], cospi[48]);
2697   v[12] = vshlq_s32(v[12], v_bit);
2698 
2699   v[13] = vmlaq_n_s32(rnding, u[12], cospi[48]);
2700   v[13] = vmlsq_n_s32(v[13], u[13], cospi[16]);
2701   v[13] = vshlq_s32(v[13], v_bit);
2702 
2703   v[14] = vmlaq_n_s32(rnding, u[14], -cospi[48]);
2704   v[14] = vmlaq_n_s32(v[14], u[15], cospi[16]);
2705   v[14] = vshlq_s32(v[14], v_bit);
2706 
2707   v[15] = vmlaq_n_s32(rnding, u[14], cospi[16]);
2708   v[15] = vmlsq_n_s32(v[15], u[15], -cospi[48]);
2709   v[15] = vshlq_s32(v[15], v_bit);
2710 
2711   // stage 7
2712   addsub_neon(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
2713   addsub_neon(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
2714   addsub_neon(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
2715   addsub_neon(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
2716   addsub_neon(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
2717   addsub_neon(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
2718   addsub_neon(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
2719   addsub_neon(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
2720 
2721   // stage 8
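  // cospi[32] rotations on the (2,3), (6,7), (10,11) and (14,15) pairs; the
  // remaining terms pass through unchanged.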
2722   v[0] = u[0];
2723   v[1] = u[1];
2724 
2725   y = vmlaq_n_s32(rnding, u[2], cospi[32]);
2726   x = vmulq_n_s32(u[3], cospi[32]);
2727   v[2] = vaddq_s32(y, x);
2728   v[2] = vshlq_s32(v[2], v_bit);
2729 
2730   v[3] = vsubq_s32(y, x);
2731   v[3] = vshlq_s32(v[3], v_bit);
2732 
2733   v[4] = u[4];
2734   v[5] = u[5];
2735 
2736   y = vmlaq_n_s32(rnding, u[6], cospi[32]);
2737   x = vmulq_n_s32(u[7], cospi[32]);
2738   v[6] = vaddq_s32(y, x);
2739   v[6] = vshlq_s32(v[6], v_bit);
2740 
2741   v[7] = vsubq_s32(y, x);
2742   v[7] = vshlq_s32(v[7], v_bit);
2743 
2744   v[8] = u[8];
2745   v[9] = u[9];
2746 
2747   y = vmlaq_n_s32(rnding, u[10], cospi[32]);
2748   x = vmulq_n_s32(u[11], cospi[32]);
2749   v[10] = vaddq_s32(y, x);
2750   v[10] = vshlq_s32(v[10], v_bit);
2751 
2752   v[11] = vsubq_s32(y, x);
2753   v[11] = vshlq_s32(v[11], v_bit);
2754 
2755   v[12] = u[12];
2756   v[13] = u[13];
2757 
2758   y = vmlaq_n_s32(rnding, u[14], cospi[32]);
2759   x = vmulq_n_s32(u[15], cospi[32]);
2760   v[14] = vaddq_s32(y, x);
2761   v[14] = vshlq_s32(v[14], v_bit);
2762 
2763   v[15] = vsubq_s32(y, x);
2764   v[15] = vshlq_s32(v[15], v_bit);
2765 
2766   // stage 9
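  // Final permutation: every odd-indexed output is the negation of one of the
  // intermediate values. Column transforms keep full precision; row transforms
  // are rounded by out_shift and clamped to the output range below.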
2767   if (do_cols) {
2768     out[0] = v[0];
2769     out[1] = vsubq_s32(zero, v[8]);
2770     out[2] = v[12];
2771     out[3] = vsubq_s32(zero, v[4]);
2772     out[4] = v[6];
2773     out[5] = vsubq_s32(zero, v[14]);
2774     out[6] = v[10];
2775     out[7] = vsubq_s32(zero, v[2]);
2776     out[8] = v[3];
2777     out[9] = vsubq_s32(zero, v[11]);
2778     out[10] = v[15];
2779     out[11] = vsubq_s32(zero, v[7]);
2780     out[12] = v[5];
2781     out[13] = vsubq_s32(zero, v[13]);
2782     out[14] = v[9];
2783     out[15] = vsubq_s32(zero, v[1]);
2784   } else {
2785     const int log_range_out = AOMMAX(16, bd + 6);
2786     const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
2787     const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
2788     const int32x4_t v_shift = vdupq_n_s32(-out_shift);
2789     int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
2790     neg_shift_neon(&v[0], &v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2791                    &v_shift, &offset);
2792     neg_shift_neon(&v[12], &v[4], out + 2, out + 3, &clamp_lo_out,
2793                    &clamp_hi_out, &v_shift, &offset);
2794     neg_shift_neon(&v[6], &v[14], out + 4, out + 5, &clamp_lo_out,
2795                    &clamp_hi_out, &v_shift, &offset);
2796     neg_shift_neon(&v[10], &v[2], out + 6, out + 7, &clamp_lo_out,
2797                    &clamp_hi_out, &v_shift, &offset);
2798     neg_shift_neon(&v[3], &v[11], out + 8, out + 9, &clamp_lo_out,
2799                    &clamp_hi_out, &v_shift, &offset);
2800     neg_shift_neon(&v[15], &v[7], out + 10, out + 11, &clamp_lo_out,
2801                    &clamp_hi_out, &v_shift, &offset);
2802     neg_shift_neon(&v[5], &v[13], out + 12, out + 13, &clamp_lo_out,
2803                    &clamp_hi_out, &v_shift, &offset);
2804     neg_shift_neon(&v[9], &v[1], out + 14, out + 15, &clamp_lo_out,
2805                    &clamp_hi_out, &v_shift, &offset);
2806   }
2807 }
2808 
2809 static void iidentity16_neon(int32x4_t *in, int32x4_t *out, int bit,
2810                              int do_cols, int bd, int out_shift) {
2811   (void)bit;
2812   int32x2_t fact = vdup_n_s32(2 * NewSqrt2);
2813   int32x4x2_t a0;
2814   int32x4_t zero = vdupq_n_s32(0);
2815   const int64x2_t rnding = vdupq_n_s64(1 << (NewSqrt2Bits - 1));
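  // Apply the 2 * NewSqrt2 scaling in 64-bit precision: even and odd 32-bit
  // lanes are widened, multiplied, rounded, shifted back down and
  // re-interleaved into a single 32-bit vector.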
2816   for (int i = 0; i < 16; i++) {
2817     a0.val[0] = vreinterpretq_s32_s64(
2818         vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(in[i])), fact));
2819     a0.val[0] = vreinterpretq_s32_s64(
2820         vshrq_n_s64(vreinterpretq_s64_s32(a0.val[0]), NewSqrt2Bits));
2821     a0.val[1] = vextq_s32(in[i], zero, 1);
2822     a0.val[1] = vreinterpretq_s32_s64(
2823         vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(a0.val[1])), fact));
2824     a0.val[1] = vreinterpretq_s32_s64(
2825         vshrq_n_s64(vreinterpretq_s64_s32(a0.val[1]), NewSqrt2Bits));
2826     a0 = vzipq_s32(a0.val[0], a0.val[1]);
2827 #if AOM_ARCH_AARCH64
2828     out[i] = vreinterpretq_s32_s64(vzip1q_s64(
2829         vreinterpretq_s64_s32(a0.val[0]), vreinterpretq_s64_s32(a0.val[1])));
2830 #else
2831     out[i] = vextq_s32(vextq_s32(a0.val[0], a0.val[0], 2), a0.val[1], 2);
2832 #endif
2833   }
2834 
2835   if (!do_cols) {
2836     const int log_range = AOMMAX(16, bd + 6);
2837     const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
2838     const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
2839     round_shift_8x8(out, out_shift);
2840     highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 16);
2841   }
2842 }
2843 
2844 static inline void idct64_stage8_neon(int32x4_t *u, const int32_t *cospi,
2845                                       const int32x4_t *clamp_lo,
2846                                       const int32x4_t *clamp_hi,
2847                                       const int32x4_t *v_bit,
2848                                       const int32x4_t *rnding) {
2849   int i;
2850   int32x4_t temp1, temp2, temp3, temp4;
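  // cospi[32] mixes the (10,13) and (11,12) pairs; mirrored butterflies
  // combine u[16..31]; cospi[16]/cospi[48] rotations mix u[36..43] with
  // u[52..59].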
2851   temp1 = half_btf_neon_mode10_r(&cospi[32], &u[10], &cospi[32], &u[13], v_bit,
2852                                  rnding);
2853   u[13] =
2854       half_btf_neon_r(&cospi[32], &u[10], &cospi[32], &u[13], v_bit, rnding);
2855   u[10] = temp1;
2856   temp2 = half_btf_neon_mode10_r(&cospi[32], &u[11], &cospi[32], &u[12], v_bit,
2857                                  rnding);
2858   u[12] =
2859       half_btf_neon_r(&cospi[32], &u[11], &cospi[32], &u[12], v_bit, rnding);
2860   u[11] = temp2;
2861 
2862   for (i = 16; i < 20; ++i) {
2863     addsub_neon(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
2864     addsub_neon(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi);
2865   }
2866 
2867   temp1 = half_btf_neon_mode10_r(&cospi[16], &u[36], &cospi[48], &u[59], v_bit,
2868                                  rnding);
2869   temp2 = half_btf_neon_mode10_r(&cospi[16], &u[37], &cospi[48], &u[58], v_bit,
2870                                  rnding);
2871   temp3 = half_btf_neon_mode10_r(&cospi[16], &u[38], &cospi[48], &u[57], v_bit,
2872                                  rnding);
2873   temp4 = half_btf_neon_mode10_r(&cospi[16], &u[39], &cospi[48], &u[56], v_bit,
2874                                  rnding);
2875   u[56] =
2876       half_btf_neon_r(&cospi[48], &u[39], &cospi[16], &u[56], v_bit, rnding);
2877   u[57] =
2878       half_btf_neon_r(&cospi[48], &u[38], &cospi[16], &u[57], v_bit, rnding);
2879   u[58] =
2880       half_btf_neon_r(&cospi[48], &u[37], &cospi[16], &u[58], v_bit, rnding);
2881   u[59] =
2882       half_btf_neon_r(&cospi[48], &u[36], &cospi[16], &u[59], v_bit, rnding);
2883   u[36] = temp1;
2884   u[37] = temp2;
2885   u[38] = temp3;
2886   u[39] = temp4;
2887 
2888   temp1 = half_btf_neon_mode11_r(&cospi[48], &u[40], &cospi[16], &u[55], v_bit,
2889                                  rnding);
2890   temp2 = half_btf_neon_mode11_r(&cospi[48], &u[41], &cospi[16], &u[54], v_bit,
2891                                  rnding);
2892   temp3 = half_btf_neon_mode11_r(&cospi[48], &u[42], &cospi[16], &u[53], v_bit,
2893                                  rnding);
2894   temp4 = half_btf_neon_mode11_r(&cospi[48], &u[43], &cospi[16], &u[52], v_bit,
2895                                  rnding);
2896   u[52] = half_btf_neon_mode10_r(&cospi[16], &u[43], &cospi[48], &u[52], v_bit,
2897                                  rnding);
2898   u[53] = half_btf_neon_mode10_r(&cospi[16], &u[42], &cospi[48], &u[53], v_bit,
2899                                  rnding);
2900   u[54] = half_btf_neon_mode10_r(&cospi[16], &u[41], &cospi[48], &u[54], v_bit,
2901                                  rnding);
2902   u[55] = half_btf_neon_mode10_r(&cospi[16], &u[40], &cospi[48], &u[55], v_bit,
2903                                  rnding);
2904   u[40] = temp1;
2905   u[41] = temp2;
2906   u[42] = temp3;
2907   u[43] = temp4;
2908 }
2909 
2910 static inline void idct64_stage9_neon(int32x4_t *u, const int32_t *cospi,
2911                                       const int32x4_t *clamp_lo,
2912                                       const int32x4_t *clamp_hi,
2913                                       const int32x4_t *v_bit,
2914                                       const int32x4_t *rnding) {
2915   int i;
2916   int32x4_t temp1, temp2, temp3, temp4;
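  // Butterflies across u[0..15], cospi[32] mixing of u[20..23] with u[24..27],
  // and butterflies within u[32..47] and u[48..63].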
2917   for (i = 0; i < 8; ++i) {
2918     addsub_neon(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
2919   }
2920   temp1 = half_btf_neon_mode10_r(&cospi[32], &u[20], &cospi[32], &u[27], v_bit,
2921                                  rnding);
2922   temp2 = half_btf_neon_mode10_r(&cospi[32], &u[21], &cospi[32], &u[26], v_bit,
2923                                  rnding);
2924   temp3 = half_btf_neon_mode10_r(&cospi[32], &u[22], &cospi[32], &u[25], v_bit,
2925                                  rnding);
2926   temp4 = half_btf_neon_mode10_r(&cospi[32], &u[23], &cospi[32], &u[24], v_bit,
2927                                  rnding);
2928   u[24] =
2929       half_btf_neon_r(&cospi[32], &u[23], &cospi[32], &u[24], v_bit, rnding);
2930   u[25] =
2931       half_btf_neon_r(&cospi[32], &u[22], &cospi[32], &u[25], v_bit, rnding);
2932   u[26] =
2933       half_btf_neon_r(&cospi[32], &u[21], &cospi[32], &u[26], v_bit, rnding);
2934   u[27] =
2935       half_btf_neon_r(&cospi[32], &u[20], &cospi[32], &u[27], v_bit, rnding);
2936   u[20] = temp1;
2937   u[21] = temp2;
2938   u[22] = temp3;
2939   u[23] = temp4;
2940   for (i = 32; i < 40; i++) {
2941     addsub_neon(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
2942   }
2943 
2944   for (i = 48; i < 56; i++) {
2945     addsub_neon(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
2946   }
2947 }
2948 
2949 static inline void idct64_stage10_neon(int32x4_t *u, const int32_t *cospi,
2950                                        const int32x4_t *clamp_lo,
2951                                        const int32x4_t *clamp_hi,
2952                                        const int32x4_t *v_bit,
2953                                        const int32x4_t *rnding) {
2954   int32x4_t temp1, temp2, temp3, temp4;
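  // Butterflies across u[0..31] and cospi[32] mixing of u[40..47] with
  // u[48..55].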
2955   for (int i = 0; i < 16; i++) {
2956     addsub_neon(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
2957   }
2958   temp1 = half_btf_neon_mode10_r(&cospi[32], &u[40], &cospi[32], &u[55], v_bit,
2959                                  rnding);
2960   temp2 = half_btf_neon_mode10_r(&cospi[32], &u[41], &cospi[32], &u[54], v_bit,
2961                                  rnding);
2962   temp3 = half_btf_neon_mode10_r(&cospi[32], &u[42], &cospi[32], &u[53], v_bit,
2963                                  rnding);
2964   temp4 = half_btf_neon_mode10_r(&cospi[32], &u[43], &cospi[32], &u[52], v_bit,
2965                                  rnding);
2966   u[52] =
2967       half_btf_neon_r(&cospi[32], &u[43], &cospi[32], &u[52], v_bit, rnding);
2968   u[53] =
2969       half_btf_neon_r(&cospi[32], &u[42], &cospi[32], &u[53], v_bit, rnding);
2970   u[54] =
2971       half_btf_neon_r(&cospi[32], &u[41], &cospi[32], &u[54], v_bit, rnding);
2972   u[55] =
2973       half_btf_neon_r(&cospi[32], &u[40], &cospi[32], &u[55], v_bit, rnding);
2974   u[40] = temp1;
2975   u[41] = temp2;
2976   u[42] = temp3;
2977   u[43] = temp4;
2978 
2979   temp1 = half_btf_neon_mode10_r(&cospi[32], &u[44], &cospi[32], &u[51], v_bit,
2980                                  rnding);
2981   temp2 = half_btf_neon_mode10_r(&cospi[32], &u[45], &cospi[32], &u[50], v_bit,
2982                                  rnding);
2983   temp3 = half_btf_neon_mode10_r(&cospi[32], &u[46], &cospi[32], &u[49], v_bit,
2984                                  rnding);
2985   temp4 = half_btf_neon_mode10_r(&cospi[32], &u[47], &cospi[32], &u[48], v_bit,
2986                                  rnding);
2987   u[48] =
2988       half_btf_neon_r(&cospi[32], &u[47], &cospi[32], &u[48], v_bit, rnding);
2989   u[49] =
2990       half_btf_neon_r(&cospi[32], &u[46], &cospi[32], &u[49], v_bit, rnding);
2991   u[50] =
2992       half_btf_neon_r(&cospi[32], &u[45], &cospi[32], &u[50], v_bit, rnding);
2993   u[51] =
2994       half_btf_neon_r(&cospi[32], &u[44], &cospi[32], &u[51], v_bit, rnding);
2995   u[44] = temp1;
2996   u[45] = temp2;
2997   u[46] = temp3;
2998   u[47] = temp4;
2999 }
3000 
3001 static inline void idct64_stage11_neon(int32x4_t *u, int32x4_t *out,
3002                                        int do_cols, int bd, int out_shift,
3003                                        const int32x4_t *clamp_lo,
3004                                        const int32x4_t *clamp_hi) {
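  // Final butterflies across u[0..63] produce all 64 outputs; row transforms
  // are then rounded by out_shift and clamped to the output range.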
3005   for (int i = 0; i < 32; i++) {
3006     addsub_neon(u[i], u[63 - i], out + i, out + 63 - i, clamp_lo, clamp_hi);
3007   }
3008 
3009   if (!do_cols) {
3010     const int log_range_out = AOMMAX(16, bd + 6);
3011     const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
3012     const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
3013     for (int i = 0; i < 64; i += 4) {
3014       round_shift_4x4(out + i, out_shift);
3015       highbd_clamp_s32_neon(out + i, out + i, &clamp_lo_out, &clamp_hi_out, 4);
3016     }
3017   }
3018 }
3019 
3020 static void idct64x64_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
3021                                 int do_cols, int bd, int out_shift) {
3022   const int32_t *cospi = cospi_arr(bit);
3023   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3024   int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
3025   int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
3026 
3027   const int32x4_t v_bit = vdupq_n_s32(-bit);
3028   const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
3029   {
3030     int32x4_t x;
3031 
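    // Only the DC coefficient is non-zero, so the whole transform reduces to a
    // single cospi[32] scaling that is replicated across all 64 outputs.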
3032     // stage 1
3033     // stage 2
3034     // stage 3
3035     // stage 4
3036     // stage 5
3037     // stage 6
3038     x = half_btf_0_neon_r(&cospi[32], &in[0], &v_bit, &rnding);
3039 
3040     // stage 8
3041     // stage 9
3042     // stage 10
3043     // stage 11
3044     if (!do_cols) {
3045       const int log_range_out = AOMMAX(16, bd + 6);
3046       clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1)));
3047       clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
3048       if (out_shift != 0) {
3049         int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
3050         x = vaddq_s32(x, offset);
3051         x = vshlq_s32(x, vdupq_n_s32(-out_shift));
3052       }
3053     }
3054     x = vmaxq_s32(x, clamp_lo);
3055     x = vminq_s32(x, clamp_hi);
3056     out[0] = x;
3057     out[1] = x;
3058     out[2] = x;
3059     out[3] = x;
3060     out[4] = x;
3061     out[5] = x;
3062     out[6] = x;
3063     out[7] = x;
3064     out[8] = x;
3065     out[9] = x;
3066     out[10] = x;
3067     out[11] = x;
3068     out[12] = x;
3069     out[13] = x;
3070     out[14] = x;
3071     out[15] = x;
3072     out[16] = x;
3073     out[17] = x;
3074     out[18] = x;
3075     out[19] = x;
3076     out[20] = x;
3077     out[21] = x;
3078     out[22] = x;
3079     out[23] = x;
3080     out[24] = x;
3081     out[25] = x;
3082     out[26] = x;
3083     out[27] = x;
3084     out[28] = x;
3085     out[29] = x;
3086     out[30] = x;
3087     out[31] = x;
3088     out[32] = x;
3089     out[33] = x;
3090     out[34] = x;
3091     out[35] = x;
3092     out[36] = x;
3093     out[37] = x;
3094     out[38] = x;
3095     out[39] = x;
3096     out[40] = x;
3097     out[41] = x;
3098     out[42] = x;
3099     out[43] = x;
3100     out[44] = x;
3101     out[45] = x;
3102     out[46] = x;
3103     out[47] = x;
3104     out[48] = x;
3105     out[49] = x;
3106     out[50] = x;
3107     out[51] = x;
3108     out[52] = x;
3109     out[53] = x;
3110     out[54] = x;
3111     out[55] = x;
3112     out[56] = x;
3113     out[57] = x;
3114     out[58] = x;
3115     out[59] = x;
3116     out[60] = x;
3117     out[61] = x;
3118     out[62] = x;
3119     out[63] = x;
3120   }
3121 }
3122 
3123 static void idct64x64_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
3124                                 int do_cols, int bd, int out_shift) {
3125   int i, j;
3126   const int32_t *cospi = cospi_arr(bit);
3127   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3128   const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
3129   const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
3130   const int32x4_t v_bit = vdupq_n_s32(-bit);
3131   const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
3132   {
3133     int32x4_t u[64];
3134 
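    // Only in[0..7] carry non-zero coefficients, so stage 1 seeds just eight
    // of the 64 working registers; later stages fan them out.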
3135     // stage 1
3136     u[0] = in[0];
3137     u[8] = in[4];
3138     u[16] = in[2];
3139     u[24] = in[6];
3140     u[32] = in[1];
3141     u[40] = in[5];
3142     u[48] = in[3];
3143     u[56] = in[7];
3144 
3145     // stage 2
3146     u[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding);
3147     u[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding);
3148     u[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding);
3149     u[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding);
3150     u[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding);
3151     u[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding);
3152     u[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding);
3153     u[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding);
3154 
3155     // stage 3
3156     u[31] = half_btf_0_neon_r(&cospi[2], &u[16], &v_bit, &rnding);
3157     u[16] = half_btf_0_neon_r(&cospi[62], &u[16], &v_bit, &rnding);
3158     u[23] = half_btf_0_m_neon_r(&cospi[58], &u[24], &v_bit, &rnding);
3159     u[24] = half_btf_0_neon_r(&cospi[6], &u[24], &v_bit, &rnding);
3160     u[33] = u[32];
3161     u[38] = u[39];
3162     u[41] = u[40];
3163     u[46] = u[47];
3164     u[49] = u[48];
3165     u[54] = u[55];
3166     u[57] = u[56];
3167     u[62] = u[63];
3168 
3169     // stage 4
3170     int32x4_t temp1, temp2;
3171     u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);
3172     u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);
3173     u[17] = u[16];
3174     u[22] = u[23];
3175     u[25] = u[24];
3176     u[30] = u[31];
3177 
3178     temp1 = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62],
3179                                    &v_bit, &rnding);
3180     u[62] =
3181         half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding);
3182     u[33] = temp1;
3183 
3184     temp2 = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57],
3185                                    &v_bit, &rnding);
3186     u[38] = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57],
3187                                    &v_bit, &rnding);
3188     u[57] = temp2;
3189 
3190     temp1 = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54],
3191                                    &v_bit, &rnding);
3192     u[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit,
3193                             &rnding);
3194     u[41] = temp1;
3195 
3196     temp2 = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49],
3197                                    &v_bit, &rnding);
3198     u[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49],
3199                                    &v_bit, &rnding);
3200     u[46] = temp2;
3201 
3202     // stage 5
3203     u[9] = u[8];
3204     u[14] = u[15];
3205 
3206     temp1 = half_btf_neon_mode10_r(&cospi[8], &u[17], &cospi[56], &u[30],
3207                                    &v_bit, &rnding);
3208     u[30] =
3209         half_btf_neon_r(&cospi[56], &u[17], &cospi[8], &u[30], &v_bit, &rnding);
3210     u[17] = temp1;
3211 
3212     temp2 = half_btf_neon_mode11_r(&cospi[24], &u[22], &cospi[40], &u[25],
3213                                    &v_bit, &rnding);
3214     u[25] = half_btf_neon_mode10_r(&cospi[40], &u[22], &cospi[24], &u[25],
3215                                    &v_bit, &rnding);
3216     u[22] = temp2;
3217 
3218     u[35] = u[32];
3219     u[34] = u[33];
3220     u[36] = u[39];
3221     u[37] = u[38];
3222     u[43] = u[40];
3223     u[42] = u[41];
3224     u[44] = u[47];
3225     u[45] = u[46];
3226     u[51] = u[48];
3227     u[50] = u[49];
3228     u[52] = u[55];
3229     u[53] = u[54];
3230     u[59] = u[56];
3231     u[58] = u[57];
3232     u[60] = u[63];
3233     u[61] = u[62];
3234 
3235     // stage 6
3236     temp1 = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
3237     u[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
3238     u[0] = temp1;
3239 
3240     temp2 = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14],
3241                                    &v_bit, &rnding);
3242     u[14] =
3243         half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
3244     u[9] = temp2;
3245     u[19] = u[16];
3246     u[18] = u[17];
3247     u[20] = u[23];
3248     u[21] = u[22];
3249     u[27] = u[24];
3250     u[26] = u[25];
3251     u[28] = u[31];
3252     u[29] = u[30];
3253 
3254     temp1 = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61],
3255                                    &v_bit, &rnding);
3256     u[61] =
3257         half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding);
3258     u[34] = temp1;
3259     temp2 = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60],
3260                                    &v_bit, &rnding);
3261     u[60] =
3262         half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding);
3263     u[35] = temp2;
3264     temp1 = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59],
3265                                    &v_bit, &rnding);
3266     u[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59],
3267                                    &v_bit, &rnding);
3268     u[36] = temp1;
3269     temp2 = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58],
3270                                    &v_bit, &rnding);
3271     u[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58],
3272                                    &v_bit, &rnding);
3273     u[37] = temp2;
3274     temp1 = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53],
3275                                    &v_bit, &rnding);
3276     u[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit,
3277                             &rnding);
3278     u[42] = temp1;
3279     temp2 = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52],
3280                                    &v_bit, &rnding);
3281     u[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit,
3282                             &rnding);
3283     u[43] = temp2;
3284     temp1 = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51],
3285                                    &v_bit, &rnding);
3286     u[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51],
3287                                    &v_bit, &rnding);
3288     u[44] = temp1;
3289     temp2 = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50],
3290                                    &v_bit, &rnding);
3291     u[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50],
3292                                    &v_bit, &rnding);
3293     u[45] = temp2;
3294 
3295     // stage 7
3296     u[3] = u[0];
3297     u[2] = u[1];
3298     u[11] = u[8];
3299     u[10] = u[9];
3300     u[12] = u[15];
3301     u[13] = u[14];
3302 
3303     temp1 = half_btf_neon_mode10_r(&cospi[16], &u[18], &cospi[48], &u[29],
3304                                    &v_bit, &rnding);
3305     u[29] = half_btf_neon_r(&cospi[48], &u[18], &cospi[16], &u[29], &v_bit,
3306                             &rnding);
3307     u[18] = temp1;
3308     temp2 = half_btf_neon_mode10_r(&cospi[16], &u[19], &cospi[48], &u[28],
3309                                    &v_bit, &rnding);
3310     u[28] = half_btf_neon_r(&cospi[48], &u[19], &cospi[16], &u[28], &v_bit,
3311                             &rnding);
3312     u[19] = temp2;
3313     temp1 = half_btf_neon_mode11_r(&cospi[48], &u[20], &cospi[16], &u[27],
3314                                    &v_bit, &rnding);
3315     u[27] = half_btf_neon_mode10_r(&cospi[16], &u[20], &cospi[48], &u[27],
3316                                    &v_bit, &rnding);
3317     u[20] = temp1;
3318     temp2 = half_btf_neon_mode11_r(&cospi[48], &u[21], &cospi[16], &u[26],
3319                                    &v_bit, &rnding);
3320     u[26] = half_btf_neon_mode10_r(&cospi[16], &u[21], &cospi[48], &u[26],
3321                                    &v_bit, &rnding);
3322     u[21] = temp2;
3323     for (i = 32; i < 64; i += 16) {
3324       for (j = i; j < i + 4; j++) {
3325         addsub_neon(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3326         addsub_neon(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3327                     &clamp_hi);
3328       }
3329     }
3330 
3331     // stage 8
3332     u[7] = u[0];
3333     u[6] = u[1];
3334     u[5] = u[2];
3335     u[4] = u[3];
3336     u[9] = u[9];  // no-op: u[9] already holds its stage-7 value
3337 
3338     idct64_stage8_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
3339 
3340     // stage 9
3341     idct64_stage9_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
3342 
3343     // stage 10
3344     idct64_stage10_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
3345 
3346     // stage 11
3347     idct64_stage11_neon(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
3348   }
3349 }
3350 
3351 static void idct64x64_low16_neon(int32x4_t *in, int32x4_t *out, int bit,
3352                                  int do_cols, int bd, int out_shift) {
3353   int i, j;
3354   const int32_t *cospi = cospi_arr(bit);
3355   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3356   const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
3357   const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
3358   const int32x4_t v_bit = vdupq_n_s32(-bit);
3359   const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
3360 
3361   {
3362     int32x4_t u[64];
3363     int32x4_t tmp1, tmp2, tmp3, tmp4;
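    // Only in[0..15] carry non-zero coefficients; stage 1 scatters them into
    // the 64 working registers.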
3364     // stage 1
3365     u[0] = in[0];
3366     u[32] = in[1];
3367     u[36] = in[9];
3368     u[40] = in[5];
3369     u[44] = in[13];
3370     u[48] = in[3];
3371     u[52] = in[11];
3372     u[56] = in[7];
3373     u[60] = in[15];
3374     u[16] = in[2];
3375     u[20] = in[10];
3376     u[24] = in[6];
3377     u[28] = in[14];
3378     u[4] = in[8];
3379     u[8] = in[4];
3380     u[12] = in[12];
3381 
3382     // stage 2
3383     u[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding);
3384     u[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding);
3385     u[35] = half_btf_0_m_neon_r(&cospi[49], &u[60], &v_bit, &rnding);
3386     u[60] = half_btf_0_neon_r(&cospi[15], &u[60], &v_bit, &rnding);
3387     u[59] = half_btf_0_neon_r(&cospi[9], &u[36], &v_bit, &rnding);
3388     u[36] = half_btf_0_neon_r(&cospi[55], &u[36], &v_bit, &rnding);
3389     u[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding);
3390     u[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding);
3391     u[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding);
3392     u[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding);
3393     u[43] = half_btf_0_m_neon_r(&cospi[53], &u[52], &v_bit, &rnding);
3394     u[52] = half_btf_0_neon_r(&cospi[11], &u[52], &v_bit, &rnding);
3395     u[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding);
3396     u[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding);
3397     u[51] = half_btf_0_neon_r(&cospi[13], &u[44], &v_bit, &rnding);
3398     u[44] = half_btf_0_neon_r(&cospi[51], &u[44], &v_bit, &rnding);
3399 
3400     // stage 3
3401     u[31] = half_btf_0_neon_r(&cospi[2], &u[16], &v_bit, &rnding);
3402     u[16] = half_btf_0_neon_r(&cospi[62], &u[16], &v_bit, &rnding);
3403     u[19] = half_btf_0_m_neon_r(&cospi[50], &u[28], &v_bit, &rnding);
3404     u[28] = half_btf_0_neon_r(&cospi[14], &u[28], &v_bit, &rnding);
3405     u[27] = half_btf_0_neon_r(&cospi[10], &u[20], &v_bit, &rnding);
3406     u[20] = half_btf_0_neon_r(&cospi[54], &u[20], &v_bit, &rnding);
3407     u[23] = half_btf_0_m_neon_r(&cospi[58], &u[24], &v_bit, &rnding);
3408     u[24] = half_btf_0_neon_r(&cospi[6], &u[24], &v_bit, &rnding);
3409     u[33] = u[32];
3410     u[34] = u[35];
3411     u[37] = u[36];
3412     u[38] = u[39];
3413     u[41] = u[40];
3414     u[42] = u[43];
3415     u[45] = u[44];
3416     u[46] = u[47];
3417     u[49] = u[48];
3418     u[50] = u[51];
3419     u[53] = u[52];
3420     u[54] = u[55];
3421     u[57] = u[56];
3422     u[58] = u[59];
3423     u[61] = u[60];
3424     u[62] = u[63];
3425 
3426     // stage 4
3427     u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);
3428     u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);
3429     u[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding);
3430     u[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding);
3431 
3432     u[17] = u[16];
3433     u[18] = u[19];
3434     u[21] = u[20];
3435     u[22] = u[23];
3436     u[25] = u[24];
3437     u[26] = u[27];
3438     u[29] = u[28];
3439     u[30] = u[31];
3440 
3441     tmp1 = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62], &v_bit,
3442                                   &rnding);
3443     tmp2 = half_btf_neon_mode11_r(&cospi[60], &u[34], &cospi[4], &u[61], &v_bit,
3444                                   &rnding);
3445     tmp3 = half_btf_neon_mode10_r(&cospi[36], &u[37], &cospi[28], &u[58],
3446                                   &v_bit, &rnding);
3447     tmp4 = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57],
3448                                   &v_bit, &rnding);
3449     u[57] = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57],
3450                                    &v_bit, &rnding);
3451     u[58] = half_btf_neon_r(&cospi[28], &u[37], &cospi[36], &u[58], &v_bit,
3452                             &rnding);
3453     u[61] = half_btf_neon_mode10_r(&cospi[4], &u[34], &cospi[60], &u[61],
3454                                    &v_bit, &rnding);
3455     u[62] =
3456         half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding);
3457     u[33] = tmp1;
3458     u[34] = tmp2;
3459     u[37] = tmp3;
3460     u[38] = tmp4;
3461 
3462     tmp1 = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54],
3463                                   &v_bit, &rnding);
3464     tmp2 = half_btf_neon_mode11_r(&cospi[44], &u[42], &cospi[20], &u[53],
3465                                   &v_bit, &rnding);
3466     tmp3 = half_btf_neon_r(&cospi[52], &u[45], &cospi[12], &u[50], &v_bit,
3467                            &rnding);
3468     tmp4 = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49],
3469                                   &v_bit, &rnding);
3470     u[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49],
3471                                    &v_bit, &rnding);
3472     u[50] = half_btf_neon_r(&cospi[12], &u[45], &cospi[52], &u[50], &v_bit,
3473                             &rnding);
3474     u[53] = half_btf_neon_mode10_r(&cospi[20], &u[42], &cospi[44], &u[53],
3475                                    &v_bit, &rnding);
3476     u[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit,
3477                             &rnding);
3478     u[41] = tmp1;
3479     u[42] = tmp2;
3480     u[45] = tmp3;
3481     u[46] = tmp4;
3482 
3483     // stage 5
3484     u[7] = half_btf_0_neon_r(&cospi[8], &u[4], &v_bit, &rnding);
3485     u[4] = half_btf_0_neon_r(&cospi[56], &u[4], &v_bit, &rnding);
3486 
3487     u[9] = u[8];
3488     u[10] = u[11];
3489     u[13] = u[12];
3490     u[14] = u[15];
3491 
3492     tmp1 = half_btf_neon_mode10_r(&cospi[8], &u[17], &cospi[56], &u[30], &v_bit,
3493                                   &rnding);
3494     tmp2 = half_btf_neon_mode11_r(&cospi[56], &u[18], &cospi[8], &u[29], &v_bit,
3495                                   &rnding);
3496     tmp3 = half_btf_neon_mode10_r(&cospi[40], &u[21], &cospi[24], &u[26],
3497                                   &v_bit, &rnding);
3498     tmp4 = half_btf_neon_mode11_r(&cospi[24], &u[22], &cospi[40], &u[25],
3499                                   &v_bit, &rnding);
3500     u[25] = half_btf_neon_mode10_r(&cospi[40], &u[22], &cospi[24], &u[25],
3501                                    &v_bit, &rnding);
3502     u[26] = half_btf_neon_r(&cospi[24], &u[21], &cospi[40], &u[26], &v_bit,
3503                             &rnding);
3504     u[29] = half_btf_neon_mode10_r(&cospi[8], &u[18], &cospi[56], &u[29],
3505                                    &v_bit, &rnding);
3506     u[30] =
3507         half_btf_neon_r(&cospi[56], &u[17], &cospi[8], &u[30], &v_bit, &rnding);
3508     u[17] = tmp1;
3509     u[18] = tmp2;
3510     u[21] = tmp3;
3511     u[22] = tmp4;
3512 
3513     for (i = 32; i < 64; i += 8) {
3514       addsub_neon(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3515                   &clamp_hi);
3516       addsub_neon(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3517                   &clamp_hi);
3518 
3519       addsub_neon(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3520                   &clamp_hi);
3521       addsub_neon(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3522                   &clamp_hi);
3523     }
3524 
3525     // stage 6
3526     tmp1 = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
3527     u[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
3528     u[0] = tmp1;
3529     u[5] = u[4];
3530     u[6] = u[7];
3531 
3532     tmp1 = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
3533                                   &rnding);
3534     u[14] =
3535         half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
3536     u[9] = tmp1;
3537     tmp2 = half_btf_neon_mode01_r(&cospi[48], &u[10], &cospi[16], &u[13],
3538                                   &v_bit, &rnding);
3539     u[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13],
3540                                    &v_bit, &rnding);
3541     u[10] = tmp2;
3542 
3543     for (i = 16; i < 32; i += 8) {
3544       addsub_neon(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3545                   &clamp_hi);
3546       addsub_neon(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3547                   &clamp_hi);
3548 
3549       addsub_neon(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3550                   &clamp_hi);
3551       addsub_neon(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3552                   &clamp_hi);
3553     }
3554 
3555     tmp1 = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61], &v_bit,
3556                                   &rnding);
3557     tmp2 = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60], &v_bit,
3558                                   &rnding);
3559     tmp3 = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59], &v_bit,
3560                                   &rnding);
3561     tmp4 = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58], &v_bit,
3562                                   &rnding);
3563     u[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58],
3564                                    &v_bit, &rnding);
3565     u[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59],
3566                                    &v_bit, &rnding);
3567     u[60] =
3568         half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding);
3569     u[61] =
3570         half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding);
3571     u[34] = tmp1;
3572     u[35] = tmp2;
3573     u[36] = tmp3;
3574     u[37] = tmp4;
3575 
3576     tmp1 = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53],
3577                                   &v_bit, &rnding);
3578     tmp2 = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52],
3579                                   &v_bit, &rnding);
3580     tmp3 = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51],
3581                                   &v_bit, &rnding);
3582     tmp4 = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50],
3583                                   &v_bit, &rnding);
3584     u[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50],
3585                                    &v_bit, &rnding);
3586     u[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51],
3587                                    &v_bit, &rnding);
3588     u[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit,
3589                             &rnding);
3590     u[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit,
3591                             &rnding);
3592     u[42] = tmp1;
3593     u[43] = tmp2;
3594     u[44] = tmp3;
3595     u[45] = tmp4;
3596 
3597     // stage 7
3598     u[3] = u[0];
3599     u[2] = u[1];
3600     tmp1 = half_btf_neon_mode10_r(&cospi[32], &u[5], &cospi[32], &u[6], &v_bit,
3601                                   &rnding);
3602     u[6] =
3603         half_btf_neon_r(&cospi[32], &u[5], &cospi[32], &u[6], &v_bit, &rnding);
3604     u[5] = tmp1;
3605     addsub_neon(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
3606     addsub_neon(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
3607     addsub_neon(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
3608     addsub_neon(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
3609 
3610     tmp1 = half_btf_neon_mode10_r(&cospi[16], &u[18], &cospi[48], &u[29],
3611                                   &v_bit, &rnding);
3612     tmp2 = half_btf_neon_mode10_r(&cospi[16], &u[19], &cospi[48], &u[28],
3613                                   &v_bit, &rnding);
3614     tmp3 = half_btf_neon_mode11_r(&cospi[48], &u[20], &cospi[16], &u[27],
3615                                   &v_bit, &rnding);
3616     tmp4 = half_btf_neon_mode11_r(&cospi[48], &u[21], &cospi[16], &u[26],
3617                                   &v_bit, &rnding);
3618     u[26] = half_btf_neon_mode10_r(&cospi[16], &u[21], &cospi[48], &u[26],
3619                                    &v_bit, &rnding);
3620     u[27] = half_btf_neon_mode10_r(&cospi[16], &u[20], &cospi[48], &u[27],
3621                                    &v_bit, &rnding);
3622     u[28] = half_btf_neon_r(&cospi[48], &u[19], &cospi[16], &u[28], &v_bit,
3623                             &rnding);
3624     u[29] = half_btf_neon_r(&cospi[48], &u[18], &cospi[16], &u[29], &v_bit,
3625                             &rnding);
3626     u[18] = tmp1;
3627     u[19] = tmp2;
3628     u[20] = tmp3;
3629     u[21] = tmp4;
3630 
3631     for (i = 32; i < 64; i += 16) {
3632       for (j = i; j < i + 4; j++) {
3633         addsub_neon(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3634         addsub_neon(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3635                     &clamp_hi);
3636       }
3637     }
3638 
3639     // stage 8
3640     for (i = 0; i < 4; ++i) {
3641       addsub_neon(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
3642     }
3643 
3644     idct64_stage8_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
3645 
3646     // stage 9
3647     idct64_stage9_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
3648 
3649     // stage 10
3650     idct64_stage10_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
3651 
3652     // stage 11
3653     idct64_stage11_neon(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
3654   }
3655 }
3656 
3657 static void idct64x64_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
3658                            int bd, int out_shift) {
3659   int i, j;
3660   const int32_t *cospi = cospi_arr(bit);
3661   const int32x4_t v_bit = vdupq_n_s32(-bit);
3662   const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
3663 
3664   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3665   const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
3666   const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
3667 
3668   {
3669     int32x4_t u[64], v[64];
3670 
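    // Only the lowest 32 of the 64 input coefficients are ever non-zero for
    // 64-point transforms, so stage 1 reads in[0..31].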
3671     // stage 1
3672     u[32] = in[1];
3673     u[34] = in[17];
3674     u[36] = in[9];
3675     u[38] = in[25];
3676     u[40] = in[5];
3677     u[42] = in[21];
3678     u[44] = in[13];
3679     u[46] = in[29];
3680     u[48] = in[3];
3681     u[50] = in[19];
3682     u[52] = in[11];
3683     u[54] = in[27];
3684     u[56] = in[7];
3685     u[58] = in[23];
3686     u[60] = in[15];
3687     u[62] = in[31];
3688 
3689     v[16] = in[2];
3690     v[18] = in[18];
3691     v[20] = in[10];
3692     v[22] = in[26];
3693     v[24] = in[6];
3694     v[26] = in[22];
3695     v[28] = in[14];
3696     v[30] = in[30];
3697 
3698     u[8] = in[4];
3699     u[10] = in[20];
3700     u[12] = in[12];
3701     u[14] = in[28];
3702 
3703     v[4] = in[8];
3704     v[6] = in[24];
3705 
3706     u[0] = in[0];
3707     u[2] = in[16];
3708 
3709     // stage 2
3710     v[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding);
3711     v[33] = half_btf_0_m_neon_r(&cospi[33], &u[62], &v_bit, &rnding);
3712     v[34] = half_btf_0_neon_r(&cospi[47], &u[34], &v_bit, &rnding);
3713     v[35] = half_btf_0_m_neon_r(&cospi[49], &u[60], &v_bit, &rnding);
3714     v[36] = half_btf_0_neon_r(&cospi[55], &u[36], &v_bit, &rnding);
3715     v[37] = half_btf_0_m_neon_r(&cospi[41], &u[58], &v_bit, &rnding);
3716     v[38] = half_btf_0_neon_r(&cospi[39], &u[38], &v_bit, &rnding);
3717     v[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding);
3718     v[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding);
3719     v[41] = half_btf_0_m_neon_r(&cospi[37], &u[54], &v_bit, &rnding);
3720     v[42] = half_btf_0_neon_r(&cospi[43], &u[42], &v_bit, &rnding);
3721     v[43] = half_btf_0_m_neon_r(&cospi[53], &u[52], &v_bit, &rnding);
3722     v[44] = half_btf_0_neon_r(&cospi[51], &u[44], &v_bit, &rnding);
3723     v[45] = half_btf_0_m_neon_r(&cospi[45], &u[50], &v_bit, &rnding);
3724     v[46] = half_btf_0_neon_r(&cospi[35], &u[46], &v_bit, &rnding);
3725     v[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding);
3726     v[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding);
3727     v[49] = half_btf_0_neon_r(&cospi[29], &u[46], &v_bit, &rnding);
3728     v[50] = half_btf_0_neon_r(&cospi[19], &u[50], &v_bit, &rnding);
3729     v[51] = half_btf_0_neon_r(&cospi[13], &u[44], &v_bit, &rnding);
3730     v[52] = half_btf_0_neon_r(&cospi[11], &u[52], &v_bit, &rnding);
3731     v[53] = half_btf_0_neon_r(&cospi[21], &u[42], &v_bit, &rnding);
3732     v[54] = half_btf_0_neon_r(&cospi[27], &u[54], &v_bit, &rnding);
3733     v[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding);
3734     v[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding);
3735     v[57] = half_btf_0_neon_r(&cospi[25], &u[38], &v_bit, &rnding);
3736     v[58] = half_btf_0_neon_r(&cospi[23], &u[58], &v_bit, &rnding);
3737     v[59] = half_btf_0_neon_r(&cospi[9], &u[36], &v_bit, &rnding);
3738     v[60] = half_btf_0_neon_r(&cospi[15], &u[60], &v_bit, &rnding);
3739     v[61] = half_btf_0_neon_r(&cospi[17], &u[34], &v_bit, &rnding);
3740     v[62] = half_btf_0_neon_r(&cospi[31], &u[62], &v_bit, &rnding);
3741     v[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding);
3742 
3743     // stage 3
3744     u[16] = half_btf_0_neon_r(&cospi[62], &v[16], &v_bit, &rnding);
3745     u[17] = half_btf_0_m_neon_r(&cospi[34], &v[30], &v_bit, &rnding);
3746     u[18] = half_btf_0_neon_r(&cospi[46], &v[18], &v_bit, &rnding);
3747     u[19] = half_btf_0_m_neon_r(&cospi[50], &v[28], &v_bit, &rnding);
3748     u[20] = half_btf_0_neon_r(&cospi[54], &v[20], &v_bit, &rnding);
3749     u[21] = half_btf_0_m_neon_r(&cospi[42], &v[26], &v_bit, &rnding);
3750     u[22] = half_btf_0_neon_r(&cospi[38], &v[22], &v_bit, &rnding);
3751     u[23] = half_btf_0_m_neon_r(&cospi[58], &v[24], &v_bit, &rnding);
3752     u[24] = half_btf_0_neon_r(&cospi[6], &v[24], &v_bit, &rnding);
3753     u[25] = half_btf_0_neon_r(&cospi[26], &v[22], &v_bit, &rnding);
3754     u[26] = half_btf_0_neon_r(&cospi[22], &v[26], &v_bit, &rnding);
3755     u[27] = half_btf_0_neon_r(&cospi[10], &v[20], &v_bit, &rnding);
3756     u[28] = half_btf_0_neon_r(&cospi[14], &v[28], &v_bit, &rnding);
3757     u[29] = half_btf_0_neon_r(&cospi[18], &v[18], &v_bit, &rnding);
3758     u[30] = half_btf_0_neon_r(&cospi[30], &v[30], &v_bit, &rnding);
3759     u[31] = half_btf_0_neon_r(&cospi[2], &v[16], &v_bit, &rnding);
3760 
3761     for (i = 32; i < 64; i += 4) {
3762       addsub_neon(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
3763                   &clamp_hi);
3764       addsub_neon(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
3765                   &clamp_hi);
3766     }
3767 
3768     // stage 4
3769     v[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);
3770     v[9] = half_btf_0_m_neon_r(&cospi[36], &u[14], &v_bit, &rnding);
3771     v[10] = half_btf_0_neon_r(&cospi[44], &u[10], &v_bit, &rnding);
3772     v[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding);
3773     v[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding);
3774     v[13] = half_btf_0_neon_r(&cospi[20], &u[10], &v_bit, &rnding);
3775     v[14] = half_btf_0_neon_r(&cospi[28], &u[14], &v_bit, &rnding);
3776     v[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);
3777 
3778     for (i = 16; i < 32; i += 4) {
3779       addsub_neon(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
3780                   &clamp_hi);
3781       addsub_neon(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
3782                   &clamp_hi);
3783     }
3784 
3785     for (i = 32; i < 64; i += 4) {
3786       v[i + 0] = u[i + 0];
3787       v[i + 3] = u[i + 3];
3788     }
3789 
3790     v[33] = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62],
3791                                    &v_bit, &rnding);
3792     v[34] = half_btf_neon_mode11_r(&cospi[60], &u[34], &cospi[4], &u[61],
3793                                    &v_bit, &rnding);
3794     v[37] = half_btf_neon_mode10_r(&cospi[36], &u[37], &cospi[28], &u[58],
3795                                    &v_bit, &rnding);
3796     v[38] = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57],
3797                                    &v_bit, &rnding);
3798     v[41] = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54],
3799                                    &v_bit, &rnding);
3800     v[42] = half_btf_neon_mode11_r(&cospi[44], &u[42], &cospi[20], &u[53],
3801                                    &v_bit, &rnding);
3802     v[45] = half_btf_neon_mode10_r(&cospi[52], &u[45], &cospi[12], &u[50],
3803                                    &v_bit, &rnding);
3804     v[46] = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49],
3805                                    &v_bit, &rnding);
3806     v[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49],
3807                                    &v_bit, &rnding);
3808     v[50] = half_btf_neon_r(&cospi[12], &u[45], &cospi[52], &u[50], &v_bit,
3809                             &rnding);
3810     v[53] = half_btf_neon_mode10_r(&cospi[20], &u[42], &cospi[44], &u[53],
3811                                    &v_bit, &rnding);
3812     v[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit,
3813                             &rnding);
3814     v[57] = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57],
3815                                    &v_bit, &rnding);
3816     v[58] = half_btf_neon_r(&cospi[28], &u[37], &cospi[36], &u[58], &v_bit,
3817                             &rnding);
3818     v[61] = half_btf_neon_mode10_r(&cospi[4], &u[34], &cospi[60], &u[61],
3819                                    &v_bit, &rnding);
3820     v[62] =
3821         half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding);
3822 
3823     // stage 5
3824     u[4] = half_btf_0_neon_r(&cospi[56], &v[4], &v_bit, &rnding);
3825     u[5] = half_btf_0_m_neon_r(&cospi[40], &v[6], &v_bit, &rnding);
3826     u[6] = half_btf_0_neon_r(&cospi[24], &v[6], &v_bit, &rnding);
3827     u[7] = half_btf_0_neon_r(&cospi[8], &v[4], &v_bit, &rnding);
3828 
3829     for (i = 8; i < 16; i += 4) {
3830       addsub_neon(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
3831                   &clamp_hi);
3832       addsub_neon(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
3833                   &clamp_hi);
3834     }
3835 
3836     for (i = 16; i < 32; i += 4) {
3837       u[i + 0] = v[i + 0];
3838       u[i + 3] = v[i + 3];
3839     }
3840 
3841     u[17] = half_btf_neon_mode10_r(&cospi[8], &v[17], &cospi[56], &v[30],
3842                                    &v_bit, &rnding);
3843     u[18] = half_btf_neon_mode11_r(&cospi[56], &v[18], &cospi[8], &v[29],
3844                                    &v_bit, &rnding);
3845     u[21] = half_btf_neon_mode10_r(&cospi[40], &v[21], &cospi[24], &v[26],
3846                                    &v_bit, &rnding);
3847     u[22] = half_btf_neon_mode11_r(&cospi[24], &v[22], &cospi[40], &v[25],
3848                                    &v_bit, &rnding);
3849     u[25] = half_btf_neon_mode10_r(&cospi[40], &v[22], &cospi[24], &v[25],
3850                                    &v_bit, &rnding);
3851     u[26] = half_btf_neon_r(&cospi[24], &v[21], &cospi[40], &v[26], &v_bit,
3852                             &rnding);
3853     u[29] = half_btf_neon_mode10_r(&cospi[8], &v[18], &cospi[56], &v[29],
3854                                    &v_bit, &rnding);
3855     u[30] =
3856         half_btf_neon_r(&cospi[56], &v[17], &cospi[8], &v[30], &v_bit, &rnding);
3857 
3858     for (i = 32; i < 64; i += 8) {
3859       addsub_neon(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3860                   &clamp_hi);
3861       addsub_neon(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3862                   &clamp_hi);
3863 
3864       addsub_neon(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3865                   &clamp_hi);
3866       addsub_neon(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3867                   &clamp_hi);
3868     }
3869 
3870     // stage 6
3871     v[0] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
3872     v[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
3873     v[2] = half_btf_0_neon_r(&cospi[48], &u[2], &v_bit, &rnding);
3874     v[3] = half_btf_0_neon_r(&cospi[16], &u[2], &v_bit, &rnding);
3875 
3876     addsub_neon(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
3877     addsub_neon(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
3878 
3879     for (i = 8; i < 16; i += 4) {
3880       v[i + 0] = u[i + 0];
3881       v[i + 3] = u[i + 3];
3882     }
3883 
3884     v[9] = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
3885                                   &rnding);
3886     v[10] = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13],
3887                                    &v_bit, &rnding);
3888     v[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13],
3889                                    &v_bit, &rnding);
3890     v[14] =
3891         half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
3892 
3893     for (i = 16; i < 32; i += 8) {
3894       addsub_neon(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
3895                   &clamp_hi);
3896       addsub_neon(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
3897                   &clamp_hi);
3898 
3899       addsub_neon(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
3900                   &clamp_hi);
3901       addsub_neon(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
3902                   &clamp_hi);
3903     }
3904 
3905     for (i = 32; i < 64; i += 8) {
3906       v[i + 0] = u[i + 0];
3907       v[i + 1] = u[i + 1];
3908       v[i + 6] = u[i + 6];
3909       v[i + 7] = u[i + 7];
3910     }
3911 
3912     v[34] = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61],
3913                                    &v_bit, &rnding);
3914     v[35] = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60],
3915                                    &v_bit, &rnding);
3916     v[36] = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59],
3917                                    &v_bit, &rnding);
3918     v[37] = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58],
3919                                    &v_bit, &rnding);
3920     v[42] = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53],
3921                                    &v_bit, &rnding);
3922     v[43] = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52],
3923                                    &v_bit, &rnding);
3924     v[44] = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51],
3925                                    &v_bit, &rnding);
3926     v[45] = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50],
3927                                    &v_bit, &rnding);
3928     v[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50],
3929                                    &v_bit, &rnding);
3930     v[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51],
3931                                    &v_bit, &rnding);
3932     v[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit,
3933                             &rnding);
3934     v[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit,
3935                             &rnding);
3936     v[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58],
3937                                    &v_bit, &rnding);
3938     v[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59],
3939                                    &v_bit, &rnding);
3940     v[60] =
3941         half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding);
3942     v[61] =
3943         half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding);
3944 
3945     // stage 7
3946     addsub_neon(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
3947     addsub_neon(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
3948 
3949     u[4] = v[4];
3950     u[7] = v[7];
3951     u[5] = half_btf_neon_mode10_r(&cospi[32], &v[5], &cospi[32], &v[6], &v_bit,
3952                                   &rnding);
3953     u[6] =
3954         half_btf_neon_r(&cospi[32], &v[5], &cospi[32], &v[6], &v_bit, &rnding);
3955 
3956     addsub_neon(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
3957     addsub_neon(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
3958     addsub_neon(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
3959     addsub_neon(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
3960 
3961     for (i = 16; i < 32; i += 8) {
3962       u[i + 0] = v[i + 0];
3963       u[i + 1] = v[i + 1];
3964       u[i + 6] = v[i + 6];
3965       u[i + 7] = v[i + 7];
3966     }
3967 
3968     u[18] = half_btf_neon_mode10_r(&cospi[16], &v[18], &cospi[48], &v[29],
3969                                    &v_bit, &rnding);
3970     u[19] = half_btf_neon_mode10_r(&cospi[16], &v[19], &cospi[48], &v[28],
3971                                    &v_bit, &rnding);
3972     u[20] = half_btf_neon_mode11_r(&cospi[48], &v[20], &cospi[16], &v[27],
3973                                    &v_bit, &rnding);
3974     u[21] = half_btf_neon_mode11_r(&cospi[48], &v[21], &cospi[16], &v[26],
3975                                    &v_bit, &rnding);
3976     u[26] = half_btf_neon_mode10_r(&cospi[16], &v[21], &cospi[48], &v[26],
3977                                    &v_bit, &rnding);
3978     u[27] = half_btf_neon_mode10_r(&cospi[16], &v[20], &cospi[48], &v[27],
3979                                    &v_bit, &rnding);
3980     u[28] = half_btf_neon_r(&cospi[48], &v[19], &cospi[16], &v[28], &v_bit,
3981                             &rnding);
3982     u[29] = half_btf_neon_r(&cospi[48], &v[18], &cospi[16], &v[29], &v_bit,
3983                             &rnding);
3984 
3985     for (i = 32; i < 64; i += 16) {
3986       for (j = i; j < i + 4; j++) {
3987         addsub_neon(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3988         addsub_neon(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3989                     &clamp_hi);
3990       }
3991     }
3992 
3993     // stage 8
3994     for (i = 0; i < 4; ++i) {
3995       addsub_neon(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
3996     }
3997 
3998     v[8] = u[8];
3999     v[9] = u[9];
4000     v[14] = u[14];
4001     v[15] = u[15];
4002 
4003     v[10] = half_btf_neon_mode10_r(&cospi[32], &u[10], &cospi[32], &u[13],
4004                                    &v_bit, &rnding);
4005     v[11] = half_btf_neon_mode10_r(&cospi[32], &u[11], &cospi[32], &u[12],
4006                                    &v_bit, &rnding);
4007     v[12] = half_btf_neon_r(&cospi[32], &u[11], &cospi[32], &u[12], &v_bit,
4008                             &rnding);
4009     v[13] = half_btf_neon_r(&cospi[32], &u[10], &cospi[32], &u[13], &v_bit,
4010                             &rnding);
4011 
4012     for (i = 16; i < 20; ++i) {
4013       addsub_neon(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
4014       addsub_neon(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
4015                   &clamp_hi);
4016     }
4017 
4018     for (i = 32; i < 36; ++i) {
4019       v[i] = u[i];
4020       v[i + 12] = u[i + 12];
4021       v[i + 16] = u[i + 16];
4022       v[i + 28] = u[i + 28];
4023     }
4024 
4025     v[36] = half_btf_neon_mode10_r(&cospi[16], &u[36], &cospi[48], &u[59],
4026                                    &v_bit, &rnding);
4027     v[37] = half_btf_neon_mode10_r(&cospi[16], &u[37], &cospi[48], &u[58],
4028                                    &v_bit, &rnding);
4029     v[38] = half_btf_neon_mode10_r(&cospi[16], &u[38], &cospi[48], &u[57],
4030                                    &v_bit, &rnding);
4031     v[39] = half_btf_neon_mode10_r(&cospi[16], &u[39], &cospi[48], &u[56],
4032                                    &v_bit, &rnding);
4033     v[40] = half_btf_neon_mode11_r(&cospi[48], &u[40], &cospi[16], &u[55],
4034                                    &v_bit, &rnding);
4035     v[41] = half_btf_neon_mode11_r(&cospi[48], &u[41], &cospi[16], &u[54],
4036                                    &v_bit, &rnding);
4037     v[42] = half_btf_neon_mode11_r(&cospi[48], &u[42], &cospi[16], &u[53],
4038                                    &v_bit, &rnding);
4039     v[43] = half_btf_neon_mode11_r(&cospi[48], &u[43], &cospi[16], &u[52],
4040                                    &v_bit, &rnding);
4041     v[52] = half_btf_neon_mode10_r(&cospi[16], &u[43], &cospi[48], &u[52],
4042                                    &v_bit, &rnding);
4043     v[53] = half_btf_neon_mode10_r(&cospi[16], &u[42], &cospi[48], &u[53],
4044                                    &v_bit, &rnding);
4045     v[54] = half_btf_neon_mode10_r(&cospi[16], &u[41], &cospi[48], &u[54],
4046                                    &v_bit, &rnding);
4047     v[55] = half_btf_neon_mode10_r(&cospi[16], &u[40], &cospi[48], &u[55],
4048                                    &v_bit, &rnding);
4049     v[56] = half_btf_neon_r(&cospi[48], &u[39], &cospi[16], &u[56], &v_bit,
4050                             &rnding);
4051     v[57] = half_btf_neon_r(&cospi[48], &u[38], &cospi[16], &u[57], &v_bit,
4052                             &rnding);
4053     v[58] = half_btf_neon_r(&cospi[48], &u[37], &cospi[16], &u[58], &v_bit,
4054                             &rnding);
4055     v[59] = half_btf_neon_r(&cospi[48], &u[36], &cospi[16], &u[59], &v_bit,
4056                             &rnding);
4057 
4058     // stage 9
4059     for (i = 0; i < 8; ++i) {
4060       addsub_neon(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
4061     }
4062 
4063     for (i = 16; i < 20; ++i) {
4064       u[i] = v[i];
4065       u[i + 12] = v[i + 12];
4066     }
4067 
4068     u[20] = half_btf_neon_mode10_r(&cospi[32], &v[20], &cospi[32], &v[27],
4069                                    &v_bit, &rnding);
4070     u[21] = half_btf_neon_mode10_r(&cospi[32], &v[21], &cospi[32], &v[26],
4071                                    &v_bit, &rnding);
4072     u[22] = half_btf_neon_mode10_r(&cospi[32], &v[22], &cospi[32], &v[25],
4073                                    &v_bit, &rnding);
4074     u[23] = half_btf_neon_mode10_r(&cospi[32], &v[23], &cospi[32], &v[24],
4075                                    &v_bit, &rnding);
4076     u[24] = half_btf_neon_r(&cospi[32], &v[23], &cospi[32], &v[24], &v_bit,
4077                             &rnding);
4078     u[25] = half_btf_neon_r(&cospi[32], &v[22], &cospi[32], &v[25], &v_bit,
4079                             &rnding);
4080     u[26] = half_btf_neon_r(&cospi[32], &v[21], &cospi[32], &v[26], &v_bit,
4081                             &rnding);
4082     u[27] = half_btf_neon_r(&cospi[32], &v[20], &cospi[32], &v[27], &v_bit,
4083                             &rnding);
4084 
4085     for (i = 32; i < 40; i++) {
4086       addsub_neon(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
4087     }
4088 
4089     for (i = 48; i < 56; i++) {
4090       addsub_neon(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
4091     }
4092 
4093     // stage 10
4094     for (i = 0; i < 16; i++) {
4095       addsub_neon(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
4096     }
4097 
4098     for (i = 32; i < 40; i++) v[i] = u[i];
4099 
4100     v[40] = half_btf_neon_mode10_r(&cospi[32], &u[40], &cospi[32], &u[55],
4101                                    &v_bit, &rnding);
4102     v[41] = half_btf_neon_mode10_r(&cospi[32], &u[41], &cospi[32], &u[54],
4103                                    &v_bit, &rnding);
4104     v[42] = half_btf_neon_mode10_r(&cospi[32], &u[42], &cospi[32], &u[53],
4105                                    &v_bit, &rnding);
4106     v[43] = half_btf_neon_mode10_r(&cospi[32], &u[43], &cospi[32], &u[52],
4107                                    &v_bit, &rnding);
4108     v[44] = half_btf_neon_mode10_r(&cospi[32], &u[44], &cospi[32], &u[51],
4109                                    &v_bit, &rnding);
4110     v[45] = half_btf_neon_mode10_r(&cospi[32], &u[45], &cospi[32], &u[50],
4111                                    &v_bit, &rnding);
4112     v[46] = half_btf_neon_mode10_r(&cospi[32], &u[46], &cospi[32], &u[49],
4113                                    &v_bit, &rnding);
4114     v[47] = half_btf_neon_mode10_r(&cospi[32], &u[47], &cospi[32], &u[48],
4115                                    &v_bit, &rnding);
4116     v[48] = half_btf_neon_r(&cospi[32], &u[47], &cospi[32], &u[48], &v_bit,
4117                             &rnding);
4118     v[49] = half_btf_neon_r(&cospi[32], &u[46], &cospi[32], &u[49], &v_bit,
4119                             &rnding);
4120     v[50] = half_btf_neon_r(&cospi[32], &u[45], &cospi[32], &u[50], &v_bit,
4121                             &rnding);
4122     v[51] = half_btf_neon_r(&cospi[32], &u[44], &cospi[32], &u[51], &v_bit,
4123                             &rnding);
4124     v[52] = half_btf_neon_r(&cospi[32], &u[43], &cospi[32], &u[52], &v_bit,
4125                             &rnding);
4126     v[53] = half_btf_neon_r(&cospi[32], &u[42], &cospi[32], &u[53], &v_bit,
4127                             &rnding);
4128     v[54] = half_btf_neon_r(&cospi[32], &u[41], &cospi[32], &u[54], &v_bit,
4129                             &rnding);
4130     v[55] = half_btf_neon_r(&cospi[32], &u[40], &cospi[32], &u[55], &v_bit,
4131                             &rnding);
4132 
4133     for (i = 56; i < 64; i++) v[i] = u[i];
4134 
4135     // stage 11
4136     for (i = 0; i < 32; i++) {
4137       addsub_neon(v[i], v[63 - i], &out[i], &out[63 - i], &clamp_lo,
4138                   &clamp_hi);
4139     }
4140 
4141     if (!do_cols) {
4142       const int log_range_out = AOMMAX(16, bd + 6);
4143       const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
4144       const int32x4_t clamp_hi_out =
4145           vdupq_n_s32((1 << (log_range_out - 1)) - 1);
4146       for (i = 0; i < 64; i += 4) {
4147         round_shift_4x4(out + i, out_shift);
4148         highbd_clamp_s32_neon(out + i, out + i, &clamp_lo_out, &clamp_hi_out,
4149                               4);
4150       }
4151     }
4152   }
4153 }
4154 
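// The idct32x32_low{1,8,16}_neon variants assume only the first 1, 8 or 16 of
// the 32 input coefficients can be nonzero (selected from the eob), so the
// later stages are reduced accordingly; idct32x32_neon handles the full case.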
4155 static void idct32x32_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
4156                                 int do_cols, int bd, int out_shift) {
4157   const int32_t *cospi = cospi_arr(bit);
4158   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4159   int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
4160   int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
4161   int32x4_t bf1;
4162   const int32x4_t v_bit = vdupq_n_s32(-bit);
4163   const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
4164   // stage 0-1
4165   bf1 = in[0];
4166 
4167   // stage 2-5
4168   bf1 = half_btf_0_neon_r(&cospi[32], &bf1, &v_bit, &rnding);
4169 
4170   // stage 6-9
4171   if (do_cols) {
4172     bf1 = vmaxq_s32(bf1, clamp_lo);
4173     bf1 = vminq_s32(bf1, clamp_hi);
4174   } else {
4175     const int log_range_out = AOMMAX(16, bd + 6);
4176     clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1)));
4177     clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
4178     if (out_shift != 0) {
4179       bf1 = vrshlq_s32(bf1, vdupq_n_s32(-out_shift));
4180     }
4181   }
4182 
4183   bf1 = vmaxq_s32(bf1, clamp_lo);
4184   bf1 = vminq_s32(bf1, clamp_hi);
4185 
4186   for (int i = 0; i < 32; i++) out[i] = bf1;
4187 }
4188 
4189 static void idct32x32_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
4190                                 int do_cols, int bd, int out_shift) {
4191   const int32_t *cospi = cospi_arr(bit);
4192   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4193   const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
4194   const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
4195   int32x4_t bf1[32];
4196   const int32x4_t v_bit = vdupq_n_s32(-bit);
4197   const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
4198 
4199   // stage 0-1
4200   bf1[0] = in[0];
4201   bf1[4] = in[4];
4202   bf1[8] = in[2];
4203   bf1[12] = in[6];
4204   bf1[16] = in[1];
4205   bf1[20] = in[5];
4206   bf1[24] = in[3];
4207   bf1[28] = in[7];
4208 
4209   // stage 2
4210   bf1[31] = half_btf_0_neon_r(&cospi[2], &bf1[16], &v_bit, &rnding);
4211   bf1[16] = half_btf_0_neon_r(&cospi[62], &bf1[16], &v_bit, &rnding);
4212   bf1[19] = half_btf_0_m_neon_r(&cospi[50], &bf1[28], &v_bit, &rnding);
4213   bf1[28] = half_btf_0_neon_r(&cospi[14], &bf1[28], &v_bit, &rnding);
4214   bf1[27] = half_btf_0_neon_r(&cospi[10], &bf1[20], &v_bit, &rnding);
4215   bf1[20] = half_btf_0_neon_r(&cospi[54], &bf1[20], &v_bit, &rnding);
4216   bf1[23] = half_btf_0_m_neon_r(&cospi[58], &bf1[24], &v_bit, &rnding);
4217   bf1[24] = half_btf_0_neon_r(&cospi[6], &bf1[24], &v_bit, &rnding);
4218 
4219   // stage 3
4220   bf1[15] = half_btf_0_neon_r(&cospi[4], &bf1[8], &v_bit, &rnding);
4221   bf1[8] = half_btf_0_neon_r(&cospi[60], &bf1[8], &v_bit, &rnding);
4222 
4223   bf1[11] = half_btf_0_m_neon_r(&cospi[52], &bf1[12], &v_bit, &rnding);
4224   bf1[12] = half_btf_0_neon_r(&cospi[12], &bf1[12], &v_bit, &rnding);
4225   bf1[17] = bf1[16];
4226   bf1[18] = bf1[19];
4227   bf1[21] = bf1[20];
4228   bf1[22] = bf1[23];
4229   bf1[25] = bf1[24];
4230   bf1[26] = bf1[27];
4231   bf1[29] = bf1[28];
4232   bf1[30] = bf1[31];
4233 
4234   // stage 4
4235   bf1[7] = half_btf_0_neon_r(&cospi[8], &bf1[4], &v_bit, &rnding);
4236   bf1[4] = half_btf_0_neon_r(&cospi[56], &bf1[4], &v_bit, &rnding);
4237 
4238   bf1[9] = bf1[8];
4239   bf1[10] = bf1[11];
4240   bf1[13] = bf1[12];
4241   bf1[14] = bf1[15];
4242 
4243   idct32_stage4_neon(bf1, cospi, &v_bit, &rnding);
4244 
4245   // stage 5
4246   bf1[0] = half_btf_0_neon_r(&cospi[32], &bf1[0], &v_bit, &rnding);
4247   bf1[1] = bf1[0];
4248   bf1[5] = bf1[4];
4249   bf1[6] = bf1[7];
4250 
4251   idct32_stage5_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
4252 
4253   // stage 6
4254   bf1[3] = bf1[0];
4255   bf1[2] = bf1[1];
4256 
4257   idct32_stage6_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
4258 
4259   // stage 7
4260   idct32_stage7_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
4261 
4262   // stage 8
4263   idct32_stage8_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
4264 
4265   // stage 9
4266   idct32_stage9_neon(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
4267 }
4268 
4269 static void idct32x32_low16_neon(int32x4_t *in, int32x4_t *out, int bit,
4270                                  int do_cols, int bd, int out_shift) {
4271   const int32_t *cospi = cospi_arr(bit);
4272   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4273   const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
4274   const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
4275   int32x4_t bf1[32];
4276   const int32x4_t v_bit = vdupq_n_s32(-bit);
4277   const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
4278 
4279   // stage 0-1
4280 
4281   bf1[0] = in[0];
4282   bf1[2] = in[8];
4283   bf1[4] = in[4];
4284   bf1[6] = in[12];
4285   bf1[8] = in[2];
4286   bf1[10] = in[10];
4287   bf1[12] = in[6];
4288   bf1[14] = in[14];
4289   bf1[16] = in[1];
4290   bf1[18] = in[9];
4291   bf1[20] = in[5];
4292   bf1[22] = in[13];
4293   bf1[24] = in[3];
4294   bf1[26] = in[11];
4295   bf1[28] = in[7];
4296   bf1[30] = in[15];
4297 
4298   // stage 2
4299   bf1[31] = half_btf_0_neon_r(&cospi[2], &bf1[16], &v_bit, &rnding);
4300   bf1[16] = half_btf_0_neon_r(&cospi[62], &bf1[16], &v_bit, &rnding);
4301   bf1[17] = half_btf_0_m_neon_r(&cospi[34], &bf1[30], &v_bit, &rnding);
4302   bf1[30] = half_btf_0_neon_r(&cospi[30], &bf1[30], &v_bit, &rnding);
4303   bf1[29] = half_btf_0_neon_r(&cospi[18], &bf1[18], &v_bit, &rnding);
4304   bf1[18] = half_btf_0_neon_r(&cospi[46], &bf1[18], &v_bit, &rnding);
4305   bf1[19] = half_btf_0_m_neon_r(&cospi[50], &bf1[28], &v_bit, &rnding);
4306   bf1[28] = half_btf_0_neon_r(&cospi[14], &bf1[28], &v_bit, &rnding);
4307   bf1[27] = half_btf_0_neon_r(&cospi[10], &bf1[20], &v_bit, &rnding);
4308   bf1[20] = half_btf_0_neon_r(&cospi[54], &bf1[20], &v_bit, &rnding);
4309   bf1[21] = half_btf_0_m_neon_r(&cospi[42], &bf1[26], &v_bit, &rnding);
4310   bf1[26] = half_btf_0_neon_r(&cospi[22], &bf1[26], &v_bit, &rnding);
4311   bf1[25] = half_btf_0_neon_r(&cospi[26], &bf1[22], &v_bit, &rnding);
4312   bf1[22] = half_btf_0_neon_r(&cospi[38], &bf1[22], &v_bit, &rnding);
4313   bf1[23] = half_btf_0_m_neon_r(&cospi[58], &bf1[24], &v_bit, &rnding);
4314   bf1[24] = half_btf_0_neon_r(&cospi[6], &bf1[24], &v_bit, &rnding);
4315 
4316   // stage 3
4317   bf1[15] = half_btf_0_neon_r(&cospi[4], &bf1[8], &v_bit, &rnding);
4318   bf1[8] = half_btf_0_neon_r(&cospi[60], &bf1[8], &v_bit, &rnding);
4319   bf1[9] = half_btf_0_m_neon_r(&cospi[36], &bf1[14], &v_bit, &rnding);
4320   bf1[14] = half_btf_0_neon_r(&cospi[28], &bf1[14], &v_bit, &rnding);
4321   bf1[13] = half_btf_0_neon_r(&cospi[20], &bf1[10], &v_bit, &rnding);
4322   bf1[10] = half_btf_0_neon_r(&cospi[44], &bf1[10], &v_bit, &rnding);
4323   bf1[11] = half_btf_0_m_neon_r(&cospi[52], &bf1[12], &v_bit, &rnding);
4324   bf1[12] = half_btf_0_neon_r(&cospi[12], &bf1[12], &v_bit, &rnding);
4325 
4326   addsub_neon(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
4327   addsub_neon(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
4328   addsub_neon(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
4329   addsub_neon(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
4330   addsub_neon(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
4331   addsub_neon(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
4332   addsub_neon(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
4333   addsub_neon(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
4334   // stage 4
4335   bf1[7] = half_btf_0_neon_r(&cospi[8], &bf1[4], &v_bit, &rnding);
4336   bf1[4] = half_btf_0_neon_r(&cospi[56], &bf1[4], &v_bit, &rnding);
4337   bf1[5] = half_btf_0_m_neon_r(&cospi[40], &bf1[6], &v_bit, &rnding);
4338   bf1[6] = half_btf_0_neon_r(&cospi[24], &bf1[6], &v_bit, &rnding);
4339 
4340   addsub_neon(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
4341   addsub_neon(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
4342   addsub_neon(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
4343   addsub_neon(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
4344 
4345   idct32_stage4_neon(bf1, cospi, &v_bit, &rnding);
4346 
4347   // stage 5
4348   bf1[0] = half_btf_0_neon_r(&cospi[32], &bf1[0], &v_bit, &rnding);
4349   bf1[1] = bf1[0];
4350   bf1[3] = half_btf_0_neon_r(&cospi[16], &bf1[2], &v_bit, &rnding);
4351   bf1[2] = half_btf_0_neon_r(&cospi[48], &bf1[2], &v_bit, &rnding);
4352 
4353   addsub_neon(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
4354   addsub_neon(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
4355 
4356   idct32_stage5_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
4357 
4358   // stage 6
4359   addsub_neon(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
4360   addsub_neon(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
4361 
4362   idct32_stage6_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
4363 
4364   // stage 7
4365   idct32_stage7_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
4366 
4367   // stage 8
4368   idct32_stage8_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
4369   // stage 9
4370   idct32_stage9_neon(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
4371 }
4372 
4373 static void idct32x32_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
4374                            int bd, int out_shift) {
4375   const int32_t *cospi = cospi_arr(bit);
4376   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4377   const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
4378   const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
4379   int32x4_t bf1[32], bf0[32];
4380   const int32x4_t v_bit = vdupq_n_s32(-bit);
4381   const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
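  // bf0 and bf1 act as ping-pong buffers: each stage reads one and writes the
  // other.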
4382   // stage 0
4383   // stage 1
4384   bf1[0] = in[0];
4385   bf1[1] = in[16];
4386   bf1[2] = in[8];
4387   bf1[3] = in[24];
4388   bf1[4] = in[4];
4389   bf1[5] = in[20];
4390   bf1[6] = in[12];
4391   bf1[7] = in[28];
4392   bf1[8] = in[2];
4393   bf1[9] = in[18];
4394   bf1[10] = in[10];
4395   bf1[11] = in[26];
4396   bf1[12] = in[6];
4397   bf1[13] = in[22];
4398   bf1[14] = in[14];
4399   bf1[15] = in[30];
4400   bf1[16] = in[1];
4401   bf1[17] = in[17];
4402   bf1[18] = in[9];
4403   bf1[19] = in[25];
4404   bf1[20] = in[5];
4405   bf1[21] = in[21];
4406   bf1[22] = in[13];
4407   bf1[23] = in[29];
4408   bf1[24] = in[3];
4409   bf1[25] = in[19];
4410   bf1[26] = in[11];
4411   bf1[27] = in[27];
4412   bf1[28] = in[7];
4413   bf1[29] = in[23];
4414   bf1[30] = in[15];
4415   bf1[31] = in[31];
4416 
4417   // stage 2
4418   for (int i = 0; i < 16; i++) bf0[i] = bf1[i];
4419 
4420   bf0[16] = half_btf_neon_mode01_r(&cospi[62], &bf1[16], &cospi[2], &bf1[31],
4421                                    &v_bit, &rnding);
4422   bf0[17] = half_btf_neon_mode01_r(&cospi[30], &bf1[17], &cospi[34], &bf1[30],
4423                                    &v_bit, &rnding);
4424   bf0[18] = half_btf_neon_mode01_r(&cospi[46], &bf1[18], &cospi[18], &bf1[29],
4425                                    &v_bit, &rnding);
4426   bf0[19] = half_btf_neon_mode01_r(&cospi[14], &bf1[19], &cospi[50], &bf1[28],
4427                                    &v_bit, &rnding);
4428   bf0[20] = half_btf_neon_mode01_r(&cospi[54], &bf1[20], &cospi[10], &bf1[27],
4429                                    &v_bit, &rnding);
4430   bf0[21] = half_btf_neon_mode01_r(&cospi[22], &bf1[21], &cospi[42], &bf1[26],
4431                                    &v_bit, &rnding);
4432   bf0[22] = half_btf_neon_mode01_r(&cospi[38], &bf1[22], &cospi[26], &bf1[25],
4433                                    &v_bit, &rnding);
4434   bf0[23] = half_btf_neon_mode01_r(&cospi[6], &bf1[23], &cospi[58], &bf1[24],
4435                                    &v_bit, &rnding);
4436   bf0[24] = half_btf_neon_r(&cospi[58], &bf1[23], &cospi[6], &bf1[24], &v_bit,
4437                             &rnding);
4438   bf0[25] = half_btf_neon_r(&cospi[26], &bf1[22], &cospi[38], &bf1[25], &v_bit,
4439                             &rnding);
4440   bf0[26] = half_btf_neon_r(&cospi[42], &bf1[21], &cospi[22], &bf1[26], &v_bit,
4441                             &rnding);
4442   bf0[27] = half_btf_neon_r(&cospi[10], &bf1[20], &cospi[54], &bf1[27], &v_bit,
4443                             &rnding);
4444   bf0[28] = half_btf_neon_r(&cospi[50], &bf1[19], &cospi[14], &bf1[28], &v_bit,
4445                             &rnding);
4446   bf0[29] = half_btf_neon_r(&cospi[18], &bf1[18], &cospi[46], &bf1[29], &v_bit,
4447                             &rnding);
4448   bf0[30] = half_btf_neon_r(&cospi[34], &bf1[17], &cospi[30], &bf1[30], &v_bit,
4449                             &rnding);
4450   bf0[31] = half_btf_neon_r(&cospi[2], &bf1[16], &cospi[62], &bf1[31], &v_bit,
4451                             &rnding);
4452 
4453   // stage 3
4454   for (int i = 0; i < 8; i++) bf1[i] = bf0[i];
4455 
4456   bf1[8] = half_btf_neon_mode01_r(&cospi[60], &bf0[8], &cospi[4], &bf0[15],
4457                                   &v_bit, &rnding);
4458   bf1[9] = half_btf_neon_mode01_r(&cospi[28], &bf0[9], &cospi[36], &bf0[14],
4459                                   &v_bit, &rnding);
4460   bf1[10] = half_btf_neon_mode01_r(&cospi[44], &bf0[10], &cospi[20], &bf0[13],
4461                                    &v_bit, &rnding);
4462   bf1[11] = half_btf_neon_mode01_r(&cospi[12], &bf0[11], &cospi[52], &bf0[12],
4463                                    &v_bit, &rnding);
4464   bf1[12] = half_btf_neon_r(&cospi[52], &bf0[11], &cospi[12], &bf0[12], &v_bit,
4465                             &rnding);
4466   bf1[13] = half_btf_neon_r(&cospi[20], &bf0[10], &cospi[44], &bf0[13], &v_bit,
4467                             &rnding);
4468   bf1[14] = half_btf_neon_r(&cospi[36], &bf0[9], &cospi[28], &bf0[14], &v_bit,
4469                             &rnding);
4470   bf1[15] = half_btf_neon_r(&cospi[4], &bf0[8], &cospi[60], &bf0[15], &v_bit,
4471                             &rnding);
4472 
4473   addsub_neon(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
4474   addsub_neon(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
4475   addsub_neon(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
4476   addsub_neon(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
4477   addsub_neon(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
4478   addsub_neon(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
4479   addsub_neon(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
4480   addsub_neon(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
4481 
4482   // stage 4
4483   bf0[0] = bf1[0];
4484   bf0[1] = bf1[1];
4485   bf0[2] = bf1[2];
4486   bf0[3] = bf1[3];
4487   bf0[4] = half_btf_neon_mode01_r(&cospi[56], &bf1[4], &cospi[8], &bf1[7],
4488                                   &v_bit, &rnding);
4489   bf0[5] = half_btf_neon_mode01_r(&cospi[24], &bf1[5], &cospi[40], &bf1[6],
4490                                   &v_bit, &rnding);
4491   bf0[6] = half_btf_neon_r(&cospi[40], &bf1[5], &cospi[24], &bf1[6], &v_bit,
4492                            &rnding);
4493   bf0[7] =
4494       half_btf_neon_r(&cospi[8], &bf1[4], &cospi[56], &bf1[7], &v_bit, &rnding);
4495 
4496   addsub_neon(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
4497   addsub_neon(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
4498   addsub_neon(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
4499   addsub_neon(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
4500 
4501   bf0[16] = bf1[16];
4502   bf0[17] = half_btf_neon_mode10_r(&cospi[8], &bf1[17], &cospi[56], &bf1[30],
4503                                    &v_bit, &rnding);
4504   bf0[18] = half_btf_neon_mode11_r(&cospi[56], &bf1[18], &cospi[8], &bf1[29],
4505                                    &v_bit, &rnding);
4506   bf0[19] = bf1[19];
4507   bf0[20] = bf1[20];
4508   bf0[21] = half_btf_neon_mode10_r(&cospi[40], &bf1[21], &cospi[24], &bf1[26],
4509                                    &v_bit, &rnding);
4510   bf0[22] = half_btf_neon_mode11_r(&cospi[24], &bf1[22], &cospi[40], &bf1[25],
4511                                    &v_bit, &rnding);
4512   bf0[23] = bf1[23];
4513   bf0[24] = bf1[24];
4514   bf0[25] = half_btf_neon_mode10_r(&cospi[40], &bf1[22], &cospi[24], &bf1[25],
4515                                    &v_bit, &rnding);
4516   bf0[26] = half_btf_neon_r(&cospi[24], &bf1[21], &cospi[40], &bf1[26], &v_bit,
4517                             &rnding);
4518   bf0[27] = bf1[27];
4519   bf0[28] = bf1[28];
4520   bf0[29] = half_btf_neon_mode10_r(&cospi[8], &bf1[18], &cospi[56], &bf1[29],
4521                                    &v_bit, &rnding);
4522   bf0[30] = half_btf_neon_r(&cospi[56], &bf1[17], &cospi[8], &bf1[30], &v_bit,
4523                             &rnding);
4524   bf0[31] = bf1[31];
4525 
4526   // stage 5
4527   bf1[0] = half_btf_neon_r(&cospi[32], &bf0[0], &cospi[32], &bf0[1], &v_bit,
4528                            &rnding);
4529   bf1[1] = half_btf_neon_mode01_r(&cospi[32], &bf0[0], &cospi[32], &bf0[1],
4530                                   &v_bit, &rnding);
4531   bf1[2] = half_btf_neon_mode01_r(&cospi[48], &bf0[2], &cospi[16], &bf0[3],
4532                                   &v_bit, &rnding);
4533   bf1[3] = half_btf_neon_r(&cospi[16], &bf0[2], &cospi[48], &bf0[3], &v_bit,
4534                            &rnding);
4535   addsub_neon(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
4536   addsub_neon(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
4537   bf1[8] = bf0[8];
4538   bf1[9] = half_btf_neon_mode10_r(&cospi[16], &bf0[9], &cospi[48], &bf0[14],
4539                                   &v_bit, &rnding);
4540   bf1[10] = half_btf_neon_mode11_r(&cospi[48], &bf0[10], &cospi[16], &bf0[13],
4541                                    &v_bit, &rnding);
4542   bf1[11] = bf0[11];
4543   bf1[12] = bf0[12];
4544   bf1[13] = half_btf_neon_mode10_r(&cospi[16], &bf0[10], &cospi[48], &bf0[13],
4545                                    &v_bit, &rnding);
4546   bf1[14] = half_btf_neon_r(&cospi[48], &bf0[9], &cospi[16], &bf0[14], &v_bit,
4547                             &rnding);
4548   bf1[15] = bf0[15];
4549   addsub_neon(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
4550   addsub_neon(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
4551   addsub_neon(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
4552   addsub_neon(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
4553   addsub_neon(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
4554   addsub_neon(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
4555   addsub_neon(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
4556   addsub_neon(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
4557 
4558   // stage 6
4559   addsub_neon(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
4560   addsub_neon(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
4561   bf0[4] = bf1[4];
4562   bf0[5] = half_btf_neon_mode10_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6],
4563                                   &v_bit, &rnding);
4564   bf0[6] = half_btf_neon_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], &v_bit,
4565                            &rnding);
4566   bf0[7] = bf1[7];
4567   addsub_neon(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
4568   addsub_neon(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
4569   addsub_neon(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
4570   addsub_neon(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
4571   bf0[16] = bf1[16];
4572   bf0[17] = bf1[17];
4573   bf0[18] = half_btf_neon_mode10_r(&cospi[16], &bf1[18], &cospi[48], &bf1[29],
4574                                    &v_bit, &rnding);
4575   bf0[19] = half_btf_neon_mode10_r(&cospi[16], &bf1[19], &cospi[48], &bf1[28],
4576                                    &v_bit, &rnding);
4577   bf0[20] = half_btf_neon_mode11_r(&cospi[48], &bf1[20], &cospi[16], &bf1[27],
4578                                    &v_bit, &rnding);
4579   bf0[21] = half_btf_neon_mode11_r(&cospi[48], &bf1[21], &cospi[16], &bf1[26],
4580                                    &v_bit, &rnding);
4581   bf0[22] = bf1[22];
4582   bf0[23] = bf1[23];
4583   bf0[24] = bf1[24];
4584   bf0[25] = bf1[25];
4585   bf0[26] = half_btf_neon_mode10_r(&cospi[16], &bf1[21], &cospi[48], &bf1[26],
4586                                    &v_bit, &rnding);
4587   bf0[27] = half_btf_neon_mode10_r(&cospi[16], &bf1[20], &cospi[48], &bf1[27],
4588                                    &v_bit, &rnding);
4589   bf0[28] = half_btf_neon_r(&cospi[48], &bf1[19], &cospi[16], &bf1[28], &v_bit,
4590                             &rnding);
4591   bf0[29] = half_btf_neon_r(&cospi[48], &bf1[18], &cospi[16], &bf1[29], &v_bit,
4592                             &rnding);
4593   bf0[30] = bf1[30];
4594   bf0[31] = bf1[31];
4595 
4596   // stage 7
4597   addsub_neon(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
4598   addsub_neon(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
4599   addsub_neon(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
4600   addsub_neon(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
4601   bf1[8] = bf0[8];
4602   bf1[9] = bf0[9];
4603   bf1[10] = half_btf_neon_mode10_r(&cospi[32], &bf0[10], &cospi[32], &bf0[13],
4604                                    &v_bit, &rnding);
4605   bf1[11] = half_btf_neon_mode10_r(&cospi[32], &bf0[11], &cospi[32], &bf0[12],
4606                                    &v_bit, &rnding);
4607   bf1[12] = half_btf_neon_r(&cospi[32], &bf0[11], &cospi[32], &bf0[12], &v_bit,
4608                             &rnding);
4609   bf1[13] = half_btf_neon_r(&cospi[32], &bf0[10], &cospi[32], &bf0[13], &v_bit,
4610                             &rnding);
4611   bf1[14] = bf0[14];
4612   bf1[15] = bf0[15];
4613   addsub_neon(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
4614   addsub_neon(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
4615   addsub_neon(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
4616   addsub_neon(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
4617   addsub_neon(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
4618   addsub_neon(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
4619   addsub_neon(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
4620   addsub_neon(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
4621 
4622   // stage 8
4623   addsub_neon(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
4624   addsub_neon(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
4625   addsub_neon(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
4626   addsub_neon(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
4627   addsub_neon(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
4628   addsub_neon(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
4629   addsub_neon(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
4630   addsub_neon(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
4631   bf0[16] = bf1[16];
4632   bf0[17] = bf1[17];
4633   bf0[18] = bf1[18];
4634   bf0[19] = bf1[19];
4635   bf0[20] = half_btf_neon_mode10_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27],
4636                                    &v_bit, &rnding);
4637   bf0[21] = half_btf_neon_mode10_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26],
4638                                    &v_bit, &rnding);
4639   bf0[22] = half_btf_neon_mode10_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25],
4640                                    &v_bit, &rnding);
4641   bf0[23] = half_btf_neon_mode10_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24],
4642                                    &v_bit, &rnding);
4643   bf0[24] = half_btf_neon_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24], &v_bit,
4644                             &rnding);
4645   bf0[25] = half_btf_neon_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25], &v_bit,
4646                             &rnding);
4647   bf0[26] = half_btf_neon_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26], &v_bit,
4648                             &rnding);
4649   bf0[27] = half_btf_neon_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27], &v_bit,
4650                             &rnding);
4651   bf0[28] = bf1[28];
4652   bf0[29] = bf1[29];
4653   bf0[30] = bf1[30];
4654   bf0[31] = bf1[31];
4655 
4656   // stage 9
4657   addsub_neon(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
4658   addsub_neon(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
4659   addsub_neon(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
4660   addsub_neon(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
4661   addsub_neon(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
4662   addsub_neon(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
4663   addsub_neon(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
4664   addsub_neon(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
4665   addsub_neon(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
4666   addsub_neon(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
4667   addsub_neon(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
4668   addsub_neon(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
4669   addsub_neon(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
4670   addsub_neon(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
4671   addsub_neon(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
4672   addsub_neon(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
4673 
4674   if (!do_cols) {
4675     const int log_range_out = AOMMAX(16, bd + 6);
4676     const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
4677     const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
4678     round_shift_8x8(out, out_shift);
4679     round_shift_8x8(out + 16, out_shift);
4680     highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32);
4681   }
4682 }
4683 
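// 32-point identity transform: each coefficient is scaled by 4 (a left shift
// by 2); the row pass (!do_cols) then applies the output shift and clamps to
// the intermediate range.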
4684 static void iidentity32_neon(int32x4_t *in, int32x4_t *out, int bit,
4685                              int do_cols, int bd, int out_shift) {
4686   (void)bit;
4687   for (int i = 0; i < 32; i += 16) {
4688     out[i] = vshlq_n_s32(in[i], 2);
4689     out[i + 1] = vshlq_n_s32(in[i + 1], 2);
4690     out[i + 2] = vshlq_n_s32(in[i + 2], 2);
4691     out[i + 3] = vshlq_n_s32(in[i + 3], 2);
4692     out[i + 4] = vshlq_n_s32(in[i + 4], 2);
4693     out[i + 5] = vshlq_n_s32(in[i + 5], 2);
4694     out[i + 6] = vshlq_n_s32(in[i + 6], 2);
4695     out[i + 7] = vshlq_n_s32(in[i + 7], 2);
4696     out[i + 8] = vshlq_n_s32(in[i + 8], 2);
4697     out[i + 9] = vshlq_n_s32(in[i + 9], 2);
4698     out[i + 10] = vshlq_n_s32(in[i + 10], 2);
4699     out[i + 11] = vshlq_n_s32(in[i + 11], 2);
4700     out[i + 12] = vshlq_n_s32(in[i + 12], 2);
4701     out[i + 13] = vshlq_n_s32(in[i + 13], 2);
4702     out[i + 14] = vshlq_n_s32(in[i + 14], 2);
4703     out[i + 15] = vshlq_n_s32(in[i + 15], 2);
4704   }
4705 
4706   if (!do_cols) {
4707     const int log_range_out = AOMMAX(16, bd + 6);
4708     const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
4709     const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
4710     round_shift_8x8(out, out_shift);
4711     round_shift_8x8(out + 16, out_shift);
4712     highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32);
4713   }
4714 }
4715 
4716 // 1D itx types
4717 typedef enum ATTRIBUTE_PACKED {
4718   IDCT_1D,
4719   IADST_1D,
4720   IFLIPADST_1D = IADST_1D,
4721   IIDENTITY_1D,
4722   ITX_TYPES_1D,
4723 } ITX_TYPE_1D;
4724 
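// 1-D transform type applied vertically (vitx) and horizontally (hitx) for
// each 2-D TX_TYPE.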
4725 static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
4726   IDCT_1D,      IADST_1D,     IDCT_1D,      IADST_1D,
4727   IFLIPADST_1D, IDCT_1D,      IFLIPADST_1D, IADST_1D,
4728   IFLIPADST_1D, IIDENTITY_1D, IDCT_1D,      IIDENTITY_1D,
4729   IADST_1D,     IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
4730 };
4731 static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
4732   IDCT_1D,      IDCT_1D,      IADST_1D,     IADST_1D,
4733   IDCT_1D,      IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
4734   IADST_1D,     IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
4735   IIDENTITY_1D, IADST_1D,     IIDENTITY_1D, IFLIPADST_1D,
4736 };
4737 
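// 1-D kernels indexed by [tx dimension][1-D type][variant]; the last index
// selects a reduced kernel based on how many input coefficients can be
// nonzero (see lowbd_txfm_all_1d_zeros_idx below).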
4738 static const transform_1d_neon
4739     highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
4740       {
4741           { idct4x4_neon, NULL, NULL, NULL },
4742           { iadst4x4_neon, NULL, NULL, NULL },
4743           { iidentity4_neon, iidentity4_neon, iidentity4_neon, NULL },
4744       },
4745       { { idct8x8_low1_neon, idct8x8_new_neon, NULL, NULL },
4746         { iadst8x8_low1_neon, iadst8x8_new_neon, NULL, NULL },
4747         { iidentity8_neon, iidentity8_neon, NULL, NULL } },
4748       {
4749           { idct16x16_low1_neon, idct16x16_low8_neon, idct16x16_neon, NULL },
4750           { iadst16x16_low1_neon, iadst16x16_low8_neon, iadst16x16_neon, NULL },
4751           { iidentity16_neon, NULL, iidentity16_neon, NULL },
4752       },
4753       { { idct32x32_low1_neon, idct32x32_low8_neon, idct32x32_low16_neon,
4754           idct32x32_neon },
4755         { NULL, NULL, NULL, NULL },
4756         { iidentity32_neon, NULL, NULL, NULL } },
4757       { { idct64x64_low1_neon, idct64x64_low8_neon, idct64x64_low16_neon,
4758           idct64x64_neon },
4759         { NULL, NULL, NULL, NULL },
4760         { NULL, NULL, NULL, NULL } }
4761     };
4762 
4763 void av1_inv_txfm2d_add_4x8_neon(const tran_low_t *input, uint16_t *output,
4764                                  int stride, TX_TYPE tx_type, const int bd) {
4765   TX_SIZE tx_size = TX_4X8;
4766   int32x4_t buf1[32] = { vdupq_n_s32(0) };
4767 
4768   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
4769   const int txw_idx = get_txw_idx(tx_size);
4770   const int txh_idx = get_txh_idx(tx_size);
4771   const int txfm_size_col = tx_size_wide[tx_size];
4772   const int txfm_size_row = tx_size_high[tx_size];
4773   const transform_1d_neon row_txfm =
4774       highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
4775   const transform_1d_neon col_txfm =
4776       highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1];
4777   const int input_stride = AOMMIN(32, txfm_size_row);
4778 
4779   assert(col_txfm != NULL);
4780   assert(row_txfm != NULL);
4781   int ud_flip, lr_flip;
4782   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4783 
4784   // 1st stage: row transform
4785   int32x4_t buf0[8];
4786   load_buffer_32bit_input(input, input_stride, buf0, txfm_size_col);
4787   load_buffer_32bit_input(input + 4, input_stride, buf0 + 4, txfm_size_col);
4788   round_shift_rect_array_32_neon(buf0, buf0, txfm_size_row);
4789   row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
4790   row_txfm(buf0 + 4, buf0 + 4, INV_COS_BIT, 0, bd, -shift[0]);
4791 
4792   if (lr_flip) {
4793     TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2],
4794                   buf1[3]);
4795 
4796     TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6],
4797                   buf1[7]);
4798   } else {
4799     TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2],
4800                   buf1[3]);
4801 
4802     TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6],
4803                   buf1[7]);
4804   }
4805 
4806   // 2nd stage: column transform
4807   col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);
4808 
4809   round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]);
4810 
4811   // write to buffer
4812   highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row,
4813                                bd);
4814 }
4815 
4816 void av1_inv_txfm2d_add_8x4_neon(const int32_t *input, uint16_t *output,
4817                                  int stride, TX_TYPE tx_type, const int bd) {
4818   TX_SIZE tx_size = TX_8X4;
4819   int32x4_t buf1[8];
4820   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
4821   const int txw_idx = get_txw_idx(tx_size);
4822   const int txh_idx = get_txh_idx(tx_size);
4823   const int txfm_size_col = tx_size_wide[tx_size];
4824   const int txfm_size_row = tx_size_high[tx_size];
4825   const transform_1d_neon row_txfm =
4826       highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1];
4827   const transform_1d_neon col_txfm =
4828       highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
4829 
4830   assert(col_txfm != NULL);
4831   assert(row_txfm != NULL);
4832   int ud_flip, lr_flip;
4833   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4834 
4835   // 1st stage: row transform
4836   int32x4_t buf0[8];
4837   const int32_t *input_row = input;
4838   load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);
4839 
4840   round_shift_rect_array_32_neon(buf0, buf0, txfm_size_col);
4841   row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
4842 
4843   int32x4_t *buf1_ptr;
4844   if (lr_flip) {
4845     flip_buf_neon(buf0, buf1, txfm_size_col);
4846     buf1_ptr = buf1;
4847   } else {
4848     buf1_ptr = buf0;
4849   }
4850 
4851   // 2nd stage: column transform
4852   for (int i = 0; i < 2; i++) {
4853     int32x4_t *buf1_cur = buf1_ptr + i * txfm_size_row;
4854     transpose_4x4(buf1_cur, buf1_cur);
4855     col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0);
4856   }
4857   round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
4858   // write to buffer
4859   highbd_write_buffer_8xn_neon(buf1_ptr, output, stride, ud_flip, txfm_size_row,
4860                                bd);
4861 }
4862 
4863 void av1_inv_txfm2d_add_4x16_neon(const int32_t *input, uint16_t *output,
4864                                   int stride, TX_TYPE tx_type, const int bd) {
4865   TX_SIZE tx_size = TX_4X16;
4866   int32x4_t buf1[16];
4867   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
4868   const int txw_idx = get_txw_idx(tx_size);
4869   const int txh_idx = get_txh_idx(tx_size);
4870   const int txfm_size_col = tx_size_wide[tx_size];
4871   const int txfm_size_row = tx_size_high[tx_size];
4872   const int buf_size_h_div8 = txfm_size_row >> 2;
4873   const transform_1d_neon row_txfm =
4874       highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
4875   const transform_1d_neon col_txfm =
4876       highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2];
4877   const int input_stride = AOMMIN(32, txfm_size_row);
4878 
4879   assert(col_txfm != NULL);
4880   assert(row_txfm != NULL);
4881   int ud_flip, lr_flip;
4882   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4883 
4884   // 1st stage: row transform
4885   int32x4_t buf0[16];
4886   for (int i = 0; i < (txfm_size_row >> 2); i++) {
4887     const int32_t *input_row = input + i * 4;
4888     int32x4_t *buf0_cur = buf0 + i * 4;
4889     load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_col);
4890     row_txfm(buf0 + (i << 2), buf0 + (i << 2), INV_COS_BIT, 0, bd, -shift[0]);
4891   }
4892 
4893   if (lr_flip) {
4894     for (int j = 0; j < buf_size_h_div8; ++j) {
4895       TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
4896                     buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2],
4897                     buf1[4 * j + 3]);
4898     }
4899   } else {
4900     for (int j = 0; j < buf_size_h_div8; ++j) {
4901       TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2],
4902                     buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1],
4903                     buf1[4 * j + 2], buf1[4 * j + 3]);
4904     }
4905   }
4906 
4907   // 2nd stage: column transform
4908   col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);
4909 
4910   round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]);
4911 
4912   // write to buffer
4913   highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row,
4914                                bd);
4915 }
4916 
4917 void av1_inv_txfm2d_add_16x4_neon(const int32_t *input, uint16_t *output,
4918                                   int stride, TX_TYPE tx_type, const int bd) {
4919   TX_SIZE tx_size = TX_16X4;
4920   int32x4_t buf1[16];
4921   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
4922   const int txw_idx = get_txw_idx(tx_size);
4923   const int txh_idx = get_txh_idx(tx_size);
4924   const int txfm_size_col = tx_size_wide[tx_size];
4925   const int txfm_size_row = tx_size_high[tx_size];
4926   const int buf_size_w_div8 = txfm_size_col >> 2;
4927   const transform_1d_neon row_txfm =
4928       highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2];
4929   const transform_1d_neon col_txfm =
4930       highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
4931 
4932   assert(col_txfm != NULL);
4933   assert(row_txfm != NULL);
4934   int ud_flip, lr_flip;
4935   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4936 
4937   // 1st stage: row transform
4938   int32x4_t buf0[16];
4939   const int32_t *input_row = input;
4940   load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);
4941 
4942   row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
4943 
4944   int32x4_t *buf1_ptr;
4945   if (lr_flip) {
4946     flip_buf_neon(buf0, buf1, txfm_size_col);
4947     buf1_ptr = buf1;
4948   } else {
4949     buf1_ptr = buf0;
4950   }
4951 
4952   // 2nd stage: column transform
4953   for (int i = 0; i < buf_size_w_div8; i++) {
4954     int32x4_t *buf1_cur = buf1_ptr + i * txfm_size_row;
4955     transpose_4x4(buf1_cur, buf1_cur);
4956     col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0);
4957   }
4958   round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
4959 
4960   // write to buffer
4961   for (int i = 0; i < (txfm_size_col >> 3); i++) {
4962     highbd_write_buffer_8xn_neon(buf1_ptr + i * txfm_size_row * 2,
4963                                  output + 8 * i, stride, ud_flip, txfm_size_row,
4964                                  bd);
4965   }
4966 }
4967 
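// Maps an eob coordinate (0..31) to the kernel-variant index used in
// highbd_txfm_all_1d_zeros_w8_arr: 0 for coordinate 0, 1 for 1..7, 2 for
// 8..15, 3 for 16..31.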
4968 static const int lowbd_txfm_all_1d_zeros_idx[32] = {
4969   0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
4970   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4971 };
4972 
4973 // Transform block width in log2 for eob (a width of 64 maps to 32)
4974 static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
4975   2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
4976 };
4977 
4978 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
4979   0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
4980 };
4981 
4982 DECLARE_ALIGNED(16, static const int16_t,
4983                 av1_eob_to_eobxy_16x16_default[16]) = {
4984   0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
4985   0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
4986 };
4987 
4988 DECLARE_ALIGNED(16, static const int16_t,
4989                 av1_eob_to_eobxy_32x32_default[32]) = {
4990   0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
4991   0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
4992   0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
4993   0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
4994 };
4995 
4996 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
4997   0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
4998   0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
4999 };
5000 
5001 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
5002   0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
5003 };
5004 
5005 DECLARE_ALIGNED(16, static const int16_t,
5006                 av1_eob_to_eobxy_16x32_default[32]) = {
5007   0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
5008   0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
5009   0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
5010   0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
5011 };
5012 
5013 DECLARE_ALIGNED(16, static const int16_t,
5014                 av1_eob_to_eobxy_32x16_default[16]) = {
5015   0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
5016   0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
5017 };
5018 
5019 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
5020   0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
5021   0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
5022   0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
5023   0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
5024 };
5025 
5026 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
5027   0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
5028 };
5029 
5030 DECLARE_ALIGNED(16, static const int16_t *,
5031                 av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
5032   NULL,
5033   av1_eob_to_eobxy_8x8_default,
5034   av1_eob_to_eobxy_16x16_default,
5035   av1_eob_to_eobxy_32x32_default,
5036   av1_eob_to_eobxy_32x32_default,
5037   NULL,
5038   NULL,
5039   av1_eob_to_eobxy_8x16_default,
5040   av1_eob_to_eobxy_16x8_default,
5041   av1_eob_to_eobxy_16x32_default,
5042   av1_eob_to_eobxy_32x16_default,
5043   av1_eob_to_eobxy_32x32_default,
5044   av1_eob_to_eobxy_32x32_default,
5045   NULL,
5046   NULL,
5047   av1_eob_to_eobxy_8x32_default,
5048   av1_eob_to_eobxy_32x8_default,
5049   av1_eob_to_eobxy_16x32_default,
5050   av1_eob_to_eobxy_32x16_default,
5051 };
5052 
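// Looks up the worst-case (eobx, eoby) for a given eob: the row of the last
// nonzero coefficient indexes the table above, which packs eobx in the low
// byte and eoby in the high byte.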
5053 static inline void highbd_get_eobx_eoby_scan_default(int *eobx, int *eoby,
5054                                                      TX_SIZE tx_size, int eob) {
5055   if (eob == 1) {
5056     *eobx = 0;
5057     *eoby = 0;
5058     return;
5059   }
5060 
5061   const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
5062   const int eob_row = (eob - 1) >> tx_w_log2;
5063   const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
5064   *eobx = eobxy & 0xFF;
5065   *eoby = eobxy >> 8;
5066 }
5067 
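// Worst-case (eobx, eoby) per TX_SIZE when only the block size is known; the
// literals are TX_SIZE enum values (e.g. 2 is TX_16X16, 3 is TX_32X32) and
// both coordinates are capped at 31 since at most 32 rows/columns are
// processed.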
5068 static inline void get_eobx_eoby_scan_default(int *eobx, int *eoby,
5069                                               TX_SIZE tx_size) {
5070   if (tx_size == 2) {
5071     *eoby = 15, *eobx = 15;
5072   } else if (tx_size == 3) {
5073     *eoby = 31, *eobx = 31;
5074   } else if (tx_size == 4) {
5075     *eoby = 31, *eobx = 31;
5076   } else if (tx_size == 7) {
5077     *eoby = 15, *eobx = 7;
5078   } else if (tx_size == 8) {
5079     *eoby = 7, *eobx = 15;
5080   } else if (tx_size == 9) {
5081     *eoby = 31, *eobx = 15;
5082   } else if (tx_size == 10) {
5083     *eoby = 15, *eobx = 31;
5084   } else if (tx_size == 11) {
5085     *eoby = 31, *eobx = 31;
5086   } else if (tx_size == 12) {
5087     *eoby = 31, *eobx = 31;
5088   } else if (tx_size == 15) {
5089     *eoby = 31, *eobx = 7;
5090   } else if (tx_size == 16) {
5091     *eoby = 7, *eobx = 31;
5092   } else if (tx_size == 17) {
5093     *eoby = 31, *eobx = 15;
5094   } else if (tx_size == 18) {
5095     *eoby = 15, *eobx = 31;
5096   } else {
5097     *eoby = 0, *eobx = 0;
5098   }
5099 }
5100 
5101 static inline void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
5102                                                  TX_SIZE tx_size) {
5103   const int txfm_size_row = tx_size_high[tx_size];
5104   *eoby = AOMMIN(32, txfm_size_row) - 1;
5105   *eobx = 0;
5106 }
5107 
5108 static inline void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
5109                                                  TX_SIZE tx_size) {
5110   const int txfm_size_col = tx_size_wide[tx_size];
5111   *eobx = AOMMIN(32, txfm_size_col) - 1;
5112   *eoby = 0;
5113 }
5114 
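// Identity transform along the rows, 1-D transform down the columns
// (V_DCT, V_ADST, V_FLIPADST).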
5115 static void inv_txfm2d_add_h_identity_neon(const int32_t *input,
5116                                            uint16_t *output, int stride,
5117                                            TX_TYPE tx_type, TX_SIZE tx_size,
5118                                            const int bd) {
5119   int32x4_t buf1[64];
5120   int eobx, eoby;
5121   get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size);
5122   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
5123   const int txw_idx = get_txw_idx(tx_size);
5124   const int txh_idx = get_txh_idx(tx_size);
5125   const int txfm_size_col = tx_size_wide[tx_size];
5126   const int txfm_size_row = tx_size_high[tx_size];
5127   const int buf_size_w = AOMMIN(32, txfm_size_col);
5128   const int buf_size_w_div4 = buf_size_w >> 2;
5129   const int buf_size_h_div8 = (eoby + 8) >> 3;
5130   const int row_max = AOMMIN(32, txfm_size_row);
5131   const int input_stride = row_max;
5132   const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
5133   const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
5134   const transform_1d_neon row_txfm =
5135       highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
5136   assert(row_txfm != NULL);
5137   const transform_1d_neon col_txfm =
5138       highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
5139   assert(col_txfm != NULL);
5140   int ud_flip, lr_flip;
5141   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
5142 
5143   for (int i = 0; i < (buf_size_h_div8 << 1); ++i) {
5144     int32x4_t buf0[16];
5145     load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w);
5146     if (rect_type == 1 || rect_type == -1) {
5147       round_shift_rect_array_32_neon(buf0, buf0, buf_size_w);
5148     }
5149     row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
5150 
5151     int32x4_t *_buf1 = buf1 + i * 4;
5152 
5153     for (int j = 0; j < buf_size_w_div4; ++j) {
5154       int32x4_t *buf0_cur = buf0 + j * 4;
5155       TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
5156                     buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
5157       _buf1[j * txfm_size_row + 0] = buf0_cur[0];
5158       _buf1[j * txfm_size_row + 1] = buf0_cur[1];
5159       _buf1[j * txfm_size_row + 2] = buf0_cur[2];
5160       _buf1[j * txfm_size_row + 3] = buf0_cur[3];
5161     }
5162   }
5163   for (int i = 0; i < buf_size_w_div4; i++) {
5164     col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
5165              bd, 0);
5166 
5167     round_shift_array_32_neon(buf1 + i * txfm_size_row,
5168                               buf1 + i * txfm_size_row, txfm_size_row,
5169                               -shift[1]);
5170   }
5171 
5172   // write to buffer
5173   for (int i = 0; i < (txfm_size_col >> 3); i++) {
5174     highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
5175                                  stride, ud_flip, txfm_size_row, bd);
5176   }
5177 }
5178 
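// 1-D transform across the rows, identity transform down the columns
// (H_DCT, H_ADST, H_FLIPADST).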
5179 static void inv_txfm2d_add_v_identity_neon(const int32_t *input,
5180                                            uint16_t *output, int stride,
5181                                            TX_TYPE tx_type, TX_SIZE tx_size,
5182                                            const int bd) {
5183   int32x4_t buf1[64];
5184   int eobx, eoby;
5185   get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size);
5186   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
5187   const int txw_idx = get_txw_idx(tx_size);
5188   const int txh_idx = get_txh_idx(tx_size);
5189   const int txfm_size_col = tx_size_wide[tx_size];
5190   const int txfm_size_row = tx_size_high[tx_size];
5191   const int buf_size_w_div4 = AOMMIN(32, txfm_size_col) >> 2;
5192   const int row_max = AOMMIN(32, txfm_size_row);
5193   const int input_stride = row_max;
5194   const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
5195   const int buf_size_nonzero_w = buf_size_nonzero_w_div8 << 3;
5196   const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
5197   const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
5198   const transform_1d_neon row_txfm =
5199       highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
5200   assert(row_txfm != NULL);
5201   const transform_1d_neon col_txfm =
5202       highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
5203   assert(col_txfm != NULL);
5204   int ud_flip, lr_flip;
5205   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
5206 
5207   for (int i = 0; i < (row_max >> 2); ++i) {
5208     int32x4_t buf0[16];
5209     load_buffer_32bit_input(input + i * 4, input_stride, buf0,
5210                             buf_size_nonzero_w);
5211     if (rect_type == 1 || rect_type == -1) {
5212       round_shift_rect_array_32_neon(buf0, buf0, buf_size_nonzero_w);
5213     }
5214     row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
5215 
5216     int32x4_t *_buf1 = buf1 + i * 4;
5217     if (lr_flip) {
5218       for (int j = 0; j < buf_size_w_div4; ++j) {
5219         TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
5220                       buf0[4 * j],
5221                       _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0],
5222                       _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1],
5223                       _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2],
5224                       _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]);
5225       }
5226     } else {
5227       for (int j = 0; j < buf_size_w_div4; ++j) {
5228         TRANSPOSE_4X4(
5229             buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
5230             _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
5231             _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
5232       }
5233     }
5234   }
5235   for (int i = 0; i < buf_size_w_div4; i++) {
5236     col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
5237              bd, 0);
5238 
5239     round_shift_array_32_neon(buf1 + i * txfm_size_row,
5240                               buf1 + i * txfm_size_row, txfm_size_row,
5241                               -shift[1]);
5242   }
5243 
5244   // write to buffer
5245   {
5246     for (int i = 0; i < (txfm_size_col >> 3); i++) {
5247       highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
5248                                    stride, ud_flip, txfm_size_row, bd);
5249     }
5250   }
5251 }
5252 
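// IDTX: identity transform in both directions, so no flip handling is needed.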
5253 static void inv_txfm2d_add_idtx_neon(const int32_t *input, uint16_t *output,
5254                                      int stride, TX_TYPE tx_type,
5255                                      TX_SIZE tx_size, const int bd) {
5256   int32x4_t buf1[64 * 4];
5257   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
5258   const int txw_idx = get_txw_idx(tx_size);
5259   const int txh_idx = get_txh_idx(tx_size);
5260   const int txfm_size_col = tx_size_wide[tx_size];
5261   const int txfm_size_row = tx_size_high[tx_size];
5262   const int row_max = AOMMIN(32, txfm_size_row);
5263   const int input_stride = row_max;
5264   const int buf_size_w = AOMMIN(32, txfm_size_col);
5265   const int buf_size_w_div4 = buf_size_w >> 2;
5266   const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
5267   const transform_1d_neon row_txfm =
5268       highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
5269   assert(row_txfm != NULL);
5270   const transform_1d_neon col_txfm =
5271       highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
5272   assert(col_txfm != NULL);
5273   for (int i = 0; i < (row_max >> 2); ++i) {
5274     int32x4_t buf0[32];
5275     load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w);
5276     if (rect_type == 1 || rect_type == -1) {
5277       round_shift_rect_array_32_neon(buf0, buf0, buf_size_w);
5278     }
5279     row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
5280 
5281     int32x4_t *_buf1 = buf1 + i * 4;
5282     for (int j = 0; j < buf_size_w_div4; ++j) {
5283       int32x4_t *buf0_cur = buf0 + j * 4;
5284       TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
5285                     buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
5286       _buf1[j * txfm_size_row + 0] = buf0_cur[0];
5287       _buf1[j * txfm_size_row + 1] = buf0_cur[1];
5288       _buf1[j * txfm_size_row + 2] = buf0_cur[2];
5289       _buf1[j * txfm_size_row + 3] = buf0_cur[3];
5290     }
5291   }
5292   for (int i = 0; i < buf_size_w_div4; i++) {
5293     col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
5294              bd, 0);
5295 
5296     round_shift_array_32_neon(buf1 + i * txfm_size_row,
5297                               buf1 + i * txfm_size_row, txfm_size_row,
5298                               -shift[1]);
5299   }
5300 
5301   // write to buffer
5302   {
5303     for (int i = 0; i < (txfm_size_col >> 3); i++) {
5304       highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
5305                                    stride, 0, txfm_size_row, bd);
5306     }
5307   }
5308 }
5309 
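// General 2-D transform path (all DCT/ADST/FLIPADST combinations) used when no
// eob value is passed in; see inv_txfm2d_add_universe_neon below.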
5310 static void inv_txfm2d_add_no_identity_neon(const int32_t *input,
5311                                             uint16_t *output, int stride,
5312                                             TX_TYPE tx_type, TX_SIZE tx_size,
5313                                             const int bd) {
5314   int32x4_t buf1[64 * 16];
5315   int eobx, eoby;
5316   get_eobx_eoby_scan_default(&eobx, &eoby, tx_size);
5317   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
5318   const int txw_idx = get_txw_idx(tx_size);
5319   const int txh_idx = get_txh_idx(tx_size);
5320   const int txfm_size_col = tx_size_wide[tx_size];
5321   const int txfm_size_row = tx_size_high[tx_size];
5322   const int buf_size_w_div4 = txfm_size_col >> 2;
5323   const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
5324   const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
5325   const int input_stride = AOMMIN(32, txfm_size_row);
5326   const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
5327 
5328   const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
5329   const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
5330   const transform_1d_neon row_txfm =
5331       highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
5332   const transform_1d_neon col_txfm =
5333       highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
5334 
5335   assert(col_txfm != NULL);
5336   assert(row_txfm != NULL);
5337   int ud_flip, lr_flip;
5338   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
5339   // 1st stage: row transform
5340   for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
5341     int32x4_t buf0[64];
5342     load_buffer_32bit_input(input + i * 4, input_stride, buf0,
5343                             buf_size_nonzero_w);
5344     if (rect_type == 1 || rect_type == -1) {
5345       round_shift_rect_array_32_neon(buf0, buf0, buf_size_nonzero_w);
5346     }
5347     row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
5348 
5349     int32x4_t *_buf1 = &buf1[i * 4];
5350 
5351     if (lr_flip) {
5352       for (int j = 0; j < buf_size_w_div4; ++j) {
5353         TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
5354                       buf0[4 * j],
5355                       _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0],
5356                       _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1],
5357                       _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2],
5358                       _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]);
5359       }
5360     } else {
5361       for (int j = 0; j < buf_size_w_div4; ++j) {
5362         TRANSPOSE_4X4(
5363             buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
5364             _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
5365             _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
5366       }
5367     }
5368   }
5369   // 2nd stage: column transform
5370   for (int i = 0; i < buf_size_w_div4; i++) {
5371     col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
5372              bd, 0);
5373 
5374     round_shift_array_32_neon(buf1 + i * txfm_size_row,
5375                               buf1 + i * txfm_size_row, txfm_size_row,
5376                               -shift[1]);
5377   }
5378 
5379   // write to buffer
5380   {
5381     for (int i = 0; i < (txfm_size_col >> 3); i++) {
5382       highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
5383                                    stride, ud_flip, txfm_size_row, bd);
5384     }
5385   }
5386 }
5387 
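// Same as inv_txfm2d_add_no_identity_neon above, but uses the eob value to
// bound the number of nonzero rows/columns that are actually transformed.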
5388 static void highbd_inv_txfm2d_add_no_identity_neon(const int32_t *input,
5389                                                    uint16_t *output, int stride,
5390                                                    TX_TYPE tx_type,
5391                                                    TX_SIZE tx_size, int eob,
5392                                                    const int bd) {
5393   int32x4_t buf1[64 * 16];
5394   int eobx, eoby;
5395   highbd_get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
5396   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
5397   const int txw_idx = get_txw_idx(tx_size);
5398   const int txh_idx = get_txh_idx(tx_size);
5399   const int txfm_size_col = tx_size_wide[tx_size];
5400   const int txfm_size_row = tx_size_high[tx_size];
5401   const int buf_size_w_div8 = txfm_size_col >> 2;
5402   const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
5403   const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
5404   const int input_stride = AOMMIN(32, txfm_size_col);
5405   const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
5406 
5407   const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
5408   const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
5409   const transform_1d_neon row_txfm =
5410       highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
5411   const transform_1d_neon col_txfm =
5412       highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
5413 
5414   assert(col_txfm != NULL);
5415   assert(row_txfm != NULL);
5416   int ud_flip, lr_flip;
5417   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
5418   // 1st stage: row transform
5419   for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
5420     int32x4_t buf0[64];
5421     const int32_t *input_row = input + i * input_stride * 4;
5422     for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) {
5423       int32x4_t *buf0_cur = &buf0[j * 4];
5424       load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
5425 
5426       TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
5427                     buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
5428     }
5429     if (rect_type == 1 || rect_type == -1) {
5430       round_shift_rect_array_32_neon(buf0, buf0, buf_size_nonzero_w_div8 << 3);
5431     }
5432     row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
5433 
5434     int32x4_t *_buf1 = &buf1[i * 4];
5435 
5436     if (lr_flip) {
5437       for (int j = 0; j < buf_size_w_div8; ++j) {
5438         TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
5439                       buf0[4 * j],
5440                       _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
5441                       _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
5442                       _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
5443                       _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
5444       }
5445     } else {
5446       for (int j = 0; j < buf_size_w_div8; ++j) {
5447         TRANSPOSE_4X4(
5448             buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
5449             _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
5450             _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
5451       }
5452     }
5453   }
5454   // 2nd stage: column transform
5455   for (int i = 0; i < buf_size_w_div8; i++) {
5456     col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
5457              bd, 0);
5458 
5459     round_shift_array_32_neon(buf1 + i * txfm_size_row,
5460                               buf1 + i * txfm_size_row, txfm_size_row,
5461                               -shift[1]);
5462   }
5463 
5464   // write to buffer
5465   {
5466     for (int i = 0; i < (txfm_size_col >> 3); i++) {
5467       highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
5468                                    stride, ud_flip, txfm_size_row, bd);
5469     }
5470   }
5471 }
5472 
5473 static void highbd_inv_txfm2d_add_universe_neon(const int32_t *input,
5474                                                 uint8_t *output, int stride,
5475                                                 TX_TYPE tx_type,
5476                                                 TX_SIZE tx_size, int eob,
5477                                                 const int bd) {
5478   switch (tx_type) {
5479     case DCT_DCT:
5480     case ADST_DCT:
5481     case DCT_ADST:
5482     case ADST_ADST:
5483     case FLIPADST_DCT:
5484     case DCT_FLIPADST:
5485     case FLIPADST_FLIPADST:
5486     case ADST_FLIPADST:
5487     case FLIPADST_ADST:
5488       highbd_inv_txfm2d_add_no_identity_neon(input, CONVERT_TO_SHORTPTR(output),
5489                                              stride, tx_type, tx_size, eob, bd);
5490       break;
5491     case V_DCT:
5492     case V_ADST:
5493     case V_FLIPADST:
5494       inv_txfm2d_add_h_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride,
5495                                      tx_type, tx_size, bd);
5496       break;
5497     case H_DCT:
5498     case H_ADST:
5499     case H_FLIPADST:
5500       inv_txfm2d_add_v_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride,
5501                                      tx_type, tx_size, bd);
5502       break;
5503     case IDTX:
5504       inv_txfm2d_add_idtx_neon(input, CONVERT_TO_SHORTPTR(output), stride,
5505                                tx_type, tx_size, bd);
5506       break;
5507     default: assert(0); break;
5508   }
5509 }
5510 
5511 static void inv_txfm2d_add_universe_neon(const int32_t *input, uint8_t *output,
5512                                          int stride, TX_TYPE tx_type,
5513                                          TX_SIZE tx_size, const int bd) {
5514   switch (tx_type) {
5515     case DCT_DCT:
5516     case ADST_DCT:
5517     case DCT_ADST:
5518     case ADST_ADST:
5519     case FLIPADST_DCT:
5520     case DCT_FLIPADST:
5521     case FLIPADST_FLIPADST:
5522     case ADST_FLIPADST:
5523     case FLIPADST_ADST:
5524       inv_txfm2d_add_no_identity_neon(input, CONVERT_TO_SHORTPTR(output),
5525                                       stride, tx_type, tx_size, bd);
5526       break;
5527     case V_DCT:
5528     case V_ADST:
5529     case V_FLIPADST:
5530       inv_txfm2d_add_h_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride,
5531                                      tx_type, tx_size, bd);
5532       break;
5533     case H_DCT:
5534     case H_ADST:
5535     case H_FLIPADST:
5536       inv_txfm2d_add_v_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride,
5537                                      tx_type, tx_size, bd);
5538       break;
5539     case IDTX:
5540       inv_txfm2d_add_idtx_neon(input, CONVERT_TO_SHORTPTR(output), stride,
5541                                tx_type, tx_size, bd);
5542       break;
5543     default: assert(0); break;
5544   }
5545 }
5546 
5547 static void highbd_inv_txfm_add_8x8_neon(const tran_low_t *input, uint8_t *dest,
5548                                          int stride,
5549                                          const TxfmParam *txfm_param) {
5550   int bd = txfm_param->bd;
5551   const TX_TYPE tx_type = txfm_param->tx_type;
5552   const int32_t *src = cast_to_int32(input);
5553   switch (tx_type) {
5554     case IDTX:
5555     case H_DCT:
5556     case H_ADST:
5557     case H_FLIPADST:
5558     case V_DCT:
5559     case V_ADST:
5560     case V_FLIPADST:
5561       highbd_inv_txfm2d_add_universe_neon(input, dest, stride, tx_type,
5562                                           txfm_param->tx_size, txfm_param->eob,
5563                                           bd);
5564       break;
5565     default:
5566       av1_inv_txfm2d_add_8x8_neon(src, CONVERT_TO_SHORTPTR(dest), stride,
5567                                   tx_type, bd);
5568       break;
5569   }
5570 }
5571 
5572 static void highbd_inv_txfm_add_4x4_neon(const tran_low_t *input, uint8_t *dest,
5573                                          int stride,
5574                                          const TxfmParam *txfm_param) {
5575   assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
5576   int eob = txfm_param->eob;
5577   int bd = txfm_param->bd;
5578   int lossless = txfm_param->lossless;
5579   const int32_t *src = cast_to_int32(input);
5580   const TX_TYPE tx_type = txfm_param->tx_type;
5581   if (lossless) {
5582     assert(tx_type == DCT_DCT);
5583     av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
5584     return;
5585   }
5586   av1_inv_txfm2d_add_4x4_neon(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
5587                               bd);
5588 }
5589 
5590 void av1_inv_txfm2d_add_8x16_neon(const tran_low_t *input, uint16_t *dest,
5591                                   int stride, TX_TYPE tx_type, const int bd) {
5592   inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_8X16,
5593                                bd);
5594 }
5595 
5596 void av1_inv_txfm2d_add_16x8_neon(const tran_low_t *input, uint16_t *dest,
5597                                   int stride, TX_TYPE tx_type, const int bd) {
5598   inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_16X8,
5599                                bd);
5600 }
5601 
5602 void av1_inv_txfm2d_add_16x32_neon(const tran_low_t *input, uint16_t *dest,
5603                                    int stride, TX_TYPE tx_type, const int bd) {
5604   inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
5605                                TX_16X32, bd);
5606 }
5607 
5608 void av1_inv_txfm2d_add_32x16_neon(const tran_low_t *input, uint16_t *dest,
5609                                    int stride, TX_TYPE tx_type, const int bd) {
5610   inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
5611                                TX_32X16, bd);
5612 }
5613 
5614 void av1_inv_txfm2d_add_32x32_neon(const tran_low_t *input, uint16_t *dest,
5615                                    int stride, TX_TYPE tx_type, const int bd) {
5616   inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
5617                                TX_32X32, bd);
5618 }
5619 
5620 void av1_inv_txfm2d_add_64x64_neon(const tran_low_t *input, uint16_t *dest,
5621                                    int stride, TX_TYPE tx_type, const int bd) {
5622   inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
5623                                TX_64X64, bd);
5624 }
5625 
5626 void av1_inv_txfm2d_add_32x64_neon(const tran_low_t *input, uint16_t *dest,
5627                                    int stride, TX_TYPE tx_type, const int bd) {
5628   inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
5629                                TX_32X64, bd);
5630 }
5631 
5632 void av1_inv_txfm2d_add_64x32_neon(const tran_low_t *input, uint16_t *dest,
5633                                    int stride, TX_TYPE tx_type, const int bd) {
5634   inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
5635                                TX_64X32, bd);
5636 }
5637 
5638 void av1_inv_txfm2d_add_64x16_neon(const tran_low_t *input, uint16_t *dest,
5639                                    int stride, TX_TYPE tx_type, const int bd) {
5640   inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
5641                                TX_64X16, bd);
5642 }
5643 
5644 void av1_inv_txfm2d_add_16x64_neon(const tran_low_t *input, uint16_t *dest,
5645                                    int stride, TX_TYPE tx_type, const int bd) {
5646   inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
5647                                TX_16X64, bd);
5648 }
5649 
5650 static void av1_inv_txfm2d_add_16x16_neon(const tran_low_t *input,
5651                                           uint16_t *dest, int stride,
5652                                           TX_TYPE tx_type, const int bd) {
5653   inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
5654                                TX_16X16, bd);
5655 }
5656 
5657 void av1_inv_txfm2d_add_32x8_neon(const tran_low_t *input, uint16_t *dest,
5658                                   int stride, TX_TYPE tx_type, const int bd) {
5659   inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_32X8,
5660                                bd);
5661 }
5662 
5663 void av1_inv_txfm2d_add_8x32_neon(const tran_low_t *input, uint16_t *dest,
5664                                   int stride, TX_TYPE tx_type, const int bd) {
5665   inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_8X32,
5666                                bd);
5667 }
5668 
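// High bitdepth inverse transform + add entry point: dispatches on tx_size,
// with 4x4 (lossless WHT) and 8x8 additionally dispatching on tx_type.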
5669 void av1_highbd_inv_txfm_add_neon(const tran_low_t *input, uint8_t *dest,
5670                                   int stride, const TxfmParam *txfm_param) {
5671   const TX_SIZE tx_size = txfm_param->tx_size;
5672 
5673   TX_TYPE tx_type = txfm_param->tx_type;
5674   int bd = txfm_param->bd;
5675   switch (tx_size) {
5676     case TX_8X8:
5677       highbd_inv_txfm_add_8x8_neon(input, dest, stride, txfm_param);
5678       break;
5679     case TX_4X8:
5680       av1_inv_txfm2d_add_4x8_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
5681                                   txfm_param->tx_type, txfm_param->bd);
5682       break;
5683     case TX_8X4:
5684       av1_inv_txfm2d_add_8x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
5685                                   txfm_param->tx_type, txfm_param->bd);
5686       break;
5687     case TX_4X4:
5688       highbd_inv_txfm_add_4x4_neon(input, dest, stride, txfm_param);
5689       break;
5690     case TX_16X4:
5691       av1_inv_txfm2d_add_16x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
5692                                    txfm_param->tx_type, txfm_param->bd);
5693       break;
5694     case TX_4X16:
5695       av1_inv_txfm2d_add_4x16_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
5696                                    txfm_param->tx_type, txfm_param->bd);
5697       break;
5698     case TX_8X16:
5699       av1_inv_txfm2d_add_8x16_neon(input, (uint16_t *)dest, stride, tx_type,
5700                                    bd);
5701       break;
5702     case TX_16X8:
5703       av1_inv_txfm2d_add_16x8_neon(input, (uint16_t *)dest, stride, tx_type,
5704                                    bd);
5705       break;
5706     case TX_16X32:
5707       av1_inv_txfm2d_add_16x32_neon(input, (uint16_t *)dest, stride, tx_type,
5708                                     bd);
5709       break;
5710     case TX_32X16:
5711       av1_inv_txfm2d_add_32x16_neon(input, (uint16_t *)dest, stride, tx_type,
5712                                     bd);
5713       break;
5714     case TX_16X16:
5715       av1_inv_txfm2d_add_16x16_neon(input, (uint16_t *)dest, stride, tx_type,
5716                                     bd);
5717       break;
5718     case TX_32X32:
5719       av1_inv_txfm2d_add_32x32_neon(input, (uint16_t *)dest, stride, tx_type,
5720                                     bd);
5721       break;
5722     case TX_64X64:
5723       av1_inv_txfm2d_add_64x64_neon(input, (uint16_t *)dest, stride, tx_type,
5724                                     bd);
5725       break;
5726     case TX_32X64:
5727       av1_inv_txfm2d_add_32x64_neon(input, (uint16_t *)dest, stride, tx_type,
5728                                     bd);
5729       break;
5730     case TX_64X32:
5731       av1_inv_txfm2d_add_64x32_neon(input, (uint16_t *)dest, stride, tx_type,
5732                                     bd);
5733       break;
5734     case TX_16X64:
5735       av1_inv_txfm2d_add_16x64_neon(input, (uint16_t *)dest, stride, tx_type,
5736                                     bd);
5737       break;
5738     case TX_64X16:
5739       av1_inv_txfm2d_add_64x16_neon(input, (uint16_t *)dest, stride, tx_type,
5740                                     bd);
5741       break;
5742     case TX_32X8:
5743       av1_inv_txfm2d_add_32x8_neon(input, (uint16_t *)dest, stride, tx_type,
5744                                    bd);
5745       break;
5746     case TX_8X32:
5747       av1_inv_txfm2d_add_8x32_neon(input, (uint16_t *)dest, stride, tx_type,
5748                                    bd);
5749       break;
5750   }
5751 }
5752