xref: /aosp_15_r20/external/libaom/av1/common/arm/av1_inv_txfm_neon.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <arm_neon.h>
13 
14 #include "config/aom_config.h"
15 #include "config/aom_dsp_rtcd.h"
16 #include "config/av1_rtcd.h"
17 
18 #include "aom_dsp/arm/transpose_neon.h"
19 #include "av1/common/av1_inv_txfm1d.h"
20 #include "av1/common/av1_inv_txfm1d_cfg.h"
21 #include "av1/common/av1_txfm.h"
22 #include "av1/common/enums.h"
23 #include "av1/common/idct.h"
24 #include "av1/common/arm/av1_inv_txfm_neon.h"
25 
// 1D itx types
// 1D inverse-transform kernel selector. FLIPADST aliases IADST because the
// two share the same kernel; the flip itself is applied separately (see the
// flip_buf/flip-aware add helpers below).
typedef enum ATTRIBUTE_PACKED {
  IDCT_1D,
  IADST_1D,
  IFLIPADST_1D = IADST_1D,
  IIDENTITY_1D,
  ITX_TYPES_1D,  // Number of distinct 1D kernel kinds.
} ITX_TYPE_1D;
34 
// Vertical (column) 1D kernel for each 2D transform type, indexed by TX_TYPE.
static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
  IDCT_1D,      IADST_1D,     IDCT_1D,      IADST_1D,
  IFLIPADST_1D, IDCT_1D,      IFLIPADST_1D, IADST_1D,
  IFLIPADST_1D, IIDENTITY_1D, IDCT_1D,      IIDENTITY_1D,
  IADST_1D,     IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
};
41 
// Horizontal (row) 1D kernel for each 2D transform type, indexed by TX_TYPE.
static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
  IDCT_1D,      IDCT_1D,      IADST_1D,     IADST_1D,
  IDCT_1D,      IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
  IADST_1D,     IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
  IIDENTITY_1D, IADST_1D,     IIDENTITY_1D, IFLIPADST_1D,
};
48 
// 1D functions
// C fallback kernels, indexed by [transform size][ITX_TYPE_1D]. The IADST and
// IIDENTITY slots are NULL for the 32- and 64-point sizes (those combinations
// are never dispatched here).
static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = {
  { av1_idct4, av1_iadst4, av1_iidentity4_c },
  { av1_idct8, av1_iadst8, av1_iidentity8_c },
  { av1_idct16, av1_iadst16, av1_iidentity16_c },
  { av1_idct32, NULL, NULL },
  { av1_idct64, NULL, NULL },
};
57 
// Add `height` rows of 8 residual values to the 8-bit destination, with
// saturation to [0, 255]. When `flipud` is set the residual rows are consumed
// bottom-to-top while the destination is still written top-to-bottom.
static inline void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in,
                                                  uint8_t *output, int stride,
                                                  int flipud,
                                                  const int height) {
  const int step = flipud ? -1 : 1;
  int src_row = flipud ? (height - 1) : 0;
  for (int row = 0; row < height; ++row, src_row += step) {
    // Widen the 8 destination pixels to 16 bits, add the residual, then
    // narrow back with unsigned saturation.
    const int16x8_t pred = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(output)));
    const int16x8_t recon = vaddq_s16(pred, in[src_row]);
    vst1_u8(output, vqmovun_s16(recon));
    output += stride;
  }
}
72 
// Reconstruct 16 pixels: widen the prediction to 16 bits, add the two 8-lane
// residual vectors (res0 = low half, res1 = high half), and narrow back with
// unsigned saturation to [0, 255].
static inline uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred,
                                                    int16x8_t res0,
                                                    int16x8_t res1) {
  const int16x8_t lo =
      vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred))), res0);
  const int16x8_t hi =
      vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred))), res1);
  return vcombine_u8(vqmovun_s16(lo), vqmovun_s16(hi));
}
86 
// Add `height` rows of 16 residual values to the destination, saturating to
// [0, 255]. Row j's residual is split across in[j] (left 8 lanes) and
// in[j + height] (right 8 lanes). `flipud` reverses the residual row order.
static inline void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in,
                                                   uint8_t *output, int stride,
                                                   int flipud, int height) {
  const int step = flipud ? -1 : 1;
  int src_row = flipud ? (height - 1) : 0;
  for (int row = 0; row < height; ++row, src_row += step) {
    uint8_t *const dst = output + row * stride;
    const uint8x16_t pred = vld1q_u8(dst);
    const uint8x16_t recon =
        lowbd_get_recon_16x16_neon(pred, in[src_row], in[src_row + height]);
    vst1q_u8(dst, recon);
  }
}
100 
// Fill `size` vectors with `value` broadcast to every 16-bit lane.
static inline void lowbd_inv_txfm2d_memset_neon(int16x8_t *a, int size,
                                                int value) {
  const int16x8_t fill = vdupq_n_s16((int16_t)value);
  for (int i = 0; i < size; i++) {
    a[i] = fill;
  }
}
107 
// Butterfly using lanes 0/1 of c:
//   *t0 = round_shift(in0 * c[0] + in1 * c[1], INV_COS_BIT)
//   *t1 = round_shift(in0 * c[1] - in1 * c[0], INV_COS_BIT)
// The multiply-accumulate is done in 32 bits (low/high halves separately),
// then narrowed back to 16 bits with rounding.
static inline void btf_16_lane_0_1_neon(const int16x8_t in0,
                                        const int16x8_t in1, const int16x4_t c,
                                        int16x8_t *t0, int16x8_t *t1) {
  int32x4_t s0[2], s1[2];
  int16x4_t v0[2], v1[2];

  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0);
  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0);
  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1);
  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1);

  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1);
  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1);
  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0);
  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0);

  v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
  v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
  v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
  v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);

  *t0 = vcombine_s16(v0[0], v0[1]);
  *t1 = vcombine_s16(v1[0], v1[1]);
}
132 
// Butterfly using lanes 1/0 of c (coefficients swapped vs. lane_0_1):
//   *t0 = round_shift(in0 * c[1] + in1 * c[0], INV_COS_BIT)
//   *t1 = round_shift(in0 * c[0] - in1 * c[1], INV_COS_BIT)
static inline void btf_16_lane_1_0_neon(const int16x8_t in0,
                                        const int16x8_t in1, const int16x4_t c,
                                        int16x8_t *t0, int16x8_t *t1) {
  int32x4_t s0[2], s1[2];
  int16x4_t v0[2], v1[2];

  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 1);
  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 1);
  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 0);
  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 0);

  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 0);
  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 0);
  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 1);
  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 1);

  v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
  v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
  v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
  v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);

  *t0 = vcombine_s16(v0[0], v0[1]);
  *t1 = vcombine_s16(v1[0], v1[1]);
}
157 
// Butterfly using lanes 2/3 of c:
//   *t0 = round_shift(in0 * c[2] + in1 * c[3], INV_COS_BIT)
//   *t1 = round_shift(in0 * c[3] - in1 * c[2], INV_COS_BIT)
static inline void btf_16_lane_2_3_neon(const int16x8_t in0,
                                        const int16x8_t in1, const int16x4_t c,
                                        int16x8_t *t0, int16x8_t *t1) {
  int32x4_t s0[2], s1[2];
  int16x4_t v0[2], v1[2];

  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);
  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);

  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3);
  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3);
  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2);
  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2);

  v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
  v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
  v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
  v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);

  *t0 = vcombine_s16(v0[0], v0[1]);
  *t1 = vcombine_s16(v1[0], v1[1]);
}
182 
// Single-input butterfly with scalar coefficients:
//   *t0 = round_shift(in0 * coef1, INV_COS_BIT)
//   *t1 = round_shift(in0 * coef2, INV_COS_BIT)
static inline void btf_16_neon(const int16x8_t in0, int16_t coef1,
                               int16_t coef2, int16x8_t *t0, int16x8_t *t1) {
  const int16x4_t in_lo = vget_low_s16(in0);
  const int16x4_t in_hi = vget_high_s16(in0);

  // Widen to 32 bits for the multiply, then narrow back with rounding.
  const int32x4_t p0_lo = vmull_n_s16(in_lo, coef1);
  const int32x4_t p0_hi = vmull_n_s16(in_hi, coef1);
  const int32x4_t p1_lo = vmull_n_s16(in_lo, coef2);
  const int32x4_t p1_hi = vmull_n_s16(in_hi, coef2);

  *t0 = vcombine_s16(vrshrn_n_s32(p0_lo, INV_COS_BIT),
                     vrshrn_n_s32(p0_hi, INV_COS_BIT));
  *t1 = vcombine_s16(vrshrn_n_s32(p1_lo, INV_COS_BIT),
                     vrshrn_n_s32(p1_hi, INV_COS_BIT));
}
201 
// Butterfly using lanes 3/2 of c (coefficients swapped vs. lane_2_3):
//   *t0 = round_shift(in0 * c[3] + in1 * c[2], INV_COS_BIT)
//   *t1 = round_shift(in0 * c[2] - in1 * c[3], INV_COS_BIT)
static inline void btf_16_lane_3_2_neon(const int16x8_t in0,
                                        const int16x8_t in1, const int16x4_t c,
                                        int16x8_t *t0, int16x8_t *t1) {
  int32x4_t s0[2], s1[2];
  int16x4_t v0[2], v1[2];

  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);
  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);

  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2);
  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2);
  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3);
  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3);

  v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
  v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
  v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
  v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);

  *t0 = vcombine_s16(v0[0], v0[1]);
  *t1 = vcombine_s16(v1[0], v1[1]);
}
226 
// In-place butterfly on x[0], x[1] with a single coefficient (lane 0 of c):
//   x[0] = round_shift((x[0] + x[1]) * c[0], INV_COS_BIT)
//   x[1] = round_shift((x[0] - x[1]) * c[0], INV_COS_BIT)
static inline void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) {
  int32x4_t t0[2], t1[2];
  int16x4_t v0[2], v1[2];

  // Don't add/sub before multiply, which will overflow in iadst8.
  const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0);
  const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0);
  const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0);
  const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0);

  // Sum/difference in 32 bits, where the intermediate cannot overflow.
  t0[0] = vaddq_s32(x0_lo, x1_lo);
  t0[1] = vaddq_s32(x0_hi, x1_hi);
  t1[0] = vsubq_s32(x0_lo, x1_lo);
  t1[1] = vsubq_s32(x0_hi, x1_hi);

  v0[0] = vrshrn_n_s32(t0[0], INV_COS_BIT);
  v0[1] = vrshrn_n_s32(t0[1], INV_COS_BIT);
  v1[0] = vrshrn_n_s32(t1[0], INV_COS_BIT);
  v1[1] = vrshrn_n_s32(t1[1], INV_COS_BIT);

  x[0] = vcombine_s16(v0[0], v0[1]);
  x[1] = vcombine_s16(v1[0], v1[1]);
}
250 
// Pack four 16-bit constants into one vector: { c0, c1, c2, c3 }.
static inline int16x4_t set_s16x4_neon(const int16_t c0, const int16_t c1,
                                       const int16_t c2, const int16_t c3) {
  const int16_t vals[4] = { c0, c1, c2, c3 };
  return vld1_s16(vals);
}
259 
// 8-point inverse ADST applied to eight columns (one int16x8_t per row).
// Follows the standard 7-stage ADST flow graph; each btf_* call widens to
// 32 bits internally and rounds by INV_COS_BIT.
static inline void iadst8_neon(int16x8_t *const in, int16x8_t *out,
                               int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);

  // Coefficient pairs for the stage-2 butterflies (c0, c1) and for the
  // stage-4/6 butterflies (c2).
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
                                      (int16_t)cospi[20], (int16_t)cospi[44]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[36], (int16_t)cospi[28],
                                      (int16_t)cospi[52], (int16_t)cospi[12]);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  int16x8_t x[8];
  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;

  // Stage 1: permute the inputs into ADST order.
  x[0] = in[7];
  x[1] = in[0];
  x[2] = in[5];
  x[3] = in[2];
  x[4] = in[3];
  x[5] = in[4];
  x[6] = in[1];
  x[7] = in[6];

  // Stage 2
  btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
  btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
  btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
  btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);

  // Stage 3: saturating add/sub pairs.
  x[0] = vqaddq_s16(s0, s4);
  x[1] = vqaddq_s16(s1, s5);
  x[2] = vqaddq_s16(s2, s6);
  x[3] = vqaddq_s16(s3, s7);
  x[4] = vqsubq_s16(s0, s4);
  x[5] = vqsubq_s16(s1, s5);
  x[6] = vqsubq_s16(s2, s6);
  x[7] = vqsubq_s16(s3, s7);

  // Stage 4: rotate x[4..7]; x[0..3] pass through.
  s0 = x[0];
  s1 = x[1];
  s2 = x[2];
  s3 = x[3];
  btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);
  btf_16_lane_3_2_neon(x[7], x[6], c2, &s7, &s6);

  // Stage 5
  x[0] = vqaddq_s16(s0, s2);
  x[1] = vqaddq_s16(s1, s3);
  x[2] = vqsubq_s16(s0, s2);
  x[3] = vqsubq_s16(s1, s3);
  x[4] = vqaddq_s16(s4, s6);
  x[5] = vqaddq_s16(s5, s7);
  x[6] = vqsubq_s16(s4, s6);
  x[7] = vqsubq_s16(s5, s7);

  // stage 6: final cospi[32] rotations on (x[2], x[3]) and (x[6], x[7]).
  btf_16_half_neon(x + 2, c2);
  btf_16_half_neon(x + 6, c2);

  // Stage 7: output permutation with sign flips on the odd outputs.
  out[0] = x[0];
  out[1] = vqnegq_s16(x[4]);
  out[2] = x[6];
  out[3] = vqnegq_s16(x[2]);
  out[4] = x[3];
  out[5] = vqnegq_s16(x[7]);
  out[6] = x[5];
  out[7] = vqnegq_s16(x[1]);
}
332 
// 8-point inverse ADST specialized for the case where only the first input
// coefficient (in[0]) is non-zero; all other stage-2 products vanish, so the
// flow graph collapses to the few butterflies kept below.
static inline void iadst8_low1_neon(int16x8_t *const in, int16x8_t *out,
                                    int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  int16x8_t x[8];
  int16x8_t s0, s1, s4, s5;

  // Stage 1: only x[1] (= in[0]) is live.
  x[1] = in[0];

  // Stage 2

  btf_16_neon(x[1], cospi[60], -cospi[4], &s0, &s1);

  // Stage 3: with only s0/s1 non-zero, the add/sub pairs duplicate them.
  x[0] = s0;
  x[1] = s1;
  x[4] = s0;
  x[5] = s1;

  // Stage 4
  s0 = x[0];
  s1 = x[1];
  btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);

  // Stage 5
  x[0] = s0;
  x[1] = s1;
  x[2] = s0;
  x[3] = s1;
  x[4] = s4;
  x[5] = s5;
  x[6] = s4;
  x[7] = s5;

  // stage 6
  btf_16_half_neon(x + 2, c2);
  btf_16_half_neon(x + 6, c2);

  // Stage 7: same output permutation/sign pattern as the full iadst8.
  out[0] = x[0];
  out[1] = vqnegq_s16(x[4]);
  out[2] = x[6];
  out[3] = vqnegq_s16(x[2]);
  out[4] = x[3];
  out[5] = vqnegq_s16(x[7]);
  out[6] = x[5];
  out[7] = vqnegq_s16(x[1]);
}
384 
// 8-point inverse DCT applied to eight columns (one int16x8_t per row).
// Standard 5-stage IDCT butterfly network; saturating adds/subs guard
// against 16-bit overflow between stages.
static inline void idct8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[8], step2[8];
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  // stage 2: odd-input rotations.
  btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]);
  btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]);

  // stage 3: even-input rotations plus odd add/sub pairs.
  btf_16_lane_0_1_neon(in[0], in[4], c1, &step2[0], &step2[1]);
  btf_16_lane_2_3_neon(in[2], in[6], c1, &step2[3], &step2[2]);
  step2[4] = vqaddq_s16(step1[4], step1[5]);
  step2[5] = vqsubq_s16(step1[4], step1[5]);
  step2[6] = vqsubq_s16(step1[7], step1[6]);
  step2[7] = vqaddq_s16(step1[7], step1[6]);

  // stage 4
  step1[0] = vqaddq_s16(step2[0], step2[3]);
  step1[1] = vqaddq_s16(step2[1], step2[2]);
  step1[2] = vqsubq_s16(step2[1], step2[2]);
  step1[3] = vqsubq_s16(step2[0], step2[3]);
  btf_16_lane_0_1_neon(step2[6], step2[5], c1, &step1[6], &step1[5]);

  // stage 5: final even/odd recombination.
  out[0] = vqaddq_s16(step1[0], step2[7]);
  out[1] = vqaddq_s16(step1[1], step1[6]);
  out[2] = vqaddq_s16(step1[2], step1[5]);
  out[3] = vqaddq_s16(step1[3], step2[4]);
  out[4] = vqsubq_s16(step1[3], step2[4]);
  out[5] = vqsubq_s16(step1[2], step1[5]);
  out[6] = vqsubq_s16(step1[1], step1[6]);
  out[7] = vqsubq_s16(step1[0], step2[7]);
}
422 
// 8-point inverse DCT specialized for a DC-only input: the transform reduces
// to scaling in[0] by cospi[32] (with rounding) and broadcasting the result
// to all eight outputs.
static inline void idct8_low1_neon(int16x8_t *in, int16x8_t *out,
                                   int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);

  // stage 1
  // stage 2
  // stage 3: dc = round_shift(in[0] * cospi[32], INV_COS_BIT).
  const int32x4_t prod_lo = vmull_n_s16(vget_low_s16(in[0]), (int16_t)cospi[32]);
  const int32x4_t prod_hi =
      vmull_n_s16(vget_high_s16(in[0]), (int16_t)cospi[32]);
  const int16x8_t dc = vcombine_s16(vrshrn_n_s32(prod_lo, INV_COS_BIT),
                                    vrshrn_n_s32(prod_hi, INV_COS_BIT));

  // stage 4
  // stage 5: fan the DC value out.
  for (int i = 0; i < 8; ++i) {
    out[i] = dc;
  }
}
449 
// Rounding right-shift of every vector in `arr` by `bit` (no-op when bit==0).
// vrshlq with a negative shift count performs the rounding right shift.
static void round_shift_array_16_neon(int16x8_t *arr, int size, int bit) {
  assert(!(size % 4));
  if (bit == 0) return;
  const int16x8_t neg_bit = vdupq_n_s16((int16_t)(-bit));
  for (int i = 0; i < size; ++i) {
    arr[i] = vrshlq_s16(arr[i], neg_bit);
  }
}
458 
// Reverse the order of `size` row vectors in place (vertical flip).
// Rewritten as an end-swap reversal: the previous version staged the rows in
// a fixed int16x8_t temp[8], which would overflow the stack buffer for
// size > 8; swapping ends needs no temporary buffer and is safe for any size.
static inline void flip_buf_ud_neon(int16x8_t *input, int size) {
  for (int lo = 0, hi = size - 1; lo < hi; ++lo, --hi) {
    const int16x8_t tmp = input[lo];
    input[lo] = input[hi];
    input[hi] = tmp;
  }
}
468 
// Load `out_size` rows of eight 32-bit coefficients and narrow each row to a
// 16-bit vector (truncating narrow, matching vmovn_s32 semantics).
static inline void load_buffer_32bit_to_16bit_neon(const int32_t *input,
                                                   int stride,
                                                   int16x8_t *const a,
                                                   int out_size) {
  for (int i = 0; i < out_size; ++i, input += stride) {
    const int16x4_t lo = vmovn_s32(vld1q_s32(input));
    const int16x4_t hi = vmovn_s32(vld1q_s32(input + 4));
    a[i] = vcombine_s16(lo, hi);
  }
}
479 
480 static int16_t sqrt_2_list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
481                                          4 * 5793 };
482 
// Identity transform: scale each row by sqrt_2_list[txw_idx] (Q12, hence the
// rounding shift by 12), then apply a further rounding right shift by `bit`,
// narrowing back to 16 bits with saturation.
static inline void identity_txfm_round_neon(int16x8_t *input, int16x8_t *output,
                                            int txw_idx, int8_t size, int bit) {
  const int32x4_t neg_bit = vdupq_n_s32((int32_t)(-bit));
  const int16x4_t scale = vdup_n_s16(sqrt_2_list[txw_idx]);
  for (int i = 0; i < size; i++) {
    const int32x4_t prod_lo = vmull_s16(vget_low_s16(input[i]), scale);
    const int32x4_t prod_hi = vmull_s16(vget_high_s16(input[i]), scale);
    const int32x4_t lo = vrshlq_s32(vrshrq_n_s32(prod_lo, 12), neg_bit);
    const int32x4_t hi = vrshlq_s32(vrshrq_n_s32(prod_hi, 12), neg_bit);
    output[i] = vcombine_s16(vqmovn_s32(lo), vqmovn_s32(hi));
  }
}
499 
// Rectangular-transform scaling: multiply each row by 1/sqrt(2) in fixed
// point (NewInvSqrt2, Q NewSqrt2Bits) and narrow back with rounding and
// saturation.
static inline void round_shift_for_rect(int16x8_t *input, int16x8_t *output,
                                        int size) {
  for (int i = 0; i < size; ++i) {
    const int32x4_t prod_lo =
        vmull_n_s16(vget_low_s16(input[i]), (int16_t)NewInvSqrt2);
    const int32x4_t prod_hi =
        vmull_n_s16(vget_high_s16(input[i]), (int16_t)NewInvSqrt2);
    const int16x4_t lo = vqrshrn_n_s32(prod_lo, (int32_t)NewSqrt2Bits);
    const int16x4_t hi = vqrshrn_n_s32(prod_hi, (int32_t)NewSqrt2Bits);
    output[i] = vcombine_s16(lo, hi);
  }
}
515 
// 16-point inverse DCT specialized for a DC-only input: scale in[0] by
// cospi[32] with rounding and broadcast to all 16 outputs.
// Fix: cast cospi[32] (int32_t) to int16_t for vmull_n_s16, matching
// idct8_low1_neon and the rest of the file; the implicit narrowing
// conversion was inconsistent and trips -Wconversion.
static inline void idct16_low1_neon(int16x8_t *in, int16x8_t *out,
                                    int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1;
  int32x4_t t32[2];

  // stage 4

  t32[0] = vmull_n_s16(vget_low_s16(in[0]), (int16_t)cospi[32]);
  t32[1] = vmull_n_s16(vget_high_s16(in[0]), (int16_t)cospi[32]);
  step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
                       vrshrn_n_s32(t32[1], INV_COS_BIT));

  // stage 6
  // stage 7: fan the DC value out to all 16 rows.
  for (int i = 0; i < 16; ++i) {
    out[i] = step1;
  }
}
548 
// 16-point inverse DCT applied to eight columns (one int16x8_t per row).
// Standard 7-stage IDCT butterfly network; saturating adds/subs guard
// against 16-bit overflow between stages.
static inline void idct16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[16], step2[16];

  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
                                      (int16_t)cospi[36], (int16_t)cospi[28]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
                                      (int16_t)cospi[52], (int16_t)cospi[12]);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);
  // c4 holds the negated c3 constants for the stage-4 lane_3_2 butterfly.
  const int16x4_t c4 =
      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
  // stage 2: odd-input rotations; even inputs are reordered for stage 3.

  btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]);
  btf_16_lane_2_3_neon(in[9], in[7], c0, &step2[14], &step2[9]);
  btf_16_lane_0_1_neon(in[5], in[11], c1, &step2[13], &step2[10]);
  btf_16_lane_2_3_neon(in[13], in[3], c1, &step2[12], &step2[11]);

  step2[0] = in[0];
  step2[1] = in[8];
  step2[2] = in[4];
  step2[3] = in[12];
  step2[4] = in[2];
  step2[5] = in[10];
  step2[6] = in[6];
  step2[7] = in[14];

  // stage 3

  btf_16_lane_0_1_neon(step2[4], step2[7], c2, &step1[7], &step1[4]);
  btf_16_lane_2_3_neon(step2[5], step2[6], c2, &step1[6], &step1[5]);

  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];
  step1[8] = vqaddq_s16(step2[8], step2[9]);
  step1[9] = vqsubq_s16(step2[8], step2[9]);
  step1[10] = vqsubq_s16(step2[11], step2[10]);
  step1[11] = vqaddq_s16(step2[11], step2[10]);
  step1[12] = vqaddq_s16(step2[12], step2[13]);
  step1[13] = vqsubq_s16(step2[12], step2[13]);
  step1[14] = vqsubq_s16(step2[15], step2[14]);
  step1[15] = vqaddq_s16(step2[15], step2[14]);

  // stage 4

  btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]);
  btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]);
  btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
  btf_16_lane_3_2_neon(step1[10], step1[13], c4, &step2[10], &step2[13]);

  step2[4] = vqaddq_s16(step1[4], step1[5]);
  step2[5] = vqsubq_s16(step1[4], step1[5]);
  step2[6] = vqsubq_s16(step1[7], step1[6]);
  step2[7] = vqaddq_s16(step1[7], step1[6]);
  step2[8] = step1[8];
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5

  btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);

  step1[0] = vqaddq_s16(step2[0], step2[3]);
  step1[1] = vqaddq_s16(step2[1], step2[2]);
  step1[2] = vqsubq_s16(step2[1], step2[2]);
  step1[3] = vqsubq_s16(step2[0], step2[3]);
  step1[4] = step2[4];
  step1[7] = step2[7];
  step1[8] = vqaddq_s16(step2[8], step2[11]);
  step1[9] = vqaddq_s16(step2[9], step2[10]);
  step1[10] = vqsubq_s16(step2[9], step2[10]);
  step1[11] = vqsubq_s16(step2[8], step2[11]);
  step1[12] = vqsubq_s16(step2[15], step2[12]);
  step1[13] = vqsubq_s16(step2[14], step2[13]);
  step1[14] = vqaddq_s16(step2[14], step2[13]);
  step1[15] = vqaddq_s16(step2[15], step2[12]);

  // stage 6

  btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
  btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);

  step2[0] = vqaddq_s16(step1[0], step1[7]);
  step2[1] = vqaddq_s16(step1[1], step1[6]);
  step2[2] = vqaddq_s16(step1[2], step1[5]);
  step2[3] = vqaddq_s16(step1[3], step1[4]);
  step2[4] = vqsubq_s16(step1[3], step1[4]);
  step2[5] = vqsubq_s16(step1[2], step1[5]);
  step2[6] = vqsubq_s16(step1[1], step1[6]);
  step2[7] = vqsubq_s16(step1[0], step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final recombination of the low and high halves.
  out[0] = vqaddq_s16(step2[0], step2[15]);
  out[1] = vqaddq_s16(step2[1], step2[14]);
  out[2] = vqaddq_s16(step2[2], step2[13]);
  out[3] = vqaddq_s16(step2[3], step2[12]);
  out[4] = vqaddq_s16(step2[4], step2[11]);
  out[5] = vqaddq_s16(step2[5], step2[10]);
  out[6] = vqaddq_s16(step2[6], step2[9]);
  out[7] = vqaddq_s16(step2[7], step2[8]);
  out[8] = vqsubq_s16(step2[7], step2[8]);
  out[9] = vqsubq_s16(step2[6], step2[9]);
  out[10] = vqsubq_s16(step2[5], step2[10]);
  out[11] = vqsubq_s16(step2[4], step2[11]);
  out[12] = vqsubq_s16(step2[3], step2[12]);
  out[13] = vqsubq_s16(step2[2], step2[13]);
  out[14] = vqsubq_s16(step2[1], step2[14]);
  out[15] = vqsubq_s16(step2[0], step2[15]);
}
669 
// 16-point inverse DCT specialized for inputs where only the first 8
// coefficients are non-zero: the stage-2/3 butterflies with a zero operand
// collapse into single-input btf_16_neon scalings; later stages match the
// full idct16.
static inline void idct16_low8_neon(int16x8_t *in, int16x8_t *out,
                                    int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[16], step2[16];
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);
  // c1 holds the negated c0 constants for the stage-4 lane_3_2 butterfly.
  const int16x4_t c1 =
      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));

  // stage 1
  // stage 2

  step2[0] = in[0];
  step2[2] = in[4];
  step2[4] = in[2];
  step2[6] = in[6];

  btf_16_neon(in[1], cospi[60], cospi[4], &step2[8], &step2[15]);
  btf_16_neon(in[7], -cospi[36], cospi[28], &step2[9], &step2[14]);
  btf_16_neon(in[5], cospi[44], cospi[20], &step2[10], &step2[13]);
  btf_16_neon(in[3], -cospi[52], cospi[12], &step2[11], &step2[12]);

  // stage 3

  btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
  btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]);

  step1[0] = step2[0];
  step1[2] = step2[2];
  step1[8] = vqaddq_s16(step2[8], step2[9]);
  step1[9] = vqsubq_s16(step2[8], step2[9]);
  step1[10] = vqsubq_s16(step2[11], step2[10]);
  step1[11] = vqaddq_s16(step2[11], step2[10]);
  step1[12] = vqaddq_s16(step2[12], step2[13]);
  step1[13] = vqsubq_s16(step2[12], step2[13]);
  step1[14] = vqsubq_s16(step2[15], step2[14]);
  step1[15] = vqaddq_s16(step2[15], step2[14]);

  // stage 4

  btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
  btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]);
  btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]);
  btf_16_lane_3_2_neon(step1[10], step1[13], c1, &step2[10], &step2[13]);

  step2[4] = vqaddq_s16(step1[4], step1[5]);
  step2[5] = vqsubq_s16(step1[4], step1[5]);
  step2[6] = vqsubq_s16(step1[7], step1[6]);
  step2[7] = vqaddq_s16(step1[7], step1[6]);
  step2[8] = step1[8];
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5

  btf_16_lane_0_1_neon(step2[6], step2[5], c0, &step1[6], &step1[5]);
  step1[0] = vqaddq_s16(step2[0], step2[3]);
  step1[1] = vqaddq_s16(step2[1], step2[2]);
  step1[2] = vqsubq_s16(step2[1], step2[2]);
  step1[3] = vqsubq_s16(step2[0], step2[3]);
  step1[4] = step2[4];
  step1[7] = step2[7];
  step1[8] = vqaddq_s16(step2[8], step2[11]);
  step1[9] = vqaddq_s16(step2[9], step2[10]);
  step1[10] = vqsubq_s16(step2[9], step2[10]);
  step1[11] = vqsubq_s16(step2[8], step2[11]);
  step1[12] = vqsubq_s16(step2[15], step2[12]);
  step1[13] = vqsubq_s16(step2[14], step2[13]);
  step1[14] = vqaddq_s16(step2[14], step2[13]);
  step1[15] = vqaddq_s16(step2[15], step2[12]);

  // stage 6
  btf_16_lane_0_1_neon(step1[13], step1[10], c0, &step2[13], &step2[10]);
  btf_16_lane_0_1_neon(step1[12], step1[11], c0, &step2[12], &step2[11]);

  step2[0] = vqaddq_s16(step1[0], step1[7]);
  step2[1] = vqaddq_s16(step1[1], step1[6]);
  step2[2] = vqaddq_s16(step1[2], step1[5]);
  step2[3] = vqaddq_s16(step1[3], step1[4]);
  step2[4] = vqsubq_s16(step1[3], step1[4]);
  step2[5] = vqsubq_s16(step1[2], step1[5]);
  step2[6] = vqsubq_s16(step1[1], step1[6]);
  step2[7] = vqsubq_s16(step1[0], step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final recombination of the low and high halves.

  out[0] = vqaddq_s16(step2[0], step2[15]);
  out[1] = vqaddq_s16(step2[1], step2[14]);
  out[2] = vqaddq_s16(step2[2], step2[13]);
  out[3] = vqaddq_s16(step2[3], step2[12]);
  out[4] = vqaddq_s16(step2[4], step2[11]);
  out[5] = vqaddq_s16(step2[5], step2[10]);
  out[6] = vqaddq_s16(step2[6], step2[9]);
  out[7] = vqaddq_s16(step2[7], step2[8]);
  out[8] = vqsubq_s16(step2[7], step2[8]);
  out[9] = vqsubq_s16(step2[6], step2[9]);
  out[10] = vqsubq_s16(step2[5], step2[10]);
  out[11] = vqsubq_s16(step2[4], step2[11]);
  out[12] = vqsubq_s16(step2[3], step2[12]);
  out[13] = vqsubq_s16(step2[2], step2[13]);
  out[14] = vqsubq_s16(step2[1], step2[14]);
  out[15] = vqsubq_s16(step2[0], step2[15]);
}
779 
// Full 16-point inverse ADST applied to 8 columns at once (each int16x8_t
// register holds one row of 8 int16 coefficients). Implements the standard
// 9-stage iadst16 butterfly network with saturating adds/subtracts
// (vqaddq_s16/vqsubq_s16) to guard against int16 overflow. `in` holds the 16
// input rows, `out` receives the 16 output rows; cos_bit selects the cosine
// table returned by cospi_arr().
static inline void iadst16_neon(int16x8_t *const in, int16x8_t *out,
                                int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);

  // Cosine constants packed four-per-vector; the btf_16_lane_*_neon helpers
  // pick individual lanes, so each c* serves two butterfly pairs.
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62],
                                      (int16_t)cospi[10], (int16_t)cospi[54]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46],
                                      (int16_t)cospi[26], (int16_t)cospi[38]);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[34], (int16_t)cospi[30],
                                      (int16_t)cospi[42], (int16_t)cospi[22]);
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[50], (int16_t)cospi[14],
                                      (int16_t)cospi[58], (int16_t)cospi[6]);
  const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  int16x8_t x[16];
  int16x8_t t[14];
  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
  int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;

  // Stage 1: the iadst input permutation (even outputs taken from the end of
  // the input, odd from the start, interleaved).
  x[0] = in[15];
  x[1] = in[0];
  x[2] = in[13];
  x[3] = in[2];
  x[4] = in[11];
  x[5] = in[4];
  x[6] = in[9];
  x[7] = in[6];
  x[8] = in[7];
  x[9] = in[8];
  x[10] = in[5];
  x[11] = in[10];
  x[12] = in[3];
  x[13] = in[12];
  x[14] = in[1];
  x[15] = in[14];

  // Stage 2: eight rotation butterflies using the odd cospi constants.
  btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
  btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
  btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
  btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);
  btf_16_lane_0_1_neon(x[8], x[9], c2, &s8, &s9);
  btf_16_lane_2_3_neon(x[10], x[11], c2, &s10, &s11);
  btf_16_lane_0_1_neon(x[12], x[13], c3, &s12, &s13);
  btf_16_lane_2_3_neon(x[14], x[15], c3, &s14, &s15);

  // Stage 3: cross sums/differences between the low and high halves.
  x[0] = vqaddq_s16(s0, s8);
  x[1] = vqaddq_s16(s1, s9);
  x[2] = vqaddq_s16(s2, s10);
  x[3] = vqaddq_s16(s3, s11);
  x[4] = vqaddq_s16(s4, s12);
  x[5] = vqaddq_s16(s5, s13);
  x[6] = vqaddq_s16(s6, s14);
  x[7] = vqaddq_s16(s7, s15);
  x[8] = vqsubq_s16(s0, s8);
  x[9] = vqsubq_s16(s1, s9);
  x[10] = vqsubq_s16(s2, s10);
  x[11] = vqsubq_s16(s3, s11);
  x[12] = vqsubq_s16(s4, s12);
  x[13] = vqsubq_s16(s5, s13);
  x[14] = vqsubq_s16(s6, s14);
  x[15] = vqsubq_s16(s7, s15);

  // Stage 4: pass-through for 0..7; rotations by +/-cospi[8..40] on 8..15.
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  t[4] = x[4];
  t[5] = x[5];
  t[6] = x[6];
  t[7] = x[7];
  btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
  btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11);
  // Swapped lane order (1_0 / 3_2) realizes the negated-angle rotations.
  btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12);
  btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14);

  // Stage 5
  x[0] = vqaddq_s16(t[0], t[4]);
  x[1] = vqaddq_s16(t[1], t[5]);
  x[2] = vqaddq_s16(t[2], t[6]);
  x[3] = vqaddq_s16(t[3], t[7]);
  x[4] = vqsubq_s16(t[0], t[4]);
  x[5] = vqsubq_s16(t[1], t[5]);
  x[6] = vqsubq_s16(t[2], t[6]);
  x[7] = vqsubq_s16(t[3], t[7]);
  x[8] = vqaddq_s16(s8, s12);
  x[9] = vqaddq_s16(s9, s13);
  x[10] = vqaddq_s16(s10, s14);
  x[11] = vqaddq_s16(s11, s15);
  x[12] = vqsubq_s16(s8, s12);
  x[13] = vqsubq_s16(s9, s13);
  x[14] = vqsubq_s16(s10, s14);
  x[15] = vqsubq_s16(s11, s15);

  // stage 6: pass-through for 0..3 and 8..11; cospi[16]/[48] rotations on
  // the remaining quads.
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  btf_16_lane_2_3_neon(x[4], x[5], c5, &s4, &s5);
  btf_16_lane_3_2_neon(x[7], x[6], c5, &s7, &s6);
  t[8] = x[8];
  t[9] = x[9];
  t[10] = x[10];
  t[11] = x[11];
  btf_16_lane_2_3_neon(x[12], x[13], c5, &s12, &s13);
  btf_16_lane_3_2_neon(x[15], x[14], c5, &s15, &s14);

  // Stage 7
  x[0] = vqaddq_s16(t[0], t[2]);
  x[1] = vqaddq_s16(t[1], t[3]);
  x[2] = vqsubq_s16(t[0], t[2]);
  x[3] = vqsubq_s16(t[1], t[3]);
  x[4] = vqaddq_s16(s4, s6);
  x[5] = vqaddq_s16(s5, s7);
  x[6] = vqsubq_s16(s4, s6);
  x[7] = vqsubq_s16(s5, s7);
  x[8] = vqaddq_s16(t[8], t[10]);
  x[9] = vqaddq_s16(t[9], t[11]);
  x[10] = vqsubq_s16(t[8], t[10]);
  x[11] = vqsubq_s16(t[9], t[11]);
  x[12] = vqaddq_s16(s12, s14);
  x[13] = vqaddq_s16(s13, s15);
  x[14] = vqsubq_s16(s12, s14);
  x[15] = vqsubq_s16(s13, s15);

  // Stage 8: in-place cospi[32] half-butterflies on pairs (2,3), (6,7),
  // (10,11), (14,15).
  btf_16_half_neon(x + 2, c5);
  btf_16_half_neon(x + 6, c5);
  btf_16_half_neon(x + 10, c5);
  btf_16_half_neon(x + 14, c5);

  // Stage 9: final permutation; odd outputs are negated (saturating).
  out[0] = x[0];
  out[1] = vqnegq_s16(x[8]);
  out[2] = x[12];
  out[3] = vqnegq_s16(x[4]);
  out[4] = x[6];
  out[5] = vqnegq_s16(x[14]);
  out[6] = x[10];
  out[7] = vqnegq_s16(x[2]);
  out[8] = x[3];
  out[9] = vqnegq_s16(x[11]);
  out[10] = x[15];
  out[11] = vqnegq_s16(x[7]);
  out[12] = x[5];
  out[13] = vqnegq_s16(x[13]);
  out[14] = x[9];
  out[15] = vqnegq_s16(x[1]);
}
936 
// 16-point inverse ADST specialized for the case where only in[0] (the DC
// band) is nonzero. The full 9-stage network collapses: stages that would
// combine zero inputs are replaced by register copies/duplications, so only
// three rotation butterflies and the final half-butterflies remain.
// Interface matches iadst16_neon.
static inline void iadst16_low1_neon(int16x8_t *const in, int16x8_t *out,
                                     int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  int16x8_t x[16];
  int16x8_t t[10];
  int16x8_t s0, s1, s4, s5;
  int16x8_t s8, s9, s12, s13;

  // Stage 1: only the single nonzero coefficient survives the permutation.
  x[1] = in[0];

  // Stage 2: scalar-constant butterfly; other products are zero.
  btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1);

  // Stage 3: sums/differences with zero reduce to duplication.
  x[0] = s0;
  x[1] = s1;
  x[8] = s0;
  x[9] = s1;

  // Stage 4
  t[0] = x[0];
  t[1] = x[1];
  btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9);

  // Stage 5: again duplication instead of add/sub with zero terms.
  x[0] = t[0];
  x[1] = t[1];
  x[4] = t[0];
  x[5] = t[1];
  x[8] = s8;
  x[9] = s9;
  x[12] = s8;
  x[13] = s9;

  // stage 6
  t[0] = x[0];
  t[1] = x[1];
  btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5);
  t[8] = x[8];
  t[9] = x[9];
  btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13);

  // Stage 7
  x[0] = t[0];
  x[1] = t[1];
  x[2] = t[0];
  x[3] = t[1];
  x[4] = s4;
  x[5] = s5;
  x[6] = s4;
  x[7] = s5;
  x[8] = t[8];
  x[9] = t[9];
  x[10] = t[8];
  x[11] = t[9];
  x[12] = s12;
  x[13] = s13;
  x[14] = s12;
  x[15] = s13;

  // Stage 8: cospi[32] half-butterflies, same as the full transform.
  btf_16_half_neon(x + 2, c1);
  btf_16_half_neon(x + 6, c1);
  btf_16_half_neon(x + 10, c1);
  btf_16_half_neon(x + 14, c1);

  // Stage 9: output permutation with negated odd outputs (matches
  // iadst16_neon exactly).
  out[0] = x[0];
  out[1] = vqnegq_s16(x[8]);
  out[2] = x[12];
  out[3] = vqnegq_s16(x[4]);
  out[4] = x[6];
  out[5] = vqnegq_s16(x[14]);
  out[6] = x[10];
  out[7] = vqnegq_s16(x[2]);
  out[8] = x[3];
  out[9] = vqnegq_s16(x[11]);
  out[10] = x[15];
  out[11] = vqnegq_s16(x[7]);
  out[12] = x[5];
  out[13] = vqnegq_s16(x[13]);
  out[14] = x[9];
  out[15] = vqnegq_s16(x[1]);
}
1027 
// 16-point inverse ADST specialized for inputs whose coefficients beyond the
// first 8 rows are zero. Stage 2's lane butterflies are replaced by scalar
// btf_16_neon rotations (one nonzero operand each); from stage 3 onward the
// flow is identical to iadst16_neon. Interface matches iadst16_neon.
static inline void iadst16_low8_neon(int16x8_t *const in, int16x8_t *out,
                                     int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);

  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  int16x8_t x[16];
  int16x8_t t[14];
  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
  int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;

  // Stage 1: permutation restricted to the 8 nonzero input rows.
  x[1] = in[0];
  x[3] = in[2];
  x[5] = in[4];
  x[7] = in[6];
  x[8] = in[7];
  x[10] = in[5];
  x[12] = in[3];
  x[14] = in[1];

  // Stage 2: single-input rotations (the paired input is zero).
  btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1);
  btf_16_neon(x[3], cospi[54], -cospi[10], &s2, &s3);
  btf_16_neon(x[5], cospi[46], -cospi[18], &s4, &s5);
  btf_16_neon(x[7], cospi[38], -cospi[26], &s6, &s7);

  btf_16_neon(x[8], cospi[34], cospi[30], &s8, &s9);
  btf_16_neon(x[10], cospi[42], cospi[22], &s10, &s11);
  btf_16_neon(x[12], cospi[50], cospi[14], &s12, &s13);
  btf_16_neon(x[14], cospi[58], cospi[6], &s14, &s15);

  // Stage 3: identical to the full transform from here on.
  x[0] = vqaddq_s16(s0, s8);
  x[1] = vqaddq_s16(s1, s9);
  x[2] = vqaddq_s16(s2, s10);
  x[3] = vqaddq_s16(s3, s11);
  x[4] = vqaddq_s16(s4, s12);
  x[5] = vqaddq_s16(s5, s13);
  x[6] = vqaddq_s16(s6, s14);
  x[7] = vqaddq_s16(s7, s15);
  x[8] = vqsubq_s16(s0, s8);
  x[9] = vqsubq_s16(s1, s9);
  x[10] = vqsubq_s16(s2, s10);
  x[11] = vqsubq_s16(s3, s11);
  x[12] = vqsubq_s16(s4, s12);
  x[13] = vqsubq_s16(s5, s13);
  x[14] = vqsubq_s16(s6, s14);
  x[15] = vqsubq_s16(s7, s15);

  // Stage 4
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  t[4] = x[4];
  t[5] = x[5];
  t[6] = x[6];
  t[7] = x[7];
  btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9);
  btf_16_lane_2_3_neon(x[10], x[11], c0, &s10, &s11);
  // Swapped lane order (1_0 / 3_2) realizes the negated-angle rotations.
  btf_16_lane_1_0_neon(x[13], x[12], c0, &s13, &s12);
  btf_16_lane_3_2_neon(x[15], x[14], c0, &s15, &s14);

  // Stage 5
  x[0] = vqaddq_s16(t[0], t[4]);
  x[1] = vqaddq_s16(t[1], t[5]);
  x[2] = vqaddq_s16(t[2], t[6]);
  x[3] = vqaddq_s16(t[3], t[7]);
  x[4] = vqsubq_s16(t[0], t[4]);
  x[5] = vqsubq_s16(t[1], t[5]);
  x[6] = vqsubq_s16(t[2], t[6]);
  x[7] = vqsubq_s16(t[3], t[7]);
  x[8] = vqaddq_s16(s8, s12);
  x[9] = vqaddq_s16(s9, s13);
  x[10] = vqaddq_s16(s10, s14);
  x[11] = vqaddq_s16(s11, s15);
  x[12] = vqsubq_s16(s8, s12);
  x[13] = vqsubq_s16(s9, s13);
  x[14] = vqsubq_s16(s10, s14);
  x[15] = vqsubq_s16(s11, s15);

  // stage 6
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5);
  btf_16_lane_3_2_neon(x[7], x[6], c1, &s7, &s6);
  t[8] = x[8];
  t[9] = x[9];
  t[10] = x[10];
  t[11] = x[11];
  btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13);
  btf_16_lane_3_2_neon(x[15], x[14], c1, &s15, &s14);

  // Stage 7
  x[0] = vqaddq_s16(t[0], t[2]);
  x[1] = vqaddq_s16(t[1], t[3]);
  x[2] = vqsubq_s16(t[0], t[2]);
  x[3] = vqsubq_s16(t[1], t[3]);
  x[4] = vqaddq_s16(s4, s6);
  x[5] = vqaddq_s16(s5, s7);
  x[6] = vqsubq_s16(s4, s6);
  x[7] = vqsubq_s16(s5, s7);
  x[8] = vqaddq_s16(t[8], t[10]);
  x[9] = vqaddq_s16(t[9], t[11]);
  x[10] = vqsubq_s16(t[8], t[10]);
  x[11] = vqsubq_s16(t[9], t[11]);
  x[12] = vqaddq_s16(s12, s14);
  x[13] = vqaddq_s16(s13, s15);
  x[14] = vqsubq_s16(s12, s14);
  x[15] = vqsubq_s16(s13, s15);

  // Stage 8: cospi[32] half-butterflies on pairs (2,3), (6,7), (10,11),
  // (14,15).
  btf_16_half_neon(x + 2, c1);
  btf_16_half_neon(x + 6, c1);
  btf_16_half_neon(x + 10, c1);
  btf_16_half_neon(x + 14, c1);

  // Stage 9: output permutation with negated odd outputs (matches
  // iadst16_neon exactly).
  out[0] = x[0];
  out[1] = vqnegq_s16(x[8]);
  out[2] = x[12];
  out[3] = vqnegq_s16(x[4]);
  out[4] = x[6];
  out[5] = vqnegq_s16(x[14]);
  out[6] = x[10];
  out[7] = vqnegq_s16(x[2]);
  out[8] = x[3];
  out[9] = vqnegq_s16(x[11]);
  out[10] = x[15];
  out[11] = vqnegq_s16(x[7]);
  out[12] = x[5];
  out[13] = vqnegq_s16(x[13]);
  out[14] = x[9];
  out[15] = vqnegq_s16(x[1]);
}
1169 
// Full 32-point inverse DCT on 8 columns held in int16x8_t registers.
// Implements the 9-stage idct32 butterfly network with saturating adds and
// subtracts. `in` holds 32 input rows, `out` receives the 32 output rows;
// cos_bit selects the cosine table returned by cospi_arr().
static inline void idct32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  // Two ping-pong working arrays; each stage reads one and writes the other.
  int16x8_t step1[32], step2[32];

  // Cosine constants packed four-per-vector for the lane-indexed butterfly
  // helpers. c8/c9 carry the negated constants needed by some rotations.
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62],
                                      (int16_t)cospi[34], (int16_t)cospi[30]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46],
                                      (int16_t)cospi[50], (int16_t)cospi[14]);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[10], (int16_t)cospi[54],
                                      (int16_t)cospi[42], (int16_t)cospi[22]);
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[26], (int16_t)cospi[38],
                                      (int16_t)cospi[58], (int16_t)cospi[6]);
  const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
                                      (int16_t)cospi[36], (int16_t)cospi[28]);
  const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
                                      (int16_t)cospi[52], (int16_t)cospi[12]);
  const int16x4_t c6 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c7 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);
  const int16x4_t c8 =
      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
  const int16x4_t c9 =
      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));

  // stage 2: rotations producing the odd half (16..31); the even half is the
  // bit-reversed input permutation feeding the embedded idct16.

  btf_16_lane_0_1_neon(in[1], in[31], c0, &step2[31], &step2[16]);
  btf_16_lane_2_3_neon(in[17], in[15], c0, &step2[30], &step2[17]);
  btf_16_lane_0_1_neon(in[9], in[23], c1, &step2[29], &step2[18]);
  btf_16_lane_2_3_neon(in[25], in[7], c1, &step2[28], &step2[19]);
  btf_16_lane_0_1_neon(in[5], in[27], c2, &step2[27], &step2[20]);
  btf_16_lane_2_3_neon(in[21], in[11], c2, &step2[26], &step2[21]);
  btf_16_lane_0_1_neon(in[13], in[19], c3, &step2[25], &step2[22]);
  btf_16_lane_2_3_neon(in[29], in[3], c3, &step2[24], &step2[23]);

  step2[0] = in[0];
  step2[1] = in[16];
  step2[2] = in[8];
  step2[3] = in[24];
  step2[4] = in[4];
  step2[5] = in[20];
  step2[6] = in[12];
  step2[7] = in[28];
  step2[8] = in[2];
  step2[9] = in[18];
  step2[10] = in[10];
  step2[11] = in[26];
  step2[12] = in[6];
  step2[13] = in[22];
  step2[14] = in[14];
  step2[15] = in[30];

  // stage 3: rotations on 8..15; first add/sub pass on the odd half.

  btf_16_lane_0_1_neon(step2[8], step2[15], c4, &step1[15], &step1[8]);
  btf_16_lane_2_3_neon(step2[9], step2[14], c4, &step1[14], &step1[9]);
  btf_16_lane_0_1_neon(step2[10], step2[13], c5, &step1[13], &step1[10]);
  btf_16_lane_2_3_neon(step2[11], step2[12], c5, &step1[12], &step1[11]);

  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];
  step1[4] = step2[4];
  step1[5] = step2[5];
  step1[6] = step2[6];
  step1[7] = step2[7];

  step1[16] = vqaddq_s16(step2[16], step2[17]);
  step1[17] = vqsubq_s16(step2[16], step2[17]);
  step1[18] = vqsubq_s16(step2[19], step2[18]);
  step1[19] = vqaddq_s16(step2[19], step2[18]);
  step1[20] = vqaddq_s16(step2[20], step2[21]);
  step1[21] = vqsubq_s16(step2[20], step2[21]);
  step1[22] = vqsubq_s16(step2[23], step2[22]);
  step1[23] = vqaddq_s16(step2[23], step2[22]);
  step1[24] = vqaddq_s16(step2[24], step2[25]);
  step1[25] = vqsubq_s16(step2[24], step2[25]);
  step1[26] = vqsubq_s16(step2[27], step2[26]);
  step1[27] = vqaddq_s16(step2[27], step2[26]);
  step1[28] = vqaddq_s16(step2[28], step2[29]);
  step1[29] = vqsubq_s16(step2[28], step2[29]);
  step1[30] = vqsubq_s16(step2[31], step2[30]);
  step1[31] = vqaddq_s16(step2[31], step2[30]);

  // stage 4: rotations on 4..7 and on the odd-half interior pairs (the
  // c8-based calls supply the negated-constant rotations).

  btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]);
  btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]);
  btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]);
  btf_16_lane_1_0_neon(step1[18], step1[29], c8, &step2[18], &step2[29]);
  btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]);
  btf_16_lane_3_2_neon(step1[22], step1[25], c8, &step2[22], &step2[25]);

  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[8] = vqaddq_s16(step1[8], step1[9]);
  step2[9] = vqsubq_s16(step1[8], step1[9]);
  step2[10] = vqsubq_s16(step1[11], step1[10]);
  step2[11] = vqaddq_s16(step1[11], step1[10]);
  step2[12] = vqaddq_s16(step1[12], step1[13]);
  step2[13] = vqsubq_s16(step1[12], step1[13]);
  step2[14] = vqsubq_s16(step1[15], step1[14]);
  step2[15] = vqaddq_s16(step1[15], step1[14]);
  step2[16] = step1[16];
  step2[19] = step1[19];
  step2[20] = step1[20];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[27] = step1[27];
  step2[28] = step1[28];
  step2[31] = step1[31];

  // stage 5: cospi[32]/[16]/[48] rotations on 0..3 and 9/10/13/14; add/sub
  // pass on 4..7 and the odd half.

  btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]);
  btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]);
  btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]);
  btf_16_lane_3_2_neon(step2[10], step2[13], c9, &step1[10], &step1[13]);

  step1[4] = vqaddq_s16(step2[4], step2[5]);
  step1[5] = vqsubq_s16(step2[4], step2[5]);
  step1[6] = vqsubq_s16(step2[7], step2[6]);
  step1[7] = vqaddq_s16(step2[7], step2[6]);
  step1[8] = step2[8];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[15] = step2[15];
  step1[16] = vqaddq_s16(step2[16], step2[19]);
  step1[17] = vqaddq_s16(step2[17], step2[18]);
  step1[18] = vqsubq_s16(step2[17], step2[18]);
  step1[19] = vqsubq_s16(step2[16], step2[19]);
  step1[20] = vqsubq_s16(step2[23], step2[20]);
  step1[21] = vqsubq_s16(step2[22], step2[21]);
  step1[22] = vqaddq_s16(step2[22], step2[21]);
  step1[23] = vqaddq_s16(step2[23], step2[20]);
  step1[24] = vqaddq_s16(step2[24], step2[27]);
  step1[25] = vqaddq_s16(step2[25], step2[26]);
  step1[26] = vqsubq_s16(step2[25], step2[26]);
  step1[27] = vqsubq_s16(step2[24], step2[27]);
  step1[28] = vqsubq_s16(step2[31], step2[28]);
  step1[29] = vqsubq_s16(step2[30], step2[29]);
  step1[30] = vqaddq_s16(step2[30], step2[29]);
  step1[31] = vqaddq_s16(step2[31], step2[28]);

  // stage 6

  btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]);
  btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]);
  btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]);
  btf_16_lane_3_2_neon(step1[20], step1[27], c9, &step2[20], &step2[27]);
  btf_16_lane_3_2_neon(step1[21], step1[26], c9, &step2[21], &step2[26]);

  step2[0] = vqaddq_s16(step1[0], step1[3]);
  step2[1] = vqaddq_s16(step1[1], step1[2]);
  step2[2] = vqsubq_s16(step1[1], step1[2]);
  step2[3] = vqsubq_s16(step1[0], step1[3]);
  step2[4] = step1[4];
  step2[7] = step1[7];
  step2[8] = vqaddq_s16(step1[8], step1[11]);
  step2[9] = vqaddq_s16(step1[9], step1[10]);
  step2[10] = vqsubq_s16(step1[9], step1[10]);
  step2[11] = vqsubq_s16(step1[8], step1[11]);
  step2[12] = vqsubq_s16(step1[15], step1[12]);
  step2[13] = vqsubq_s16(step1[14], step1[13]);
  step2[14] = vqaddq_s16(step1[14], step1[13]);
  step2[15] = vqaddq_s16(step1[15], step1[12]);
  step2[16] = step1[16];
  step2[17] = step1[17];
  step2[22] = step1[22];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[25] = step1[25];
  step2[30] = step1[30];
  step2[31] = step1[31];

  // stage 7

  btf_16_lane_0_1_neon(step2[13], step2[10], c7, &step1[13], &step1[10]);
  btf_16_lane_0_1_neon(step2[12], step2[11], c7, &step1[12], &step1[11]);

  step1[0] = vqaddq_s16(step2[0], step2[7]);
  step1[1] = vqaddq_s16(step2[1], step2[6]);
  step1[2] = vqaddq_s16(step2[2], step2[5]);
  step1[3] = vqaddq_s16(step2[3], step2[4]);
  step1[4] = vqsubq_s16(step2[3], step2[4]);
  step1[5] = vqsubq_s16(step2[2], step2[5]);
  step1[6] = vqsubq_s16(step2[1], step2[6]);
  step1[7] = vqsubq_s16(step2[0], step2[7]);
  step1[8] = step2[8];
  step1[9] = step2[9];
  step1[14] = step2[14];
  step1[15] = step2[15];
  step1[16] = vqaddq_s16(step2[16], step2[23]);
  step1[17] = vqaddq_s16(step2[17], step2[22]);
  step1[18] = vqaddq_s16(step2[18], step2[21]);
  step1[19] = vqaddq_s16(step2[19], step2[20]);
  step1[20] = vqsubq_s16(step2[19], step2[20]);
  step1[21] = vqsubq_s16(step2[18], step2[21]);
  step1[22] = vqsubq_s16(step2[17], step2[22]);
  step1[23] = vqsubq_s16(step2[16], step2[23]);
  step1[24] = vqsubq_s16(step2[31], step2[24]);
  step1[25] = vqsubq_s16(step2[30], step2[25]);
  step1[26] = vqsubq_s16(step2[29], step2[26]);
  step1[27] = vqsubq_s16(step2[28], step2[27]);
  step1[28] = vqaddq_s16(step2[27], step2[28]);
  step1[29] = vqaddq_s16(step2[26], step2[29]);
  step1[30] = vqaddq_s16(step2[25], step2[30]);
  step1[31] = vqaddq_s16(step2[24], step2[31]);

  // stage 8

  btf_16_lane_0_1_neon(step1[27], step1[20], c7, &step2[27], &step2[20]);
  btf_16_lane_0_1_neon(step1[26], step1[21], c7, &step2[26], &step2[21]);
  btf_16_lane_0_1_neon(step1[25], step1[22], c7, &step2[25], &step2[22]);
  btf_16_lane_0_1_neon(step1[24], step1[23], c7, &step2[24], &step2[23]);

  step2[0] = vqaddq_s16(step1[0], step1[15]);
  step2[1] = vqaddq_s16(step1[1], step1[14]);
  step2[2] = vqaddq_s16(step1[2], step1[13]);
  step2[3] = vqaddq_s16(step1[3], step1[12]);
  step2[4] = vqaddq_s16(step1[4], step1[11]);
  step2[5] = vqaddq_s16(step1[5], step1[10]);
  step2[6] = vqaddq_s16(step1[6], step1[9]);
  step2[7] = vqaddq_s16(step1[7], step1[8]);
  step2[8] = vqsubq_s16(step1[7], step1[8]);
  step2[9] = vqsubq_s16(step1[6], step1[9]);
  step2[10] = vqsubq_s16(step1[5], step1[10]);
  step2[11] = vqsubq_s16(step1[4], step1[11]);
  step2[12] = vqsubq_s16(step1[3], step1[12]);
  step2[13] = vqsubq_s16(step1[2], step1[13]);
  step2[14] = vqsubq_s16(step1[1], step1[14]);
  step2[15] = vqsubq_s16(step1[0], step1[15]);
  step2[16] = step1[16];
  step2[17] = step1[17];
  step2[18] = step1[18];
  step2[19] = step1[19];
  step2[28] = step1[28];
  step2[29] = step1[29];
  step2[30] = step1[30];
  step2[31] = step1[31];

  // stage 9: final mirror add/sub producing the 32 outputs.

  out[0] = vqaddq_s16(step2[0], step2[31]);
  out[1] = vqaddq_s16(step2[1], step2[30]);
  out[2] = vqaddq_s16(step2[2], step2[29]);
  out[3] = vqaddq_s16(step2[3], step2[28]);
  out[4] = vqaddq_s16(step2[4], step2[27]);
  out[5] = vqaddq_s16(step2[5], step2[26]);
  out[6] = vqaddq_s16(step2[6], step2[25]);
  out[7] = vqaddq_s16(step2[7], step2[24]);
  out[8] = vqaddq_s16(step2[8], step2[23]);
  out[9] = vqaddq_s16(step2[9], step2[22]);
  out[10] = vqaddq_s16(step2[10], step2[21]);
  out[11] = vqaddq_s16(step2[11], step2[20]);
  out[12] = vqaddq_s16(step2[12], step2[19]);
  out[13] = vqaddq_s16(step2[13], step2[18]);
  out[14] = vqaddq_s16(step2[14], step2[17]);
  out[15] = vqaddq_s16(step2[15], step2[16]);
  out[16] = vqsubq_s16(step2[15], step2[16]);
  out[17] = vqsubq_s16(step2[14], step2[17]);
  out[18] = vqsubq_s16(step2[13], step2[18]);
  out[19] = vqsubq_s16(step2[12], step2[19]);
  out[20] = vqsubq_s16(step2[11], step2[20]);
  out[21] = vqsubq_s16(step2[10], step2[21]);
  out[22] = vqsubq_s16(step2[9], step2[22]);
  out[23] = vqsubq_s16(step2[8], step2[23]);
  out[24] = vqsubq_s16(step2[7], step2[24]);
  out[25] = vqsubq_s16(step2[6], step2[25]);
  out[26] = vqsubq_s16(step2[5], step2[26]);
  out[27] = vqsubq_s16(step2[4], step2[27]);
  out[28] = vqsubq_s16(step2[3], step2[28]);
  out[29] = vqsubq_s16(step2[2], step2[29]);
  out[30] = vqsubq_s16(step2[1], step2[30]);
  out[31] = vqsubq_s16(step2[0], step2[31]);
}
1452 
// 32-point inverse DCT specialized for a single nonzero (DC) coefficient:
// only the cospi[32] scaling of stage 5 survives, and every one of the 32
// outputs receives that same scaled value.
static inline void idct32_low1_neon(int16x8_t *in, int16x8_t *out,
                                    int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);

  // stages 1-4 are pass-through for a lone DC input; stage 5 computes
  // round_shift(in[0] * cospi[32]) in 32-bit intermediates.
  const int32x4_t prod_lo = vmull_n_s16(vget_low_s16(in[0]), cospi[32]);
  const int32x4_t prod_hi = vmull_n_s16(vget_high_s16(in[0]), cospi[32]);
  const int16x8_t dc = vcombine_s16(vrshrn_n_s32(prod_lo, INV_COS_BIT),
                                    vrshrn_n_s32(prod_hi, INV_COS_BIT));

  // stages 6-9 simply propagate the DC value to all 32 output rows.
  for (int i = 0; i < 32; i++) {
    out[i] = dc;
  }
}
1508 
idct32_low8_neon(int16x8_t * in,int16x8_t * out,int8_t cos_bit)1509 static inline void idct32_low8_neon(int16x8_t *in, int16x8_t *out,
1510                                     int8_t cos_bit) {
1511   const int32_t *cospi = cospi_arr(cos_bit);
1512   int16x8_t step1[32], step2[32];
1513   int32x4_t t32[16];
1514   const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
1515                                       (int16_t)cospi[40], (int16_t)cospi[24]);
1516   const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
1517                                       (int16_t)cospi[16], cospi[48]);
1518   const int16x4_t c2 =
1519       set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
1520                      (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
1521   const int16x4_t c3 =
1522       set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
1523                      (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
1524   // stage 1
1525   // stage 2
1526 
1527   step2[0] = in[0];
1528   step2[4] = in[4];
1529   step2[8] = in[2];
1530   step2[12] = in[6];
1531 
1532   btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
1533   btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
1534   btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
1535   btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);
1536 
1537   // stage 3
1538   step1[0] = step2[0];
1539   step1[4] = step2[4];
1540 
1541   btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
1542   btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);
1543 
1544   step1[16] = step2[16];
1545   step1[17] = step2[16];
1546   step1[18] = step2[19];
1547   step1[19] = step2[19];
1548   step1[20] = step2[20];
1549   step1[21] = step2[20];
1550   step1[22] = step2[23];
1551   step1[23] = step2[23];
1552   step1[24] = step2[24];
1553   step1[25] = step2[24];
1554   step1[26] = step2[27];
1555   step1[27] = step2[27];
1556   step1[28] = step2[28];
1557   step1[29] = step2[28];
1558   step1[30] = step2[31];
1559   step1[31] = step2[31];
1560 
1561   // stage 4
1562 
1563   btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
1564   btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
1565   btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]);
1566   btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
1567   btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]);
1568 
1569   step2[0] = step1[0];
1570   step2[8] = step1[8];
1571   step2[9] = step1[8];
1572   step2[10] = step1[11];
1573   step2[11] = step1[11];
1574   step2[12] = step1[12];
1575   step2[13] = step1[12];
1576   step2[14] = step1[15];
1577   step2[15] = step1[15];
1578   step2[16] = step1[16];
1579   step2[19] = step1[19];
1580   step2[20] = step1[20];
1581   step2[23] = step1[23];
1582   step2[24] = step1[24];
1583   step2[27] = step1[27];
1584   step2[28] = step1[28];
1585   step2[31] = step1[31];
1586 
1587   // stage 5
1588 
1589   t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
1590   t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);
1591   step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
1592                           vrshrn_n_s32(t32[1], INV_COS_BIT));
1593 
1594   btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
1595   btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]);
1596 
1597   step1[4] = step2[4];
1598   step1[5] = step2[4];
1599   step1[6] = step2[7];
1600   step1[7] = step2[7];
1601   step1[8] = step2[8];
1602   step1[11] = step2[11];
1603   step1[12] = step2[12];
1604   step1[15] = step2[15];
1605   step1[16] = vqaddq_s16(step2[16], step2[19]);
1606   step1[17] = vqaddq_s16(step2[17], step2[18]);
1607   step1[18] = vqsubq_s16(step2[17], step2[18]);
1608   step1[19] = vqsubq_s16(step2[16], step2[19]);
1609   step1[20] = vqsubq_s16(step2[23], step2[20]);
1610   step1[21] = vqsubq_s16(step2[22], step2[21]);
1611   step1[22] = vqaddq_s16(step2[22], step2[21]);
1612   step1[23] = vqaddq_s16(step2[23], step2[20]);
1613   step1[24] = vqaddq_s16(step2[24], step2[27]);
1614   step1[25] = vqaddq_s16(step2[25], step2[26]);
1615   step1[26] = vqsubq_s16(step2[25], step2[26]);
1616   step1[27] = vqsubq_s16(step2[24], step2[27]);
1617   step1[28] = vqsubq_s16(step2[31], step2[28]);
1618   step1[29] = vqsubq_s16(step2[30], step2[29]);
1619   step1[30] = vqaddq_s16(step2[30], step2[29]);
1620   step1[31] = vqaddq_s16(step2[31], step2[28]);
1621 
1622   // stage 6
1623 
1624   btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
1625   btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
1626   btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
1627   btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]);
1628   btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]);
1629 
1630   step2[0] = step1[0];
1631   step2[1] = step1[0];
1632   step2[2] = step1[0];
1633   step2[3] = step1[0];
1634   step2[4] = step1[4];
1635   step2[7] = step1[7];
1636   step2[8] = vqaddq_s16(step1[8], step1[11]);
1637   step2[9] = vqaddq_s16(step1[9], step1[10]);
1638   step2[10] = vqsubq_s16(step1[9], step1[10]);
1639   step2[11] = vqsubq_s16(step1[8], step1[11]);
1640   step2[12] = vqsubq_s16(step1[15], step1[12]);
1641   step2[13] = vqsubq_s16(step1[14], step1[13]);
1642   step2[14] = vqaddq_s16(step1[14], step1[13]);
1643   step2[15] = vqaddq_s16(step1[15], step1[12]);
1644   step2[16] = step1[16];
1645   step2[17] = step1[17];
1646   step2[22] = step1[22];
1647   step2[23] = step1[23];
1648   step2[24] = step1[24];
1649   step2[25] = step1[25];
1650   step2[30] = step1[30];
1651   step2[31] = step1[31];
1652 
1653   // stage 7
1654 
1655   btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
1656   btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);
1657 
1658   step1[0] = vqaddq_s16(step2[0], step2[7]);
1659   step1[1] = vqaddq_s16(step2[1], step2[6]);
1660   step1[2] = vqaddq_s16(step2[2], step2[5]);
1661   step1[3] = vqaddq_s16(step2[3], step2[4]);
1662   step1[4] = vqsubq_s16(step2[3], step2[4]);
1663   step1[5] = vqsubq_s16(step2[2], step2[5]);
1664   step1[6] = vqsubq_s16(step2[1], step2[6]);
1665   step1[7] = vqsubq_s16(step2[0], step2[7]);
1666   step1[8] = step2[8];
1667   step1[9] = step2[9];
1668   step1[14] = step2[14];
1669   step1[15] = step2[15];
1670   step1[16] = vqaddq_s16(step2[16], step2[23]);
1671   step1[17] = vqaddq_s16(step2[17], step2[22]);
1672   step1[18] = vqaddq_s16(step2[18], step2[21]);
1673   step1[19] = vqaddq_s16(step2[19], step2[20]);
1674   step1[20] = vqsubq_s16(step2[19], step2[20]);
1675   step1[21] = vqsubq_s16(step2[18], step2[21]);
1676   step1[22] = vqsubq_s16(step2[17], step2[22]);
1677   step1[23] = vqsubq_s16(step2[16], step2[23]);
1678   step1[24] = vqsubq_s16(step2[31], step2[24]);
1679   step1[25] = vqsubq_s16(step2[30], step2[25]);
1680   step1[26] = vqsubq_s16(step2[29], step2[26]);
1681   step1[27] = vqsubq_s16(step2[28], step2[27]);
1682   step1[28] = vqaddq_s16(step2[27], step2[28]);
1683   step1[29] = vqaddq_s16(step2[26], step2[29]);
1684   step1[30] = vqaddq_s16(step2[25], step2[30]);
1685   step1[31] = vqaddq_s16(step2[24], step2[31]);
1686 
1687   // stage 8
1688 
1689   btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
1690   btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
1691   btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
1692   btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);
1693 
1694   step2[0] = vqaddq_s16(step1[0], step1[15]);
1695   step2[1] = vqaddq_s16(step1[1], step1[14]);
1696   step2[2] = vqaddq_s16(step1[2], step1[13]);
1697   step2[3] = vqaddq_s16(step1[3], step1[12]);
1698   step2[4] = vqaddq_s16(step1[4], step1[11]);
1699   step2[5] = vqaddq_s16(step1[5], step1[10]);
1700   step2[6] = vqaddq_s16(step1[6], step1[9]);
1701   step2[7] = vqaddq_s16(step1[7], step1[8]);
1702   step2[8] = vqsubq_s16(step1[7], step1[8]);
1703   step2[9] = vqsubq_s16(step1[6], step1[9]);
1704   step2[10] = vqsubq_s16(step1[5], step1[10]);
1705   step2[11] = vqsubq_s16(step1[4], step1[11]);
1706   step2[12] = vqsubq_s16(step1[3], step1[12]);
1707   step2[13] = vqsubq_s16(step1[2], step1[13]);
1708   step2[14] = vqsubq_s16(step1[1], step1[14]);
1709   step2[15] = vqsubq_s16(step1[0], step1[15]);
1710   step2[16] = step1[16];
1711   step2[17] = step1[17];
1712   step2[18] = step1[18];
1713   step2[19] = step1[19];
1714   step2[28] = step1[28];
1715   step2[29] = step1[29];
1716   step2[30] = step1[30];
1717   step2[31] = step1[31];
1718 
1719   // stage 9
1720 
1721   out[0] = vqaddq_s16(step2[0], step2[31]);
1722   out[1] = vqaddq_s16(step2[1], step2[30]);
1723   out[2] = vqaddq_s16(step2[2], step2[29]);
1724   out[3] = vqaddq_s16(step2[3], step2[28]);
1725   out[4] = vqaddq_s16(step2[4], step2[27]);
1726   out[5] = vqaddq_s16(step2[5], step2[26]);
1727   out[6] = vqaddq_s16(step2[6], step2[25]);
1728   out[7] = vqaddq_s16(step2[7], step2[24]);
1729   out[8] = vqaddq_s16(step2[8], step2[23]);
1730   out[9] = vqaddq_s16(step2[9], step2[22]);
1731   out[10] = vqaddq_s16(step2[10], step2[21]);
1732   out[11] = vqaddq_s16(step2[11], step2[20]);
1733   out[12] = vqaddq_s16(step2[12], step2[19]);
1734   out[13] = vqaddq_s16(step2[13], step2[18]);
1735   out[14] = vqaddq_s16(step2[14], step2[17]);
1736   out[15] = vqaddq_s16(step2[15], step2[16]);
1737   out[16] = vqsubq_s16(step2[15], step2[16]);
1738   out[17] = vqsubq_s16(step2[14], step2[17]);
1739   out[18] = vqsubq_s16(step2[13], step2[18]);
1740   out[19] = vqsubq_s16(step2[12], step2[19]);
1741   out[20] = vqsubq_s16(step2[11], step2[20]);
1742   out[21] = vqsubq_s16(step2[10], step2[21]);
1743   out[22] = vqsubq_s16(step2[9], step2[22]);
1744   out[23] = vqsubq_s16(step2[8], step2[23]);
1745   out[24] = vqsubq_s16(step2[7], step2[24]);
1746   out[25] = vqsubq_s16(step2[6], step2[25]);
1747   out[26] = vqsubq_s16(step2[5], step2[26]);
1748   out[27] = vqsubq_s16(step2[4], step2[27]);
1749   out[28] = vqsubq_s16(step2[3], step2[28]);
1750   out[29] = vqsubq_s16(step2[2], step2[29]);
1751   out[30] = vqsubq_s16(step2[1], step2[30]);
1752   out[31] = vqsubq_s16(step2[0], step2[31]);
1753 }
1754 
// 32-point inverse DCT specialized for inputs whose coefficients beyond the
// first 16 rows are zero: the stage-2/3 butterflies that would consume those
// rows are omitted, and step2[0] (the DC path) collapses to a single
// cospi[32] scaling in stage 5. Operates on 8 columns at a time packed in
// int16x8_t lanes.
//
// in      : 16 input rows of coefficients (in[0]..in[15]).
// out     : 32 output rows.
// cos_bit : fixed-point precision used to select the cosine table.
//
// All pairwise add/sub butterflies use saturating arithmetic
// (vqaddq_s16/vqsubq_s16) to keep intermediates within int16 range.
static inline void idct32_low16_neon(int16x8_t *in, int16x8_t *out,
                                     int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[32], step2[32];
  // Scratch for the widening DC multiply in stage 5; only two 32-bit
  // vectors are needed (low/high halves of one int16x8_t).
  int32x4_t t32[2];
  // Cosine constant vectors consumed by the lane-indexed butterfly helpers.
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);
  const int16x4_t c2 =
      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
  const int16x4_t c3 =
      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));

  // stage 1
  // stage 2

  // Odd input rows feed the 16-long tail (indices 16..31) directly.
  btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
  btf_16_neon(in[15], -cospi[34], cospi[30], &step2[17], &step2[30]);
  btf_16_neon(in[9], cospi[46], cospi[18], &step2[18], &step2[29]);
  btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
  btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
  btf_16_neon(in[11], -cospi[42], cospi[22], &step2[21], &step2[26]);
  btf_16_neon(in[13], cospi[38], cospi[26], &step2[22], &step2[25]);
  btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);

  // Even input rows pass through (the partner rows >= 16 are zero).
  step2[0] = in[0];
  step2[2] = in[8];
  step2[4] = in[4];
  step2[6] = in[12];
  step2[8] = in[2];
  step2[10] = in[10];
  step2[12] = in[6];
  step2[14] = in[14];

  // stage 3

  btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
  btf_16_neon(step2[14], -cospi[36], cospi[28], &step1[9], &step1[14]);
  btf_16_neon(step2[10], cospi[44], cospi[20], &step1[10], &step1[13]);
  btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);

  step1[0] = step2[0];
  step1[2] = step2[2];
  step1[4] = step2[4];
  step1[6] = step2[6];
  step1[16] = vqaddq_s16(step2[16], step2[17]);
  step1[17] = vqsubq_s16(step2[16], step2[17]);
  step1[18] = vqsubq_s16(step2[19], step2[18]);
  step1[19] = vqaddq_s16(step2[19], step2[18]);
  step1[20] = vqaddq_s16(step2[20], step2[21]);
  step1[21] = vqsubq_s16(step2[20], step2[21]);
  step1[22] = vqsubq_s16(step2[23], step2[22]);
  step1[23] = vqaddq_s16(step2[23], step2[22]);
  step1[24] = vqaddq_s16(step2[24], step2[25]);
  step1[25] = vqsubq_s16(step2[24], step2[25]);
  step1[26] = vqsubq_s16(step2[27], step2[26]);
  step1[27] = vqaddq_s16(step2[27], step2[26]);
  step1[28] = vqaddq_s16(step2[28], step2[29]);
  step1[29] = vqsubq_s16(step2[28], step2[29]);
  step1[30] = vqsubq_s16(step2[31], step2[30]);
  step1[31] = vqaddq_s16(step2[31], step2[30]);

  // stage 4

  btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
  btf_16_neon(step1[6], -cospi[40], cospi[24], &step2[5], &step2[6]);
  btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
  btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]);
  btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
  btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]);

  step2[0] = step1[0];
  step2[2] = step1[2];
  step2[8] = vqaddq_s16(step1[8], step1[9]);
  step2[9] = vqsubq_s16(step1[8], step1[9]);
  step2[10] = vqsubq_s16(step1[11], step1[10]);
  step2[11] = vqaddq_s16(step1[11], step1[10]);
  step2[12] = vqaddq_s16(step1[12], step1[13]);
  step2[13] = vqsubq_s16(step1[12], step1[13]);
  step2[14] = vqsubq_s16(step1[15], step1[14]);
  step2[15] = vqaddq_s16(step1[15], step1[14]);
  step2[16] = step1[16];
  step2[19] = step1[19];
  step2[20] = step1[20];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[27] = step1[27];
  step2[28] = step1[28];
  step2[31] = step1[31];

  // stage 5

  // DC path: only in[0] contributes, so step1[0] is just
  // step2[0] * cospi[32] with round-shift (widen, multiply, narrow).
  t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
  t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);

  step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
                          vrshrn_n_s32(t32[1], INV_COS_BIT));

  btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]);
  btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
  btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]);

  step1[4] = vqaddq_s16(step2[4], step2[5]);
  step1[5] = vqsubq_s16(step2[4], step2[5]);
  step1[6] = vqsubq_s16(step2[7], step2[6]);
  step1[7] = vqaddq_s16(step2[7], step2[6]);
  step1[8] = step2[8];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[15] = step2[15];
  step1[16] = vqaddq_s16(step2[16], step2[19]);
  step1[17] = vqaddq_s16(step2[17], step2[18]);
  step1[18] = vqsubq_s16(step2[17], step2[18]);
  step1[19] = vqsubq_s16(step2[16], step2[19]);
  step1[20] = vqsubq_s16(step2[23], step2[20]);
  step1[21] = vqsubq_s16(step2[22], step2[21]);
  step1[22] = vqaddq_s16(step2[22], step2[21]);
  step1[23] = vqaddq_s16(step2[23], step2[20]);
  step1[24] = vqaddq_s16(step2[24], step2[27]);
  step1[25] = vqaddq_s16(step2[25], step2[26]);
  step1[26] = vqsubq_s16(step2[25], step2[26]);
  step1[27] = vqsubq_s16(step2[24], step2[27]);
  step1[28] = vqsubq_s16(step2[31], step2[28]);
  step1[29] = vqsubq_s16(step2[30], step2[29]);
  step1[30] = vqaddq_s16(step2[30], step2[29]);
  step1[31] = vqaddq_s16(step2[31], step2[28]);

  // stage 6

  btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
  btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
  btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
  btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]);
  btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]);

  step2[0] = vqaddq_s16(step1[0], step1[3]);
  step2[1] = vqaddq_s16(step1[0], step1[2]);
  step2[2] = vqsubq_s16(step1[0], step1[2]);
  step2[3] = vqsubq_s16(step1[0], step1[3]);
  step2[4] = step1[4];
  step2[7] = step1[7];
  step2[8] = vqaddq_s16(step1[8], step1[11]);
  step2[9] = vqaddq_s16(step1[9], step1[10]);
  step2[10] = vqsubq_s16(step1[9], step1[10]);
  step2[11] = vqsubq_s16(step1[8], step1[11]);
  step2[12] = vqsubq_s16(step1[15], step1[12]);
  step2[13] = vqsubq_s16(step1[14], step1[13]);
  step2[14] = vqaddq_s16(step1[14], step1[13]);
  step2[15] = vqaddq_s16(step1[15], step1[12]);
  step2[16] = step1[16];
  step2[17] = step1[17];
  step2[22] = step1[22];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[25] = step1[25];
  step2[30] = step1[30];
  step2[31] = step1[31];

  // stage 7

  btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
  btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);

  step1[0] = vqaddq_s16(step2[0], step2[7]);
  step1[1] = vqaddq_s16(step2[1], step2[6]);
  step1[2] = vqaddq_s16(step2[2], step2[5]);
  step1[3] = vqaddq_s16(step2[3], step2[4]);
  step1[4] = vqsubq_s16(step2[3], step2[4]);
  step1[5] = vqsubq_s16(step2[2], step2[5]);
  step1[6] = vqsubq_s16(step2[1], step2[6]);
  step1[7] = vqsubq_s16(step2[0], step2[7]);
  step1[8] = step2[8];
  step1[9] = step2[9];
  step1[14] = step2[14];
  step1[15] = step2[15];
  step1[16] = vqaddq_s16(step2[16], step2[23]);
  step1[17] = vqaddq_s16(step2[17], step2[22]);
  step1[18] = vqaddq_s16(step2[18], step2[21]);
  step1[19] = vqaddq_s16(step2[19], step2[20]);
  step1[20] = vqsubq_s16(step2[19], step2[20]);
  step1[21] = vqsubq_s16(step2[18], step2[21]);
  step1[22] = vqsubq_s16(step2[17], step2[22]);
  step1[23] = vqsubq_s16(step2[16], step2[23]);
  step1[24] = vqsubq_s16(step2[31], step2[24]);
  step1[25] = vqsubq_s16(step2[30], step2[25]);
  step1[26] = vqsubq_s16(step2[29], step2[26]);
  step1[27] = vqsubq_s16(step2[28], step2[27]);
  step1[28] = vqaddq_s16(step2[27], step2[28]);
  step1[29] = vqaddq_s16(step2[26], step2[29]);
  step1[30] = vqaddq_s16(step2[25], step2[30]);
  step1[31] = vqaddq_s16(step2[24], step2[31]);

  // stage 8

  btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
  btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
  btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
  btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);

  step2[0] = vqaddq_s16(step1[0], step1[15]);
  step2[1] = vqaddq_s16(step1[1], step1[14]);
  step2[2] = vqaddq_s16(step1[2], step1[13]);
  step2[3] = vqaddq_s16(step1[3], step1[12]);
  step2[4] = vqaddq_s16(step1[4], step1[11]);
  step2[5] = vqaddq_s16(step1[5], step1[10]);
  step2[6] = vqaddq_s16(step1[6], step1[9]);
  step2[7] = vqaddq_s16(step1[7], step1[8]);
  step2[8] = vqsubq_s16(step1[7], step1[8]);
  step2[9] = vqsubq_s16(step1[6], step1[9]);
  step2[10] = vqsubq_s16(step1[5], step1[10]);
  step2[11] = vqsubq_s16(step1[4], step1[11]);
  step2[12] = vqsubq_s16(step1[3], step1[12]);
  step2[13] = vqsubq_s16(step1[2], step1[13]);
  step2[14] = vqsubq_s16(step1[1], step1[14]);
  step2[15] = vqsubq_s16(step1[0], step1[15]);
  step2[16] = step1[16];
  step2[17] = step1[17];
  step2[18] = step1[18];
  step2[19] = step1[19];
  step2[28] = step1[28];
  step2[29] = step1[29];
  step2[30] = step1[30];
  step2[31] = step1[31];

  // stage 9

  // Final mirrored butterfly: out[i] and out[31-i] are the sum and
  // difference of step2[i] and step2[31-i].
  out[0] = vqaddq_s16(step2[0], step2[31]);
  out[1] = vqaddq_s16(step2[1], step2[30]);
  out[2] = vqaddq_s16(step2[2], step2[29]);
  out[3] = vqaddq_s16(step2[3], step2[28]);
  out[4] = vqaddq_s16(step2[4], step2[27]);
  out[5] = vqaddq_s16(step2[5], step2[26]);
  out[6] = vqaddq_s16(step2[6], step2[25]);
  out[7] = vqaddq_s16(step2[7], step2[24]);
  out[8] = vqaddq_s16(step2[8], step2[23]);
  out[9] = vqaddq_s16(step2[9], step2[22]);
  out[10] = vqaddq_s16(step2[10], step2[21]);
  out[11] = vqaddq_s16(step2[11], step2[20]);
  out[12] = vqaddq_s16(step2[12], step2[19]);
  out[13] = vqaddq_s16(step2[13], step2[18]);
  out[14] = vqaddq_s16(step2[14], step2[17]);
  out[15] = vqaddq_s16(step2[15], step2[16]);
  out[16] = vqsubq_s16(step2[15], step2[16]);
  out[17] = vqsubq_s16(step2[14], step2[17]);
  out[18] = vqsubq_s16(step2[13], step2[18]);
  out[19] = vqsubq_s16(step2[12], step2[19]);
  out[20] = vqsubq_s16(step2[11], step2[20]);
  out[21] = vqsubq_s16(step2[10], step2[21]);
  out[22] = vqsubq_s16(step2[9], step2[22]);
  out[23] = vqsubq_s16(step2[8], step2[23]);
  out[24] = vqsubq_s16(step2[7], step2[24]);
  out[25] = vqsubq_s16(step2[6], step2[25]);
  out[26] = vqsubq_s16(step2[5], step2[26]);
  out[27] = vqsubq_s16(step2[4], step2[27]);
  out[28] = vqsubq_s16(step2[3], step2[28]);
  out[29] = vqsubq_s16(step2[2], step2[29]);
  out[30] = vqsubq_s16(step2[1], step2[30]);
  out[31] = vqsubq_s16(step2[0], step2[31]);
}
// Stage 9 of the 64-point inverse DCT, shared by the idct64_low* variants.
// Reads the stage-8 state from `step2` and writes the stage-9 state to
// `step1`:
//   - elements 20..27 are rotated pairwise by the cospi[32] constants
//     (via the lane-indexed butterfly helper),
//   - elements 0..15 become sum/difference pairs of step2[i]/step2[15-i],
//   - elements 16..19 and 28..31 pass through unchanged,
//   - elements 32..63 become sum/difference pairs mirrored around the
//     40/55 and 47/48 boundaries.
// All add/sub butterflies saturate (vqaddq_s16/vqsubq_s16) to keep
// intermediates within int16 range.
static inline void idct64_stage9_neon(int16x8_t *step2, int16x8_t *step1,
                                      int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  // Only lanes 0/1 (cospi[32], cospi[32]) of c3 are used by the
  // btf_16_lane_0_1_neon calls below.
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  btf_16_lane_0_1_neon(step2[27], step2[20], c3, &step1[27], &step1[20]);
  btf_16_lane_0_1_neon(step2[26], step2[21], c3, &step1[26], &step1[21]);
  btf_16_lane_0_1_neon(step2[25], step2[22], c3, &step1[25], &step1[22]);
  btf_16_lane_0_1_neon(step2[24], step2[23], c3, &step1[24], &step1[23]);

  step1[0] = vqaddq_s16(step2[0], step2[15]);
  step1[1] = vqaddq_s16(step2[1], step2[14]);
  step1[2] = vqaddq_s16(step2[2], step2[13]);
  step1[3] = vqaddq_s16(step2[3], step2[12]);
  step1[4] = vqaddq_s16(step2[4], step2[11]);
  step1[5] = vqaddq_s16(step2[5], step2[10]);
  step1[6] = vqaddq_s16(step2[6], step2[9]);
  step1[7] = vqaddq_s16(step2[7], step2[8]);
  step1[8] = vqsubq_s16(step2[7], step2[8]);
  step1[9] = vqsubq_s16(step2[6], step2[9]);
  step1[10] = vqsubq_s16(step2[5], step2[10]);
  step1[11] = vqsubq_s16(step2[4], step2[11]);
  step1[12] = vqsubq_s16(step2[3], step2[12]);
  step1[13] = vqsubq_s16(step2[2], step2[13]);
  step1[14] = vqsubq_s16(step2[1], step2[14]);
  step1[15] = vqsubq_s16(step2[0], step2[15]);
  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];
  step1[32] = vqaddq_s16(step2[32], step2[47]);
  step1[33] = vqaddq_s16(step2[33], step2[46]);
  step1[34] = vqaddq_s16(step2[34], step2[45]);
  step1[35] = vqaddq_s16(step2[35], step2[44]);
  step1[36] = vqaddq_s16(step2[36], step2[43]);
  step1[37] = vqaddq_s16(step2[37], step2[42]);
  step1[38] = vqaddq_s16(step2[38], step2[41]);
  step1[39] = vqaddq_s16(step2[39], step2[40]);
  step1[40] = vqsubq_s16(step2[39], step2[40]);
  step1[41] = vqsubq_s16(step2[38], step2[41]);
  step1[42] = vqsubq_s16(step2[37], step2[42]);
  step1[43] = vqsubq_s16(step2[36], step2[43]);
  step1[44] = vqsubq_s16(step2[35], step2[44]);
  step1[45] = vqsubq_s16(step2[34], step2[45]);
  step1[46] = vqsubq_s16(step2[33], step2[46]);
  step1[47] = vqsubq_s16(step2[32], step2[47]);
  step1[48] = vqsubq_s16(step2[63], step2[48]);
  step1[49] = vqsubq_s16(step2[62], step2[49]);
  step1[50] = vqsubq_s16(step2[61], step2[50]);
  step1[51] = vqsubq_s16(step2[60], step2[51]);
  step1[52] = vqsubq_s16(step2[59], step2[52]);
  step1[53] = vqsubq_s16(step2[58], step2[53]);
  step1[54] = vqsubq_s16(step2[57], step2[54]);
  step1[55] = vqsubq_s16(step2[56], step2[55]);
  step1[56] = vqaddq_s16(step2[56], step2[55]);
  step1[57] = vqaddq_s16(step2[57], step2[54]);
  step1[58] = vqaddq_s16(step2[58], step2[53]);
  step1[59] = vqaddq_s16(step2[59], step2[52]);
  step1[60] = vqaddq_s16(step2[60], step2[51]);
  step1[61] = vqaddq_s16(step2[61], step2[50]);
  step1[62] = vqaddq_s16(step2[62], step2[49]);
  step1[63] = vqaddq_s16(step2[63], step2[48]);
}
2085 
// Stage 10 of the 64-point inverse DCT, shared by the idct64_low* variants.
// Reads the stage-9 state from `step1` and writes the stage-10 state to
// `step2`:
//   - elements 40..55 are rotated pairwise by the cospi[32] constants
//     (via the lane-indexed butterfly helper),
//   - elements 0..31 become sum/difference pairs of step1[i]/step1[31-i],
//   - elements 32..39 and 56..63 pass through unchanged.
// All add/sub butterflies saturate (vqaddq_s16/vqsubq_s16) to keep
// intermediates within int16 range.
static inline void idct64_stage10_neon(int16x8_t *step1, int16x8_t *step2,
                                       int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  // Only lanes 0/1 (cospi[32], cospi[32]) of c3 are used by the
  // btf_16_lane_0_1_neon calls below.
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  btf_16_lane_0_1_neon(step1[55], step1[40], c3, &step2[55], &step2[40]);
  btf_16_lane_0_1_neon(step1[54], step1[41], c3, &step2[54], &step2[41]);
  btf_16_lane_0_1_neon(step1[53], step1[42], c3, &step2[53], &step2[42]);
  btf_16_lane_0_1_neon(step1[52], step1[43], c3, &step2[52], &step2[43]);
  btf_16_lane_0_1_neon(step1[51], step1[44], c3, &step2[51], &step2[44]);
  btf_16_lane_0_1_neon(step1[50], step1[45], c3, &step2[50], &step2[45]);
  btf_16_lane_0_1_neon(step1[49], step1[46], c3, &step2[49], &step2[46]);
  btf_16_lane_0_1_neon(step1[48], step1[47], c3, &step2[48], &step2[47]);

  step2[0] = vqaddq_s16(step1[0], step1[31]);
  step2[1] = vqaddq_s16(step1[1], step1[30]);
  step2[2] = vqaddq_s16(step1[2], step1[29]);
  step2[3] = vqaddq_s16(step1[3], step1[28]);
  step2[4] = vqaddq_s16(step1[4], step1[27]);
  step2[5] = vqaddq_s16(step1[5], step1[26]);
  step2[6] = vqaddq_s16(step1[6], step1[25]);
  step2[7] = vqaddq_s16(step1[7], step1[24]);
  step2[8] = vqaddq_s16(step1[8], step1[23]);
  step2[9] = vqaddq_s16(step1[9], step1[22]);
  step2[10] = vqaddq_s16(step1[10], step1[21]);
  step2[11] = vqaddq_s16(step1[11], step1[20]);
  step2[12] = vqaddq_s16(step1[12], step1[19]);
  step2[13] = vqaddq_s16(step1[13], step1[18]);
  step2[14] = vqaddq_s16(step1[14], step1[17]);
  step2[15] = vqaddq_s16(step1[15], step1[16]);
  step2[16] = vqsubq_s16(step1[15], step1[16]);
  step2[17] = vqsubq_s16(step1[14], step1[17]);
  step2[18] = vqsubq_s16(step1[13], step1[18]);
  step2[19] = vqsubq_s16(step1[12], step1[19]);
  step2[20] = vqsubq_s16(step1[11], step1[20]);
  step2[21] = vqsubq_s16(step1[10], step1[21]);
  step2[22] = vqsubq_s16(step1[9], step1[22]);
  step2[23] = vqsubq_s16(step1[8], step1[23]);
  step2[24] = vqsubq_s16(step1[7], step1[24]);
  step2[25] = vqsubq_s16(step1[6], step1[25]);
  step2[26] = vqsubq_s16(step1[5], step1[26]);
  step2[27] = vqsubq_s16(step1[4], step1[27]);
  step2[28] = vqsubq_s16(step1[3], step1[28]);
  step2[29] = vqsubq_s16(step1[2], step1[29]);
  step2[30] = vqsubq_s16(step1[1], step1[30]);
  step2[31] = vqsubq_s16(step1[0], step1[31]);
  step2[32] = step1[32];
  step2[33] = step1[33];
  step2[34] = step1[34];
  step2[35] = step1[35];
  step2[36] = step1[36];
  step2[37] = step1[37];
  step2[38] = step1[38];
  step2[39] = step1[39];
  step2[56] = step1[56];
  step2[57] = step1[57];
  step2[58] = step1[58];
  step2[59] = step1[59];
  step2[60] = step1[60];
  step2[61] = step1[61];
  step2[62] = step1[62];
  step2[63] = step1[63];
}
2150 
idct64_low32_neon(int16x8_t * in,int16x8_t * out,int8_t cos_bit)2151 static inline void idct64_low32_neon(int16x8_t *in, int16x8_t *out,
2152                                      int8_t cos_bit) {
2153   const int32_t *cospi = cospi_arr(cos_bit);
2154   int16x8_t step2[64], step1[64];
2155   const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
2156                                       (int16_t)cospi[36], (int16_t)cospi[28]);
2157   const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
2158                                       (int16_t)cospi[52], (int16_t)cospi[12]);
2159   const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
2160                                       (int16_t)cospi[40], (int16_t)cospi[24]);
2161   const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
2162                                       (int16_t)cospi[16], (int16_t)cospi[48]);
2163   const int16x4_t c4 =
2164       set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]),
2165                      (int16_t)(-cospi[36]), (int16_t)(-cospi[28]));
2166   const int16x4_t c5 =
2167       set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]),
2168                      (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
2169   const int16x4_t c6 =
2170       set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
2171                      (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
2172   const int16x4_t c7 =
2173       set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
2174                      (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
2175 
2176   // stage 1
2177   // stage 2
2178 
2179   step2[0] = in[0];
2180   step2[2] = in[16];
2181   step2[4] = in[8];
2182   step2[6] = in[24];
2183   step2[8] = in[4];
2184   step2[10] = in[20];
2185   step2[12] = in[12];
2186   step2[14] = in[28];
2187   step2[16] = in[2];
2188   step2[18] = in[18];
2189   step2[20] = in[10];
2190   step2[22] = in[26];
2191   step2[24] = in[6];
2192   step2[26] = in[22];
2193   step2[28] = in[14];
2194   step2[30] = in[30];
2195 
2196   btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
2197   btf_16_neon(in[31], -cospi[33], cospi[31], &step2[33], &step2[62]);
2198   btf_16_neon(in[17], cospi[47], cospi[17], &step2[34], &step2[61]);
2199   btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]);
2200   btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]);
2201   btf_16_neon(in[23], -cospi[41], cospi[23], &step2[37], &step2[58]);
2202   btf_16_neon(in[25], cospi[39], cospi[25], &step2[38], &step2[57]);
2203   btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
2204   btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
2205   btf_16_neon(in[27], -cospi[37], cospi[27], &step2[41], &step2[54]);
2206   btf_16_neon(in[21], cospi[43], cospi[21], &step2[42], &step2[53]);
2207   btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]);
2208   btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]);
2209   btf_16_neon(in[19], -cospi[45], cospi[19], &step2[45], &step2[50]);
2210   btf_16_neon(in[29], cospi[35], cospi[29], &step2[46], &step2[49]);
2211   btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);
2212 
2213   // stage 3
2214 
2215   step1[0] = step2[0];
2216   step1[2] = step2[2];
2217   step1[4] = step2[4];
2218   step1[6] = step2[6];
2219   step1[8] = step2[8];
2220   step1[10] = step2[10];
2221   step1[12] = step2[12];
2222   step1[14] = step2[14];
2223 
2224   btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
2225   btf_16_neon(step2[30], -cospi[34], cospi[30], &step1[17], &step1[30]);
2226   btf_16_neon(step2[18], cospi[46], cospi[18], &step1[18], &step1[29]);
2227   btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]);
2228   btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]);
2229   btf_16_neon(step2[26], -cospi[42], cospi[22], &step1[21], &step1[26]);
2230   btf_16_neon(step2[22], cospi[38], cospi[26], &step1[22], &step1[25]);
2231   btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);
2232 
2233   step1[32] = vqaddq_s16(step2[32], step2[33]);
2234   step1[33] = vqsubq_s16(step2[32], step2[33]);
2235   step1[34] = vqsubq_s16(step2[35], step2[34]);
2236   step1[35] = vqaddq_s16(step2[35], step2[34]);
2237   step1[36] = vqaddq_s16(step2[36], step2[37]);
2238   step1[37] = vqsubq_s16(step2[36], step2[37]);
2239   step1[38] = vqsubq_s16(step2[39], step2[38]);
2240   step1[39] = vqaddq_s16(step2[39], step2[38]);
2241   step1[40] = vqaddq_s16(step2[40], step2[41]);
2242   step1[41] = vqsubq_s16(step2[40], step2[41]);
2243   step1[42] = vqsubq_s16(step2[43], step2[42]);
2244   step1[43] = vqaddq_s16(step2[43], step2[42]);
2245   step1[44] = vqaddq_s16(step2[44], step2[45]);
2246   step1[45] = vqsubq_s16(step2[44], step2[45]);
2247   step1[46] = vqsubq_s16(step2[47], step2[46]);
2248   step1[47] = vqaddq_s16(step2[47], step2[46]);
2249   step1[48] = vqaddq_s16(step2[48], step2[49]);
2250   step1[49] = vqsubq_s16(step2[48], step2[49]);
2251   step1[50] = vqsubq_s16(step2[51], step2[50]);
2252   step1[51] = vqaddq_s16(step2[51], step2[50]);
2253   step1[52] = vqaddq_s16(step2[52], step2[53]);
2254   step1[53] = vqsubq_s16(step2[52], step2[53]);
2255   step1[54] = vqsubq_s16(step2[55], step2[54]);
2256   step1[55] = vqaddq_s16(step2[55], step2[54]);
2257   step1[56] = vqaddq_s16(step2[56], step2[57]);
2258   step1[57] = vqsubq_s16(step2[56], step2[57]);
2259   step1[58] = vqsubq_s16(step2[59], step2[58]);
2260   step1[59] = vqaddq_s16(step2[59], step2[58]);
2261   step1[60] = vqaddq_s16(step2[60], step2[61]);
2262   step1[61] = vqsubq_s16(step2[60], step2[61]);
2263   step1[62] = vqsubq_s16(step2[63], step2[62]);
2264   step1[63] = vqaddq_s16(step2[63], step2[62]);
2265 
2266   // stage 4
2267 
2268   step2[0] = step1[0];
2269   step2[2] = step1[2];
2270   step2[4] = step1[4];
2271   step2[6] = step1[6];
2272 
2273   btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
2274   btf_16_neon(step1[14], -cospi[36], cospi[28], &step2[9], &step2[14]);
2275   btf_16_neon(step1[10], cospi[44], cospi[20], &step2[10], &step2[13]);
2276   btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]);
2277   btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
2278   btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]);
2279   btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]);
2280   btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
2281   btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
2282   btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]);
2283   btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]);
2284   btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]);
2285 
2286   step2[16] = vqaddq_s16(step1[16], step1[17]);
2287   step2[17] = vqsubq_s16(step1[16], step1[17]);
2288   step2[18] = vqsubq_s16(step1[19], step1[18]);
2289   step2[19] = vqaddq_s16(step1[19], step1[18]);
2290   step2[20] = vqaddq_s16(step1[20], step1[21]);
2291   step2[21] = vqsubq_s16(step1[20], step1[21]);
2292   step2[22] = vqsubq_s16(step1[23], step1[22]);
2293   step2[23] = vqaddq_s16(step1[23], step1[22]);
2294   step2[24] = vqaddq_s16(step1[24], step1[25]);
2295   step2[25] = vqsubq_s16(step1[24], step1[25]);
2296   step2[26] = vqsubq_s16(step1[27], step1[26]);
2297   step2[27] = vqaddq_s16(step1[27], step1[26]);
2298   step2[28] = vqaddq_s16(step1[28], step1[29]);
2299   step2[29] = vqsubq_s16(step1[28], step1[29]);
2300   step2[30] = vqsubq_s16(step1[31], step1[30]);
2301   step2[31] = vqaddq_s16(step1[31], step1[30]);
2302   step2[32] = step1[32];
2303   step2[35] = step1[35];
2304   step2[36] = step1[36];
2305   step2[39] = step1[39];
2306   step2[40] = step1[40];
2307   step2[43] = step1[43];
2308   step2[44] = step1[44];
2309   step2[47] = step1[47];
2310   step2[48] = step1[48];
2311   step2[51] = step1[51];
2312   step2[52] = step1[52];
2313   step2[55] = step1[55];
2314   step2[56] = step1[56];
2315   step2[59] = step1[59];
2316   step2[60] = step1[60];
2317   step2[63] = step1[63];
2318 
2319   // stage 5
2320 
2321   step1[0] = step2[0];
2322   step1[2] = step2[2];
2323 
2324   btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
2325   btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]);
2326   btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
2327   btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]);
2328   btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]);
2329   btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]);
2330 
2331   step1[8] = vqaddq_s16(step2[8], step2[9]);
2332   step1[9] = vqsubq_s16(step2[8], step2[9]);
2333   step1[10] = vqsubq_s16(step2[11], step2[10]);
2334   step1[11] = vqaddq_s16(step2[11], step2[10]);
2335   step1[12] = vqaddq_s16(step2[12], step2[13]);
2336   step1[13] = vqsubq_s16(step2[12], step2[13]);
2337   step1[14] = vqsubq_s16(step2[15], step2[14]);
2338   step1[15] = vqaddq_s16(step2[15], step2[14]);
2339   step1[16] = step2[16];
2340   step1[19] = step2[19];
2341   step1[20] = step2[20];
2342   step1[23] = step2[23];
2343   step1[24] = step2[24];
2344   step1[27] = step2[27];
2345   step1[28] = step2[28];
2346   step1[31] = step2[31];
2347   step1[32] = vqaddq_s16(step2[32], step2[35]);
2348   step1[33] = vqaddq_s16(step2[33], step2[34]);
2349   step1[34] = vqsubq_s16(step2[33], step2[34]);
2350   step1[35] = vqsubq_s16(step2[32], step2[35]);
2351   step1[36] = vqsubq_s16(step2[39], step2[36]);
2352   step1[37] = vqsubq_s16(step2[38], step2[37]);
2353   step1[38] = vqaddq_s16(step2[38], step2[37]);
2354   step1[39] = vqaddq_s16(step2[39], step2[36]);
2355   step1[40] = vqaddq_s16(step2[40], step2[43]);
2356   step1[41] = vqaddq_s16(step2[41], step2[42]);
2357   step1[42] = vqsubq_s16(step2[41], step2[42]);
2358   step1[43] = vqsubq_s16(step2[40], step2[43]);
2359   step1[44] = vqsubq_s16(step2[47], step2[44]);
2360   step1[45] = vqsubq_s16(step2[46], step2[45]);
2361   step1[46] = vqaddq_s16(step2[46], step2[45]);
2362   step1[47] = vqaddq_s16(step2[47], step2[44]);
2363   step1[48] = vqaddq_s16(step2[48], step2[51]);
2364   step1[49] = vqaddq_s16(step2[49], step2[50]);
2365   step1[50] = vqsubq_s16(step2[49], step2[50]);
2366   step1[51] = vqsubq_s16(step2[48], step2[51]);
2367   step1[52] = vqsubq_s16(step2[55], step2[52]);
2368   step1[53] = vqsubq_s16(step2[54], step2[53]);
2369   step1[54] = vqaddq_s16(step2[54], step2[53]);
2370   step1[55] = vqaddq_s16(step2[55], step2[52]);
2371   step1[56] = vqaddq_s16(step2[56], step2[59]);
2372   step1[57] = vqaddq_s16(step2[57], step2[58]);
2373   step1[58] = vqsubq_s16(step2[57], step2[58]);
2374   step1[59] = vqsubq_s16(step2[56], step2[59]);
2375   step1[60] = vqsubq_s16(step2[63], step2[60]);
2376   step1[61] = vqsubq_s16(step2[62], step2[61]);
2377   step1[62] = vqaddq_s16(step2[62], step2[61]);
2378   step1[63] = vqaddq_s16(step2[63], step2[60]);
2379 
2380   // stage 6
2381 
2382   btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
2383   btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]);
2384   btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
2385   btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]);
2386   btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
2387   btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
2388   btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]);
2389   btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]);
2390   btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
2391   btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
2392   btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]);
2393   btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]);
2394 
2395   step2[4] = vqaddq_s16(step1[4], step1[5]);
2396   step2[5] = vqsubq_s16(step1[4], step1[5]);
2397   step2[6] = vqsubq_s16(step1[7], step1[6]);
2398   step2[7] = vqaddq_s16(step1[7], step1[6]);
2399   step2[8] = step1[8];
2400   step2[11] = step1[11];
2401   step2[12] = step1[12];
2402   step2[15] = step1[15];
2403   step2[16] = vqaddq_s16(step1[16], step1[19]);
2404   step2[17] = vqaddq_s16(step1[17], step1[18]);
2405   step2[18] = vqsubq_s16(step1[17], step1[18]);
2406   step2[19] = vqsubq_s16(step1[16], step1[19]);
2407   step2[20] = vqsubq_s16(step1[23], step1[20]);
2408   step2[21] = vqsubq_s16(step1[22], step1[21]);
2409   step2[22] = vqaddq_s16(step1[22], step1[21]);
2410   step2[23] = vqaddq_s16(step1[23], step1[20]);
2411   step2[24] = vqaddq_s16(step1[24], step1[27]);
2412   step2[25] = vqaddq_s16(step1[25], step1[26]);
2413   step2[26] = vqsubq_s16(step1[25], step1[26]);
2414   step2[27] = vqsubq_s16(step1[24], step1[27]);
2415   step2[28] = vqsubq_s16(step1[31], step1[28]);
2416   step2[29] = vqsubq_s16(step1[30], step1[29]);
2417   step2[30] = vqaddq_s16(step1[30], step1[29]);
2418   step2[31] = vqaddq_s16(step1[31], step1[28]);
2419   step2[32] = step1[32];
2420   step2[33] = step1[33];
2421   step2[38] = step1[38];
2422   step2[39] = step1[39];
2423   step2[40] = step1[40];
2424   step2[41] = step1[41];
2425   step2[46] = step1[46];
2426   step2[47] = step1[47];
2427   step2[48] = step1[48];
2428   step2[49] = step1[49];
2429   step2[54] = step1[54];
2430   step2[55] = step1[55];
2431   step2[56] = step1[56];
2432   step2[57] = step1[57];
2433   step2[62] = step1[62];
2434   step2[63] = step1[63];
2435 
2436   // stage 7
2437 
2438   btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
2439   btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
2440   btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
2441   btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]);
2442   btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]);
2443 
2444   step1[0] = vqaddq_s16(step2[0], step2[3]);
2445   step1[1] = vqaddq_s16(step2[1], step2[2]);
2446   step1[2] = vqsubq_s16(step2[1], step2[2]);
2447   step1[3] = vqsubq_s16(step2[0], step2[3]);
2448   step1[4] = step2[4];
2449   step1[7] = step2[7];
2450   step1[8] = vqaddq_s16(step2[8], step2[11]);
2451   step1[9] = vqaddq_s16(step2[9], step2[10]);
2452   step1[10] = vqsubq_s16(step2[9], step2[10]);
2453   step1[11] = vqsubq_s16(step2[8], step2[11]);
2454   step1[12] = vqsubq_s16(step2[15], step2[12]);
2455   step1[13] = vqsubq_s16(step2[14], step2[13]);
2456   step1[14] = vqaddq_s16(step2[14], step2[13]);
2457   step1[15] = vqaddq_s16(step2[15], step2[12]);
2458   step1[16] = step2[16];
2459   step1[17] = step2[17];
2460   step1[22] = step2[22];
2461   step1[23] = step2[23];
2462   step1[24] = step2[24];
2463   step1[25] = step2[25];
2464   step1[30] = step2[30];
2465   step1[31] = step2[31];
2466   step1[32] = vqaddq_s16(step2[32], step2[39]);
2467   step1[33] = vqaddq_s16(step2[33], step2[38]);
2468   step1[34] = vqaddq_s16(step2[34], step2[37]);
2469   step1[35] = vqaddq_s16(step2[35], step2[36]);
2470   step1[36] = vqsubq_s16(step2[35], step2[36]);
2471   step1[37] = vqsubq_s16(step2[34], step2[37]);
2472   step1[38] = vqsubq_s16(step2[33], step2[38]);
2473   step1[39] = vqsubq_s16(step2[32], step2[39]);
2474   step1[40] = vqsubq_s16(step2[47], step2[40]);
2475   step1[41] = vqsubq_s16(step2[46], step2[41]);
2476   step1[42] = vqsubq_s16(step2[45], step2[42]);
2477   step1[43] = vqsubq_s16(step2[44], step2[43]);
2478   step1[44] = vqaddq_s16(step2[43], step2[44]);
2479   step1[45] = vqaddq_s16(step2[42], step2[45]);
2480   step1[46] = vqaddq_s16(step2[41], step2[46]);
2481   step1[47] = vqaddq_s16(step2[40], step2[47]);
2482   step1[48] = vqaddq_s16(step2[48], step2[55]);
2483   step1[49] = vqaddq_s16(step2[49], step2[54]);
2484   step1[50] = vqaddq_s16(step2[50], step2[53]);
2485   step1[51] = vqaddq_s16(step2[51], step2[52]);
2486   step1[52] = vqsubq_s16(step2[51], step2[52]);
2487   step1[53] = vqsubq_s16(step2[50], step2[53]);
2488   step1[54] = vqsubq_s16(step2[49], step2[54]);
2489   step1[55] = vqsubq_s16(step2[48], step2[55]);
2490   step1[56] = vqsubq_s16(step2[63], step2[56]);
2491   step1[57] = vqsubq_s16(step2[62], step2[57]);
2492   step1[58] = vqsubq_s16(step2[61], step2[58]);
2493   step1[59] = vqsubq_s16(step2[60], step2[59]);
2494   step1[60] = vqaddq_s16(step2[59], step2[60]);
2495   step1[61] = vqaddq_s16(step2[58], step2[61]);
2496   step1[62] = vqaddq_s16(step2[57], step2[62]);
2497   step1[63] = vqaddq_s16(step2[56], step2[63]);
2498 
2499   // stage 8
2500 
2501   btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
2502   btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
2503   btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
2504   btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
2505   btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
2506   btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
2507   btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]);
2508   btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]);
2509   btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]);
2510   btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]);
2511 
2512   step2[0] = vqaddq_s16(step1[0], step1[7]);
2513   step2[1] = vqaddq_s16(step1[1], step1[6]);
2514   step2[2] = vqaddq_s16(step1[2], step1[5]);
2515   step2[3] = vqaddq_s16(step1[3], step1[4]);
2516   step2[4] = vqsubq_s16(step1[3], step1[4]);
2517   step2[5] = vqsubq_s16(step1[2], step1[5]);
2518   step2[6] = vqsubq_s16(step1[1], step1[6]);
2519   step2[7] = vqsubq_s16(step1[0], step1[7]);
2520   step2[8] = step1[8];
2521   step2[9] = step1[9];
2522   step2[14] = step1[14];
2523   step2[15] = step1[15];
2524   step2[16] = vqaddq_s16(step1[16], step1[23]);
2525   step2[17] = vqaddq_s16(step1[17], step1[22]);
2526   step2[18] = vqaddq_s16(step1[18], step1[21]);
2527   step2[19] = vqaddq_s16(step1[19], step1[20]);
2528   step2[20] = vqsubq_s16(step1[19], step1[20]);
2529   step2[21] = vqsubq_s16(step1[18], step1[21]);
2530   step2[22] = vqsubq_s16(step1[17], step1[22]);
2531   step2[23] = vqsubq_s16(step1[16], step1[23]);
2532   step2[24] = vqsubq_s16(step1[31], step1[24]);
2533   step2[25] = vqsubq_s16(step1[30], step1[25]);
2534   step2[26] = vqsubq_s16(step1[29], step1[26]);
2535   step2[27] = vqsubq_s16(step1[28], step1[27]);
2536   step2[28] = vqaddq_s16(step1[28], step1[27]);
2537   step2[29] = vqaddq_s16(step1[29], step1[26]);
2538   step2[30] = vqaddq_s16(step1[30], step1[25]);
2539   step2[31] = vqaddq_s16(step1[31], step1[24]);
2540   step2[32] = step1[32];
2541   step2[33] = step1[33];
2542   step2[34] = step1[34];
2543   step2[35] = step1[35];
2544   step2[44] = step1[44];
2545   step2[45] = step1[45];
2546   step2[46] = step1[46];
2547   step2[47] = step1[47];
2548   step2[48] = step1[48];
2549   step2[49] = step1[49];
2550   step2[50] = step1[50];
2551   step2[51] = step1[51];
2552   step2[60] = step1[60];
2553   step2[61] = step1[61];
2554   step2[62] = step1[62];
2555   step2[63] = step1[63];
2556 
2557   // stage 9
2558   idct64_stage9_neon(step2, step1, cos_bit);
2559 
2560   // stage 10
2561   idct64_stage10_neon(step1, step2, cos_bit);
2562 
2563   // stage 11
2564 
2565   out[0] = vqaddq_s16(step2[0], step2[63]);
2566   out[1] = vqaddq_s16(step2[1], step2[62]);
2567   out[2] = vqaddq_s16(step2[2], step2[61]);
2568   out[3] = vqaddq_s16(step2[3], step2[60]);
2569   out[4] = vqaddq_s16(step2[4], step2[59]);
2570   out[5] = vqaddq_s16(step2[5], step2[58]);
2571   out[6] = vqaddq_s16(step2[6], step2[57]);
2572   out[7] = vqaddq_s16(step2[7], step2[56]);
2573   out[8] = vqaddq_s16(step2[8], step2[55]);
2574   out[9] = vqaddq_s16(step2[9], step2[54]);
2575   out[10] = vqaddq_s16(step2[10], step2[53]);
2576   out[11] = vqaddq_s16(step2[11], step2[52]);
2577   out[12] = vqaddq_s16(step2[12], step2[51]);
2578   out[13] = vqaddq_s16(step2[13], step2[50]);
2579   out[14] = vqaddq_s16(step2[14], step2[49]);
2580   out[15] = vqaddq_s16(step2[15], step2[48]);
2581   out[16] = vqaddq_s16(step2[16], step2[47]);
2582   out[17] = vqaddq_s16(step2[17], step2[46]);
2583   out[18] = vqaddq_s16(step2[18], step2[45]);
2584   out[19] = vqaddq_s16(step2[19], step2[44]);
2585   out[20] = vqaddq_s16(step2[20], step2[43]);
2586   out[21] = vqaddq_s16(step2[21], step2[42]);
2587   out[22] = vqaddq_s16(step2[22], step2[41]);
2588   out[23] = vqaddq_s16(step2[23], step2[40]);
2589   out[24] = vqaddq_s16(step2[24], step2[39]);
2590   out[25] = vqaddq_s16(step2[25], step2[38]);
2591   out[26] = vqaddq_s16(step2[26], step2[37]);
2592   out[27] = vqaddq_s16(step2[27], step2[36]);
2593   out[28] = vqaddq_s16(step2[28], step2[35]);
2594   out[29] = vqaddq_s16(step2[29], step2[34]);
2595   out[30] = vqaddq_s16(step2[30], step2[33]);
2596   out[31] = vqaddq_s16(step2[31], step2[32]);
2597   out[32] = vqsubq_s16(step2[31], step2[32]);
2598   out[33] = vqsubq_s16(step2[30], step2[33]);
2599   out[34] = vqsubq_s16(step2[29], step2[34]);
2600   out[35] = vqsubq_s16(step2[28], step2[35]);
2601   out[36] = vqsubq_s16(step2[27], step2[36]);
2602   out[37] = vqsubq_s16(step2[26], step2[37]);
2603   out[38] = vqsubq_s16(step2[25], step2[38]);
2604   out[39] = vqsubq_s16(step2[24], step2[39]);
2605   out[40] = vqsubq_s16(step2[23], step2[40]);
2606   out[41] = vqsubq_s16(step2[22], step2[41]);
2607   out[42] = vqsubq_s16(step2[21], step2[42]);
2608   out[43] = vqsubq_s16(step2[20], step2[43]);
2609   out[44] = vqsubq_s16(step2[19], step2[44]);
2610   out[45] = vqsubq_s16(step2[18], step2[45]);
2611   out[46] = vqsubq_s16(step2[17], step2[46]);
2612   out[47] = vqsubq_s16(step2[16], step2[47]);
2613   out[48] = vqsubq_s16(step2[15], step2[48]);
2614   out[49] = vqsubq_s16(step2[14], step2[49]);
2615   out[50] = vqsubq_s16(step2[13], step2[50]);
2616   out[51] = vqsubq_s16(step2[12], step2[51]);
2617   out[52] = vqsubq_s16(step2[11], step2[52]);
2618   out[53] = vqsubq_s16(step2[10], step2[53]);
2619   out[54] = vqsubq_s16(step2[9], step2[54]);
2620   out[55] = vqsubq_s16(step2[8], step2[55]);
2621   out[56] = vqsubq_s16(step2[7], step2[56]);
2622   out[57] = vqsubq_s16(step2[6], step2[57]);
2623   out[58] = vqsubq_s16(step2[5], step2[58]);
2624   out[59] = vqsubq_s16(step2[4], step2[59]);
2625   out[60] = vqsubq_s16(step2[3], step2[60]);
2626   out[61] = vqsubq_s16(step2[2], step2[61]);
2627   out[62] = vqsubq_s16(step2[1], step2[62]);
2628   out[63] = vqsubq_s16(step2[0], step2[63]);
2629 }
2630 
// 64-point inverse transform for the degenerate case where only the DC
// coefficient (input[0]) can be non-zero. With every other coefficient
// zero, stages 1-6 of the full idct64 reduce to a single scaling of the
// DC value by cospi[32] with INV_COS_BIT rounding, and stages 7-11 then
// make every one of the 64 outputs equal to that scaled value.
//
// input   - coefficient vectors; only input[0] is read.
// out     - receives 64 identical result vectors.
// cos_bit - fixed-point precision used to select the cosine table.
static inline void idct64_low1_neon(int16x8_t *input, int16x8_t *out,
                                    int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1;
  int32x4_t t32[2];

  // stage 1
  // stage 2
  // stage 3
  // stage 4
  // stage 5
  // stage 6

  // Widening multiply by cospi[32], then narrow back with rounding.
  t32[0] = vmull_n_s16(vget_low_s16(input[0]), cospi[32]);
  t32[1] = vmull_n_s16(vget_high_s16(input[0]), cospi[32]);

  step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
                       vrshrn_n_s32(t32[1], INV_COS_BIT));

  // stage 7
  // stage 8
  // stage 9
  // stage 10
  // stage 11
  // Every output lane is the scaled DC value; a loop replaces the 64
  // hand-unrolled assignments of the original without changing behavior.
  for (int i = 0; i < 64; ++i) {
    out[i] = step1;
  }
}
2719 
// 64-point inverse transform for the case where only the first 8
// coefficients of `in` can be non-zero (in[0]..in[7] are the only elements
// read). Because all remaining inputs are zero, the early butterfly stages
// collapse: additions with zero become plain copies, which is why stages
// 3-7 below mostly duplicate values instead of performing full add/sub
// butterflies. All 64 elements of `out` are written.
//
// in      - coefficient vectors; only in[0]..in[7] are read.
// out     - receives the 64 transformed vectors.
// cos_bit - fixed-point precision used to select the cosine table.
static inline void idct64_low8_neon(int16x8_t *in, int16x8_t *out,
                                    int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step2[64], step1[64];

  // Packed cosine constants consumed by the lane-indexed butterfly helpers
  // (btf_16_lane_*_neon); c4-c6 hold negated cosines for the mirrored
  // butterflies. NOTE(review): the lane pairings differ from the low16/low32
  // variants because this reduced path needs fewer distinct rotations.
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
                                      (int16_t)cospi[36], (int16_t)cospi[28]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
                                      (int16_t)cospi[52], (int16_t)cospi[12]);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);
  const int16x4_t c4 =
      set_s16x4_neon((int16_t)(-cospi[36]), (int16_t)(-cospi[28]),
                     (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
  const int16x4_t c5 =
      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
  const int16x4_t c6 =
      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));

  // stage 1
  // stage 2

  // Place the 8 non-zero coefficients into their butterfly positions and
  // apply the first single-input butterflies (each produces two outputs
  // from one input since its partner coefficient is zero).
  step2[0] = in[0];
  step2[8] = in[4];
  step2[16] = in[2];
  step2[24] = in[6];

  btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
  btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
  btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
  btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);

  // stage 3

  step1[0] = step2[0];
  step1[8] = step2[8];

  btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
  btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);

  // Zero partners: the add/sub pairs of the full transform degenerate to
  // copies (e.g. step1[33] = step2[32] because step2[33] would be zero).
  step1[32] = step2[32];
  step1[33] = step2[32];
  step1[38] = step2[39];
  step1[39] = step2[39];
  step1[40] = step2[40];
  step1[41] = step2[40];
  step1[46] = step2[47];
  step1[47] = step2[47];
  step1[48] = step2[48];
  step1[49] = step2[48];
  step1[54] = step2[55];
  step1[55] = step2[55];
  step1[56] = step2[56];
  step1[57] = step2[56];
  step1[62] = step2[63];
  step1[63] = step2[63];

  // stage 4

  step2[0] = step1[0];

  btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
  btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
  btf_16_lane_1_0_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
  btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
  btf_16_lane_3_2_neon(step1[46], step1[49], c4, &step2[46], &step2[49]);

  step2[16] = step1[16];
  step2[17] = step1[16];
  step2[22] = step1[23];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[25] = step1[24];
  step2[30] = step1[31];
  step2[31] = step1[31];
  step2[32] = step1[32];
  step2[39] = step1[39];
  step2[40] = step1[40];
  step2[47] = step1[47];
  step2[48] = step1[48];
  step2[55] = step1[55];
  step2[56] = step1[56];
  step2[63] = step1[63];

  // stage 5

  step1[0] = step2[0];

  btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
  btf_16_lane_3_2_neon(step2[22], step2[25], c5, &step1[22], &step1[25]);

  step1[8] = step2[8];
  step1[9] = step2[8];
  step1[14] = step2[15];
  step1[15] = step2[15];

  step1[16] = step2[16];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[31] = step2[31];
  step1[32] = step2[32];
  step1[33] = step2[33];
  step1[34] = step2[33];
  step1[35] = step2[32];
  step1[36] = step2[39];
  step1[37] = step2[38];
  step1[38] = step2[38];
  step1[39] = step2[39];
  step1[40] = step2[40];
  step1[41] = step2[41];
  step1[42] = step2[41];
  step1[43] = step2[40];
  step1[44] = step2[47];
  step1[45] = step2[46];
  step1[46] = step2[46];
  step1[47] = step2[47];
  step1[48] = step2[48];
  step1[49] = step2[49];
  step1[50] = step2[49];
  step1[51] = step2[48];
  step1[52] = step2[55];
  step1[53] = step2[54];
  step1[54] = step2[54];
  step1[55] = step2[55];
  step1[56] = step2[56];
  step1[57] = step2[57];
  step1[58] = step2[57];
  step1[59] = step2[56];
  step1[60] = step2[63];
  step1[61] = step2[62];
  step1[62] = step2[62];
  step1[63] = step2[63];

  // stage 6

  btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
  btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
  btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
  btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
  btf_16_lane_1_0_neon(step1[36], step1[59], c5, &step2[36], &step2[59]);
  btf_16_lane_1_0_neon(step1[37], step1[58], c5, &step2[37], &step2[58]);
  btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
  btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
  btf_16_lane_3_2_neon(step1[44], step1[51], c5, &step2[44], &step2[51]);
  btf_16_lane_3_2_neon(step1[45], step1[50], c5, &step2[45], &step2[50]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  step2[16] = step1[16];
  step2[17] = step1[17];
  step2[18] = step1[17];
  step2[19] = step1[16];
  step2[20] = step1[23];
  step2[21] = step1[22];
  step2[22] = step1[22];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[25] = step1[25];
  step2[26] = step1[25];
  step2[27] = step1[24];
  step2[28] = step1[31];
  step2[29] = step1[30];
  step2[30] = step1[30];
  step2[31] = step1[31];
  step2[32] = step1[32];
  step2[33] = step1[33];
  step2[38] = step1[38];
  step2[39] = step1[39];
  step2[40] = step1[40];
  step2[41] = step1[41];
  step2[46] = step1[46];
  step2[47] = step1[47];
  step2[48] = step1[48];
  step2[49] = step1[49];
  step2[54] = step1[54];
  step2[55] = step1[55];
  step2[56] = step1[56];
  step2[57] = step1[57];
  step2[62] = step1[62];
  step2[63] = step1[63];

  // stage 7

  btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
  btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
  btf_16_lane_3_2_neon(step2[20], step2[27], c6, &step1[20], &step1[27]);
  btf_16_lane_3_2_neon(step2[21], step2[26], c6, &step1[21], &step1[26]);

  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[1];
  step1[3] = step2[0];
  step1[8] = step2[8];
  step1[9] = step2[9];
  step1[10] = step2[9];
  step1[11] = step2[8];
  step1[12] = step2[15];
  step1[13] = step2[14];
  step1[14] = step2[14];
  step1[15] = step2[15];
  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];
  // From here on enough terms are non-zero that the full saturating
  // add/sub butterflies are required.
  step1[32] = vqaddq_s16(step2[32], step2[39]);
  step1[33] = vqaddq_s16(step2[33], step2[38]);
  step1[34] = vqaddq_s16(step2[34], step2[37]);
  step1[35] = vqaddq_s16(step2[35], step2[36]);
  step1[36] = vqsubq_s16(step2[35], step2[36]);
  step1[37] = vqsubq_s16(step2[34], step2[37]);
  step1[38] = vqsubq_s16(step2[33], step2[38]);
  step1[39] = vqsubq_s16(step2[32], step2[39]);
  step1[40] = vqsubq_s16(step2[47], step2[40]);
  step1[41] = vqsubq_s16(step2[46], step2[41]);
  step1[42] = vqsubq_s16(step2[45], step2[42]);
  step1[43] = vqsubq_s16(step2[44], step2[43]);
  step1[44] = vqaddq_s16(step2[43], step2[44]);
  step1[45] = vqaddq_s16(step2[42], step2[45]);
  step1[46] = vqaddq_s16(step2[41], step2[46]);
  step1[47] = vqaddq_s16(step2[40], step2[47]);
  step1[48] = vqaddq_s16(step2[48], step2[55]);
  step1[49] = vqaddq_s16(step2[49], step2[54]);
  step1[50] = vqaddq_s16(step2[50], step2[53]);
  step1[51] = vqaddq_s16(step2[51], step2[52]);
  step1[52] = vqsubq_s16(step2[51], step2[52]);
  step1[53] = vqsubq_s16(step2[50], step2[53]);
  step1[54] = vqsubq_s16(step2[49], step2[54]);
  step1[55] = vqsubq_s16(step2[48], step2[55]);
  step1[56] = vqsubq_s16(step2[63], step2[56]);
  step1[57] = vqsubq_s16(step2[62], step2[57]);
  step1[58] = vqsubq_s16(step2[61], step2[58]);
  step1[59] = vqsubq_s16(step2[60], step2[59]);
  step1[60] = vqaddq_s16(step2[59], step2[60]);
  step1[61] = vqaddq_s16(step2[58], step2[61]);
  step1[62] = vqaddq_s16(step2[57], step2[62]);
  step1[63] = vqaddq_s16(step2[56], step2[63]);

  // stage 8

  btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
  btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
  btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
  btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
  btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
  btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
  btf_16_lane_3_2_neon(step1[40], step1[55], c6, &step2[40], &step2[55]);
  btf_16_lane_3_2_neon(step1[41], step1[54], c6, &step2[41], &step2[54]);
  btf_16_lane_3_2_neon(step1[42], step1[53], c6, &step2[42], &step2[53]);
  btf_16_lane_3_2_neon(step1[43], step1[52], c6, &step2[43], &step2[52]);

  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[3];
  step2[5] = step1[2];
  step2[6] = step1[1];
  step2[7] = step1[0];
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];
  step2[16] = vqaddq_s16(step1[16], step1[23]);
  step2[17] = vqaddq_s16(step1[17], step1[22]);
  step2[18] = vqaddq_s16(step1[18], step1[21]);
  step2[19] = vqaddq_s16(step1[19], step1[20]);
  step2[20] = vqsubq_s16(step1[19], step1[20]);
  step2[21] = vqsubq_s16(step1[18], step1[21]);
  step2[22] = vqsubq_s16(step1[17], step1[22]);
  step2[23] = vqsubq_s16(step1[16], step1[23]);
  step2[24] = vqsubq_s16(step1[31], step1[24]);
  step2[25] = vqsubq_s16(step1[30], step1[25]);
  step2[26] = vqsubq_s16(step1[29], step1[26]);
  step2[27] = vqsubq_s16(step1[28], step1[27]);
  step2[28] = vqaddq_s16(step1[28], step1[27]);
  step2[29] = vqaddq_s16(step1[29], step1[26]);
  step2[30] = vqaddq_s16(step1[30], step1[25]);
  step2[31] = vqaddq_s16(step1[31], step1[24]);
  step2[32] = step1[32];
  step2[33] = step1[33];
  step2[34] = step1[34];
  step2[35] = step1[35];
  step2[44] = step1[44];
  step2[45] = step1[45];
  step2[46] = step1[46];
  step2[47] = step1[47];
  step2[48] = step1[48];
  step2[49] = step1[49];
  step2[50] = step1[50];
  step2[51] = step1[51];
  step2[60] = step1[60];
  step2[61] = step1[61];
  step2[62] = step1[62];
  step2[63] = step1[63];

  // stage 9 -- shared with the other idct64 variants via a helper.
  idct64_stage9_neon(step2, step1, cos_bit);

  // stage 10 -- shared helper as well.
  idct64_stage10_neon(step1, step2, cos_bit);

  // stage 11
  // Final butterfly: out[i] = step2[i] + step2[63 - i] for the first half
  // and the mirrored saturating difference for the second half.

  out[0] = vqaddq_s16(step2[0], step2[63]);
  out[1] = vqaddq_s16(step2[1], step2[62]);
  out[2] = vqaddq_s16(step2[2], step2[61]);
  out[3] = vqaddq_s16(step2[3], step2[60]);
  out[4] = vqaddq_s16(step2[4], step2[59]);
  out[5] = vqaddq_s16(step2[5], step2[58]);
  out[6] = vqaddq_s16(step2[6], step2[57]);
  out[7] = vqaddq_s16(step2[7], step2[56]);
  out[8] = vqaddq_s16(step2[8], step2[55]);
  out[9] = vqaddq_s16(step2[9], step2[54]);
  out[10] = vqaddq_s16(step2[10], step2[53]);
  out[11] = vqaddq_s16(step2[11], step2[52]);
  out[12] = vqaddq_s16(step2[12], step2[51]);
  out[13] = vqaddq_s16(step2[13], step2[50]);
  out[14] = vqaddq_s16(step2[14], step2[49]);
  out[15] = vqaddq_s16(step2[15], step2[48]);
  out[16] = vqaddq_s16(step2[16], step2[47]);
  out[17] = vqaddq_s16(step2[17], step2[46]);
  out[18] = vqaddq_s16(step2[18], step2[45]);
  out[19] = vqaddq_s16(step2[19], step2[44]);
  out[20] = vqaddq_s16(step2[20], step2[43]);
  out[21] = vqaddq_s16(step2[21], step2[42]);
  out[22] = vqaddq_s16(step2[22], step2[41]);
  out[23] = vqaddq_s16(step2[23], step2[40]);
  out[24] = vqaddq_s16(step2[24], step2[39]);
  out[25] = vqaddq_s16(step2[25], step2[38]);
  out[26] = vqaddq_s16(step2[26], step2[37]);
  out[27] = vqaddq_s16(step2[27], step2[36]);
  out[28] = vqaddq_s16(step2[28], step2[35]);
  out[29] = vqaddq_s16(step2[29], step2[34]);
  out[30] = vqaddq_s16(step2[30], step2[33]);
  out[31] = vqaddq_s16(step2[31], step2[32]);
  out[32] = vqsubq_s16(step2[31], step2[32]);
  out[33] = vqsubq_s16(step2[30], step2[33]);
  out[34] = vqsubq_s16(step2[29], step2[34]);
  out[35] = vqsubq_s16(step2[28], step2[35]);
  out[36] = vqsubq_s16(step2[27], step2[36]);
  out[37] = vqsubq_s16(step2[26], step2[37]);
  out[38] = vqsubq_s16(step2[25], step2[38]);
  out[39] = vqsubq_s16(step2[24], step2[39]);
  out[40] = vqsubq_s16(step2[23], step2[40]);
  out[41] = vqsubq_s16(step2[22], step2[41]);
  out[42] = vqsubq_s16(step2[21], step2[42]);
  out[43] = vqsubq_s16(step2[20], step2[43]);
  out[44] = vqsubq_s16(step2[19], step2[44]);
  out[45] = vqsubq_s16(step2[18], step2[45]);
  out[46] = vqsubq_s16(step2[17], step2[46]);
  out[47] = vqsubq_s16(step2[16], step2[47]);
  out[48] = vqsubq_s16(step2[15], step2[48]);
  out[49] = vqsubq_s16(step2[14], step2[49]);
  out[50] = vqsubq_s16(step2[13], step2[50]);
  out[51] = vqsubq_s16(step2[12], step2[51]);
  out[52] = vqsubq_s16(step2[11], step2[52]);
  out[53] = vqsubq_s16(step2[10], step2[53]);
  out[54] = vqsubq_s16(step2[9], step2[54]);
  out[55] = vqsubq_s16(step2[8], step2[55]);
  out[56] = vqsubq_s16(step2[7], step2[56]);
  out[57] = vqsubq_s16(step2[6], step2[57]);
  out[58] = vqsubq_s16(step2[5], step2[58]);
  out[59] = vqsubq_s16(step2[4], step2[59]);
  out[60] = vqsubq_s16(step2[3], step2[60]);
  out[61] = vqsubq_s16(step2[2], step2[61]);
  out[62] = vqsubq_s16(step2[1], step2[62]);
  out[63] = vqsubq_s16(step2[0], step2[63]);
}
3096 
// Partial 64-point 1-D inverse DCT (low-bitdepth, NEON), used when the EOB
// indicates that only the first 16 coefficients can be nonzero: only
// in[0]..in[15] are read (see the stage-2 assignments below), so the early
// butterflies collapse to copies and single-input rotations.
// `in`/`out` each hold 64 int16x8_t vectors (8 lanes processed in parallel);
// `cos_bit` selects the cosine table via cospi_arr().
static inline void idct64_low16_neon(int16x8_t *in, int16x8_t *out,
                                     int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step2[64], step1[64];

  // Cosine constants packed four per vector so the lane-indexed butterfly
  // helpers (btf_16_lane_*_neon) can select a cospi pair by lane number.
  // c4..c7 are the negated counterparts of c0..c3.
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
                                      (int16_t)cospi[36], (int16_t)cospi[28]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
                                      (int16_t)cospi[52], (int16_t)cospi[12]);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);
  const int16x4_t c4 =
      set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]),
                     (int16_t)(-cospi[36]), (int16_t)(-cospi[28]));
  const int16x4_t c5 =
      set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]),
                     (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
  const int16x4_t c6 =
      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
  const int16x4_t c7 =
      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));

  // stage 1
  // stage 2
  // With only 16 nonzero inputs, the even half of stage 2 is a plain copy
  // and the odd half needs one single-input rotation per coefficient.

  step2[0] = in[0];
  step2[4] = in[8];
  step2[8] = in[4];
  step2[12] = in[12];
  step2[16] = in[2];
  step2[20] = in[10];
  step2[24] = in[6];
  step2[28] = in[14];

  btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
  btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]);
  btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]);
  btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
  btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
  btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]);
  btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]);
  btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);

  // stage 3
  // step1[32..63]: each pair copies the single surviving stage-2 value —
  // presumably because the add/sub partner is known-zero in the low16 case,
  // so the butterfly degenerates to a duplicate (matches the low8 variant).

  step1[0] = step2[0];
  step1[4] = step2[4];
  step1[8] = step2[8];
  step1[12] = step2[12];

  btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
  btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]);
  btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);
  btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]);

  step1[32] = step2[32];
  step1[33] = step2[32];
  step1[34] = step2[35];
  step1[35] = step2[35];
  step1[36] = step2[36];
  step1[37] = step2[36];
  step1[38] = step2[39];
  step1[39] = step2[39];
  step1[40] = step2[40];
  step1[41] = step2[40];
  step1[42] = step2[43];
  step1[43] = step2[43];
  step1[44] = step2[44];
  step1[45] = step2[44];
  step1[46] = step2[47];
  step1[47] = step2[47];
  step1[48] = step2[48];
  step1[49] = step2[48];
  step1[50] = step2[51];
  step1[51] = step2[51];
  step1[52] = step2[52];
  step1[53] = step2[52];
  step1[54] = step2[55];
  step1[55] = step2[55];
  step1[56] = step2[56];
  step1[57] = step2[56];
  step1[58] = step2[59];
  step1[59] = step2[59];
  step1[60] = step2[60];
  step1[61] = step2[60];
  step1[62] = step2[63];
  step1[63] = step2[63];

  // stage 4
  // Two-input rotations start here; the lane-indexed helpers consume the
  // packed constants c0/c1 (and their negations c4/c5).

  step2[0] = step1[0];
  step2[4] = step1[4];

  btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
  btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]);
  btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
  btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]);
  btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]);
  btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
  btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
  btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]);
  btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]);
  btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]);

  step2[16] = step1[16];
  step2[17] = step1[16];
  step2[18] = step1[19];
  step2[19] = step1[19];
  step2[20] = step1[20];
  step2[21] = step1[20];
  step2[22] = step1[23];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[25] = step1[24];
  step2[26] = step1[27];
  step2[27] = step1[27];
  step2[28] = step1[28];
  step2[29] = step1[28];
  step2[30] = step1[31];
  step2[31] = step1[31];
  step2[32] = step1[32];
  step2[35] = step1[35];
  step2[36] = step1[36];
  step2[39] = step1[39];
  step2[40] = step1[40];
  step2[43] = step1[43];
  step2[44] = step1[44];
  step2[47] = step1[47];
  step2[48] = step1[48];
  step2[51] = step1[51];
  step2[52] = step1[52];
  step2[55] = step1[55];
  step2[56] = step1[56];
  step2[59] = step1[59];
  step2[60] = step1[60];
  step2[63] = step1[63];

  // stage 5
  // First saturating add/sub butterflies for the 32..63 half
  // (vqaddq/vqsubq keep intermediates within int16 range).

  step1[0] = step2[0];

  btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
  btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
  btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]);
  btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]);
  btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]);

  step1[8] = step2[8];
  step1[9] = step2[8];
  step1[10] = step2[11];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[13] = step2[12];
  step1[14] = step2[15];
  step1[15] = step2[15];
  step1[16] = step2[16];
  step1[19] = step2[19];
  step1[20] = step2[20];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];
  step1[31] = step2[31];
  step1[32] = vqaddq_s16(step2[32], step2[35]);
  step1[33] = vqaddq_s16(step2[33], step2[34]);
  step1[34] = vqsubq_s16(step2[33], step2[34]);
  step1[35] = vqsubq_s16(step2[32], step2[35]);
  step1[36] = vqsubq_s16(step2[39], step2[36]);
  step1[37] = vqsubq_s16(step2[38], step2[37]);
  step1[38] = vqaddq_s16(step2[38], step2[37]);
  step1[39] = vqaddq_s16(step2[39], step2[36]);
  step1[40] = vqaddq_s16(step2[40], step2[43]);
  step1[41] = vqaddq_s16(step2[41], step2[42]);
  step1[42] = vqsubq_s16(step2[41], step2[42]);
  step1[43] = vqsubq_s16(step2[40], step2[43]);
  step1[44] = vqsubq_s16(step2[47], step2[44]);
  step1[45] = vqsubq_s16(step2[46], step2[45]);
  step1[46] = vqaddq_s16(step2[46], step2[45]);
  step1[47] = vqaddq_s16(step2[47], step2[44]);
  step1[48] = vqaddq_s16(step2[48], step2[51]);
  step1[49] = vqaddq_s16(step2[49], step2[50]);
  step1[50] = vqsubq_s16(step2[49], step2[50]);
  step1[51] = vqsubq_s16(step2[48], step2[51]);
  step1[52] = vqsubq_s16(step2[55], step2[52]);
  step1[53] = vqsubq_s16(step2[54], step2[53]);
  step1[54] = vqaddq_s16(step2[54], step2[53]);
  step1[55] = vqaddq_s16(step2[55], step2[52]);
  step1[56] = vqaddq_s16(step2[56], step2[59]);
  step1[57] = vqaddq_s16(step2[57], step2[58]);
  step1[58] = vqsubq_s16(step2[57], step2[58]);
  step1[59] = vqsubq_s16(step2[56], step2[59]);
  step1[60] = vqsubq_s16(step2[63], step2[60]);
  step1[61] = vqsubq_s16(step2[62], step2[61]);
  step1[62] = vqaddq_s16(step2[62], step2[61]);
  step1[63] = vqaddq_s16(step2[63], step2[60]);

  // stage 6

  btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
  btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
  btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]);
  btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
  btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
  btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]);
  btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]);
  btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
  btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
  btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]);
  btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]);

  step2[4] = step1[4];
  step2[5] = step1[4];
  step2[6] = step1[7];
  step2[7] = step1[7];
  step2[8] = step1[8];
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];
  step2[16] = vqaddq_s16(step1[16], step1[19]);
  step2[17] = vqaddq_s16(step1[17], step1[18]);
  step2[18] = vqsubq_s16(step1[17], step1[18]);
  step2[19] = vqsubq_s16(step1[16], step1[19]);
  step2[20] = vqsubq_s16(step1[23], step1[20]);
  step2[21] = vqsubq_s16(step1[22], step1[21]);
  step2[22] = vqaddq_s16(step1[22], step1[21]);
  step2[23] = vqaddq_s16(step1[23], step1[20]);
  step2[24] = vqaddq_s16(step1[24], step1[27]);
  step2[25] = vqaddq_s16(step1[25], step1[26]);
  step2[26] = vqsubq_s16(step1[25], step1[26]);
  step2[27] = vqsubq_s16(step1[24], step1[27]);
  step2[28] = vqsubq_s16(step1[31], step1[28]);
  step2[29] = vqsubq_s16(step1[30], step1[29]);
  step2[30] = vqaddq_s16(step1[30], step1[29]);
  step2[31] = vqaddq_s16(step1[31], step1[28]);
  step2[32] = step1[32];
  step2[33] = step1[33];
  step2[38] = step1[38];
  step2[39] = step1[39];
  step2[40] = step1[40];
  step2[41] = step1[41];
  step2[46] = step1[46];
  step2[47] = step1[47];
  step2[48] = step1[48];
  step2[49] = step1[49];
  step2[54] = step1[54];
  step2[55] = step1[55];
  step2[56] = step1[56];
  step2[57] = step1[57];
  step2[62] = step1[62];
  step2[63] = step1[63];

  // stage 7

  btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
  btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
  btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
  btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]);
  btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]);

  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[1];
  step1[3] = step2[0];
  step1[4] = step2[4];
  step1[7] = step2[7];
  step1[8] = vqaddq_s16(step2[8], step2[11]);
  step1[9] = vqaddq_s16(step2[9], step2[10]);
  step1[10] = vqsubq_s16(step2[9], step2[10]);
  step1[11] = vqsubq_s16(step2[8], step2[11]);
  step1[12] = vqsubq_s16(step2[15], step2[12]);
  step1[13] = vqsubq_s16(step2[14], step2[13]);
  step1[14] = vqaddq_s16(step2[14], step2[13]);
  step1[15] = vqaddq_s16(step2[15], step2[12]);
  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];
  step1[32] = vqaddq_s16(step2[32], step2[39]);
  step1[33] = vqaddq_s16(step2[33], step2[38]);
  step1[34] = vqaddq_s16(step2[34], step2[37]);
  step1[35] = vqaddq_s16(step2[35], step2[36]);
  step1[36] = vqsubq_s16(step2[35], step2[36]);
  step1[37] = vqsubq_s16(step2[34], step2[37]);
  step1[38] = vqsubq_s16(step2[33], step2[38]);
  step1[39] = vqsubq_s16(step2[32], step2[39]);
  step1[40] = vqsubq_s16(step2[47], step2[40]);
  step1[41] = vqsubq_s16(step2[46], step2[41]);
  step1[42] = vqsubq_s16(step2[45], step2[42]);
  step1[43] = vqsubq_s16(step2[44], step2[43]);
  step1[44] = vqaddq_s16(step2[43], step2[44]);
  step1[45] = vqaddq_s16(step2[42], step2[45]);
  step1[46] = vqaddq_s16(step2[41], step2[46]);
  step1[47] = vqaddq_s16(step2[40], step2[47]);
  step1[48] = vqaddq_s16(step2[48], step2[55]);
  step1[49] = vqaddq_s16(step2[49], step2[54]);
  step1[50] = vqaddq_s16(step2[50], step2[53]);
  step1[51] = vqaddq_s16(step2[51], step2[52]);
  step1[52] = vqsubq_s16(step2[51], step2[52]);
  step1[53] = vqsubq_s16(step2[50], step2[53]);
  step1[54] = vqsubq_s16(step2[49], step2[54]);
  step1[55] = vqsubq_s16(step2[48], step2[55]);
  step1[56] = vqsubq_s16(step2[63], step2[56]);
  step1[57] = vqsubq_s16(step2[62], step2[57]);
  step1[58] = vqsubq_s16(step2[61], step2[58]);
  step1[59] = vqsubq_s16(step2[60], step2[59]);
  step1[60] = vqaddq_s16(step2[59], step2[60]);
  step1[61] = vqaddq_s16(step2[58], step2[61]);
  step1[62] = vqaddq_s16(step2[57], step2[62]);
  step1[63] = vqaddq_s16(step2[56], step2[63]);

  // stage 8

  btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
  btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
  btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
  btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
  btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
  btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
  btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]);
  btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]);
  btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]);
  btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]);

  step2[0] = vqaddq_s16(step1[0], step1[7]);
  step2[1] = vqaddq_s16(step1[1], step1[6]);
  step2[2] = vqaddq_s16(step1[2], step1[5]);
  step2[3] = vqaddq_s16(step1[3], step1[4]);
  step2[4] = vqsubq_s16(step1[3], step1[4]);
  step2[5] = vqsubq_s16(step1[2], step1[5]);
  step2[6] = vqsubq_s16(step1[1], step1[6]);
  step2[7] = vqsubq_s16(step1[0], step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];
  step2[16] = vqaddq_s16(step1[16], step1[23]);
  step2[17] = vqaddq_s16(step1[17], step1[22]);
  step2[18] = vqaddq_s16(step1[18], step1[21]);
  step2[19] = vqaddq_s16(step1[19], step1[20]);
  step2[20] = vqsubq_s16(step1[19], step1[20]);
  step2[21] = vqsubq_s16(step1[18], step1[21]);
  step2[22] = vqsubq_s16(step1[17], step1[22]);
  step2[23] = vqsubq_s16(step1[16], step1[23]);
  step2[24] = vqsubq_s16(step1[31], step1[24]);
  step2[25] = vqsubq_s16(step1[30], step1[25]);
  step2[26] = vqsubq_s16(step1[29], step1[26]);
  step2[27] = vqsubq_s16(step1[28], step1[27]);
  step2[28] = vqaddq_s16(step1[28], step1[27]);
  step2[29] = vqaddq_s16(step1[29], step1[26]);
  step2[30] = vqaddq_s16(step1[30], step1[25]);
  step2[31] = vqaddq_s16(step1[31], step1[24]);
  step2[32] = step1[32];
  step2[33] = step1[33];
  step2[34] = step1[34];
  step2[35] = step1[35];
  step2[44] = step1[44];
  step2[45] = step1[45];
  step2[46] = step1[46];
  step2[47] = step1[47];
  step2[48] = step1[48];
  step2[49] = step1[49];
  step2[50] = step1[50];
  step2[51] = step1[51];
  step2[60] = step1[60];
  step2[61] = step1[61];
  step2[62] = step1[62];
  step2[63] = step1[63];

  // stage 9
  // Stages 9 and 10 are shared with the other idct64 variants.
  idct64_stage9_neon(step2, step1, cos_bit);

  // stage 10
  idct64_stage10_neon(step1, step2, cos_bit);

  // stage 11
  // Final mirrored add/sub: out[k] and out[63-k] come from the same pair.

  out[0] = vqaddq_s16(step2[0], step2[63]);
  out[1] = vqaddq_s16(step2[1], step2[62]);
  out[2] = vqaddq_s16(step2[2], step2[61]);
  out[3] = vqaddq_s16(step2[3], step2[60]);
  out[4] = vqaddq_s16(step2[4], step2[59]);
  out[5] = vqaddq_s16(step2[5], step2[58]);
  out[6] = vqaddq_s16(step2[6], step2[57]);
  out[7] = vqaddq_s16(step2[7], step2[56]);
  out[8] = vqaddq_s16(step2[8], step2[55]);
  out[9] = vqaddq_s16(step2[9], step2[54]);
  out[10] = vqaddq_s16(step2[10], step2[53]);
  out[11] = vqaddq_s16(step2[11], step2[52]);
  out[12] = vqaddq_s16(step2[12], step2[51]);
  out[13] = vqaddq_s16(step2[13], step2[50]);
  out[14] = vqaddq_s16(step2[14], step2[49]);
  out[15] = vqaddq_s16(step2[15], step2[48]);
  out[16] = vqaddq_s16(step2[16], step2[47]);
  out[17] = vqaddq_s16(step2[17], step2[46]);
  out[18] = vqaddq_s16(step2[18], step2[45]);
  out[19] = vqaddq_s16(step2[19], step2[44]);
  out[20] = vqaddq_s16(step2[20], step2[43]);
  out[21] = vqaddq_s16(step2[21], step2[42]);
  out[22] = vqaddq_s16(step2[22], step2[41]);
  out[23] = vqaddq_s16(step2[23], step2[40]);
  out[24] = vqaddq_s16(step2[24], step2[39]);
  out[25] = vqaddq_s16(step2[25], step2[38]);
  out[26] = vqaddq_s16(step2[26], step2[37]);
  out[27] = vqaddq_s16(step2[27], step2[36]);
  out[28] = vqaddq_s16(step2[28], step2[35]);
  out[29] = vqaddq_s16(step2[29], step2[34]);
  out[30] = vqaddq_s16(step2[30], step2[33]);
  out[31] = vqaddq_s16(step2[31], step2[32]);
  out[32] = vqsubq_s16(step2[31], step2[32]);
  out[33] = vqsubq_s16(step2[30], step2[33]);
  out[34] = vqsubq_s16(step2[29], step2[34]);
  out[35] = vqsubq_s16(step2[28], step2[35]);
  out[36] = vqsubq_s16(step2[27], step2[36]);
  out[37] = vqsubq_s16(step2[26], step2[37]);
  out[38] = vqsubq_s16(step2[25], step2[38]);
  out[39] = vqsubq_s16(step2[24], step2[39]);
  out[40] = vqsubq_s16(step2[23], step2[40]);
  out[41] = vqsubq_s16(step2[22], step2[41]);
  out[42] = vqsubq_s16(step2[21], step2[42]);
  out[43] = vqsubq_s16(step2[20], step2[43]);
  out[44] = vqsubq_s16(step2[19], step2[44]);
  out[45] = vqsubq_s16(step2[18], step2[45]);
  out[46] = vqsubq_s16(step2[17], step2[46]);
  out[47] = vqsubq_s16(step2[16], step2[47]);
  out[48] = vqsubq_s16(step2[15], step2[48]);
  out[49] = vqsubq_s16(step2[14], step2[49]);
  out[50] = vqsubq_s16(step2[13], step2[50]);
  out[51] = vqsubq_s16(step2[12], step2[51]);
  out[52] = vqsubq_s16(step2[11], step2[52]);
  out[53] = vqsubq_s16(step2[10], step2[53]);
  out[54] = vqsubq_s16(step2[9], step2[54]);
  out[55] = vqsubq_s16(step2[8], step2[55]);
  out[56] = vqsubq_s16(step2[7], step2[56]);
  out[57] = vqsubq_s16(step2[6], step2[57]);
  out[58] = vqsubq_s16(step2[5], step2[58]);
  out[59] = vqsubq_s16(step2[4], step2[59]);
  out[60] = vqsubq_s16(step2[3], step2[60]);
  out[61] = vqsubq_s16(step2[2], step2[61]);
  out[62] = vqsubq_s16(step2[1], step2[62]);
  out[63] = vqsubq_s16(step2[0], step2[63]);
}
3546 
// Functions for blocks with eob at DC and within
// topleft 8x8, 16x16, 32x32 corner.
// Indexed as [tx size index][1-D tx type][zero-range index]; the last index
// is derived from the eob via lowbd_txfm_all_1d_zeros_idx (see callers) and
// selects a progressively fuller kernel (low1 -> low8 -> low16 -> full).
// NULL marks combinations with no specialized kernel here — presumably
// handled by other code paths (e.g. 4-point sizes, identity, large ADST);
// callers assert the selected entry is non-NULL.
static const transform_neon
    lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { idct8_low1_neon, idct8_neon, NULL, NULL },
        { iadst8_low1_neon, iadst8_neon, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      {
          { idct16_low1_neon, idct16_low8_neon, idct16_neon, NULL },
          { iadst16_low1_neon, iadst16_low8_neon, iadst16_neon, NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { idct32_low1_neon, idct32_low8_neon, idct32_low16_neon, idct32_neon },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      { { idct64_low1_neon, idct64_low8_neon, idct64_low16_neon,
          idct64_low32_neon },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };
3572 
// 2-D inverse identity transform (low-bitdepth, NEON): applies the row
// identity scaling, transposes 8x8 tiles into column order, applies the
// column identity scaling, and accumulates the result into `output`.
// tx_type is unused — both 1-D passes are identity by construction.
static inline void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
                                                  uint8_t *output, int stride,
                                                  TX_TYPE tx_type,
                                                  TX_SIZE tx_size, int eob) {
  (void)tx_type;
  int16x8_t row_buf[32 * 4];
  int16x8_t col_buf[32 * 4];
  int eobx, eoby;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  // Total number of int16x8_t vectors covering the block.
  const int num_vecs = txfm_size_col * txfm_size_row >> 3;
  lowbd_inv_txfm2d_memset_neon(&row_buf[0], num_vecs, 0);
  lowbd_inv_txfm2d_memset_neon(&col_buf[0], num_vecs, 0);
  const int buf_size_w_div8 = txfm_size_col >> 3;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
  const int input_stride = txfm_size_row;

  int dst_off = 0;
  for (int rb = 0; rb < buf_size_nonzero_h_div8; rb++) {
    int16x8_t *row_vecs = &row_buf[rb * txfm_size_col];
    load_buffer_32bit_to_16bit_neon(input, input_stride, row_vecs,
                                    buf_size_nonzero_w);
    input += 8;
    // Rectangular transforms carry an extra scaling step.
    if (abs(rect_type) == 1) {
      round_shift_for_rect(row_vecs, row_vecs, buf_size_nonzero_w);
    }
    identity_txfm_round_neon(row_vecs, row_vecs, txw_idx, buf_size_nonzero_w,
                             -shift[0]);
    // Transpose each 8x8 tile into the column-ordered buffer.
    for (int cb = 0; cb < buf_size_w_div8; ++cb) {
      transpose_arrays_s16_8x8(&row_vecs[cb * 8],
                               &col_buf[dst_off + txfm_size_row * cb]);
    }
    dst_off += 8;
  }
  for (int cb = 0; cb < buf_size_w_div8; ++cb) {
    identity_txfm_round_neon(&col_buf[cb * txfm_size_row],
                             &col_buf[cb * txfm_size_row], txh_idx,
                             txfm_size_row, -shift[1]);
  }
  // Accumulate into the frame: 16-wide strips when possible, else 8-wide.
  if (txfm_size_col >= 16) {
    for (int i = 0; i < (txfm_size_col >> 4); i++) {
      lowbd_add_flip_buffer_16xn_neon(&col_buf[i * txfm_size_row * 2],
                                      output + 16 * i, stride, 0,
                                      txfm_size_row);
    }
  } else if (txfm_size_col == 8) {
    lowbd_add_flip_buffer_8xn_neon(col_buf, output, stride, 0, txfm_size_row);
  }
}
3626 
// 2-D inverse transform with identity columns (low-bitdepth, NEON): a real
// 1-D row transform (chosen by tx_type/eobx) followed by identity column
// scaling; the result is accumulated into `output`. Handles the
// left/right-flip variants by mirroring tiles during the transpose.
static inline void lowbd_inv_txfm2d_add_v_identity_neon(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  int16x8_t row_buf[16 * 2];
  int16x8_t col_buf[16 * 2];
  int eobx, eoby, ud_flip, lr_flip;
  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  lowbd_inv_txfm2d_memset_neon(&col_buf[0],
                               (txfm_size_col * (txfm_size_row) >> 3), 0);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const int buf_size_w_div8 = txfm_size_col >> 3;
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
  const int input_stride = txfm_size_row;
  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const transform_neon row_txfm =
      lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];

  assert(row_txfm != NULL);

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  int dst_off = 0;
  for (int rb = 0; rb < buf_size_nonzero_h_div8; rb++) {
    int16x8_t *row_vecs = &row_buf[rb * txfm_size_col];
    load_buffer_32bit_to_16bit_neon(input, input_stride, row_vecs,
                                    buf_size_nonzero_w);
    input += 8;
    if (abs(rect_type) == 1) {
      round_shift_for_rect(row_vecs, row_vecs, buf_size_nonzero_w);
    }
    row_txfm(row_vecs, row_vecs, INV_COS_BIT);
    round_shift_array_16_neon(row_vecs, txfm_size_col, -shift[0]);
    // Transpose each 8x8 tile; for lr_flip, reverse vectors within the tile
    // and mirror the tile's horizontal position.
    for (int cb = 0; cb < buf_size_w_div8; ++cb) {
      int dst_cb = cb;
      if (lr_flip == 1) {
        flip_buf_ud_neon(&row_vecs[cb * 8], 8);
        dst_cb = buf_size_w_div8 - 1 - cb;
      }
      transpose_arrays_s16_8x8(&row_vecs[cb * 8],
                               &col_buf[dst_off + txfm_size_row * dst_cb]);
    }
    dst_off += 8;
  }
  for (int cb = 0; cb < buf_size_w_div8; ++cb) {
    identity_txfm_round_neon(&col_buf[cb * txfm_size_row],
                             &col_buf[cb * txfm_size_row], txh_idx,
                             txfm_size_row, -shift[1]);
  }
  if (txfm_size_col >= 16) {
    for (int i = 0; i < (txfm_size_col >> 4); i++) {
      lowbd_add_flip_buffer_16xn_neon(&col_buf[i * txfm_size_row * 2],
                                      output + 16 * i, stride, 0,
                                      txfm_size_row);
    }
  } else if (txfm_size_col == 8) {
    lowbd_add_flip_buffer_8xn_neon(col_buf, output, stride, 0, txfm_size_row);
  }
}
3693 
// 2-D inverse transform with identity rows (low-bitdepth, NEON): identity
// row scaling followed by a real 1-D column transform (chosen by
// tx_type/eoby); the result is accumulated into `output`, flipped
// vertically when ud_flip is set.
static inline void lowbd_inv_txfm2d_add_h_identity_neon(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  int16x8_t row_buf[16 * 2];
  int16x8_t col_buf[16 * 2];
  int eobx, eoby, ud_flip, lr_flip;
  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  lowbd_inv_txfm2d_memset_neon(&row_buf[0],
                               (txfm_size_col * (txfm_size_row) >> 3), 0);
  const int buf_size_w_div8 = txfm_size_col >> 3;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
  const int input_stride = txfm_size_row;
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_neon col_txfm =
      lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  int dst_off = 0;
  for (int rb = 0; rb < buf_size_nonzero_h_div8; rb++) {
    int16x8_t *row_vecs = &row_buf[rb * txfm_size_col];
    load_buffer_32bit_to_16bit_neon(input, input_stride, row_vecs,
                                    buf_size_nonzero_w);
    input += 8;
    if (abs(rect_type) == 1) {
      round_shift_for_rect(row_vecs, row_vecs, buf_size_nonzero_w);
    }
    identity_txfm_round_neon(row_vecs, row_vecs, txw_idx, buf_size_nonzero_w,
                             -shift[0]);
    for (int cb = 0; cb < buf_size_w_div8; ++cb) {
      transpose_arrays_s16_8x8(&row_vecs[cb * 8],
                               &col_buf[dst_off + txfm_size_row * cb]);
    }
    dst_off += 8;
  }
  for (int cb = 0; cb < buf_size_w_div8; ++cb) {
    int16x8_t *col_vecs = &col_buf[cb * txfm_size_row];
    col_txfm(col_vecs, col_vecs, INV_COS_BIT);
    round_shift_array_16_neon(col_vecs, txfm_size_row, -shift[1]);
  }
  if (txfm_size_col >= 16) {
    for (int i = 0; i < (txfm_size_col >> 4); i++) {
      lowbd_add_flip_buffer_16xn_neon(&col_buf[i * txfm_size_row * 2],
                                      output + 16 * i, stride, ud_flip,
                                      txfm_size_row);
    }
  } else if (txfm_size_col == 8) {
    lowbd_add_flip_buffer_8xn_neon(col_buf, output, stride, ud_flip,
                                   txfm_size_row);
  }
}
3751 
// 4x4 2-D inverse transform (scalar fallback): row transform over the
// column-major input, then a clamped column transform with optional
// left/right and up/down flips, with the rounded result clip-added into
// the 8-bit destination.
static inline void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
                                                 uint8_t *output, int stride,
                                                 TX_TYPE tx_type, int eob) {
  (void)eob;
  TX_SIZE tx_size = TX_4X4;
  DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 8 + 8]);
  int32_t *temp_in = txfm_buf;

  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
  // Scratch layout inside txfm_buf: [temp_in | temp_out | buf].
  int32_t *temp_out = temp_in + buf_offset;
  int32_t *buf = temp_out + buf_offset;
  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 16, 16 };
  const transform_1d_neon row_txfm =
      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_neon col_txfm =
      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // Row pass: the input is stored column-major (stride == txfm_size_row),
  // so each row is gathered before transforming.
  int32_t *buf_ptr = buf;
  for (int row = 0; row < txfm_size_row; row++) {
    for (int col = 0; col < txfm_size_col; ++col) {
      temp_in[col] = input[col * txfm_size_row];
    }
    row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range);
    input++;
    buf_ptr += txfm_size_col;
  }

  // Column pass; FLIPADST variants mirror the source column and/or the
  // vertical output order.
  for (int col = 0; col < txfm_size_col; ++col) {
    const int src_col = lr_flip ? (txfm_size_col - col - 1) : col;
    for (int row = 0; row < txfm_size_row; ++row) {
      temp_in[row] = buf[row * txfm_size_col + src_col];
    }
    clamp_buf(temp_in, txfm_size_row, 16);
    col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);

    for (int row = 0; row < txfm_size_row; ++row) {
      const int src_row = ud_flip ? (txfm_size_row - row - 1) : row;
      output[row * stride + col] =
          clip_pixel(output[row * stride + col] + temp_out[src_row]);
    }
  }
}
3815 
// Scalar fallback for the 4x8 inverse 2D transform. The rectangular shape
// requires a 1/sqrt(2) pre-scale of the input before the row transform.
static void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
                                          int stride, TX_TYPE tx_type,
                                          int eob) {
  (void)eob;
  TX_SIZE tx_size = TX_4X8;
  DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]);
  int32_t *temp_in = txfm_buf;

  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
  int32_t *temp_out = temp_in + buf_offset;
  int32_t *row_buf = temp_out + buf_offset;
  int32_t *row_dst = row_buf;
  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16,
                                                   16, 16, 16, 16 };
  const transform_1d_neon row_txfm =
      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_neon col_txfm =
      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // Row pass: gather each (column-major) row and scale by 1/sqrt(2) to
  // normalize the rectangular transform.
  for (int i = 0; i < txfm_size_row; i++) {
    for (int c = 0; c < txfm_size_col; ++c)
      temp_in[c] = round_shift((int64_t)input[c * txfm_size_row] * NewInvSqrt2,
                               NewSqrt2Bits);

    row_txfm(temp_in, row_dst, INV_COS_BIT, stage_range);
    input++;
    row_dst += txfm_size_col;
  }

  // Column pass plus the second-stage round shift and clip-add.
  for (int c = 0; c < txfm_size_col; ++c) {
    // lr_flip mirrors which source column feeds this destination column.
    const int src_c = lr_flip ? (txfm_size_col - c - 1) : c;
    for (int r = 0; r < txfm_size_row; ++r)
      temp_in[r] = row_buf[r * txfm_size_col + src_c];
    clamp_buf(temp_in, txfm_size_row, 16);
    col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);

    for (int r = 0; r < txfm_size_row; ++r) {
      // ud_flip reverses the vertical order of the residual.
      const int src_r = ud_flip ? (txfm_size_row - r - 1) : r;
      output[r * stride + c] =
          clip_pixel(output[r * stride + c] + temp_out[src_r]);
    }
  }
}
3881 
// Scalar fallback for the 8x4 inverse 2D transform. The rectangular shape
// requires a 1/sqrt(2) pre-scale of the input before the row transform.
static void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
                                          int stride, TX_TYPE tx_type,
                                          int eob) {
  (void)eob;
  TX_SIZE tx_size = TX_8X4;
  DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]);
  int32_t *temp_in = txfm_buf;

  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
  int32_t *temp_out = temp_in + buf_offset;
  int32_t *row_buf = temp_out + buf_offset;
  int32_t *row_dst = row_buf;
  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16,
                                                   16, 16, 16, 16 };
  const transform_1d_neon row_txfm =
      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_neon col_txfm =
      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // Row pass: gather each (column-major) row and scale by 1/sqrt(2) to
  // normalize the rectangular transform.
  for (int i = 0; i < txfm_size_row; i++) {
    for (int c = 0; c < txfm_size_col; ++c)
      temp_in[c] = round_shift((int64_t)input[c * txfm_size_row] * NewInvSqrt2,
                               NewSqrt2Bits);

    row_txfm(temp_in, row_dst, INV_COS_BIT, stage_range);
    input++;
    row_dst += txfm_size_col;
  }

  // Column pass plus the second-stage round shift and clip-add.
  for (int c = 0; c < txfm_size_col; ++c) {
    // lr_flip mirrors which source column feeds this destination column.
    const int src_c = lr_flip ? (txfm_size_col - c - 1) : c;
    for (int r = 0; r < txfm_size_row; ++r)
      temp_in[r] = row_buf[r * txfm_size_col + src_c];
    clamp_buf(temp_in, txfm_size_row, 16);
    col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);

    for (int r = 0; r < txfm_size_row; ++r) {
      // ud_flip reverses the vertical order of the residual.
      const int src_r = ud_flip ? (txfm_size_row - r - 1) : r;
      output[r * stride + c] =
          clip_pixel(output[r * stride + c] + temp_out[src_r]);
    }
  }
}
3947 
// Scalar fallback for the 4x16 inverse 2D transform: row transform with the
// first-stage round shift, column transform with the second-stage round
// shift, then clip-add into `output`.
static void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input,
                                           uint8_t *output, int stride,
                                           TX_TYPE tx_type, int eob) {
  (void)eob;
  TX_SIZE tx_size = TX_4X16;
  DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
  int32_t *temp_in = txfm_buf;

  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
  int32_t *temp_out = temp_in + buf_offset;
  int32_t *row_buf = temp_out + buf_offset;
  int32_t *row_dst = row_buf;
  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16,
                                                   16, 16, 16, 16, 16 };
  const transform_1d_neon row_txfm =
      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_neon col_txfm =
      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // Row pass: gather each (column-major) row, transform, and apply the
  // first-stage round shift.
  for (int i = 0; i < txfm_size_row; i++) {
    for (int c = 0; c < txfm_size_col; ++c)
      temp_in[c] = input[c * txfm_size_row];
    row_txfm(temp_in, row_dst, INV_COS_BIT, stage_range);
    av1_round_shift_array(row_dst, txfm_size_col, -shift[0]);
    input++;
    row_dst += txfm_size_col;
  }

  // Column pass plus the second-stage round shift and clip-add.
  for (int c = 0; c < txfm_size_col; ++c) {
    // lr_flip mirrors which source column feeds this destination column.
    const int src_c = lr_flip ? (txfm_size_col - c - 1) : c;
    for (int r = 0; r < txfm_size_row; ++r)
      temp_in[r] = row_buf[r * txfm_size_col + src_c];
    clamp_buf(temp_in, txfm_size_row, 16);
    col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);

    for (int r = 0; r < txfm_size_row; ++r) {
      // ud_flip reverses the vertical order of the residual.
      const int src_r = ud_flip ? (txfm_size_row - r - 1) : r;
      output[r * stride + c] =
          clip_pixel(output[r * stride + c] + temp_out[src_r]);
    }
  }
}
4012 
// Scalar fallback for the 16x4 inverse 2D transform: row transform with the
// first-stage round shift, column transform with the second-stage round
// shift, then clip-add into `output`.
static void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input,
                                           uint8_t *output, int stride,
                                           TX_TYPE tx_type, int eob) {
  (void)eob;
  TX_SIZE tx_size = TX_16X4;
  DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]);
  int32_t *temp_in = txfm_buf;

  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
  int32_t *temp_out = temp_in + buf_offset;
  int32_t *row_buf = temp_out + buf_offset;
  int32_t *row_dst = row_buf;
  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16,
                                                   16, 16, 16, 16, 16 };
  const transform_1d_neon row_txfm =
      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_neon col_txfm =
      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // Row pass: gather each (column-major) row, transform, and apply the
  // first-stage round shift.
  for (int i = 0; i < txfm_size_row; i++) {
    for (int c = 0; c < txfm_size_col; ++c)
      temp_in[c] = input[c * txfm_size_row];
    row_txfm(temp_in, row_dst, INV_COS_BIT, stage_range);
    av1_round_shift_array(row_dst, txfm_size_col, -shift[0]);
    input++;
    row_dst += txfm_size_col;
  }

  // Column pass plus the second-stage round shift and clip-add.
  for (int c = 0; c < txfm_size_col; ++c) {
    // lr_flip mirrors which source column feeds this destination column.
    const int src_c = lr_flip ? (txfm_size_col - c - 1) : c;
    for (int r = 0; r < txfm_size_row; ++r)
      temp_in[r] = row_buf[r * txfm_size_col + src_c];
    clamp_buf(temp_in, txfm_size_row, 16);
    col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);

    for (int r = 0; r < txfm_size_row; ++r) {
      // ud_flip reverses the vertical order of the residual.
      const int src_r = ud_flip ? (txfm_size_row - r - 1) : r;
      output[r * stride + c] =
          clip_pixel(output[r * stride + c] + temp_out[src_r]);
    }
  }
}
4077 
// General NEON path for 2D inverse transforms with no identity stage in
// either direction: 1D row transform, transpose, 1D column transform, then
// add the residual into the destination buffer.
static inline void lowbd_inv_txfm2d_add_no_identity_neon(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  // a: row-transform results (row-major). b: the same data transposed so the
  // column pass works on contiguous vectors.
  int16x8_t a[64 * 8];
  int16x8_t b[64 * 8];
  int eobx, eoby, ud_flip, lr_flip;
  // Split the scan-order eob into per-direction end-of-block coordinates so
  // the all-zero tail regions can be skipped.
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  // Non-zero for rectangular blocks; |rect_type| == 1 triggers the extra
  // 1/sqrt(2) input scaling below (round_shift_for_rect).
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const int buf_size_w_div8 = txfm_size_col >> 3;
  // Number of 8-row groups / 8-aligned width (in samples) that actually
  // contain non-zero coefficients.
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
  // Stride between coefficient columns, capped at 32 entries per column.
  const int input_stride = AOMMIN(32, txfm_size_row);
  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  // Running offset into b for the transposed output of each 8-row band.
  int temp_b = 0;

  // 1D kernels specialized for the number of non-zero inputs (fun_idx).
  const transform_neon row_txfm =
      lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_neon col_txfm =
      lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // Row pass: process 8 rows per iteration, then transpose the band into b.
  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
    int16x8_t *cur_a = &a[i * txfm_size_col];
    load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a,
                                    buf_size_nonzero_w);
    input += 8;
    if (abs(rect_type) == 1) {
      // Rectangular blocks need an extra 1/sqrt(2) normalization.
      round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w);
    }
    row_txfm(cur_a, cur_a, INV_COS_BIT);
    // First-stage round shift on the full row width.
    round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]);
    if (lr_flip == 1) {
      // Left-right flip: reverse each 8-wide chunk and mirror the chunk
      // order while transposing into b.
      for (int j = 0; j < buf_size_w_div8; ++j) {
        flip_buf_ud_neon(&cur_a[j * 8], 8);
        transpose_arrays_s16_8x8(
            &cur_a[j * 8],
            &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
      }
      temp_b += 8;
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
      }
      temp_b += 8;
    }
  }
  // Column pass on the transposed data, then the second-stage round shift.
  for (int j = 0; j < buf_size_w_div8; ++j) {
    col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT);
    round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, -shift[1]);
  }

  // Add residual into the destination; the helpers apply ud_flip internally.
  // NOTE(review): only widths >= 8 are handled here; 4-wide sizes presumably
  // take the dedicated small-block paths — confirm against the dispatcher.
  if (txfm_size_col >= 16) {
    for (int i = 0; i < (txfm_size_col >> 4); i++) {
      lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],
                                      output + 16 * i, stride, ud_flip,
                                      txfm_size_row);
    }
  } else if (txfm_size_col == 8) {
    lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
  }
}
4149 
// Dispatch by transform class: full identity, vertical identity (H_* types),
// horizontal identity (V_* types), or the general no-identity kernel.
static inline void lowbd_inv_txfm2d_add_universe_neon(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  if (tx_type == IDTX) {
    lowbd_inv_txfm2d_add_idtx_neon(input, output, stride, tx_type, tx_size,
                                   eob);
  } else if (tx_type == H_DCT || tx_type == H_ADST || tx_type == H_FLIPADST) {
    // Horizontal-only transforms use the vertical-identity kernel.
    lowbd_inv_txfm2d_add_v_identity_neon(input, output, stride, tx_type,
                                         tx_size, eob);
  } else if (tx_type == V_DCT || tx_type == V_ADST || tx_type == V_FLIPADST) {
    // Vertical-only transforms use the horizontal-identity kernel.
    lowbd_inv_txfm2d_add_h_identity_neon(input, output, stride, tx_type,
                                         tx_size, eob);
  } else {
    lowbd_inv_txfm2d_add_no_identity_neon(input, output, stride, tx_type,
                                          tx_size, eob);
  }
}
4179 
4180 // This function is used by av1_inv_txfm2d_test.cc.
4181 void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
4182                                    int stride, TX_TYPE tx_type, TX_SIZE tx_size,
4183                                    int eob);
4184 
// Top-level low-bitdepth 2D inverse transform + add. Sizes with a 4-sample
// edge take dedicated scalar paths; all other sizes go through the NEON
// dispatcher.
void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
                                   int eob) {
  if (tx_size == TX_4X4) {
    lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, eob);
  } else if (tx_size == TX_4X8) {
    lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, eob);
  } else if (tx_size == TX_8X4) {
    lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, eob);
  } else if (tx_size == TX_4X16) {
    lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, eob);
  } else if (tx_size == TX_16X4) {
    lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, eob);
  } else {
    lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type, tx_size,
                                       eob);
  }
}
// Public entry point: lossless blocks fall back to the C implementation,
// everything else uses the NEON low-bitdepth path.
void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
                           const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
  if (txfm_param->lossless) {
    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
  } else {
    av1_lowbd_inv_txfm2d_add_neon(dqcoeff, dst, stride, tx_type,
                                  txfm_param->tx_size, txfm_param->eob);
  }
}
4225