1 /*
2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <arm_neon.h>
13
14 #include "config/aom_config.h"
15 #include "config/aom_dsp_rtcd.h"
16 #include "config/av1_rtcd.h"
17
18 #include "aom_dsp/arm/transpose_neon.h"
19 #include "av1/common/av1_inv_txfm1d.h"
20 #include "av1/common/av1_inv_txfm1d_cfg.h"
21 #include "av1/common/av1_txfm.h"
22 #include "av1/common/enums.h"
23 #include "av1/common/idct.h"
24 #include "av1/common/arm/av1_inv_txfm_neon.h"
25
// Identifiers for the 1D inverse transform kernels. FLIPADST reuses the
// IADST kernel; the flip is applied when the output buffer is written.
typedef enum ATTRIBUTE_PACKED {
  IDCT_1D,
  IADST_1D,
  IFLIPADST_1D = IADST_1D,
  IIDENTITY_1D,
  ITX_TYPES_1D,
} ITX_TYPE_1D;
34
35 static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
36 IDCT_1D, IADST_1D, IDCT_1D, IADST_1D,
37 IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D,
38 IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D,
39 IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
40 };
41
42 static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
43 IDCT_1D, IDCT_1D, IADST_1D, IADST_1D,
44 IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
45 IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
46 IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D,
47 };
48
49 // 1D functions
50 static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = {
51 { av1_idct4, av1_iadst4, av1_iidentity4_c },
52 { av1_idct8, av1_iadst8, av1_iidentity8_c },
53 { av1_idct16, av1_iadst16, av1_iidentity16_c },
54 { av1_idct32, NULL, NULL },
55 { av1_idct64, NULL, NULL },
56 };
57
// Add `height` rows of 8 residual values (`in`) to the 8-bit destination,
// reading the residual rows bottom-up when `flipud` is set, and store with
// unsigned saturation to [0, 255].
static inline void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in,
                                                  uint8_t *output, int stride,
                                                  int flipud,
                                                  const int height) {
  const int step = flipud ? -1 : 1;
  int src_row = flipud ? (height - 1) : 0;
  for (int i = 0; i < height; ++i, src_row += step) {
    // Widen the 8 destination pixels to 16 bits, add the residual row,
    // then narrow back with saturation.
    int16x8_t sum = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(output)));
    sum = vaddq_s16(sum, in[src_row]);
    vst1_u8(output, vqmovun_s16(sum));
    output += stride;
  }
}
72
// Reconstruct one 16-pixel row: widen the prediction to 16 bits, add the two
// 8-lane residual halves (res0 = left, res1 = right), and narrow back with
// unsigned saturation.
static inline uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred,
                                                    int16x8_t res0,
                                                    int16x8_t res1) {
  const int16x8_t lo =
      vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred))), res0);
  const int16x8_t hi =
      vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred))), res1);
  return vcombine_u8(vqmovun_s16(lo), vqmovun_s16(hi));
}
86
// 16-wide variant of lowbd_add_flip_buffer_8xn_neon. Residual row j is split
// across in[j] (left 8 lanes) and in[j + height] (right 8 lanes).
static inline void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in,
                                                   uint8_t *output, int stride,
                                                   int flipud, int height) {
  const int step = flipud ? -1 : 1;
  int j = flipud ? (height - 1) : 0;
  for (int i = 0; i < height; ++i, j += step) {
    uint8_t *const dst = output + i * stride;
    const uint8x16_t recon =
        lowbd_get_recon_16x16_neon(vld1q_u8(dst), in[j], in[j + height]);
    vst1q_u8(dst, recon);
  }
}
100
// Fill `size` vectors with the constant `value` (truncated to 16 bits).
static inline void lowbd_inv_txfm2d_memset_neon(int16x8_t *a, int size,
                                                int value) {
  // The splat is loop invariant, so build it once.
  const int16x8_t v = vdupq_n_s16((int16_t)value);
  for (int i = 0; i < size; i++) {
    a[i] = v;
  }
}
107
// Butterfly: *t0 = round(c[0]*in0 + c[1]*in1) >> INV_COS_BIT
//            *t1 = round(c[1]*in0 - c[0]*in1) >> INV_COS_BIT
static inline void btf_16_lane_0_1_neon(const int16x8_t in0,
                                        const int16x8_t in1, const int16x4_t c,
                                        int16x8_t *t0, int16x8_t *t1) {
  const int16x4_t in0_lo = vget_low_s16(in0);
  const int16x4_t in0_hi = vget_high_s16(in0);
  const int16x4_t in1_lo = vget_low_s16(in1);
  const int16x4_t in1_hi = vget_high_s16(in1);

  // Widening multiply-accumulate keeps full 32-bit precision before the
  // rounding narrow.
  const int32x4_t sum_lo =
      vmlal_lane_s16(vmull_lane_s16(in0_lo, c, 0), in1_lo, c, 1);
  const int32x4_t sum_hi =
      vmlal_lane_s16(vmull_lane_s16(in0_hi, c, 0), in1_hi, c, 1);
  const int32x4_t dif_lo =
      vmlsl_lane_s16(vmull_lane_s16(in0_lo, c, 1), in1_lo, c, 0);
  const int32x4_t dif_hi =
      vmlsl_lane_s16(vmull_lane_s16(in0_hi, c, 1), in1_hi, c, 0);

  *t0 = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
                     vrshrn_n_s32(sum_hi, INV_COS_BIT));
  *t1 = vcombine_s16(vrshrn_n_s32(dif_lo, INV_COS_BIT),
                     vrshrn_n_s32(dif_hi, INV_COS_BIT));
}
132
// Butterfly: *t0 = round(c[1]*in0 + c[0]*in1) >> INV_COS_BIT
//            *t1 = round(c[0]*in0 - c[1]*in1) >> INV_COS_BIT
static inline void btf_16_lane_1_0_neon(const int16x8_t in0,
                                        const int16x8_t in1, const int16x4_t c,
                                        int16x8_t *t0, int16x8_t *t1) {
  const int16x4_t in0_lo = vget_low_s16(in0);
  const int16x4_t in0_hi = vget_high_s16(in0);
  const int16x4_t in1_lo = vget_low_s16(in1);
  const int16x4_t in1_hi = vget_high_s16(in1);

  const int32x4_t sum_lo =
      vmlal_lane_s16(vmull_lane_s16(in0_lo, c, 1), in1_lo, c, 0);
  const int32x4_t sum_hi =
      vmlal_lane_s16(vmull_lane_s16(in0_hi, c, 1), in1_hi, c, 0);
  const int32x4_t dif_lo =
      vmlsl_lane_s16(vmull_lane_s16(in0_lo, c, 0), in1_lo, c, 1);
  const int32x4_t dif_hi =
      vmlsl_lane_s16(vmull_lane_s16(in0_hi, c, 0), in1_hi, c, 1);

  *t0 = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
                     vrshrn_n_s32(sum_hi, INV_COS_BIT));
  *t1 = vcombine_s16(vrshrn_n_s32(dif_lo, INV_COS_BIT),
                     vrshrn_n_s32(dif_hi, INV_COS_BIT));
}
157
// Butterfly: *t0 = round(c[2]*in0 + c[3]*in1) >> INV_COS_BIT
//            *t1 = round(c[3]*in0 - c[2]*in1) >> INV_COS_BIT
static inline void btf_16_lane_2_3_neon(const int16x8_t in0,
                                        const int16x8_t in1, const int16x4_t c,
                                        int16x8_t *t0, int16x8_t *t1) {
  const int16x4_t in0_lo = vget_low_s16(in0);
  const int16x4_t in0_hi = vget_high_s16(in0);
  const int16x4_t in1_lo = vget_low_s16(in1);
  const int16x4_t in1_hi = vget_high_s16(in1);

  const int32x4_t sum_lo =
      vmlal_lane_s16(vmull_lane_s16(in0_lo, c, 2), in1_lo, c, 3);
  const int32x4_t sum_hi =
      vmlal_lane_s16(vmull_lane_s16(in0_hi, c, 2), in1_hi, c, 3);
  const int32x4_t dif_lo =
      vmlsl_lane_s16(vmull_lane_s16(in0_lo, c, 3), in1_lo, c, 2);
  const int32x4_t dif_hi =
      vmlsl_lane_s16(vmull_lane_s16(in0_hi, c, 3), in1_hi, c, 2);

  *t0 = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
                     vrshrn_n_s32(sum_hi, INV_COS_BIT));
  *t1 = vcombine_s16(vrshrn_n_s32(dif_lo, INV_COS_BIT),
                     vrshrn_n_s32(dif_hi, INV_COS_BIT));
}
182
// Single-input butterfly: *t0 = round(coef1 * in0) >> INV_COS_BIT
//                         *t1 = round(coef2 * in0) >> INV_COS_BIT
static inline void btf_16_neon(const int16x8_t in0, int16_t coef1,
                               int16_t coef2, int16x8_t *t0, int16x8_t *t1) {
  const int16x4_t lo = vget_low_s16(in0);
  const int16x4_t hi = vget_high_s16(in0);

  *t0 = vcombine_s16(vrshrn_n_s32(vmull_n_s16(lo, coef1), INV_COS_BIT),
                     vrshrn_n_s32(vmull_n_s16(hi, coef1), INV_COS_BIT));
  *t1 = vcombine_s16(vrshrn_n_s32(vmull_n_s16(lo, coef2), INV_COS_BIT),
                     vrshrn_n_s32(vmull_n_s16(hi, coef2), INV_COS_BIT));
}
201
// Butterfly: *t0 = round(c[3]*in0 + c[2]*in1) >> INV_COS_BIT
//            *t1 = round(c[2]*in0 - c[3]*in1) >> INV_COS_BIT
static inline void btf_16_lane_3_2_neon(const int16x8_t in0,
                                        const int16x8_t in1, const int16x4_t c,
                                        int16x8_t *t0, int16x8_t *t1) {
  const int16x4_t in0_lo = vget_low_s16(in0);
  const int16x4_t in0_hi = vget_high_s16(in0);
  const int16x4_t in1_lo = vget_low_s16(in1);
  const int16x4_t in1_hi = vget_high_s16(in1);

  const int32x4_t sum_lo =
      vmlal_lane_s16(vmull_lane_s16(in0_lo, c, 3), in1_lo, c, 2);
  const int32x4_t sum_hi =
      vmlal_lane_s16(vmull_lane_s16(in0_hi, c, 3), in1_hi, c, 2);
  const int32x4_t dif_lo =
      vmlsl_lane_s16(vmull_lane_s16(in0_lo, c, 2), in1_lo, c, 3);
  const int32x4_t dif_hi =
      vmlsl_lane_s16(vmull_lane_s16(in0_hi, c, 2), in1_hi, c, 3);

  *t0 = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
                     vrshrn_n_s32(sum_hi, INV_COS_BIT));
  *t1 = vcombine_s16(vrshrn_n_s32(dif_lo, INV_COS_BIT),
                     vrshrn_n_s32(dif_hi, INV_COS_BIT));
}
226
// In-place single-coefficient butterfly:
//   x[0] = round(c[0]*x[0] + c[0]*x[1]) >> INV_COS_BIT
//   x[1] = round(c[0]*x[0] - c[0]*x[1]) >> INV_COS_BIT
// Multiply before add/sub: adding first in 16 bits would overflow in iadst8.
static inline void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) {
  const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0);
  const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0);
  const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0);
  const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0);

  const int32x4_t sum_lo = vaddq_s32(x0_lo, x1_lo);
  const int32x4_t sum_hi = vaddq_s32(x0_hi, x1_hi);
  const int32x4_t dif_lo = vsubq_s32(x0_lo, x1_lo);
  const int32x4_t dif_hi = vsubq_s32(x0_hi, x1_hi);

  x[0] = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
                      vrshrn_n_s32(sum_hi, INV_COS_BIT));
  x[1] = vcombine_s16(vrshrn_n_s32(dif_lo, INV_COS_BIT),
                      vrshrn_n_s32(dif_hi, INV_COS_BIT));
}
250
// Build an int16x4_t with the four given lane values {c0, c1, c2, c3}.
static inline int16x4_t set_s16x4_neon(const int16_t c0, const int16_t c1,
                                       const int16_t c2, const int16_t c3) {
  int16x4_t v = vdup_n_s16(c0);
  v = vset_lane_s16(c1, v, 1);
  v = vset_lane_s16(c2, v, 2);
  return vset_lane_s16(c3, v, 3);
}
259
// 8-point inverse ADST applied to eight columns of 16-bit coefficients.
static inline void iadst8_neon(int16x8_t *const in, int16x8_t *out,
                               int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);

  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
                                      (int16_t)cospi[20], (int16_t)cospi[44]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[36], (int16_t)cospi[28],
                                      (int16_t)cospi[52], (int16_t)cospi[12]);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  int16x8_t x[8];
  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;

  // Stage 1: input permutation.
  x[0] = in[7];
  x[1] = in[0];
  x[2] = in[5];
  x[3] = in[2];
  x[4] = in[3];
  x[5] = in[4];
  x[6] = in[1];
  x[7] = in[6];

  // Stage 2: first butterfly layer.
  btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
  btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
  btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
  btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);

  // Stage 3: saturating add/sub.
  x[0] = vqaddq_s16(s0, s4);
  x[1] = vqaddq_s16(s1, s5);
  x[2] = vqaddq_s16(s2, s6);
  x[3] = vqaddq_s16(s3, s7);
  x[4] = vqsubq_s16(s0, s4);
  x[5] = vqsubq_s16(s1, s5);
  x[6] = vqsubq_s16(s2, s6);
  x[7] = vqsubq_s16(s3, s7);

  // Stage 4: butterflies on the lower half only.
  s0 = x[0];
  s1 = x[1];
  s2 = x[2];
  s3 = x[3];
  btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);
  btf_16_lane_3_2_neon(x[7], x[6], c2, &s7, &s6);

  // Stage 5: saturating add/sub.
  x[0] = vqaddq_s16(s0, s2);
  x[1] = vqaddq_s16(s1, s3);
  x[2] = vqsubq_s16(s0, s2);
  x[3] = vqsubq_s16(s1, s3);
  x[4] = vqaddq_s16(s4, s6);
  x[5] = vqaddq_s16(s5, s7);
  x[6] = vqsubq_s16(s4, s6);
  x[7] = vqsubq_s16(s5, s7);

  // Stage 6: half butterflies with cospi[32].
  btf_16_half_neon(x + 2, c2);
  btf_16_half_neon(x + 6, c2);

  // Stage 7: output permutation with alternating sign flips.
  out[0] = x[0];
  out[1] = vqnegq_s16(x[4]);
  out[2] = x[6];
  out[3] = vqnegq_s16(x[2]);
  out[4] = x[3];
  out[5] = vqnegq_s16(x[7]);
  out[6] = x[5];
  out[7] = vqnegq_s16(x[1]);
}
332
// 8-point inverse ADST specialized for the case where only in[0] is non-zero
// (eob == 1); later stages reduce to copies of the surviving terms.
static inline void iadst8_low1_neon(int16x8_t *const in, int16x8_t *out,
                                    int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  int16x8_t x[8];
  int16x8_t s0, s1, s4, s5;

  // Stage 1: only the DC input survives.
  x[1] = in[0];

  // Stage 2: single butterfly.
  btf_16_neon(x[1], cospi[60], -cospi[4], &s0, &s1);

  // Stage 3: add/sub degenerates to copies.
  x[0] = s0;
  x[1] = s1;
  x[4] = s0;
  x[5] = s1;

  // Stage 4: butterfly on the lower half.
  s0 = x[0];
  s1 = x[1];
  btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);

  // Stage 5: again copies only.
  x[0] = s0;
  x[1] = s1;
  x[2] = s0;
  x[3] = s1;
  x[4] = s4;
  x[5] = s5;
  x[6] = s4;
  x[7] = s5;

  // Stage 6: half butterflies with cospi[32].
  btf_16_half_neon(x + 2, c2);
  btf_16_half_neon(x + 6, c2);

  // Stage 7: output permutation with alternating sign flips.
  out[0] = x[0];
  out[1] = vqnegq_s16(x[4]);
  out[2] = x[6];
  out[3] = vqnegq_s16(x[2]);
  out[4] = x[3];
  out[5] = vqnegq_s16(x[7]);
  out[6] = x[5];
  out[7] = vqnegq_s16(x[1]);
}
384
// 8-point inverse DCT applied to eight columns of 16-bit coefficients.
static inline void idct8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[8], step2[8];
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  // Stage 2: odd-index butterflies.
  btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]);
  btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]);

  // Stage 3: even-index butterflies plus odd add/sub.
  btf_16_lane_0_1_neon(in[0], in[4], c1, &step2[0], &step2[1]);
  btf_16_lane_2_3_neon(in[2], in[6], c1, &step2[3], &step2[2]);
  step2[4] = vqaddq_s16(step1[4], step1[5]);
  step2[5] = vqsubq_s16(step1[4], step1[5]);
  step2[6] = vqsubq_s16(step1[7], step1[6]);
  step2[7] = vqaddq_s16(step1[7], step1[6]);

  // Stage 4.
  step1[0] = vqaddq_s16(step2[0], step2[3]);
  step1[1] = vqaddq_s16(step2[1], step2[2]);
  step1[2] = vqsubq_s16(step2[1], step2[2]);
  step1[3] = vqsubq_s16(step2[0], step2[3]);
  btf_16_lane_0_1_neon(step2[6], step2[5], c1, &step1[6], &step1[5]);

  // Stage 5: final add/sub, outputs mirrored around the center.
  out[0] = vqaddq_s16(step1[0], step2[7]);
  out[1] = vqaddq_s16(step1[1], step1[6]);
  out[2] = vqaddq_s16(step1[2], step1[5]);
  out[3] = vqaddq_s16(step1[3], step2[4]);
  out[4] = vqsubq_s16(step1[3], step2[4]);
  out[5] = vqsubq_s16(step1[2], step1[5]);
  out[6] = vqsubq_s16(step1[1], step1[6]);
  out[7] = vqsubq_s16(step1[0], step2[7]);
}
422
// 8-point inverse DCT specialized for a DC-only input (eob == 1): the result
// is the scaled DC value broadcast to all eight outputs.
static inline void idct8_low1_neon(int16x8_t *in, int16x8_t *out,
                                   int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int32x4_t t32[2];

  // Stages 1-2 are no-ops; stage 3 scales DC by cospi[32] (1/sqrt(2)).
  t32[0] = vmull_n_s16(vget_low_s16(in[0]), (int16_t)cospi[32]);
  t32[1] = vmull_n_s16(vget_high_s16(in[0]), (int16_t)cospi[32]);

  const int16x8_t dc = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
                                    vrshrn_n_s32(t32[1], INV_COS_BIT));

  // Stages 4-5 fan the DC value out to every output.
  for (int i = 0; i < 8; i++) {
    out[i] = dc;
  }
}
449
// Apply a rounding right shift by `bit` to `size` vectors in place.
// `size` must be a multiple of 4; bit == 0 is a no-op.
static void round_shift_array_16_neon(int16x8_t *arr, int size, int bit) {
  assert(!(size % 4));
  if (!bit) return;
  // vrshlq with a negative shift count performs a rounding right shift.
  const int16x8_t neg_shift = vdupq_n_s16((int16_t)(-bit));
  for (int i = 0; i < size; i++) {
    arr[i] = vrshlq_s16(arr[i], neg_shift);
  }
}
458
// Reverse the order of `size` vectors in place (vertical flip of a residual
// buffer). The previous implementation staged the copy through a fixed
// 8-entry scratch array, which silently wrote out of bounds for size > 8;
// the in-place two-pointer swap needs no scratch and works for any size.
static inline void flip_buf_ud_neon(int16x8_t *input, int size) {
  for (int lo = 0, hi = size - 1; lo < hi; ++lo, --hi) {
    const int16x8_t tmp = input[lo];
    input[lo] = input[hi];
    input[hi] = tmp;
  }
}
468
// Load `out_size` rows of eight 32-bit coefficients (row pitch `stride`)
// and narrow them to 16 bits by truncation.
static inline void load_buffer_32bit_to_16bit_neon(const int32_t *input,
                                                   int stride,
                                                   int16x8_t *const a,
                                                   int out_size) {
  for (int i = 0; i < out_size; ++i, input += stride) {
    const int16x4_t lo = vmovn_s32(vld1q_s32(input));
    const int16x4_t hi = vmovn_s32(vld1q_s32(input + 4));
    a[i] = vcombine_s16(lo, hi);
  }
}
479
480 static int16_t sqrt_2_list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
481 4 * 5793 };
482
// Identity transform: scale `size` vectors by sqrt_2_list[txw_idx] (Q12),
// apply a rounding right shift of `bit`, and saturate back to 16 bits.
static inline void identity_txfm_round_neon(int16x8_t *input, int16x8_t *output,
                                            int txw_idx, int8_t size, int bit) {
  const int32x4_t neg_shift = vdupq_n_s32((int32_t)(-bit));
  const int16x4_t scale = vdup_n_s16(sqrt_2_list[txw_idx]);
  for (int i = 0; i < size; i++) {
    // Widening multiply, drop the Q12 fraction with rounding, then apply
    // the final rounding shift via vrshlq with a negative count.
    const int32x4_t prod_lo = vmull_s16(vget_low_s16(input[i]), scale);
    const int32x4_t prod_hi = vmull_s16(vget_high_s16(input[i]), scale);
    const int32x4_t out_lo = vrshlq_s32(vrshrq_n_s32(prod_lo, 12), neg_shift);
    const int32x4_t out_hi = vrshlq_s32(vrshrq_n_s32(prod_hi, 12), neg_shift);
    output[i] = vcombine_s16(vqmovn_s32(out_lo), vqmovn_s32(out_hi));
  }
}
499
// Scale `size` vectors by 1/sqrt(2) (NewInvSqrt2 in Q-NewSqrt2Bits) with a
// saturating rounding narrow — used for rectangular transform sizes.
static inline void round_shift_for_rect(int16x8_t *input, int16x8_t *output,
                                        int size) {
  for (int z = 0; z < size; ++z) {
    const int32x4_t prod_lo =
        vmull_n_s16(vget_low_s16(input[z]), (int16_t)NewInvSqrt2);
    const int32x4_t prod_hi =
        vmull_n_s16(vget_high_s16(input[z]), (int16_t)NewInvSqrt2);
    output[z] = vcombine_s16(vqrshrn_n_s32(prod_lo, (int32_t)NewSqrt2Bits),
                             vqrshrn_n_s32(prod_hi, (int32_t)NewSqrt2Bits));
  }
}
515
// 16-point inverse DCT specialized for a DC-only input (eob == 1): the
// result is the scaled DC value broadcast to all sixteen outputs.
static inline void idct16_low1_neon(int16x8_t *in, int16x8_t *out,
                                    int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int32x4_t t32[2];

  // Stages 1-3 are no-ops; stage 4 scales DC by cospi[32] (1/sqrt(2)).
  // Cast cospi[32] explicitly to int16_t — matching idct8_low1_neon —
  // instead of relying on implicit int32 -> int16 narrowing.
  t32[0] = vmull_n_s16(vget_low_s16(in[0]), (int16_t)cospi[32]);
  t32[1] = vmull_n_s16(vget_high_s16(in[0]), (int16_t)cospi[32]);
  const int16x8_t dc = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
                                    vrshrn_n_s32(t32[1], INV_COS_BIT));

  // Stages 6-7 fan the DC value out to every output.
  for (int i = 0; i < 16; i++) {
    out[i] = dc;
  }
}
548
// 16-point inverse DCT applied to eight columns of 16-bit coefficients.
static inline void idct16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[16], step2[16];

  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
                                      (int16_t)cospi[36], (int16_t)cospi[28]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
                                      (int16_t)cospi[52], (int16_t)cospi[12]);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);
  const int16x4_t c4 =
      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));

  // Stage 2: odd-half butterflies plus even-half input permutation.
  btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]);
  btf_16_lane_2_3_neon(in[9], in[7], c0, &step2[14], &step2[9]);
  btf_16_lane_0_1_neon(in[5], in[11], c1, &step2[13], &step2[10]);
  btf_16_lane_2_3_neon(in[13], in[3], c1, &step2[12], &step2[11]);

  step2[0] = in[0];
  step2[1] = in[8];
  step2[2] = in[4];
  step2[3] = in[12];
  step2[4] = in[2];
  step2[5] = in[10];
  step2[6] = in[6];
  step2[7] = in[14];

  // Stage 3.
  btf_16_lane_0_1_neon(step2[4], step2[7], c2, &step1[7], &step1[4]);
  btf_16_lane_2_3_neon(step2[5], step2[6], c2, &step1[6], &step1[5]);

  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];
  step1[8] = vqaddq_s16(step2[8], step2[9]);
  step1[9] = vqsubq_s16(step2[8], step2[9]);
  step1[10] = vqsubq_s16(step2[11], step2[10]);
  step1[11] = vqaddq_s16(step2[11], step2[10]);
  step1[12] = vqaddq_s16(step2[12], step2[13]);
  step1[13] = vqsubq_s16(step2[12], step2[13]);
  step1[14] = vqsubq_s16(step2[15], step2[14]);
  step1[15] = vqaddq_s16(step2[15], step2[14]);

  // Stage 4.
  btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]);
  btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]);
  btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
  btf_16_lane_3_2_neon(step1[10], step1[13], c4, &step2[10], &step2[13]);

  step2[4] = vqaddq_s16(step1[4], step1[5]);
  step2[5] = vqsubq_s16(step1[4], step1[5]);
  step2[6] = vqsubq_s16(step1[7], step1[6]);
  step2[7] = vqaddq_s16(step1[7], step1[6]);
  step2[8] = step1[8];
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // Stage 5.
  btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);

  step1[0] = vqaddq_s16(step2[0], step2[3]);
  step1[1] = vqaddq_s16(step2[1], step2[2]);
  step1[2] = vqsubq_s16(step2[1], step2[2]);
  step1[3] = vqsubq_s16(step2[0], step2[3]);
  step1[4] = step2[4];
  step1[7] = step2[7];
  step1[8] = vqaddq_s16(step2[8], step2[11]);
  step1[9] = vqaddq_s16(step2[9], step2[10]);
  step1[10] = vqsubq_s16(step2[9], step2[10]);
  step1[11] = vqsubq_s16(step2[8], step2[11]);
  step1[12] = vqsubq_s16(step2[15], step2[12]);
  step1[13] = vqsubq_s16(step2[14], step2[13]);
  step1[14] = vqaddq_s16(step2[14], step2[13]);
  step1[15] = vqaddq_s16(step2[15], step2[12]);

  // Stage 6.
  btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
  btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);

  step2[0] = vqaddq_s16(step1[0], step1[7]);
  step2[1] = vqaddq_s16(step1[1], step1[6]);
  step2[2] = vqaddq_s16(step1[2], step1[5]);
  step2[3] = vqaddq_s16(step1[3], step1[4]);
  step2[4] = vqsubq_s16(step1[3], step1[4]);
  step2[5] = vqsubq_s16(step1[2], step1[5]);
  step2[6] = vqsubq_s16(step1[1], step1[6]);
  step2[7] = vqsubq_s16(step1[0], step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // Stage 7: outputs mirrored around the center.
  out[0] = vqaddq_s16(step2[0], step2[15]);
  out[1] = vqaddq_s16(step2[1], step2[14]);
  out[2] = vqaddq_s16(step2[2], step2[13]);
  out[3] = vqaddq_s16(step2[3], step2[12]);
  out[4] = vqaddq_s16(step2[4], step2[11]);
  out[5] = vqaddq_s16(step2[5], step2[10]);
  out[6] = vqaddq_s16(step2[6], step2[9]);
  out[7] = vqaddq_s16(step2[7], step2[8]);
  out[8] = vqsubq_s16(step2[7], step2[8]);
  out[9] = vqsubq_s16(step2[6], step2[9]);
  out[10] = vqsubq_s16(step2[5], step2[10]);
  out[11] = vqsubq_s16(step2[4], step2[11]);
  out[12] = vqsubq_s16(step2[3], step2[12]);
  out[13] = vqsubq_s16(step2[2], step2[13]);
  out[14] = vqsubq_s16(step2[1], step2[14]);
  out[15] = vqsubq_s16(step2[0], step2[15]);
}
669
// 16-point inverse DCT specialized for at most 8 non-zero input rows
// (eob <= 8): odd-indexed inputs beyond the first eight are zero, so the
// stage-2/3/4 butterflies collapse to single-input btf_16_neon calls.
// The cospi values are cast explicitly to int16_t, matching how every other
// call site in this file passes coefficients, rather than relying on
// implicit int32 -> int16 narrowing at the call boundary.
static inline void idct16_low8_neon(int16x8_t *in, int16x8_t *out,
                                    int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[16], step2[16];
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);
  const int16x4_t c1 =
      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));

  // Stage 1 is a no-op.
  // Stage 2: the paired inputs are zero, so each butterfly needs one input.
  step2[0] = in[0];
  step2[2] = in[4];
  step2[4] = in[2];
  step2[6] = in[6];

  btf_16_neon(in[1], (int16_t)cospi[60], (int16_t)cospi[4], &step2[8],
              &step2[15]);
  btf_16_neon(in[7], (int16_t)(-cospi[36]), (int16_t)cospi[28], &step2[9],
              &step2[14]);
  btf_16_neon(in[5], (int16_t)cospi[44], (int16_t)cospi[20], &step2[10],
              &step2[13]);
  btf_16_neon(in[3], (int16_t)(-cospi[52]), (int16_t)cospi[12], &step2[11],
              &step2[12]);

  // Stage 3.
  btf_16_neon(step2[4], (int16_t)cospi[56], (int16_t)cospi[8], &step1[4],
              &step1[7]);
  btf_16_neon(step2[6], (int16_t)(-cospi[40]), (int16_t)cospi[24], &step1[5],
              &step1[6]);

  step1[0] = step2[0];
  step1[2] = step2[2];
  step1[8] = vqaddq_s16(step2[8], step2[9]);
  step1[9] = vqsubq_s16(step2[8], step2[9]);
  step1[10] = vqsubq_s16(step2[11], step2[10]);
  step1[11] = vqaddq_s16(step2[11], step2[10]);
  step1[12] = vqaddq_s16(step2[12], step2[13]);
  step1[13] = vqsubq_s16(step2[12], step2[13]);
  step1[14] = vqsubq_s16(step2[15], step2[14]);
  step1[15] = vqaddq_s16(step2[15], step2[14]);

  // Stage 4.
  btf_16_neon(step1[0], (int16_t)cospi[32], (int16_t)cospi[32], &step2[0],
              &step2[1]);
  btf_16_neon(step1[2], (int16_t)cospi[48], (int16_t)cospi[16], &step2[2],
              &step2[3]);
  btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]);
  btf_16_lane_3_2_neon(step1[10], step1[13], c1, &step2[10], &step2[13]);

  step2[4] = vqaddq_s16(step1[4], step1[5]);
  step2[5] = vqsubq_s16(step1[4], step1[5]);
  step2[6] = vqsubq_s16(step1[7], step1[6]);
  step2[7] = vqaddq_s16(step1[7], step1[6]);
  step2[8] = step1[8];
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // Stage 5.
  btf_16_lane_0_1_neon(step2[6], step2[5], c0, &step1[6], &step1[5]);
  step1[0] = vqaddq_s16(step2[0], step2[3]);
  step1[1] = vqaddq_s16(step2[1], step2[2]);
  step1[2] = vqsubq_s16(step2[1], step2[2]);
  step1[3] = vqsubq_s16(step2[0], step2[3]);
  step1[4] = step2[4];
  step1[7] = step2[7];
  step1[8] = vqaddq_s16(step2[8], step2[11]);
  step1[9] = vqaddq_s16(step2[9], step2[10]);
  step1[10] = vqsubq_s16(step2[9], step2[10]);
  step1[11] = vqsubq_s16(step2[8], step2[11]);
  step1[12] = vqsubq_s16(step2[15], step2[12]);
  step1[13] = vqsubq_s16(step2[14], step2[13]);
  step1[14] = vqaddq_s16(step2[14], step2[13]);
  step1[15] = vqaddq_s16(step2[15], step2[12]);

  // Stage 6.
  btf_16_lane_0_1_neon(step1[13], step1[10], c0, &step2[13], &step2[10]);
  btf_16_lane_0_1_neon(step1[12], step1[11], c0, &step2[12], &step2[11]);

  step2[0] = vqaddq_s16(step1[0], step1[7]);
  step2[1] = vqaddq_s16(step1[1], step1[6]);
  step2[2] = vqaddq_s16(step1[2], step1[5]);
  step2[3] = vqaddq_s16(step1[3], step1[4]);
  step2[4] = vqsubq_s16(step1[3], step1[4]);
  step2[5] = vqsubq_s16(step1[2], step1[5]);
  step2[6] = vqsubq_s16(step1[1], step1[6]);
  step2[7] = vqsubq_s16(step1[0], step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // Stage 7: outputs mirrored around the center.
  out[0] = vqaddq_s16(step2[0], step2[15]);
  out[1] = vqaddq_s16(step2[1], step2[14]);
  out[2] = vqaddq_s16(step2[2], step2[13]);
  out[3] = vqaddq_s16(step2[3], step2[12]);
  out[4] = vqaddq_s16(step2[4], step2[11]);
  out[5] = vqaddq_s16(step2[5], step2[10]);
  out[6] = vqaddq_s16(step2[6], step2[9]);
  out[7] = vqaddq_s16(step2[7], step2[8]);
  out[8] = vqsubq_s16(step2[7], step2[8]);
  out[9] = vqsubq_s16(step2[6], step2[9]);
  out[10] = vqsubq_s16(step2[5], step2[10]);
  out[11] = vqsubq_s16(step2[4], step2[11]);
  out[12] = vqsubq_s16(step2[3], step2[12]);
  out[13] = vqsubq_s16(step2[2], step2[13]);
  out[14] = vqsubq_s16(step2[1], step2[14]);
  out[15] = vqsubq_s16(step2[0], step2[15]);
}
779
// 16-point inverse ADST applied to eight columns of 16-bit coefficients.
static inline void iadst16_neon(int16x8_t *const in, int16x8_t *out,
                                int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);

  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62],
                                      (int16_t)cospi[10], (int16_t)cospi[54]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46],
                                      (int16_t)cospi[26], (int16_t)cospi[38]);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[34], (int16_t)cospi[30],
                                      (int16_t)cospi[42], (int16_t)cospi[22]);
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[50], (int16_t)cospi[14],
                                      (int16_t)cospi[58], (int16_t)cospi[6]);
  const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  int16x8_t x[16];
  int16x8_t t[14];
  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
  int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;

  // Stage 1: input permutation.
  x[0] = in[15];
  x[1] = in[0];
  x[2] = in[13];
  x[3] = in[2];
  x[4] = in[11];
  x[5] = in[4];
  x[6] = in[9];
  x[7] = in[6];
  x[8] = in[7];
  x[9] = in[8];
  x[10] = in[5];
  x[11] = in[10];
  x[12] = in[3];
  x[13] = in[12];
  x[14] = in[1];
  x[15] = in[14];

  // Stage 2: first butterfly layer.
  btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
  btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
  btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
  btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);
  btf_16_lane_0_1_neon(x[8], x[9], c2, &s8, &s9);
  btf_16_lane_2_3_neon(x[10], x[11], c2, &s10, &s11);
  btf_16_lane_0_1_neon(x[12], x[13], c3, &s12, &s13);
  btf_16_lane_2_3_neon(x[14], x[15], c3, &s14, &s15);

  // Stage 3: saturating add/sub across the halves.
  x[0] = vqaddq_s16(s0, s8);
  x[1] = vqaddq_s16(s1, s9);
  x[2] = vqaddq_s16(s2, s10);
  x[3] = vqaddq_s16(s3, s11);
  x[4] = vqaddq_s16(s4, s12);
  x[5] = vqaddq_s16(s5, s13);
  x[6] = vqaddq_s16(s6, s14);
  x[7] = vqaddq_s16(s7, s15);
  x[8] = vqsubq_s16(s0, s8);
  x[9] = vqsubq_s16(s1, s9);
  x[10] = vqsubq_s16(s2, s10);
  x[11] = vqsubq_s16(s3, s11);
  x[12] = vqsubq_s16(s4, s12);
  x[13] = vqsubq_s16(s5, s13);
  x[14] = vqsubq_s16(s6, s14);
  x[15] = vqsubq_s16(s7, s15);

  // Stage 4: butterflies on the upper half only.
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  t[4] = x[4];
  t[5] = x[5];
  t[6] = x[6];
  t[7] = x[7];
  btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
  btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11);
  btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12);
  btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14);

  // Stage 5: saturating add/sub.
  x[0] = vqaddq_s16(t[0], t[4]);
  x[1] = vqaddq_s16(t[1], t[5]);
  x[2] = vqaddq_s16(t[2], t[6]);
  x[3] = vqaddq_s16(t[3], t[7]);
  x[4] = vqsubq_s16(t[0], t[4]);
  x[5] = vqsubq_s16(t[1], t[5]);
  x[6] = vqsubq_s16(t[2], t[6]);
  x[7] = vqsubq_s16(t[3], t[7]);
  x[8] = vqaddq_s16(s8, s12);
  x[9] = vqaddq_s16(s9, s13);
  x[10] = vqaddq_s16(s10, s14);
  x[11] = vqaddq_s16(s11, s15);
  x[12] = vqsubq_s16(s8, s12);
  x[13] = vqsubq_s16(s9, s13);
  x[14] = vqsubq_s16(s10, s14);
  x[15] = vqsubq_s16(s11, s15);

  // Stage 6: butterflies on every second quad.
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  btf_16_lane_2_3_neon(x[4], x[5], c5, &s4, &s5);
  btf_16_lane_3_2_neon(x[7], x[6], c5, &s7, &s6);
  t[8] = x[8];
  t[9] = x[9];
  t[10] = x[10];
  t[11] = x[11];
  btf_16_lane_2_3_neon(x[12], x[13], c5, &s12, &s13);
  btf_16_lane_3_2_neon(x[15], x[14], c5, &s15, &s14);

  // Stage 7: saturating add/sub.
  x[0] = vqaddq_s16(t[0], t[2]);
  x[1] = vqaddq_s16(t[1], t[3]);
  x[2] = vqsubq_s16(t[0], t[2]);
  x[3] = vqsubq_s16(t[1], t[3]);
  x[4] = vqaddq_s16(s4, s6);
  x[5] = vqaddq_s16(s5, s7);
  x[6] = vqsubq_s16(s4, s6);
  x[7] = vqsubq_s16(s5, s7);
  x[8] = vqaddq_s16(t[8], t[10]);
  x[9] = vqaddq_s16(t[9], t[11]);
  x[10] = vqsubq_s16(t[8], t[10]);
  x[11] = vqsubq_s16(t[9], t[11]);
  x[12] = vqaddq_s16(s12, s14);
  x[13] = vqaddq_s16(s13, s15);
  x[14] = vqsubq_s16(s12, s14);
  x[15] = vqsubq_s16(s13, s15);

  // Stage 8: half butterflies with cospi[32].
  btf_16_half_neon(x + 2, c5);
  btf_16_half_neon(x + 6, c5);
  btf_16_half_neon(x + 10, c5);
  btf_16_half_neon(x + 14, c5);

  // Stage 9: output permutation with alternating sign flips.
  out[0] = x[0];
  out[1] = vqnegq_s16(x[8]);
  out[2] = x[12];
  out[3] = vqnegq_s16(x[4]);
  out[4] = x[6];
  out[5] = vqnegq_s16(x[14]);
  out[6] = x[10];
  out[7] = vqnegq_s16(x[2]);
  out[8] = x[3];
  out[9] = vqnegq_s16(x[11]);
  out[10] = x[15];
  out[11] = vqnegq_s16(x[7]);
  out[12] = x[5];
  out[13] = vqnegq_s16(x[13]);
  out[14] = x[9];
  out[15] = vqnegq_s16(x[1]);
}
936
// 16-point inverse ADST specialized for the case where only in[0] is
// non-zero. Only four butterfly rotations are live; every other stage of
// the general iadst16 merely fans the surviving value pairs out to more
// lane positions, which is done directly here.
static inline void iadst16_low1_neon(int16x8_t *const in, int16x8_t *out,
                                     int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  int16x8_t x[16];
  int16x8_t a0, a1;  // live pair for lanes 0..7 after stage 2
  int16x8_t b0, b1;  // live pair for lanes 8..15 after stage 4
  int16x8_t a4, a5;  // stage-6 rotation feeding lanes 4..7
  int16x8_t b4, b5;  // stage-6 rotation feeding lanes 12..15

  // Stages 1-2: rotate the single input by (cospi[62], -cospi[2]).
  btf_16_neon(in[0], cospi[62], -cospi[2], &a0, &a1);

  // Stages 3-4: the (a0, a1) pair also seeds lanes 8/9 via a c0 rotation.
  btf_16_lane_0_1_neon(a0, a1, c0, &b0, &b1);

  // Stages 5-6: each half spawns a (cospi[16], cospi[48]) rotation.
  btf_16_lane_2_3_neon(a0, a1, c1, &a4, &a5);
  btf_16_lane_2_3_neon(b0, b1, c1, &b4, &b5);

  // Stage 7: fan the four live pairs out to all 16 positions.
  x[0] = x[2] = a0;
  x[1] = x[3] = a1;
  x[4] = x[6] = a4;
  x[5] = x[7] = a5;
  x[8] = x[10] = b0;
  x[9] = x[11] = b1;
  x[12] = x[14] = b4;
  x[13] = x[15] = b5;

  // Stage 8: in-place half butterflies on each odd/even pair block.
  btf_16_half_neon(x + 2, c1);
  btf_16_half_neon(x + 6, c1);
  btf_16_half_neon(x + 10, c1);
  btf_16_half_neon(x + 14, c1);

  // Stage 9: output permutation with saturating negation of alternate lanes.
  out[0] = x[0];
  out[1] = vqnegq_s16(x[8]);
  out[2] = x[12];
  out[3] = vqnegq_s16(x[4]);
  out[4] = x[6];
  out[5] = vqnegq_s16(x[14]);
  out[6] = x[10];
  out[7] = vqnegq_s16(x[2]);
  out[8] = x[3];
  out[9] = vqnegq_s16(x[11]);
  out[10] = x[15];
  out[11] = vqnegq_s16(x[7]);
  out[12] = x[5];
  out[13] = vqnegq_s16(x[13]);
  out[14] = x[9];
  out[15] = vqnegq_s16(x[1]);
}
1027
// 16-point inverse ADST for the case where at most the first 8 input rows
// (in[0]..in[6] by even index plus in[1]..in[7] by odd index) are non-zero.
// Each int16x8_t carries 8 samples of one row; results go to out[0..15].
// cos_bit selects the cosine table used for all rotations. All adds and
// subtracts are saturating (vqaddq/vqsubq) to stay in 16-bit range.
static inline void iadst16_low8_neon(int16x8_t *const in, int16x8_t *out,
                                     int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);

  // c0 feeds the stage-4 lane butterflies, c1 the stage-6 and stage-8 ones.
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  int16x8_t x[16];
  int16x8_t t[14];
  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
  int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;

  // Stage 1: scatter the 8 live inputs to their iadst lane positions.
  // x[] entries not written here are only ever written (never read) below.
  x[1] = in[0];
  x[3] = in[2];
  x[5] = in[4];
  x[7] = in[6];
  x[8] = in[7];
  x[10] = in[5];
  x[12] = in[3];
  x[14] = in[1];

  // Stage 2: per-input rotations.
  btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1);
  btf_16_neon(x[3], cospi[54], -cospi[10], &s2, &s3);
  btf_16_neon(x[5], cospi[46], -cospi[18], &s4, &s5);
  btf_16_neon(x[7], cospi[38], -cospi[26], &s6, &s7);

  btf_16_neon(x[8], cospi[34], cospi[30], &s8, &s9);
  btf_16_neon(x[10], cospi[42], cospi[22], &s10, &s11);
  btf_16_neon(x[12], cospi[50], cospi[14], &s12, &s13);
  btf_16_neon(x[14], cospi[58], cospi[6], &s14, &s15);

  // Stage 3: butterfly the low half (s0..s7) against the high half (s8..s15).
  x[0] = vqaddq_s16(s0, s8);
  x[1] = vqaddq_s16(s1, s9);
  x[2] = vqaddq_s16(s2, s10);
  x[3] = vqaddq_s16(s3, s11);
  x[4] = vqaddq_s16(s4, s12);
  x[5] = vqaddq_s16(s5, s13);
  x[6] = vqaddq_s16(s6, s14);
  x[7] = vqaddq_s16(s7, s15);
  x[8] = vqsubq_s16(s0, s8);
  x[9] = vqsubq_s16(s1, s9);
  x[10] = vqsubq_s16(s2, s10);
  x[11] = vqsubq_s16(s3, s11);
  x[12] = vqsubq_s16(s4, s12);
  x[13] = vqsubq_s16(s5, s13);
  x[14] = vqsubq_s16(s6, s14);
  x[15] = vqsubq_s16(s7, s15);

  // Stage 4: lanes 0..7 pass through (saved into t[]); lanes 8..15 are
  // rotated by c0. The swapped-argument (lane_1_0/lane_3_2) variants give
  // the sign/ordering the iadst lattice requires.
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  t[4] = x[4];
  t[5] = x[5];
  t[6] = x[6];
  t[7] = x[7];
  btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9);
  btf_16_lane_2_3_neon(x[10], x[11], c0, &s10, &s11);
  btf_16_lane_1_0_neon(x[13], x[12], c0, &s13, &s12);
  btf_16_lane_3_2_neon(x[15], x[14], c0, &s15, &s14);

  // Stage 5: butterfly each half against its upper quarter.
  x[0] = vqaddq_s16(t[0], t[4]);
  x[1] = vqaddq_s16(t[1], t[5]);
  x[2] = vqaddq_s16(t[2], t[6]);
  x[3] = vqaddq_s16(t[3], t[7]);
  x[4] = vqsubq_s16(t[0], t[4]);
  x[5] = vqsubq_s16(t[1], t[5]);
  x[6] = vqsubq_s16(t[2], t[6]);
  x[7] = vqsubq_s16(t[3], t[7]);
  x[8] = vqaddq_s16(s8, s12);
  x[9] = vqaddq_s16(s9, s13);
  x[10] = vqaddq_s16(s10, s14);
  x[11] = vqaddq_s16(s11, s15);
  x[12] = vqsubq_s16(s8, s12);
  x[13] = vqsubq_s16(s9, s13);
  x[14] = vqsubq_s16(s10, s14);
  x[15] = vqsubq_s16(s11, s15);

  // stage 6: quarters 0 and 2 pass through; quarters 1 and 3 are rotated
  // by (cospi[16], cospi[48]) from c1.
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5);
  btf_16_lane_3_2_neon(x[7], x[6], c1, &s7, &s6);
  t[8] = x[8];
  t[9] = x[9];
  t[10] = x[10];
  t[11] = x[11];
  btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13);
  btf_16_lane_3_2_neon(x[15], x[14], c1, &s15, &s14);

  // Stage 7: final add/sub butterflies within each quarter.
  x[0] = vqaddq_s16(t[0], t[2]);
  x[1] = vqaddq_s16(t[1], t[3]);
  x[2] = vqsubq_s16(t[0], t[2]);
  x[3] = vqsubq_s16(t[1], t[3]);
  x[4] = vqaddq_s16(s4, s6);
  x[5] = vqaddq_s16(s5, s7);
  x[6] = vqsubq_s16(s4, s6);
  x[7] = vqsubq_s16(s5, s7);
  x[8] = vqaddq_s16(t[8], t[10]);
  x[9] = vqaddq_s16(t[9], t[11]);
  x[10] = vqsubq_s16(t[8], t[10]);
  x[11] = vqsubq_s16(t[9], t[11]);
  x[12] = vqaddq_s16(s12, s14);
  x[13] = vqaddq_s16(s13, s15);
  x[14] = vqsubq_s16(s12, s14);
  x[15] = vqsubq_s16(s13, s15);

  // Stage 8: in-place half butterflies on x[2..3], x[6..7], x[10..11],
  // x[14..15].
  btf_16_half_neon(x + 2, c1);
  btf_16_half_neon(x + 6, c1);
  btf_16_half_neon(x + 10, c1);
  btf_16_half_neon(x + 14, c1);

  // Stage 9: output permutation with saturating negation of odd outputs.
  out[0] = x[0];
  out[1] = vqnegq_s16(x[8]);
  out[2] = x[12];
  out[3] = vqnegq_s16(x[4]);
  out[4] = x[6];
  out[5] = vqnegq_s16(x[14]);
  out[6] = x[10];
  out[7] = vqnegq_s16(x[2]);
  out[8] = x[3];
  out[9] = vqnegq_s16(x[11]);
  out[10] = x[15];
  out[11] = vqnegq_s16(x[7]);
  out[12] = x[5];
  out[13] = vqnegq_s16(x[13]);
  out[14] = x[9];
  out[15] = vqnegq_s16(x[1]);
}
1169
// Full 32-point inverse DCT over 8 samples at a time (each int16x8_t is one
// row of 8 samples; all 32 rows of `in` are read, all 32 rows of `out` are
// written). Implements the 9-stage AV1 idct32 lattice with saturating
// 16-bit arithmetic. c0..c7 hold positive cosine pairs, c8/c9 their
// negations for the butterflies that need a negated coefficient.
static inline void idct32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[32], step2[32];

  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62],
                                      (int16_t)cospi[34], (int16_t)cospi[30]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46],
                                      (int16_t)cospi[50], (int16_t)cospi[14]);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[10], (int16_t)cospi[54],
                                      (int16_t)cospi[42], (int16_t)cospi[22]);
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[26], (int16_t)cospi[38],
                                      (int16_t)cospi[58], (int16_t)cospi[6]);
  const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
                                      (int16_t)cospi[36], (int16_t)cospi[28]);
  const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
                                      (int16_t)cospi[52], (int16_t)cospi[12]);
  const int16x4_t c6 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c7 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);
  const int16x4_t c8 =
      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
  const int16x4_t c9 =
      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));

  // stage 2
  // Rotate the odd-indexed inputs into step2[16..31]; the even-indexed
  // inputs are reordered (bit-reversed positions) into step2[0..15].

  btf_16_lane_0_1_neon(in[1], in[31], c0, &step2[31], &step2[16]);
  btf_16_lane_2_3_neon(in[17], in[15], c0, &step2[30], &step2[17]);
  btf_16_lane_0_1_neon(in[9], in[23], c1, &step2[29], &step2[18]);
  btf_16_lane_2_3_neon(in[25], in[7], c1, &step2[28], &step2[19]);
  btf_16_lane_0_1_neon(in[5], in[27], c2, &step2[27], &step2[20]);
  btf_16_lane_2_3_neon(in[21], in[11], c2, &step2[26], &step2[21]);
  btf_16_lane_0_1_neon(in[13], in[19], c3, &step2[25], &step2[22]);
  btf_16_lane_2_3_neon(in[29], in[3], c3, &step2[24], &step2[23]);

  step2[0] = in[0];
  step2[1] = in[16];
  step2[2] = in[8];
  step2[3] = in[24];
  step2[4] = in[4];
  step2[5] = in[20];
  step2[6] = in[12];
  step2[7] = in[28];
  step2[8] = in[2];
  step2[9] = in[18];
  step2[10] = in[10];
  step2[11] = in[26];
  step2[12] = in[6];
  step2[13] = in[22];
  step2[14] = in[14];
  step2[15] = in[30];

  // stage 3

  btf_16_lane_0_1_neon(step2[8], step2[15], c4, &step1[15], &step1[8]);
  btf_16_lane_2_3_neon(step2[9], step2[14], c4, &step1[14], &step1[9]);
  btf_16_lane_0_1_neon(step2[10], step2[13], c5, &step1[13], &step1[10]);
  btf_16_lane_2_3_neon(step2[11], step2[12], c5, &step1[12], &step1[11]);

  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];
  step1[4] = step2[4];
  step1[5] = step2[5];
  step1[6] = step2[6];
  step1[7] = step2[7];

  step1[16] = vqaddq_s16(step2[16], step2[17]);
  step1[17] = vqsubq_s16(step2[16], step2[17]);
  step1[18] = vqsubq_s16(step2[19], step2[18]);
  step1[19] = vqaddq_s16(step2[19], step2[18]);
  step1[20] = vqaddq_s16(step2[20], step2[21]);
  step1[21] = vqsubq_s16(step2[20], step2[21]);
  step1[22] = vqsubq_s16(step2[23], step2[22]);
  step1[23] = vqaddq_s16(step2[23], step2[22]);
  step1[24] = vqaddq_s16(step2[24], step2[25]);
  step1[25] = vqsubq_s16(step2[24], step2[25]);
  step1[26] = vqsubq_s16(step2[27], step2[26]);
  step1[27] = vqaddq_s16(step2[27], step2[26]);
  step1[28] = vqaddq_s16(step2[28], step2[29]);
  step1[29] = vqsubq_s16(step2[28], step2[29]);
  step1[30] = vqsubq_s16(step2[31], step2[30]);
  step1[31] = vqaddq_s16(step2[31], step2[30]);

  // stage 4
  // c8 (negated c6) supplies the rotations that need -cospi coefficients.

  btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]);
  btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]);
  btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]);
  btf_16_lane_1_0_neon(step1[18], step1[29], c8, &step2[18], &step2[29]);
  btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]);
  btf_16_lane_3_2_neon(step1[22], step1[25], c8, &step2[22], &step2[25]);

  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[8] = vqaddq_s16(step1[8], step1[9]);
  step2[9] = vqsubq_s16(step1[8], step1[9]);
  step2[10] = vqsubq_s16(step1[11], step1[10]);
  step2[11] = vqaddq_s16(step1[11], step1[10]);
  step2[12] = vqaddq_s16(step1[12], step1[13]);
  step2[13] = vqsubq_s16(step1[12], step1[13]);
  step2[14] = vqsubq_s16(step1[15], step1[14]);
  step2[15] = vqaddq_s16(step1[15], step1[14]);
  step2[16] = step1[16];
  step2[19] = step1[19];
  step2[20] = step1[20];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[27] = step1[27];
  step2[28] = step1[28];
  step2[31] = step1[31];

  // stage 5

  btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]);
  btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]);
  btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]);
  btf_16_lane_3_2_neon(step2[10], step2[13], c9, &step1[10], &step1[13]);

  step1[4] = vqaddq_s16(step2[4], step2[5]);
  step1[5] = vqsubq_s16(step2[4], step2[5]);
  step1[6] = vqsubq_s16(step2[7], step2[6]);
  step1[7] = vqaddq_s16(step2[7], step2[6]);
  step1[8] = step2[8];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[15] = step2[15];
  step1[16] = vqaddq_s16(step2[16], step2[19]);
  step1[17] = vqaddq_s16(step2[17], step2[18]);
  step1[18] = vqsubq_s16(step2[17], step2[18]);
  step1[19] = vqsubq_s16(step2[16], step2[19]);
  step1[20] = vqsubq_s16(step2[23], step2[20]);
  step1[21] = vqsubq_s16(step2[22], step2[21]);
  step1[22] = vqaddq_s16(step2[22], step2[21]);
  step1[23] = vqaddq_s16(step2[23], step2[20]);
  step1[24] = vqaddq_s16(step2[24], step2[27]);
  step1[25] = vqaddq_s16(step2[25], step2[26]);
  step1[26] = vqsubq_s16(step2[25], step2[26]);
  step1[27] = vqsubq_s16(step2[24], step2[27]);
  step1[28] = vqsubq_s16(step2[31], step2[28]);
  step1[29] = vqsubq_s16(step2[30], step2[29]);
  step1[30] = vqaddq_s16(step2[30], step2[29]);
  step1[31] = vqaddq_s16(step2[31], step2[28]);

  // stage 6

  btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]);
  btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]);
  btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]);
  btf_16_lane_3_2_neon(step1[20], step1[27], c9, &step2[20], &step2[27]);
  btf_16_lane_3_2_neon(step1[21], step1[26], c9, &step2[21], &step2[26]);

  step2[0] = vqaddq_s16(step1[0], step1[3]);
  step2[1] = vqaddq_s16(step1[1], step1[2]);
  step2[2] = vqsubq_s16(step1[1], step1[2]);
  step2[3] = vqsubq_s16(step1[0], step1[3]);
  step2[4] = step1[4];
  step2[7] = step1[7];
  step2[8] = vqaddq_s16(step1[8], step1[11]);
  step2[9] = vqaddq_s16(step1[9], step1[10]);
  step2[10] = vqsubq_s16(step1[9], step1[10]);
  step2[11] = vqsubq_s16(step1[8], step1[11]);
  step2[12] = vqsubq_s16(step1[15], step1[12]);
  step2[13] = vqsubq_s16(step1[14], step1[13]);
  step2[14] = vqaddq_s16(step1[14], step1[13]);
  step2[15] = vqaddq_s16(step1[15], step1[12]);
  step2[16] = step1[16];
  step2[17] = step1[17];
  step2[22] = step1[22];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[25] = step1[25];
  step2[30] = step1[30];
  step2[31] = step1[31];

  // stage 7

  btf_16_lane_0_1_neon(step2[13], step2[10], c7, &step1[13], &step1[10]);
  btf_16_lane_0_1_neon(step2[12], step2[11], c7, &step1[12], &step1[11]);

  step1[0] = vqaddq_s16(step2[0], step2[7]);
  step1[1] = vqaddq_s16(step2[1], step2[6]);
  step1[2] = vqaddq_s16(step2[2], step2[5]);
  step1[3] = vqaddq_s16(step2[3], step2[4]);
  step1[4] = vqsubq_s16(step2[3], step2[4]);
  step1[5] = vqsubq_s16(step2[2], step2[5]);
  step1[6] = vqsubq_s16(step2[1], step2[6]);
  step1[7] = vqsubq_s16(step2[0], step2[7]);
  step1[8] = step2[8];
  step1[9] = step2[9];
  step1[14] = step2[14];
  step1[15] = step2[15];
  step1[16] = vqaddq_s16(step2[16], step2[23]);
  step1[17] = vqaddq_s16(step2[17], step2[22]);
  step1[18] = vqaddq_s16(step2[18], step2[21]);
  step1[19] = vqaddq_s16(step2[19], step2[20]);
  step1[20] = vqsubq_s16(step2[19], step2[20]);
  step1[21] = vqsubq_s16(step2[18], step2[21]);
  step1[22] = vqsubq_s16(step2[17], step2[22]);
  step1[23] = vqsubq_s16(step2[16], step2[23]);
  step1[24] = vqsubq_s16(step2[31], step2[24]);
  step1[25] = vqsubq_s16(step2[30], step2[25]);
  step1[26] = vqsubq_s16(step2[29], step2[26]);
  step1[27] = vqsubq_s16(step2[28], step2[27]);
  step1[28] = vqaddq_s16(step2[27], step2[28]);
  step1[29] = vqaddq_s16(step2[26], step2[29]);
  step1[30] = vqaddq_s16(step2[25], step2[30]);
  step1[31] = vqaddq_s16(step2[24], step2[31]);

  // stage 8

  btf_16_lane_0_1_neon(step1[27], step1[20], c7, &step2[27], &step2[20]);
  btf_16_lane_0_1_neon(step1[26], step1[21], c7, &step2[26], &step2[21]);
  btf_16_lane_0_1_neon(step1[25], step1[22], c7, &step2[25], &step2[22]);
  btf_16_lane_0_1_neon(step1[24], step1[23], c7, &step2[24], &step2[23]);

  step2[0] = vqaddq_s16(step1[0], step1[15]);
  step2[1] = vqaddq_s16(step1[1], step1[14]);
  step2[2] = vqaddq_s16(step1[2], step1[13]);
  step2[3] = vqaddq_s16(step1[3], step1[12]);
  step2[4] = vqaddq_s16(step1[4], step1[11]);
  step2[5] = vqaddq_s16(step1[5], step1[10]);
  step2[6] = vqaddq_s16(step1[6], step1[9]);
  step2[7] = vqaddq_s16(step1[7], step1[8]);
  step2[8] = vqsubq_s16(step1[7], step1[8]);
  step2[9] = vqsubq_s16(step1[6], step1[9]);
  step2[10] = vqsubq_s16(step1[5], step1[10]);
  step2[11] = vqsubq_s16(step1[4], step1[11]);
  step2[12] = vqsubq_s16(step1[3], step1[12]);
  step2[13] = vqsubq_s16(step1[2], step1[13]);
  step2[14] = vqsubq_s16(step1[1], step1[14]);
  step2[15] = vqsubq_s16(step1[0], step1[15]);
  step2[16] = step1[16];
  step2[17] = step1[17];
  step2[18] = step1[18];
  step2[19] = step1[19];
  step2[28] = step1[28];
  step2[29] = step1[29];
  step2[30] = step1[30];
  step2[31] = step1[31];

  // stage 9
  // Final butterfly: the first half mirrors against the reversed second half.

  out[0] = vqaddq_s16(step2[0], step2[31]);
  out[1] = vqaddq_s16(step2[1], step2[30]);
  out[2] = vqaddq_s16(step2[2], step2[29]);
  out[3] = vqaddq_s16(step2[3], step2[28]);
  out[4] = vqaddq_s16(step2[4], step2[27]);
  out[5] = vqaddq_s16(step2[5], step2[26]);
  out[6] = vqaddq_s16(step2[6], step2[25]);
  out[7] = vqaddq_s16(step2[7], step2[24]);
  out[8] = vqaddq_s16(step2[8], step2[23]);
  out[9] = vqaddq_s16(step2[9], step2[22]);
  out[10] = vqaddq_s16(step2[10], step2[21]);
  out[11] = vqaddq_s16(step2[11], step2[20]);
  out[12] = vqaddq_s16(step2[12], step2[19]);
  out[13] = vqaddq_s16(step2[13], step2[18]);
  out[14] = vqaddq_s16(step2[14], step2[17]);
  out[15] = vqaddq_s16(step2[15], step2[16]);
  out[16] = vqsubq_s16(step2[15], step2[16]);
  out[17] = vqsubq_s16(step2[14], step2[17]);
  out[18] = vqsubq_s16(step2[13], step2[18]);
  out[19] = vqsubq_s16(step2[12], step2[19]);
  out[20] = vqsubq_s16(step2[11], step2[20]);
  out[21] = vqsubq_s16(step2[10], step2[21]);
  out[22] = vqsubq_s16(step2[9], step2[22]);
  out[23] = vqsubq_s16(step2[8], step2[23]);
  out[24] = vqsubq_s16(step2[7], step2[24]);
  out[25] = vqsubq_s16(step2[6], step2[25]);
  out[26] = vqsubq_s16(step2[5], step2[26]);
  out[27] = vqsubq_s16(step2[4], step2[27]);
  out[28] = vqsubq_s16(step2[3], step2[28]);
  out[29] = vqsubq_s16(step2[2], step2[29]);
  out[30] = vqsubq_s16(step2[1], step2[30]);
  out[31] = vqsubq_s16(step2[0], step2[31]);
}
1452
// 32-point inverse DCT specialized for a DC-only input (only in[0] is
// non-zero). Stages 1-4 are pass-through; stage 5 reduces to one rounded
// multiply by cospi[32]; stages 6-9 simply replicate that value into every
// output row.
static inline void idct32_low1_neon(int16x8_t *in, int16x8_t *out,
                                    int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);

  // stage 5: widen, scale by cospi[32], and narrow back with rounding.
  const int32x4_t lo = vmull_n_s16(vget_low_s16(in[0]), cospi[32]);
  const int32x4_t hi = vmull_n_s16(vget_high_s16(in[0]), cospi[32]);
  const int16x8_t dc = vcombine_s16(vrshrn_n_s32(lo, INV_COS_BIT),
                                    vrshrn_n_s32(hi, INV_COS_BIT));

  // stages 6-9: every output equals the scaled DC row.
  for (int i = 0; i < 32; i++) {
    out[i] = dc;
  }
}
1508
// 32-point inverse DCT for the case where at most the first 8 input rows
// are non-zero. Follows the same 9-stage lattice as idct32_neon, with the
// early stages specialized: butterflies whose second operand is known zero
// collapse to pass-throughs / value duplication.
//
// Fixes vs. the previous revision:
//  - cospi[48] in c1 now carries an explicit (int16_t) cast like every
//    other set_s16x4_neon argument in this file (avoids an implicit
//    int32_t -> int16_t conversion warning under -Wconversion and matches
//    idct32_neon / idct32_low16_neon).
//  - t32[] shrunk from 16 to 2 entries; only t32[0] and t32[1] are used.
static inline void idct32_low8_neon(int16x8_t *in, int16x8_t *out,
                                    int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[32], step2[32];
  int32x4_t t32[2];
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);
  const int16x4_t c2 =
      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
  const int16x4_t c3 =
      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
  // stage 1
  // stage 2
  // Only 8 input rows are live; scatter the even ones and rotate the odd
  // ones into the step2[16..31] range.

  step2[0] = in[0];
  step2[4] = in[4];
  step2[8] = in[2];
  step2[12] = in[6];

  btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
  btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
  btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
  btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);

  // stage 3
  step1[0] = step2[0];
  step1[4] = step2[4];

  btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
  btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);

  // With the zero partners, the stage-3 butterflies degenerate to copies.
  step1[16] = step2[16];
  step1[17] = step2[16];
  step1[18] = step2[19];
  step1[19] = step2[19];
  step1[20] = step2[20];
  step1[21] = step2[20];
  step1[22] = step2[23];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[24];
  step1[26] = step2[27];
  step1[27] = step2[27];
  step1[28] = step2[28];
  step1[29] = step2[28];
  step1[30] = step2[31];
  step1[31] = step2[31];

  // stage 4
  // c2 (negated c0) supplies the rotations that need -cospi coefficients.

  btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
  btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
  btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]);
  btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
  btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]);

  step2[0] = step1[0];
  step2[8] = step1[8];
  step2[9] = step1[8];
  step2[10] = step1[11];
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[13] = step1[12];
  step2[14] = step1[15];
  step2[15] = step1[15];
  step2[16] = step1[16];
  step2[19] = step1[19];
  step2[20] = step1[20];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[27] = step1[27];
  step2[28] = step1[28];
  step2[31] = step1[31];

  // stage 5
  // DC path: widen, scale by cospi[32], narrow back with rounding.

  t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
  t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);
  step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
                          vrshrn_n_s32(t32[1], INV_COS_BIT));

  btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
  btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]);

  step1[4] = step2[4];
  step1[5] = step2[4];
  step1[6] = step2[7];
  step1[7] = step2[7];
  step1[8] = step2[8];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[15] = step2[15];
  step1[16] = vqaddq_s16(step2[16], step2[19]);
  step1[17] = vqaddq_s16(step2[17], step2[18]);
  step1[18] = vqsubq_s16(step2[17], step2[18]);
  step1[19] = vqsubq_s16(step2[16], step2[19]);
  step1[20] = vqsubq_s16(step2[23], step2[20]);
  step1[21] = vqsubq_s16(step2[22], step2[21]);
  step1[22] = vqaddq_s16(step2[22], step2[21]);
  step1[23] = vqaddq_s16(step2[23], step2[20]);
  step1[24] = vqaddq_s16(step2[24], step2[27]);
  step1[25] = vqaddq_s16(step2[25], step2[26]);
  step1[26] = vqsubq_s16(step2[25], step2[26]);
  step1[27] = vqsubq_s16(step2[24], step2[27]);
  step1[28] = vqsubq_s16(step2[31], step2[28]);
  step1[29] = vqsubq_s16(step2[30], step2[29]);
  step1[30] = vqaddq_s16(step2[30], step2[29]);
  step1[31] = vqaddq_s16(step2[31], step2[28]);

  // stage 6

  btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
  btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
  btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
  btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]);
  btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]);

  // The DC value fans out over the first four positions.
  step2[0] = step1[0];
  step2[1] = step1[0];
  step2[2] = step1[0];
  step2[3] = step1[0];
  step2[4] = step1[4];
  step2[7] = step1[7];
  step2[8] = vqaddq_s16(step1[8], step1[11]);
  step2[9] = vqaddq_s16(step1[9], step1[10]);
  step2[10] = vqsubq_s16(step1[9], step1[10]);
  step2[11] = vqsubq_s16(step1[8], step1[11]);
  step2[12] = vqsubq_s16(step1[15], step1[12]);
  step2[13] = vqsubq_s16(step1[14], step1[13]);
  step2[14] = vqaddq_s16(step1[14], step1[13]);
  step2[15] = vqaddq_s16(step1[15], step1[12]);
  step2[16] = step1[16];
  step2[17] = step1[17];
  step2[22] = step1[22];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[25] = step1[25];
  step2[30] = step1[30];
  step2[31] = step1[31];

  // stage 7

  btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
  btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);

  step1[0] = vqaddq_s16(step2[0], step2[7]);
  step1[1] = vqaddq_s16(step2[1], step2[6]);
  step1[2] = vqaddq_s16(step2[2], step2[5]);
  step1[3] = vqaddq_s16(step2[3], step2[4]);
  step1[4] = vqsubq_s16(step2[3], step2[4]);
  step1[5] = vqsubq_s16(step2[2], step2[5]);
  step1[6] = vqsubq_s16(step2[1], step2[6]);
  step1[7] = vqsubq_s16(step2[0], step2[7]);
  step1[8] = step2[8];
  step1[9] = step2[9];
  step1[14] = step2[14];
  step1[15] = step2[15];
  step1[16] = vqaddq_s16(step2[16], step2[23]);
  step1[17] = vqaddq_s16(step2[17], step2[22]);
  step1[18] = vqaddq_s16(step2[18], step2[21]);
  step1[19] = vqaddq_s16(step2[19], step2[20]);
  step1[20] = vqsubq_s16(step2[19], step2[20]);
  step1[21] = vqsubq_s16(step2[18], step2[21]);
  step1[22] = vqsubq_s16(step2[17], step2[22]);
  step1[23] = vqsubq_s16(step2[16], step2[23]);
  step1[24] = vqsubq_s16(step2[31], step2[24]);
  step1[25] = vqsubq_s16(step2[30], step2[25]);
  step1[26] = vqsubq_s16(step2[29], step2[26]);
  step1[27] = vqsubq_s16(step2[28], step2[27]);
  step1[28] = vqaddq_s16(step2[27], step2[28]);
  step1[29] = vqaddq_s16(step2[26], step2[29]);
  step1[30] = vqaddq_s16(step2[25], step2[30]);
  step1[31] = vqaddq_s16(step2[24], step2[31]);

  // stage 8

  btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
  btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
  btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
  btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);

  step2[0] = vqaddq_s16(step1[0], step1[15]);
  step2[1] = vqaddq_s16(step1[1], step1[14]);
  step2[2] = vqaddq_s16(step1[2], step1[13]);
  step2[3] = vqaddq_s16(step1[3], step1[12]);
  step2[4] = vqaddq_s16(step1[4], step1[11]);
  step2[5] = vqaddq_s16(step1[5], step1[10]);
  step2[6] = vqaddq_s16(step1[6], step1[9]);
  step2[7] = vqaddq_s16(step1[7], step1[8]);
  step2[8] = vqsubq_s16(step1[7], step1[8]);
  step2[9] = vqsubq_s16(step1[6], step1[9]);
  step2[10] = vqsubq_s16(step1[5], step1[10]);
  step2[11] = vqsubq_s16(step1[4], step1[11]);
  step2[12] = vqsubq_s16(step1[3], step1[12]);
  step2[13] = vqsubq_s16(step1[2], step1[13]);
  step2[14] = vqsubq_s16(step1[1], step1[14]);
  step2[15] = vqsubq_s16(step1[0], step1[15]);
  step2[16] = step1[16];
  step2[17] = step1[17];
  step2[18] = step1[18];
  step2[19] = step1[19];
  step2[28] = step1[28];
  step2[29] = step1[29];
  step2[30] = step1[30];
  step2[31] = step1[31];

  // stage 9
  // Final butterfly: first half mirrors against the reversed second half.

  out[0] = vqaddq_s16(step2[0], step2[31]);
  out[1] = vqaddq_s16(step2[1], step2[30]);
  out[2] = vqaddq_s16(step2[2], step2[29]);
  out[3] = vqaddq_s16(step2[3], step2[28]);
  out[4] = vqaddq_s16(step2[4], step2[27]);
  out[5] = vqaddq_s16(step2[5], step2[26]);
  out[6] = vqaddq_s16(step2[6], step2[25]);
  out[7] = vqaddq_s16(step2[7], step2[24]);
  out[8] = vqaddq_s16(step2[8], step2[23]);
  out[9] = vqaddq_s16(step2[9], step2[22]);
  out[10] = vqaddq_s16(step2[10], step2[21]);
  out[11] = vqaddq_s16(step2[11], step2[20]);
  out[12] = vqaddq_s16(step2[12], step2[19]);
  out[13] = vqaddq_s16(step2[13], step2[18]);
  out[14] = vqaddq_s16(step2[14], step2[17]);
  out[15] = vqaddq_s16(step2[15], step2[16]);
  out[16] = vqsubq_s16(step2[15], step2[16]);
  out[17] = vqsubq_s16(step2[14], step2[17]);
  out[18] = vqsubq_s16(step2[13], step2[18]);
  out[19] = vqsubq_s16(step2[12], step2[19]);
  out[20] = vqsubq_s16(step2[11], step2[20]);
  out[21] = vqsubq_s16(step2[10], step2[21]);
  out[22] = vqsubq_s16(step2[9], step2[22]);
  out[23] = vqsubq_s16(step2[8], step2[23]);
  out[24] = vqsubq_s16(step2[7], step2[24]);
  out[25] = vqsubq_s16(step2[6], step2[25]);
  out[26] = vqsubq_s16(step2[5], step2[26]);
  out[27] = vqsubq_s16(step2[4], step2[27]);
  out[28] = vqsubq_s16(step2[3], step2[28]);
  out[29] = vqsubq_s16(step2[2], step2[29]);
  out[30] = vqsubq_s16(step2[1], step2[30]);
  out[31] = vqsubq_s16(step2[0], step2[31]);
}
1754
// 32-point inverse DCT specialised for the case where only the first 16
// input coefficients are non-zero: the body reads in[0]..in[15] only, so
// the full-range stage-2 butterflies collapse to single-input rotations
// (btf_16_neon with one source vector). Results are written to out[0..31].
// All adds/subtracts use saturating 16-bit ops (vqaddq_s16/vqsubq_s16).
static inline void idct32_low16_neon(int16x8_t *in, int16x8_t *out,
                                     int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[32], step2[32];
  int32x4_t t32[16];
  // Packed cosine constants for the lane-indexed butterflies below:
  // c0/c2 carry +/-(cospi 8,56,40,24), c1/c3 carry +/-(cospi 32,32,16,48).
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);
  const int16x4_t c2 =
      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
  const int16x4_t c3 =
      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));

  // stage 1
  // stage 2
  // Rotate the odd-indexed inputs into the step2[16..31] half. Each input
  // feeds one butterfly pair; the paired partner is zero in the low16 case.

  btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
  btf_16_neon(in[15], -cospi[34], cospi[30], &step2[17], &step2[30]);
  btf_16_neon(in[9], cospi[46], cospi[18], &step2[18], &step2[29]);
  btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
  btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
  btf_16_neon(in[11], -cospi[42], cospi[22], &step2[21], &step2[26]);
  btf_16_neon(in[13], cospi[38], cospi[26], &step2[22], &step2[25]);
  btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);

  // Even-indexed inputs pass straight through (bit-reversed placement);
  // the odd step2 slots in [0..15] stay implicitly zero and are never read.
  step2[0] = in[0];
  step2[2] = in[8];
  step2[4] = in[4];
  step2[6] = in[12];
  step2[8] = in[2];
  step2[10] = in[10];
  step2[12] = in[6];
  step2[14] = in[14];

  // stage 3
  // Single-input rotations for the [8..15] half.

  btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
  btf_16_neon(step2[14], -cospi[36], cospi[28], &step1[9], &step1[14]);
  btf_16_neon(step2[10], cospi[44], cospi[20], &step1[10], &step1[13]);
  btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);

  step1[0] = step2[0];
  step1[2] = step2[2];
  step1[4] = step2[4];
  step1[6] = step2[6];
  // Adjacent-pair butterflies on the [16..31] half.
  step1[16] = vqaddq_s16(step2[16], step2[17]);
  step1[17] = vqsubq_s16(step2[16], step2[17]);
  step1[18] = vqsubq_s16(step2[19], step2[18]);
  step1[19] = vqaddq_s16(step2[19], step2[18]);
  step1[20] = vqaddq_s16(step2[20], step2[21]);
  step1[21] = vqsubq_s16(step2[20], step2[21]);
  step1[22] = vqsubq_s16(step2[23], step2[22]);
  step1[23] = vqaddq_s16(step2[23], step2[22]);
  step1[24] = vqaddq_s16(step2[24], step2[25]);
  step1[25] = vqsubq_s16(step2[24], step2[25]);
  step1[26] = vqsubq_s16(step2[27], step2[26]);
  step1[27] = vqaddq_s16(step2[27], step2[26]);
  step1[28] = vqaddq_s16(step2[28], step2[29]);
  step1[29] = vqsubq_s16(step2[28], step2[29]);
  step1[30] = vqsubq_s16(step2[31], step2[30]);
  step1[31] = vqaddq_s16(step2[31], step2[30]);

  // stage 4
  // Two-input rotations use the precomputed lane constants (c0/c2).

  btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
  btf_16_neon(step1[6], -cospi[40], cospi[24], &step2[5], &step2[6]);
  btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
  btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]);
  btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
  btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]);

  step2[0] = step1[0];
  step2[2] = step1[2];
  step2[8] = vqaddq_s16(step1[8], step1[9]);
  step2[9] = vqsubq_s16(step1[8], step1[9]);
  step2[10] = vqsubq_s16(step1[11], step1[10]);
  step2[11] = vqaddq_s16(step1[11], step1[10]);
  step2[12] = vqaddq_s16(step1[12], step1[13]);
  step2[13] = vqsubq_s16(step1[12], step1[13]);
  step2[14] = vqsubq_s16(step1[15], step1[14]);
  step2[15] = vqaddq_s16(step1[15], step1[14]);
  step2[16] = step1[16];
  step2[19] = step1[19];
  step2[20] = step1[20];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[27] = step1[27];
  step2[28] = step1[28];
  step2[31] = step1[31];

  // stage 5

  // DC path: multiply by cospi[32] in 32-bit, then round-shift back to 16
  // bits (btf collapses to a single scale because step2[1] is zero here).
  t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
  t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);

  step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
                          vrshrn_n_s32(t32[1], INV_COS_BIT));

  btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]);
  btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
  btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]);

  step1[4] = vqaddq_s16(step2[4], step2[5]);
  step1[5] = vqsubq_s16(step2[4], step2[5]);
  step1[6] = vqsubq_s16(step2[7], step2[6]);
  step1[7] = vqaddq_s16(step2[7], step2[6]);
  step1[8] = step2[8];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[15] = step2[15];
  step1[16] = vqaddq_s16(step2[16], step2[19]);
  step1[17] = vqaddq_s16(step2[17], step2[18]);
  step1[18] = vqsubq_s16(step2[17], step2[18]);
  step1[19] = vqsubq_s16(step2[16], step2[19]);
  step1[20] = vqsubq_s16(step2[23], step2[20]);
  step1[21] = vqsubq_s16(step2[22], step2[21]);
  step1[22] = vqaddq_s16(step2[22], step2[21]);
  step1[23] = vqaddq_s16(step2[23], step2[20]);
  step1[24] = vqaddq_s16(step2[24], step2[27]);
  step1[25] = vqaddq_s16(step2[25], step2[26]);
  step1[26] = vqsubq_s16(step2[25], step2[26]);
  step1[27] = vqsubq_s16(step2[24], step2[27]);
  step1[28] = vqsubq_s16(step2[31], step2[28]);
  step1[29] = vqsubq_s16(step2[30], step2[29]);
  step1[30] = vqaddq_s16(step2[30], step2[29]);
  step1[31] = vqaddq_s16(step2[31], step2[28]);

  // stage 6

  btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
  btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
  btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
  btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]);
  btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]);

  // step1[1] equals step1[0] after the DC collapse, so the 4-point
  // butterfly below uses step1[0] for both sum terms.
  step2[0] = vqaddq_s16(step1[0], step1[3]);
  step2[1] = vqaddq_s16(step1[0], step1[2]);
  step2[2] = vqsubq_s16(step1[0], step1[2]);
  step2[3] = vqsubq_s16(step1[0], step1[3]);
  step2[4] = step1[4];
  step2[7] = step1[7];
  step2[8] = vqaddq_s16(step1[8], step1[11]);
  step2[9] = vqaddq_s16(step1[9], step1[10]);
  step2[10] = vqsubq_s16(step1[9], step1[10]);
  step2[11] = vqsubq_s16(step1[8], step1[11]);
  step2[12] = vqsubq_s16(step1[15], step1[12]);
  step2[13] = vqsubq_s16(step1[14], step1[13]);
  step2[14] = vqaddq_s16(step1[14], step1[13]);
  step2[15] = vqaddq_s16(step1[15], step1[12]);
  step2[16] = step1[16];
  step2[17] = step1[17];
  step2[22] = step1[22];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[25] = step1[25];
  step2[30] = step1[30];
  step2[31] = step1[31];

  // stage 7

  btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
  btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);

  step1[0] = vqaddq_s16(step2[0], step2[7]);
  step1[1] = vqaddq_s16(step2[1], step2[6]);
  step1[2] = vqaddq_s16(step2[2], step2[5]);
  step1[3] = vqaddq_s16(step2[3], step2[4]);
  step1[4] = vqsubq_s16(step2[3], step2[4]);
  step1[5] = vqsubq_s16(step2[2], step2[5]);
  step1[6] = vqsubq_s16(step2[1], step2[6]);
  step1[7] = vqsubq_s16(step2[0], step2[7]);
  step1[8] = step2[8];
  step1[9] = step2[9];
  step1[14] = step2[14];
  step1[15] = step2[15];
  step1[16] = vqaddq_s16(step2[16], step2[23]);
  step1[17] = vqaddq_s16(step2[17], step2[22]);
  step1[18] = vqaddq_s16(step2[18], step2[21]);
  step1[19] = vqaddq_s16(step2[19], step2[20]);
  step1[20] = vqsubq_s16(step2[19], step2[20]);
  step1[21] = vqsubq_s16(step2[18], step2[21]);
  step1[22] = vqsubq_s16(step2[17], step2[22]);
  step1[23] = vqsubq_s16(step2[16], step2[23]);
  step1[24] = vqsubq_s16(step2[31], step2[24]);
  step1[25] = vqsubq_s16(step2[30], step2[25]);
  step1[26] = vqsubq_s16(step2[29], step2[26]);
  step1[27] = vqsubq_s16(step2[28], step2[27]);
  step1[28] = vqaddq_s16(step2[27], step2[28]);
  step1[29] = vqaddq_s16(step2[26], step2[29]);
  step1[30] = vqaddq_s16(step2[25], step2[30]);
  step1[31] = vqaddq_s16(step2[24], step2[31]);

  // stage 8
  // cospi[32] rotations on the mirrored pairs (27,20)..(24,23).

  btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
  btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
  btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
  btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);

  step2[0] = vqaddq_s16(step1[0], step1[15]);
  step2[1] = vqaddq_s16(step1[1], step1[14]);
  step2[2] = vqaddq_s16(step1[2], step1[13]);
  step2[3] = vqaddq_s16(step1[3], step1[12]);
  step2[4] = vqaddq_s16(step1[4], step1[11]);
  step2[5] = vqaddq_s16(step1[5], step1[10]);
  step2[6] = vqaddq_s16(step1[6], step1[9]);
  step2[7] = vqaddq_s16(step1[7], step1[8]);
  step2[8] = vqsubq_s16(step1[7], step1[8]);
  step2[9] = vqsubq_s16(step1[6], step1[9]);
  step2[10] = vqsubq_s16(step1[5], step1[10]);
  step2[11] = vqsubq_s16(step1[4], step1[11]);
  step2[12] = vqsubq_s16(step1[3], step1[12]);
  step2[13] = vqsubq_s16(step1[2], step1[13]);
  step2[14] = vqsubq_s16(step1[1], step1[14]);
  step2[15] = vqsubq_s16(step1[0], step1[15]);
  step2[16] = step1[16];
  step2[17] = step1[17];
  step2[18] = step1[18];
  step2[19] = step1[19];
  step2[28] = step1[28];
  step2[29] = step1[29];
  step2[30] = step1[30];
  step2[31] = step1[31];

  // stage 9
  // Final mirrored butterfly: out[i] = step2[i] +/- step2[31-i].

  out[0] = vqaddq_s16(step2[0], step2[31]);
  out[1] = vqaddq_s16(step2[1], step2[30]);
  out[2] = vqaddq_s16(step2[2], step2[29]);
  out[3] = vqaddq_s16(step2[3], step2[28]);
  out[4] = vqaddq_s16(step2[4], step2[27]);
  out[5] = vqaddq_s16(step2[5], step2[26]);
  out[6] = vqaddq_s16(step2[6], step2[25]);
  out[7] = vqaddq_s16(step2[7], step2[24]);
  out[8] = vqaddq_s16(step2[8], step2[23]);
  out[9] = vqaddq_s16(step2[9], step2[22]);
  out[10] = vqaddq_s16(step2[10], step2[21]);
  out[11] = vqaddq_s16(step2[11], step2[20]);
  out[12] = vqaddq_s16(step2[12], step2[19]);
  out[13] = vqaddq_s16(step2[13], step2[18]);
  out[14] = vqaddq_s16(step2[14], step2[17]);
  out[15] = vqaddq_s16(step2[15], step2[16]);
  out[16] = vqsubq_s16(step2[15], step2[16]);
  out[17] = vqsubq_s16(step2[14], step2[17]);
  out[18] = vqsubq_s16(step2[13], step2[18]);
  out[19] = vqsubq_s16(step2[12], step2[19]);
  out[20] = vqsubq_s16(step2[11], step2[20]);
  out[21] = vqsubq_s16(step2[10], step2[21]);
  out[22] = vqsubq_s16(step2[9], step2[22]);
  out[23] = vqsubq_s16(step2[8], step2[23]);
  out[24] = vqsubq_s16(step2[7], step2[24]);
  out[25] = vqsubq_s16(step2[6], step2[25]);
  out[26] = vqsubq_s16(step2[5], step2[26]);
  out[27] = vqsubq_s16(step2[4], step2[27]);
  out[28] = vqsubq_s16(step2[3], step2[28]);
  out[29] = vqsubq_s16(step2[2], step2[29]);
  out[30] = vqsubq_s16(step2[1], step2[30]);
  out[31] = vqsubq_s16(step2[0], step2[31]);
}
// Stage 9 of the 64-point inverse DCT. Rotates the mirrored pairs in
// step2[20..27] by cospi[32], butterflies step2[0..15] and step2[32..63]
// into step1, and passes step2[16..19]/[28..31] through unchanged. All
// adds/subtracts saturate to 16 bits.
static inline void idct64_stage9_neon(int16x8_t *step2, int16x8_t *step1,
                                      int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  // Lanes 0/1 (both cospi[32]) drive the lane_0_1 butterflies; lanes 2/3
  // are present only to match the shared constant layout.
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  // cospi[32] rotation of the mirrored pairs (27,20), (26,21), (25,22),
  // (24,23).
  for (int i = 0; i < 4; i++) {
    btf_16_lane_0_1_neon(step2[27 - i], step2[20 + i], c3, &step1[27 - i],
                         &step1[20 + i]);
  }

  // Butterfly [0..15]: sums land in the low half, differences mirror into
  // the high half.
  for (int i = 0; i < 8; i++) {
    step1[i] = vqaddq_s16(step2[i], step2[15 - i]);
    step1[15 - i] = vqsubq_s16(step2[i], step2[15 - i]);
  }

  // Untouched ranges are copied through.
  for (int i = 16; i < 20; i++) step1[i] = step2[i];
  for (int i = 28; i < 32; i++) step1[i] = step2[i];

  // Butterfly [32..47]: sums forward, differences mirrored.
  for (int i = 0; i < 8; i++) {
    step1[32 + i] = vqaddq_s16(step2[32 + i], step2[47 - i]);
    step1[47 - i] = vqsubq_s16(step2[32 + i], step2[47 - i]);
  }

  // Butterfly [48..63]: differences forward, sums mirrored.
  for (int i = 0; i < 8; i++) {
    step1[48 + i] = vqsubq_s16(step2[63 - i], step2[48 + i]);
    step1[63 - i] = vqaddq_s16(step2[63 - i], step2[48 + i]);
  }
}
2085
// Stage 10 of the 64-point inverse DCT. Rotates the mirrored pairs in
// step1[40..55] by cospi[32], butterflies step1[0..31] into step2, and
// passes step1[32..39]/[56..63] through unchanged. All adds/subtracts
// saturate to 16 bits.
static inline void idct64_stage10_neon(int16x8_t *step1, int16x8_t *step2,
                                       int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  // Lanes 0/1 (both cospi[32]) drive the lane_0_1 butterflies; lanes 2/3
  // are present only to match the shared constant layout.
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  // cospi[32] rotation of the mirrored pairs (55,40), (54,41), ..., (48,47).
  for (int i = 0; i < 8; i++) {
    btf_16_lane_0_1_neon(step1[55 - i], step1[40 + i], c3, &step2[55 - i],
                         &step2[40 + i]);
  }

  // Butterfly [0..31]: sums land in the low half, differences mirror into
  // the high half.
  for (int i = 0; i < 16; i++) {
    step2[i] = vqaddq_s16(step1[i], step1[31 - i]);
    step2[31 - i] = vqsubq_s16(step1[i], step1[31 - i]);
  }

  // Untouched ranges are copied through.
  for (int i = 32; i < 40; i++) step2[i] = step1[i];
  for (int i = 56; i < 64; i++) step2[i] = step1[i];
}
2150
idct64_low32_neon(int16x8_t * in,int16x8_t * out,int8_t cos_bit)2151 static inline void idct64_low32_neon(int16x8_t *in, int16x8_t *out,
2152 int8_t cos_bit) {
2153 const int32_t *cospi = cospi_arr(cos_bit);
2154 int16x8_t step2[64], step1[64];
2155 const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
2156 (int16_t)cospi[36], (int16_t)cospi[28]);
2157 const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
2158 (int16_t)cospi[52], (int16_t)cospi[12]);
2159 const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
2160 (int16_t)cospi[40], (int16_t)cospi[24]);
2161 const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
2162 (int16_t)cospi[16], (int16_t)cospi[48]);
2163 const int16x4_t c4 =
2164 set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]),
2165 (int16_t)(-cospi[36]), (int16_t)(-cospi[28]));
2166 const int16x4_t c5 =
2167 set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]),
2168 (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
2169 const int16x4_t c6 =
2170 set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
2171 (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
2172 const int16x4_t c7 =
2173 set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
2174 (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
2175
2176 // stage 1
2177 // stage 2
2178
2179 step2[0] = in[0];
2180 step2[2] = in[16];
2181 step2[4] = in[8];
2182 step2[6] = in[24];
2183 step2[8] = in[4];
2184 step2[10] = in[20];
2185 step2[12] = in[12];
2186 step2[14] = in[28];
2187 step2[16] = in[2];
2188 step2[18] = in[18];
2189 step2[20] = in[10];
2190 step2[22] = in[26];
2191 step2[24] = in[6];
2192 step2[26] = in[22];
2193 step2[28] = in[14];
2194 step2[30] = in[30];
2195
2196 btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
2197 btf_16_neon(in[31], -cospi[33], cospi[31], &step2[33], &step2[62]);
2198 btf_16_neon(in[17], cospi[47], cospi[17], &step2[34], &step2[61]);
2199 btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]);
2200 btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]);
2201 btf_16_neon(in[23], -cospi[41], cospi[23], &step2[37], &step2[58]);
2202 btf_16_neon(in[25], cospi[39], cospi[25], &step2[38], &step2[57]);
2203 btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
2204 btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
2205 btf_16_neon(in[27], -cospi[37], cospi[27], &step2[41], &step2[54]);
2206 btf_16_neon(in[21], cospi[43], cospi[21], &step2[42], &step2[53]);
2207 btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]);
2208 btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]);
2209 btf_16_neon(in[19], -cospi[45], cospi[19], &step2[45], &step2[50]);
2210 btf_16_neon(in[29], cospi[35], cospi[29], &step2[46], &step2[49]);
2211 btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);
2212
2213 // stage 3
2214
2215 step1[0] = step2[0];
2216 step1[2] = step2[2];
2217 step1[4] = step2[4];
2218 step1[6] = step2[6];
2219 step1[8] = step2[8];
2220 step1[10] = step2[10];
2221 step1[12] = step2[12];
2222 step1[14] = step2[14];
2223
2224 btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
2225 btf_16_neon(step2[30], -cospi[34], cospi[30], &step1[17], &step1[30]);
2226 btf_16_neon(step2[18], cospi[46], cospi[18], &step1[18], &step1[29]);
2227 btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]);
2228 btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]);
2229 btf_16_neon(step2[26], -cospi[42], cospi[22], &step1[21], &step1[26]);
2230 btf_16_neon(step2[22], cospi[38], cospi[26], &step1[22], &step1[25]);
2231 btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);
2232
2233 step1[32] = vqaddq_s16(step2[32], step2[33]);
2234 step1[33] = vqsubq_s16(step2[32], step2[33]);
2235 step1[34] = vqsubq_s16(step2[35], step2[34]);
2236 step1[35] = vqaddq_s16(step2[35], step2[34]);
2237 step1[36] = vqaddq_s16(step2[36], step2[37]);
2238 step1[37] = vqsubq_s16(step2[36], step2[37]);
2239 step1[38] = vqsubq_s16(step2[39], step2[38]);
2240 step1[39] = vqaddq_s16(step2[39], step2[38]);
2241 step1[40] = vqaddq_s16(step2[40], step2[41]);
2242 step1[41] = vqsubq_s16(step2[40], step2[41]);
2243 step1[42] = vqsubq_s16(step2[43], step2[42]);
2244 step1[43] = vqaddq_s16(step2[43], step2[42]);
2245 step1[44] = vqaddq_s16(step2[44], step2[45]);
2246 step1[45] = vqsubq_s16(step2[44], step2[45]);
2247 step1[46] = vqsubq_s16(step2[47], step2[46]);
2248 step1[47] = vqaddq_s16(step2[47], step2[46]);
2249 step1[48] = vqaddq_s16(step2[48], step2[49]);
2250 step1[49] = vqsubq_s16(step2[48], step2[49]);
2251 step1[50] = vqsubq_s16(step2[51], step2[50]);
2252 step1[51] = vqaddq_s16(step2[51], step2[50]);
2253 step1[52] = vqaddq_s16(step2[52], step2[53]);
2254 step1[53] = vqsubq_s16(step2[52], step2[53]);
2255 step1[54] = vqsubq_s16(step2[55], step2[54]);
2256 step1[55] = vqaddq_s16(step2[55], step2[54]);
2257 step1[56] = vqaddq_s16(step2[56], step2[57]);
2258 step1[57] = vqsubq_s16(step2[56], step2[57]);
2259 step1[58] = vqsubq_s16(step2[59], step2[58]);
2260 step1[59] = vqaddq_s16(step2[59], step2[58]);
2261 step1[60] = vqaddq_s16(step2[60], step2[61]);
2262 step1[61] = vqsubq_s16(step2[60], step2[61]);
2263 step1[62] = vqsubq_s16(step2[63], step2[62]);
2264 step1[63] = vqaddq_s16(step2[63], step2[62]);
2265
2266 // stage 4
2267
2268 step2[0] = step1[0];
2269 step2[2] = step1[2];
2270 step2[4] = step1[4];
2271 step2[6] = step1[6];
2272
2273 btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
2274 btf_16_neon(step1[14], -cospi[36], cospi[28], &step2[9], &step2[14]);
2275 btf_16_neon(step1[10], cospi[44], cospi[20], &step2[10], &step2[13]);
2276 btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]);
2277 btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
2278 btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]);
2279 btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]);
2280 btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
2281 btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
2282 btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]);
2283 btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]);
2284 btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]);
2285
2286 step2[16] = vqaddq_s16(step1[16], step1[17]);
2287 step2[17] = vqsubq_s16(step1[16], step1[17]);
2288 step2[18] = vqsubq_s16(step1[19], step1[18]);
2289 step2[19] = vqaddq_s16(step1[19], step1[18]);
2290 step2[20] = vqaddq_s16(step1[20], step1[21]);
2291 step2[21] = vqsubq_s16(step1[20], step1[21]);
2292 step2[22] = vqsubq_s16(step1[23], step1[22]);
2293 step2[23] = vqaddq_s16(step1[23], step1[22]);
2294 step2[24] = vqaddq_s16(step1[24], step1[25]);
2295 step2[25] = vqsubq_s16(step1[24], step1[25]);
2296 step2[26] = vqsubq_s16(step1[27], step1[26]);
2297 step2[27] = vqaddq_s16(step1[27], step1[26]);
2298 step2[28] = vqaddq_s16(step1[28], step1[29]);
2299 step2[29] = vqsubq_s16(step1[28], step1[29]);
2300 step2[30] = vqsubq_s16(step1[31], step1[30]);
2301 step2[31] = vqaddq_s16(step1[31], step1[30]);
2302 step2[32] = step1[32];
2303 step2[35] = step1[35];
2304 step2[36] = step1[36];
2305 step2[39] = step1[39];
2306 step2[40] = step1[40];
2307 step2[43] = step1[43];
2308 step2[44] = step1[44];
2309 step2[47] = step1[47];
2310 step2[48] = step1[48];
2311 step2[51] = step1[51];
2312 step2[52] = step1[52];
2313 step2[55] = step1[55];
2314 step2[56] = step1[56];
2315 step2[59] = step1[59];
2316 step2[60] = step1[60];
2317 step2[63] = step1[63];
2318
2319 // stage 5
2320
2321 step1[0] = step2[0];
2322 step1[2] = step2[2];
2323
2324 btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
2325 btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]);
2326 btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
2327 btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]);
2328 btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]);
2329 btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]);
2330
2331 step1[8] = vqaddq_s16(step2[8], step2[9]);
2332 step1[9] = vqsubq_s16(step2[8], step2[9]);
2333 step1[10] = vqsubq_s16(step2[11], step2[10]);
2334 step1[11] = vqaddq_s16(step2[11], step2[10]);
2335 step1[12] = vqaddq_s16(step2[12], step2[13]);
2336 step1[13] = vqsubq_s16(step2[12], step2[13]);
2337 step1[14] = vqsubq_s16(step2[15], step2[14]);
2338 step1[15] = vqaddq_s16(step2[15], step2[14]);
2339 step1[16] = step2[16];
2340 step1[19] = step2[19];
2341 step1[20] = step2[20];
2342 step1[23] = step2[23];
2343 step1[24] = step2[24];
2344 step1[27] = step2[27];
2345 step1[28] = step2[28];
2346 step1[31] = step2[31];
2347 step1[32] = vqaddq_s16(step2[32], step2[35]);
2348 step1[33] = vqaddq_s16(step2[33], step2[34]);
2349 step1[34] = vqsubq_s16(step2[33], step2[34]);
2350 step1[35] = vqsubq_s16(step2[32], step2[35]);
2351 step1[36] = vqsubq_s16(step2[39], step2[36]);
2352 step1[37] = vqsubq_s16(step2[38], step2[37]);
2353 step1[38] = vqaddq_s16(step2[38], step2[37]);
2354 step1[39] = vqaddq_s16(step2[39], step2[36]);
2355 step1[40] = vqaddq_s16(step2[40], step2[43]);
2356 step1[41] = vqaddq_s16(step2[41], step2[42]);
2357 step1[42] = vqsubq_s16(step2[41], step2[42]);
2358 step1[43] = vqsubq_s16(step2[40], step2[43]);
2359 step1[44] = vqsubq_s16(step2[47], step2[44]);
2360 step1[45] = vqsubq_s16(step2[46], step2[45]);
2361 step1[46] = vqaddq_s16(step2[46], step2[45]);
2362 step1[47] = vqaddq_s16(step2[47], step2[44]);
2363 step1[48] = vqaddq_s16(step2[48], step2[51]);
2364 step1[49] = vqaddq_s16(step2[49], step2[50]);
2365 step1[50] = vqsubq_s16(step2[49], step2[50]);
2366 step1[51] = vqsubq_s16(step2[48], step2[51]);
2367 step1[52] = vqsubq_s16(step2[55], step2[52]);
2368 step1[53] = vqsubq_s16(step2[54], step2[53]);
2369 step1[54] = vqaddq_s16(step2[54], step2[53]);
2370 step1[55] = vqaddq_s16(step2[55], step2[52]);
2371 step1[56] = vqaddq_s16(step2[56], step2[59]);
2372 step1[57] = vqaddq_s16(step2[57], step2[58]);
2373 step1[58] = vqsubq_s16(step2[57], step2[58]);
2374 step1[59] = vqsubq_s16(step2[56], step2[59]);
2375 step1[60] = vqsubq_s16(step2[63], step2[60]);
2376 step1[61] = vqsubq_s16(step2[62], step2[61]);
2377 step1[62] = vqaddq_s16(step2[62], step2[61]);
2378 step1[63] = vqaddq_s16(step2[63], step2[60]);
2379
2380 // stage 6
2381
2382 btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
2383 btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]);
2384 btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
2385 btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]);
2386 btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
2387 btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
2388 btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]);
2389 btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]);
2390 btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
2391 btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
2392 btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]);
2393 btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]);
2394
2395 step2[4] = vqaddq_s16(step1[4], step1[5]);
2396 step2[5] = vqsubq_s16(step1[4], step1[5]);
2397 step2[6] = vqsubq_s16(step1[7], step1[6]);
2398 step2[7] = vqaddq_s16(step1[7], step1[6]);
2399 step2[8] = step1[8];
2400 step2[11] = step1[11];
2401 step2[12] = step1[12];
2402 step2[15] = step1[15];
2403 step2[16] = vqaddq_s16(step1[16], step1[19]);
2404 step2[17] = vqaddq_s16(step1[17], step1[18]);
2405 step2[18] = vqsubq_s16(step1[17], step1[18]);
2406 step2[19] = vqsubq_s16(step1[16], step1[19]);
2407 step2[20] = vqsubq_s16(step1[23], step1[20]);
2408 step2[21] = vqsubq_s16(step1[22], step1[21]);
2409 step2[22] = vqaddq_s16(step1[22], step1[21]);
2410 step2[23] = vqaddq_s16(step1[23], step1[20]);
2411 step2[24] = vqaddq_s16(step1[24], step1[27]);
2412 step2[25] = vqaddq_s16(step1[25], step1[26]);
2413 step2[26] = vqsubq_s16(step1[25], step1[26]);
2414 step2[27] = vqsubq_s16(step1[24], step1[27]);
2415 step2[28] = vqsubq_s16(step1[31], step1[28]);
2416 step2[29] = vqsubq_s16(step1[30], step1[29]);
2417 step2[30] = vqaddq_s16(step1[30], step1[29]);
2418 step2[31] = vqaddq_s16(step1[31], step1[28]);
2419 step2[32] = step1[32];
2420 step2[33] = step1[33];
2421 step2[38] = step1[38];
2422 step2[39] = step1[39];
2423 step2[40] = step1[40];
2424 step2[41] = step1[41];
2425 step2[46] = step1[46];
2426 step2[47] = step1[47];
2427 step2[48] = step1[48];
2428 step2[49] = step1[49];
2429 step2[54] = step1[54];
2430 step2[55] = step1[55];
2431 step2[56] = step1[56];
2432 step2[57] = step1[57];
2433 step2[62] = step1[62];
2434 step2[63] = step1[63];
2435
2436 // stage 7
2437
2438 btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
2439 btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
2440 btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
2441 btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]);
2442 btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]);
2443
2444 step1[0] = vqaddq_s16(step2[0], step2[3]);
2445 step1[1] = vqaddq_s16(step2[1], step2[2]);
2446 step1[2] = vqsubq_s16(step2[1], step2[2]);
2447 step1[3] = vqsubq_s16(step2[0], step2[3]);
2448 step1[4] = step2[4];
2449 step1[7] = step2[7];
2450 step1[8] = vqaddq_s16(step2[8], step2[11]);
2451 step1[9] = vqaddq_s16(step2[9], step2[10]);
2452 step1[10] = vqsubq_s16(step2[9], step2[10]);
2453 step1[11] = vqsubq_s16(step2[8], step2[11]);
2454 step1[12] = vqsubq_s16(step2[15], step2[12]);
2455 step1[13] = vqsubq_s16(step2[14], step2[13]);
2456 step1[14] = vqaddq_s16(step2[14], step2[13]);
2457 step1[15] = vqaddq_s16(step2[15], step2[12]);
2458 step1[16] = step2[16];
2459 step1[17] = step2[17];
2460 step1[22] = step2[22];
2461 step1[23] = step2[23];
2462 step1[24] = step2[24];
2463 step1[25] = step2[25];
2464 step1[30] = step2[30];
2465 step1[31] = step2[31];
2466 step1[32] = vqaddq_s16(step2[32], step2[39]);
2467 step1[33] = vqaddq_s16(step2[33], step2[38]);
2468 step1[34] = vqaddq_s16(step2[34], step2[37]);
2469 step1[35] = vqaddq_s16(step2[35], step2[36]);
2470 step1[36] = vqsubq_s16(step2[35], step2[36]);
2471 step1[37] = vqsubq_s16(step2[34], step2[37]);
2472 step1[38] = vqsubq_s16(step2[33], step2[38]);
2473 step1[39] = vqsubq_s16(step2[32], step2[39]);
2474 step1[40] = vqsubq_s16(step2[47], step2[40]);
2475 step1[41] = vqsubq_s16(step2[46], step2[41]);
2476 step1[42] = vqsubq_s16(step2[45], step2[42]);
2477 step1[43] = vqsubq_s16(step2[44], step2[43]);
2478 step1[44] = vqaddq_s16(step2[43], step2[44]);
2479 step1[45] = vqaddq_s16(step2[42], step2[45]);
2480 step1[46] = vqaddq_s16(step2[41], step2[46]);
2481 step1[47] = vqaddq_s16(step2[40], step2[47]);
2482 step1[48] = vqaddq_s16(step2[48], step2[55]);
2483 step1[49] = vqaddq_s16(step2[49], step2[54]);
2484 step1[50] = vqaddq_s16(step2[50], step2[53]);
2485 step1[51] = vqaddq_s16(step2[51], step2[52]);
2486 step1[52] = vqsubq_s16(step2[51], step2[52]);
2487 step1[53] = vqsubq_s16(step2[50], step2[53]);
2488 step1[54] = vqsubq_s16(step2[49], step2[54]);
2489 step1[55] = vqsubq_s16(step2[48], step2[55]);
2490 step1[56] = vqsubq_s16(step2[63], step2[56]);
2491 step1[57] = vqsubq_s16(step2[62], step2[57]);
2492 step1[58] = vqsubq_s16(step2[61], step2[58]);
2493 step1[59] = vqsubq_s16(step2[60], step2[59]);
2494 step1[60] = vqaddq_s16(step2[59], step2[60]);
2495 step1[61] = vqaddq_s16(step2[58], step2[61]);
2496 step1[62] = vqaddq_s16(step2[57], step2[62]);
2497 step1[63] = vqaddq_s16(step2[56], step2[63]);
2498
2499 // stage 8
2500
2501 btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
2502 btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
2503 btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
2504 btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
2505 btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
2506 btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
2507 btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]);
2508 btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]);
2509 btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]);
2510 btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]);
2511
2512 step2[0] = vqaddq_s16(step1[0], step1[7]);
2513 step2[1] = vqaddq_s16(step1[1], step1[6]);
2514 step2[2] = vqaddq_s16(step1[2], step1[5]);
2515 step2[3] = vqaddq_s16(step1[3], step1[4]);
2516 step2[4] = vqsubq_s16(step1[3], step1[4]);
2517 step2[5] = vqsubq_s16(step1[2], step1[5]);
2518 step2[6] = vqsubq_s16(step1[1], step1[6]);
2519 step2[7] = vqsubq_s16(step1[0], step1[7]);
2520 step2[8] = step1[8];
2521 step2[9] = step1[9];
2522 step2[14] = step1[14];
2523 step2[15] = step1[15];
2524 step2[16] = vqaddq_s16(step1[16], step1[23]);
2525 step2[17] = vqaddq_s16(step1[17], step1[22]);
2526 step2[18] = vqaddq_s16(step1[18], step1[21]);
2527 step2[19] = vqaddq_s16(step1[19], step1[20]);
2528 step2[20] = vqsubq_s16(step1[19], step1[20]);
2529 step2[21] = vqsubq_s16(step1[18], step1[21]);
2530 step2[22] = vqsubq_s16(step1[17], step1[22]);
2531 step2[23] = vqsubq_s16(step1[16], step1[23]);
2532 step2[24] = vqsubq_s16(step1[31], step1[24]);
2533 step2[25] = vqsubq_s16(step1[30], step1[25]);
2534 step2[26] = vqsubq_s16(step1[29], step1[26]);
2535 step2[27] = vqsubq_s16(step1[28], step1[27]);
2536 step2[28] = vqaddq_s16(step1[28], step1[27]);
2537 step2[29] = vqaddq_s16(step1[29], step1[26]);
2538 step2[30] = vqaddq_s16(step1[30], step1[25]);
2539 step2[31] = vqaddq_s16(step1[31], step1[24]);
2540 step2[32] = step1[32];
2541 step2[33] = step1[33];
2542 step2[34] = step1[34];
2543 step2[35] = step1[35];
2544 step2[44] = step1[44];
2545 step2[45] = step1[45];
2546 step2[46] = step1[46];
2547 step2[47] = step1[47];
2548 step2[48] = step1[48];
2549 step2[49] = step1[49];
2550 step2[50] = step1[50];
2551 step2[51] = step1[51];
2552 step2[60] = step1[60];
2553 step2[61] = step1[61];
2554 step2[62] = step1[62];
2555 step2[63] = step1[63];
2556
2557 // stage 9
2558 idct64_stage9_neon(step2, step1, cos_bit);
2559
2560 // stage 10
2561 idct64_stage10_neon(step1, step2, cos_bit);
2562
2563 // stage 11
2564
2565 out[0] = vqaddq_s16(step2[0], step2[63]);
2566 out[1] = vqaddq_s16(step2[1], step2[62]);
2567 out[2] = vqaddq_s16(step2[2], step2[61]);
2568 out[3] = vqaddq_s16(step2[3], step2[60]);
2569 out[4] = vqaddq_s16(step2[4], step2[59]);
2570 out[5] = vqaddq_s16(step2[5], step2[58]);
2571 out[6] = vqaddq_s16(step2[6], step2[57]);
2572 out[7] = vqaddq_s16(step2[7], step2[56]);
2573 out[8] = vqaddq_s16(step2[8], step2[55]);
2574 out[9] = vqaddq_s16(step2[9], step2[54]);
2575 out[10] = vqaddq_s16(step2[10], step2[53]);
2576 out[11] = vqaddq_s16(step2[11], step2[52]);
2577 out[12] = vqaddq_s16(step2[12], step2[51]);
2578 out[13] = vqaddq_s16(step2[13], step2[50]);
2579 out[14] = vqaddq_s16(step2[14], step2[49]);
2580 out[15] = vqaddq_s16(step2[15], step2[48]);
2581 out[16] = vqaddq_s16(step2[16], step2[47]);
2582 out[17] = vqaddq_s16(step2[17], step2[46]);
2583 out[18] = vqaddq_s16(step2[18], step2[45]);
2584 out[19] = vqaddq_s16(step2[19], step2[44]);
2585 out[20] = vqaddq_s16(step2[20], step2[43]);
2586 out[21] = vqaddq_s16(step2[21], step2[42]);
2587 out[22] = vqaddq_s16(step2[22], step2[41]);
2588 out[23] = vqaddq_s16(step2[23], step2[40]);
2589 out[24] = vqaddq_s16(step2[24], step2[39]);
2590 out[25] = vqaddq_s16(step2[25], step2[38]);
2591 out[26] = vqaddq_s16(step2[26], step2[37]);
2592 out[27] = vqaddq_s16(step2[27], step2[36]);
2593 out[28] = vqaddq_s16(step2[28], step2[35]);
2594 out[29] = vqaddq_s16(step2[29], step2[34]);
2595 out[30] = vqaddq_s16(step2[30], step2[33]);
2596 out[31] = vqaddq_s16(step2[31], step2[32]);
2597 out[32] = vqsubq_s16(step2[31], step2[32]);
2598 out[33] = vqsubq_s16(step2[30], step2[33]);
2599 out[34] = vqsubq_s16(step2[29], step2[34]);
2600 out[35] = vqsubq_s16(step2[28], step2[35]);
2601 out[36] = vqsubq_s16(step2[27], step2[36]);
2602 out[37] = vqsubq_s16(step2[26], step2[37]);
2603 out[38] = vqsubq_s16(step2[25], step2[38]);
2604 out[39] = vqsubq_s16(step2[24], step2[39]);
2605 out[40] = vqsubq_s16(step2[23], step2[40]);
2606 out[41] = vqsubq_s16(step2[22], step2[41]);
2607 out[42] = vqsubq_s16(step2[21], step2[42]);
2608 out[43] = vqsubq_s16(step2[20], step2[43]);
2609 out[44] = vqsubq_s16(step2[19], step2[44]);
2610 out[45] = vqsubq_s16(step2[18], step2[45]);
2611 out[46] = vqsubq_s16(step2[17], step2[46]);
2612 out[47] = vqsubq_s16(step2[16], step2[47]);
2613 out[48] = vqsubq_s16(step2[15], step2[48]);
2614 out[49] = vqsubq_s16(step2[14], step2[49]);
2615 out[50] = vqsubq_s16(step2[13], step2[50]);
2616 out[51] = vqsubq_s16(step2[12], step2[51]);
2617 out[52] = vqsubq_s16(step2[11], step2[52]);
2618 out[53] = vqsubq_s16(step2[10], step2[53]);
2619 out[54] = vqsubq_s16(step2[9], step2[54]);
2620 out[55] = vqsubq_s16(step2[8], step2[55]);
2621 out[56] = vqsubq_s16(step2[7], step2[56]);
2622 out[57] = vqsubq_s16(step2[6], step2[57]);
2623 out[58] = vqsubq_s16(step2[5], step2[58]);
2624 out[59] = vqsubq_s16(step2[4], step2[59]);
2625 out[60] = vqsubq_s16(step2[3], step2[60]);
2626 out[61] = vqsubq_s16(step2[2], step2[61]);
2627 out[62] = vqsubq_s16(step2[1], step2[62]);
2628 out[63] = vqsubq_s16(step2[0], step2[63]);
2629 }
2630
// DC-only path of the 64-point inverse DCT: when only input[0] is non-zero,
// stages 1-6 collapse to a single scaling of the DC coefficient by
// cospi[32], and stages 7-11 simply replicate that value across all 64
// output vectors.
static inline void idct64_low1_neon(int16x8_t *input, int16x8_t *out,
                                    int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);

  // stage 1
  // stage 2
  // stage 3
  // stage 4
  // stage 5
  // stage 6
  // Widen to 32 bits, multiply by cospi[32], then round-shift back to 16.
  const int32x4_t prod_lo = vmull_n_s16(vget_low_s16(input[0]), cospi[32]);
  const int32x4_t prod_hi = vmull_n_s16(vget_high_s16(input[0]), cospi[32]);
  const int16x8_t dc = vcombine_s16(vrshrn_n_s32(prod_lo, INV_COS_BIT),
                                    vrshrn_n_s32(prod_hi, INV_COS_BIT));

  // stage 7
  // stage 8
  // stage 9
  // stage 10
  // stage 11
  // Every output lane carries the same scaled DC value.
  for (int i = 0; i < 64; i++) {
    out[i] = dc;
  }
}
2719
// 64-point inverse DCT specialized for the case where only the eight
// lowest-frequency coefficients (in[0]..in[7]) are non-zero. Treating the
// remaining inputs as zero lets the early butterfly stages degenerate into
// plain copies and single-coefficient multiplies (btf_16_neon), so only the
// later stages need full two-input butterflies. Additions use saturating
// 16-bit arithmetic (vqaddq_s16/vqsubq_s16).
// in      : input coefficient vectors; only indices 0..7 are read.
// out     : receives the 64 transformed vectors.
// cos_bit : precision of the cosine table selected via cospi_arr().
static inline void idct64_low8_neon(int16x8_t *in, int16x8_t *out,
                                    int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step2[64], step1[64];

  // Packed cosine constants consumed lane-wise by the btf_16_lane_*
  // butterfly helpers; c4..c6 hold the negated counterparts needed for the
  // "rotate by -angle" halves of the butterflies.
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
                                      (int16_t)cospi[36], (int16_t)cospi[28]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
                                      (int16_t)cospi[52], (int16_t)cospi[12]);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);
  const int16x4_t c4 =
      set_s16x4_neon((int16_t)(-cospi[36]), (int16_t)(-cospi[28]),
                     (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
  const int16x4_t c5 =
      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
  const int16x4_t c6 =
      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));

  // stage 1
  // stage 2
  // Only the 8 lowest input rows contribute; zero inputs are elided.

  step2[0] = in[0];
  step2[8] = in[4];
  step2[16] = in[2];
  step2[24] = in[6];

  btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
  btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
  btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
  btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);

  // stage 3

  step1[0] = step2[0];
  step1[8] = step2[8];

  btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
  btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);

  // With zero partners, each butterfly reduces to duplicating its input.
  step1[32] = step2[32];
  step1[33] = step2[32];
  step1[38] = step2[39];
  step1[39] = step2[39];
  step1[40] = step2[40];
  step1[41] = step2[40];
  step1[46] = step2[47];
  step1[47] = step2[47];
  step1[48] = step2[48];
  step1[49] = step2[48];
  step1[54] = step2[55];
  step1[55] = step2[55];
  step1[56] = step2[56];
  step1[57] = step2[56];
  step1[62] = step2[63];
  step1[63] = step2[63];

  // stage 4

  step2[0] = step1[0];

  btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
  btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
  btf_16_lane_1_0_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
  btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
  btf_16_lane_3_2_neon(step1[46], step1[49], c4, &step2[46], &step2[49]);

  step2[16] = step1[16];
  step2[17] = step1[16];
  step2[22] = step1[23];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[25] = step1[24];
  step2[30] = step1[31];
  step2[31] = step1[31];
  step2[32] = step1[32];
  step2[39] = step1[39];
  step2[40] = step1[40];
  step2[47] = step1[47];
  step2[48] = step1[48];
  step2[55] = step1[55];
  step2[56] = step1[56];
  step2[63] = step1[63];

  // stage 5

  step1[0] = step2[0];

  btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
  btf_16_lane_3_2_neon(step2[22], step2[25], c5, &step1[22], &step1[25]);

  step1[8] = step2[8];
  step1[9] = step2[8];
  step1[14] = step2[15];
  step1[15] = step2[15];

  step1[16] = step2[16];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[31] = step2[31];
  step1[32] = step2[32];
  step1[33] = step2[33];
  step1[34] = step2[33];
  step1[35] = step2[32];
  step1[36] = step2[39];
  step1[37] = step2[38];
  step1[38] = step2[38];
  step1[39] = step2[39];
  step1[40] = step2[40];
  step1[41] = step2[41];
  step1[42] = step2[41];
  step1[43] = step2[40];
  step1[44] = step2[47];
  step1[45] = step2[46];
  step1[46] = step2[46];
  step1[47] = step2[47];
  step1[48] = step2[48];
  step1[49] = step2[49];
  step1[50] = step2[49];
  step1[51] = step2[48];
  step1[52] = step2[55];
  step1[53] = step2[54];
  step1[54] = step2[54];
  step1[55] = step2[55];
  step1[56] = step2[56];
  step1[57] = step2[57];
  step1[58] = step2[57];
  step1[59] = step2[56];
  step1[60] = step2[63];
  step1[61] = step2[62];
  step1[62] = step2[62];
  step1[63] = step2[63];

  // stage 6

  btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
  btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
  btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
  btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
  btf_16_lane_1_0_neon(step1[36], step1[59], c5, &step2[36], &step2[59]);
  btf_16_lane_1_0_neon(step1[37], step1[58], c5, &step2[37], &step2[58]);
  btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
  btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
  btf_16_lane_3_2_neon(step1[44], step1[51], c5, &step2[44], &step2[51]);
  btf_16_lane_3_2_neon(step1[45], step1[50], c5, &step2[45], &step2[50]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  step2[16] = step1[16];
  step2[17] = step1[17];
  step2[18] = step1[17];
  step2[19] = step1[16];
  step2[20] = step1[23];
  step2[21] = step1[22];
  step2[22] = step1[22];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[25] = step1[25];
  step2[26] = step1[25];
  step2[27] = step1[24];
  step2[28] = step1[31];
  step2[29] = step1[30];
  step2[30] = step1[30];
  step2[31] = step1[31];
  step2[32] = step1[32];
  step2[33] = step1[33];
  step2[38] = step1[38];
  step2[39] = step1[39];
  step2[40] = step1[40];
  step2[41] = step1[41];
  step2[46] = step1[46];
  step2[47] = step1[47];
  step2[48] = step1[48];
  step2[49] = step1[49];
  step2[54] = step1[54];
  step2[55] = step1[55];
  step2[56] = step1[56];
  step2[57] = step1[57];
  step2[62] = step1[62];
  step2[63] = step1[63];

  // stage 7
  // From here on the network is dense enough that real add/sub butterflies
  // appear for the high-index (32..63) half.

  btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
  btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
  btf_16_lane_3_2_neon(step2[20], step2[27], c6, &step1[20], &step1[27]);
  btf_16_lane_3_2_neon(step2[21], step2[26], c6, &step1[21], &step1[26]);

  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[1];
  step1[3] = step2[0];
  step1[8] = step2[8];
  step1[9] = step2[9];
  step1[10] = step2[9];
  step1[11] = step2[8];
  step1[12] = step2[15];
  step1[13] = step2[14];
  step1[14] = step2[14];
  step1[15] = step2[15];
  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];
  step1[32] = vqaddq_s16(step2[32], step2[39]);
  step1[33] = vqaddq_s16(step2[33], step2[38]);
  step1[34] = vqaddq_s16(step2[34], step2[37]);
  step1[35] = vqaddq_s16(step2[35], step2[36]);
  step1[36] = vqsubq_s16(step2[35], step2[36]);
  step1[37] = vqsubq_s16(step2[34], step2[37]);
  step1[38] = vqsubq_s16(step2[33], step2[38]);
  step1[39] = vqsubq_s16(step2[32], step2[39]);
  step1[40] = vqsubq_s16(step2[47], step2[40]);
  step1[41] = vqsubq_s16(step2[46], step2[41]);
  step1[42] = vqsubq_s16(step2[45], step2[42]);
  step1[43] = vqsubq_s16(step2[44], step2[43]);
  step1[44] = vqaddq_s16(step2[43], step2[44]);
  step1[45] = vqaddq_s16(step2[42], step2[45]);
  step1[46] = vqaddq_s16(step2[41], step2[46]);
  step1[47] = vqaddq_s16(step2[40], step2[47]);
  step1[48] = vqaddq_s16(step2[48], step2[55]);
  step1[49] = vqaddq_s16(step2[49], step2[54]);
  step1[50] = vqaddq_s16(step2[50], step2[53]);
  step1[51] = vqaddq_s16(step2[51], step2[52]);
  step1[52] = vqsubq_s16(step2[51], step2[52]);
  step1[53] = vqsubq_s16(step2[50], step2[53]);
  step1[54] = vqsubq_s16(step2[49], step2[54]);
  step1[55] = vqsubq_s16(step2[48], step2[55]);
  step1[56] = vqsubq_s16(step2[63], step2[56]);
  step1[57] = vqsubq_s16(step2[62], step2[57]);
  step1[58] = vqsubq_s16(step2[61], step2[58]);
  step1[59] = vqsubq_s16(step2[60], step2[59]);
  step1[60] = vqaddq_s16(step2[59], step2[60]);
  step1[61] = vqaddq_s16(step2[58], step2[61]);
  step1[62] = vqaddq_s16(step2[57], step2[62]);
  step1[63] = vqaddq_s16(step2[56], step2[63]);

  // stage 8

  btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
  btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
  btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
  btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
  btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
  btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
  btf_16_lane_3_2_neon(step1[40], step1[55], c6, &step2[40], &step2[55]);
  btf_16_lane_3_2_neon(step1[41], step1[54], c6, &step2[41], &step2[54]);
  btf_16_lane_3_2_neon(step1[42], step1[53], c6, &step2[42], &step2[53]);
  btf_16_lane_3_2_neon(step1[43], step1[52], c6, &step2[43], &step2[52]);

  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[3];
  step2[5] = step1[2];
  step2[6] = step1[1];
  step2[7] = step1[0];
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];
  step2[16] = vqaddq_s16(step1[16], step1[23]);
  step2[17] = vqaddq_s16(step1[17], step1[22]);
  step2[18] = vqaddq_s16(step1[18], step1[21]);
  step2[19] = vqaddq_s16(step1[19], step1[20]);
  step2[20] = vqsubq_s16(step1[19], step1[20]);
  step2[21] = vqsubq_s16(step1[18], step1[21]);
  step2[22] = vqsubq_s16(step1[17], step1[22]);
  step2[23] = vqsubq_s16(step1[16], step1[23]);
  step2[24] = vqsubq_s16(step1[31], step1[24]);
  step2[25] = vqsubq_s16(step1[30], step1[25]);
  step2[26] = vqsubq_s16(step1[29], step1[26]);
  step2[27] = vqsubq_s16(step1[28], step1[27]);
  step2[28] = vqaddq_s16(step1[28], step1[27]);
  step2[29] = vqaddq_s16(step1[29], step1[26]);
  step2[30] = vqaddq_s16(step1[30], step1[25]);
  step2[31] = vqaddq_s16(step1[31], step1[24]);
  step2[32] = step1[32];
  step2[33] = step1[33];
  step2[34] = step1[34];
  step2[35] = step1[35];
  step2[44] = step1[44];
  step2[45] = step1[45];
  step2[46] = step1[46];
  step2[47] = step1[47];
  step2[48] = step1[48];
  step2[49] = step1[49];
  step2[50] = step1[50];
  step2[51] = step1[51];
  step2[60] = step1[60];
  step2[61] = step1[61];
  step2[62] = step1[62];
  step2[63] = step1[63];

  // stage 9 (shared helper; identical across the low1/low8/low16 variants)
  idct64_stage9_neon(step2, step1, cos_bit);

  // stage 10 (shared helper)
  idct64_stage10_neon(step1, step2, cos_bit);

  // stage 11
  // Final symmetric butterfly: out[k] = step2[k] + step2[63-k] for the low
  // half, and out[k] = step2[63-k] - step2[k] mirrored for the high half.

  out[0] = vqaddq_s16(step2[0], step2[63]);
  out[1] = vqaddq_s16(step2[1], step2[62]);
  out[2] = vqaddq_s16(step2[2], step2[61]);
  out[3] = vqaddq_s16(step2[3], step2[60]);
  out[4] = vqaddq_s16(step2[4], step2[59]);
  out[5] = vqaddq_s16(step2[5], step2[58]);
  out[6] = vqaddq_s16(step2[6], step2[57]);
  out[7] = vqaddq_s16(step2[7], step2[56]);
  out[8] = vqaddq_s16(step2[8], step2[55]);
  out[9] = vqaddq_s16(step2[9], step2[54]);
  out[10] = vqaddq_s16(step2[10], step2[53]);
  out[11] = vqaddq_s16(step2[11], step2[52]);
  out[12] = vqaddq_s16(step2[12], step2[51]);
  out[13] = vqaddq_s16(step2[13], step2[50]);
  out[14] = vqaddq_s16(step2[14], step2[49]);
  out[15] = vqaddq_s16(step2[15], step2[48]);
  out[16] = vqaddq_s16(step2[16], step2[47]);
  out[17] = vqaddq_s16(step2[17], step2[46]);
  out[18] = vqaddq_s16(step2[18], step2[45]);
  out[19] = vqaddq_s16(step2[19], step2[44]);
  out[20] = vqaddq_s16(step2[20], step2[43]);
  out[21] = vqaddq_s16(step2[21], step2[42]);
  out[22] = vqaddq_s16(step2[22], step2[41]);
  out[23] = vqaddq_s16(step2[23], step2[40]);
  out[24] = vqaddq_s16(step2[24], step2[39]);
  out[25] = vqaddq_s16(step2[25], step2[38]);
  out[26] = vqaddq_s16(step2[26], step2[37]);
  out[27] = vqaddq_s16(step2[27], step2[36]);
  out[28] = vqaddq_s16(step2[28], step2[35]);
  out[29] = vqaddq_s16(step2[29], step2[34]);
  out[30] = vqaddq_s16(step2[30], step2[33]);
  out[31] = vqaddq_s16(step2[31], step2[32]);
  out[32] = vqsubq_s16(step2[31], step2[32]);
  out[33] = vqsubq_s16(step2[30], step2[33]);
  out[34] = vqsubq_s16(step2[29], step2[34]);
  out[35] = vqsubq_s16(step2[28], step2[35]);
  out[36] = vqsubq_s16(step2[27], step2[36]);
  out[37] = vqsubq_s16(step2[26], step2[37]);
  out[38] = vqsubq_s16(step2[25], step2[38]);
  out[39] = vqsubq_s16(step2[24], step2[39]);
  out[40] = vqsubq_s16(step2[23], step2[40]);
  out[41] = vqsubq_s16(step2[22], step2[41]);
  out[42] = vqsubq_s16(step2[21], step2[42]);
  out[43] = vqsubq_s16(step2[20], step2[43]);
  out[44] = vqsubq_s16(step2[19], step2[44]);
  out[45] = vqsubq_s16(step2[18], step2[45]);
  out[46] = vqsubq_s16(step2[17], step2[46]);
  out[47] = vqsubq_s16(step2[16], step2[47]);
  out[48] = vqsubq_s16(step2[15], step2[48]);
  out[49] = vqsubq_s16(step2[14], step2[49]);
  out[50] = vqsubq_s16(step2[13], step2[50]);
  out[51] = vqsubq_s16(step2[12], step2[51]);
  out[52] = vqsubq_s16(step2[11], step2[52]);
  out[53] = vqsubq_s16(step2[10], step2[53]);
  out[54] = vqsubq_s16(step2[9], step2[54]);
  out[55] = vqsubq_s16(step2[8], step2[55]);
  out[56] = vqsubq_s16(step2[7], step2[56]);
  out[57] = vqsubq_s16(step2[6], step2[57]);
  out[58] = vqsubq_s16(step2[5], step2[58]);
  out[59] = vqsubq_s16(step2[4], step2[59]);
  out[60] = vqsubq_s16(step2[3], step2[60]);
  out[61] = vqsubq_s16(step2[2], step2[61]);
  out[62] = vqsubq_s16(step2[1], step2[62]);
  out[63] = vqsubq_s16(step2[0], step2[63]);
}
3096
idct64_low16_neon(int16x8_t * in,int16x8_t * out,int8_t cos_bit)3097 static inline void idct64_low16_neon(int16x8_t *in, int16x8_t *out,
3098 int8_t cos_bit) {
3099 const int32_t *cospi = cospi_arr(cos_bit);
3100 int16x8_t step2[64], step1[64];
3101
3102 const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
3103 (int16_t)cospi[36], (int16_t)cospi[28]);
3104 const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
3105 (int16_t)cospi[52], (int16_t)cospi[12]);
3106 const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
3107 (int16_t)cospi[40], (int16_t)cospi[24]);
3108 const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
3109 (int16_t)cospi[16], (int16_t)cospi[48]);
3110 const int16x4_t c4 =
3111 set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]),
3112 (int16_t)(-cospi[36]), (int16_t)(-cospi[28]));
3113 const int16x4_t c5 =
3114 set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]),
3115 (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
3116 const int16x4_t c6 =
3117 set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
3118 (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
3119 const int16x4_t c7 =
3120 set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
3121 (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
3122
3123 // stage 1
3124 // stage 2
3125
3126 step2[0] = in[0];
3127 step2[4] = in[8];
3128 step2[8] = in[4];
3129 step2[12] = in[12];
3130 step2[16] = in[2];
3131 step2[20] = in[10];
3132 step2[24] = in[6];
3133 step2[28] = in[14];
3134
3135 btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
3136 btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]);
3137 btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]);
3138 btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
3139 btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
3140 btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]);
3141 btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]);
3142 btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);
3143
3144 // stage 3
3145
3146 step1[0] = step2[0];
3147 step1[4] = step2[4];
3148 step1[8] = step2[8];
3149 step1[12] = step2[12];
3150
3151 btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
3152 btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]);
3153 btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);
3154 btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]);
3155
3156 step1[32] = step2[32];
3157 step1[33] = step2[32];
3158 step1[34] = step2[35];
3159 step1[35] = step2[35];
3160 step1[36] = step2[36];
3161 step1[37] = step2[36];
3162 step1[38] = step2[39];
3163 step1[39] = step2[39];
3164 step1[40] = step2[40];
3165 step1[41] = step2[40];
3166 step1[42] = step2[43];
3167 step1[43] = step2[43];
3168 step1[44] = step2[44];
3169 step1[45] = step2[44];
3170 step1[46] = step2[47];
3171 step1[47] = step2[47];
3172 step1[48] = step2[48];
3173 step1[49] = step2[48];
3174 step1[50] = step2[51];
3175 step1[51] = step2[51];
3176 step1[52] = step2[52];
3177 step1[53] = step2[52];
3178 step1[54] = step2[55];
3179 step1[55] = step2[55];
3180 step1[56] = step2[56];
3181 step1[57] = step2[56];
3182 step1[58] = step2[59];
3183 step1[59] = step2[59];
3184 step1[60] = step2[60];
3185 step1[61] = step2[60];
3186 step1[62] = step2[63];
3187 step1[63] = step2[63];
3188
3189 // stage 4
3190
3191 step2[0] = step1[0];
3192 step2[4] = step1[4];
3193
3194 btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
3195 btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]);
3196 btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
3197 btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]);
3198 btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]);
3199 btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
3200 btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
3201 btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]);
3202 btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]);
3203 btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]);
3204
3205 step2[16] = step1[16];
3206 step2[17] = step1[16];
3207 step2[18] = step1[19];
3208 step2[19] = step1[19];
3209 step2[20] = step1[20];
3210 step2[21] = step1[20];
3211 step2[22] = step1[23];
3212 step2[23] = step1[23];
3213 step2[24] = step1[24];
3214 step2[25] = step1[24];
3215 step2[26] = step1[27];
3216 step2[27] = step1[27];
3217 step2[28] = step1[28];
3218 step2[29] = step1[28];
3219 step2[30] = step1[31];
3220 step2[31] = step1[31];
3221 step2[32] = step1[32];
3222 step2[35] = step1[35];
3223 step2[36] = step1[36];
3224 step2[39] = step1[39];
3225 step2[40] = step1[40];
3226 step2[43] = step1[43];
3227 step2[44] = step1[44];
3228 step2[47] = step1[47];
3229 step2[48] = step1[48];
3230 step2[51] = step1[51];
3231 step2[52] = step1[52];
3232 step2[55] = step1[55];
3233 step2[56] = step1[56];
3234 step2[59] = step1[59];
3235 step2[60] = step1[60];
3236 step2[63] = step1[63];
3237
3238 // stage 5
3239
3240 step1[0] = step2[0];
3241
3242 btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
3243 btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
3244 btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]);
3245 btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]);
3246 btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]);
3247
3248 step1[8] = step2[8];
3249 step1[9] = step2[8];
3250 step1[10] = step2[11];
3251 step1[11] = step2[11];
3252 step1[12] = step2[12];
3253 step1[13] = step2[12];
3254 step1[14] = step2[15];
3255 step1[15] = step2[15];
3256 step1[16] = step2[16];
3257 step1[19] = step2[19];
3258 step1[20] = step2[20];
3259 step1[23] = step2[23];
3260 step1[24] = step2[24];
3261 step1[27] = step2[27];
3262 step1[28] = step2[28];
3263 step1[31] = step2[31];
3264 step1[32] = vqaddq_s16(step2[32], step2[35]);
3265 step1[33] = vqaddq_s16(step2[33], step2[34]);
3266 step1[34] = vqsubq_s16(step2[33], step2[34]);
3267 step1[35] = vqsubq_s16(step2[32], step2[35]);
3268 step1[36] = vqsubq_s16(step2[39], step2[36]);
3269 step1[37] = vqsubq_s16(step2[38], step2[37]);
3270 step1[38] = vqaddq_s16(step2[38], step2[37]);
3271 step1[39] = vqaddq_s16(step2[39], step2[36]);
3272 step1[40] = vqaddq_s16(step2[40], step2[43]);
3273 step1[41] = vqaddq_s16(step2[41], step2[42]);
3274 step1[42] = vqsubq_s16(step2[41], step2[42]);
3275 step1[43] = vqsubq_s16(step2[40], step2[43]);
3276 step1[44] = vqsubq_s16(step2[47], step2[44]);
3277 step1[45] = vqsubq_s16(step2[46], step2[45]);
3278 step1[46] = vqaddq_s16(step2[46], step2[45]);
3279 step1[47] = vqaddq_s16(step2[47], step2[44]);
3280 step1[48] = vqaddq_s16(step2[48], step2[51]);
3281 step1[49] = vqaddq_s16(step2[49], step2[50]);
3282 step1[50] = vqsubq_s16(step2[49], step2[50]);
3283 step1[51] = vqsubq_s16(step2[48], step2[51]);
3284 step1[52] = vqsubq_s16(step2[55], step2[52]);
3285 step1[53] = vqsubq_s16(step2[54], step2[53]);
3286 step1[54] = vqaddq_s16(step2[54], step2[53]);
3287 step1[55] = vqaddq_s16(step2[55], step2[52]);
3288 step1[56] = vqaddq_s16(step2[56], step2[59]);
3289 step1[57] = vqaddq_s16(step2[57], step2[58]);
3290 step1[58] = vqsubq_s16(step2[57], step2[58]);
3291 step1[59] = vqsubq_s16(step2[56], step2[59]);
3292 step1[60] = vqsubq_s16(step2[63], step2[60]);
3293 step1[61] = vqsubq_s16(step2[62], step2[61]);
3294 step1[62] = vqaddq_s16(step2[62], step2[61]);
3295 step1[63] = vqaddq_s16(step2[63], step2[60]);
3296
3297 // stage 6
3298
3299 btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
3300 btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
3301 btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]);
3302 btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
3303 btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
3304 btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]);
3305 btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]);
3306 btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
3307 btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
3308 btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]);
3309 btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]);
3310
3311 step2[4] = step1[4];
3312 step2[5] = step1[4];
3313 step2[6] = step1[7];
3314 step2[7] = step1[7];
3315 step2[8] = step1[8];
3316 step2[11] = step1[11];
3317 step2[12] = step1[12];
3318 step2[15] = step1[15];
3319 step2[16] = vqaddq_s16(step1[16], step1[19]);
3320 step2[17] = vqaddq_s16(step1[17], step1[18]);
3321 step2[18] = vqsubq_s16(step1[17], step1[18]);
3322 step2[19] = vqsubq_s16(step1[16], step1[19]);
3323 step2[20] = vqsubq_s16(step1[23], step1[20]);
3324 step2[21] = vqsubq_s16(step1[22], step1[21]);
3325 step2[22] = vqaddq_s16(step1[22], step1[21]);
3326 step2[23] = vqaddq_s16(step1[23], step1[20]);
3327 step2[24] = vqaddq_s16(step1[24], step1[27]);
3328 step2[25] = vqaddq_s16(step1[25], step1[26]);
3329 step2[26] = vqsubq_s16(step1[25], step1[26]);
3330 step2[27] = vqsubq_s16(step1[24], step1[27]);
3331 step2[28] = vqsubq_s16(step1[31], step1[28]);
3332 step2[29] = vqsubq_s16(step1[30], step1[29]);
3333 step2[30] = vqaddq_s16(step1[30], step1[29]);
3334 step2[31] = vqaddq_s16(step1[31], step1[28]);
3335 step2[32] = step1[32];
3336 step2[33] = step1[33];
3337 step2[38] = step1[38];
3338 step2[39] = step1[39];
3339 step2[40] = step1[40];
3340 step2[41] = step1[41];
3341 step2[46] = step1[46];
3342 step2[47] = step1[47];
3343 step2[48] = step1[48];
3344 step2[49] = step1[49];
3345 step2[54] = step1[54];
3346 step2[55] = step1[55];
3347 step2[56] = step1[56];
3348 step2[57] = step1[57];
3349 step2[62] = step1[62];
3350 step2[63] = step1[63];
3351
3352 // stage 7
3353
3354 btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
3355 btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
3356 btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
3357 btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]);
3358 btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]);
3359
3360 step1[0] = step2[0];
3361 step1[1] = step2[1];
3362 step1[2] = step2[1];
3363 step1[3] = step2[0];
3364 step1[4] = step2[4];
3365 step1[7] = step2[7];
3366 step1[8] = vqaddq_s16(step2[8], step2[11]);
3367 step1[9] = vqaddq_s16(step2[9], step2[10]);
3368 step1[10] = vqsubq_s16(step2[9], step2[10]);
3369 step1[11] = vqsubq_s16(step2[8], step2[11]);
3370 step1[12] = vqsubq_s16(step2[15], step2[12]);
3371 step1[13] = vqsubq_s16(step2[14], step2[13]);
3372 step1[14] = vqaddq_s16(step2[14], step2[13]);
3373 step1[15] = vqaddq_s16(step2[15], step2[12]);
3374 step1[16] = step2[16];
3375 step1[17] = step2[17];
3376 step1[22] = step2[22];
3377 step1[23] = step2[23];
3378 step1[24] = step2[24];
3379 step1[25] = step2[25];
3380 step1[30] = step2[30];
3381 step1[31] = step2[31];
3382 step1[32] = vqaddq_s16(step2[32], step2[39]);
3383 step1[33] = vqaddq_s16(step2[33], step2[38]);
3384 step1[34] = vqaddq_s16(step2[34], step2[37]);
3385 step1[35] = vqaddq_s16(step2[35], step2[36]);
3386 step1[36] = vqsubq_s16(step2[35], step2[36]);
3387 step1[37] = vqsubq_s16(step2[34], step2[37]);
3388 step1[38] = vqsubq_s16(step2[33], step2[38]);
3389 step1[39] = vqsubq_s16(step2[32], step2[39]);
3390 step1[40] = vqsubq_s16(step2[47], step2[40]);
3391 step1[41] = vqsubq_s16(step2[46], step2[41]);
3392 step1[42] = vqsubq_s16(step2[45], step2[42]);
3393 step1[43] = vqsubq_s16(step2[44], step2[43]);
3394 step1[44] = vqaddq_s16(step2[43], step2[44]);
3395 step1[45] = vqaddq_s16(step2[42], step2[45]);
3396 step1[46] = vqaddq_s16(step2[41], step2[46]);
3397 step1[47] = vqaddq_s16(step2[40], step2[47]);
3398 step1[48] = vqaddq_s16(step2[48], step2[55]);
3399 step1[49] = vqaddq_s16(step2[49], step2[54]);
3400 step1[50] = vqaddq_s16(step2[50], step2[53]);
3401 step1[51] = vqaddq_s16(step2[51], step2[52]);
3402 step1[52] = vqsubq_s16(step2[51], step2[52]);
3403 step1[53] = vqsubq_s16(step2[50], step2[53]);
3404 step1[54] = vqsubq_s16(step2[49], step2[54]);
3405 step1[55] = vqsubq_s16(step2[48], step2[55]);
3406 step1[56] = vqsubq_s16(step2[63], step2[56]);
3407 step1[57] = vqsubq_s16(step2[62], step2[57]);
3408 step1[58] = vqsubq_s16(step2[61], step2[58]);
3409 step1[59] = vqsubq_s16(step2[60], step2[59]);
3410 step1[60] = vqaddq_s16(step2[59], step2[60]);
3411 step1[61] = vqaddq_s16(step2[58], step2[61]);
3412 step1[62] = vqaddq_s16(step2[57], step2[62]);
3413 step1[63] = vqaddq_s16(step2[56], step2[63]);
3414
3415 // stage 8
3416
3417 btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
3418 btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
3419 btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
3420 btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
3421 btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
3422 btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
3423 btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]);
3424 btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]);
3425 btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]);
3426 btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]);
3427
3428 step2[0] = vqaddq_s16(step1[0], step1[7]);
3429 step2[1] = vqaddq_s16(step1[1], step1[6]);
3430 step2[2] = vqaddq_s16(step1[2], step1[5]);
3431 step2[3] = vqaddq_s16(step1[3], step1[4]);
3432 step2[4] = vqsubq_s16(step1[3], step1[4]);
3433 step2[5] = vqsubq_s16(step1[2], step1[5]);
3434 step2[6] = vqsubq_s16(step1[1], step1[6]);
3435 step2[7] = vqsubq_s16(step1[0], step1[7]);
3436 step2[8] = step1[8];
3437 step2[9] = step1[9];
3438 step2[14] = step1[14];
3439 step2[15] = step1[15];
3440 step2[16] = vqaddq_s16(step1[16], step1[23]);
3441 step2[17] = vqaddq_s16(step1[17], step1[22]);
3442 step2[18] = vqaddq_s16(step1[18], step1[21]);
3443 step2[19] = vqaddq_s16(step1[19], step1[20]);
3444 step2[20] = vqsubq_s16(step1[19], step1[20]);
3445 step2[21] = vqsubq_s16(step1[18], step1[21]);
3446 step2[22] = vqsubq_s16(step1[17], step1[22]);
3447 step2[23] = vqsubq_s16(step1[16], step1[23]);
3448 step2[24] = vqsubq_s16(step1[31], step1[24]);
3449 step2[25] = vqsubq_s16(step1[30], step1[25]);
3450 step2[26] = vqsubq_s16(step1[29], step1[26]);
3451 step2[27] = vqsubq_s16(step1[28], step1[27]);
3452 step2[28] = vqaddq_s16(step1[28], step1[27]);
3453 step2[29] = vqaddq_s16(step1[29], step1[26]);
3454 step2[30] = vqaddq_s16(step1[30], step1[25]);
3455 step2[31] = vqaddq_s16(step1[31], step1[24]);
3456 step2[32] = step1[32];
3457 step2[33] = step1[33];
3458 step2[34] = step1[34];
3459 step2[35] = step1[35];
3460 step2[44] = step1[44];
3461 step2[45] = step1[45];
3462 step2[46] = step1[46];
3463 step2[47] = step1[47];
3464 step2[48] = step1[48];
3465 step2[49] = step1[49];
3466 step2[50] = step1[50];
3467 step2[51] = step1[51];
3468 step2[60] = step1[60];
3469 step2[61] = step1[61];
3470 step2[62] = step1[62];
3471 step2[63] = step1[63];
3472
3473 // stage 9
3474 idct64_stage9_neon(step2, step1, cos_bit);
3475
3476 // stage 10
3477 idct64_stage10_neon(step1, step2, cos_bit);
3478
3479 // stage 11
3480
3481 out[0] = vqaddq_s16(step2[0], step2[63]);
3482 out[1] = vqaddq_s16(step2[1], step2[62]);
3483 out[2] = vqaddq_s16(step2[2], step2[61]);
3484 out[3] = vqaddq_s16(step2[3], step2[60]);
3485 out[4] = vqaddq_s16(step2[4], step2[59]);
3486 out[5] = vqaddq_s16(step2[5], step2[58]);
3487 out[6] = vqaddq_s16(step2[6], step2[57]);
3488 out[7] = vqaddq_s16(step2[7], step2[56]);
3489 out[8] = vqaddq_s16(step2[8], step2[55]);
3490 out[9] = vqaddq_s16(step2[9], step2[54]);
3491 out[10] = vqaddq_s16(step2[10], step2[53]);
3492 out[11] = vqaddq_s16(step2[11], step2[52]);
3493 out[12] = vqaddq_s16(step2[12], step2[51]);
3494 out[13] = vqaddq_s16(step2[13], step2[50]);
3495 out[14] = vqaddq_s16(step2[14], step2[49]);
3496 out[15] = vqaddq_s16(step2[15], step2[48]);
3497 out[16] = vqaddq_s16(step2[16], step2[47]);
3498 out[17] = vqaddq_s16(step2[17], step2[46]);
3499 out[18] = vqaddq_s16(step2[18], step2[45]);
3500 out[19] = vqaddq_s16(step2[19], step2[44]);
3501 out[20] = vqaddq_s16(step2[20], step2[43]);
3502 out[21] = vqaddq_s16(step2[21], step2[42]);
3503 out[22] = vqaddq_s16(step2[22], step2[41]);
3504 out[23] = vqaddq_s16(step2[23], step2[40]);
3505 out[24] = vqaddq_s16(step2[24], step2[39]);
3506 out[25] = vqaddq_s16(step2[25], step2[38]);
3507 out[26] = vqaddq_s16(step2[26], step2[37]);
3508 out[27] = vqaddq_s16(step2[27], step2[36]);
3509 out[28] = vqaddq_s16(step2[28], step2[35]);
3510 out[29] = vqaddq_s16(step2[29], step2[34]);
3511 out[30] = vqaddq_s16(step2[30], step2[33]);
3512 out[31] = vqaddq_s16(step2[31], step2[32]);
3513 out[32] = vqsubq_s16(step2[31], step2[32]);
3514 out[33] = vqsubq_s16(step2[30], step2[33]);
3515 out[34] = vqsubq_s16(step2[29], step2[34]);
3516 out[35] = vqsubq_s16(step2[28], step2[35]);
3517 out[36] = vqsubq_s16(step2[27], step2[36]);
3518 out[37] = vqsubq_s16(step2[26], step2[37]);
3519 out[38] = vqsubq_s16(step2[25], step2[38]);
3520 out[39] = vqsubq_s16(step2[24], step2[39]);
3521 out[40] = vqsubq_s16(step2[23], step2[40]);
3522 out[41] = vqsubq_s16(step2[22], step2[41]);
3523 out[42] = vqsubq_s16(step2[21], step2[42]);
3524 out[43] = vqsubq_s16(step2[20], step2[43]);
3525 out[44] = vqsubq_s16(step2[19], step2[44]);
3526 out[45] = vqsubq_s16(step2[18], step2[45]);
3527 out[46] = vqsubq_s16(step2[17], step2[46]);
3528 out[47] = vqsubq_s16(step2[16], step2[47]);
3529 out[48] = vqsubq_s16(step2[15], step2[48]);
3530 out[49] = vqsubq_s16(step2[14], step2[49]);
3531 out[50] = vqsubq_s16(step2[13], step2[50]);
3532 out[51] = vqsubq_s16(step2[12], step2[51]);
3533 out[52] = vqsubq_s16(step2[11], step2[52]);
3534 out[53] = vqsubq_s16(step2[10], step2[53]);
3535 out[54] = vqsubq_s16(step2[9], step2[54]);
3536 out[55] = vqsubq_s16(step2[8], step2[55]);
3537 out[56] = vqsubq_s16(step2[7], step2[56]);
3538 out[57] = vqsubq_s16(step2[6], step2[57]);
3539 out[58] = vqsubq_s16(step2[5], step2[58]);
3540 out[59] = vqsubq_s16(step2[4], step2[59]);
3541 out[60] = vqsubq_s16(step2[3], step2[60]);
3542 out[61] = vqsubq_s16(step2[2], step2[61]);
3543 out[62] = vqsubq_s16(step2[1], step2[62]);
3544 out[63] = vqsubq_s16(step2[0], step2[63]);
3545 }
3546
// Functions for blocks with eob at DC and within
// topleft 8x8, 16x16, 32x32 corner
// Indexed as [txfm size][1D txfm type][zero-fill level]: the last index
// selects the low1/low8/low16/full variant based on the eob position
// (mapped through lowbd_txfm_all_1d_zeros_idx). NULL entries mark
// combinations with no specialized kernel (e.g. ADST for 32/64-point, or
// identity, which is handled by the dedicated identity paths).
static const transform_neon
    lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { idct8_low1_neon, idct8_neon, NULL, NULL },
        { iadst8_low1_neon, iadst8_neon, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      {
          { idct16_low1_neon, idct16_low8_neon, idct16_neon, NULL },
          { iadst16_low1_neon, iadst16_low8_neon, iadst16_neon, NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { idct32_low1_neon, idct32_low8_neon, idct32_low16_neon, idct32_neon },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      { { idct64_low1_neon, idct64_low8_neon, idct64_low16_neon,
          idct64_low32_neon },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };
3572
// 2D inverse transform + add for IDTX: both directions use the identity
// transform, so each pass reduces to a rounding shift. Coefficients are
// loaded from the eob-bounded region, shifted, transposed into column
// order, shifted again and accumulated into the destination frame.
static inline void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
                                                  uint8_t *output, int stride,
                                                  TX_TYPE tx_type,
                                                  TX_SIZE tx_size, int eob) {
  (void)tx_type;
  int16x8_t buf_row[32 * 4];  // row-pass working buffer
  int16x8_t buf_col[32 * 4];  // transposed, column-order buffer
  int eobx, eoby;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  // Zero both buffers so the region beyond the eob bound contributes nothing.
  lowbd_inv_txfm2d_memset_neon(&buf_row[0],
                               (txfm_size_col * (txfm_size_row) >> 3), 0);
  lowbd_inv_txfm2d_memset_neon(&buf_col[0],
                               (txfm_size_col * (txfm_size_row) >> 3), 0);
  const int buf_size_w_div8 = txfm_size_col >> 3;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
  const int input_stride = txfm_size_row;  // input is stored column-major
  int col_offset = 0;  // running write offset into buf_col

  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
    int16x8_t *row = &buf_row[i * txfm_size_col];
    load_buffer_32bit_to_16bit_neon(input, input_stride, row,
                                    buf_size_nonzero_w);
    input += 8;
    if (abs(rect_type) == 1) {
      // 1:2 / 2:1 rectangular transforms take an extra sqrt(2) scaling.
      round_shift_for_rect(row, row, buf_size_nonzero_w);
    }
    identity_txfm_round_neon(row, row, txw_idx, buf_size_nonzero_w, -shift[0]);
    for (int j = 0; j < buf_size_w_div8; ++j) {
      transpose_arrays_s16_8x8(&row[j * 8],
                               &buf_col[col_offset + txfm_size_row * j]);
    }
    col_offset += 8;
  }
  for (int j = 0; j < buf_size_w_div8; ++j) {
    identity_txfm_round_neon(&buf_col[j * txfm_size_row],
                             &buf_col[j * txfm_size_row], txh_idx,
                             txfm_size_row, -shift[1]);
  }
  // Accumulate the residual into the frame, 16 or 8 columns at a time.
  if (txfm_size_col >= 16) {
    for (int i = 0; i < (txfm_size_col >> 4); i++) {
      lowbd_add_flip_buffer_16xn_neon(&buf_col[i * txfm_size_row * 2],
                                      output + 16 * i, stride, 0,
                                      txfm_size_row);
    }
  } else if (txfm_size_col == 8) {
    lowbd_add_flip_buffer_8xn_neon(buf_col, output, stride, 0, txfm_size_row);
  }
}
3626
// 2D inverse transform + add for tx types whose vertical (column) transform
// is the identity (H_DCT, H_ADST, H_FLIPADST): a real row transform is
// followed by an identity column pass (a rounding shift only).
static inline void lowbd_inv_txfm2d_add_v_identity_neon(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  int16x8_t a[16 * 2];  // row-pass working buffer
  int16x8_t b[16 * 2];  // transposed, column-order buffer
  int eobx, eoby, ud_flip, lr_flip;
  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  // Zero b[] so rows beyond the eob-bounded region contribute nothing.
  lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3),
                               0);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const int buf_size_w_div8 = txfm_size_col >> 3;
  // Only the region containing nonzero coefficients (bounded by eobx/eoby)
  // needs to be loaded and transformed.
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
  const int input_stride = txfm_size_row;  // input is stored column-major
  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  int temp_b = 0;  // running write offset into b[] for transposed rows
  // Row transform variant (low1/low8/full) selected from the eob position.
  const transform_neon row_txfm =
      lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];

  assert(row_txfm != NULL);

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
    int16x8_t *cur_a = &a[i * txfm_size_col];
    load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a,
                                    buf_size_nonzero_w);
    input += 8;
    if (abs(rect_type) == 1) {
      // 1:2 / 2:1 rectangular transforms take an extra sqrt(2) scaling.
      round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w);
    }
    row_txfm(cur_a, cur_a, INV_COS_BIT);
    round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]);
    if (lr_flip == 1) {
      // Flip left/right while transposing into column order: reverse each
      // 8x8 tile internally and mirror the tile write position.
      for (int j = 0; j < buf_size_w_div8; ++j) {
        flip_buf_ud_neon(&cur_a[j * 8], 8);
        transpose_arrays_s16_8x8(
            &cur_a[j * 8],
            &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
      }
      temp_b += 8;
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
      }
      temp_b += 8;
    }
  }
  // Identity column transform: a rounding shift per column group.
  for (int j = 0; j < buf_size_w_div8; ++j) {
    identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row],
                             txh_idx, txfm_size_row, -shift[1]);
  }
  // Accumulate the residual into the frame, 16 or 8 columns at a time.
  if (txfm_size_col >= 16) {
    for (int i = 0; i < (txfm_size_col >> 4); i++) {
      lowbd_add_flip_buffer_16xn_neon(
          &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row);
    }
  } else if (txfm_size_col == 8) {
    lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row);
  }
}
3693
// 2D inverse transform + add for tx types whose horizontal (row) transform
// is the identity (V_DCT, V_ADST, V_FLIPADST): an identity row pass (a
// rounding shift only) is followed by a real column transform.
static inline void lowbd_inv_txfm2d_add_h_identity_neon(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  int16x8_t row_buf[16 * 2];  // row-pass working buffer
  int16x8_t col_buf[16 * 2];  // transposed, column-order buffer
  int eobx, eoby, ud_flip, lr_flip;
  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  // Zero the row buffer so lanes beyond the eob bound contribute nothing.
  lowbd_inv_txfm2d_memset_neon(&row_buf[0],
                               (txfm_size_col * (txfm_size_row) >> 3), 0);
  const int buf_size_w_div8 = txfm_size_col >> 3;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
  const int input_stride = txfm_size_row;  // input is stored column-major
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  int col_offset = 0;  // running write offset into col_buf
  // Column transform variant (low1/low8/full) selected from eob position.
  const transform_neon col_txfm =
      lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
    int16x8_t *row = &row_buf[i * txfm_size_col];
    load_buffer_32bit_to_16bit_neon(input, input_stride, row,
                                    buf_size_nonzero_w);
    input += 8;
    if (abs(rect_type) == 1) {
      // 1:2 / 2:1 rectangular transforms take an extra sqrt(2) scaling.
      round_shift_for_rect(row, row, buf_size_nonzero_w);
    }
    identity_txfm_round_neon(row, row, txw_idx, buf_size_nonzero_w, -shift[0]);
    for (int j = 0; j < buf_size_w_div8; ++j) {
      transpose_arrays_s16_8x8(&row[j * 8],
                               &col_buf[col_offset + txfm_size_row * j]);
    }
    col_offset += 8;
  }
  for (int j = 0; j < buf_size_w_div8; ++j) {
    col_txfm(&col_buf[j * txfm_size_row], &col_buf[j * txfm_size_row],
             INV_COS_BIT);
    round_shift_array_16_neon(&col_buf[j * txfm_size_row], txfm_size_row,
                              -shift[1]);
  }
  // Accumulate the residual into the frame, honoring an up/down flip.
  if (txfm_size_col >= 16) {
    for (int i = 0; i < (txfm_size_col >> 4); i++) {
      lowbd_add_flip_buffer_16xn_neon(&col_buf[i * txfm_size_row * 2],
                                      output + 16 * i, stride, ud_flip,
                                      txfm_size_row);
    }
  } else if (txfm_size_col == 8) {
    lowbd_add_flip_buffer_8xn_neon(col_buf, output, stride, ud_flip,
                                   txfm_size_row);
  }
}
3751
// 4x4 2D inverse transform + add, computed with scalar 1D transforms.
// The coefficient input is stored column-major (stride == txfm_size_row).
static inline void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
                                                 uint8_t *output, int stride,
                                                 TX_TYPE tx_type, int eob) {
  (void)eob;
  TX_SIZE tx_size = TX_4X4;
  // Workspace: 4x4 intermediate block plus one temp row and one temp column.
  DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 8 + 8]);
  int32_t *temp_in = txfm_buf;

  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
  int32_t *temp_out = temp_in + buf_offset;
  int32_t *buf = temp_out + buf_offset;
  int32_t *buf_ptr = buf;
  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 16, 16 };
  // 1D transforms selected from the (tx type -> 1D type) lookup tables.
  const transform_1d_neon row_txfm =
      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_neon col_txfm =
      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // Row pass: gather each row from the column-major input and transform it.
  for (int row = 0; row < txfm_size_row; ++row) {
    for (int c = 0; c < txfm_size_col; ++c) {
      temp_in[c] = input[c * txfm_size_row];
    }
    row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range);
    input++;
    buf_ptr += txfm_size_col;
  }

  // Column pass, honoring any left/right and up/down flips.
  for (int c = 0; c < txfm_size_col; ++c) {
    const int src_c = lr_flip ? (txfm_size_col - c - 1) : c;
    for (int r = 0; r < txfm_size_row; ++r) {
      temp_in[r] = buf[r * txfm_size_col + src_c];
    }
    clamp_buf(temp_in, txfm_size_row, 16);
    col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);

    for (int r = 0; r < txfm_size_row; ++r) {
      const int src_r = ud_flip ? (txfm_size_row - r - 1) : r;
      output[r * stride + c] =
          clip_pixel(output[r * stride + c] + temp_out[src_r]);
    }
  }
}
3815
// 4x8 2D inverse transform + add, computed with scalar 1D transforms.
// The column-major coefficients are pre-scaled by NewInvSqrt2 during the
// row gather (rectangular-transform scaling).
static void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
                                          int stride, TX_TYPE tx_type,
                                          int eob) {
  (void)eob;
  TX_SIZE tx_size = TX_4X8;
  // Workspace: 4x8 intermediate block plus one temp row and one temp column.
  DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]);
  int32_t *temp_in = txfm_buf;

  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
  int32_t *temp_out = temp_in + buf_offset;
  int32_t *buf = temp_out + buf_offset;
  int32_t *buf_ptr = buf;
  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16,
                                                   16, 16, 16, 16 };
  // 1D transforms selected from the (tx type -> 1D type) lookup tables.
  const transform_1d_neon row_txfm =
      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_neon col_txfm =
      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // Row pass: gather, scale by 1/sqrt(2), then transform each row.
  for (int row = 0; row < txfm_size_row; ++row) {
    for (int c = 0; c < txfm_size_col; ++c) {
      temp_in[c] = round_shift((int64_t)input[c * txfm_size_row] * NewInvSqrt2,
                               NewSqrt2Bits);
    }
    row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range);
    input++;
    buf_ptr += txfm_size_col;
  }

  // Column pass, honoring any left/right and up/down flips.
  for (int c = 0; c < txfm_size_col; ++c) {
    const int src_c = lr_flip ? (txfm_size_col - c - 1) : c;
    for (int r = 0; r < txfm_size_row; ++r) {
      temp_in[r] = buf[r * txfm_size_col + src_c];
    }
    clamp_buf(temp_in, txfm_size_row, 16);
    col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);

    for (int r = 0; r < txfm_size_row; ++r) {
      const int src_r = ud_flip ? (txfm_size_row - r - 1) : r;
      output[r * stride + c] =
          clip_pixel(output[r * stride + c] + temp_out[src_r]);
    }
  }
}
3881
// 8x4 2D inverse transform + add, computed with scalar 1D transforms.
// The coefficient input is stored column-major (stride == txfm_size_row);
// each coefficient is pre-scaled by NewInvSqrt2 (rectangular-transform
// scaling) before the row transform.
static void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
                                          int stride, TX_TYPE tx_type,
                                          int eob) {
  (void)eob;  // all coefficients are processed regardless of eob
  TX_SIZE tx_size = TX_8X4;
  // Workspace: 8x4 intermediate block plus one temp row and one temp column.
  DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]);
  int32_t *temp_in = txfm_buf;

  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
  int32_t *temp_out = temp_in + buf_offset;
  int32_t *buf = temp_out + buf_offset;
  int32_t *buf_ptr = buf;
  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16,
                                                   16, 16, 16, 16 };
  int r;
  // 1D transforms selected from the (tx type -> 1D type) lookup tables.
  const transform_1d_neon row_txfm =
      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_neon col_txfm =
      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // Row pass: gather one row of column-major coefficients, scale by
  // 1/sqrt(2), then run the 1D row transform.
  for (int i = 0; i < txfm_size_row; i++) {
    for (int c = 0; c < txfm_size_col; c++)
      temp_in[c] = round_shift((int64_t)input[c * txfm_size_row] * NewInvSqrt2,
                               NewSqrt2Bits);

    row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range);
    input++;
    buf_ptr += txfm_size_col;
  }

  // Column pass: transform each column (optionally left/right flipped),
  // round-shift and accumulate into the destination (optionally flipped
  // upside down).
  for (int c = 0; c < txfm_size_col; ++c) {
    if (lr_flip == 0) {
      for (r = 0; r < txfm_size_row; ++r)
        temp_in[r] = buf[r * txfm_size_col + c];
    } else {
      // flip left right
      for (r = 0; r < txfm_size_row; ++r)
        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
    }
    clamp_buf(temp_in, txfm_size_row, 16);
    col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);

    if (ud_flip == 0) {
      for (r = 0; r < txfm_size_row; ++r) {
        output[r * stride + c] =
            clip_pixel(output[r * stride + c] + temp_out[r]);
      }
    } else {
      // flip upside down
      for (r = 0; r < txfm_size_row; ++r) {
        output[r * stride + c] = clip_pixel(output[r * stride + c] +
                                            temp_out[txfm_size_row - r - 1]);
      }
    }
  }
}
3947
// 4x16 2D inverse transform + add, computed with scalar 1D transforms.
// The coefficient input is stored column-major (stride == txfm_size_row).
static void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input,
                                           uint8_t *output, int stride,
                                           TX_TYPE tx_type, int eob) {
  (void)eob;
  TX_SIZE tx_size = TX_4X16;
  // Workspace: 4x16 intermediate block plus one temp row and one temp column.
  DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
  int32_t *temp_in = txfm_buf;

  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
  int32_t *temp_out = temp_in + buf_offset;
  int32_t *buf = temp_out + buf_offset;
  int32_t *buf_ptr = buf;
  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16,
                                                   16, 16, 16, 16, 16 };
  // 1D transforms selected from the (tx type -> 1D type) lookup tables.
  const transform_1d_neon row_txfm =
      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_neon col_txfm =
      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // Row pass: gather each row, transform, then apply the stage-0 shift.
  for (int row = 0; row < txfm_size_row; ++row) {
    for (int c = 0; c < txfm_size_col; ++c) {
      temp_in[c] = input[c * txfm_size_row];
    }
    row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range);
    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
    input++;
    buf_ptr += txfm_size_col;
  }

  // Column pass, honoring any left/right and up/down flips.
  for (int c = 0; c < txfm_size_col; ++c) {
    const int src_c = lr_flip ? (txfm_size_col - c - 1) : c;
    for (int r = 0; r < txfm_size_row; ++r) {
      temp_in[r] = buf[r * txfm_size_col + src_c];
    }
    clamp_buf(temp_in, txfm_size_row, 16);
    col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);

    for (int r = 0; r < txfm_size_row; ++r) {
      const int src_r = ud_flip ? (txfm_size_row - r - 1) : r;
      output[r * stride + c] =
          clip_pixel(output[r * stride + c] + temp_out[src_r]);
    }
  }
}
4012
// 16x4 2D inverse transform + add, computed with scalar 1D transforms.
// The coefficient input is stored column-major (stride == txfm_size_row).
static void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input,
                                           uint8_t *output, int stride,
                                           TX_TYPE tx_type, int eob) {
  (void)eob;  // all coefficients are processed regardless of eob
  TX_SIZE tx_size = TX_16X4;
  // Workspace: 16x4 intermediate block plus one temp row and one temp column.
  DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]);
  int32_t *temp_in = txfm_buf;

  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
  int32_t *temp_out = temp_in + buf_offset;
  int32_t *buf = temp_out + buf_offset;
  int32_t *buf_ptr = buf;
  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16,
                                                   16, 16, 16, 16, 16 };
  int r;
  // 1D transforms selected from the (tx type -> 1D type) lookup tables.
  const transform_1d_neon row_txfm =
      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_neon col_txfm =
      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // Row pass: gather each row from the column-major input, transform and
  // apply the stage-0 rounding shift.
  for (int i = 0; i < txfm_size_row; i++) {
    for (int c = 0; c < txfm_size_col; c++)
      temp_in[c] = input[c * txfm_size_row];
    row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range);
    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
    input++;
    buf_ptr += txfm_size_col;
  }

  // Column pass: transform each column (optionally left/right flipped),
  // round-shift and accumulate into the destination (optionally flipped
  // upside down).
  for (int c = 0; c < txfm_size_col; ++c) {
    if (lr_flip == 0) {
      for (r = 0; r < txfm_size_row; ++r)
        temp_in[r] = buf[r * txfm_size_col + c];
    } else {
      // flip left right
      for (r = 0; r < txfm_size_row; ++r)
        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
    }
    clamp_buf(temp_in, txfm_size_row, 16);
    col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);

    if (ud_flip == 0) {
      for (r = 0; r < txfm_size_row; ++r) {
        output[r * stride + c] =
            clip_pixel(output[r * stride + c] + temp_out[r]);
      }
    } else {
      // flip upside down
      for (r = 0; r < txfm_size_row; ++r) {
        output[r * stride + c] = clip_pixel(output[r * stride + c] +
                                            temp_out[txfm_size_row - r - 1]);
      }
    }
  }
}
4077
// General 2D inverse transform + add for tx types with non-identity
// transforms in both directions (DCT/ADST/FLIPADST combinations).
static inline void lowbd_inv_txfm2d_add_no_identity_neon(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  int16x8_t a[64 * 8];  // row-pass working buffer (sized for 64x64)
  int16x8_t b[64 * 8];  // transposed, column-order buffer
  int eobx, eoby, ud_flip, lr_flip;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const int buf_size_w_div8 = txfm_size_col >> 3;
  // Only the region containing nonzero coefficients (bounded by eobx/eoby)
  // needs to be loaded and transformed.
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
  // Column-major input; the stride is capped at 32 (64-point transforms
  // are stored with at most 32 coefficient rows).
  const int input_stride = AOMMIN(32, txfm_size_row);
  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  int temp_b = 0;  // running write offset into b[] for transposed rows

  // 1D transform variants (low1/low8/low16/full) selected from eob position.
  const transform_neon row_txfm =
      lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_neon col_txfm =
      lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
    int16x8_t *cur_a = &a[i * txfm_size_col];
    load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a,
                                    buf_size_nonzero_w);
    input += 8;
    if (abs(rect_type) == 1) {
      // 1:2 / 2:1 rectangular transforms take an extra sqrt(2) scaling.
      round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w);
    }
    row_txfm(cur_a, cur_a, INV_COS_BIT);
    round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]);
    if (lr_flip == 1) {
      // Flip left/right while transposing into column order: reverse each
      // 8x8 tile internally and mirror the tile write position.
      for (int j = 0; j < buf_size_w_div8; ++j) {
        flip_buf_ud_neon(&cur_a[j * 8], 8);
        transpose_arrays_s16_8x8(
            &cur_a[j * 8],
            &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
      }
      temp_b += 8;
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
      }
      temp_b += 8;
    }
  }
  for (int j = 0; j < buf_size_w_div8; ++j) {
    col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT);
    round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, -shift[1]);
  }

  // Accumulate the residual into the frame, honoring an up/down flip.
  if (txfm_size_col >= 16) {
    for (int i = 0; i < (txfm_size_col >> 4); i++) {
      lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],
                                      output + 16 * i, stride, ud_flip,
                                      txfm_size_row);
    }
  } else if (txfm_size_col == 8) {
    lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
  }
}
4149
lowbd_inv_txfm2d_add_universe_neon(const int32_t * input,uint8_t * output,int stride,TX_TYPE tx_type,TX_SIZE tx_size,int eob)4150 static inline void lowbd_inv_txfm2d_add_universe_neon(
4151 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
4152 TX_SIZE tx_size, int eob) {
4153 switch (tx_type) {
4154 case IDTX:
4155 lowbd_inv_txfm2d_add_idtx_neon(input, output, stride, tx_type, tx_size,
4156 eob);
4157 break;
4158
4159 case H_DCT:
4160 case H_ADST:
4161 case H_FLIPADST:
4162 lowbd_inv_txfm2d_add_v_identity_neon(input, output, stride, tx_type,
4163 tx_size, eob);
4164 break;
4165
4166 case V_DCT:
4167 case V_ADST:
4168 case V_FLIPADST:
4169 lowbd_inv_txfm2d_add_h_identity_neon(input, output, stride, tx_type,
4170 tx_size, eob);
4171 break;
4172
4173 default:
4174 lowbd_inv_txfm2d_add_no_identity_neon(input, output, stride, tx_type,
4175 tx_size, eob);
4176 break;
4177 }
4178 }
4179
4180 // This function is used by av1_inv_txfm2d_test.cc.
4181 void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
4182 int stride, TX_TYPE tx_type, TX_SIZE tx_size,
4183 int eob);
4184
av1_lowbd_inv_txfm2d_add_neon(const int32_t * input,uint8_t * output,int stride,TX_TYPE tx_type,TX_SIZE tx_size,int eob)4185 void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
4186 int stride, TX_TYPE tx_type, TX_SIZE tx_size,
4187 int eob) {
4188 switch (tx_size) {
4189 case TX_4X4:
4190 lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, eob);
4191 break;
4192
4193 case TX_4X8:
4194 lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, eob);
4195 break;
4196
4197 case TX_8X4:
4198 lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, eob);
4199 break;
4200
4201 case TX_4X16:
4202 lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, eob);
4203 break;
4204
4205 case TX_16X4:
4206 lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, eob);
4207 break;
4208
4209 default:
4210 lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
4211 tx_size, eob);
4212 break;
4213 }
4214 }
av1_inv_txfm_add_neon(const tran_low_t * dqcoeff,uint8_t * dst,int stride,const TxfmParam * txfm_param)4215 void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
4216 const TxfmParam *txfm_param) {
4217 const TX_TYPE tx_type = txfm_param->tx_type;
4218 if (!txfm_param->lossless) {
4219 av1_lowbd_inv_txfm2d_add_neon(dqcoeff, dst, stride, tx_type,
4220 txfm_param->tx_size, txfm_param->eob);
4221 } else {
4222 av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
4223 }
4224 }
4225