1 /*
2 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <arm_neon.h>
13 #include <assert.h>
14
15 #include "av1/common/av1_inv_txfm1d_cfg.h"
16 #include "av1/common/idct.h"
17 #include "config/aom_config.h"
18 #include "config/av1_rtcd.h"
19
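// TRANSPOSE_4X4 transposes a 4x4 block of int32 values held in four
// int32x4_t registers: x0..x3 are the input rows, y0..y3 the transposed
// output rows. The AArch64 variant combines vtrnq_s32 with the 64-bit zip
// instructions; the fallback below builds the same result with vextq_s32.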
20 #if AOM_ARCH_AARCH64
21 #define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \
22 do { \
23 int32x4x2_t swap_low = vtrnq_s32(x0, x1); \
24 int32x4x2_t swap_high = vtrnq_s32(x2, x3); \
25 y0 = vreinterpretq_s32_s64( \
26 vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[0]), \
27 vreinterpretq_s64_s32(swap_high.val[0]))); \
28 y1 = vreinterpretq_s32_s64( \
29 vzip1q_s64(vreinterpretq_s64_s32(swap_low.val[1]), \
30 vreinterpretq_s64_s32(swap_high.val[1]))); \
31 y2 = vreinterpretq_s32_s64( \
32 vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[0]), \
33 vreinterpretq_s64_s32(swap_high.val[0]))); \
34 y3 = vreinterpretq_s32_s64( \
35 vzip2q_s64(vreinterpretq_s64_s32(swap_low.val[1]), \
36 vreinterpretq_s64_s32(swap_high.val[1]))); \
37 } while (0)
38 #else
39 #define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \
40 do { \
41 int32x4x2_t swap_low = vtrnq_s32(x0, x1); \
42 int32x4x2_t swap_high = vtrnq_s32(x2, x3); \
43 y0 = vextq_s32(vextq_s32(swap_low.val[0], swap_low.val[0], 2), \
44 swap_high.val[0], 2); \
45 y1 = vextq_s32(vextq_s32(swap_low.val[1], swap_low.val[1], 2), \
46 swap_high.val[1], 2); \
47 y2 = vextq_s32(swap_low.val[0], \
48 vextq_s32(swap_high.val[0], swap_high.val[0], 2), 2); \
49 y3 = vextq_s32(swap_low.val[1], \
50 vextq_s32(swap_high.val[1], swap_high.val[1], 2), 2); \
51 } while (0)
52 #endif // AOM_ARCH_AARCH64
53
54 static inline void transpose_4x4(const int32x4_t *in, int32x4_t *out) {
55 TRANSPOSE_4X4(in[0], in[1], in[2], in[3], out[0], out[1], out[2], out[3]);
56 }
57
58 static inline void transpose_8x8(const int32x4_t *in, int32x4_t *out) {
59 TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]);
60 TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]);
61 TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]);
62 TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13],
63 out[15]);
64 }
65
66 static inline void round_shift_array_32_neon(int32x4_t *input,
67 int32x4_t *output, const int size,
68 const int bit) {
69 const int32x4_t v_bit = vdupq_n_s32(-bit);
70 for (int i = 0; i < size; i++) {
71 output[i] = vrshlq_s32(input[i], v_bit);
72 }
73 }
74
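// Multiplies each vector by NewInvSqrt2 (the fixed-point 1/sqrt(2) factor,
// as used for rescaling rectangular transforms) and rounds the product back
// down by NewSqrt2Bits.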
75 static inline void round_shift_rect_array_32_neon(int32x4_t *input,
76 int32x4_t *output,
77 const int size) {
78 for (int i = 0; i < size; i++) {
79 const int32x4_t r0 = vmulq_n_s32(input[i], NewInvSqrt2);
80 output[i] = vrshrq_n_s32(r0, NewSqrt2Bits);
81 }
82 }
83
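// Half-butterfly helpers. Each returns, per lane, the rounded and
// right-shifted weighted sum of two inputs, e.g. half_btf_neon_r computes
//   (w0 * n0 + w1 * n1 + (1 << (bit - 1))) >> bit
// where n0/n1 are scalar cospi weights, w0/w1 are data vectors, *rnding
// holds 1 << (bit - 1) and *v_bit holds -bit. The mode variants change the
// signs: mode11 negates both products, mode01 subtracts the second product,
// and mode10 returns w1 * n1 - w0 * n0.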
84 static inline int32x4_t half_btf_neon_r(const int32_t *n0, const int32x4_t *w0,
85 const int32_t *n1, const int32x4_t *w1,
86 const int32x4_t *v_bit,
87 const int32x4_t *rnding) {
88 int32x4_t x;
89 x = vmlaq_n_s32(*rnding, *w0, *n0);
90 x = vmlaq_n_s32(x, *w1, *n1);
91 x = vshlq_s32(x, *v_bit);
92 return x;
93 }
94
95 static inline int32x4_t half_btf_neon_mode11_r(
96 const int32_t *n0, const int32x4_t *w0, const int32_t *n1,
97 const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) {
98 int32x4_t x;
99 x = vmlaq_n_s32(*rnding, *w0, -*n0);
100 x = vmlaq_n_s32(x, *w1, -*n1);
101 x = vshlq_s32(x, *v_bit);
102 return x;
103 }
104
105 static inline int32x4_t half_btf_neon_mode01_r(
106 const int32_t *n0, const int32x4_t *w0, const int32_t *n1,
107 const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) {
108 int32x4_t x;
109 x = vmlaq_n_s32(*rnding, *w0, *n0);
110 x = vmlsq_n_s32(x, *w1, *n1);
111 x = vshlq_s32(x, *v_bit);
112 return x;
113 }
114
115 static inline int32x4_t half_btf_neon_mode10_r(
116 const int32_t *n0, const int32x4_t *w0, const int32_t *n1,
117 const int32x4_t *w1, const int32x4_t *v_bit, const int32x4_t *rnding) {
118 int32x4_t x;
119 x = vmlaq_n_s32(*rnding, *w1, *n1);
120 x = vmlsq_n_s32(x, *w0, *n0);
121 x = vshlq_s32(x, *v_bit);
122 return x;
123 }
124
125 static inline int32x4_t half_btf_0_neon_r(const int32_t *n0,
126 const int32x4_t *w0,
127 const int32x4_t *v_bit,
128 const int32x4_t *rnding) {
129 int32x4_t x;
130 x = vmlaq_n_s32(*rnding, *w0, *n0);
131 x = vshlq_s32(x, *v_bit);
132 return x;
133 }
134
135 static inline int32x4_t half_btf_0_m_neon_r(const int32_t *n0,
136 const int32x4_t *w0,
137 const int32x4_t *v_bit,
138 const int32x4_t *rnding) {
139 int32x4_t x;
140 x = vmlaq_n_s32(*rnding, *w0, -*n0);
141 x = vshlq_s32(x, *v_bit);
142 return x;
143 }
144
145 static inline void flip_buf_neon(int32x4_t *in, int32x4_t *out, int size) {
146 for (int i = 0; i < size; ++i) {
147 out[size - i - 1] = in[i];
148 }
149 }
150
151 typedef void (*fwd_transform_1d_neon)(int32x4_t *in, int32x4_t *out, int bit,
152 const int num_cols);
153
154 typedef void (*transform_1d_neon)(int32x4_t *in, int32x4_t *out, int32_t bit,
155 int32_t do_cols, int32_t bd,
156 int32_t out_shift);
157
158 static inline uint16x8_t highbd_clamp_u16(uint16x8_t *u, const uint16x8_t *min,
159 const uint16x8_t *max) {
160 int16x8_t clamped;
161 clamped = vminq_s16(vreinterpretq_s16_u16(*u), vreinterpretq_s16_u16(*max));
162 clamped = vmaxq_s16(clamped, vreinterpretq_s16_u16(*min));
163 return vreinterpretq_u16_s16(clamped);
164 }
165
166 static inline void round_shift_4x4(int32x4_t *in, int shift) {
167 if (shift != 0) {
168 const int32x4_t v_shift = vdupq_n_s32(-shift);
169 in[0] = vrshlq_s32(in[0], v_shift);
170 in[1] = vrshlq_s32(in[1], v_shift);
171 in[2] = vrshlq_s32(in[2], v_shift);
172 in[3] = vrshlq_s32(in[3], v_shift);
173 }
174 }
175
176 static void round_shift_8x8(int32x4_t *in, int shift) {
177 assert(shift != 0);
178 const int32x4_t v_shift = vdupq_n_s32(-shift);
179 in[0] = vrshlq_s32(in[0], v_shift);
180 in[1] = vrshlq_s32(in[1], v_shift);
181 in[2] = vrshlq_s32(in[2], v_shift);
182 in[3] = vrshlq_s32(in[3], v_shift);
183 in[4] = vrshlq_s32(in[4], v_shift);
184 in[5] = vrshlq_s32(in[5], v_shift);
185 in[6] = vrshlq_s32(in[6], v_shift);
186 in[7] = vrshlq_s32(in[7], v_shift);
187 in[8] = vrshlq_s32(in[8], v_shift);
188 in[9] = vrshlq_s32(in[9], v_shift);
189 in[10] = vrshlq_s32(in[10], v_shift);
190 in[11] = vrshlq_s32(in[11], v_shift);
191 in[12] = vrshlq_s32(in[12], v_shift);
192 in[13] = vrshlq_s32(in[13], v_shift);
193 in[14] = vrshlq_s32(in[14], v_shift);
194 in[15] = vrshlq_s32(in[15], v_shift);
195 }
196
197 static void highbd_clamp_s32_neon(int32x4_t *in, int32x4_t *out,
198 const int32x4_t *clamp_lo,
199 const int32x4_t *clamp_hi, int size) {
200 int32x4_t a0, a1;
201 for (int i = 0; i < size; i += 4) {
202 a0 = vmaxq_s32(in[i], *clamp_lo);
203 out[i] = vminq_s32(a0, *clamp_hi);
204
205 a1 = vmaxq_s32(in[i + 1], *clamp_lo);
206 out[i + 1] = vminq_s32(a1, *clamp_hi);
207
208 a0 = vmaxq_s32(in[i + 2], *clamp_lo);
209 out[i + 2] = vminq_s32(a0, *clamp_hi);
210
211 a1 = vmaxq_s32(in[i + 3], *clamp_lo);
212 out[i + 3] = vminq_s32(a1, *clamp_hi);
213 }
214 }
215
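// Adds two 4-lane residual vectors to an 8-lane prediction row and clamps
// the reconstructed samples to the valid pixel range [0, (1 << bd) - 1].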
216 static inline uint16x8_t highbd_get_recon_8x8_neon(const uint16x8_t pred,
217 int32x4_t res0,
218 int32x4_t res1,
219 const int bd) {
220 const uint16x8_t v_zero = vdupq_n_u16(0);
221 int32x4_t min_clip_val = vreinterpretq_s32_u16(v_zero);
222 int32x4_t max_clip_val = vdupq_n_s32((1 << bd) - 1);
223 uint16x8x2_t x;
224 x.val[0] = vreinterpretq_u16_s32(
225 vaddw_s16(res0, vreinterpret_s16_u16(vget_low_u16(pred))));
226 x.val[1] = vreinterpretq_u16_s32(
227 vaddw_s16(res1, vreinterpret_s16_u16(vget_high_u16(pred))));
228 x.val[0] = vreinterpretq_u16_s32(
229 vmaxq_s32(vreinterpretq_s32_u16(x.val[0]), min_clip_val));
230 x.val[0] = vreinterpretq_u16_s32(
231 vminq_s32(vreinterpretq_s32_u16(x.val[0]), max_clip_val));
232 x.val[1] = vreinterpretq_u16_s32(
233 vmaxq_s32(vreinterpretq_s32_u16(x.val[1]), min_clip_val));
234 x.val[1] = vreinterpretq_u16_s32(
235 vminq_s32(vreinterpretq_s32_u16(x.val[1]), max_clip_val));
236 uint16x8_t res = vcombine_u16(vqmovn_u32(vreinterpretq_u32_u16(x.val[0])),
237 vqmovn_u32(vreinterpretq_u32_u16(x.val[1])));
238 return res;
239 }
240
241 static inline uint16x4_t highbd_get_recon_4xn_neon(uint16x4_t pred,
242 int32x4_t res0,
243 const int bd) {
244 uint16x4_t x0_ = vreinterpret_u16_s16(
245 vmovn_s32(vaddw_s16(res0, vreinterpret_s16_u16(pred))));
246 uint16x8_t x0 = vcombine_u16(x0_, x0_);
247 const uint16x8_t vmin = vdupq_n_u16(0);
248 const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1);
249 x0 = highbd_clamp_u16(&x0, &vmin, &vmax);
250 return vget_low_u16(x0);
251 }
252
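// Reconstructs a 4-wide block of `height` rows: adds the residual vectors in
// `in` to the pixels already in `output` (optionally flipped vertically) and
// stores the clamped result back to `output`.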
253 static inline void highbd_write_buffer_4xn_neon(int32x4_t *in, uint16_t *output,
254 int stride, int flipud,
255 int height, const int bd) {
256 int j = flipud ? (height - 1) : 0;
257 const int step = flipud ? -1 : 1;
258 for (int i = 0; i < height; ++i, j += step) {
259 uint16x4_t v = vld1_u16(output + i * stride);
260 uint16x4_t u = highbd_get_recon_4xn_neon(v, in[j], bd);
261
262 vst1_u16(output + i * stride, u);
263 }
264 }
265
266 static inline void highbd_write_buffer_8xn_neon(int32x4_t *in, uint16_t *output,
267 int stride, int flipud,
268 int height, const int bd) {
269 int j = flipud ? (height - 1) : 0;
270 const int step = flipud ? -1 : 1;
271 for (int i = 0; i < height; ++i, j += step) {
272 uint16x8_t v = vld1q_u16(output + i * stride);
273 uint16x8_t u = highbd_get_recon_8x8_neon(v, in[j], in[j + height], bd);
274
275 vst1q_u16(output + i * stride, u);
276 }
277 }
278
279 static inline void load_buffer_32bit_input(const int32_t *in, int stride,
280 int32x4_t *out, int out_size) {
281 for (int i = 0; i < out_size; ++i) {
282 out[i] = vld1q_s32(in + i * stride);
283 }
284 }
285
286 static inline void load_buffer_4x4(const int32_t *coeff, int32x4_t *in) {
287 in[0] = vld1q_s32(coeff + 0);
288 in[1] = vld1q_s32(coeff + 4);
289 in[2] = vld1q_s32(coeff + 8);
290 in[3] = vld1q_s32(coeff + 12);
291 }
292
293 static void addsub_neon(const int32x4_t in0, const int32x4_t in1,
294 int32x4_t *out0, int32x4_t *out1,
295 const int32x4_t *clamp_lo, const int32x4_t *clamp_hi) {
296 int32x4_t a0 = vaddq_s32(in0, in1);
297 int32x4_t a1 = vsubq_s32(in0, in1);
298
299 a0 = vmaxq_s32(a0, *clamp_lo);
300 a0 = vminq_s32(a0, *clamp_hi);
301 a1 = vmaxq_s32(a1, *clamp_lo);
302 a1 = vminq_s32(a1, *clamp_hi);
303
304 *out0 = a0;
305 *out1 = a1;
306 }
307
308 static void shift_and_clamp_neon(int32x4_t *in0, int32x4_t *in1,
309 const int32x4_t *clamp_lo,
310 const int32x4_t *clamp_hi,
311 const int32x4_t *v_shift) {
312 int32x4_t in0_w_offset = vrshlq_s32(*in0, *v_shift);
313 int32x4_t in1_w_offset = vrshlq_s32(*in1, *v_shift);
314
315 in0_w_offset = vmaxq_s32(in0_w_offset, *clamp_lo);
316 in0_w_offset = vminq_s32(in0_w_offset, *clamp_hi);
317 in1_w_offset = vmaxq_s32(in1_w_offset, *clamp_lo);
318 in1_w_offset = vminq_s32(in1_w_offset, *clamp_hi);
319
320 *in0 = in0_w_offset;
321 *in1 = in1_w_offset;
322 }
323
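// Per-stage helpers for the 32-point inverse DCT. Each applies the
// butterflies and rotations of one stage to the 32-entry working buffer bf1,
// clamping intermediate values where the stage requires it.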
324 static inline void idct32_stage4_neon(int32x4_t *bf1, const int32_t *cospi,
325 const int32x4_t *v_bit,
326 const int32x4_t *rnding) {
327 int32x4_t temp1, temp2;
328 temp1 = half_btf_neon_mode10_r(&cospi[8], &bf1[17], &cospi[56], &bf1[30],
329 v_bit, rnding);
330 bf1[30] =
331 half_btf_neon_r(&cospi[56], &bf1[17], &cospi[8], &bf1[30], v_bit, rnding);
332 bf1[17] = temp1;
333
334 temp2 = half_btf_neon_mode11_r(&cospi[56], &bf1[18], &cospi[8], &bf1[29],
335 v_bit, rnding);
336 bf1[29] = half_btf_neon_mode10_r(&cospi[8], &bf1[18], &cospi[56], &bf1[29],
337 v_bit, rnding);
338 bf1[18] = temp2;
339
340 temp1 = half_btf_neon_mode10_r(&cospi[40], &bf1[21], &cospi[24], &bf1[26],
341 v_bit, rnding);
342 bf1[26] = half_btf_neon_r(&cospi[24], &bf1[21], &cospi[40], &bf1[26], v_bit,
343 rnding);
344 bf1[21] = temp1;
345
346 temp2 = half_btf_neon_mode11_r(&cospi[24], &bf1[22], &cospi[40], &bf1[25],
347 v_bit, rnding);
348 bf1[25] = half_btf_neon_mode10_r(&cospi[40], &bf1[22], &cospi[24], &bf1[25],
349 v_bit, rnding);
350 bf1[22] = temp2;
351 }
352
353 static inline void idct32_stage5_neon(int32x4_t *bf1, const int32_t *cospi,
354 const int32x4_t *clamp_lo,
355 const int32x4_t *clamp_hi,
356 const int32x4_t *v_bit,
357 const int32x4_t *rnding) {
358 int32x4_t temp1, temp2;
359 temp1 = half_btf_neon_mode10_r(&cospi[16], &bf1[9], &cospi[48], &bf1[14],
360 v_bit, rnding);
361 bf1[14] =
362 half_btf_neon_r(&cospi[48], &bf1[9], &cospi[16], &bf1[14], v_bit, rnding);
363 bf1[9] = temp1;
364
365 temp2 = half_btf_neon_mode11_r(&cospi[48], &bf1[10], &cospi[16], &bf1[13],
366 v_bit, rnding);
367 bf1[13] = half_btf_neon_mode10_r(&cospi[16], &bf1[10], &cospi[48], &bf1[13],
368 v_bit, rnding);
369 bf1[10] = temp2;
370
371 addsub_neon(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
372 addsub_neon(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
373 addsub_neon(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
374 addsub_neon(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
375 addsub_neon(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
376 addsub_neon(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
377 addsub_neon(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
378 addsub_neon(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
379 }
380
381 static inline void idct32_stage6_neon(int32x4_t *bf1, const int32_t *cospi,
382 const int32x4_t *clamp_lo,
383 const int32x4_t *clamp_hi,
384 const int32x4_t *v_bit,
385 const int32x4_t *rnding) {
386 int32x4_t temp1, temp2;
387 temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6],
388 v_bit, rnding);
389 bf1[6] =
390 half_btf_neon_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], v_bit, rnding);
391 bf1[5] = temp1;
392
393 addsub_neon(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
394 addsub_neon(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
395 addsub_neon(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
396 addsub_neon(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
397
398 temp1 = half_btf_neon_mode10_r(&cospi[16], &bf1[18], &cospi[48], &bf1[29],
399 v_bit, rnding);
400 bf1[29] = half_btf_neon_r(&cospi[48], &bf1[18], &cospi[16], &bf1[29], v_bit,
401 rnding);
402 bf1[18] = temp1;
403 temp2 = half_btf_neon_mode10_r(&cospi[16], &bf1[19], &cospi[48], &bf1[28],
404 v_bit, rnding);
405 bf1[28] = half_btf_neon_r(&cospi[48], &bf1[19], &cospi[16], &bf1[28], v_bit,
406 rnding);
407 bf1[19] = temp2;
408 temp1 = half_btf_neon_mode11_r(&cospi[48], &bf1[20], &cospi[16], &bf1[27],
409 v_bit, rnding);
410 bf1[27] = half_btf_neon_mode10_r(&cospi[16], &bf1[20], &cospi[48], &bf1[27],
411 v_bit, rnding);
412 bf1[20] = temp1;
413 temp2 = half_btf_neon_mode11_r(&cospi[48], &bf1[21], &cospi[16], &bf1[26],
414 v_bit, rnding);
415 bf1[26] = half_btf_neon_mode10_r(&cospi[16], &bf1[21], &cospi[48], &bf1[26],
416 v_bit, rnding);
417 bf1[21] = temp2;
418 }
419
420 static inline void idct32_stage7_neon(int32x4_t *bf1, const int32_t *cospi,
421 const int32x4_t *clamp_lo,
422 const int32x4_t *clamp_hi,
423 const int32x4_t *v_bit,
424 const int32x4_t *rnding) {
425 int32x4_t temp1, temp2;
426 addsub_neon(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
427 addsub_neon(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
428 addsub_neon(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
429 addsub_neon(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
430 temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[10], &cospi[32], &bf1[13],
431 v_bit, rnding);
432 bf1[13] = half_btf_neon_r(&cospi[32], &bf1[10], &cospi[32], &bf1[13], v_bit,
433 rnding);
434 bf1[10] = temp1;
435 temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[11], &cospi[32], &bf1[12],
436 v_bit, rnding);
437 bf1[12] = half_btf_neon_r(&cospi[32], &bf1[11], &cospi[32], &bf1[12], v_bit,
438 rnding);
439 bf1[11] = temp2;
440
441 addsub_neon(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
442 addsub_neon(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
443 addsub_neon(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
444 addsub_neon(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
445 addsub_neon(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
446 addsub_neon(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
447 addsub_neon(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
448 addsub_neon(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
449 }
450
451 static inline void idct32_stage8_neon(int32x4_t *bf1, const int32_t *cospi,
452 const int32x4_t *clamp_lo,
453 const int32x4_t *clamp_hi,
454 const int32x4_t *v_bit,
455 const int32x4_t *rnding) {
456 int32x4_t temp1, temp2;
457 addsub_neon(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
458 addsub_neon(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
459 addsub_neon(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
460 addsub_neon(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
461 addsub_neon(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
462 addsub_neon(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
463 addsub_neon(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
464 addsub_neon(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
465 temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27],
466 v_bit, rnding);
467 bf1[27] = half_btf_neon_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27], v_bit,
468 rnding);
469 bf1[20] = temp1;
470 temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26],
471 v_bit, rnding);
472 bf1[26] = half_btf_neon_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26], v_bit,
473 rnding);
474 bf1[21] = temp2;
475 temp1 = half_btf_neon_mode10_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25],
476 v_bit, rnding);
477 bf1[25] = half_btf_neon_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25], v_bit,
478 rnding);
479 bf1[22] = temp1;
480 temp2 = half_btf_neon_mode10_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24],
481 v_bit, rnding);
482 bf1[24] = half_btf_neon_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24], v_bit,
483 rnding);
484 bf1[23] = temp2;
485 }
486
487 static inline void idct32_stage9_neon(int32x4_t *bf1, int32x4_t *out,
488 const int do_cols, const int bd,
489 const int out_shift,
490 const int32x4_t *clamp_lo,
491 const int32x4_t *clamp_hi) {
492 addsub_neon(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi);
493 addsub_neon(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi);
494 addsub_neon(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi);
495 addsub_neon(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi);
496 addsub_neon(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi);
497 addsub_neon(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi);
498 addsub_neon(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi);
499 addsub_neon(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi);
500 addsub_neon(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi);
501 addsub_neon(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi);
502 addsub_neon(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi);
503 addsub_neon(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi);
504 addsub_neon(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi);
505 addsub_neon(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi);
506 addsub_neon(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi);
507 addsub_neon(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi);
508
509 if (!do_cols) {
510 const int log_range_out = AOMMAX(16, bd + 6);
511 const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
512 const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
513 for (int i = 0; i < 32; i += 8) {
514 round_shift_4x4(out + i, out_shift);
515 round_shift_4x4(out + i + 4, out_shift);
516 }
517 highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32);
518 }
519 }
520
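// Output rounding for negated iadst results: produces
// clamp((*offset + *in0) >> shift) and clamp((*offset - *in1) >> shift),
// where *v_shift holds -shift and *offset holds the rounding constant.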
521 static void neg_shift_neon(const int32x4_t *in0, const int32x4_t *in1,
522 int32x4_t *out0, int32x4_t *out1,
523 const int32x4_t *clamp_lo, const int32x4_t *clamp_hi,
524 const int32x4_t *v_shift, int32x4_t *offset) {
525 int32x4_t a0 = vaddq_s32(*offset, *in0);
526 int32x4_t a1 = vsubq_s32(*offset, *in1);
527
528 a0 = vshlq_s32(a0, *v_shift);
529 a1 = vshlq_s32(a1, *v_shift);
530
531 a0 = vmaxq_s32(a0, *clamp_lo);
532 a0 = vminq_s32(a0, *clamp_hi);
533 a1 = vmaxq_s32(a1, *clamp_lo);
534 a1 = vminq_s32(a1, *clamp_hi);
535
536 *out0 = a0;
537 *out1 = a1;
538 }
539
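// 4-point inverse DCT applied to four column vectors (one 4x4 tile), with
// intermediate results clamped to the bit range derived from bd.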
540 static void idct4x4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
541 int bd, int out_shift) {
542 const int32_t *cospi = cospi_arr(bit);
543 int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
544 int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
545 int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
546 int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
547
548 int32x4_t u0, u1, u2, u3;
549 int32x4_t v0, v1, v2, v3, x, y;
550
551 // Stage 0-1-2
552
553 u0 = in[0];
554 u1 = in[1];
555 u2 = in[2];
556 u3 = in[3];
557
558 const int32x4_t v_bit = vdupq_n_s32(-bit);
559
560 x = vmlaq_n_s32(rnding, u0, cospi[32]);
561 y = vmulq_n_s32(u2, cospi[32]);
562 v0 = vaddq_s32(x, y);
563 v0 = vshlq_s32(v0, v_bit);
564
565 v1 = vsubq_s32(x, y);
566 v1 = vshlq_s32(v1, v_bit);
567
568 x = vmlaq_n_s32(rnding, u1, cospi[48]);
569 v2 = vmlsq_n_s32(x, u3, cospi[16]);
570 v2 = vshlq_s32(v2, v_bit);
571
572 x = vmlaq_n_s32(rnding, u1, cospi[16]);
573 v3 = vmlaq_n_s32(x, u3, cospi[48]);
574 v3 = vshlq_s32(v3, v_bit);
575 // Stage 3
576 addsub_neon(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi);
577 addsub_neon(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi);
578
579 if (!do_cols) {
580 log_range = AOMMAX(16, bd + 6);
581 clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
582 clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
583 const int32x4_t v_shift = vdupq_n_s32(-out_shift);
584 shift_and_clamp_neon(out + 0, out + 3, &clamp_lo, &clamp_hi, &v_shift);
585 shift_and_clamp_neon(out + 1, out + 2, &clamp_lo, &clamp_hi, &v_shift);
586 }
587 }
588
589 static void iadst4x4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
590 int bd, int out_shift) {
591 const int32_t *sinpi = sinpi_arr(bit);
592 const int32x4_t zero = vdupq_n_s32(0);
593 int64x2_t rnding = vdupq_n_s64(1ll << (bit + 4 - 1));
594 const int32x2_t mul = vdup_n_s32(1 << 4);
595 int32x4_t t;
596 int32x4_t s0, s1, s2, s3, s4, s5, s6, s7;
597 int32x4_t x0, x1, x2, x3;
598 int32x4_t u0, u1, u2, u3;
599
600 x0 = in[0];
601 x1 = in[1];
602 x2 = in[2];
603 x3 = in[3];
604
605 s0 = vmulq_n_s32(x0, sinpi[1]);
606 s1 = vmulq_n_s32(x0, sinpi[2]);
607 s2 = vmulq_n_s32(x1, sinpi[3]);
608 s3 = vmulq_n_s32(x2, sinpi[4]);
609 s4 = vmulq_n_s32(x2, sinpi[1]);
610 s5 = vmulq_n_s32(x3, sinpi[2]);
611 s6 = vmulq_n_s32(x3, sinpi[4]);
612 t = vsubq_s32(x0, x2);
613 s7 = vaddq_s32(t, x3);
614
615 t = vaddq_s32(s0, s3);
616 s0 = vaddq_s32(t, s5);
617 t = vsubq_s32(s1, s4);
618 s1 = vsubq_s32(t, s6);
619 s3 = s2;
620 s2 = vmulq_n_s32(s7, sinpi[3]);
621
622 u0 = vaddq_s32(s0, s3);
623 u1 = vaddq_s32(s1, s3);
624 u2 = s2;
625 t = vaddq_s32(s0, s1);
626 u3 = vsubq_s32(t, s3);
627
628 // u0
629 int32x4x2_t u0x;
630 u0x.val[0] = vreinterpretq_s32_s64(
631 vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u0)), mul));
632 u0x.val[0] = vreinterpretq_s32_s64(
633 vaddq_s64(vreinterpretq_s64_s32(u0x.val[0]), rnding));
634
635 u0 = vextq_s32(u0, zero, 1);
636 u0x.val[1] = vreinterpretq_s32_s64(
637 vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u0)), mul));
638 u0x.val[1] = vreinterpretq_s32_s64(
639 vaddq_s64(vreinterpretq_s64_s32(u0x.val[1]), rnding));
640
641 u0x.val[0] = vreinterpretq_s32_s16(vextq_s16(
642 vreinterpretq_s16_s32(u0x.val[0]), vreinterpretq_s16_s32(zero), 1));
643 u0x.val[1] = vreinterpretq_s32_s16(vextq_s16(
644 vreinterpretq_s16_s32(u0x.val[1]), vreinterpretq_s16_s32(zero), 1));
645
646 u0x = vzipq_s32(u0x.val[0], u0x.val[1]);
647 #if AOM_ARCH_AARCH64
648 u0 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u0x.val[0]),
649 vreinterpretq_s64_s32(u0x.val[1])));
650 #else
651 u0 = vcombine_s32(vget_low_s32(u0x.val[0]), vget_low_s32(u0x.val[1]));
652 #endif // AOM_ARCH_AARCH64
653 // u1
654 int32x4x2_t u1x;
655 u1x.val[0] = vreinterpretq_s32_s64(
656 vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u1)), mul));
657 u1x.val[0] = vreinterpretq_s32_s64(
658 vaddq_s64(vreinterpretq_s64_s32(u1x.val[0]), rnding));
659
660 u1 = vextq_s32(u1, zero, 1);
661 u1x.val[1] = vreinterpretq_s32_s64(
662 vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u1)), mul));
663 u1x.val[1] = vreinterpretq_s32_s64(
664 vaddq_s64(vreinterpretq_s64_s32(u1x.val[1]), rnding));
665
666 u1x.val[0] = vreinterpretq_s32_s16(vextq_s16(
667 vreinterpretq_s16_s32(u1x.val[0]), vreinterpretq_s16_s32(zero), 1));
668 u1x.val[1] = vreinterpretq_s32_s16(vextq_s16(
669 vreinterpretq_s16_s32(u1x.val[1]), vreinterpretq_s16_s32(zero), 1));
670
671 u1x = vzipq_s32(u1x.val[0], u1x.val[1]);
672 #if AOM_ARCH_AARCH64
673 u1 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u1x.val[0]),
674 vreinterpretq_s64_s32(u1x.val[1])));
675 #else
676 u1 = vcombine_s32(vget_low_s32(u1x.val[0]), vget_low_s32(u1x.val[1]));
677 #endif // AOM_ARCH_AARCH64
678
679 // u2
680 int32x4x2_t u2x;
681 u2x.val[0] = vreinterpretq_s32_s64(
682 vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u2)), mul));
683 u2x.val[0] = vreinterpretq_s32_s64(
684 vaddq_s64(vreinterpretq_s64_s32(u2x.val[0]), rnding));
685
686 u2 = vextq_s32(u2, zero, 1);
687 u2x.val[1] = vreinterpretq_s32_s64(
688 vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u2)), mul));
689 u2x.val[1] = vreinterpretq_s32_s64(
690 vaddq_s64(vreinterpretq_s64_s32(u2x.val[1]), rnding));
691
692 u2x.val[0] = vreinterpretq_s32_s16(vextq_s16(
693 vreinterpretq_s16_s32(u2x.val[0]), vreinterpretq_s16_s32(zero), 1));
694 u2x.val[1] = vreinterpretq_s32_s16(vextq_s16(
695 vreinterpretq_s16_s32(u2x.val[1]), vreinterpretq_s16_s32(zero), 1));
696
697 u2x = vzipq_s32(u2x.val[0], u2x.val[1]);
698 #if AOM_ARCH_AARCH64
699 u2 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u2x.val[0]),
700 vreinterpretq_s64_s32(u2x.val[1])));
701 #else
702 u2 = vcombine_s32(vget_low_s32(u2x.val[0]), vget_low_s32(u2x.val[1]));
703 #endif // AOM_ARCH_AARCH64
704
705 // u3
706 int32x4x2_t u3x;
707 u3x.val[0] = vreinterpretq_s32_s64(
708 vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u3)), mul));
709 u3x.val[0] = vreinterpretq_s32_s64(
710 vaddq_s64(vreinterpretq_s64_s32(u3x.val[0]), rnding));
711
712 u3 = vextq_s32(u3, zero, 1);
713 u3x.val[1] = vreinterpretq_s32_s64(
714 vmull_s32(vmovn_s64(vreinterpretq_s64_s32(u3)), mul));
715 u3x.val[1] = vreinterpretq_s32_s64(
716 vaddq_s64(vreinterpretq_s64_s32(u3x.val[1]), rnding));
717
718 u3x.val[0] = vreinterpretq_s32_s16(vextq_s16(
719 vreinterpretq_s16_s32(u3x.val[0]), vreinterpretq_s16_s32(zero), 1));
720 u3x.val[1] = vreinterpretq_s32_s16(vextq_s16(
721 vreinterpretq_s16_s32(u3x.val[1]), vreinterpretq_s16_s32(zero), 1));
722
723 u3x = vzipq_s32(u3x.val[0], u3x.val[1]);
724 #if AOM_ARCH_AARCH64
725 u3 = vreinterpretq_s32_s64(vzip1q_s64(vreinterpretq_s64_s32(u3x.val[0]),
726 vreinterpretq_s64_s32(u3x.val[1])));
727 #else
728 u3 = vcombine_s32(vget_low_s32(u3x.val[0]), vget_low_s32(u3x.val[1]));
729 #endif // AOM_ARCH_AARCH64
730
731 out[0] = u0;
732 out[1] = u1;
733 out[2] = u2;
734 out[3] = u3;
735
736 if (!do_cols) {
737 const int log_range = AOMMAX(16, bd + 6);
738 const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
739 const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
740 round_shift_4x4(out, out_shift);
741 highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 4);
742 }
743 }
744
745 static void write_buffer_4x4(int32x4_t *in, uint16_t *output, int stride,
746 int fliplr, int flipud, int shift, int bd) {
747 uint32x4_t u0, u1, u2, u3;
748 uint16x4_t v0, v1, v2, v3;
749 round_shift_4x4(in, shift);
750
751 v0 = vld1_u16(output + 0 * stride);
752 v1 = vld1_u16(output + 1 * stride);
753 v2 = vld1_u16(output + 2 * stride);
754 v3 = vld1_u16(output + 3 * stride);
755
756 if (fliplr) {
757 u0 = vrev64q_u32(vreinterpretq_u32_s32(in[0]));
758 in[0] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2));
759 u0 = vrev64q_u32(vreinterpretq_u32_s32(in[1]));
760 in[1] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2));
761 u0 = vrev64q_u32(vreinterpretq_u32_s32(in[2]));
762 in[2] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2));
763 u0 = vrev64q_u32(vreinterpretq_u32_s32(in[3]));
764 in[3] = vreinterpretq_s32_u32(vextq_u32(u0, u0, 2));
765 }
766
767 if (flipud) {
768 u0 = vaddw_u16(vreinterpretq_u32_s32(in[3]), v0);
769 u1 = vaddw_u16(vreinterpretq_u32_s32(in[2]), v1);
770 u2 = vaddw_u16(vreinterpretq_u32_s32(in[1]), v2);
771 u3 = vaddw_u16(vreinterpretq_u32_s32(in[0]), v3);
772 } else {
773 u0 = vaddw_u16(vreinterpretq_u32_s32(in[0]), v0);
774 u1 = vaddw_u16(vreinterpretq_u32_s32(in[1]), v1);
775 u2 = vaddw_u16(vreinterpretq_u32_s32(in[2]), v2);
776 u3 = vaddw_u16(vreinterpretq_u32_s32(in[3]), v3);
777 }
778
779 uint16x8_t u4 = vcombine_u16(vqmovn_u32(u0), vqmovn_u32(u1));
780 uint16x8_t u5 = vcombine_u16(vqmovn_u32(u2), vqmovn_u32(u3));
781 const uint16x8_t vmin = vdupq_n_u16(0);
782 const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1);
783 u4 = highbd_clamp_u16(&u4, &vmin, &vmax);
784 u5 = highbd_clamp_u16(&u5, &vmin, &vmax);
785
786 vst1_u16(output + 0 * stride, vget_low_u16(u4));
787 vst1_u16(output + 1 * stride, vget_high_u16(u4));
788 vst1_u16(output + 2 * stride, vget_low_u16(u5));
789 vst1_u16(output + 3 * stride, vget_high_u16(u5));
790 }
791
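// 4-point identity transform: scales every coefficient by NewSqrt2
// (approximately sqrt(2) in Q(NewSqrt2Bits) fixed point), using widening
// 64-bit multiplies to avoid overflow before rounding back down.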
792 static void iidentity4_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
793 int bd, int out_shift) {
794 (void)bit;
795 int32x4_t zero = vdupq_n_s32(0);
796 int32x2_t fact = vdup_n_s32(NewSqrt2);
797 int32x4x2_t a0;
798 const int64x2_t rnding = vdupq_n_s64(1 << (NewSqrt2Bits - 1));
799
800 for (int i = 0; i < 4; i++) {
801 a0.val[0] = vreinterpretq_s32_s64(
802 vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(in[i])), fact));
803 a0.val[0] = vreinterpretq_s32_s64(
804 vshrq_n_s64(vreinterpretq_s64_s32(a0.val[0]), NewSqrt2Bits));
805 a0.val[1] = vextq_s32(in[i], zero, 1);
806 a0.val[1] = vreinterpretq_s32_s64(
807 vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(a0.val[1])), fact));
808 a0.val[1] = vreinterpretq_s32_s64(
809 vshrq_n_s64(vreinterpretq_s64_s32(a0.val[1]), NewSqrt2Bits));
810
811 a0 = vzipq_s32(a0.val[0], a0.val[1]);
812 #if AOM_ARCH_AARCH64
813 out[i] = vreinterpretq_s32_s64(vzip1q_s64(
814 vreinterpretq_s64_s32(a0.val[0]), vreinterpretq_s64_s32(a0.val[1])));
815 #else
816 out[i] = vextq_s32(vextq_s32(a0.val[0], a0.val[0], 2), a0.val[1], 2);
817 #endif
818 }
819 if (!do_cols) {
820 const int log_range = AOMMAX(16, bd + 6);
821 const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
822 const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
823 round_shift_4x4(out, out_shift);
824 highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 4);
825 }
826 }
827
828 void av1_inv_txfm2d_add_4x4_neon(const int32_t *input, uint16_t *output,
829 int stride, TX_TYPE tx_type, int bd) {
830 int32x4_t in[4];
831
832 const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4];
833
834 switch (tx_type) {
835 case DCT_DCT:
836 load_buffer_4x4(input, in);
837 idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
838 transpose_4x4(in, in);
839 idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
840 write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
841 break;
842 case ADST_DCT:
843 load_buffer_4x4(input, in);
844 idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
845 transpose_4x4(in, in);
846 iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
847 write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
848 break;
849 case DCT_ADST:
850 load_buffer_4x4(input, in);
851 iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
852 transpose_4x4(in, in);
853 idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
854 write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
855 break;
856 case ADST_ADST:
857 load_buffer_4x4(input, in);
858 iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
859 transpose_4x4(in, in);
860 iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
861 write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
862 break;
863 case FLIPADST_DCT:
864 load_buffer_4x4(input, in);
865 idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
866 transpose_4x4(in, in);
867 iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
868 write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
869 break;
870 case DCT_FLIPADST:
871 load_buffer_4x4(input, in);
872 iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
873 transpose_4x4(in, in);
874 idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
875 write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
876 break;
877 case FLIPADST_FLIPADST:
878 load_buffer_4x4(input, in);
879 iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
880 transpose_4x4(in, in);
881 iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
882 write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
883 break;
884 case ADST_FLIPADST:
885 load_buffer_4x4(input, in);
886 iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
887 transpose_4x4(in, in);
888 iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
889 write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
890 break;
891 case FLIPADST_ADST:
892 load_buffer_4x4(input, in);
893 iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
894 transpose_4x4(in, in);
895 iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
896 write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
897 break;
898 case IDTX:
899 load_buffer_4x4(input, in);
900 iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0);
901 transpose_4x4(in, in);
902 iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0);
903 write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
904 break;
905 case V_DCT:
906 load_buffer_4x4(input, in);
907 iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0);
908 transpose_4x4(in, in);
909 idct4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
910 write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
911 break;
912 case H_DCT:
913 load_buffer_4x4(input, in);
914 idct4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
915 transpose_4x4(in, in);
916 iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0);
917 write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
918 break;
919 case V_ADST:
920 load_buffer_4x4(input, in);
921 iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0);
922 transpose_4x4(in, in);
923 iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
924 write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
925 break;
926 case H_ADST:
927 load_buffer_4x4(input, in);
928 iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
929 transpose_4x4(in, in);
930 iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0);
931 write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
932 break;
933 case V_FLIPADST:
934 load_buffer_4x4(input, in);
935 iidentity4_neon(in, in, INV_COS_BIT, 0, bd, 0);
936 transpose_4x4(in, in);
937 iadst4x4_neon(in, in, INV_COS_BIT, 1, bd, 0);
938 write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
939 break;
940 case H_FLIPADST:
941 load_buffer_4x4(input, in);
942 iadst4x4_neon(in, in, INV_COS_BIT, 0, bd, 0);
943 transpose_4x4(in, in);
944 iidentity4_neon(in, in, INV_COS_BIT, 1, bd, 0);
945 write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
946 break;
947 default: assert(0);
948 }
949 }
950
951 // 8x8
952 static void load_buffer_8x8(const int32_t *coeff, int32x4_t *in) {
953 in[0] = vld1q_s32(coeff + 0);
954 in[1] = vld1q_s32(coeff + 4);
955 in[2] = vld1q_s32(coeff + 8);
956 in[3] = vld1q_s32(coeff + 12);
957 in[4] = vld1q_s32(coeff + 16);
958 in[5] = vld1q_s32(coeff + 20);
959 in[6] = vld1q_s32(coeff + 24);
960 in[7] = vld1q_s32(coeff + 28);
961 in[8] = vld1q_s32(coeff + 32);
962 in[9] = vld1q_s32(coeff + 36);
963 in[10] = vld1q_s32(coeff + 40);
964 in[11] = vld1q_s32(coeff + 44);
965 in[12] = vld1q_s32(coeff + 48);
966 in[13] = vld1q_s32(coeff + 52);
967 in[14] = vld1q_s32(coeff + 56);
968 in[15] = vld1q_s32(coeff + 60);
969 }
970
971 static void idct8x8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
972 int bd, int out_shift) {
973 const int32_t *cospi = cospi_arr(bit);
974 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
975 const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
976 const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
977 int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
978 int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;
979 int32x4_t x, y;
980 int col;
981 const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
982 const int32x4_t v_bit = vdupq_n_s32(-bit);
983 // Note:
984 // Even indices (0, 2, ..., 14) hold the left half of each row and odd
985 // indices (1, 3, ..., 15) hold the right half, so in[2 * r + col] is the
986 // 4 coefficients of row r, half `col`. One even vector plus one odd vector
987 // makes one full row of 8 coefficients; there are 8 rows in total (8x8).
988 for (col = 0; col < 2; ++col) {
989 // stage 0
990 // stage 1
991 // stage 2
992 u0 = in[0 * 2 + col];
993 u1 = in[4 * 2 + col];
994 u2 = in[2 * 2 + col];
995 u3 = in[6 * 2 + col];
996
997 x = vmulq_n_s32(in[1 * 2 + col], cospi[56]);
998 u4 = vmlaq_n_s32(x, in[7 * 2 + col], -cospi[8]);
999 u4 = vaddq_s32(u4, rnding);
1000 u4 = vshlq_s32(u4, v_bit);
1001
1002 x = vmulq_n_s32(in[1 * 2 + col], cospi[8]);
1003 u7 = vmlaq_n_s32(x, in[7 * 2 + col], cospi[56]);
1004 u7 = vaddq_s32(u7, rnding);
1005 u7 = vshlq_s32(u7, v_bit);
1006
1007 x = vmulq_n_s32(in[5 * 2 + col], cospi[24]);
1008 u5 = vmlaq_n_s32(x, in[3 * 2 + col], -cospi[40]);
1009 u5 = vaddq_s32(u5, rnding);
1010 u5 = vshlq_s32(u5, v_bit);
1011
1012 x = vmulq_n_s32(in[5 * 2 + col], cospi[40]);
1013 u6 = vmlaq_n_s32(x, in[3 * 2 + col], cospi[24]);
1014 u6 = vaddq_s32(u6, rnding);
1015 u6 = vshlq_s32(u6, v_bit);
1016
1017 // stage 3
1018 x = vmulq_n_s32(u0, cospi[32]);
1019 y = vmulq_n_s32(u1, cospi[32]);
1020 v0 = vaddq_s32(x, y);
1021 v0 = vaddq_s32(v0, rnding);
1022 v0 = vshlq_s32(v0, v_bit);
1023
1024 v1 = vsubq_s32(x, y);
1025 v1 = vaddq_s32(v1, rnding);
1026 v1 = vshlq_s32(v1, v_bit);
1027
1028 x = vmulq_n_s32(u2, cospi[48]);
1029 v2 = vmlaq_n_s32(x, u3, -cospi[16]);
1030 v2 = vaddq_s32(v2, rnding);
1031 v2 = vshlq_s32(v2, v_bit);
1032
1033 x = vmulq_n_s32(u2, cospi[16]);
1034 v3 = vmlaq_n_s32(x, u3, cospi[48]);
1035 v3 = vaddq_s32(v3, rnding);
1036 v3 = vshlq_s32(v3, v_bit);
1037
1038 addsub_neon(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
1039 addsub_neon(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
1040
1041 // stage 4
1042 addsub_neon(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
1043 addsub_neon(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
1044 u4 = v4;
1045 u7 = v7;
1046
1047 x = vmulq_n_s32(v5, cospi[32]);
1048 y = vmulq_n_s32(v6, cospi[32]);
1049 u6 = vaddq_s32(y, x);
1050 u6 = vaddq_s32(u6, rnding);
1051 u6 = vshlq_s32(u6, v_bit);
1052
1053 u5 = vsubq_s32(y, x);
1054 u5 = vaddq_s32(u5, rnding);
1055 u5 = vshlq_s32(u5, v_bit);
1056
1057 // stage 5
1058 addsub_neon(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, &clamp_lo,
1059 &clamp_hi);
1060 addsub_neon(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, &clamp_lo,
1061 &clamp_hi);
1062 addsub_neon(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, &clamp_lo,
1063 &clamp_hi);
1064 addsub_neon(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, &clamp_lo,
1065 &clamp_hi);
1066 }
1067
1068 if (!do_cols) {
1069 const int log_range_out = AOMMAX(16, bd + 6);
1070 const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
1071 const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
1072 round_shift_8x8(out, out_shift);
1073 highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16);
1074 }
1075 }
1076
1077 static void iadst8x8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
1078 int bd, int out_shift) {
1079 const int32_t *cospi = cospi_arr(bit);
1080 const int32x4_t kZero = vdupq_n_s32(0);
1081 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1082 const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
1083 const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
1084 int32x4_t u[8], v[8], x;
1085 const int32x4_t v_bit = vdupq_n_s32(-bit);
1086 const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
1087 // stage 0-1-2
1088 // (1)
1089 u[0] = vmlaq_n_s32(rnding, in[14], cospi[4]);
1090 u[0] = vmlaq_n_s32(u[0], in[0], cospi[60]);
1091 u[0] = vshlq_s32(u[0], v_bit);
1092
1093 u[1] = vmlaq_n_s32(rnding, in[14], cospi[60]);
1094 u[1] = vmlsq_n_s32(u[1], in[0], cospi[4]);
1095 u[1] = vshlq_s32(u[1], v_bit);
1096
1097 // (2)
1098 u[2] = vmlaq_n_s32(rnding, in[10], cospi[20]);
1099 u[2] = vmlaq_n_s32(u[2], in[4], cospi[44]);
1100 u[2] = vshlq_s32(u[2], v_bit);
1101
1102 u[3] = vmlaq_n_s32(rnding, in[10], cospi[44]);
1103 u[3] = vmlsq_n_s32(u[3], in[4], cospi[20]);
1104 u[3] = vshlq_s32(u[3], v_bit);
1105
1106 // (3)
1107 u[4] = vmlaq_n_s32(rnding, in[6], cospi[36]);
1108 u[4] = vmlaq_n_s32(u[4], in[8], cospi[28]);
1109 u[4] = vshlq_s32(u[4], v_bit);
1110
1111 u[5] = vmlaq_n_s32(rnding, in[6], cospi[28]);
1112 u[5] = vmlsq_n_s32(u[5], in[8], cospi[36]);
1113 u[5] = vshlq_s32(u[5], v_bit);
1114
1115 // (4)
1116 u[6] = vmlaq_n_s32(rnding, in[2], cospi[52]);
1117 u[6] = vmlaq_n_s32(u[6], in[12], cospi[12]);
1118 u[6] = vshlq_s32(u[6], v_bit);
1119
1120 u[7] = vmlaq_n_s32(rnding, in[2], cospi[12]);
1121 u[7] = vmlsq_n_s32(u[7], in[12], cospi[52]);
1122 u[7] = vshlq_s32(u[7], v_bit);
1123
1124 // stage 3
1125 addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
1126 addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
1127 addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
1128 addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
1129
1130 // stage 4
1131 u[0] = v[0];
1132 u[1] = v[1];
1133 u[2] = v[2];
1134 u[3] = v[3];
1135
1136 u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]);
1137 u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]);
1138 u[4] = vshlq_s32(u[4], v_bit);
1139
1140 u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]);
1141 u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]);
1142 u[5] = vshlq_s32(u[5], v_bit);
1143
1144 u[6] = vmlaq_n_s32(rnding, v[7], cospi[16]);
1145 u[6] = vmlsq_n_s32(u[6], v[6], cospi[48]);
1146 u[6] = vshlq_s32(u[6], v_bit);
1147
1148 u[7] = vmlaq_n_s32(rnding, v[7], cospi[48]);
1149 u[7] = vmlaq_n_s32(u[7], v[6], cospi[16]);
1150 u[7] = vshlq_s32(u[7], v_bit);
1151
1152 // stage 5
1153 addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
1154 addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
1155 addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
1156 addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
1157
1158 // stage 6
1159 u[0] = v[0];
1160 u[1] = v[1];
1161 u[4] = v[4];
1162 u[5] = v[5];
1163
1164 v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]);
1165 x = vmulq_n_s32(v[3], cospi[32]);
1166 u[2] = vaddq_s32(v[0], x);
1167 u[2] = vshlq_s32(u[2], v_bit);
1168
1169 u[3] = vsubq_s32(v[0], x);
1170 u[3] = vshlq_s32(u[3], v_bit);
1171
1172 v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]);
1173 x = vmulq_n_s32(v[7], cospi[32]);
1174 u[6] = vaddq_s32(v[0], x);
1175 u[6] = vshlq_s32(u[6], v_bit);
1176
1177 u[7] = vsubq_s32(v[0], x);
1178 u[7] = vshlq_s32(u[7], v_bit);
1179
1180 // stage 7
1181 if (do_cols) {
1182 out[0] = u[0];
1183 out[2] = vsubq_s32(kZero, u[4]);
1184 out[4] = u[6];
1185 out[6] = vsubq_s32(kZero, u[2]);
1186 out[8] = u[3];
1187 out[10] = vsubq_s32(kZero, u[7]);
1188 out[12] = u[5];
1189 out[14] = vsubq_s32(kZero, u[1]);
1190 } else {
1191 const int log_range_out = AOMMAX(16, bd + 6);
1192 const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
1193 const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
1194 const int32x4_t v_shift = vdupq_n_s32(-out_shift);
1195 int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
1196 neg_shift_neon(&u[0], &u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out,
1197 &v_shift, &offset);
1198 neg_shift_neon(&u[6], &u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out,
1199 &v_shift, &offset);
1200 neg_shift_neon(&u[3], &u[7], out + 8, out + 10, &clamp_lo_out,
1201 &clamp_hi_out, &v_shift, &offset);
1202 neg_shift_neon(&u[5], &u[1], out + 12, out + 14, &clamp_lo_out,
1203 &clamp_hi_out, &v_shift, &offset);
1204 }
1205
1206 // Odd 8 points: 1, 3, ..., 15
1207 // stage 0
1208 // stage 1
1209 // stage 2
1210 // (1)
1211 u[0] = vmlaq_n_s32(rnding, in[15], cospi[4]);
1212 u[0] = vmlaq_n_s32(u[0], in[1], cospi[60]);
1213 u[0] = vshlq_s32(u[0], v_bit);
1214
1215 u[1] = vmlaq_n_s32(rnding, in[15], cospi[60]);
1216 u[1] = vmlsq_n_s32(u[1], in[1], cospi[4]);
1217 u[1] = vshlq_s32(u[1], v_bit);
1218
1219 // (2)
1220 u[2] = vmlaq_n_s32(rnding, in[11], cospi[20]);
1221 u[2] = vmlaq_n_s32(u[2], in[5], cospi[44]);
1222 u[2] = vshlq_s32(u[2], v_bit);
1223
1224 u[3] = vmlaq_n_s32(rnding, in[11], cospi[44]);
1225 u[3] = vmlsq_n_s32(u[3], in[5], cospi[20]);
1226 u[3] = vshlq_s32(u[3], v_bit);
1227
1228 // (3)
1229 u[4] = vmlaq_n_s32(rnding, in[7], cospi[36]);
1230 u[4] = vmlaq_n_s32(u[4], in[9], cospi[28]);
1231 u[4] = vshlq_s32(u[4], v_bit);
1232
1233 u[5] = vmlaq_n_s32(rnding, in[7], cospi[28]);
1234 u[5] = vmlsq_n_s32(u[5], in[9], cospi[36]);
1235 u[5] = vshlq_s32(u[5], v_bit);
1236
1237 // (4)
1238 u[6] = vmlaq_n_s32(rnding, in[3], cospi[52]);
1239 u[6] = vmlaq_n_s32(u[6], in[13], cospi[12]);
1240 u[6] = vshlq_s32(u[6], v_bit);
1241
1242 u[7] = vmlaq_n_s32(rnding, in[3], cospi[12]);
1243 u[7] = vmlsq_n_s32(u[7], in[13], cospi[52]);
1244 u[7] = vshlq_s32(u[7], v_bit);
1245
1246 // stage 3
1247 addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
1248 addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
1249 addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
1250 addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
1251
1252 // stage 4
1253 u[0] = v[0];
1254 u[1] = v[1];
1255 u[2] = v[2];
1256 u[3] = v[3];
1257
1258 u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]);
1259 u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]);
1260 u[4] = vshlq_s32(u[4], v_bit);
1261
1262 u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]);
1263 u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]);
1264 u[5] = vshlq_s32(u[5], v_bit);
1265
1266 u[6] = vmlaq_n_s32(rnding, v[7], cospi[16]);
1267 u[6] = vmlsq_n_s32(u[6], v[6], cospi[48]);
1268 u[6] = vshlq_s32(u[6], v_bit);
1269
1270 u[7] = vmlaq_n_s32(rnding, v[6], cospi[16]);
1271 u[7] = vmlaq_n_s32(u[7], v[7], cospi[48]);
1272 u[7] = vshlq_s32(u[7], v_bit);
1273
1274 // stage 5
1275 addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
1276 addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
1277 addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
1278 addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
1279
1280 // stage 6
1281 u[0] = v[0];
1282 u[1] = v[1];
1283 u[4] = v[4];
1284 u[5] = v[5];
1285
1286 v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]);
1287 x = vmulq_n_s32(v[3], cospi[32]);
1288 u[2] = vaddq_s32(v[0], x);
1289 u[2] = vshlq_s32(u[2], v_bit);
1290
1291 u[3] = vsubq_s32(v[0], x);
1292 u[3] = vshlq_s32(u[3], v_bit);
1293
1294 v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]);
1295 x = vmulq_n_s32(v[7], cospi[32]);
1296 u[6] = vaddq_s32(v[0], x);
1297 u[6] = vshlq_s32(u[6], v_bit);
1298
1299 u[7] = vsubq_s32(v[0], x);
1300 u[7] = vshlq_s32(u[7], v_bit);
1301
1302 // stage 7
1303 if (do_cols) {
1304 out[1] = u[0];
1305 out[3] = vsubq_s32(kZero, u[4]);
1306 out[5] = u[6];
1307 out[7] = vsubq_s32(kZero, u[2]);
1308 out[9] = u[3];
1309 out[11] = vsubq_s32(kZero, u[7]);
1310 out[13] = u[5];
1311 out[15] = vsubq_s32(kZero, u[1]);
1312 } else {
1313 const int log_range_out = AOMMAX(16, bd + 6);
1314 const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
1315 const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
1316 const int32x4_t v_shift = vdupq_n_s32(-out_shift);
1317 int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
1318 neg_shift_neon(&u[0], &u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out,
1319 &v_shift, &offset);
1320 neg_shift_neon(&u[6], &u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out,
1321 &v_shift, &offset);
1322 neg_shift_neon(&u[3], &u[7], out + 9, out + 11, &clamp_lo_out,
1323 &clamp_hi_out, &v_shift, &offset);
1324 neg_shift_neon(&u[5], &u[1], out + 13, out + 15, &clamp_lo_out,
1325 &clamp_hi_out, &v_shift, &offset);
1326 }
1327 }
1328
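// 8-point identity transform: IDTX for this size simply doubles each
// coefficient, then round-shifts and clamps the row-pass output if needed.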
1329 static void iidentity8_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
1330 int bd, int out_shift) {
1331 (void)bit;
1332 out[0] = vaddq_s32(in[0], in[0]);
1333 out[1] = vaddq_s32(in[1], in[1]);
1334 out[2] = vaddq_s32(in[2], in[2]);
1335 out[3] = vaddq_s32(in[3], in[3]);
1336 out[4] = vaddq_s32(in[4], in[4]);
1337 out[5] = vaddq_s32(in[5], in[5]);
1338 out[6] = vaddq_s32(in[6], in[6]);
1339 out[7] = vaddq_s32(in[7], in[7]);
1340
1341 if (!do_cols) {
1342 const int log_range = AOMMAX(16, bd + 6);
1343 const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
1344 const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
1345 round_shift_4x4(out, out_shift);
1346 round_shift_4x4(out + 4, out_shift);
1347 highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 8);
1348 }
1349 }
1350
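// Adds an 8-wide residual (two 4-lane halves) to a prediction row, reversing
// the residual order when fliplr is set, and clamps the result to bd bits.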
1351 static uint16x8_t get_recon_8x8(const uint16x8_t pred, int32x4_t res_lo,
1352 int32x4_t res_hi, int fliplr, int bd) {
1353 uint16x8x2_t x;
1354
1355 if (fliplr) {
1356 res_lo = vrev64q_s32(res_lo);
1357 res_lo = vextq_s32(res_lo, res_lo, 2);
1358 res_hi = vrev64q_s32(res_hi);
1359 res_hi = vextq_s32(res_hi, res_hi, 2);
1360 x.val[0] = vreinterpretq_u16_s32(
1361 vaddw_s16(res_hi, vreinterpret_s16_u16(vget_low_u16(pred))));
1362 x.val[1] = vreinterpretq_u16_s32(
1363 vaddw_s16(res_lo, vreinterpret_s16_u16(vget_high_u16(pred))));
1364
1365 } else {
1366 x.val[0] = vreinterpretq_u16_s32(
1367 vaddw_s16(res_lo, vreinterpret_s16_u16(vget_low_u16(pred))));
1368 x.val[1] = vreinterpretq_u16_s32(
1369 vaddw_s16(res_hi, vreinterpret_s16_u16(vget_high_u16(pred))));
1370 }
1371
1372 uint16x8_t x2 = vcombine_u16(vqmovn_u32(vreinterpretq_u32_u16(x.val[0])),
1373 vqmovn_u32(vreinterpretq_u32_u16(x.val[1])));
1374 const uint16x8_t vmin = vdupq_n_u16(0);
1375 const uint16x8_t vmax = vdupq_n_u16((1 << bd) - 1);
1376 return highbd_clamp_u16(&x2, &vmin, &vmax);
1377 }
1378
1379 static void write_buffer_8x8(int32x4_t *in, uint16_t *output, int stride,
1380 int fliplr, int flipud, int shift, int bd) {
1381 uint16x8_t u0, u1, u2, u3, u4, u5, u6, u7;
1382 uint16x8_t v0, v1, v2, v3, v4, v5, v6, v7;
1383 round_shift_8x8(in, shift);
1384
1385 v0 = vld1q_u16(output + 0 * stride);
1386 v1 = vld1q_u16(output + 1 * stride);
1387 v2 = vld1q_u16(output + 2 * stride);
1388 v3 = vld1q_u16(output + 3 * stride);
1389 v4 = vld1q_u16(output + 4 * stride);
1390 v5 = vld1q_u16(output + 5 * stride);
1391 v6 = vld1q_u16(output + 6 * stride);
1392 v7 = vld1q_u16(output + 7 * stride);
1393
1394 if (flipud) {
1395 u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd);
1396 u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd);
1397 u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd);
1398 u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd);
1399 u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd);
1400 u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd);
1401 u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd);
1402 u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd);
1403 } else {
1404 u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd);
1405 u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd);
1406 u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd);
1407 u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd);
1408 u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd);
1409 u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd);
1410 u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd);
1411 u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd);
1412 }
1413
1414 vst1q_u16(output + 0 * stride, u0);
1415 vst1q_u16(output + 1 * stride, u1);
1416 vst1q_u16(output + 2 * stride, u2);
1417 vst1q_u16(output + 3 * stride, u3);
1418 vst1q_u16(output + 4 * stride, u4);
1419 vst1q_u16(output + 5 * stride, u5);
1420 vst1q_u16(output + 6 * stride, u6);
1421 vst1q_u16(output + 7 * stride, u7);
1422 }
1423
1424 void av1_inv_txfm2d_add_8x8_neon(const int32_t *input, uint16_t *output,
1425 int stride, TX_TYPE tx_type, int bd) {
1426 int32x4_t in[16], out[16];
1427 const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8];
1428
1429 switch (tx_type) {
1430 case DCT_DCT:
1431 load_buffer_8x8(input, in);
1432 idct8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1433 transpose_8x8(out, in);
1434 idct8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
1435 write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
1436 break;
1437 case DCT_ADST:
1438 load_buffer_8x8(input, in);
1439 iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1440 transpose_8x8(out, in);
1441 idct8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
1442 write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
1443 break;
1444 case ADST_DCT:
1445 load_buffer_8x8(input, in);
1446 idct8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1447 transpose_8x8(out, in);
1448 iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
1449 write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
1450 break;
1451 case ADST_ADST:
1452 load_buffer_8x8(input, in);
1453 iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1454 transpose_8x8(out, in);
1455 iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
1456 write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
1457 break;
1458 case FLIPADST_DCT:
1459 load_buffer_8x8(input, in);
1460 idct8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1461 transpose_8x8(out, in);
1462 iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
1463 write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd);
1464 break;
1465 case DCT_FLIPADST:
1466 load_buffer_8x8(input, in);
1467 iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1468 transpose_8x8(out, in);
1469 idct8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
1470 write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd);
1471 break;
1472 case ADST_FLIPADST:
1473 load_buffer_8x8(input, in);
1474 iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1475 transpose_8x8(out, in);
1476 iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
1477 write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd);
1478 break;
1479 case FLIPADST_FLIPADST:
1480 load_buffer_8x8(input, in);
1481 iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1482 transpose_8x8(out, in);
1483 iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
1484 write_buffer_8x8(out, output, stride, 1, 1, -shift[1], bd);
1485 break;
1486 case FLIPADST_ADST:
1487 load_buffer_8x8(input, in);
1488 iadst8x8_neon(in, out, INV_COS_BIT, 0, bd, -shift[0]);
1489 transpose_8x8(out, in);
1490 iadst8x8_neon(in, out, INV_COS_BIT, 1, bd, 0);
1491 write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd);
1492 break;
1493 default: assert(0);
1494 }
1495 }
1496
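// DC-only 8-point IDCT: when only in[0] is non-zero, every output is the
// same rounded product (in[0] * cospi[32] + (1 << (bit - 1))) >> bit, so it
// is computed once and broadcast to all eight outputs.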
1497 static void idct8x8_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
1498 int do_cols, int bd, int out_shift) {
1499 const int32_t *cospi = cospi_arr(bit);
1500 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1501 int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
1502 int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
1503 int32x4_t x;
1504 const int32x4_t v_bit = vdupq_n_s32(-bit);
1505 const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
1506 // stage 0-1-2-3
1507 x = vmlaq_n_s32(rnding, in[0], cospi[32]);
1508 x = vshlq_s32(x, v_bit);
1509
1510 // stage 4-5
1511 if (!do_cols) {
1512 const int log_range_out = AOMMAX(16, bd + 6);
1513 clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1)));
1514 clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
1515
1516 int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
1517 x = vaddq_s32(x, offset);
1518 x = vshlq_s32(x, vdupq_n_s32(-out_shift));
1519 }
1520
1521 x = vmaxq_s32(x, clamp_lo);
1522 x = vminq_s32(x, clamp_hi);
1523 out[0] = x;
1524 out[1] = x;
1525 out[2] = x;
1526 out[3] = x;
1527 out[4] = x;
1528 out[5] = x;
1529 out[6] = x;
1530 out[7] = x;
1531 }
1532
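// Full 8-point IDCT on four columns at a time. Each rotation computes
// (w0 * a + w1 * b + (1 << (bit - 1))) >> bit by seeding the multiply-
// accumulate with `rnding` and shifting by `v_bit`; add/subtract results
// are clamped to the signed range given by `log_range`.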
1533 static void idct8x8_new_neon(int32x4_t *in, int32x4_t *out, int bit,
1534 int do_cols, int bd, int out_shift) {
1535 const int32_t *cospi = cospi_arr(bit);
1536 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1537 const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
1538 const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
1539 int32x4_t u0, u1, u2, u3, u4, u5, u6, u7;
1540 int32x4_t v0, v1, v2, v3, v4, v5, v6, v7;
1541 int32x4_t x, y;
1542 const int32x4_t v_bit = vdupq_n_s32(-bit);
1543 const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
1544
1545 // stage 0
1546 // stage 1
1547 // stage 2
1548 u0 = in[0];
1549 u1 = in[4];
1550 u2 = in[2];
1551 u3 = in[6];
1552
1553 x = vmlaq_n_s32(rnding, in[1], cospi[56]);
1554 u4 = vmlaq_n_s32(x, in[7], -cospi[8]);
1555 u4 = vshlq_s32(u4, v_bit);
1556
1557 x = vmlaq_n_s32(rnding, in[1], cospi[8]);
1558 u7 = vmlaq_n_s32(x, in[7], cospi[56]);
1559 u7 = vshlq_s32(u7, v_bit);
1560
1561 x = vmlaq_n_s32(rnding, in[5], cospi[24]);
1562 u5 = vmlaq_n_s32(x, in[3], -cospi[40]);
1563 u5 = vshlq_s32(u5, v_bit);
1564
1565 x = vmlaq_n_s32(rnding, in[5], cospi[40]);
1566 u6 = vmlaq_n_s32(x, in[3], cospi[24]);
1567 u6 = vshlq_s32(u6, v_bit);
1568
1569 // stage 3
1570 x = vmlaq_n_s32(rnding, u0, cospi[32]);
1571 y = vmulq_n_s32(u1, cospi[32]);
1572 v0 = vaddq_s32(x, y);
1573 v0 = vshlq_s32(v0, v_bit);
1574
1575 v1 = vsubq_s32(x, y);
1576 v1 = vshlq_s32(v1, v_bit);
1577
1578 x = vmlaq_n_s32(rnding, u2, cospi[48]);
1579 v2 = vmlaq_n_s32(x, u3, -cospi[16]);
1580 v2 = vshlq_s32(v2, v_bit);
1581
1582 x = vmlaq_n_s32(rnding, u2, cospi[16]);
1583 v3 = vmlaq_n_s32(x, u3, cospi[48]);
1584 v3 = vshlq_s32(v3, v_bit);
1585
1586 addsub_neon(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
1587 addsub_neon(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
1588
1589 // stage 4
1590 addsub_neon(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
1591 addsub_neon(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
1592 u4 = v4;
1593 u7 = v7;
1594
1595 x = vmulq_n_s32(v5, cospi[32]);
1596 y = vmlaq_n_s32(rnding, v6, cospi[32]);
1597 u6 = vaddq_s32(y, x);
1598 u6 = vshlq_s32(u6, v_bit);
1599
1600 u5 = vsubq_s32(y, x);
1601 u5 = vshlq_s32(u5, v_bit);
1602
1603 // stage 5
1604 addsub_neon(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
1605 addsub_neon(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
1606 addsub_neon(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
1607 addsub_neon(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);
1608
1609 if (!do_cols) {
1610 const int log_range_out = AOMMAX(16, bd + 6);
1611 const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
1612 const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
1613 round_shift_4x4(out, out_shift);
1614 round_shift_4x4(out + 4, out_shift);
1615 highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 8);
1616 }
1617 }
1618
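// 8-point inverse ADST with only in[0] non-zero: the later stages reduce to
// rotations of the two stage-2 products, and the stage-7 sign pattern is
// applied either directly (column pass) or via neg_shift_neon together with
// the output rounding shift (row pass).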
1619 static void iadst8x8_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
1620 int do_cols, int bd, int out_shift) {
1621 const int32_t *cospi = cospi_arr(bit);
1622 int32x4_t u[8], x;
1623 const int32x4_t v_bit = vdupq_n_s32(-bit);
1624 const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
1625 // stage 0-2
1626
1627 u[0] = vmlaq_n_s32(rnding, in[0], cospi[60]);
1628 u[0] = vshlq_s32(u[0], v_bit);
1629
1630 u[1] = vmlaq_n_s32(rnding, in[0], cospi[4]);
1631 u[1] = vshlq_s32(vnegq_s32(u[1]), v_bit);
1632
1633 // stage 3-4
1634 int32x4_t temp1, temp2;
1635 temp1 = vmlaq_n_s32(rnding, u[0], cospi[16]);
1636 temp1 = vmlaq_n_s32(temp1, u[1], cospi[48]);
1637 temp1 = vshlq_s32(temp1, v_bit);
1638 u[4] = temp1;
1639
1640 temp2 = vmlaq_n_s32(rnding, u[0], cospi[48]);
1641 u[5] = vmlsq_n_s32(temp2, u[1], cospi[16]);
1642 u[5] = vshlq_s32(u[5], v_bit);
1643
1644 // stage 5-6
1645 temp1 = vmlaq_n_s32(rnding, u[0], cospi[32]);
1646 x = vmulq_n_s32(u[1], cospi[32]);
1647 u[2] = vaddq_s32(temp1, x);
1648 u[2] = vshlq_s32(u[2], v_bit);
1649
1650 u[3] = vsubq_s32(temp1, x);
1651 u[3] = vshlq_s32(u[3], v_bit);
1652
1653 temp1 = vmlaq_n_s32(rnding, u[4], cospi[32]);
1654 x = vmulq_n_s32(u[5], cospi[32]);
1655 u[6] = vaddq_s32(temp1, x);
1656 u[6] = vshlq_s32(u[6], v_bit);
1657
1658 u[7] = vsubq_s32(temp1, x);
1659 u[7] = vshlq_s32(u[7], v_bit);
1660
1661 // stage 7
1662 if (do_cols) {
1663 out[0] = u[0];
1664 out[1] = vnegq_s32(u[4]);
1665 out[2] = u[6];
1666 out[3] = vnegq_s32(u[2]);
1667 out[4] = u[3];
1668 out[5] = vnegq_s32(u[7]);
1669 out[6] = u[5];
1670 out[7] = vnegq_s32(u[1]);
1671 } else {
1672 const int log_range_out = AOMMAX(16, bd + 6);
1673 const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
1674 const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
1675 const int32x4_t v_shift = vdupq_n_s32(-out_shift);
1676 int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
1677 neg_shift_neon(&u[0], &u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
1678 &v_shift, &offset);
1679 neg_shift_neon(&u[6], &u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
1680 &v_shift, &offset);
1681 neg_shift_neon(&u[3], &u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
1682 &v_shift, &offset);
1683 neg_shift_neon(&u[5], &u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
1684 &v_shift, &offset);
1685 }
1686 }
1687
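// Full 8-point inverse ADST. Stages 2, 4 and 6 rotate coefficient pairs by
// the cospi constants, stages 3 and 5 are clamped add/subtract butterflies,
// and stage 7 negates every other output (folded into neg_shift_neon when a
// row-pass rounding shift is also required).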
1688 static void iadst8x8_new_neon(int32x4_t *in, int32x4_t *out, int bit,
1689 int do_cols, int bd, int out_shift) {
1690 const int32_t *cospi = cospi_arr(bit);
1692 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1693 const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
1694 const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
1695 int32x4_t u[8], v[8], x;
1696 const int32x4_t v_bit = vdupq_n_s32(-bit);
1697 const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
1698 // stage 0-2
1699
1700 u[0] = vmlaq_n_s32(rnding, in[7], cospi[4]);
1701 u[0] = vmlaq_n_s32(u[0], in[0], cospi[60]);
1702 u[0] = vshlq_s32(u[0], v_bit);
1703
1704 u[1] = vmlaq_n_s32(rnding, in[7], cospi[60]);
1705 u[1] = vmlsq_n_s32(u[1], in[0], cospi[4]);
1706 u[1] = vshlq_s32(u[1], v_bit);
1707
1708 // (2)
1709 u[2] = vmlaq_n_s32(rnding, in[5], cospi[20]);
1710 u[2] = vmlaq_n_s32(u[2], in[2], cospi[44]);
1711 u[2] = vshlq_s32(u[2], v_bit);
1712
1713 u[3] = vmlaq_n_s32(rnding, in[5], cospi[44]);
1714 u[3] = vmlsq_n_s32(u[3], in[2], cospi[20]);
1715 u[3] = vshlq_s32(u[3], v_bit);
1716
1717 // (3)
1718 u[4] = vmlaq_n_s32(rnding, in[3], cospi[36]);
1719 u[4] = vmlaq_n_s32(u[4], in[4], cospi[28]);
1720 u[4] = vshlq_s32(u[4], v_bit);
1721
1722 u[5] = vmlaq_n_s32(rnding, in[3], cospi[28]);
1723 u[5] = vmlsq_n_s32(u[5], in[4], cospi[36]);
1724 u[5] = vshlq_s32(u[5], v_bit);
1725
1726 // (4)
1727 u[6] = vmulq_n_s32(in[1], cospi[52]);
1728 u[6] = vmlaq_n_s32(u[6], in[6], cospi[12]);
1729 u[6] = vaddq_s32(u[6], rnding);
1730 u[6] = vshlq_s32(u[6], v_bit);
1731
1732 u[7] = vmulq_n_s32(in[1], cospi[12]);
1733 u[7] = vmlsq_n_s32(u[7], in[6], cospi[52]);
1734 u[7] = vaddq_s32(u[7], rnding);
1735 u[7] = vshlq_s32(u[7], v_bit);
1736
1737 // stage 3
1738 addsub_neon(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
1739 addsub_neon(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
1740 addsub_neon(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
1741 addsub_neon(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
1742
1743 // stage 4
1744 u[0] = v[0];
1745 u[1] = v[1];
1746 u[2] = v[2];
1747 u[3] = v[3];
1748
1749 u[4] = vmlaq_n_s32(rnding, v[4], cospi[16]);
1750 u[4] = vmlaq_n_s32(u[4], v[5], cospi[48]);
1751 u[4] = vshlq_s32(u[4], v_bit);
1752
1753 u[5] = vmlaq_n_s32(rnding, v[4], cospi[48]);
1754 u[5] = vmlsq_n_s32(u[5], v[5], cospi[16]);
1755 u[5] = vshlq_s32(u[5], v_bit);
1756
1757 u[6] = vmlsq_n_s32(rnding, v[6], cospi[48]);
1758 u[6] = vmlaq_n_s32(u[6], v[7], cospi[16]);
1759 u[6] = vshlq_s32(u[6], v_bit);
1760
1761 u[7] = vmlaq_n_s32(rnding, v[6], cospi[16]);
1762 u[7] = vmlaq_n_s32(u[7], v[7], cospi[48]);
1763 u[7] = vshlq_s32(u[7], v_bit);
1764
1765 // stage 5
1766 addsub_neon(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
1767 addsub_neon(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
1768 addsub_neon(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
1769 addsub_neon(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
1770
1771 // stage 6
1772 u[0] = v[0];
1773 u[1] = v[1];
1774 u[4] = v[4];
1775 u[5] = v[5];
1776
1777 v[0] = vmlaq_n_s32(rnding, v[2], cospi[32]);
1778 x = vmulq_n_s32(v[3], cospi[32]);
1779 u[2] = vaddq_s32(v[0], x);
1780 u[2] = vshlq_s32(u[2], v_bit);
1781
1782 u[3] = vsubq_s32(v[0], x);
1783 u[3] = vshlq_s32(u[3], v_bit);
1784
1785 v[0] = vmlaq_n_s32(rnding, v[6], cospi[32]);
1786 x = vmulq_n_s32(v[7], cospi[32]);
1787 u[6] = vaddq_s32(v[0], x);
1788 u[6] = vshlq_s32(u[6], v_bit);
1789
1790 u[7] = vsubq_s32(v[0], x);
1791 u[7] = vshlq_s32(u[7], v_bit);
1792
1793 // stage 7
1794 if (do_cols) {
1795 out[0] = u[0];
1796 out[1] = vnegq_s32(u[4]);
1797 out[2] = u[6];
1798 out[3] = vnegq_s32(u[2]);
1799 out[4] = u[3];
1800 out[5] = vnegq_s32(u[7]);
1801 out[6] = u[5];
1802 out[7] = vnegq_s32(u[1]);
1803 } else {
1804 const int log_range_out = AOMMAX(16, bd + 6);
1805 const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
1806 const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
1807 const int32x4_t v_shift = vdupq_n_s32(-out_shift);
1808 int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
1809 neg_shift_neon(&u[0], &u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
1810 &v_shift, &offset);
1811 neg_shift_neon(&u[6], &u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
1812 &v_shift, &offset);
1813 neg_shift_neon(&u[3], &u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
1814 &v_shift, &offset);
1815 neg_shift_neon(&u[5], &u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
1816 &v_shift, &offset);
1817 }
1818 }
1819
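// DC-only 16-point IDCT: compute the single rounded product
// (in[0] * cospi[32] + (1 << (bit - 1))) >> bit and broadcast it to all
// sixteen outputs.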
1820 static void idct16x16_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
1821 int do_cols, int bd, int out_shift) {
1822 const int32_t *cospi = cospi_arr(bit);
1823 int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1824 int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
1825 int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
1826 const int32x4_t v_bit = vdupq_n_s32(-bit);
1827 const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
1828 // stage 0-4
1829 in[0] = vmlaq_n_s32(rnding, in[0], cospi[32]);
1830 in[0] = vshlq_s32(in[0], v_bit);
1831
1832 // stage 5-7
1833 if (!do_cols) {
1834 log_range = AOMMAX(16, bd + 6);
1835 clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
1836 clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
1837 if (out_shift != 0) {
1838 int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
1839 in[0] = vaddq_s32(in[0], offset);
1840 in[0] = vshlq_s32(in[0], vdupq_n_s32(-out_shift));
1841 }
1842 }
1843
1844 in[0] = vmaxq_s32(in[0], clamp_lo);
1845 in[0] = vminq_s32(in[0], clamp_hi);
1846 out[0] = in[0];
1847 out[1] = in[0];
1848 out[2] = in[0];
1849 out[3] = in[0];
1850 out[4] = in[0];
1851 out[5] = in[0];
1852 out[6] = in[0];
1853 out[7] = in[0];
1854 out[8] = in[0];
1855 out[9] = in[0];
1856 out[10] = in[0];
1857 out[11] = in[0];
1858 out[12] = in[0];
1859 out[13] = in[0];
1860 out[14] = in[0];
1861 out[15] = in[0];
1862 }
1863
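// 16-point IDCT specialised for the case where only the first eight input
// coefficients (in[0]..in[7]) are non-zero, so the stage 2-3 butterflies use
// the single-input half_btf_0 form instead of full two-input rotations.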
1864 static void idct16x16_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
1865 int do_cols, int bd, int out_shift) {
1866 const int32_t *cospi = cospi_arr(bit);
1867 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1868 const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
1869 const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
1870 const int32x4_t v_bit = vdupq_n_s32(-bit);
1871 const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
1872 int32x4_t u[16], x, y;
1873 // stage 0-1
1874 u[0] = in[0];
1875 u[2] = in[4];
1876 u[4] = in[2];
1877 u[6] = in[6];
1878 u[8] = in[1];
1879 u[10] = in[5];
1880 u[12] = in[3];
1881 u[14] = in[7];
1882
1883 // stage 2
1884 u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);
1885 u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);
1886
1887 u[9] = half_btf_0_m_neon_r(&cospi[36], &u[14], &v_bit, &rnding);
1888 u[14] = half_btf_0_neon_r(&cospi[28], &u[14], &v_bit, &rnding);
1889
1890 u[13] = half_btf_0_neon_r(&cospi[20], &u[10], &v_bit, &rnding);
1891 u[10] = half_btf_0_neon_r(&cospi[44], &u[10], &v_bit, &rnding);
1892
1893 u[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding);
1894 u[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding);
1895
1896 // stage 3
1897 u[7] = half_btf_0_neon_r(&cospi[8], &u[4], &v_bit, &rnding);
1898 u[4] = half_btf_0_neon_r(&cospi[56], &u[4], &v_bit, &rnding);
1899 u[5] = half_btf_0_m_neon_r(&cospi[40], &u[6], &v_bit, &rnding);
1900 u[6] = half_btf_0_neon_r(&cospi[24], &u[6], &v_bit, &rnding);
1901
1902 addsub_neon(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
1903 addsub_neon(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
1904 addsub_neon(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
1905 addsub_neon(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
1906
1907 // stage 4
1908 x = vmlaq_n_s32(rnding, u[0], cospi[32]);
1909 u[0] = vshlq_s32(x, v_bit);
1910 u[1] = u[0];
1911
1912 u[3] = half_btf_0_neon_r(&cospi[16], &u[2], &v_bit, &rnding);
1913 u[2] = half_btf_0_neon_r(&cospi[48], &u[2], &v_bit, &rnding);
1914
1915 addsub_neon(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
1916 addsub_neon(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
1917
1918 x = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
1919 &rnding);
1920 u[14] =
1921 half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
1922 u[9] = x;
1923 y = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13], &v_bit,
1924 &rnding);
1925 u[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13], &v_bit,
1926 &rnding);
1927 u[10] = y;
1928
1929 // stage 5
1930 addsub_neon(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
1931 addsub_neon(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
1932
1933 x = vmulq_n_s32(u[5], cospi[32]);
1934 y = vmlaq_n_s32(rnding, u[6], cospi[32]);
1935 u[5] = vsubq_s32(y, x);
1936 u[5] = vshlq_s32(u[5], v_bit);
1937
1938 u[6] = vaddq_s32(y, x);
1939 u[6] = vshlq_s32(u[6], v_bit);
1940
1941 addsub_neon(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
1942 addsub_neon(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
1943 addsub_neon(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
1944 addsub_neon(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
1945
1946 // stage 6
1947 addsub_neon(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
1948 addsub_neon(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
1949 addsub_neon(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
1950 addsub_neon(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
1951
1952 x = vmulq_n_s32(u[10], cospi[32]);
1953 y = vmlaq_n_s32(rnding, u[13], cospi[32]);
1954 u[10] = vsubq_s32(y, x);
1955 u[10] = vshlq_s32(u[10], v_bit);
1956
1957 u[13] = vaddq_s32(x, y);
1958 u[13] = vshlq_s32(u[13], v_bit);
1959
1960 x = vmulq_n_s32(u[11], cospi[32]);
1961 y = vmlaq_n_s32(rnding, u[12], cospi[32]);
1962 u[11] = vsubq_s32(y, x);
1963 u[11] = vshlq_s32(u[11], v_bit);
1964
1965 u[12] = vaddq_s32(x, y);
1966 u[12] = vshlq_s32(u[12], v_bit);
1967 // stage 7
1968 addsub_neon(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
1969 addsub_neon(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
1970 addsub_neon(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
1971 addsub_neon(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
1972 addsub_neon(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
1973 addsub_neon(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
1974 addsub_neon(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
1975 addsub_neon(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
1976
1977 if (!do_cols) {
1978 const int log_range_out = AOMMAX(16, bd + 6);
1979 const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
1980 const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
1981 round_shift_8x8(out, out_shift);
1982 highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16);
1983 }
1984 }
1985
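// 16-point inverse ADST with only in[0] non-zero: each stage propagates the
// two stage-2 products through the rotation network, and stage 9 applies
// the alternating sign pattern (directly for the column pass, or via
// neg_shift_neon for the row pass).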
1986 static void iadst16x16_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
1987 int do_cols, int bd, int out_shift) {
1988 const int32_t *cospi = cospi_arr(bit);
1989 int32x4_t v[16], x, y, temp1, temp2;
1990 const int32x4_t v_bit = vdupq_n_s32(-bit);
1991 const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
1992 // stage 0
1993 // stage 1
1994 // stage 2
1995 v[0] = vmlaq_n_s32(rnding, in[0], cospi[62]);
1996 v[0] = vshlq_s32(v[0], v_bit);
1997
1998 v[1] = vmlsq_n_s32(rnding, in[0], cospi[2]);
1999 v[1] = vshlq_s32(v[1], v_bit);
2000
2001 // stage 3
2002 v[8] = v[0];
2003 v[9] = v[1];
2004
2005 // stage 4
2006 temp1 = vmlaq_n_s32(rnding, v[8], cospi[8]);
2007 temp1 = vmlaq_n_s32(temp1, v[9], cospi[56]);
2008 temp1 = vshlq_s32(temp1, v_bit);
2009
2010 temp2 = vmlaq_n_s32(rnding, v[8], cospi[56]);
2011 temp2 = vmlsq_n_s32(temp2, v[9], cospi[8]);
2012 temp2 = vshlq_s32(temp2, v_bit);
2013 v[8] = temp1;
2014 v[9] = temp2;
2015
2016 // stage 5
2017 v[4] = v[0];
2018 v[5] = v[1];
2019 v[12] = v[8];
2020 v[13] = v[9];
2021
2022 // stage 6
2023 temp1 = vmlaq_n_s32(rnding, v[4], cospi[16]);
2024 temp1 = vmlaq_n_s32(temp1, v[5], cospi[48]);
2025 temp1 = vshlq_s32(temp1, v_bit);
2026
2027 temp2 = vmlaq_n_s32(rnding, v[4], cospi[48]);
2028 temp2 = vmlsq_n_s32(temp2, v[5], cospi[16]);
2029 temp2 = vshlq_s32(temp2, v_bit);
2030 v[4] = temp1;
2031 v[5] = temp2;
2032
2033 temp1 = vmlaq_n_s32(rnding, v[12], cospi[16]);
2034 temp1 = vmlaq_n_s32(temp1, v[13], cospi[48]);
2035 temp1 = vshlq_s32(temp1, v_bit);
2036
2037 temp2 = vmlaq_n_s32(rnding, v[12], cospi[48]);
2038 temp2 = vmlsq_n_s32(temp2, v[13], cospi[16]);
2039 temp2 = vshlq_s32(temp2, v_bit);
2040 v[12] = temp1;
2041 v[13] = temp2;
2042
2043 // stage 7
2044 v[2] = v[0];
2045 v[3] = v[1];
2046 v[6] = v[4];
2047 v[7] = v[5];
2048 v[10] = v[8];
2049 v[11] = v[9];
2050 v[14] = v[12];
2051 v[15] = v[13];
2052
2053 // stage 8
2054 y = vmlaq_n_s32(rnding, v[2], cospi[32]);
2055 x = vmulq_n_s32(v[3], cospi[32]);
2056 v[2] = vaddq_s32(y, x);
2057 v[2] = vshlq_s32(v[2], v_bit);
2058
2059 v[3] = vsubq_s32(y, x);
2060 v[3] = vshlq_s32(v[3], v_bit);
2061
2062 y = vmlaq_n_s32(rnding, v[6], cospi[32]);
2063 x = vmulq_n_s32(v[7], cospi[32]);
2064 v[6] = vaddq_s32(y, x);
2065 v[6] = vshlq_s32(v[6], v_bit);
2066
2067 v[7] = vsubq_s32(y, x);
2068 v[7] = vshlq_s32(v[7], v_bit);
2069
2070 y = vmlaq_n_s32(rnding, v[10], cospi[32]);
2071 x = vmulq_n_s32(v[11], cospi[32]);
2072 v[10] = vaddq_s32(y, x);
2073 v[10] = vshlq_s32(v[10], v_bit);
2074
2075 v[11] = vsubq_s32(y, x);
2076 v[11] = vshlq_s32(v[11], v_bit);
2077
2078 y = vmlaq_n_s32(rnding, v[14], cospi[32]);
2079 x = vmulq_n_s32(v[15], cospi[32]);
2080 v[14] = vaddq_s32(y, x);
2081 v[14] = vshlq_s32(v[14], v_bit);
2082
2083 v[15] = vsubq_s32(y, x);
2084 v[15] = vshlq_s32(v[15], v_bit);
2085
2086 // stage 9
2087 if (do_cols) {
2088 out[0] = v[0];
2089 out[1] = vnegq_s32(v[8]);
2090 out[2] = v[12];
2091 out[3] = vnegq_s32(v[4]);
2092 out[4] = v[6];
2093 out[5] = vnegq_s32(v[14]);
2094 out[6] = v[10];
2095 out[7] = vnegq_s32(v[2]);
2096 out[8] = v[3];
2097 out[9] = vnegq_s32(v[11]);
2098 out[10] = v[15];
2099 out[11] = vnegq_s32(v[7]);
2100 out[12] = v[5];
2101 out[13] = vnegq_s32(v[13]);
2102 out[14] = v[9];
2103 out[15] = vnegq_s32(v[1]);
2104 } else {
2105 const int log_range_out = AOMMAX(16, bd + 6);
2106 const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
2107 const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
2108 const int32x4_t v_shift = vdupq_n_s32(-out_shift);
2109 int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
2110 neg_shift_neon(&v[0], &v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2111 &v_shift, &offset);
2112 neg_shift_neon(&v[12], &v[4], out + 2, out + 3, &clamp_lo_out,
2113 &clamp_hi_out, &v_shift, &offset);
2114 neg_shift_neon(&v[6], &v[14], out + 4, out + 5, &clamp_lo_out,
2115 &clamp_hi_out, &v_shift, &offset);
2116 neg_shift_neon(&v[10], &v[2], out + 6, out + 7, &clamp_lo_out,
2117 &clamp_hi_out, &v_shift, &offset);
2118 neg_shift_neon(&v[3], &v[11], out + 8, out + 9, &clamp_lo_out,
2119 &clamp_hi_out, &v_shift, &offset);
2120 neg_shift_neon(&v[15], &v[7], out + 10, out + 11, &clamp_lo_out,
2121 &clamp_hi_out, &v_shift, &offset);
2122 neg_shift_neon(&v[5], &v[13], out + 12, out + 13, &clamp_lo_out,
2123 &clamp_hi_out, &v_shift, &offset);
2124 neg_shift_neon(&v[9], &v[1], out + 14, out + 15, &clamp_lo_out,
2125 &clamp_hi_out, &v_shift, &offset);
2126 }
2127 }
2128
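// 16-point inverse ADST for inputs whose upper half is zero: the stage-2
// products are formed from in[0]..in[7] only, after which stages 3-9 follow
// the same structure as the full iadst16x16_neon below.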
2129 static void iadst16x16_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
2130 int do_cols, int bd, int out_shift) {
2131 const int32_t *cospi = cospi_arr(bit);
2132 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2133 const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
2134 const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
2135 int32x4_t zero = vdupq_n_s32(0);
2136 int32x4_t u[16], x, y;
2137 const int32x4_t v_bit = vdupq_n_s32(-bit);
2138 const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
2139 // stage 0-2
2140 u[0] = vmlaq_n_s32(rnding, in[0], cospi[62]);
2141 u[0] = vshlq_s32(u[0], v_bit);
2142
2143 u[1] = vmlsq_n_s32(rnding, in[0], cospi[2]);
2144 u[1] = vshlq_s32(u[1], v_bit);
2145
2146 u[2] = vmlaq_n_s32(rnding, in[2], cospi[54]);
2147 u[2] = vshlq_s32(u[2], v_bit);
2148
2149 u[3] = vmlsq_n_s32(rnding, in[2], cospi[10]);
2150 u[3] = vshlq_s32(u[3], v_bit);
2151
2152 u[4] = vmlaq_n_s32(rnding, in[4], cospi[46]);
2153 u[4] = vshlq_s32(u[4], v_bit);
2154
2155 u[5] = vmlsq_n_s32(rnding, in[4], cospi[18]);
2156 u[5] = vshlq_s32(u[5], v_bit);
2157
2158 u[6] = vmlaq_n_s32(rnding, in[6], cospi[38]);
2159 u[6] = vshlq_s32(u[6], v_bit);
2160
2161 u[7] = vmlsq_n_s32(rnding, in[6], cospi[26]);
2162 u[7] = vshlq_s32(u[7], v_bit);
2163
2164 u[8] = vmlaq_n_s32(rnding, in[7], cospi[34]);
2165 u[8] = vshlq_s32(u[8], v_bit);
2166
2167 u[9] = vmlaq_n_s32(rnding, in[7], cospi[30]);
2168 u[9] = vshlq_s32(u[9], v_bit);
2169
2170 u[10] = vmlaq_n_s32(rnding, in[5], cospi[42]);
2171 u[10] = vshlq_s32(u[10], v_bit);
2172
2173 u[11] = vmlaq_n_s32(rnding, in[5], cospi[22]);
2174 u[11] = vshlq_s32(u[11], v_bit);
2175
2176 u[12] = vmlaq_n_s32(rnding, in[3], cospi[50]);
2177 u[12] = vshlq_s32(u[12], v_bit);
2178
2179 u[13] = vmlaq_n_s32(rnding, in[3], cospi[14]);
2180 u[13] = vshlq_s32(u[13], v_bit);
2181
2182 u[14] = vmlaq_n_s32(rnding, in[1], cospi[58]);
2183 u[14] = vshlq_s32(u[14], v_bit);
2184
2185 u[15] = vmlaq_n_s32(rnding, in[1], cospi[6]);
2186 u[15] = vshlq_s32(u[15], v_bit);
2187
2188 // stage 3
2189 addsub_neon(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
2190 addsub_neon(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
2191 addsub_neon(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
2192 addsub_neon(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
2193 addsub_neon(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
2194 addsub_neon(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
2195 addsub_neon(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
2196 addsub_neon(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
2197
2198 // stage 4
2199 y = vmlaq_n_s32(rnding, u[8], cospi[56]);
2200 u[8] = vmlaq_n_s32(rnding, u[8], cospi[8]);
2201 u[8] = vmlaq_n_s32(u[8], u[9], cospi[56]);
2202 u[8] = vshlq_s32(u[8], v_bit);
2203
2204 u[9] = vmlsq_n_s32(y, u[9], cospi[8]);
2205 u[9] = vshlq_s32(u[9], v_bit);
2206
2207 y = vmlaq_n_s32(rnding, u[10], cospi[24]);
2208 u[10] = vmlaq_n_s32(rnding, u[10], cospi[40]);
2209 u[10] = vmlaq_n_s32(u[10], u[11], cospi[24]);
2210 u[10] = vshlq_s32(u[10], v_bit);
2211
2212 u[11] = vmlsq_n_s32(y, u[11], cospi[40]);
2213 u[11] = vshlq_s32(u[11], v_bit);
2214
2215 y = vmlaq_n_s32(rnding, u[12], cospi[8]);
2216 u[12] = vmlsq_n_s32(rnding, u[12], cospi[56]);
2217 u[12] = vmlaq_n_s32(u[12], u[13], cospi[8]);
2218 u[12] = vshlq_s32(u[12], v_bit);
2219
2220 u[13] = vmlaq_n_s32(y, u[13], cospi[56]);
2221 u[13] = vshlq_s32(u[13], v_bit);
2222
2223 y = vmlaq_n_s32(rnding, u[14], cospi[40]);
2224 u[14] = vmlsq_n_s32(rnding, u[14], cospi[24]);
2225 u[14] = vmlaq_n_s32(u[14], u[15], cospi[40]);
2226 u[14] = vshlq_s32(u[14], v_bit);
2227
2228 u[15] = vmlaq_n_s32(y, u[15], cospi[24]);
2229 u[15] = vshlq_s32(u[15], v_bit);
2230
2231 // stage 5
2232 addsub_neon(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
2233 addsub_neon(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
2234 addsub_neon(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
2235 addsub_neon(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
2236 addsub_neon(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
2237 addsub_neon(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
2238 addsub_neon(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
2239 addsub_neon(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
2240
2241 // stage 6
2242 y = vmlaq_n_s32(rnding, u[4], cospi[48]);
2243 u[4] = vmlaq_n_s32(rnding, u[4], cospi[16]);
2244 u[4] = vmlaq_n_s32(u[4], u[5], cospi[48]);
2245 u[4] = vshlq_s32(u[4], v_bit);
2246
2247 u[5] = vmlsq_n_s32(y, u[5], cospi[16]);
2248 u[5] = vshlq_s32(u[5], v_bit);
2249
2250 y = vmlaq_n_s32(rnding, u[6], cospi[16]);
2251 u[6] = vmlsq_n_s32(rnding, u[6], cospi[48]);
2252 u[6] = vmlaq_n_s32(u[6], u[7], cospi[16]);
2253 u[6] = vshlq_s32(u[6], v_bit);
2254
2255 u[7] = vmlaq_n_s32(y, u[7], cospi[48]);
2256 u[7] = vshlq_s32(u[7], v_bit);
2257
2258 y = vmlaq_n_s32(rnding, u[12], cospi[48]);
2259 u[12] = vmulq_n_s32(u[12], cospi[16]);
2260 u[12] = vmlaq_n_s32(u[12], u[13], cospi[48]);
2261 u[12] = vshlq_s32(u[12], v_bit);
2262
2263 u[13] = vmlsq_n_s32(y, u[13], cospi[16]);
2264 u[13] = vshlq_s32(u[13], v_bit);
2265
2266 y = vmlaq_n_s32(rnding, u[14], cospi[16]);
2267 u[14] = vmlsq_n_s32(rnding, u[14], cospi[48]);
2268 u[14] = vmlaq_n_s32(u[14], u[15], cospi[16]);
2269 u[14] = vshlq_s32(u[14], v_bit);
2270
2271 u[15] = vmlaq_n_s32(y, u[15], cospi[48]);
2272 u[15] = vshlq_s32(u[15], v_bit);
2273
2274 // stage 7
2275 addsub_neon(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
2276 addsub_neon(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
2277 addsub_neon(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
2278 addsub_neon(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
2279 addsub_neon(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
2280 addsub_neon(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
2281 addsub_neon(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
2282 addsub_neon(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
2283
2284 // stage 8
2285 y = vmlaq_n_s32(rnding, u[2], cospi[32]);
2286 x = vmulq_n_s32(u[3], cospi[32]);
2287 u[2] = vaddq_s32(y, x);
2288 u[2] = vshlq_s32(u[2], v_bit);
2289
2290 u[3] = vsubq_s32(y, x);
2291 u[3] = vshlq_s32(u[3], v_bit);
2292 y = vmlaq_n_s32(rnding, u[6], cospi[32]);
2293 x = vmulq_n_s32(u[7], cospi[32]);
2294 u[6] = vaddq_s32(y, x);
2295 u[6] = vshlq_s32(u[6], v_bit);
2296
2297 u[7] = vsubq_s32(y, x);
2298 u[7] = vshlq_s32(u[7], v_bit);
2299
2300 y = vmlaq_n_s32(rnding, u[10], cospi[32]);
2301 x = vmulq_n_s32(u[11], cospi[32]);
2302 u[10] = vaddq_s32(y, x);
2303 u[10] = vshlq_s32(u[10], v_bit);
2304
2305 u[11] = vsubq_s32(y, x);
2306 u[11] = vshlq_s32(u[11], v_bit);
2307
2308 y = vmlaq_n_s32(rnding, u[14], cospi[32]);
2309 x = vmulq_n_s32(u[15], cospi[32]);
2310 u[14] = vaddq_s32(y, x);
2311 u[14] = vshlq_s32(u[14], v_bit);
2312
2313 u[15] = vsubq_s32(y, x);
2314 u[15] = vshlq_s32(u[15], v_bit);
2315
2316 // stage 9
2317 if (do_cols) {
2318 out[0] = u[0];
2319 out[1] = vsubq_s32(zero, u[8]);
2320 out[2] = u[12];
2321 out[3] = vsubq_s32(zero, u[4]);
2322 out[4] = u[6];
2323 out[5] = vsubq_s32(zero, u[14]);
2324 out[6] = u[10];
2325 out[7] = vsubq_s32(zero, u[2]);
2326 out[8] = u[3];
2327 out[9] = vsubq_s32(zero, u[11]);
2328 out[10] = u[15];
2329 out[11] = vsubq_s32(zero, u[7]);
2330 out[12] = u[5];
2331 out[13] = vsubq_s32(zero, u[13]);
2332 out[14] = u[9];
2333 out[15] = vsubq_s32(zero, u[1]);
2334 } else {
2335 const int log_range_out = AOMMAX(16, bd + 6);
2336 const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
2337 const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
2338 const int32x4_t v_shift = vdupq_n_s32(-out_shift);
2339 int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
2340 neg_shift_neon(&u[0], &u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2341 &v_shift, &offset);
2342 neg_shift_neon(&u[12], &u[4], out + 2, out + 3, &clamp_lo_out,
2343 &clamp_hi_out, &v_shift, &offset);
2344 neg_shift_neon(&u[6], &u[14], out + 4, out + 5, &clamp_lo_out,
2345 &clamp_hi_out, &v_shift, &offset);
2346 neg_shift_neon(&u[10], &u[2], out + 6, out + 7, &clamp_lo_out,
2347 &clamp_hi_out, &v_shift, &offset);
2348 neg_shift_neon(&u[3], &u[11], out + 8, out + 9, &clamp_lo_out,
2349 &clamp_hi_out, &v_shift, &offset);
2350 neg_shift_neon(&u[15], &u[7], out + 10, out + 11, &clamp_lo_out,
2351 &clamp_hi_out, &v_shift, &offset);
2352 neg_shift_neon(&u[5], &u[13], out + 12, out + 13, &clamp_lo_out,
2353 &clamp_hi_out, &v_shift, &offset);
2354 neg_shift_neon(&u[9], &u[1], out + 14, out + 15, &clamp_lo_out,
2355 &clamp_hi_out, &v_shift, &offset);
2356 }
2357 }
2358
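// Full 16-point IDCT on four columns at a time: the odd-indexed inputs are
// rotated by the cospi constants in stages 2-4, combined with the even half
// through clamped add/subtract butterflies in stages 3-7, and the row pass
// finishes with a rounding shift and clamp.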
2359 static void idct16x16_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
2360 int bd, int out_shift) {
2361 const int32_t *cospi = cospi_arr(bit);
2362 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2363 const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
2364 const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
2365 int32x4_t u[16], v[16], x, y;
2366 const int32x4_t v_bit = vdupq_n_s32(-bit);
2367 const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
2368
2369 {
2370 // stage 0-1
2371 u[0] = in[0];
2372 u[1] = in[8];
2373 u[2] = in[4];
2374 u[3] = in[12];
2375 u[4] = in[2];
2376 u[5] = in[10];
2377 u[6] = in[6];
2378 u[7] = in[14];
2379 u[8] = in[1];
2380 u[9] = in[9];
2381 u[10] = in[5];
2382 u[11] = in[13];
2383 u[12] = in[3];
2384 u[13] = in[11];
2385 u[14] = in[7];
2386 u[15] = in[15];
2387
2388 // stage 2
2389 v[0] = u[0];
2390 v[1] = u[1];
2391 v[2] = u[2];
2392 v[3] = u[3];
2393 v[4] = u[4];
2394 v[5] = u[5];
2395 v[6] = u[6];
2396 v[7] = u[7];
2397
2398 v[8] = half_btf_neon_mode01_r(&cospi[60], &u[8], &cospi[4], &u[15], &v_bit,
2399 &rnding);
2400 v[9] = half_btf_neon_mode01_r(&cospi[28], &u[9], &cospi[36], &u[14], &v_bit,
2401 &rnding);
2402 v[10] = half_btf_neon_mode01_r(&cospi[44], &u[10], &cospi[20], &u[13],
2403 &v_bit, &rnding);
2404 v[11] = half_btf_neon_mode01_r(&cospi[12], &u[11], &cospi[52], &u[12],
2405 &v_bit, &rnding);
2406 v[12] = half_btf_neon_r(&cospi[52], &u[11], &cospi[12], &u[12], &v_bit,
2407 &rnding);
2408 v[13] = half_btf_neon_r(&cospi[20], &u[10], &cospi[44], &u[13], &v_bit,
2409 &rnding);
2410 v[14] =
2411 half_btf_neon_r(&cospi[36], &u[9], &cospi[28], &u[14], &v_bit, &rnding);
2412 v[15] =
2413 half_btf_neon_r(&cospi[4], &u[8], &cospi[60], &u[15], &v_bit, &rnding);
2414
2415 // stage 3
2416 u[0] = v[0];
2417 u[1] = v[1];
2418 u[2] = v[2];
2419 u[3] = v[3];
2420 u[4] = half_btf_neon_mode01_r(&cospi[56], &v[4], &cospi[8], &v[7], &v_bit,
2421 &rnding);
2422 u[5] = half_btf_neon_mode01_r(&cospi[24], &v[5], &cospi[40], &v[6], &v_bit,
2423 &rnding);
2424 u[6] =
2425 half_btf_neon_r(&cospi[40], &v[5], &cospi[24], &v[6], &v_bit, &rnding);
2426 u[7] =
2427 half_btf_neon_r(&cospi[8], &v[4], &cospi[56], &v[7], &v_bit, &rnding);
2428 addsub_neon(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
2429 addsub_neon(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
2430 addsub_neon(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
2431 addsub_neon(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
2432
2433 // stage 4
2434 x = vmlaq_n_s32(rnding, u[0], cospi[32]);
2435 y = vmulq_n_s32(u[1], cospi[32]);
2436 v[0] = vaddq_s32(x, y);
2437 v[0] = vshlq_s32(v[0], v_bit);
2438
2439 v[1] = vsubq_s32(x, y);
2440 v[1] = vshlq_s32(v[1], v_bit);
2441
2442 v[2] = half_btf_neon_mode01_r(&cospi[48], &u[2], &cospi[16], &u[3], &v_bit,
2443 &rnding);
2444 v[3] =
2445 half_btf_neon_r(&cospi[16], &u[2], &cospi[48], &u[3], &v_bit, &rnding);
2446 addsub_neon(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
2447 addsub_neon(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
2448 v[8] = u[8];
2449 v[9] = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
2450 &rnding);
2451 v[10] = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13],
2452 &v_bit, &rnding);
2453 v[11] = u[11];
2454 v[12] = u[12];
2455 v[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13],
2456 &v_bit, &rnding);
2457 v[14] =
2458 half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
2459 v[15] = u[15];
2460
2461 // stage 5
2462 addsub_neon(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
2463 addsub_neon(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
2464 u[4] = v[4];
2465
2466 x = vmulq_n_s32(v[5], cospi[32]);
2467 y = vmlaq_n_s32(rnding, v[6], cospi[32]);
2468 u[5] = vsubq_s32(y, x);
2469 u[5] = vshlq_s32(u[5], v_bit);
2470
2471 u[6] = vaddq_s32(y, x);
2472 u[6] = vshlq_s32(u[6], v_bit);
2473
2474 u[7] = v[7];
2475 addsub_neon(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
2476 addsub_neon(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
2477 addsub_neon(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
2478 addsub_neon(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
2479
2480 // stage 6
2481 addsub_neon(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
2482 addsub_neon(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
2483 addsub_neon(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
2484 addsub_neon(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
2485 v[8] = u[8];
2486 v[9] = u[9];
2487
2488 x = vmulq_n_s32(u[10], cospi[32]);
2489 y = vmlaq_n_s32(rnding, u[13], cospi[32]);
2490 v[10] = vsubq_s32(y, x);
2491 v[10] = vshlq_s32(v[10], v_bit);
2492
2493 v[13] = vaddq_s32(x, y);
2494 v[13] = vshlq_s32(v[13], v_bit);
2495
2496 x = vmulq_n_s32(u[11], cospi[32]);
2497 y = vmlaq_n_s32(rnding, u[12], cospi[32]);
2498 v[11] = vsubq_s32(y, x);
2499 v[11] = vshlq_s32(v[11], v_bit);
2500
2501 v[12] = vaddq_s32(x, y);
2502 v[12] = vshlq_s32(v[12], v_bit);
2503
2504 v[14] = u[14];
2505 v[15] = u[15];
2506
2507 // stage 7
2508 addsub_neon(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
2509 addsub_neon(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
2510 addsub_neon(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
2511 addsub_neon(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
2512 addsub_neon(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
2513 addsub_neon(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
2514 addsub_neon(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
2515 addsub_neon(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
2516
2517 if (!do_cols) {
2518 const int log_range_out = AOMMAX(16, bd + 6);
2519 const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
2520 const int32x4_t clamp_hi_out =
2521 vdupq_n_s32((1 << (log_range_out - 1)) - 1);
2522 round_shift_8x8(out, out_shift);
2523 highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 16);
2524 }
2525 }
2526 }
2527
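// Full 16-point inverse ADST. Stages 2, 4, 6 and 8 rotate coefficient pairs
// by the cospi constants, stages 3, 5 and 7 are clamped add/subtract
// butterflies, and stage 9 applies the alternating sign pattern (folded into
// neg_shift_neon when the row-pass rounding shift is needed).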
2528 static void iadst16x16_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
2529 int bd, int out_shift) {
2530 const int32_t *cospi = cospi_arr(bit);
2531 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2532 const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
2533 const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
2534 const int32x4_t zero = vdupq_n_s32(0);
2535 const int32x4_t v_bit = vdupq_n_s32(-bit);
2536 const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
2537 int32x4_t u[16], v[16], x, y;
2538 // Calculate columns 0, 1, 2, 3
2539 // stage 0
2540 // stage 1
2541 // stage 2
2542 v[0] = vmlaq_n_s32(rnding, in[15], cospi[2]);
2543 v[0] = vmlaq_n_s32(v[0], in[0], cospi[62]);
2544 v[0] = vshlq_s32(v[0], v_bit);
2545
2546 v[1] = vmlaq_n_s32(rnding, in[15], cospi[62]);
2547 v[1] = vmlsq_n_s32(v[1], in[0], cospi[2]);
2548 v[1] = vshlq_s32(v[1], v_bit);
2549
2550 v[2] = vmlaq_n_s32(rnding, in[13], cospi[10]);
2551 v[2] = vmlaq_n_s32(v[2], in[2], cospi[54]);
2552 v[2] = vshlq_s32(v[2], v_bit);
2553
2554 v[3] = vmlaq_n_s32(rnding, in[13], cospi[54]);
2555 v[3] = vmlsq_n_s32(v[3], in[2], cospi[10]);
2556 v[3] = vshlq_s32(v[3], v_bit);
2557
2558 v[4] = vmlaq_n_s32(rnding, in[11], cospi[18]);
2559 v[4] = vmlaq_n_s32(v[4], in[4], cospi[46]);
2560 v[4] = vshlq_s32(v[4], v_bit);
2561
2562 v[5] = vmlaq_n_s32(rnding, in[11], cospi[46]);
2563 v[5] = vmlsq_n_s32(v[5], in[4], cospi[18]);
2564 v[5] = vshlq_s32(v[5], v_bit);
2565
2566 v[6] = vmlaq_n_s32(rnding, in[9], cospi[26]);
2567 v[6] = vmlaq_n_s32(v[6], in[6], cospi[38]);
2568 v[6] = vshlq_s32(v[6], v_bit);
2569
2570 v[7] = vmlaq_n_s32(rnding, in[9], cospi[38]);
2571 v[7] = vmlsq_n_s32(v[7], in[6], cospi[26]);
2572 v[7] = vshlq_s32(v[7], v_bit);
2573
2574 v[8] = vmlaq_n_s32(rnding, in[7], cospi[34]);
2575 v[8] = vmlaq_n_s32(v[8], in[8], cospi[30]);
2576 v[8] = vshlq_s32(v[8], v_bit);
2577
2578 v[9] = vmlaq_n_s32(rnding, in[7], cospi[30]);
2579 v[9] = vmlsq_n_s32(v[9], in[8], cospi[34]);
2580 v[9] = vshlq_s32(v[9], v_bit);
2581
2582 v[10] = vmlaq_n_s32(rnding, in[5], cospi[42]);
2583 v[10] = vmlaq_n_s32(v[10], in[10], cospi[22]);
2584 v[10] = vshlq_s32(v[10], v_bit);
2585
2586 v[11] = vmlaq_n_s32(rnding, in[5], cospi[22]);
2587 v[11] = vmlsq_n_s32(v[11], in[10], cospi[42]);
2588 v[11] = vshlq_s32(v[11], v_bit);
2589
2590 v[12] = vmlaq_n_s32(rnding, in[3], cospi[50]);
2591 v[12] = vmlaq_n_s32(v[12], in[12], cospi[14]);
2592 v[12] = vshlq_s32(v[12], v_bit);
2593
2594 v[13] = vmlaq_n_s32(rnding, in[3], cospi[14]);
2595 v[13] = vmlsq_n_s32(v[13], in[12], cospi[50]);
2596 v[13] = vshlq_s32(v[13], v_bit);
2597
2598 v[14] = vmlaq_n_s32(rnding, in[1], cospi[58]);
2599 v[14] = vmlaq_n_s32(v[14], in[14], cospi[6]);
2600 v[14] = vshlq_s32(v[14], v_bit);
2601
2602 v[15] = vmlaq_n_s32(rnding, in[1], cospi[6]);
2603 v[15] = vmlsq_n_s32(v[15], in[14], cospi[58]);
2604 v[15] = vshlq_s32(v[15], v_bit);
2605
2606 // stage 3
2607 addsub_neon(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
2608 addsub_neon(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
2609 addsub_neon(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
2610 addsub_neon(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
2611 addsub_neon(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
2612 addsub_neon(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
2613 addsub_neon(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
2614 addsub_neon(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
2615
2616 // stage 4
2617 v[0] = u[0];
2618 v[1] = u[1];
2619 v[2] = u[2];
2620 v[3] = u[3];
2621 v[4] = u[4];
2622 v[5] = u[5];
2623 v[6] = u[6];
2624 v[7] = u[7];
2625
2626 v[8] = vmlaq_n_s32(rnding, u[8], cospi[8]);
2627 v[8] = vmlaq_n_s32(v[8], u[9], cospi[56]);
2628 v[8] = vshlq_s32(v[8], v_bit);
2629
2630 v[9] = vmlaq_n_s32(rnding, u[8], cospi[56]);
2631 v[9] = vmlsq_n_s32(v[9], u[9], cospi[8]);
2632 v[9] = vshlq_s32(v[9], v_bit);
2633
2634 v[10] = vmlaq_n_s32(rnding, u[10], cospi[40]);
2635 v[10] = vmlaq_n_s32(v[10], u[11], cospi[24]);
2636 v[10] = vshlq_s32(v[10], v_bit);
2637
2638 v[11] = vmlaq_n_s32(rnding, u[10], cospi[24]);
2639 v[11] = vmlsq_n_s32(v[11], u[11], cospi[40]);
2640 v[11] = vshlq_s32(v[11], v_bit);
2641
2642 v[12] = vmlaq_n_s32(rnding, u[12], -cospi[56]);
2643 v[12] = vmlaq_n_s32(v[12], u[13], cospi[8]);
2644 v[12] = vshlq_s32(v[12], v_bit);
2645
2646 v[13] = vmlaq_n_s32(rnding, u[12], cospi[8]);
2647 v[13] = vmlsq_n_s32(v[13], u[13], -cospi[56]);
2648 v[13] = vshlq_s32(v[13], v_bit);
2649
2650 v[14] = vmlaq_n_s32(rnding, u[14], -cospi[24]);
2651 v[14] = vmlaq_n_s32(v[14], u[15], cospi[40]);
2652 v[14] = vshlq_s32(v[14], v_bit);
2653
2654 v[15] = vmlaq_n_s32(rnding, u[14], cospi[40]);
2655 v[15] = vmlsq_n_s32(v[15], u[15], -cospi[24]);
2656 v[15] = vshlq_s32(v[15], v_bit);
2657
2658 // stage 5
2659 addsub_neon(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
2660 addsub_neon(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
2661 addsub_neon(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
2662 addsub_neon(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
2663 addsub_neon(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
2664 addsub_neon(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
2665 addsub_neon(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
2666 addsub_neon(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
2667
2668 // stage 6
2669 v[0] = u[0];
2670 v[1] = u[1];
2671 v[2] = u[2];
2672 v[3] = u[3];
2673
2674 v[4] = vmlaq_n_s32(rnding, u[4], cospi[16]);
2675 v[4] = vmlaq_n_s32(v[4], u[5], cospi[48]);
2676 v[4] = vshlq_s32(v[4], v_bit);
2677
2678 v[5] = vmlaq_n_s32(rnding, u[4], cospi[48]);
2679 v[5] = vmlsq_n_s32(v[5], u[5], cospi[16]);
2680 v[5] = vshlq_s32(v[5], v_bit);
2681
2682 v[6] = vmlaq_n_s32(rnding, u[6], -cospi[48]);
2683 v[6] = vmlaq_n_s32(v[6], u[7], cospi[16]);
2684 v[6] = vshlq_s32(v[6], v_bit);
2685
2686 v[7] = vmlaq_n_s32(rnding, u[6], cospi[16]);
2687 v[7] = vmlsq_n_s32(v[7], u[7], -cospi[48]);
2688 v[7] = vshlq_s32(v[7], v_bit);
2689
2690 v[8] = u[8];
2691 v[9] = u[9];
2692 v[10] = u[10];
2693 v[11] = u[11];
2694
2695 v[12] = vmlaq_n_s32(rnding, u[12], cospi[16]);
2696 v[12] = vmlaq_n_s32(v[12], u[13], cospi[48]);
2697 v[12] = vshlq_s32(v[12], v_bit);
2698
2699 v[13] = vmlaq_n_s32(rnding, u[12], cospi[48]);
2700 v[13] = vmlsq_n_s32(v[13], u[13], cospi[16]);
2701 v[13] = vshlq_s32(v[13], v_bit);
2702
2703 v[14] = vmlaq_n_s32(rnding, u[14], -cospi[48]);
2704 v[14] = vmlaq_n_s32(v[14], u[15], cospi[16]);
2705 v[14] = vshlq_s32(v[14], v_bit);
2706
2707 v[15] = vmlaq_n_s32(rnding, u[14], cospi[16]);
2708 v[15] = vmlsq_n_s32(v[15], u[15], -cospi[48]);
2709 v[15] = vshlq_s32(v[15], v_bit);
2710
2711 // stage 7
2712 addsub_neon(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
2713 addsub_neon(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
2714 addsub_neon(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
2715 addsub_neon(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
2716 addsub_neon(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
2717 addsub_neon(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
2718 addsub_neon(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
2719 addsub_neon(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
2720
2721 // stage 8
2722 v[0] = u[0];
2723 v[1] = u[1];
2724
2725 y = vmlaq_n_s32(rnding, u[2], cospi[32]);
2726 x = vmulq_n_s32(u[3], cospi[32]);
2727 v[2] = vaddq_s32(y, x);
2728 v[2] = vshlq_s32(v[2], v_bit);
2729
2730 v[3] = vsubq_s32(y, x);
2731 v[3] = vshlq_s32(v[3], v_bit);
2732
2733 v[4] = u[4];
2734 v[5] = u[5];
2735
2736 y = vmlaq_n_s32(rnding, u[6], cospi[32]);
2737 x = vmulq_n_s32(u[7], cospi[32]);
2738 v[6] = vaddq_s32(y, x);
2739 v[6] = vshlq_s32(v[6], v_bit);
2740
2741 v[7] = vsubq_s32(y, x);
2742 v[7] = vshlq_s32(v[7], v_bit);
2743
2744 v[8] = u[8];
2745 v[9] = u[9];
2746
2747 y = vmlaq_n_s32(rnding, u[10], cospi[32]);
2748 x = vmulq_n_s32(u[11], cospi[32]);
2749 v[10] = vaddq_s32(y, x);
2750 v[10] = vshlq_s32(v[10], v_bit);
2751
2752 v[11] = vsubq_s32(y, x);
2753 v[11] = vshlq_s32(v[11], v_bit);
2754
2755 v[12] = u[12];
2756 v[13] = u[13];
2757
2758 y = vmlaq_n_s32(rnding, u[14], cospi[32]);
2759 x = vmulq_n_s32(u[15], cospi[32]);
2760 v[14] = vaddq_s32(y, x);
2761 v[14] = vshlq_s32(v[14], v_bit);
2762
2763 v[15] = vsubq_s32(y, x);
2764 v[15] = vshlq_s32(v[15], v_bit);
2765
2766 // stage 9
2767 if (do_cols) {
2768 out[0] = v[0];
2769 out[1] = vsubq_s32(zero, v[8]);
2770 out[2] = v[12];
2771 out[3] = vsubq_s32(zero, v[4]);
2772 out[4] = v[6];
2773 out[5] = vsubq_s32(zero, v[14]);
2774 out[6] = v[10];
2775 out[7] = vsubq_s32(zero, v[2]);
2776 out[8] = v[3];
2777 out[9] = vsubq_s32(zero, v[11]);
2778 out[10] = v[15];
2779 out[11] = vsubq_s32(zero, v[7]);
2780 out[12] = v[5];
2781 out[13] = vsubq_s32(zero, v[13]);
2782 out[14] = v[9];
2783 out[15] = vsubq_s32(zero, v[1]);
2784 } else {
2785 const int log_range_out = AOMMAX(16, bd + 6);
2786 const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
2787 const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
2788 const int32x4_t v_shift = vdupq_n_s32(-out_shift);
2789 int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
2790 neg_shift_neon(&v[0], &v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2791 &v_shift, &offset);
2792 neg_shift_neon(&v[12], &v[4], out + 2, out + 3, &clamp_lo_out,
2793 &clamp_hi_out, &v_shift, &offset);
2794 neg_shift_neon(&v[6], &v[14], out + 4, out + 5, &clamp_lo_out,
2795 &clamp_hi_out, &v_shift, &offset);
2796 neg_shift_neon(&v[10], &v[2], out + 6, out + 7, &clamp_lo_out,
2797 &clamp_hi_out, &v_shift, &offset);
2798 neg_shift_neon(&v[3], &v[11], out + 8, out + 9, &clamp_lo_out,
2799 &clamp_hi_out, &v_shift, &offset);
2800 neg_shift_neon(&v[15], &v[7], out + 10, out + 11, &clamp_lo_out,
2801 &clamp_hi_out, &v_shift, &offset);
2802 neg_shift_neon(&v[5], &v[13], out + 12, out + 13, &clamp_lo_out,
2803 &clamp_hi_out, &v_shift, &offset);
2804 neg_shift_neon(&v[9], &v[1], out + 14, out + 15, &clamp_lo_out,
2805 &clamp_hi_out, &v_shift, &offset);
2806 }
2807 }
2808
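// 16-point identity transform: each coefficient is scaled by
// 2 * NewSqrt2 >> NewSqrt2Bits (approximately 2 * sqrt(2)). The products are
// widened to 64 bits via vmlal_s32 so the scale-and-round cannot overflow a
// 32-bit intermediate, then narrowed and re-interleaved.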
2809 static void iidentity16_neon(int32x4_t *in, int32x4_t *out, int bit,
2810 int do_cols, int bd, int out_shift) {
2811 (void)bit;
2812 int32x2_t fact = vdup_n_s32(2 * NewSqrt2);
2813 int32x4x2_t a0;
2814 int32x4_t zero = vdupq_n_s32(0);
2815 const int64x2_t rnding = vdupq_n_s64(1 << (NewSqrt2Bits - 1));
2816 for (int i = 0; i < 16; i++) {
2817 a0.val[0] = vreinterpretq_s32_s64(
2818 vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(in[i])), fact));
2819 a0.val[0] = vreinterpretq_s32_s64(
2820 vshrq_n_s64(vreinterpretq_s64_s32(a0.val[0]), NewSqrt2Bits));
2821 a0.val[1] = vextq_s32(in[i], zero, 1);
2822 a0.val[1] = vreinterpretq_s32_s64(
2823 vmlal_s32(rnding, vmovn_s64(vreinterpretq_s64_s32(a0.val[1])), fact));
2824 a0.val[1] = vreinterpretq_s32_s64(
2825 vshrq_n_s64(vreinterpretq_s64_s32(a0.val[1]), NewSqrt2Bits));
2826 a0 = vzipq_s32(a0.val[0], a0.val[1]);
2827 #if AOM_ARCH_AARCH64
2828 out[i] = vreinterpretq_s32_s64(vzip1q_s64(
2829 vreinterpretq_s64_s32(a0.val[0]), vreinterpretq_s64_s32(a0.val[1])));
2830 #else
2831 out[i] = vextq_s32(vextq_s32(a0.val[0], a0.val[0], 2), a0.val[1], 2);
2832 #endif
2833 }
2834
2835 if (!do_cols) {
2836 const int log_range = AOMMAX(16, bd + 6);
2837 const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
2838 const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
2839 round_shift_8x8(out, out_shift);
2840 highbd_clamp_s32_neon(out, out, &clamp_lo, &clamp_hi, 16);
2841 }
2842 }
2843
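// Stage 8 of the 64-point IDCT: rotate u[10..13], u[36..39]/u[56..59] and
// u[40..43]/u[52..55] by the cospi constants, and run clamped add/subtract
// butterflies on u[16..31].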
2844 static inline void idct64_stage8_neon(int32x4_t *u, const int32_t *cospi,
2845 const int32x4_t *clamp_lo,
2846 const int32x4_t *clamp_hi,
2847 const int32x4_t *v_bit,
2848 const int32x4_t *rnding) {
2849 int i;
2850 int32x4_t temp1, temp2, temp3, temp4;
2851 temp1 = half_btf_neon_mode10_r(&cospi[32], &u[10], &cospi[32], &u[13], v_bit,
2852 rnding);
2853 u[13] =
2854 half_btf_neon_r(&cospi[32], &u[10], &cospi[32], &u[13], v_bit, rnding);
2855 u[10] = temp1;
2856 temp2 = half_btf_neon_mode10_r(&cospi[32], &u[11], &cospi[32], &u[12], v_bit,
2857 rnding);
2858 u[12] =
2859 half_btf_neon_r(&cospi[32], &u[11], &cospi[32], &u[12], v_bit, rnding);
2860 u[11] = temp2;
2861
2862 for (i = 16; i < 20; ++i) {
2863 addsub_neon(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
2864 addsub_neon(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi);
2865 }
2866
2867 temp1 = half_btf_neon_mode10_r(&cospi[16], &u[36], &cospi[48], &u[59], v_bit,
2868 rnding);
2869 temp2 = half_btf_neon_mode10_r(&cospi[16], &u[37], &cospi[48], &u[58], v_bit,
2870 rnding);
2871 temp3 = half_btf_neon_mode10_r(&cospi[16], &u[38], &cospi[48], &u[57], v_bit,
2872 rnding);
2873 temp4 = half_btf_neon_mode10_r(&cospi[16], &u[39], &cospi[48], &u[56], v_bit,
2874 rnding);
2875 u[56] =
2876 half_btf_neon_r(&cospi[48], &u[39], &cospi[16], &u[56], v_bit, rnding);
2877 u[57] =
2878 half_btf_neon_r(&cospi[48], &u[38], &cospi[16], &u[57], v_bit, rnding);
2879 u[58] =
2880 half_btf_neon_r(&cospi[48], &u[37], &cospi[16], &u[58], v_bit, rnding);
2881 u[59] =
2882 half_btf_neon_r(&cospi[48], &u[36], &cospi[16], &u[59], v_bit, rnding);
2883 u[36] = temp1;
2884 u[37] = temp2;
2885 u[38] = temp3;
2886 u[39] = temp4;
2887
2888 temp1 = half_btf_neon_mode11_r(&cospi[48], &u[40], &cospi[16], &u[55], v_bit,
2889 rnding);
2890 temp2 = half_btf_neon_mode11_r(&cospi[48], &u[41], &cospi[16], &u[54], v_bit,
2891 rnding);
2892 temp3 = half_btf_neon_mode11_r(&cospi[48], &u[42], &cospi[16], &u[53], v_bit,
2893 rnding);
2894 temp4 = half_btf_neon_mode11_r(&cospi[48], &u[43], &cospi[16], &u[52], v_bit,
2895 rnding);
2896 u[52] = half_btf_neon_mode10_r(&cospi[16], &u[43], &cospi[48], &u[52], v_bit,
2897 rnding);
2898 u[53] = half_btf_neon_mode10_r(&cospi[16], &u[42], &cospi[48], &u[53], v_bit,
2899 rnding);
2900 u[54] = half_btf_neon_mode10_r(&cospi[16], &u[41], &cospi[48], &u[54], v_bit,
2901 rnding);
2902 u[55] = half_btf_neon_mode10_r(&cospi[16], &u[40], &cospi[48], &u[55], v_bit,
2903 rnding);
2904 u[40] = temp1;
2905 u[41] = temp2;
2906 u[42] = temp3;
2907 u[43] = temp4;
2908 }
2909
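// Stage 9 of the 64-point IDCT: butterflies on u[0..15], cospi[32] rotations
// of u[20..27], and butterflies pairing u[32..39] with u[40..47] and
// u[48..55] with u[56..63].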
2910 static inline void idct64_stage9_neon(int32x4_t *u, const int32_t *cospi,
2911 const int32x4_t *clamp_lo,
2912 const int32x4_t *clamp_hi,
2913 const int32x4_t *v_bit,
2914 const int32x4_t *rnding) {
2915 int i;
2916 int32x4_t temp1, temp2, temp3, temp4;
2917 for (i = 0; i < 8; ++i) {
2918 addsub_neon(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
2919 }
2920 temp1 = half_btf_neon_mode10_r(&cospi[32], &u[20], &cospi[32], &u[27], v_bit,
2921 rnding);
2922 temp2 = half_btf_neon_mode10_r(&cospi[32], &u[21], &cospi[32], &u[26], v_bit,
2923 rnding);
2924 temp3 = half_btf_neon_mode10_r(&cospi[32], &u[22], &cospi[32], &u[25], v_bit,
2925 rnding);
2926 temp4 = half_btf_neon_mode10_r(&cospi[32], &u[23], &cospi[32], &u[24], v_bit,
2927 rnding);
2928 u[24] =
2929 half_btf_neon_r(&cospi[32], &u[23], &cospi[32], &u[24], v_bit, rnding);
2930 u[25] =
2931 half_btf_neon_r(&cospi[32], &u[22], &cospi[32], &u[25], v_bit, rnding);
2932 u[26] =
2933 half_btf_neon_r(&cospi[32], &u[21], &cospi[32], &u[26], v_bit, rnding);
2934 u[27] =
2935 half_btf_neon_r(&cospi[32], &u[20], &cospi[32], &u[27], v_bit, rnding);
2936 u[20] = temp1;
2937 u[21] = temp2;
2938 u[22] = temp3;
2939 u[23] = temp4;
2940 for (i = 32; i < 40; i++) {
2941 addsub_neon(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
2942 }
2943
2944 for (i = 48; i < 56; i++) {
2945 addsub_neon(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
2946 }
2947 }
2948
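// Stage 10 of the 64-point IDCT: butterflies on u[0..31] and cospi[32]
// rotations of u[40..47] against u[48..55].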
2949 static inline void idct64_stage10_neon(int32x4_t *u, const int32_t *cospi,
2950 const int32x4_t *clamp_lo,
2951 const int32x4_t *clamp_hi,
2952 const int32x4_t *v_bit,
2953 const int32x4_t *rnding) {
2954 int32x4_t temp1, temp2, temp3, temp4;
2955 for (int i = 0; i < 16; i++) {
2956 addsub_neon(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
2957 }
2958 temp1 = half_btf_neon_mode10_r(&cospi[32], &u[40], &cospi[32], &u[55], v_bit,
2959 rnding);
2960 temp2 = half_btf_neon_mode10_r(&cospi[32], &u[41], &cospi[32], &u[54], v_bit,
2961 rnding);
2962 temp3 = half_btf_neon_mode10_r(&cospi[32], &u[42], &cospi[32], &u[53], v_bit,
2963 rnding);
2964 temp4 = half_btf_neon_mode10_r(&cospi[32], &u[43], &cospi[32], &u[52], v_bit,
2965 rnding);
2966 u[52] =
2967 half_btf_neon_r(&cospi[32], &u[43], &cospi[32], &u[52], v_bit, rnding);
2968 u[53] =
2969 half_btf_neon_r(&cospi[32], &u[42], &cospi[32], &u[53], v_bit, rnding);
2970 u[54] =
2971 half_btf_neon_r(&cospi[32], &u[41], &cospi[32], &u[54], v_bit, rnding);
2972 u[55] =
2973 half_btf_neon_r(&cospi[32], &u[40], &cospi[32], &u[55], v_bit, rnding);
2974 u[40] = temp1;
2975 u[41] = temp2;
2976 u[42] = temp3;
2977 u[43] = temp4;
2978
2979 temp1 = half_btf_neon_mode10_r(&cospi[32], &u[44], &cospi[32], &u[51], v_bit,
2980 rnding);
2981 temp2 = half_btf_neon_mode10_r(&cospi[32], &u[45], &cospi[32], &u[50], v_bit,
2982 rnding);
2983 temp3 = half_btf_neon_mode10_r(&cospi[32], &u[46], &cospi[32], &u[49], v_bit,
2984 rnding);
2985 temp4 = half_btf_neon_mode10_r(&cospi[32], &u[47], &cospi[32], &u[48], v_bit,
2986 rnding);
2987 u[48] =
2988 half_btf_neon_r(&cospi[32], &u[47], &cospi[32], &u[48], v_bit, rnding);
2989 u[49] =
2990 half_btf_neon_r(&cospi[32], &u[46], &cospi[32], &u[49], v_bit, rnding);
2991 u[50] =
2992 half_btf_neon_r(&cospi[32], &u[45], &cospi[32], &u[50], v_bit, rnding);
2993 u[51] =
2994 half_btf_neon_r(&cospi[32], &u[44], &cospi[32], &u[51], v_bit, rnding);
2995 u[44] = temp1;
2996 u[45] = temp2;
2997 u[46] = temp3;
2998 u[47] = temp4;
2999 }
3000
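// Stage 11: final add/sub pairs produce all 64 outputs. When do_cols is 0
// the results are additionally rounding-shifted right by out_shift and
// clamped to an intermediate range of AOMMAX(16, bd + 6) bits.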
3001 static inline void idct64_stage11_neon(int32x4_t *u, int32x4_t *out,
3002 int do_cols, int bd, int out_shift,
3003 const int32x4_t *clamp_lo,
3004 const int32x4_t *clamp_hi) {
3005 for (int i = 0; i < 32; i++) {
3006 addsub_neon(u[i], u[63 - i], out + i, out + 63 - i, clamp_lo, clamp_hi);
3007 }
3008
3009 if (!do_cols) {
3010 const int log_range_out = AOMMAX(16, bd + 6);
3011 const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
3012 const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
3013 for (int i = 0; i < 64; i += 4) {
3014 round_shift_4x4(out + i, out_shift);
3015 highbd_clamp_s32_neon(out + i, out + i, &clamp_lo_out, &clamp_hi_out, 4);
3016 }
3017 }
3018 }
3019
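// DC-only fast path: used when only the DC coefficient in[0] is nonzero.
// Every butterfly stage then reduces to a single cospi[32] scaling of the
// DC term, so the one value x is broadcast to all 64 outputs.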
3020 static void idct64x64_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
3021 int do_cols, int bd, int out_shift) {
3022 const int32_t *cospi = cospi_arr(bit);
3023 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3024 int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
3025 int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
3026
3027 const int32x4_t v_bit = vdupq_n_s32(-bit);
3028 const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
3029 {
3030 int32x4_t x;
3031
3032 // stage 1
3033 // stage 2
3034 // stage 3
3035 // stage 4
3036 // stage 5
3037 // stage 6
3038 x = half_btf_0_neon_r(&cospi[32], &in[0], &v_bit, &rnding);
3039
3040 // stage 8
3041 // stage 9
3042 // stage 10
3043 // stage 11
3044 if (!do_cols) {
3045 const int log_range_out = AOMMAX(16, bd + 6);
3046 clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1)));
3047 clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
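      // Manual rounding right shift: add half of (1 << out_shift), then
      // arithmetic-shift right by out_shift (equivalent to vrshlq_s32 with
      // a negative shift amount).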
3048 if (out_shift != 0) {
3049 int32x4_t offset = vdupq_n_s32((1 << out_shift) >> 1);
3050 x = vaddq_s32(x, offset);
3051 x = vshlq_s32(x, vdupq_n_s32(-out_shift));
3052 }
3053 }
3054 x = vmaxq_s32(x, clamp_lo);
3055 x = vminq_s32(x, clamp_hi);
3056     for (int i = 0; i < 64; i++) out[i] = x;
3120 }
3121 }
3122
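// Fast path for at most 8 nonzero input vectors (in[0]..in[7]): the early
// stages collapse to single-term (half_btf_0) multiplies because the other
// inputs are zero; the remaining stages follow the full 64-point flow.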
3123 static void idct64x64_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
3124 int do_cols, int bd, int out_shift) {
3125 int i, j;
3126 const int32_t *cospi = cospi_arr(bit);
3127 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3128 const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
3129 const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
3130 const int32x4_t v_bit = vdupq_n_s32(-bit);
3131 const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
3132 {
3133 int32x4_t u[64];
3134
3135 // stage 1
3136 u[0] = in[0];
3137 u[8] = in[4];
3138 u[16] = in[2];
3139 u[24] = in[6];
3140 u[32] = in[1];
3141 u[40] = in[5];
3142 u[48] = in[3];
3143 u[56] = in[7];
3144
3145 // stage 2
3146 u[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding);
3147 u[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding);
3148 u[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding);
3149 u[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding);
3150 u[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding);
3151 u[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding);
3152 u[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding);
3153 u[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding);
3154
3155 // stage 3
3156 u[31] = half_btf_0_neon_r(&cospi[2], &u[16], &v_bit, &rnding);
3157 u[16] = half_btf_0_neon_r(&cospi[62], &u[16], &v_bit, &rnding);
3158 u[23] = half_btf_0_m_neon_r(&cospi[58], &u[24], &v_bit, &rnding);
3159 u[24] = half_btf_0_neon_r(&cospi[6], &u[24], &v_bit, &rnding);
3160 u[33] = u[32];
3161 u[38] = u[39];
3162 u[41] = u[40];
3163 u[46] = u[47];
3164 u[49] = u[48];
3165 u[54] = u[55];
3166 u[57] = u[56];
3167 u[62] = u[63];
3168
3169 // stage 4
3170 int32x4_t temp1, temp2;
3171 u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);
3172 u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);
3173 u[17] = u[16];
3174 u[22] = u[23];
3175 u[25] = u[24];
3176 u[30] = u[31];
3177
3178 temp1 = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62],
3179 &v_bit, &rnding);
3180 u[62] =
3181 half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding);
3182 u[33] = temp1;
3183
3184 temp2 = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57],
3185 &v_bit, &rnding);
3186 u[38] = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57],
3187 &v_bit, &rnding);
3188 u[57] = temp2;
3189
3190 temp1 = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54],
3191 &v_bit, &rnding);
3192 u[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit,
3193 &rnding);
3194 u[41] = temp1;
3195
3196 temp2 = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49],
3197 &v_bit, &rnding);
3198 u[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49],
3199 &v_bit, &rnding);
3200 u[46] = temp2;
3201
3202 // stage 5
3203 u[9] = u[8];
3204 u[14] = u[15];
3205
3206 temp1 = half_btf_neon_mode10_r(&cospi[8], &u[17], &cospi[56], &u[30],
3207 &v_bit, &rnding);
3208 u[30] =
3209 half_btf_neon_r(&cospi[56], &u[17], &cospi[8], &u[30], &v_bit, &rnding);
3210 u[17] = temp1;
3211
3212 temp2 = half_btf_neon_mode11_r(&cospi[24], &u[22], &cospi[40], &u[25],
3213 &v_bit, &rnding);
3214 u[25] = half_btf_neon_mode10_r(&cospi[40], &u[22], &cospi[24], &u[25],
3215 &v_bit, &rnding);
3216 u[22] = temp2;
3217
3218 u[35] = u[32];
3219 u[34] = u[33];
3220 u[36] = u[39];
3221 u[37] = u[38];
3222 u[43] = u[40];
3223 u[42] = u[41];
3224 u[44] = u[47];
3225 u[45] = u[46];
3226 u[51] = u[48];
3227 u[50] = u[49];
3228 u[52] = u[55];
3229 u[53] = u[54];
3230 u[59] = u[56];
3231 u[58] = u[57];
3232 u[60] = u[63];
3233 u[61] = u[62];
3234
3235 // stage 6
3236 temp1 = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
3237 u[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
3238 u[0] = temp1;
3239
3240 temp2 = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14],
3241 &v_bit, &rnding);
3242 u[14] =
3243 half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
3244 u[9] = temp2;
3245 u[19] = u[16];
3246 u[18] = u[17];
3247 u[20] = u[23];
3248 u[21] = u[22];
3249 u[27] = u[24];
3250 u[26] = u[25];
3251 u[28] = u[31];
3252 u[29] = u[30];
3253
3254 temp1 = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61],
3255 &v_bit, &rnding);
3256 u[61] =
3257 half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding);
3258 u[34] = temp1;
3259 temp2 = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60],
3260 &v_bit, &rnding);
3261 u[60] =
3262 half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding);
3263 u[35] = temp2;
3264 temp1 = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59],
3265 &v_bit, &rnding);
3266 u[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59],
3267 &v_bit, &rnding);
3268 u[36] = temp1;
3269 temp2 = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58],
3270 &v_bit, &rnding);
3271 u[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58],
3272 &v_bit, &rnding);
3273 u[37] = temp2;
3274 temp1 = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53],
3275 &v_bit, &rnding);
3276 u[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit,
3277 &rnding);
3278 u[42] = temp1;
3279 temp2 = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52],
3280 &v_bit, &rnding);
3281 u[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit,
3282 &rnding);
3283 u[43] = temp2;
3284 temp1 = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51],
3285 &v_bit, &rnding);
3286 u[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51],
3287 &v_bit, &rnding);
3288 u[44] = temp1;
3289 temp2 = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50],
3290 &v_bit, &rnding);
3291 u[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50],
3292 &v_bit, &rnding);
3293 u[45] = temp2;
3294
3295 // stage 7
3296 u[3] = u[0];
3297 u[2] = u[1];
3298 u[11] = u[8];
3299 u[10] = u[9];
3300 u[12] = u[15];
3301 u[13] = u[14];
3302
3303 temp1 = half_btf_neon_mode10_r(&cospi[16], &u[18], &cospi[48], &u[29],
3304 &v_bit, &rnding);
3305 u[29] = half_btf_neon_r(&cospi[48], &u[18], &cospi[16], &u[29], &v_bit,
3306 &rnding);
3307 u[18] = temp1;
3308 temp2 = half_btf_neon_mode10_r(&cospi[16], &u[19], &cospi[48], &u[28],
3309 &v_bit, &rnding);
3310 u[28] = half_btf_neon_r(&cospi[48], &u[19], &cospi[16], &u[28], &v_bit,
3311 &rnding);
3312 u[19] = temp2;
3313 temp1 = half_btf_neon_mode11_r(&cospi[48], &u[20], &cospi[16], &u[27],
3314 &v_bit, &rnding);
3315 u[27] = half_btf_neon_mode10_r(&cospi[16], &u[20], &cospi[48], &u[27],
3316 &v_bit, &rnding);
3317 u[20] = temp1;
3318 temp2 = half_btf_neon_mode11_r(&cospi[48], &u[21], &cospi[16], &u[26],
3319 &v_bit, &rnding);
3320 u[26] = half_btf_neon_mode10_r(&cospi[16], &u[21], &cospi[48], &u[26],
3321 &v_bit, &rnding);
3322 u[21] = temp2;
3323 for (i = 32; i < 64; i += 16) {
3324 for (j = i; j < i + 4; j++) {
3325 addsub_neon(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3326 addsub_neon(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3327 &clamp_hi);
3328 }
3329 }
3330
3331 // stage 8
3332 u[7] = u[0];
3333 u[6] = u[1];
3334 u[5] = u[2];
3335 u[4] = u[3];
3336     u[9] = u[9];  // no-op, kept to mirror the scalar reference (bf1[9] = bf0[9])
3337
3338 idct64_stage8_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
3339
3340 // stage 9
3341 idct64_stage9_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
3342
3343 // stage 10
3344 idct64_stage10_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
3345
3346 // stage 11
3347 idct64_stage11_neon(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
3348 }
3349 }
3350
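// Fast path for at most 16 nonzero input vectors (in[0]..in[15]); the early
// stages use single-term (half_btf_0) multiplies, and the later stages match
// the full 64-point transform.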
3351 static void idct64x64_low16_neon(int32x4_t *in, int32x4_t *out, int bit,
3352 int do_cols, int bd, int out_shift) {
3353 int i, j;
3354 const int32_t *cospi = cospi_arr(bit);
3355 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3356 const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
3357 const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
3358 const int32x4_t v_bit = vdupq_n_s32(-bit);
3359 const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
3360
3361 {
3362 int32x4_t u[64];
3363 int32x4_t tmp1, tmp2, tmp3, tmp4;
3364 // stage 1
3365 u[0] = in[0];
3366 u[32] = in[1];
3367 u[36] = in[9];
3368 u[40] = in[5];
3369 u[44] = in[13];
3370 u[48] = in[3];
3371 u[52] = in[11];
3372 u[56] = in[7];
3373 u[60] = in[15];
3374 u[16] = in[2];
3375 u[20] = in[10];
3376 u[24] = in[6];
3377 u[28] = in[14];
3378 u[4] = in[8];
3379 u[8] = in[4];
3380 u[12] = in[12];
3381
3382 // stage 2
3383 u[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding);
3384 u[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding);
3385 u[35] = half_btf_0_m_neon_r(&cospi[49], &u[60], &v_bit, &rnding);
3386 u[60] = half_btf_0_neon_r(&cospi[15], &u[60], &v_bit, &rnding);
3387 u[59] = half_btf_0_neon_r(&cospi[9], &u[36], &v_bit, &rnding);
3388 u[36] = half_btf_0_neon_r(&cospi[55], &u[36], &v_bit, &rnding);
3389 u[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding);
3390 u[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding);
3391 u[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding);
3392 u[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding);
3393 u[43] = half_btf_0_m_neon_r(&cospi[53], &u[52], &v_bit, &rnding);
3394 u[52] = half_btf_0_neon_r(&cospi[11], &u[52], &v_bit, &rnding);
3395 u[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding);
3396 u[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding);
3397 u[51] = half_btf_0_neon_r(&cospi[13], &u[44], &v_bit, &rnding);
3398 u[44] = half_btf_0_neon_r(&cospi[51], &u[44], &v_bit, &rnding);
3399
3400 // stage 3
3401 u[31] = half_btf_0_neon_r(&cospi[2], &u[16], &v_bit, &rnding);
3402 u[16] = half_btf_0_neon_r(&cospi[62], &u[16], &v_bit, &rnding);
3403 u[19] = half_btf_0_m_neon_r(&cospi[50], &u[28], &v_bit, &rnding);
3404 u[28] = half_btf_0_neon_r(&cospi[14], &u[28], &v_bit, &rnding);
3405 u[27] = half_btf_0_neon_r(&cospi[10], &u[20], &v_bit, &rnding);
3406 u[20] = half_btf_0_neon_r(&cospi[54], &u[20], &v_bit, &rnding);
3407 u[23] = half_btf_0_m_neon_r(&cospi[58], &u[24], &v_bit, &rnding);
3408 u[24] = half_btf_0_neon_r(&cospi[6], &u[24], &v_bit, &rnding);
3409 u[33] = u[32];
3410 u[34] = u[35];
3411 u[37] = u[36];
3412 u[38] = u[39];
3413 u[41] = u[40];
3414 u[42] = u[43];
3415 u[45] = u[44];
3416 u[46] = u[47];
3417 u[49] = u[48];
3418 u[50] = u[51];
3419 u[53] = u[52];
3420 u[54] = u[55];
3421 u[57] = u[56];
3422 u[58] = u[59];
3423 u[61] = u[60];
3424 u[62] = u[63];
3425
3426 // stage 4
3427 u[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);
3428 u[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);
3429 u[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding);
3430 u[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding);
3431
3432 u[17] = u[16];
3433 u[18] = u[19];
3434 u[21] = u[20];
3435 u[22] = u[23];
3436 u[25] = u[24];
3437 u[26] = u[27];
3438 u[29] = u[28];
3439 u[30] = u[31];
3440
3441 tmp1 = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62], &v_bit,
3442 &rnding);
3443 tmp2 = half_btf_neon_mode11_r(&cospi[60], &u[34], &cospi[4], &u[61], &v_bit,
3444 &rnding);
3445 tmp3 = half_btf_neon_mode10_r(&cospi[36], &u[37], &cospi[28], &u[58],
3446 &v_bit, &rnding);
3447 tmp4 = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57],
3448 &v_bit, &rnding);
3449 u[57] = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57],
3450 &v_bit, &rnding);
3451 u[58] = half_btf_neon_r(&cospi[28], &u[37], &cospi[36], &u[58], &v_bit,
3452 &rnding);
3453 u[61] = half_btf_neon_mode10_r(&cospi[4], &u[34], &cospi[60], &u[61],
3454 &v_bit, &rnding);
3455 u[62] =
3456 half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding);
3457 u[33] = tmp1;
3458 u[34] = tmp2;
3459 u[37] = tmp3;
3460 u[38] = tmp4;
3461
3462 tmp1 = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54],
3463 &v_bit, &rnding);
3464 tmp2 = half_btf_neon_mode11_r(&cospi[44], &u[42], &cospi[20], &u[53],
3465 &v_bit, &rnding);
3466     tmp3 = half_btf_neon_mode10_r(&cospi[52], &u[45], &cospi[12], &u[50],
3467                                   &v_bit, &rnding);
3468 tmp4 = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49],
3469 &v_bit, &rnding);
3470 u[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49],
3471 &v_bit, &rnding);
3472 u[50] = half_btf_neon_r(&cospi[12], &u[45], &cospi[52], &u[50], &v_bit,
3473 &rnding);
3474 u[53] = half_btf_neon_mode10_r(&cospi[20], &u[42], &cospi[44], &u[53],
3475 &v_bit, &rnding);
3476 u[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit,
3477 &rnding);
3478 u[41] = tmp1;
3479 u[42] = tmp2;
3480 u[45] = tmp3;
3481 u[46] = tmp4;
3482
3483 // stage 5
3484 u[7] = half_btf_0_neon_r(&cospi[8], &u[4], &v_bit, &rnding);
3485 u[4] = half_btf_0_neon_r(&cospi[56], &u[4], &v_bit, &rnding);
3486
3487 u[9] = u[8];
3488 u[10] = u[11];
3489 u[13] = u[12];
3490 u[14] = u[15];
3491
3492 tmp1 = half_btf_neon_mode10_r(&cospi[8], &u[17], &cospi[56], &u[30], &v_bit,
3493 &rnding);
3494 tmp2 = half_btf_neon_mode11_r(&cospi[56], &u[18], &cospi[8], &u[29], &v_bit,
3495 &rnding);
3496 tmp3 = half_btf_neon_mode10_r(&cospi[40], &u[21], &cospi[24], &u[26],
3497 &v_bit, &rnding);
3498 tmp4 = half_btf_neon_mode11_r(&cospi[24], &u[22], &cospi[40], &u[25],
3499 &v_bit, &rnding);
3500 u[25] = half_btf_neon_mode10_r(&cospi[40], &u[22], &cospi[24], &u[25],
3501 &v_bit, &rnding);
3502 u[26] = half_btf_neon_r(&cospi[24], &u[21], &cospi[40], &u[26], &v_bit,
3503 &rnding);
3504 u[29] = half_btf_neon_mode10_r(&cospi[8], &u[18], &cospi[56], &u[29],
3505 &v_bit, &rnding);
3506 u[30] =
3507 half_btf_neon_r(&cospi[56], &u[17], &cospi[8], &u[30], &v_bit, &rnding);
3508 u[17] = tmp1;
3509 u[18] = tmp2;
3510 u[21] = tmp3;
3511 u[22] = tmp4;
3512
3513 for (i = 32; i < 64; i += 8) {
3514 addsub_neon(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3515 &clamp_hi);
3516 addsub_neon(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3517 &clamp_hi);
3518
3519 addsub_neon(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3520 &clamp_hi);
3521 addsub_neon(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3522 &clamp_hi);
3523 }
3524
3525 // stage 6
3526 tmp1 = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
3527 u[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
3528 u[0] = tmp1;
3529 u[5] = u[4];
3530 u[6] = u[7];
3531
3532 tmp1 = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
3533 &rnding);
3534 u[14] =
3535 half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
3536 u[9] = tmp1;
3537     tmp2 = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13],
3538                                   &v_bit, &rnding);
3539 u[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13],
3540 &v_bit, &rnding);
3541 u[10] = tmp2;
3542
3543 for (i = 16; i < 32; i += 8) {
3544 addsub_neon(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3545 &clamp_hi);
3546 addsub_neon(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3547 &clamp_hi);
3548
3549 addsub_neon(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3550 &clamp_hi);
3551 addsub_neon(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3552 &clamp_hi);
3553 }
3554
3555 tmp1 = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61], &v_bit,
3556 &rnding);
3557 tmp2 = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60], &v_bit,
3558 &rnding);
3559 tmp3 = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59], &v_bit,
3560 &rnding);
3561 tmp4 = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58], &v_bit,
3562 &rnding);
3563 u[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58],
3564 &v_bit, &rnding);
3565 u[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59],
3566 &v_bit, &rnding);
3567 u[60] =
3568 half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding);
3569 u[61] =
3570 half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding);
3571 u[34] = tmp1;
3572 u[35] = tmp2;
3573 u[36] = tmp3;
3574 u[37] = tmp4;
3575
3576 tmp1 = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53],
3577 &v_bit, &rnding);
3578 tmp2 = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52],
3579 &v_bit, &rnding);
3580 tmp3 = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51],
3581 &v_bit, &rnding);
3582 tmp4 = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50],
3583 &v_bit, &rnding);
3584 u[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50],
3585 &v_bit, &rnding);
3586 u[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51],
3587 &v_bit, &rnding);
3588 u[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit,
3589 &rnding);
3590 u[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit,
3591 &rnding);
3592 u[42] = tmp1;
3593 u[43] = tmp2;
3594 u[44] = tmp3;
3595 u[45] = tmp4;
3596
3597 // stage 7
3598 u[3] = u[0];
3599 u[2] = u[1];
3600 tmp1 = half_btf_neon_mode10_r(&cospi[32], &u[5], &cospi[32], &u[6], &v_bit,
3601 &rnding);
3602 u[6] =
3603 half_btf_neon_r(&cospi[32], &u[5], &cospi[32], &u[6], &v_bit, &rnding);
3604 u[5] = tmp1;
3605 addsub_neon(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
3606 addsub_neon(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
3607 addsub_neon(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
3608 addsub_neon(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
3609
3610 tmp1 = half_btf_neon_mode10_r(&cospi[16], &u[18], &cospi[48], &u[29],
3611 &v_bit, &rnding);
3612 tmp2 = half_btf_neon_mode10_r(&cospi[16], &u[19], &cospi[48], &u[28],
3613 &v_bit, &rnding);
3614 tmp3 = half_btf_neon_mode11_r(&cospi[48], &u[20], &cospi[16], &u[27],
3615 &v_bit, &rnding);
3616 tmp4 = half_btf_neon_mode11_r(&cospi[48], &u[21], &cospi[16], &u[26],
3617 &v_bit, &rnding);
3618 u[26] = half_btf_neon_mode10_r(&cospi[16], &u[21], &cospi[48], &u[26],
3619 &v_bit, &rnding);
3620 u[27] = half_btf_neon_mode10_r(&cospi[16], &u[20], &cospi[48], &u[27],
3621 &v_bit, &rnding);
3622 u[28] = half_btf_neon_r(&cospi[48], &u[19], &cospi[16], &u[28], &v_bit,
3623 &rnding);
3624 u[29] = half_btf_neon_r(&cospi[48], &u[18], &cospi[16], &u[29], &v_bit,
3625 &rnding);
3626 u[18] = tmp1;
3627 u[19] = tmp2;
3628 u[20] = tmp3;
3629 u[21] = tmp4;
3630
3631 for (i = 32; i < 64; i += 16) {
3632 for (j = i; j < i + 4; j++) {
3633 addsub_neon(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3634 addsub_neon(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3635 &clamp_hi);
3636 }
3637 }
3638
3639 // stage 8
3640 for (i = 0; i < 4; ++i) {
3641 addsub_neon(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
3642 }
3643
3644 idct64_stage8_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
3645
3646 // stage 9
3647 idct64_stage9_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
3648
3649 // stage 10
3650 idct64_stage10_neon(u, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
3651
3652 // stage 11
3653 idct64_stage11_neon(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
3654 }
3655 }
3656
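// General 64-point path. AV1 only codes the lowest 32 frequencies in each
// dimension of a 64-point transform, so at most in[0]..in[31] are nonzero
// here. u[] and v[] ping-pong between butterfly stages; every operation is
// lane-wise, so each int32x4_t lane carries an independent 1-D transform
// and four transforms are computed per call.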
3657 static void idct64x64_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
3658 int bd, int out_shift) {
3659 int i, j;
3660 const int32_t *cospi = cospi_arr(bit);
3661 const int32x4_t v_bit = vdupq_n_s32(-bit);
3662 const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
3663
3664 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3665 const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
3666 const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
3667
3668 {
3669 int32x4_t u[64], v[64];
3670
3671 // stage 1
3672 u[32] = in[1];
3673 u[34] = in[17];
3674 u[36] = in[9];
3675 u[38] = in[25];
3676 u[40] = in[5];
3677 u[42] = in[21];
3678 u[44] = in[13];
3679 u[46] = in[29];
3680 u[48] = in[3];
3681 u[50] = in[19];
3682 u[52] = in[11];
3683 u[54] = in[27];
3684 u[56] = in[7];
3685 u[58] = in[23];
3686 u[60] = in[15];
3687 u[62] = in[31];
3688
3689 v[16] = in[2];
3690 v[18] = in[18];
3691 v[20] = in[10];
3692 v[22] = in[26];
3693 v[24] = in[6];
3694 v[26] = in[22];
3695 v[28] = in[14];
3696 v[30] = in[30];
3697
3698 u[8] = in[4];
3699 u[10] = in[20];
3700 u[12] = in[12];
3701 u[14] = in[28];
3702
3703 v[4] = in[8];
3704 v[6] = in[24];
3705
3706 u[0] = in[0];
3707 u[2] = in[16];
3708
3709 // stage 2
3710 v[32] = half_btf_0_neon_r(&cospi[63], &u[32], &v_bit, &rnding);
3711 v[33] = half_btf_0_m_neon_r(&cospi[33], &u[62], &v_bit, &rnding);
3712 v[34] = half_btf_0_neon_r(&cospi[47], &u[34], &v_bit, &rnding);
3713 v[35] = half_btf_0_m_neon_r(&cospi[49], &u[60], &v_bit, &rnding);
3714 v[36] = half_btf_0_neon_r(&cospi[55], &u[36], &v_bit, &rnding);
3715 v[37] = half_btf_0_m_neon_r(&cospi[41], &u[58], &v_bit, &rnding);
3716 v[38] = half_btf_0_neon_r(&cospi[39], &u[38], &v_bit, &rnding);
3717 v[39] = half_btf_0_m_neon_r(&cospi[57], &u[56], &v_bit, &rnding);
3718 v[40] = half_btf_0_neon_r(&cospi[59], &u[40], &v_bit, &rnding);
3719 v[41] = half_btf_0_m_neon_r(&cospi[37], &u[54], &v_bit, &rnding);
3720 v[42] = half_btf_0_neon_r(&cospi[43], &u[42], &v_bit, &rnding);
3721 v[43] = half_btf_0_m_neon_r(&cospi[53], &u[52], &v_bit, &rnding);
3722 v[44] = half_btf_0_neon_r(&cospi[51], &u[44], &v_bit, &rnding);
3723 v[45] = half_btf_0_m_neon_r(&cospi[45], &u[50], &v_bit, &rnding);
3724 v[46] = half_btf_0_neon_r(&cospi[35], &u[46], &v_bit, &rnding);
3725 v[47] = half_btf_0_m_neon_r(&cospi[61], &u[48], &v_bit, &rnding);
3726 v[48] = half_btf_0_neon_r(&cospi[3], &u[48], &v_bit, &rnding);
3727 v[49] = half_btf_0_neon_r(&cospi[29], &u[46], &v_bit, &rnding);
3728 v[50] = half_btf_0_neon_r(&cospi[19], &u[50], &v_bit, &rnding);
3729 v[51] = half_btf_0_neon_r(&cospi[13], &u[44], &v_bit, &rnding);
3730 v[52] = half_btf_0_neon_r(&cospi[11], &u[52], &v_bit, &rnding);
3731 v[53] = half_btf_0_neon_r(&cospi[21], &u[42], &v_bit, &rnding);
3732 v[54] = half_btf_0_neon_r(&cospi[27], &u[54], &v_bit, &rnding);
3733 v[55] = half_btf_0_neon_r(&cospi[5], &u[40], &v_bit, &rnding);
3734 v[56] = half_btf_0_neon_r(&cospi[7], &u[56], &v_bit, &rnding);
3735 v[57] = half_btf_0_neon_r(&cospi[25], &u[38], &v_bit, &rnding);
3736 v[58] = half_btf_0_neon_r(&cospi[23], &u[58], &v_bit, &rnding);
3737 v[59] = half_btf_0_neon_r(&cospi[9], &u[36], &v_bit, &rnding);
3738 v[60] = half_btf_0_neon_r(&cospi[15], &u[60], &v_bit, &rnding);
3739 v[61] = half_btf_0_neon_r(&cospi[17], &u[34], &v_bit, &rnding);
3740 v[62] = half_btf_0_neon_r(&cospi[31], &u[62], &v_bit, &rnding);
3741 v[63] = half_btf_0_neon_r(&cospi[1], &u[32], &v_bit, &rnding);
3742
3743 // stage 3
3744 u[16] = half_btf_0_neon_r(&cospi[62], &v[16], &v_bit, &rnding);
3745 u[17] = half_btf_0_m_neon_r(&cospi[34], &v[30], &v_bit, &rnding);
3746 u[18] = half_btf_0_neon_r(&cospi[46], &v[18], &v_bit, &rnding);
3747 u[19] = half_btf_0_m_neon_r(&cospi[50], &v[28], &v_bit, &rnding);
3748 u[20] = half_btf_0_neon_r(&cospi[54], &v[20], &v_bit, &rnding);
3749 u[21] = half_btf_0_m_neon_r(&cospi[42], &v[26], &v_bit, &rnding);
3750 u[22] = half_btf_0_neon_r(&cospi[38], &v[22], &v_bit, &rnding);
3751 u[23] = half_btf_0_m_neon_r(&cospi[58], &v[24], &v_bit, &rnding);
3752 u[24] = half_btf_0_neon_r(&cospi[6], &v[24], &v_bit, &rnding);
3753 u[25] = half_btf_0_neon_r(&cospi[26], &v[22], &v_bit, &rnding);
3754 u[26] = half_btf_0_neon_r(&cospi[22], &v[26], &v_bit, &rnding);
3755 u[27] = half_btf_0_neon_r(&cospi[10], &v[20], &v_bit, &rnding);
3756 u[28] = half_btf_0_neon_r(&cospi[14], &v[28], &v_bit, &rnding);
3757 u[29] = half_btf_0_neon_r(&cospi[18], &v[18], &v_bit, &rnding);
3758 u[30] = half_btf_0_neon_r(&cospi[30], &v[30], &v_bit, &rnding);
3759 u[31] = half_btf_0_neon_r(&cospi[2], &v[16], &v_bit, &rnding);
3760
3761 for (i = 32; i < 64; i += 4) {
3762 addsub_neon(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
3763 &clamp_hi);
3764 addsub_neon(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
3765 &clamp_hi);
3766 }
3767
3768 // stage 4
3769 v[8] = half_btf_0_neon_r(&cospi[60], &u[8], &v_bit, &rnding);
3770 v[9] = half_btf_0_m_neon_r(&cospi[36], &u[14], &v_bit, &rnding);
3771 v[10] = half_btf_0_neon_r(&cospi[44], &u[10], &v_bit, &rnding);
3772 v[11] = half_btf_0_m_neon_r(&cospi[52], &u[12], &v_bit, &rnding);
3773 v[12] = half_btf_0_neon_r(&cospi[12], &u[12], &v_bit, &rnding);
3774 v[13] = half_btf_0_neon_r(&cospi[20], &u[10], &v_bit, &rnding);
3775 v[14] = half_btf_0_neon_r(&cospi[28], &u[14], &v_bit, &rnding);
3776 v[15] = half_btf_0_neon_r(&cospi[4], &u[8], &v_bit, &rnding);
3777
3778 for (i = 16; i < 32; i += 4) {
3779 addsub_neon(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
3780 &clamp_hi);
3781 addsub_neon(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
3782 &clamp_hi);
3783 }
3784
3785 for (i = 32; i < 64; i += 4) {
3786 v[i + 0] = u[i + 0];
3787 v[i + 3] = u[i + 3];
3788 }
3789
3790 v[33] = half_btf_neon_mode10_r(&cospi[4], &u[33], &cospi[60], &u[62],
3791 &v_bit, &rnding);
3792 v[34] = half_btf_neon_mode11_r(&cospi[60], &u[34], &cospi[4], &u[61],
3793 &v_bit, &rnding);
3794 v[37] = half_btf_neon_mode10_r(&cospi[36], &u[37], &cospi[28], &u[58],
3795 &v_bit, &rnding);
3796 v[38] = half_btf_neon_mode11_r(&cospi[28], &u[38], &cospi[36], &u[57],
3797 &v_bit, &rnding);
3798 v[41] = half_btf_neon_mode10_r(&cospi[20], &u[41], &cospi[44], &u[54],
3799 &v_bit, &rnding);
3800 v[42] = half_btf_neon_mode11_r(&cospi[44], &u[42], &cospi[20], &u[53],
3801 &v_bit, &rnding);
3802 v[45] = half_btf_neon_mode10_r(&cospi[52], &u[45], &cospi[12], &u[50],
3803 &v_bit, &rnding);
3804 v[46] = half_btf_neon_mode11_r(&cospi[12], &u[46], &cospi[52], &u[49],
3805 &v_bit, &rnding);
3806 v[49] = half_btf_neon_mode10_r(&cospi[52], &u[46], &cospi[12], &u[49],
3807 &v_bit, &rnding);
3808 v[50] = half_btf_neon_r(&cospi[12], &u[45], &cospi[52], &u[50], &v_bit,
3809 &rnding);
3810 v[53] = half_btf_neon_mode10_r(&cospi[20], &u[42], &cospi[44], &u[53],
3811 &v_bit, &rnding);
3812 v[54] = half_btf_neon_r(&cospi[44], &u[41], &cospi[20], &u[54], &v_bit,
3813 &rnding);
3814 v[57] = half_btf_neon_mode10_r(&cospi[36], &u[38], &cospi[28], &u[57],
3815 &v_bit, &rnding);
3816 v[58] = half_btf_neon_r(&cospi[28], &u[37], &cospi[36], &u[58], &v_bit,
3817 &rnding);
3818 v[61] = half_btf_neon_mode10_r(&cospi[4], &u[34], &cospi[60], &u[61],
3819 &v_bit, &rnding);
3820 v[62] =
3821 half_btf_neon_r(&cospi[60], &u[33], &cospi[4], &u[62], &v_bit, &rnding);
3822
3823 // stage 5
3824 u[4] = half_btf_0_neon_r(&cospi[56], &v[4], &v_bit, &rnding);
3825 u[5] = half_btf_0_m_neon_r(&cospi[40], &v[6], &v_bit, &rnding);
3826 u[6] = half_btf_0_neon_r(&cospi[24], &v[6], &v_bit, &rnding);
3827 u[7] = half_btf_0_neon_r(&cospi[8], &v[4], &v_bit, &rnding);
3828
3829 for (i = 8; i < 16; i += 4) {
3830 addsub_neon(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
3831 &clamp_hi);
3832 addsub_neon(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
3833 &clamp_hi);
3834 }
3835
3836 for (i = 16; i < 32; i += 4) {
3837 u[i + 0] = v[i + 0];
3838 u[i + 3] = v[i + 3];
3839 }
3840
3841 u[17] = half_btf_neon_mode10_r(&cospi[8], &v[17], &cospi[56], &v[30],
3842 &v_bit, &rnding);
3843 u[18] = half_btf_neon_mode11_r(&cospi[56], &v[18], &cospi[8], &v[29],
3844 &v_bit, &rnding);
3845 u[21] = half_btf_neon_mode10_r(&cospi[40], &v[21], &cospi[24], &v[26],
3846 &v_bit, &rnding);
3847 u[22] = half_btf_neon_mode11_r(&cospi[24], &v[22], &cospi[40], &v[25],
3848 &v_bit, &rnding);
3849 u[25] = half_btf_neon_mode10_r(&cospi[40], &v[22], &cospi[24], &v[25],
3850 &v_bit, &rnding);
3851 u[26] = half_btf_neon_r(&cospi[24], &v[21], &cospi[40], &v[26], &v_bit,
3852 &rnding);
3853 u[29] = half_btf_neon_mode10_r(&cospi[8], &v[18], &cospi[56], &v[29],
3854 &v_bit, &rnding);
3855 u[30] =
3856 half_btf_neon_r(&cospi[56], &v[17], &cospi[8], &v[30], &v_bit, &rnding);
3857
3858 for (i = 32; i < 64; i += 8) {
3859 addsub_neon(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3860 &clamp_hi);
3861 addsub_neon(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3862 &clamp_hi);
3863
3864 addsub_neon(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3865 &clamp_hi);
3866 addsub_neon(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3867 &clamp_hi);
3868 }
3869
3870 // stage 6
3871 v[0] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
3872 v[1] = half_btf_0_neon_r(&cospi[32], &u[0], &v_bit, &rnding);
3873 v[2] = half_btf_0_neon_r(&cospi[48], &u[2], &v_bit, &rnding);
3874 v[3] = half_btf_0_neon_r(&cospi[16], &u[2], &v_bit, &rnding);
3875
3876 addsub_neon(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
3877 addsub_neon(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
3878
3879 for (i = 8; i < 16; i += 4) {
3880 v[i + 0] = u[i + 0];
3881 v[i + 3] = u[i + 3];
3882 }
3883
3884 v[9] = half_btf_neon_mode10_r(&cospi[16], &u[9], &cospi[48], &u[14], &v_bit,
3885 &rnding);
3886 v[10] = half_btf_neon_mode11_r(&cospi[48], &u[10], &cospi[16], &u[13],
3887 &v_bit, &rnding);
3888 v[13] = half_btf_neon_mode10_r(&cospi[16], &u[10], &cospi[48], &u[13],
3889 &v_bit, &rnding);
3890 v[14] =
3891 half_btf_neon_r(&cospi[48], &u[9], &cospi[16], &u[14], &v_bit, &rnding);
3892
3893 for (i = 16; i < 32; i += 8) {
3894 addsub_neon(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
3895 &clamp_hi);
3896 addsub_neon(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
3897 &clamp_hi);
3898
3899 addsub_neon(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
3900 &clamp_hi);
3901 addsub_neon(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
3902 &clamp_hi);
3903 }
3904
3905 for (i = 32; i < 64; i += 8) {
3906 v[i + 0] = u[i + 0];
3907 v[i + 1] = u[i + 1];
3908 v[i + 6] = u[i + 6];
3909 v[i + 7] = u[i + 7];
3910 }
3911
3912 v[34] = half_btf_neon_mode10_r(&cospi[8], &u[34], &cospi[56], &u[61],
3913 &v_bit, &rnding);
3914 v[35] = half_btf_neon_mode10_r(&cospi[8], &u[35], &cospi[56], &u[60],
3915 &v_bit, &rnding);
3916 v[36] = half_btf_neon_mode11_r(&cospi[56], &u[36], &cospi[8], &u[59],
3917 &v_bit, &rnding);
3918 v[37] = half_btf_neon_mode11_r(&cospi[56], &u[37], &cospi[8], &u[58],
3919 &v_bit, &rnding);
3920 v[42] = half_btf_neon_mode10_r(&cospi[40], &u[42], &cospi[24], &u[53],
3921 &v_bit, &rnding);
3922 v[43] = half_btf_neon_mode10_r(&cospi[40], &u[43], &cospi[24], &u[52],
3923 &v_bit, &rnding);
3924 v[44] = half_btf_neon_mode11_r(&cospi[24], &u[44], &cospi[40], &u[51],
3925 &v_bit, &rnding);
3926 v[45] = half_btf_neon_mode11_r(&cospi[24], &u[45], &cospi[40], &u[50],
3927 &v_bit, &rnding);
3928 v[50] = half_btf_neon_mode10_r(&cospi[40], &u[45], &cospi[24], &u[50],
3929 &v_bit, &rnding);
3930 v[51] = half_btf_neon_mode10_r(&cospi[40], &u[44], &cospi[24], &u[51],
3931 &v_bit, &rnding);
3932 v[52] = half_btf_neon_r(&cospi[24], &u[43], &cospi[40], &u[52], &v_bit,
3933 &rnding);
3934 v[53] = half_btf_neon_r(&cospi[24], &u[42], &cospi[40], &u[53], &v_bit,
3935 &rnding);
3936 v[58] = half_btf_neon_mode10_r(&cospi[8], &u[37], &cospi[56], &u[58],
3937 &v_bit, &rnding);
3938 v[59] = half_btf_neon_mode10_r(&cospi[8], &u[36], &cospi[56], &u[59],
3939 &v_bit, &rnding);
3940 v[60] =
3941 half_btf_neon_r(&cospi[56], &u[35], &cospi[8], &u[60], &v_bit, &rnding);
3942 v[61] =
3943 half_btf_neon_r(&cospi[56], &u[34], &cospi[8], &u[61], &v_bit, &rnding);
3944
3945 // stage 7
3946 addsub_neon(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
3947 addsub_neon(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
3948
3949 u[4] = v[4];
3950 u[7] = v[7];
3951 u[5] = half_btf_neon_mode10_r(&cospi[32], &v[5], &cospi[32], &v[6], &v_bit,
3952 &rnding);
3953 u[6] =
3954 half_btf_neon_r(&cospi[32], &v[5], &cospi[32], &v[6], &v_bit, &rnding);
3955
3956 addsub_neon(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
3957 addsub_neon(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
3958 addsub_neon(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
3959 addsub_neon(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
3960
3961 for (i = 16; i < 32; i += 8) {
3962 u[i + 0] = v[i + 0];
3963 u[i + 1] = v[i + 1];
3964 u[i + 6] = v[i + 6];
3965 u[i + 7] = v[i + 7];
3966 }
3967
3968 u[18] = half_btf_neon_mode10_r(&cospi[16], &v[18], &cospi[48], &v[29],
3969 &v_bit, &rnding);
3970 u[19] = half_btf_neon_mode10_r(&cospi[16], &v[19], &cospi[48], &v[28],
3971 &v_bit, &rnding);
3972 u[20] = half_btf_neon_mode11_r(&cospi[48], &v[20], &cospi[16], &v[27],
3973 &v_bit, &rnding);
3974 u[21] = half_btf_neon_mode11_r(&cospi[48], &v[21], &cospi[16], &v[26],
3975 &v_bit, &rnding);
3976 u[26] = half_btf_neon_mode10_r(&cospi[16], &v[21], &cospi[48], &v[26],
3977 &v_bit, &rnding);
3978 u[27] = half_btf_neon_mode10_r(&cospi[16], &v[20], &cospi[48], &v[27],
3979 &v_bit, &rnding);
3980 u[28] = half_btf_neon_r(&cospi[48], &v[19], &cospi[16], &v[28], &v_bit,
3981 &rnding);
3982 u[29] = half_btf_neon_r(&cospi[48], &v[18], &cospi[16], &v[29], &v_bit,
3983 &rnding);
3984
3985 for (i = 32; i < 64; i += 16) {
3986 for (j = i; j < i + 4; j++) {
3987 addsub_neon(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3988 addsub_neon(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3989 &clamp_hi);
3990 }
3991 }
3992
3993 // stage 8
3994 for (i = 0; i < 4; ++i) {
3995 addsub_neon(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
3996 }
3997
3998 v[8] = u[8];
3999 v[9] = u[9];
4000 v[14] = u[14];
4001 v[15] = u[15];
4002
4003 v[10] = half_btf_neon_mode10_r(&cospi[32], &u[10], &cospi[32], &u[13],
4004 &v_bit, &rnding);
4005 v[11] = half_btf_neon_mode10_r(&cospi[32], &u[11], &cospi[32], &u[12],
4006 &v_bit, &rnding);
4007 v[12] = half_btf_neon_r(&cospi[32], &u[11], &cospi[32], &u[12], &v_bit,
4008 &rnding);
4009 v[13] = half_btf_neon_r(&cospi[32], &u[10], &cospi[32], &u[13], &v_bit,
4010 &rnding);
4011
4012 for (i = 16; i < 20; ++i) {
4013 addsub_neon(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
4014 addsub_neon(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
4015 &clamp_hi);
4016 }
4017
4018 for (i = 32; i < 36; ++i) {
4019 v[i] = u[i];
4020 v[i + 12] = u[i + 12];
4021 v[i + 16] = u[i + 16];
4022 v[i + 28] = u[i + 28];
4023 }
4024
4025 v[36] = half_btf_neon_mode10_r(&cospi[16], &u[36], &cospi[48], &u[59],
4026 &v_bit, &rnding);
4027 v[37] = half_btf_neon_mode10_r(&cospi[16], &u[37], &cospi[48], &u[58],
4028 &v_bit, &rnding);
4029 v[38] = half_btf_neon_mode10_r(&cospi[16], &u[38], &cospi[48], &u[57],
4030 &v_bit, &rnding);
4031 v[39] = half_btf_neon_mode10_r(&cospi[16], &u[39], &cospi[48], &u[56],
4032 &v_bit, &rnding);
4033 v[40] = half_btf_neon_mode11_r(&cospi[48], &u[40], &cospi[16], &u[55],
4034 &v_bit, &rnding);
4035 v[41] = half_btf_neon_mode11_r(&cospi[48], &u[41], &cospi[16], &u[54],
4036 &v_bit, &rnding);
4037 v[42] = half_btf_neon_mode11_r(&cospi[48], &u[42], &cospi[16], &u[53],
4038 &v_bit, &rnding);
4039 v[43] = half_btf_neon_mode11_r(&cospi[48], &u[43], &cospi[16], &u[52],
4040 &v_bit, &rnding);
4041 v[52] = half_btf_neon_mode10_r(&cospi[16], &u[43], &cospi[48], &u[52],
4042 &v_bit, &rnding);
4043 v[53] = half_btf_neon_mode10_r(&cospi[16], &u[42], &cospi[48], &u[53],
4044 &v_bit, &rnding);
4045 v[54] = half_btf_neon_mode10_r(&cospi[16], &u[41], &cospi[48], &u[54],
4046 &v_bit, &rnding);
4047 v[55] = half_btf_neon_mode10_r(&cospi[16], &u[40], &cospi[48], &u[55],
4048 &v_bit, &rnding);
4049 v[56] = half_btf_neon_r(&cospi[48], &u[39], &cospi[16], &u[56], &v_bit,
4050 &rnding);
4051 v[57] = half_btf_neon_r(&cospi[48], &u[38], &cospi[16], &u[57], &v_bit,
4052 &rnding);
4053 v[58] = half_btf_neon_r(&cospi[48], &u[37], &cospi[16], &u[58], &v_bit,
4054 &rnding);
4055 v[59] = half_btf_neon_r(&cospi[48], &u[36], &cospi[16], &u[59], &v_bit,
4056 &rnding);
4057
4058 // stage 9
4059 for (i = 0; i < 8; ++i) {
4060 addsub_neon(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
4061 }
4062
4063 for (i = 16; i < 20; ++i) {
4064 u[i] = v[i];
4065 u[i + 12] = v[i + 12];
4066 }
4067
4068 u[20] = half_btf_neon_mode10_r(&cospi[32], &v[20], &cospi[32], &v[27],
4069 &v_bit, &rnding);
4070 u[21] = half_btf_neon_mode10_r(&cospi[32], &v[21], &cospi[32], &v[26],
4071 &v_bit, &rnding);
4072 u[22] = half_btf_neon_mode10_r(&cospi[32], &v[22], &cospi[32], &v[25],
4073 &v_bit, &rnding);
4074 u[23] = half_btf_neon_mode10_r(&cospi[32], &v[23], &cospi[32], &v[24],
4075 &v_bit, &rnding);
4076 u[24] = half_btf_neon_r(&cospi[32], &v[23], &cospi[32], &v[24], &v_bit,
4077 &rnding);
4078 u[25] = half_btf_neon_r(&cospi[32], &v[22], &cospi[32], &v[25], &v_bit,
4079 &rnding);
4080 u[26] = half_btf_neon_r(&cospi[32], &v[21], &cospi[32], &v[26], &v_bit,
4081 &rnding);
4082 u[27] = half_btf_neon_r(&cospi[32], &v[20], &cospi[32], &v[27], &v_bit,
4083 &rnding);
4084
4085 for (i = 32; i < 40; i++) {
4086 addsub_neon(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
4087 }
4088
4089 for (i = 48; i < 56; i++) {
4090 addsub_neon(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
4091 }
4092
4093 // stage 10
4094 for (i = 0; i < 16; i++) {
4095 addsub_neon(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
4096 }
4097
4098 for (i = 32; i < 40; i++) v[i] = u[i];
4099
4100 v[40] = half_btf_neon_mode10_r(&cospi[32], &u[40], &cospi[32], &u[55],
4101 &v_bit, &rnding);
4102 v[41] = half_btf_neon_mode10_r(&cospi[32], &u[41], &cospi[32], &u[54],
4103 &v_bit, &rnding);
4104 v[42] = half_btf_neon_mode10_r(&cospi[32], &u[42], &cospi[32], &u[53],
4105 &v_bit, &rnding);
4106 v[43] = half_btf_neon_mode10_r(&cospi[32], &u[43], &cospi[32], &u[52],
4107 &v_bit, &rnding);
4108 v[44] = half_btf_neon_mode10_r(&cospi[32], &u[44], &cospi[32], &u[51],
4109 &v_bit, &rnding);
4110 v[45] = half_btf_neon_mode10_r(&cospi[32], &u[45], &cospi[32], &u[50],
4111 &v_bit, &rnding);
4112 v[46] = half_btf_neon_mode10_r(&cospi[32], &u[46], &cospi[32], &u[49],
4113 &v_bit, &rnding);
4114 v[47] = half_btf_neon_mode10_r(&cospi[32], &u[47], &cospi[32], &u[48],
4115 &v_bit, &rnding);
4116 v[48] = half_btf_neon_r(&cospi[32], &u[47], &cospi[32], &u[48], &v_bit,
4117 &rnding);
4118 v[49] = half_btf_neon_r(&cospi[32], &u[46], &cospi[32], &u[49], &v_bit,
4119 &rnding);
4120 v[50] = half_btf_neon_r(&cospi[32], &u[45], &cospi[32], &u[50], &v_bit,
4121 &rnding);
4122 v[51] = half_btf_neon_r(&cospi[32], &u[44], &cospi[32], &u[51], &v_bit,
4123 &rnding);
4124 v[52] = half_btf_neon_r(&cospi[32], &u[43], &cospi[32], &u[52], &v_bit,
4125 &rnding);
4126 v[53] = half_btf_neon_r(&cospi[32], &u[42], &cospi[32], &u[53], &v_bit,
4127 &rnding);
4128 v[54] = half_btf_neon_r(&cospi[32], &u[41], &cospi[32], &u[54], &v_bit,
4129 &rnding);
4130 v[55] = half_btf_neon_r(&cospi[32], &u[40], &cospi[32], &u[55], &v_bit,
4131 &rnding);
4132
4133 for (i = 56; i < 64; i++) v[i] = u[i];
4134
4135 // stage 11
4136 for (i = 0; i < 32; i++) {
4137 addsub_neon(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo,
4138 &clamp_hi);
4139 }
4140
4141 if (!do_cols) {
4142 const int log_range_out = AOMMAX(16, bd + 6);
4143 const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
4144 const int32x4_t clamp_hi_out =
4145 vdupq_n_s32((1 << (log_range_out - 1)) - 1);
4146 for (i = 0; i < 64; i += 4) {
4147 round_shift_4x4(out + i, out_shift);
4148 highbd_clamp_s32_neon(out + i, out + i, &clamp_lo_out, &clamp_hi_out,
4149 4);
4150 }
4151 }
4152 }
4153 }
4154
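// 32-point DC-only fast path: only in[0] is nonzero, so the transform
// reduces to a single cospi[32] scaling that is replicated to all 32
// outputs.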
4155 static void idct32x32_low1_neon(int32x4_t *in, int32x4_t *out, int bit,
4156 int do_cols, int bd, int out_shift) {
4157 const int32_t *cospi = cospi_arr(bit);
4158 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4159 int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
4160 int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
4161 int32x4_t bf1;
4162 const int32x4_t v_bit = vdupq_n_s32(-bit);
4163 const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
4164 // stage 0-1
4165 bf1 = in[0];
4166
4167 // stage 2-5
4168 bf1 = half_btf_0_neon_r(&cospi[32], &bf1, &v_bit, &rnding);
4169
4170 // stage 6-9
4171 if (do_cols) {
4172 bf1 = vmaxq_s32(bf1, clamp_lo);
4173 bf1 = vminq_s32(bf1, clamp_hi);
4174 } else {
4175 const int log_range_out = AOMMAX(16, bd + 6);
4176 clamp_lo = vdupq_n_s32(-(1 << (log_range_out - 1)));
4177 clamp_hi = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
4178 if (out_shift != 0) {
4179 bf1 = vrshlq_s32(bf1, vdupq_n_s32(-out_shift));
4180 }
4181 }
4182
4183 bf1 = vmaxq_s32(bf1, clamp_lo);
4184 bf1 = vminq_s32(bf1, clamp_hi);
4185
4186 for (int i = 0; i < 32; i++) out[i] = bf1;
4187 }
4188
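// 32-point fast path for at most 8 nonzero input vectors: stages 2-4 use
// single-term (half_btf_0) multiplies, and the shared idct32_stageN_neon
// helpers finish the transform.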
4189 static void idct32x32_low8_neon(int32x4_t *in, int32x4_t *out, int bit,
4190 int do_cols, int bd, int out_shift) {
4191 const int32_t *cospi = cospi_arr(bit);
4192 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4193 const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
4194 const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
4195 int32x4_t bf1[32];
4196 const int32x4_t v_bit = vdupq_n_s32(-bit);
4197 const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
4198
4199 // stage 0-1
4200 bf1[0] = in[0];
4201 bf1[4] = in[4];
4202 bf1[8] = in[2];
4203 bf1[12] = in[6];
4204 bf1[16] = in[1];
4205 bf1[20] = in[5];
4206 bf1[24] = in[3];
4207 bf1[28] = in[7];
4208
4209 // stage 2
4210 bf1[31] = half_btf_0_neon_r(&cospi[2], &bf1[16], &v_bit, &rnding);
4211 bf1[16] = half_btf_0_neon_r(&cospi[62], &bf1[16], &v_bit, &rnding);
4212 bf1[19] = half_btf_0_m_neon_r(&cospi[50], &bf1[28], &v_bit, &rnding);
4213 bf1[28] = half_btf_0_neon_r(&cospi[14], &bf1[28], &v_bit, &rnding);
4214 bf1[27] = half_btf_0_neon_r(&cospi[10], &bf1[20], &v_bit, &rnding);
4215 bf1[20] = half_btf_0_neon_r(&cospi[54], &bf1[20], &v_bit, &rnding);
4216 bf1[23] = half_btf_0_m_neon_r(&cospi[58], &bf1[24], &v_bit, &rnding);
4217 bf1[24] = half_btf_0_neon_r(&cospi[6], &bf1[24], &v_bit, &rnding);
4218
4219 // stage 3
4220 bf1[15] = half_btf_0_neon_r(&cospi[4], &bf1[8], &v_bit, &rnding);
4221 bf1[8] = half_btf_0_neon_r(&cospi[60], &bf1[8], &v_bit, &rnding);
4222
4223 bf1[11] = half_btf_0_m_neon_r(&cospi[52], &bf1[12], &v_bit, &rnding);
4224 bf1[12] = half_btf_0_neon_r(&cospi[12], &bf1[12], &v_bit, &rnding);
4225 bf1[17] = bf1[16];
4226 bf1[18] = bf1[19];
4227 bf1[21] = bf1[20];
4228 bf1[22] = bf1[23];
4229 bf1[25] = bf1[24];
4230 bf1[26] = bf1[27];
4231 bf1[29] = bf1[28];
4232 bf1[30] = bf1[31];
4233
4234   // stage 4
4235 bf1[7] = half_btf_0_neon_r(&cospi[8], &bf1[4], &v_bit, &rnding);
4236 bf1[4] = half_btf_0_neon_r(&cospi[56], &bf1[4], &v_bit, &rnding);
4237
4238 bf1[9] = bf1[8];
4239 bf1[10] = bf1[11];
4240 bf1[13] = bf1[12];
4241 bf1[14] = bf1[15];
4242
4243 idct32_stage4_neon(bf1, cospi, &v_bit, &rnding);
4244
4245 // stage 5
4246 bf1[0] = half_btf_0_neon_r(&cospi[32], &bf1[0], &v_bit, &rnding);
4247 bf1[1] = bf1[0];
4248 bf1[5] = bf1[4];
4249 bf1[6] = bf1[7];
4250
4251 idct32_stage5_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
4252
4253 // stage 6
4254 bf1[3] = bf1[0];
4255 bf1[2] = bf1[1];
4256
4257 idct32_stage6_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
4258
4259 // stage 7
4260 idct32_stage7_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
4261
4262 // stage 8
4263 idct32_stage8_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
4264
4265 // stage 9
4266 idct32_stage9_neon(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
4267 }
4268
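// 32-point fast path for at most 16 nonzero input vectors; the remaining
// stages reuse the idct32_stageN_neon helpers.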
4269 static void idct32x32_low16_neon(int32x4_t *in, int32x4_t *out, int bit,
4270 int do_cols, int bd, int out_shift) {
4271 const int32_t *cospi = cospi_arr(bit);
4272 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4273 const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
4274 const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
4275 int32x4_t bf1[32];
4276 const int32x4_t v_bit = vdupq_n_s32(-bit);
4277 const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
4278
4279 // stage 0-1
4280
4281 bf1[0] = in[0];
4282 bf1[2] = in[8];
4283 bf1[4] = in[4];
4284 bf1[6] = in[12];
4285 bf1[8] = in[2];
4286 bf1[10] = in[10];
4287 bf1[12] = in[6];
4288 bf1[14] = in[14];
4289 bf1[16] = in[1];
4290 bf1[18] = in[9];
4291 bf1[20] = in[5];
4292 bf1[22] = in[13];
4293 bf1[24] = in[3];
4294 bf1[26] = in[11];
4295 bf1[28] = in[7];
4296 bf1[30] = in[15];
4297
4298 // stage 2
4299 bf1[31] = half_btf_0_neon_r(&cospi[2], &bf1[16], &v_bit, &rnding);
4300 bf1[16] = half_btf_0_neon_r(&cospi[62], &bf1[16], &v_bit, &rnding);
4301 bf1[17] = half_btf_0_m_neon_r(&cospi[34], &bf1[30], &v_bit, &rnding);
4302 bf1[30] = half_btf_0_neon_r(&cospi[30], &bf1[30], &v_bit, &rnding);
4303 bf1[29] = half_btf_0_neon_r(&cospi[18], &bf1[18], &v_bit, &rnding);
4304 bf1[18] = half_btf_0_neon_r(&cospi[46], &bf1[18], &v_bit, &rnding);
4305 bf1[19] = half_btf_0_m_neon_r(&cospi[50], &bf1[28], &v_bit, &rnding);
4306 bf1[28] = half_btf_0_neon_r(&cospi[14], &bf1[28], &v_bit, &rnding);
4307 bf1[27] = half_btf_0_neon_r(&cospi[10], &bf1[20], &v_bit, &rnding);
4308 bf1[20] = half_btf_0_neon_r(&cospi[54], &bf1[20], &v_bit, &rnding);
4309 bf1[21] = half_btf_0_m_neon_r(&cospi[42], &bf1[26], &v_bit, &rnding);
4310 bf1[26] = half_btf_0_neon_r(&cospi[22], &bf1[26], &v_bit, &rnding);
4311 bf1[25] = half_btf_0_neon_r(&cospi[26], &bf1[22], &v_bit, &rnding);
4312 bf1[22] = half_btf_0_neon_r(&cospi[38], &bf1[22], &v_bit, &rnding);
4313 bf1[23] = half_btf_0_m_neon_r(&cospi[58], &bf1[24], &v_bit, &rnding);
4314 bf1[24] = half_btf_0_neon_r(&cospi[6], &bf1[24], &v_bit, &rnding);
4315
4316 // stage 3
4317 bf1[15] = half_btf_0_neon_r(&cospi[4], &bf1[8], &v_bit, &rnding);
4318 bf1[8] = half_btf_0_neon_r(&cospi[60], &bf1[8], &v_bit, &rnding);
4319 bf1[9] = half_btf_0_m_neon_r(&cospi[36], &bf1[14], &v_bit, &rnding);
4320 bf1[14] = half_btf_0_neon_r(&cospi[28], &bf1[14], &v_bit, &rnding);
4321 bf1[13] = half_btf_0_neon_r(&cospi[20], &bf1[10], &v_bit, &rnding);
4322 bf1[10] = half_btf_0_neon_r(&cospi[44], &bf1[10], &v_bit, &rnding);
4323 bf1[11] = half_btf_0_m_neon_r(&cospi[52], &bf1[12], &v_bit, &rnding);
4324 bf1[12] = half_btf_0_neon_r(&cospi[12], &bf1[12], &v_bit, &rnding);
4325
4326 addsub_neon(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
4327 addsub_neon(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
4328 addsub_neon(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
4329 addsub_neon(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
4330 addsub_neon(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
4331 addsub_neon(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
4332 addsub_neon(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
4333 addsub_neon(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
4334 // stage 4
4335 bf1[7] = half_btf_0_neon_r(&cospi[8], &bf1[4], &v_bit, &rnding);
4336 bf1[4] = half_btf_0_neon_r(&cospi[56], &bf1[4], &v_bit, &rnding);
4337 bf1[5] = half_btf_0_m_neon_r(&cospi[40], &bf1[6], &v_bit, &rnding);
4338 bf1[6] = half_btf_0_neon_r(&cospi[24], &bf1[6], &v_bit, &rnding);
4339
4340 addsub_neon(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
4341 addsub_neon(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
4342 addsub_neon(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
4343 addsub_neon(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
4344
4345 idct32_stage4_neon(bf1, cospi, &v_bit, &rnding);
4346
4347 // stage 5
4348 bf1[0] = half_btf_0_neon_r(&cospi[32], &bf1[0], &v_bit, &rnding);
4349 bf1[1] = bf1[0];
4350 bf1[3] = half_btf_0_neon_r(&cospi[16], &bf1[2], &v_bit, &rnding);
4351 bf1[2] = half_btf_0_neon_r(&cospi[48], &bf1[2], &v_bit, &rnding);
4352
4353 addsub_neon(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
4354 addsub_neon(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
4355
4356 idct32_stage5_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
4357
4358 // stage 6
4359 addsub_neon(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
4360 addsub_neon(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
4361
4362 idct32_stage6_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
4363
4364 // stage 7
4365 idct32_stage7_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
4366
4367 // stage 8
4368 idct32_stage8_neon(bf1, cospi, &clamp_lo, &clamp_hi, &v_bit, &rnding);
4369 // stage 9
4370 idct32_stage9_neon(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
4371 }
4372
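// Full 32-point inverse DCT: all 32 input vectors may be nonzero; bf0 and
// bf1 ping-pong between butterfly stages.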
4373 static void idct32x32_neon(int32x4_t *in, int32x4_t *out, int bit, int do_cols,
4374 int bd, int out_shift) {
4375 const int32_t *cospi = cospi_arr(bit);
4376 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4377 const int32x4_t clamp_lo = vdupq_n_s32(-(1 << (log_range - 1)));
4378 const int32x4_t clamp_hi = vdupq_n_s32((1 << (log_range - 1)) - 1);
4379 int32x4_t bf1[32], bf0[32];
4380 const int32x4_t v_bit = vdupq_n_s32(-bit);
4381 const int32x4_t rnding = vdupq_n_s32(1 << (bit - 1));
4382 // stage 0
4383 // stage 1
4384 bf1[0] = in[0];
4385 bf1[1] = in[16];
4386 bf1[2] = in[8];
4387 bf1[3] = in[24];
4388 bf1[4] = in[4];
4389 bf1[5] = in[20];
4390 bf1[6] = in[12];
4391 bf1[7] = in[28];
4392 bf1[8] = in[2];
4393 bf1[9] = in[18];
4394 bf1[10] = in[10];
4395 bf1[11] = in[26];
4396 bf1[12] = in[6];
4397 bf1[13] = in[22];
4398 bf1[14] = in[14];
4399 bf1[15] = in[30];
4400 bf1[16] = in[1];
4401 bf1[17] = in[17];
4402 bf1[18] = in[9];
4403 bf1[19] = in[25];
4404 bf1[20] = in[5];
4405 bf1[21] = in[21];
4406 bf1[22] = in[13];
4407 bf1[23] = in[29];
4408 bf1[24] = in[3];
4409 bf1[25] = in[19];
4410 bf1[26] = in[11];
4411 bf1[27] = in[27];
4412 bf1[28] = in[7];
4413 bf1[29] = in[23];
4414 bf1[30] = in[15];
4415 bf1[31] = in[31];
4416
4417 // stage 2
4418 for (int i = 0; i < 16; i++) bf0[i] = bf1[i];
4419
4420 bf0[16] = half_btf_neon_mode01_r(&cospi[62], &bf1[16], &cospi[2], &bf1[31],
4421 &v_bit, &rnding);
4422 bf0[17] = half_btf_neon_mode01_r(&cospi[30], &bf1[17], &cospi[34], &bf1[30],
4423 &v_bit, &rnding);
4424 bf0[18] = half_btf_neon_mode01_r(&cospi[46], &bf1[18], &cospi[18], &bf1[29],
4425 &v_bit, &rnding);
4426 bf0[19] = half_btf_neon_mode01_r(&cospi[14], &bf1[19], &cospi[50], &bf1[28],
4427 &v_bit, &rnding);
4428 bf0[20] = half_btf_neon_mode01_r(&cospi[54], &bf1[20], &cospi[10], &bf1[27],
4429 &v_bit, &rnding);
4430 bf0[21] = half_btf_neon_mode01_r(&cospi[22], &bf1[21], &cospi[42], &bf1[26],
4431 &v_bit, &rnding);
4432 bf0[22] = half_btf_neon_mode01_r(&cospi[38], &bf1[22], &cospi[26], &bf1[25],
4433 &v_bit, &rnding);
4434 bf0[23] = half_btf_neon_mode01_r(&cospi[6], &bf1[23], &cospi[58], &bf1[24],
4435 &v_bit, &rnding);
4436 bf0[24] = half_btf_neon_r(&cospi[58], &bf1[23], &cospi[6], &bf1[24], &v_bit,
4437 &rnding);
4438 bf0[25] = half_btf_neon_r(&cospi[26], &bf1[22], &cospi[38], &bf1[25], &v_bit,
4439 &rnding);
4440 bf0[26] = half_btf_neon_r(&cospi[42], &bf1[21], &cospi[22], &bf1[26], &v_bit,
4441 &rnding);
4442 bf0[27] = half_btf_neon_r(&cospi[10], &bf1[20], &cospi[54], &bf1[27], &v_bit,
4443 &rnding);
4444 bf0[28] = half_btf_neon_r(&cospi[50], &bf1[19], &cospi[14], &bf1[28], &v_bit,
4445 &rnding);
4446 bf0[29] = half_btf_neon_r(&cospi[18], &bf1[18], &cospi[46], &bf1[29], &v_bit,
4447 &rnding);
4448 bf0[30] = half_btf_neon_r(&cospi[34], &bf1[17], &cospi[30], &bf1[30], &v_bit,
4449 &rnding);
4450 bf0[31] = half_btf_neon_r(&cospi[2], &bf1[16], &cospi[62], &bf1[31], &v_bit,
4451 &rnding);
4452
4453 // stage 3
4454 for (int i = 0; i < 8; i++) bf1[i] = bf0[i];
4455
4456 bf1[8] = half_btf_neon_mode01_r(&cospi[60], &bf0[8], &cospi[4], &bf0[15],
4457 &v_bit, &rnding);
4458 bf1[9] = half_btf_neon_mode01_r(&cospi[28], &bf0[9], &cospi[36], &bf0[14],
4459 &v_bit, &rnding);
4460 bf1[10] = half_btf_neon_mode01_r(&cospi[44], &bf0[10], &cospi[20], &bf0[13],
4461 &v_bit, &rnding);
4462 bf1[11] = half_btf_neon_mode01_r(&cospi[12], &bf0[11], &cospi[52], &bf0[12],
4463 &v_bit, &rnding);
4464 bf1[12] = half_btf_neon_r(&cospi[52], &bf0[11], &cospi[12], &bf0[12], &v_bit,
4465 &rnding);
4466 bf1[13] = half_btf_neon_r(&cospi[20], &bf0[10], &cospi[44], &bf0[13], &v_bit,
4467 &rnding);
4468 bf1[14] = half_btf_neon_r(&cospi[36], &bf0[9], &cospi[28], &bf0[14], &v_bit,
4469 &rnding);
4470 bf1[15] = half_btf_neon_r(&cospi[4], &bf0[8], &cospi[60], &bf0[15], &v_bit,
4471 &rnding);
4472
4473 addsub_neon(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
4474 addsub_neon(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
4475 addsub_neon(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
4476 addsub_neon(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
4477 addsub_neon(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
4478 addsub_neon(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
4479 addsub_neon(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
4480 addsub_neon(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
4481
4482 // stage 4
4483 bf0[0] = bf1[0];
4484 bf0[1] = bf1[1];
4485 bf0[2] = bf1[2];
4486 bf0[3] = bf1[3];
4487 bf0[4] = half_btf_neon_mode01_r(&cospi[56], &bf1[4], &cospi[8], &bf1[7],
4488 &v_bit, &rnding);
4489 bf0[5] = half_btf_neon_mode01_r(&cospi[24], &bf1[5], &cospi[40], &bf1[6],
4490 &v_bit, &rnding);
4491 bf0[6] = half_btf_neon_r(&cospi[40], &bf1[5], &cospi[24], &bf1[6], &v_bit,
4492 &rnding);
4493 bf0[7] =
4494 half_btf_neon_r(&cospi[8], &bf1[4], &cospi[56], &bf1[7], &v_bit, &rnding);
4495
4496 addsub_neon(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
4497 addsub_neon(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
4498 addsub_neon(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
4499 addsub_neon(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
4500
4501 bf0[16] = bf1[16];
4502 bf0[17] = half_btf_neon_mode10_r(&cospi[8], &bf1[17], &cospi[56], &bf1[30],
4503 &v_bit, &rnding);
4504 bf0[18] = half_btf_neon_mode11_r(&cospi[56], &bf1[18], &cospi[8], &bf1[29],
4505 &v_bit, &rnding);
4506 bf0[19] = bf1[19];
4507 bf0[20] = bf1[20];
4508 bf0[21] = half_btf_neon_mode10_r(&cospi[40], &bf1[21], &cospi[24], &bf1[26],
4509 &v_bit, &rnding);
4510 bf0[22] = half_btf_neon_mode11_r(&cospi[24], &bf1[22], &cospi[40], &bf1[25],
4511 &v_bit, &rnding);
4512 bf0[23] = bf1[23];
4513 bf0[24] = bf1[24];
4514 bf0[25] = half_btf_neon_mode10_r(&cospi[40], &bf1[22], &cospi[24], &bf1[25],
4515 &v_bit, &rnding);
4516 bf0[26] = half_btf_neon_r(&cospi[24], &bf1[21], &cospi[40], &bf1[26], &v_bit,
4517 &rnding);
4518 bf0[27] = bf1[27];
4519 bf0[28] = bf1[28];
4520 bf0[29] = half_btf_neon_mode10_r(&cospi[8], &bf1[18], &cospi[56], &bf1[29],
4521 &v_bit, &rnding);
4522 bf0[30] = half_btf_neon_r(&cospi[56], &bf1[17], &cospi[8], &bf1[30], &v_bit,
4523 &rnding);
4524 bf0[31] = bf1[31];
4525
4526 // stage 5
4527 bf1[0] = half_btf_neon_r(&cospi[32], &bf0[0], &cospi[32], &bf0[1], &v_bit,
4528 &rnding);
4529 bf1[1] = half_btf_neon_mode01_r(&cospi[32], &bf0[0], &cospi[32], &bf0[1],
4530 &v_bit, &rnding);
4531 bf1[2] = half_btf_neon_mode01_r(&cospi[48], &bf0[2], &cospi[16], &bf0[3],
4532 &v_bit, &rnding);
4533 bf1[3] = half_btf_neon_r(&cospi[16], &bf0[2], &cospi[48], &bf0[3], &v_bit,
4534 &rnding);
4535 addsub_neon(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
4536 addsub_neon(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
4537 bf1[8] = bf0[8];
4538 bf1[9] = half_btf_neon_mode10_r(&cospi[16], &bf0[9], &cospi[48], &bf0[14],
4539 &v_bit, &rnding);
4540 bf1[10] = half_btf_neon_mode11_r(&cospi[48], &bf0[10], &cospi[16], &bf0[13],
4541 &v_bit, &rnding);
4542 bf1[11] = bf0[11];
4543 bf1[12] = bf0[12];
4544 bf1[13] = half_btf_neon_mode10_r(&cospi[16], &bf0[10], &cospi[48], &bf0[13],
4545 &v_bit, &rnding);
4546 bf1[14] = half_btf_neon_r(&cospi[48], &bf0[9], &cospi[16], &bf0[14], &v_bit,
4547 &rnding);
4548 bf1[15] = bf0[15];
4549 addsub_neon(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
4550 addsub_neon(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
4551 addsub_neon(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
4552 addsub_neon(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
4553 addsub_neon(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
4554 addsub_neon(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
4555 addsub_neon(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
4556 addsub_neon(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
4557
4558 // stage 6
4559 addsub_neon(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
4560 addsub_neon(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
4561 bf0[4] = bf1[4];
4562 bf0[5] = half_btf_neon_mode10_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6],
4563 &v_bit, &rnding);
4564 bf0[6] = half_btf_neon_r(&cospi[32], &bf1[5], &cospi[32], &bf1[6], &v_bit,
4565 &rnding);
4566 bf0[7] = bf1[7];
4567 addsub_neon(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
4568 addsub_neon(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
4569 addsub_neon(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
4570 addsub_neon(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
4571 bf0[16] = bf1[16];
4572 bf0[17] = bf1[17];
4573 bf0[18] = half_btf_neon_mode10_r(&cospi[16], &bf1[18], &cospi[48], &bf1[29],
4574 &v_bit, &rnding);
4575 bf0[19] = half_btf_neon_mode10_r(&cospi[16], &bf1[19], &cospi[48], &bf1[28],
4576 &v_bit, &rnding);
4577 bf0[20] = half_btf_neon_mode11_r(&cospi[48], &bf1[20], &cospi[16], &bf1[27],
4578 &v_bit, &rnding);
4579 bf0[21] = half_btf_neon_mode11_r(&cospi[48], &bf1[21], &cospi[16], &bf1[26],
4580 &v_bit, &rnding);
4581 bf0[22] = bf1[22];
4582 bf0[23] = bf1[23];
4583 bf0[24] = bf1[24];
4584 bf0[25] = bf1[25];
4585 bf0[26] = half_btf_neon_mode10_r(&cospi[16], &bf1[21], &cospi[48], &bf1[26],
4586 &v_bit, &rnding);
4587 bf0[27] = half_btf_neon_mode10_r(&cospi[16], &bf1[20], &cospi[48], &bf1[27],
4588 &v_bit, &rnding);
4589 bf0[28] = half_btf_neon_r(&cospi[48], &bf1[19], &cospi[16], &bf1[28], &v_bit,
4590 &rnding);
4591 bf0[29] = half_btf_neon_r(&cospi[48], &bf1[18], &cospi[16], &bf1[29], &v_bit,
4592 &rnding);
4593 bf0[30] = bf1[30];
4594 bf0[31] = bf1[31];
4595
4596 // stage 7
4597 addsub_neon(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
4598 addsub_neon(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
4599 addsub_neon(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
4600 addsub_neon(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
4601 bf1[8] = bf0[8];
4602 bf1[9] = bf0[9];
4603 bf1[10] = half_btf_neon_mode10_r(&cospi[32], &bf0[10], &cospi[32], &bf0[13],
4604 &v_bit, &rnding);
4605 bf1[11] = half_btf_neon_mode10_r(&cospi[32], &bf0[11], &cospi[32], &bf0[12],
4606 &v_bit, &rnding);
4607 bf1[12] = half_btf_neon_r(&cospi[32], &bf0[11], &cospi[32], &bf0[12], &v_bit,
4608 &rnding);
4609 bf1[13] = half_btf_neon_r(&cospi[32], &bf0[10], &cospi[32], &bf0[13], &v_bit,
4610 &rnding);
4611 bf1[14] = bf0[14];
4612 bf1[15] = bf0[15];
4613 addsub_neon(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
4614 addsub_neon(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
4615 addsub_neon(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
4616 addsub_neon(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
4617 addsub_neon(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
4618 addsub_neon(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
4619 addsub_neon(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
4620 addsub_neon(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
4621
4622 // stage 8
4623 addsub_neon(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
4624 addsub_neon(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
4625 addsub_neon(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
4626 addsub_neon(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
4627 addsub_neon(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
4628 addsub_neon(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
4629 addsub_neon(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
4630 addsub_neon(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
4631 bf0[16] = bf1[16];
4632 bf0[17] = bf1[17];
4633 bf0[18] = bf1[18];
4634 bf0[19] = bf1[19];
4635 bf0[20] = half_btf_neon_mode10_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27],
4636 &v_bit, &rnding);
4637 bf0[21] = half_btf_neon_mode10_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26],
4638 &v_bit, &rnding);
4639 bf0[22] = half_btf_neon_mode10_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25],
4640 &v_bit, &rnding);
4641 bf0[23] = half_btf_neon_mode10_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24],
4642 &v_bit, &rnding);
4643 bf0[24] = half_btf_neon_r(&cospi[32], &bf1[23], &cospi[32], &bf1[24], &v_bit,
4644 &rnding);
4645 bf0[25] = half_btf_neon_r(&cospi[32], &bf1[22], &cospi[32], &bf1[25], &v_bit,
4646 &rnding);
4647 bf0[26] = half_btf_neon_r(&cospi[32], &bf1[21], &cospi[32], &bf1[26], &v_bit,
4648 &rnding);
4649 bf0[27] = half_btf_neon_r(&cospi[32], &bf1[20], &cospi[32], &bf1[27], &v_bit,
4650 &rnding);
4651 bf0[28] = bf1[28];
4652 bf0[29] = bf1[29];
4653 bf0[30] = bf1[30];
4654 bf0[31] = bf1[31];
4655
4656 // stage 9
4657 addsub_neon(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
4658 addsub_neon(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
4659 addsub_neon(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
4660 addsub_neon(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
4661 addsub_neon(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
4662 addsub_neon(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
4663 addsub_neon(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
4664 addsub_neon(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
4665 addsub_neon(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
4666 addsub_neon(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
4667 addsub_neon(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
4668 addsub_neon(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
4669 addsub_neon(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
4670 addsub_neon(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
4671 addsub_neon(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
4672 addsub_neon(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
4673
4674 if (!do_cols) {
4675 const int log_range_out = AOMMAX(16, bd + 6);
4676 const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
4677 const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
4678 round_shift_8x8(out, out_shift);
4679 round_shift_8x8(out + 16, out_shift);
4680 highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32);
4681 }
4682 }
4683
4684 static void iidentity32_neon(int32x4_t *in, int32x4_t *out, int bit,
4685 int do_cols, int bd, int out_shift) {
4686 (void)bit;
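// Hedged note: the 32-point identity transform in AV1 scales by 4, hence the
// unconditional left shift by 2 below; no cosine tables are needed.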
4687 for (int i = 0; i < 32; i += 16) {
4688 out[i] = vshlq_n_s32(in[i], 2);
4689 out[i + 1] = vshlq_n_s32(in[i + 1], 2);
4690 out[i + 2] = vshlq_n_s32(in[i + 2], 2);
4691 out[i + 3] = vshlq_n_s32(in[i + 3], 2);
4692 out[i + 4] = vshlq_n_s32(in[i + 4], 2);
4693 out[i + 5] = vshlq_n_s32(in[i + 5], 2);
4694 out[i + 6] = vshlq_n_s32(in[i + 6], 2);
4695 out[i + 7] = vshlq_n_s32(in[i + 7], 2);
4696 out[i + 8] = vshlq_n_s32(in[i + 8], 2);
4697 out[i + 9] = vshlq_n_s32(in[i + 9], 2);
4698 out[i + 10] = vshlq_n_s32(in[i + 10], 2);
4699 out[i + 11] = vshlq_n_s32(in[i + 11], 2);
4700 out[i + 12] = vshlq_n_s32(in[i + 12], 2);
4701 out[i + 13] = vshlq_n_s32(in[i + 13], 2);
4702 out[i + 14] = vshlq_n_s32(in[i + 14], 2);
4703 out[i + 15] = vshlq_n_s32(in[i + 15], 2);
4704 }
4705
4706 if (!do_cols) {
4707 const int log_range_out = AOMMAX(16, bd + 6);
4708 const int32x4_t clamp_lo_out = vdupq_n_s32(-(1 << (log_range_out - 1)));
4709 const int32x4_t clamp_hi_out = vdupq_n_s32((1 << (log_range_out - 1)) - 1);
4710 round_shift_8x8(out, out_shift);
4711 round_shift_8x8(out + 16, out_shift);
4712 highbd_clamp_s32_neon(out, out, &clamp_lo_out, &clamp_hi_out, 32);
4713 }
4714 }
4715
4716 // 1D itx types
4717 typedef enum ATTRIBUTE_PACKED {
4718 IDCT_1D,
4719 IADST_1D,
4720 IFLIPADST_1D = IADST_1D,
4721 IIDENTITY_1D,
4722 ITX_TYPES_1D,
4723 } ITX_TYPE_1D;
4724
4725 static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
4726 IDCT_1D, IADST_1D, IDCT_1D, IADST_1D,
4727 IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D,
4728 IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D,
4729 IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
4730 };
4731 static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
4732 IDCT_1D, IDCT_1D, IADST_1D, IADST_1D,
4733 IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
4734 IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
4735 IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D,
4736 };
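// Hedged note: vitx_1d_tab picks the 1D kernel applied to columns (vertical
// pass) and hitx_1d_tab the one applied to rows (horizontal pass), indexed by
// TX_TYPE. For example, ADST_DCT uses IADST_1D for columns and IDCT_1D for
// rows; FLIPADST aliases IADST_1D here because the flip itself is applied
// separately via get_flip_cfg() (ud_flip / lr_flip).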
4737
4738 static const transform_1d_neon
4739 highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
4740 {
4741 { idct4x4_neon, NULL, NULL, NULL },
4742 { iadst4x4_neon, NULL, NULL, NULL },
4743 { iidentity4_neon, iidentity4_neon, iidentity4_neon, NULL },
4744 },
4745 { { idct8x8_low1_neon, idct8x8_new_neon, NULL, NULL },
4746 { iadst8x8_low1_neon, iadst8x8_new_neon, NULL, NULL },
4747 { iidentity8_neon, iidentity8_neon, NULL, NULL } },
4748 {
4749 { idct16x16_low1_neon, idct16x16_low8_neon, idct16x16_neon, NULL },
4750 { iadst16x16_low1_neon, iadst16x16_low8_neon, iadst16x16_neon, NULL },
4751 { iidentity16_neon, NULL, iidentity16_neon, NULL },
4752 },
4753 { { idct32x32_low1_neon, idct32x32_low8_neon, idct32x32_low16_neon,
4754 idct32x32_neon },
4755 { NULL, NULL, NULL, NULL },
4756 { iidentity32_neon, NULL, NULL, NULL } },
4757 { { idct64x64_low1_neon, idct64x64_low8_neon, idct64x64_low16_neon,
4758 idct64x64_neon },
4759 { NULL, NULL, NULL, NULL },
4760 { NULL, NULL, NULL, NULL } }
4761 };
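// Hedged note on the trailing index of this table: it selects a kernel
// specialised for how many leading coefficients may be nonzero -- roughly 1,
// 8, 16, or all -- as far as that transform size provides variants (the
// 4-point kernels only have the full version at index 0). The index is
// derived from eobx/eoby via lowbd_txfm_all_1d_zeros_idx further below.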
4762
4763 void av1_inv_txfm2d_add_4x8_neon(const tran_low_t *input, uint16_t *output,
4764 int stride, TX_TYPE tx_type, const int bd) {
4765 TX_SIZE tx_size = TX_4X8;
4766 int32x4_t buf1[32] = { vdupq_n_s32(0) };
4767
4768 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
4769 const int txw_idx = get_txw_idx(tx_size);
4770 const int txh_idx = get_txh_idx(tx_size);
4771 const int txfm_size_col = tx_size_wide[tx_size];
4772 const int txfm_size_row = tx_size_high[tx_size];
4773 const transform_1d_neon row_txfm =
4774 highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
4775 const transform_1d_neon col_txfm =
4776 highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1];
4777 const int input_stride = AOMMIN(32, txfm_size_row);
4778
4779 assert(col_txfm != NULL);
4780 assert(row_txfm != NULL);
4781 int ud_flip, lr_flip;
4782 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4783
4784 // 1st stage: row transform
4785 int32x4_t buf0[8];
4786 load_buffer_32bit_input(input, input_stride, buf0, txfm_size_col);
4787 load_buffer_32bit_input(input + 4, input_stride, buf0 + 4, txfm_size_col);
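// Hedged note: 2:1 rectangular transforms carry an extra 1/Sqrt2 scale so the
// composite 2D gain matches the square sizes; that factor is applied here by
// round_shift_rect_array_32_neon.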
4788 round_shift_rect_array_32_neon(buf0, buf0, txfm_size_row);
4789 row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
4790 row_txfm(buf0 + 4, buf0 + 4, INV_COS_BIT, 0, bd, -shift[0]);
4791
4792 if (lr_flip) {
4793 TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2],
4794 buf1[3]);
4795
4796 TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6],
4797 buf1[7]);
4798 } else {
4799 TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2],
4800 buf1[3]);
4801
4802 TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6],
4803 buf1[7]);
4804 }
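// The 4x4 transposes above re-order the row-transform output into column
// order for the second pass; when lr_flip is set the rows are fed in reverse
// so the horizontal flip is folded into the transpose.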
4805
4806 // 2nd stage: column transform
4807 col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);
4808
4809 round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]);
4810
4811 // write to buffer
4812 highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row,
4813 bd);
4814 }
4815
4816 void av1_inv_txfm2d_add_8x4_neon(const int32_t *input, uint16_t *output,
4817 int stride, TX_TYPE tx_type, const int bd) {
4818 TX_SIZE tx_size = TX_8X4;
4819 int32x4_t buf1[8];
4820 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
4821 const int txw_idx = get_txw_idx(tx_size);
4822 const int txh_idx = get_txh_idx(tx_size);
4823 const int txfm_size_col = tx_size_wide[tx_size];
4824 const int txfm_size_row = tx_size_high[tx_size];
4825 const transform_1d_neon row_txfm =
4826 highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1];
4827 const transform_1d_neon col_txfm =
4828 highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
4829
4830 assert(col_txfm != NULL);
4831 assert(row_txfm != NULL);
4832 int ud_flip, lr_flip;
4833 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4834
4835 // 1st stage: row transform
4836 int32x4_t buf0[8];
4837 const int32_t *input_row = input;
4838 load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);
4839
4840 round_shift_rect_array_32_neon(buf0, buf0, txfm_size_col);
4841 row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
4842
4843 int32x4_t *buf1_ptr;
4844 if (lr_flip) {
4845 flip_buf_neon(buf0, buf1, txfm_size_col);
4846 buf1_ptr = buf1;
4847 } else {
4848 buf1_ptr = buf0;
4849 }
4850
4851 // 2nd stage: column transform
4852 for (int i = 0; i < 2; i++) {
4853 int32x4_t *buf1_cur = buf1_ptr + i * txfm_size_row;
4854 transpose_4x4(buf1_cur, buf1_cur);
4855 col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0);
4856 }
4857 round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
4858 // write to buffer
4859 highbd_write_buffer_8xn_neon(buf1_ptr, output, stride, ud_flip, txfm_size_row,
4860 bd);
4861 }
4862
4863 void av1_inv_txfm2d_add_4x16_neon(const int32_t *input, uint16_t *output,
4864 int stride, TX_TYPE tx_type, const int bd) {
4865 TX_SIZE tx_size = TX_4X16;
4866 int32x4_t buf1[16];
4867 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
4868 const int txw_idx = get_txw_idx(tx_size);
4869 const int txh_idx = get_txh_idx(tx_size);
4870 const int txfm_size_col = tx_size_wide[tx_size];
4871 const int txfm_size_row = tx_size_high[tx_size];
4872 const int buf_size_h_div8 = txfm_size_row >> 2;
4873 const transform_1d_neon row_txfm =
4874 highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
4875 const transform_1d_neon col_txfm =
4876 highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2];
4877 const int input_stride = AOMMIN(32, txfm_size_row);
4878
4879 assert(col_txfm != NULL);
4880 assert(row_txfm != NULL);
4881 int ud_flip, lr_flip;
4882 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4883
4884 // 1st stage: row transform
4885 int32x4_t buf0[16];
4886 for (int i = 0; i < (txfm_size_row >> 2); i++) {
4887 const int32_t *input_row = input + i * 4;
4888 int32x4_t *buf0_cur = buf0 + i * 4;
4889 load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_col);
4890 row_txfm(buf0 + (i << 2), buf0 + (i << 2), INV_COS_BIT, 0, bd, -shift[0]);
4891 }
4892
4893 if (lr_flip) {
4894 for (int j = 0; j < buf_size_h_div8; ++j) {
4895 TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
4896 buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2],
4897 buf1[4 * j + 3]);
4898 }
4899 } else {
4900 for (int j = 0; j < buf_size_h_div8; ++j) {
4901 TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2],
4902 buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1],
4903 buf1[4 * j + 2], buf1[4 * j + 3]);
4904 }
4905 }
4906
4907 // 2nd stage: column transform
4908 col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);
4909
4910 round_shift_array_32_neon(buf1, buf1, txfm_size_row, -shift[1]);
4911
4912 // write to buffer
4913 highbd_write_buffer_4xn_neon(buf1, output, stride, ud_flip, txfm_size_row,
4914 bd);
4915 }
4916
4917 void av1_inv_txfm2d_add_16x4_neon(const int32_t *input, uint16_t *output,
4918 int stride, TX_TYPE tx_type, const int bd) {
4919 TX_SIZE tx_size = TX_16X4;
4920 int32x4_t buf1[16];
4921 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
4922 const int txw_idx = get_txw_idx(tx_size);
4923 const int txh_idx = get_txh_idx(tx_size);
4924 const int txfm_size_col = tx_size_wide[tx_size];
4925 const int txfm_size_row = tx_size_high[tx_size];
4926 const int buf_size_w_div8 = txfm_size_col >> 2;
4927 const transform_1d_neon row_txfm =
4928 highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2];
4929 const transform_1d_neon col_txfm =
4930 highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
4931
4932 assert(col_txfm != NULL);
4933 assert(row_txfm != NULL);
4934 int ud_flip, lr_flip;
4935 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4936
4937 // 1st stage: row transform
4938 int32x4_t buf0[16];
4939 const int32_t *input_row = input;
4940 load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);
4941
4942 row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
4943
4944 int32x4_t *buf1_ptr;
4945 if (lr_flip) {
4946 flip_buf_neon(buf0, buf1, txfm_size_col);
4947 buf1_ptr = buf1;
4948 } else {
4949 buf1_ptr = buf0;
4950 }
4951
4952 // 2nd stage: column transform
4953 for (int i = 0; i < buf_size_w_div8; i++) {
4954 int32x4_t *buf1_cur = buf1_ptr + i * txfm_size_row;
4955 transpose_4x4(buf1_cur, buf1_cur);
4956 col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0);
4957 }
4958 round_shift_array_32_neon(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
4959
4960 // write to buffer
4961 for (int i = 0; i < (txfm_size_col >> 3); i++) {
4962 highbd_write_buffer_8xn_neon(buf1_ptr + i * txfm_size_row * 2,
4963 output + 8 * i, stride, ud_flip, txfm_size_row,
4964 bd);
4965 }
4966 }
4967
4968 static const int lowbd_txfm_all_1d_zeros_idx[32] = {
4969 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
4970 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4971 };
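// Hedged note: indexed by a zero-based last-nonzero row or column (eoby /
// eobx). 0 selects the single-coefficient kernel, 1-7 the 8-coefficient one,
// 8-15 the 16-coefficient one, and 16-31 the full kernel, matching the
// trailing index of highbd_txfm_all_1d_zeros_w8_arr above.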
4972
4973 // Transform block width in log2 for eob (widths of 64 map to 32)
4974 static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
4975 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
4976 };
4977
4978 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
4979 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
4980 };
4981
4982 DECLARE_ALIGNED(16, static const int16_t,
4983 av1_eob_to_eobxy_16x16_default[16]) = {
4984 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
4985 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
4986 };
4987
4988 DECLARE_ALIGNED(16, static const int16_t,
4989 av1_eob_to_eobxy_32x32_default[32]) = {
4990 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
4991 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
4992 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
4993 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
4994 };
4995
4996 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
4997 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
4998 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
4999 };
5000
5001 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
5002 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
5003 };
5004
5005 DECLARE_ALIGNED(16, static const int16_t,
5006 av1_eob_to_eobxy_16x32_default[32]) = {
5007 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
5008 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
5009 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
5010 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
5011 };
5012
5013 DECLARE_ALIGNED(16, static const int16_t,
5014 av1_eob_to_eobxy_32x16_default[16]) = {
5015 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
5016 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
5017 };
5018
5019 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
5020 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
5021 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
5022 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
5023 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
5024 };
5025
5026 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
5027 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
5028 };
5029
5030 DECLARE_ALIGNED(16, static const int16_t *,
5031 av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
5032 NULL,
5033 av1_eob_to_eobxy_8x8_default,
5034 av1_eob_to_eobxy_16x16_default,
5035 av1_eob_to_eobxy_32x32_default,
5036 av1_eob_to_eobxy_32x32_default,
5037 NULL,
5038 NULL,
5039 av1_eob_to_eobxy_8x16_default,
5040 av1_eob_to_eobxy_16x8_default,
5041 av1_eob_to_eobxy_16x32_default,
5042 av1_eob_to_eobxy_32x16_default,
5043 av1_eob_to_eobxy_32x32_default,
5044 av1_eob_to_eobxy_32x32_default,
5045 NULL,
5046 NULL,
5047 av1_eob_to_eobxy_8x32_default,
5048 av1_eob_to_eobxy_32x8_default,
5049 av1_eob_to_eobxy_16x32_default,
5050 av1_eob_to_eobxy_32x16_default,
5051 };
5052
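// Hedged note on the packed tables above: each 16-bit entry stores a default
// eob extent for one group of scan positions, with eobx (last nonzero column)
// in the low byte and eoby (last nonzero row) in the high byte. Worked
// example: for TX_16X16, tx_size_wide_log2_eob[] gives 4, so eob = 35 maps to
// eob_row = (35 - 1) >> 4 = 2 and av1_eob_to_eobxy_16x16_default[2] = 0x0f0f,
// i.e. eobx = 15 and eoby = 15.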
5053 static inline void highbd_get_eobx_eoby_scan_default(int *eobx, int *eoby,
5054 TX_SIZE tx_size, int eob) {
5055 if (eob == 1) {
5056 *eobx = 0;
5057 *eoby = 0;
5058 return;
5059 }
5060
5061 const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
5062 const int eob_row = (eob - 1) >> tx_w_log2;
5063 const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
5064 *eobx = eobxy & 0xFF;
5065 *eoby = eobxy >> 8;
5066 }
5067
5068 static inline void get_eobx_eoby_scan_default(int *eobx, int *eoby,
5069 TX_SIZE tx_size) {
5070 if (tx_size == 2) {  // TX_16X16
5071 *eoby = 15, *eobx = 15;
5072 } else if (tx_size == 3) {  // TX_32X32
5073 *eoby = 31, *eobx = 31;
5074 } else if (tx_size == 4) {  // TX_64X64 (coeffs beyond 32 are zeroed)
5075 *eoby = 31, *eobx = 31;
5076 } else if (tx_size == 7) {  // TX_8X16
5077 *eoby = 15, *eobx = 7;
5078 } else if (tx_size == 8) {  // TX_16X8
5079 *eoby = 7, *eobx = 15;
5080 } else if (tx_size == 9) {  // TX_16X32
5081 *eoby = 31, *eobx = 15;
5082 } else if (tx_size == 10) {  // TX_32X16
5083 *eoby = 15, *eobx = 31;
5084 } else if (tx_size == 11) {  // TX_32X64
5085 *eoby = 31, *eobx = 31;
5086 } else if (tx_size == 12) {  // TX_64X32
5087 *eoby = 31, *eobx = 31;
5088 } else if (tx_size == 15) {  // TX_8X32
5089 *eoby = 31, *eobx = 7;
5090 } else if (tx_size == 16) {  // TX_32X8
5091 *eoby = 7, *eobx = 31;
5092 } else if (tx_size == 17) {  // TX_16X64
5093 *eoby = 31, *eobx = 15;
5094 } else if (tx_size == 18) {  // TX_64X16
5095 *eoby = 15, *eobx = 31;
5096 } else {
5097 *eoby = 0, *eobx = 0;
5098 }
5099 }
5100
5101 static inline void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
5102 TX_SIZE tx_size) {
5103 const int txfm_size_row = tx_size_high[tx_size];
5104 *eoby = AOMMIN(32, txfm_size_row) - 1;
5105 *eobx = 0;
5106 }
5107
5108 static inline void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
5109 TX_SIZE tx_size) {
5110 const int txfm_size_col = tx_size_wide[tx_size];
5111 *eobx = AOMMIN(32, txfm_size_col) - 1;
5112 *eoby = 0;
5113 }
5114
5115 static void inv_txfm2d_add_h_identity_neon(const int32_t *input,
5116 uint16_t *output, int stride,
5117 TX_TYPE tx_type, TX_SIZE tx_size,
5118 const int bd) {
5119 int32x4_t buf1[64];
5120 int eobx, eoby;
5121 get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size);
5122 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
5123 const int txw_idx = get_txw_idx(tx_size);
5124 const int txh_idx = get_txh_idx(tx_size);
5125 const int txfm_size_col = tx_size_wide[tx_size];
5126 const int txfm_size_row = tx_size_high[tx_size];
5127 const int buf_size_w = AOMMIN(32, txfm_size_col);
5128 const int buf_size_w_div4 = buf_size_w >> 2;
5129 const int buf_size_h_div8 = (eoby + 8) >> 3;
5130 const int row_max = AOMMIN(32, txfm_size_row);
5131 const int input_stride = row_max;
5132 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
5133 const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
5134 const transform_1d_neon row_txfm =
5135 highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
5136 assert(row_txfm != NULL);
5137 const transform_1d_neon col_txfm =
5138 highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
5139 assert(col_txfm != NULL);
5140 int ud_flip, lr_flip;
5141 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
5142
5143 for (int i = 0; i < (buf_size_h_div8 << 1); ++i) {
5144 int32x4_t buf0[16];
5145 load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w);
5146 if (rect_type == 1 || rect_type == -1) {
5147 round_shift_rect_array_32_neon(buf0, buf0, buf_size_w);
5148 }
5149 row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
5150
5151 int32x4_t *_buf1 = buf1 + i * 4;
5152
5153 for (int j = 0; j < buf_size_w_div4; ++j) {
5154 int32x4_t *buf0_cur = buf0 + j * 4;
5155 TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
5156 buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
5157 _buf1[j * txfm_size_row + 0] = buf0_cur[0];
5158 _buf1[j * txfm_size_row + 1] = buf0_cur[1];
5159 _buf1[j * txfm_size_row + 2] = buf0_cur[2];
5160 _buf1[j * txfm_size_row + 3] = buf0_cur[3];
5161 }
5162 }
5163 for (int i = 0; i < buf_size_w_div4; i++) {
5164 col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
5165 bd, 0);
5166
5167 round_shift_array_32_neon(buf1 + i * txfm_size_row,
5168 buf1 + i * txfm_size_row, txfm_size_row,
5169 -shift[1]);
5170 }
5171
5172 // write to buffer
5173 for (int i = 0; i < (txfm_size_col >> 3); i++) {
5174 highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
5175 stride, ud_flip, txfm_size_row, bd);
5176 }
5177 }
5178
5179 static void inv_txfm2d_add_v_identity_neon(const int32_t *input,
5180 uint16_t *output, int stride,
5181 TX_TYPE tx_type, TX_SIZE tx_size,
5182 const int bd) {
5183 int32x4_t buf1[64];
5184 int eobx, eoby;
5185 get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size);
5186 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
5187 const int txw_idx = get_txw_idx(tx_size);
5188 const int txh_idx = get_txh_idx(tx_size);
5189 const int txfm_size_col = tx_size_wide[tx_size];
5190 const int txfm_size_row = tx_size_high[tx_size];
5191 const int buf_size_w_div4 = AOMMIN(32, txfm_size_col) >> 2;
5192 const int row_max = AOMMIN(32, txfm_size_row);
5193 const int input_stride = row_max;
5194 const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
5195 const int buf_size_nonzero_w = buf_size_nonzero_w_div8 << 3;
5196 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
5197 const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
5198 const transform_1d_neon row_txfm =
5199 highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
5200 assert(row_txfm != NULL);
5201 const transform_1d_neon col_txfm =
5202 highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
5203 assert(col_txfm != NULL);
5204 int ud_flip, lr_flip;
5205 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
5206
5207 for (int i = 0; i < (row_max >> 2); ++i) {
5208 int32x4_t buf0[16];
5209 load_buffer_32bit_input(input + i * 4, input_stride, buf0,
5210 buf_size_nonzero_w);
5211 if (rect_type == 1 || rect_type == -1) {
5212 round_shift_rect_array_32_neon(buf0, buf0, buf_size_nonzero_w);
5213 }
5214 row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
5215
5216 int32x4_t *_buf1 = buf1 + i * 4;
5217 if (lr_flip) {
5218 for (int j = 0; j < buf_size_w_div4; ++j) {
5219 TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
5220 buf0[4 * j],
5221 _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0],
5222 _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1],
5223 _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2],
5224 _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]);
5225 }
5226 } else {
5227 for (int j = 0; j < buf_size_w_div4; ++j) {
5228 TRANSPOSE_4X4(
5229 buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
5230 _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
5231 _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
5232 }
5233 }
5234 }
5235 for (int i = 0; i < buf_size_w_div4; i++) {
5236 col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
5237 bd, 0);
5238
5239 round_shift_array_32_neon(buf1 + i * txfm_size_row,
5240 buf1 + i * txfm_size_row, txfm_size_row,
5241 -shift[1]);
5242 }
5243
5244 // write to buffer
5245 {
5246 for (int i = 0; i < (txfm_size_col >> 3); i++) {
5247 highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
5248 stride, ud_flip, txfm_size_row, bd);
5249 }
5250 }
5251 }
5252
5253 static void inv_txfm2d_add_idtx_neon(const int32_t *input, uint16_t *output,
5254 int stride, TX_TYPE tx_type,
5255 TX_SIZE tx_size, const int bd) {
5256 int32x4_t buf1[64 * 4];
5257 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
5258 const int txw_idx = get_txw_idx(tx_size);
5259 const int txh_idx = get_txh_idx(tx_size);
5260 const int txfm_size_col = tx_size_wide[tx_size];
5261 const int txfm_size_row = tx_size_high[tx_size];
5262 const int row_max = AOMMIN(32, txfm_size_row);
5263 const int input_stride = row_max;
5264 const int buf_size_w = AOMMIN(32, txfm_size_col);
5265 const int buf_size_w_div4 = buf_size_w >> 2;
5266 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
5267 const transform_1d_neon row_txfm =
5268 highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
5269 assert(row_txfm != NULL);
5270 const transform_1d_neon col_txfm =
5271 highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
5272 assert(col_txfm != NULL);
5273 for (int i = 0; i < (row_max >> 2); ++i) {
5274 int32x4_t buf0[32];
5275 load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w);
5276 if (rect_type == 1 || rect_type == -1) {
5277 round_shift_rect_array_32_neon(buf0, buf0, buf_size_w);
5278 }
5279 row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
5280
5281 int32x4_t *_buf1 = buf1 + i * 4;
5282 for (int j = 0; j < buf_size_w_div4; ++j) {
5283 int32x4_t *buf0_cur = buf0 + j * 4;
5284 TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
5285 buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
5286 _buf1[j * txfm_size_row + 0] = buf0_cur[0];
5287 _buf1[j * txfm_size_row + 1] = buf0_cur[1];
5288 _buf1[j * txfm_size_row + 2] = buf0_cur[2];
5289 _buf1[j * txfm_size_row + 3] = buf0_cur[3];
5290 }
5291 }
5292 for (int i = 0; i < buf_size_w_div4; i++) {
5293 col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
5294 bd, 0);
5295
5296 round_shift_array_32_neon(buf1 + i * txfm_size_row,
5297 buf1 + i * txfm_size_row, txfm_size_row,
5298 -shift[1]);
5299 }
5300
5301 // write to buffer
5302 {
5303 for (int i = 0; i < (txfm_size_col >> 3); i++) {
5304 highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
5305 stride, 0, txfm_size_row, bd);
5306 }
5307 }
5308 }
5309
5310 static void inv_txfm2d_add_no_identity_neon(const int32_t *input,
5311 uint16_t *output, int stride,
5312 TX_TYPE tx_type, TX_SIZE tx_size,
5313 const int bd) {
5314 int32x4_t buf1[64 * 16];
5315 int eobx, eoby;
5316 get_eobx_eoby_scan_default(&eobx, &eoby, tx_size);
5317 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
5318 const int txw_idx = get_txw_idx(tx_size);
5319 const int txh_idx = get_txh_idx(tx_size);
5320 const int txfm_size_col = tx_size_wide[tx_size];
5321 const int txfm_size_row = tx_size_high[tx_size];
5322 const int buf_size_w_div4 = txfm_size_col >> 2;
5323 const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
5324 const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
5325 const int input_stride = AOMMIN(32, txfm_size_row);
5326 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
5327
5328 const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
5329 const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
5330 const transform_1d_neon row_txfm =
5331 highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
5332 const transform_1d_neon col_txfm =
5333 highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
5334
5335 assert(col_txfm != NULL);
5336 assert(row_txfm != NULL);
5337 int ud_flip, lr_flip;
5338 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
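// Overall flow (hedged summary of the code below): each strip of 4 rows is
// loaded, pre-scaled by 1/Sqrt2 for 2:1 rectangular sizes, run through the
// row transform, then transposed 4x4 at a time into column-major strips of
// buf1, with the horizontal flip folded in when lr_flip is set. Each
// 4-column strip is then column-transformed, round-shifted by shift[1], and
// added to the prediction in highbd_write_buffer_8xn_neon.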
5339 // 1st stage: row transform
5340 for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
5341 int32x4_t buf0[64];
5342 load_buffer_32bit_input(input + i * 4, input_stride, buf0,
5343 buf_size_nonzero_w);
5344 if (rect_type == 1 || rect_type == -1) {
5345 round_shift_rect_array_32_neon(buf0, buf0, buf_size_nonzero_w);
5346 }
5347 row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
5348
5349 int32x4_t *_buf1 = &buf1[i * 4];
5350
5351 if (lr_flip) {
5352 for (int j = 0; j < buf_size_w_div4; ++j) {
5353 TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
5354 buf0[4 * j],
5355 _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0],
5356 _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1],
5357 _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2],
5358 _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]);
5359 }
5360 } else {
5361 for (int j = 0; j < buf_size_w_div4; ++j) {
5362 TRANSPOSE_4X4(
5363 buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
5364 _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
5365 _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
5366 }
5367 }
5368 }
5369 // 2nd stage: column transform
5370 for (int i = 0; i < buf_size_w_div4; i++) {
5371 col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
5372 bd, 0);
5373
5374 round_shift_array_32_neon(buf1 + i * txfm_size_row,
5375 buf1 + i * txfm_size_row, txfm_size_row,
5376 -shift[1]);
5377 }
5378
5379 // write to buffer
5380 {
5381 for (int i = 0; i < (txfm_size_col >> 3); i++) {
5382 highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
5383 stride, ud_flip, txfm_size_row, bd);
5384 }
5385 }
5386 }
5387
5388 static void highbd_inv_txfm2d_add_no_identity_neon(const int32_t *input,
5389 uint16_t *output, int stride,
5390 TX_TYPE tx_type,
5391 TX_SIZE tx_size, int eob,
5392 const int bd) {
5393 int32x4_t buf1[64 * 16];
5394 int eobx, eoby;
5395 highbd_get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
5396 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
5397 const int txw_idx = get_txw_idx(tx_size);
5398 const int txh_idx = get_txh_idx(tx_size);
5399 const int txfm_size_col = tx_size_wide[tx_size];
5400 const int txfm_size_row = tx_size_high[tx_size];
5401 const int buf_size_w_div8 = txfm_size_col >> 2;
5402 const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
5403 const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
5404 const int input_stride = AOMMIN(32, txfm_size_col);
5405 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
5406
5407 const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
5408 const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
5409 const transform_1d_neon row_txfm =
5410 highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
5411 const transform_1d_neon col_txfm =
5412 highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
5413
5414 assert(col_txfm != NULL);
5415 assert(row_txfm != NULL);
5416 int ud_flip, lr_flip;
5417 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
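// Hedged note: unlike inv_txfm2d_add_no_identity_neon above, this eob-aware
// path assumes the coefficients are stored with a row stride of
// AOMMIN(32, txfm_size_col), so each 4x4 tile is transposed immediately after
// loading before the row transform is applied.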
5418 // 1st stage: row transform
5419 for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
5420 int32x4_t buf0[64];
5421 const int32_t *input_row = input + i * input_stride * 4;
5422 for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) {
5423 int32x4_t *buf0_cur = &buf0[j * 4];
5424 load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
5425
5426 TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
5427 buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
5428 }
5429 if (rect_type == 1 || rect_type == -1) {
5430 round_shift_rect_array_32_neon(buf0, buf0, buf_size_nonzero_w_div8 << 3);
5431 }
5432 row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
5433
5434 int32x4_t *_buf1 = &buf1[i * 4];
5435
5436 if (lr_flip) {
5437 for (int j = 0; j < buf_size_w_div8; ++j) {
5438 TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
5439 buf0[4 * j],
5440 _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
5441 _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
5442 _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
5443 _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
5444 }
5445 } else {
5446 for (int j = 0; j < buf_size_w_div8; ++j) {
5447 TRANSPOSE_4X4(
5448 buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
5449 _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
5450 _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
5451 }
5452 }
5453 }
5454 // 2nd stage: column transform
5455 for (int i = 0; i < buf_size_w_div8; i++) {
5456 col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
5457 bd, 0);
5458
5459 round_shift_array_32_neon(buf1 + i * txfm_size_row,
5460 buf1 + i * txfm_size_row, txfm_size_row,
5461 -shift[1]);
5462 }
5463
5464 // write to buffer
5465 {
5466 for (int i = 0; i < (txfm_size_col >> 3); i++) {
5467 highbd_write_buffer_8xn_neon(buf1 + i * txfm_size_row * 2, output + 8 * i,
5468 stride, ud_flip, txfm_size_row, bd);
5469 }
5470 }
5471 }
5472
5473 static void highbd_inv_txfm2d_add_universe_neon(const int32_t *input,
5474 uint8_t *output, int stride,
5475 TX_TYPE tx_type,
5476 TX_SIZE tx_size, int eob,
5477 const int bd) {
5478 switch (tx_type) {
5479 case DCT_DCT:
5480 case ADST_DCT:
5481 case DCT_ADST:
5482 case ADST_ADST:
5483 case FLIPADST_DCT:
5484 case DCT_FLIPADST:
5485 case FLIPADST_FLIPADST:
5486 case ADST_FLIPADST:
5487 case FLIPADST_ADST:
5488 highbd_inv_txfm2d_add_no_identity_neon(input, CONVERT_TO_SHORTPTR(output),
5489 stride, tx_type, tx_size, eob, bd);
5490 break;
5491 case V_DCT:
5492 case V_ADST:
5493 case V_FLIPADST:
5494 inv_txfm2d_add_h_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride,
5495 tx_type, tx_size, bd);
5496 break;
5497 case H_DCT:
5498 case H_ADST:
5499 case H_FLIPADST:
5500 inv_txfm2d_add_v_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride,
5501 tx_type, tx_size, bd);
5502 break;
5503 case IDTX:
5504 inv_txfm2d_add_idtx_neon(input, CONVERT_TO_SHORTPTR(output), stride,
5505 tx_type, tx_size, bd);
5506 break;
5507 default: assert(0); break;
5508 }
5509 }
5510
5511 static void inv_txfm2d_add_universe_neon(const int32_t *input, uint8_t *output,
5512 int stride, TX_TYPE tx_type,
5513 TX_SIZE tx_size, const int bd) {
5514 switch (tx_type) {
5515 case DCT_DCT:
5516 case ADST_DCT:
5517 case DCT_ADST:
5518 case ADST_ADST:
5519 case FLIPADST_DCT:
5520 case DCT_FLIPADST:
5521 case FLIPADST_FLIPADST:
5522 case ADST_FLIPADST:
5523 case FLIPADST_ADST:
5524 inv_txfm2d_add_no_identity_neon(input, CONVERT_TO_SHORTPTR(output),
5525 stride, tx_type, tx_size, bd);
5526 break;
5527 case V_DCT:
5528 case V_ADST:
5529 case V_FLIPADST:
5530 inv_txfm2d_add_h_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride,
5531 tx_type, tx_size, bd);
5532 break;
5533 case H_DCT:
5534 case H_ADST:
5535 case H_FLIPADST:
5536 inv_txfm2d_add_v_identity_neon(input, CONVERT_TO_SHORTPTR(output), stride,
5537 tx_type, tx_size, bd);
5538 break;
5539 case IDTX:
5540 inv_txfm2d_add_idtx_neon(input, CONVERT_TO_SHORTPTR(output), stride,
5541 tx_type, tx_size, bd);
5542 break;
5543 default: assert(0); break;
5544 }
5545 }
5546
5547 static void highbd_inv_txfm_add_8x8_neon(const tran_low_t *input, uint8_t *dest,
5548 int stride,
5549 const TxfmParam *txfm_param) {
5550 int bd = txfm_param->bd;
5551 const TX_TYPE tx_type = txfm_param->tx_type;
5552 const int32_t *src = cast_to_int32(input);
5553 switch (tx_type) {
5554 case IDTX:
5555 case H_DCT:
5556 case H_ADST:
5557 case H_FLIPADST:
5558 case V_DCT:
5559 case V_ADST:
5560 case V_FLIPADST:
5561 highbd_inv_txfm2d_add_universe_neon(input, dest, stride, tx_type,
5562 txfm_param->tx_size, txfm_param->eob,
5563 bd);
5564 break;
5565 default:
5566 av1_inv_txfm2d_add_8x8_neon(src, CONVERT_TO_SHORTPTR(dest), stride,
5567 tx_type, bd);
5568 break;
5569 }
5570 }
5571
5572 static void highbd_inv_txfm_add_4x4_neon(const tran_low_t *input, uint8_t *dest,
5573 int stride,
5574 const TxfmParam *txfm_param) {
5575 assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
5576 int eob = txfm_param->eob;
5577 int bd = txfm_param->bd;
5578 int lossless = txfm_param->lossless;
5579 const int32_t *src = cast_to_int32(input);
5580 const TX_TYPE tx_type = txfm_param->tx_type;
5581 if (lossless) {
5582 assert(tx_type == DCT_DCT);
5583 av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
5584 return;
5585 }
5586 av1_inv_txfm2d_add_4x4_neon(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
5587 bd);
5588 }
5589
5590 void av1_inv_txfm2d_add_8x16_neon(const tran_low_t *input, uint16_t *dest,
5591 int stride, TX_TYPE tx_type, const int bd) {
5592 inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_8X16,
5593 bd);
5594 }
5595
5596 void av1_inv_txfm2d_add_16x8_neon(const tran_low_t *input, uint16_t *dest,
5597 int stride, TX_TYPE tx_type, const int bd) {
5598 inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_16X8,
5599 bd);
5600 }
5601
5602 void av1_inv_txfm2d_add_16x32_neon(const tran_low_t *input, uint16_t *dest,
5603 int stride, TX_TYPE tx_type, const int bd) {
5604 inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
5605 TX_16X32, bd);
5606 }
5607
5608 void av1_inv_txfm2d_add_32x16_neon(const tran_low_t *input, uint16_t *dest,
5609 int stride, TX_TYPE tx_type, const int bd) {
5610 inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
5611 TX_32X16, bd);
5612 }
5613
5614 void av1_inv_txfm2d_add_32x32_neon(const tran_low_t *input, uint16_t *dest,
5615 int stride, TX_TYPE tx_type, const int bd) {
5616 inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
5617 TX_32X32, bd);
5618 }
5619
5620 void av1_inv_txfm2d_add_64x64_neon(const tran_low_t *input, uint16_t *dest,
5621 int stride, TX_TYPE tx_type, const int bd) {
5622 inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
5623 TX_64X64, bd);
5624 }
5625
5626 void av1_inv_txfm2d_add_32x64_neon(const tran_low_t *input, uint16_t *dest,
5627 int stride, TX_TYPE tx_type, const int bd) {
5628 inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
5629 TX_32X64, bd);
5630 }
5631
5632 void av1_inv_txfm2d_add_64x32_neon(const tran_low_t *input, uint16_t *dest,
5633 int stride, TX_TYPE tx_type, const int bd) {
5634 inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
5635 TX_64X32, bd);
5636 }
5637
5638 void av1_inv_txfm2d_add_64x16_neon(const tran_low_t *input, uint16_t *dest,
5639 int stride, TX_TYPE tx_type, const int bd) {
5640 inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
5641 TX_64X16, bd);
5642 }
5643
5644 void av1_inv_txfm2d_add_16x64_neon(const tran_low_t *input, uint16_t *dest,
5645 int stride, TX_TYPE tx_type, const int bd) {
5646 inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
5647 TX_16X64, bd);
5648 }
5649
5650 static void av1_inv_txfm2d_add_16x16_neon(const tran_low_t *input,
5651 uint16_t *dest, int stride,
5652 TX_TYPE tx_type, const int bd) {
5653 inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type,
5654 TX_16X16, bd);
5655 }
5656
5657 void av1_inv_txfm2d_add_32x8_neon(const tran_low_t *input, uint16_t *dest,
5658 int stride, TX_TYPE tx_type, const int bd) {
5659 inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_32X8,
5660 bd);
5661 }
5662
5663 void av1_inv_txfm2d_add_8x32_neon(const tran_low_t *input, uint16_t *dest,
5664 int stride, TX_TYPE tx_type, const int bd) {
5665 inv_txfm2d_add_universe_neon(input, (uint8_t *)dest, stride, tx_type, TX_8X32,
5666 bd);
5667 }
5668
5669 void av1_highbd_inv_txfm_add_neon(const tran_low_t *input, uint8_t *dest,
5670 int stride, const TxfmParam *txfm_param) {
5671 const TX_SIZE tx_size = txfm_param->tx_size;
5672
5673 TX_TYPE tx_type = txfm_param->tx_type;
5674 int bd = txfm_param->bd;
5675 switch (tx_size) {
5676 case TX_8X8:
5677 highbd_inv_txfm_add_8x8_neon(input, dest, stride, txfm_param);
5678 break;
5679 case TX_4X8:
5680 av1_inv_txfm2d_add_4x8_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
5681 txfm_param->tx_type, txfm_param->bd);
5682 break;
5683 case TX_8X4:
5684 av1_inv_txfm2d_add_8x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
5685 txfm_param->tx_type, txfm_param->bd);
5686 break;
5687 case TX_4X4:
5688 highbd_inv_txfm_add_4x4_neon(input, dest, stride, txfm_param);
5689 break;
5690 case TX_16X4:
5691 av1_inv_txfm2d_add_16x4_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
5692 txfm_param->tx_type, txfm_param->bd);
5693 break;
5694 case TX_4X16:
5695 av1_inv_txfm2d_add_4x16_neon(input, CONVERT_TO_SHORTPTR(dest), stride,
5696 txfm_param->tx_type, txfm_param->bd);
5697 break;
5698 case TX_8X16:
5699 av1_inv_txfm2d_add_8x16_neon(input, (uint16_t *)dest, stride, tx_type,
5700 bd);
5701 break;
5702 case TX_16X8:
5703 av1_inv_txfm2d_add_16x8_neon(input, (uint16_t *)dest, stride, tx_type,
5704 bd);
5705 break;
5706 case TX_16X32:
5707 av1_inv_txfm2d_add_16x32_neon(input, (uint16_t *)dest, stride, tx_type,
5708 bd);
5709 break;
5710 case TX_32X16:
5711 av1_inv_txfm2d_add_32x16_neon(input, (uint16_t *)dest, stride, tx_type,
5712 bd);
5713 break;
5714 case TX_16X16:
5715 av1_inv_txfm2d_add_16x16_neon(input, (uint16_t *)dest, stride, tx_type,
5716 bd);
5717 break;
5718 case TX_32X32:
5719 av1_inv_txfm2d_add_32x32_neon(input, (uint16_t *)dest, stride, tx_type,
5720 bd);
5721 break;
5722 case TX_64X64:
5723 av1_inv_txfm2d_add_64x64_neon(input, (uint16_t *)dest, stride, tx_type,
5724 bd);
5725 break;
5726 case TX_32X64:
5727 av1_inv_txfm2d_add_32x64_neon(input, (uint16_t *)dest, stride, tx_type,
5728 bd);
5729 break;
5730 case TX_64X32:
5731 av1_inv_txfm2d_add_64x32_neon(input, (uint16_t *)dest, stride, tx_type,
5732 bd);
5733 break;
5734 case TX_16X64:
5735 av1_inv_txfm2d_add_16x64_neon(input, (uint16_t *)dest, stride, tx_type,
5736 bd);
5737 break;
5738 case TX_64X16:
5739 av1_inv_txfm2d_add_64x16_neon(input, (uint16_t *)dest, stride, tx_type,
5740 bd);
5741 break;
5742 case TX_32X8:
5743 av1_inv_txfm2d_add_32x8_neon(input, (uint16_t *)dest, stride, tx_type,
5744 bd);
5745 break;
5746 case TX_8X32:
5747 av1_inv_txfm2d_add_8x32_neon(input, (uint16_t *)dest, stride, tx_type,
5748 bd);
5749 break;
5750 }
5751 }
5752