/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
#define VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/fdct_neon.h"

// Load & cross the first 8 and last 8, then the middle
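// For rows i = 0..15: b[i] = row[i] + row[31 - i] and
// b[31 - i] = row[i] - row[31 - i], i.e. the stage-1 butterfly of the
// forward DCT is folded into the load.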
static INLINE void load_cross(const int16_t *a, int stride, int16x8_t *b) {
  b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
  b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
  b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
  b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
  b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
  b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
  b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
  b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));

  b[24] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
  b[25] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
  b[26] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
  b[27] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
  b[28] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
  b[29] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
  b[30] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
  b[31] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));

  b[8] = vaddq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
  b[9] = vaddq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
  b[10] = vaddq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
  b[11] = vaddq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
  b[12] = vaddq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
  b[13] = vaddq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
  b[14] = vaddq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
  b[15] = vaddq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));

  b[16] = vsubq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
  b[17] = vsubq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
  b[18] = vsubq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
  b[19] = vsubq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
  b[20] = vsubq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
  b[21] = vsubq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
  b[22] = vsubq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
  b[23] = vsubq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
}

#define STORE_S16(src, index, dest)           \
  do {                                        \
    store_s16q_to_tran_low(dest, src[index]); \
    dest += 8;                                \
  } while (0)
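// For example, STORE_S16(b, 0, a) expands to
//   store_s16q_to_tran_low(a, b[0]); a += 8;
// storing one 8-lane vector and advancing the output pointer.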

// Store 32 16x8 values, assuming stride == 32.
// Slight twist: store horizontally in blocks of 8.
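// Each group of four vectors b[i], b[i + 8], b[i + 16], b[i + 24] forms one
// contiguous row of 32 coefficients, so the stores below emit rows 0..7 in
// order.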
static INLINE void store(tran_low_t *a, const int16x8_t *b) {
  STORE_S16(b, 0, a);
  STORE_S16(b, 8, a);
  STORE_S16(b, 16, a);
  STORE_S16(b, 24, a);
  STORE_S16(b, 1, a);
  STORE_S16(b, 9, a);
  STORE_S16(b, 17, a);
  STORE_S16(b, 25, a);
  STORE_S16(b, 2, a);
  STORE_S16(b, 10, a);
  STORE_S16(b, 18, a);
  STORE_S16(b, 26, a);
  STORE_S16(b, 3, a);
  STORE_S16(b, 11, a);
  STORE_S16(b, 19, a);
  STORE_S16(b, 27, a);
  STORE_S16(b, 4, a);
  STORE_S16(b, 12, a);
  STORE_S16(b, 20, a);
  STORE_S16(b, 28, a);
  STORE_S16(b, 5, a);
  STORE_S16(b, 13, a);
  STORE_S16(b, 21, a);
  STORE_S16(b, 29, a);
  STORE_S16(b, 6, a);
  STORE_S16(b, 14, a);
  STORE_S16(b, 22, a);
  STORE_S16(b, 30, a);
  STORE_S16(b, 7, a);
  STORE_S16(b, 15, a);
  STORE_S16(b, 23, a);
  STORE_S16(b, 31, a);
}

#undef STORE_S16

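// Scale the input tile by 4 (<< 2), matching the first-pass input scaling of
// the C reference transform (input[j * stride + i] * 4).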
static INLINE void scale_input(const int16x8_t *in /*32*/,
                               int16x8_t *out /*32*/) {
  out[0] = vshlq_n_s16(in[0], 2);
  out[1] = vshlq_n_s16(in[1], 2);
  out[2] = vshlq_n_s16(in[2], 2);
  out[3] = vshlq_n_s16(in[3], 2);
  out[4] = vshlq_n_s16(in[4], 2);
  out[5] = vshlq_n_s16(in[5], 2);
  out[6] = vshlq_n_s16(in[6], 2);
  out[7] = vshlq_n_s16(in[7], 2);

  out[8] = vshlq_n_s16(in[8], 2);
  out[9] = vshlq_n_s16(in[9], 2);
  out[10] = vshlq_n_s16(in[10], 2);
  out[11] = vshlq_n_s16(in[11], 2);
  out[12] = vshlq_n_s16(in[12], 2);
  out[13] = vshlq_n_s16(in[13], 2);
  out[14] = vshlq_n_s16(in[14], 2);
  out[15] = vshlq_n_s16(in[15], 2);

  out[16] = vshlq_n_s16(in[16], 2);
  out[17] = vshlq_n_s16(in[17], 2);
  out[18] = vshlq_n_s16(in[18], 2);
  out[19] = vshlq_n_s16(in[19], 2);
  out[20] = vshlq_n_s16(in[20], 2);
  out[21] = vshlq_n_s16(in[21], 2);
  out[22] = vshlq_n_s16(in[22], 2);
  out[23] = vshlq_n_s16(in[23], 2);

  out[24] = vshlq_n_s16(in[24], 2);
  out[25] = vshlq_n_s16(in[25], 2);
  out[26] = vshlq_n_s16(in[26], 2);
  out[27] = vshlq_n_s16(in[27], 2);
  out[28] = vshlq_n_s16(in[28], 2);
  out[29] = vshlq_n_s16(in[29], 2);
  out[30] = vshlq_n_s16(in[30], 2);
  out[31] = vshlq_n_s16(in[31], 2);
}

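// First pass of the 32-point forward DCT over an 8x32 tile. Stages 2-7
// alternate between the a[] and b[] scratch arrays in 16-bit vectors
// (widening happens only inside the butterfly helpers); the final stage
// writes the coefficients in their interleaved output order and applies the
// partial rounding shift.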
static INLINE void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
  int16x8_t a[32];
  int16x8_t b[32];

  // Stage 1: Done as part of the load.

  // Stage 2.
  // Mini cross. X the first 16 values and the middle 8 of the second half.
  a[0] = vaddq_s16(in[0], in[15]);
  a[1] = vaddq_s16(in[1], in[14]);
  a[2] = vaddq_s16(in[2], in[13]);
  a[3] = vaddq_s16(in[3], in[12]);
  a[4] = vaddq_s16(in[4], in[11]);
  a[5] = vaddq_s16(in[5], in[10]);
  a[6] = vaddq_s16(in[6], in[9]);
  a[7] = vaddq_s16(in[7], in[8]);

  a[8] = vsubq_s16(in[7], in[8]);
  a[9] = vsubq_s16(in[6], in[9]);
  a[10] = vsubq_s16(in[5], in[10]);
  a[11] = vsubq_s16(in[4], in[11]);
  a[12] = vsubq_s16(in[3], in[12]);
  a[13] = vsubq_s16(in[2], in[13]);
  a[14] = vsubq_s16(in[1], in[14]);
  a[15] = vsubq_s16(in[0], in[15]);

  a[16] = in[16];
  a[17] = in[17];
  a[18] = in[18];
  a[19] = in[19];

  butterfly_one_coeff_s16_s32_narrow(in[27], in[20], cospi_16_64, &a[27],
                                     &a[20]);
  butterfly_one_coeff_s16_s32_narrow(in[26], in[21], cospi_16_64, &a[26],
                                     &a[21]);
  butterfly_one_coeff_s16_s32_narrow(in[25], in[22], cospi_16_64, &a[25],
                                     &a[22]);
  butterfly_one_coeff_s16_s32_narrow(in[24], in[23], cospi_16_64, &a[24],
                                     &a[23]);

  a[28] = in[28];
  a[29] = in[29];
  a[30] = in[30];
  a[31] = in[31];

  // Stage 3.
  b[0] = vaddq_s16(a[0], a[7]);
  b[1] = vaddq_s16(a[1], a[6]);
  b[2] = vaddq_s16(a[2], a[5]);
  b[3] = vaddq_s16(a[3], a[4]);

  b[4] = vsubq_s16(a[3], a[4]);
  b[5] = vsubq_s16(a[2], a[5]);
  b[6] = vsubq_s16(a[1], a[6]);
  b[7] = vsubq_s16(a[0], a[7]);

  b[8] = a[8];
  b[9] = a[9];

  butterfly_one_coeff_s16_s32_narrow(a[13], a[10], cospi_16_64, &b[13], &b[10]);
  butterfly_one_coeff_s16_s32_narrow(a[12], a[11], cospi_16_64, &b[12], &b[11]);

  b[14] = a[14];
  b[15] = a[15];

  b[16] = vaddq_s16(in[16], a[23]);
  b[17] = vaddq_s16(in[17], a[22]);
  b[18] = vaddq_s16(in[18], a[21]);
  b[19] = vaddq_s16(in[19], a[20]);

  b[20] = vsubq_s16(in[19], a[20]);
  b[21] = vsubq_s16(in[18], a[21]);
  b[22] = vsubq_s16(in[17], a[22]);
  b[23] = vsubq_s16(in[16], a[23]);

  b[24] = vsubq_s16(in[31], a[24]);
  b[25] = vsubq_s16(in[30], a[25]);
  b[26] = vsubq_s16(in[29], a[26]);
  b[27] = vsubq_s16(in[28], a[27]);

  b[28] = vaddq_s16(in[28], a[27]);
  b[29] = vaddq_s16(in[29], a[26]);
  b[30] = vaddq_s16(in[30], a[25]);
  b[31] = vaddq_s16(in[31], a[24]);

  // Stage 4.
  a[0] = vaddq_s16(b[0], b[3]);
  a[1] = vaddq_s16(b[1], b[2]);
  a[2] = vsubq_s16(b[1], b[2]);
  a[3] = vsubq_s16(b[0], b[3]);

  a[4] = b[4];

  butterfly_one_coeff_s16_s32_narrow(b[6], b[5], cospi_16_64, &a[6], &a[5]);

  a[7] = b[7];

  a[8] = vaddq_s16(b[8], b[11]);
  a[9] = vaddq_s16(b[9], b[10]);
  a[10] = vsubq_s16(b[9], b[10]);
  a[11] = vsubq_s16(b[8], b[11]);
  a[12] = vsubq_s16(b[15], b[12]);
  a[13] = vsubq_s16(b[14], b[13]);
  a[14] = vaddq_s16(b[14], b[13]);
  a[15] = vaddq_s16(b[15], b[12]);

  a[16] = b[16];
  a[17] = b[17];

  butterfly_two_coeff(b[29], b[18], cospi_8_64, cospi_24_64, &a[29], &a[18]);
  butterfly_two_coeff(b[28], b[19], cospi_8_64, cospi_24_64, &a[28], &a[19]);
  butterfly_two_coeff(b[27], b[20], cospi_24_64, -cospi_8_64, &a[27], &a[20]);
  butterfly_two_coeff(b[26], b[21], cospi_24_64, -cospi_8_64, &a[26], &a[21]);

  a[22] = b[22];
  a[23] = b[23];
  a[24] = b[24];
  a[25] = b[25];

  a[30] = b[30];
  a[31] = b[31];

  // Stage 5.
  butterfly_one_coeff_s16_fast(a[0], a[1], cospi_16_64, &b[0], &b[1]);
  butterfly_two_coeff(a[3], a[2], cospi_8_64, cospi_24_64, &b[2], &b[3]);

  b[4] = vaddq_s16(a[4], a[5]);
  b[5] = vsubq_s16(a[4], a[5]);
  b[6] = vsubq_s16(a[7], a[6]);
  b[7] = vaddq_s16(a[7], a[6]);

  b[8] = a[8];

  butterfly_two_coeff(a[14], a[9], cospi_8_64, cospi_24_64, &b[14], &b[9]);
  butterfly_two_coeff(a[13], a[10], cospi_24_64, -cospi_8_64, &b[13], &b[10]);

  b[11] = a[11];
  b[12] = a[12];

  b[15] = a[15];

  b[16] = vaddq_s16(a[19], a[16]);
  b[17] = vaddq_s16(a[18], a[17]);
  b[18] = vsubq_s16(a[17], a[18]);
  b[19] = vsubq_s16(a[16], a[19]);
  b[20] = vsubq_s16(a[23], a[20]);
  b[21] = vsubq_s16(a[22], a[21]);
  b[22] = vaddq_s16(a[21], a[22]);
  b[23] = vaddq_s16(a[20], a[23]);
  b[24] = vaddq_s16(a[27], a[24]);
  b[25] = vaddq_s16(a[26], a[25]);
  b[26] = vsubq_s16(a[25], a[26]);
  b[27] = vsubq_s16(a[24], a[27]);
  b[28] = vsubq_s16(a[31], a[28]);
  b[29] = vsubq_s16(a[30], a[29]);
  b[30] = vaddq_s16(a[29], a[30]);
  b[31] = vaddq_s16(a[28], a[31]);

  // Stage 6.
  a[0] = b[0];
  a[1] = b[1];
  a[2] = b[2];
  a[3] = b[3];

  butterfly_two_coeff(b[7], b[4], cospi_4_64, cospi_28_64, &a[4], &a[7]);
  butterfly_two_coeff(b[6], b[5], cospi_20_64, cospi_12_64, &a[5], &a[6]);

  a[8] = vaddq_s16(b[8], b[9]);
  a[9] = vsubq_s16(b[8], b[9]);
  a[10] = vsubq_s16(b[11], b[10]);
  a[11] = vaddq_s16(b[11], b[10]);
  a[12] = vaddq_s16(b[12], b[13]);
  a[13] = vsubq_s16(b[12], b[13]);
  a[14] = vsubq_s16(b[15], b[14]);
  a[15] = vaddq_s16(b[15], b[14]);

  a[16] = b[16];
  a[19] = b[19];
  a[20] = b[20];
  a[23] = b[23];
  a[24] = b[24];
  a[27] = b[27];
  a[28] = b[28];
  a[31] = b[31];

  butterfly_two_coeff(b[30], b[17], cospi_4_64, cospi_28_64, &a[30], &a[17]);
  butterfly_two_coeff(b[29], b[18], cospi_28_64, -cospi_4_64, &a[29], &a[18]);

  butterfly_two_coeff(b[26], b[21], cospi_20_64, cospi_12_64, &a[26], &a[21]);
  butterfly_two_coeff(b[25], b[22], cospi_12_64, -cospi_20_64, &a[25], &a[22]);

  // Stage 7.
  b[0] = a[0];
  b[1] = a[1];
  b[2] = a[2];
  b[3] = a[3];
  b[4] = a[4];
  b[5] = a[5];
  b[6] = a[6];
  b[7] = a[7];

  butterfly_two_coeff(a[15], a[8], cospi_2_64, cospi_30_64, &b[8], &b[15]);
  butterfly_two_coeff(a[14], a[9], cospi_18_64, cospi_14_64, &b[9], &b[14]);
  butterfly_two_coeff(a[13], a[10], cospi_10_64, cospi_22_64, &b[10], &b[13]);
  butterfly_two_coeff(a[12], a[11], cospi_26_64, cospi_6_64, &b[11], &b[12]);

  b[16] = vaddq_s16(a[16], a[17]);
  b[17] = vsubq_s16(a[16], a[17]);
  b[18] = vsubq_s16(a[19], a[18]);
  b[19] = vaddq_s16(a[19], a[18]);
  b[20] = vaddq_s16(a[20], a[21]);
  b[21] = vsubq_s16(a[20], a[21]);
  b[22] = vsubq_s16(a[23], a[22]);
  b[23] = vaddq_s16(a[23], a[22]);
  b[24] = vaddq_s16(a[24], a[25]);
  b[25] = vsubq_s16(a[24], a[25]);
  b[26] = vsubq_s16(a[27], a[26]);
  b[27] = vaddq_s16(a[27], a[26]);
  b[28] = vaddq_s16(a[28], a[29]);
  b[29] = vsubq_s16(a[28], a[29]);
  b[30] = vsubq_s16(a[31], a[30]);
  b[31] = vaddq_s16(a[31], a[30]);

  // Final stage.
  // Also compute partial rounding shift:
  // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
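  // A scalar model of sub_round_shift_s16(), per the formula above:
  //   int16_t srs(int16_t x) { return (x + 1 + (x > 0)) >> 2; }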
  out[0] = sub_round_shift_s16(b[0]);
  out[16] = sub_round_shift_s16(b[1]);
  out[8] = sub_round_shift_s16(b[2]);
  out[24] = sub_round_shift_s16(b[3]);
  out[4] = sub_round_shift_s16(b[4]);
  out[20] = sub_round_shift_s16(b[5]);
  out[12] = sub_round_shift_s16(b[6]);
  out[28] = sub_round_shift_s16(b[7]);
  out[2] = sub_round_shift_s16(b[8]);
  out[18] = sub_round_shift_s16(b[9]);
  out[10] = sub_round_shift_s16(b[10]);
  out[26] = sub_round_shift_s16(b[11]);
  out[6] = sub_round_shift_s16(b[12]);
  out[22] = sub_round_shift_s16(b[13]);
  out[14] = sub_round_shift_s16(b[14]);
  out[30] = sub_round_shift_s16(b[15]);

  butterfly_two_coeff(b[31], b[16], cospi_1_64, cospi_31_64, &a[1], &a[31]);
  out[1] = sub_round_shift_s16(a[1]);
  out[31] = sub_round_shift_s16(a[31]);

  butterfly_two_coeff(b[30], b[17], cospi_17_64, cospi_15_64, &a[17], &a[15]);
  out[17] = sub_round_shift_s16(a[17]);
  out[15] = sub_round_shift_s16(a[15]);

  butterfly_two_coeff(b[29], b[18], cospi_9_64, cospi_23_64, &a[9], &a[23]);
  out[9] = sub_round_shift_s16(a[9]);
  out[23] = sub_round_shift_s16(a[23]);

  butterfly_two_coeff(b[28], b[19], cospi_25_64, cospi_7_64, &a[25], &a[7]);
  out[25] = sub_round_shift_s16(a[25]);
  out[7] = sub_round_shift_s16(a[7]);

  butterfly_two_coeff(b[27], b[20], cospi_5_64, cospi_27_64, &a[5], &a[27]);
  out[5] = sub_round_shift_s16(a[5]);
  out[27] = sub_round_shift_s16(a[27]);

  butterfly_two_coeff(b[26], b[21], cospi_21_64, cospi_11_64, &a[21], &a[11]);
  out[21] = sub_round_shift_s16(a[21]);
  out[11] = sub_round_shift_s16(a[11]);

  butterfly_two_coeff(b[25], b[22], cospi_13_64, cospi_19_64, &a[13], &a[19]);
  out[13] = sub_round_shift_s16(a[13]);
  out[19] = sub_round_shift_s16(a[19]);

  butterfly_two_coeff(b[24], b[23], cospi_29_64, cospi_3_64, &a[29], &a[3]);
  out[29] = sub_round_shift_s16(a[29]);
  out[3] = sub_round_shift_s16(a[3]);
}

#define PASS_THROUGH(src, dst, element)    \
  do {                                     \
    dst##_lo[element] = src##_lo[element]; \
    dst##_hi[element] = src##_hi[element]; \
  } while (0)

#define ADD_S16_S32(a, left_index, right_index, b, b_index)                   \
  do {                                                                        \
    b##_lo[b_index] =                                                         \
        vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
    b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]),                 \
                                vget_high_s16(a[right_index]));               \
  } while (0)

#define SUB_S16_S32(a, left_index, right_index, b, b_index)                   \
  do {                                                                        \
    b##_lo[b_index] =                                                         \
        vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
    b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]),                 \
                                vget_high_s16(a[right_index]));               \
  } while (0)

#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index)                     \
  do {                                                                       \
    c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index]));  \
    c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
  } while (0)

#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
  do {                                                                     \
    temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index]));           \
    temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index]));          \
    c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]);   \
    c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]);   \
  } while (0)

#define ADD_S32(a, left_index, right_index, b, b_index)                   \
  do {                                                                    \
    b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
    b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
  } while (0)

#define SUB_S32(a, left_index, right_index, b, b_index)                   \
  do {                                                                    \
    b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
    b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
  } while (0)

#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b,   \
                              add_index, sub_index)                      \
  do {                                                                   \
    butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
                                &b##_lo[add_index], &b##_hi[add_index],  \
                                &b##_lo[sub_index], &b##_hi[sub_index]); \
  } while (0)

#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \
                          sub_index)                                          \
  do {                                                                        \
    butterfly_one_coeff_s32_fast(                                             \
        a##_lo[left_index], a##_hi[left_index], a##_lo[right_index],          \
        a##_hi[right_index], constant, &b##_lo[add_index], &b##_hi[add_index],\
        &b##_lo[sub_index], &b##_hi[sub_index]);                              \
  } while (0)

#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant,        \
                          right_constant, b, add_index, sub_index)          \
  do {                                                                      \
    butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index],         \
                            a##_lo[right_index], a##_hi[right_index],       \
                            left_constant, right_constant, &b##_lo[add_index], \
                            &b##_hi[add_index], &b##_lo[sub_index],         \
                            &b##_hi[sub_index]);                            \
  } while (0)
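// For example, ADD_S16_S32(b, 0, 7, c, 0) widens and adds the 16-bit vectors
// b[0] and b[7] into the 32-bit pair c_lo[0]/c_hi[0], covering lanes 0-3 and
// lanes 4-7 respectively.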

static INLINE void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
  int16x8_t a[32];
  int16x8_t b[32];
  int32x4_t c_lo[32];
  int32x4_t c_hi[32];
  int32x4_t d_lo[32];
  int32x4_t d_hi[32];

  // Stage 1. Done as part of the load for the first pass.
  a[0] = vaddq_s16(in[0], in[31]);
  a[1] = vaddq_s16(in[1], in[30]);
  a[2] = vaddq_s16(in[2], in[29]);
  a[3] = vaddq_s16(in[3], in[28]);
  a[4] = vaddq_s16(in[4], in[27]);
  a[5] = vaddq_s16(in[5], in[26]);
  a[6] = vaddq_s16(in[6], in[25]);
  a[7] = vaddq_s16(in[7], in[24]);
  a[8] = vaddq_s16(in[8], in[23]);
  a[9] = vaddq_s16(in[9], in[22]);
  a[10] = vaddq_s16(in[10], in[21]);
  a[11] = vaddq_s16(in[11], in[20]);
  a[12] = vaddq_s16(in[12], in[19]);
  a[13] = vaddq_s16(in[13], in[18]);
  a[14] = vaddq_s16(in[14], in[17]);
  a[15] = vaddq_s16(in[15], in[16]);
  a[16] = vsubq_s16(in[15], in[16]);
  a[17] = vsubq_s16(in[14], in[17]);
  a[18] = vsubq_s16(in[13], in[18]);
  a[19] = vsubq_s16(in[12], in[19]);
  a[20] = vsubq_s16(in[11], in[20]);
  a[21] = vsubq_s16(in[10], in[21]);
  a[22] = vsubq_s16(in[9], in[22]);
  a[23] = vsubq_s16(in[8], in[23]);
  a[24] = vsubq_s16(in[7], in[24]);
  a[25] = vsubq_s16(in[6], in[25]);
  a[26] = vsubq_s16(in[5], in[26]);
  a[27] = vsubq_s16(in[4], in[27]);
  a[28] = vsubq_s16(in[3], in[28]);
  a[29] = vsubq_s16(in[2], in[29]);
  a[30] = vsubq_s16(in[1], in[30]);
  a[31] = vsubq_s16(in[0], in[31]);

  // Stage 2.
  b[0] = vaddq_s16(a[0], a[15]);
  b[1] = vaddq_s16(a[1], a[14]);
  b[2] = vaddq_s16(a[2], a[13]);
  b[3] = vaddq_s16(a[3], a[12]);
  b[4] = vaddq_s16(a[4], a[11]);
  b[5] = vaddq_s16(a[5], a[10]);
  b[6] = vaddq_s16(a[6], a[9]);
  b[7] = vaddq_s16(a[7], a[8]);

  b[8] = vsubq_s16(a[7], a[8]);
  b[9] = vsubq_s16(a[6], a[9]);
  b[10] = vsubq_s16(a[5], a[10]);
  b[11] = vsubq_s16(a[4], a[11]);
  b[12] = vsubq_s16(a[3], a[12]);
  b[13] = vsubq_s16(a[2], a[13]);
  b[14] = vsubq_s16(a[1], a[14]);
  b[15] = vsubq_s16(a[0], a[15]);

  b[16] = a[16];
  b[17] = a[17];
  b[18] = a[18];
  b[19] = a[19];

  butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
  butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
  butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
  butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);

  b[28] = a[28];
  b[29] = a[29];
  b[30] = a[30];
  b[31] = a[31];

  // Stage 3. With extreme values for input this calculation rolls over
  // int16_t. The sources for b[0] get added multiple times and, through
  // testing, have been shown to overflow starting here.
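  // From here on the data is carried in widened 32-bit lanes: c_lo[]/d_lo[]
  // hold lanes 0-3 and c_hi[]/d_hi[] hold lanes 4-7 of each 8-lane row.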
  ADD_S16_S32(b, 0, 7, c, 0);
  ADD_S16_S32(b, 1, 6, c, 1);
  ADD_S16_S32(b, 2, 5, c, 2);
  ADD_S16_S32(b, 3, 4, c, 3);
  SUB_S16_S32(b, 3, 4, c, 4);
  SUB_S16_S32(b, 2, 5, c, 5);
  SUB_S16_S32(b, 1, 6, c, 6);
  SUB_S16_S32(b, 0, 7, c, 7);

  a[8] = b[8];
  a[9] = b[9];

  BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
  BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);

  a[14] = b[14];
  a[15] = b[15];

  ADD_S16_S32(b, 16, 23, c, 16);
  ADD_S16_S32(b, 17, 22, c, 17);
  ADD_S16_S32(b, 18, 21, c, 18);
  ADD_S16_S32(b, 19, 20, c, 19);
  SUB_S16_S32(b, 19, 20, c, 20);
  SUB_S16_S32(b, 18, 21, c, 21);
  SUB_S16_S32(b, 17, 22, c, 22);
  SUB_S16_S32(b, 16, 23, c, 23);
  SUB_S16_S32(b, 31, 24, c, 24);
  SUB_S16_S32(b, 30, 25, c, 25);
  SUB_S16_S32(b, 29, 26, c, 26);
  SUB_S16_S32(b, 28, 27, c, 27);
  ADD_S16_S32(b, 28, 27, c, 28);
  ADD_S16_S32(b, 29, 26, c, 29);
  ADD_S16_S32(b, 30, 25, c, 30);
  ADD_S16_S32(b, 31, 24, c, 31);

  // Stage 4.
  ADD_S32(c, 0, 3, d, 0);
  ADD_S32(c, 1, 2, d, 1);
  SUB_S32(c, 1, 2, d, 2);
  SUB_S32(c, 0, 3, d, 3);

  PASS_THROUGH(c, d, 4);

  BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);

  PASS_THROUGH(c, d, 7);

  ADDW_S16_S32(c, 11, a, 8, d, 8);
  ADDW_S16_S32(c, 10, a, 9, d, 9);
  SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
  SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
  SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
  SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
  ADDW_S16_S32(c, 13, b, 14, d, 14);
  ADDW_S16_S32(c, 12, b, 15, d, 15);

  PASS_THROUGH(c, d, 16);
  PASS_THROUGH(c, d, 17);

  BUTTERFLY_TWO_S32(c, 29, 18, cospi_8_64, cospi_24_64, d, 29, 18);
  BUTTERFLY_TWO_S32(c, 28, 19, cospi_8_64, cospi_24_64, d, 28, 19);
  BUTTERFLY_TWO_S32(c, 27, 20, cospi_24_64, -cospi_8_64, d, 27, 20);
  BUTTERFLY_TWO_S32(c, 26, 21, cospi_24_64, -cospi_8_64, d, 26, 21);

  PASS_THROUGH(c, d, 22);
  PASS_THROUGH(c, d, 23);
  PASS_THROUGH(c, d, 24);
  PASS_THROUGH(c, d, 25);

  PASS_THROUGH(c, d, 30);
  PASS_THROUGH(c, d, 31);

  // Stage 5.
  BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
  BUTTERFLY_TWO_S32(d, 3, 2, cospi_8_64, cospi_24_64, c, 2, 3);

  ADD_S32(d, 4, 5, c, 4);
  SUB_S32(d, 4, 5, c, 5);
  SUB_S32(d, 7, 6, c, 6);
  ADD_S32(d, 7, 6, c, 7);

  PASS_THROUGH(d, c, 8);

  BUTTERFLY_TWO_S32(d, 14, 9, cospi_8_64, cospi_24_64, c, 14, 9);
  BUTTERFLY_TWO_S32(d, 13, 10, cospi_24_64, -cospi_8_64, c, 13, 10);

  PASS_THROUGH(d, c, 11);
  PASS_THROUGH(d, c, 12);
  PASS_THROUGH(d, c, 15);

  ADD_S32(d, 16, 19, c, 16);
  ADD_S32(d, 17, 18, c, 17);
  SUB_S32(d, 17, 18, c, 18);
  SUB_S32(d, 16, 19, c, 19);
  SUB_S32(d, 23, 20, c, 20);
  SUB_S32(d, 22, 21, c, 21);
  ADD_S32(d, 22, 21, c, 22);
  ADD_S32(d, 23, 20, c, 23);
  ADD_S32(d, 24, 27, c, 24);
  ADD_S32(d, 25, 26, c, 25);
  SUB_S32(d, 25, 26, c, 26);
  SUB_S32(d, 24, 27, c, 27);
  SUB_S32(d, 31, 28, c, 28);
  SUB_S32(d, 30, 29, c, 29);
  ADD_S32(d, 30, 29, c, 30);
  ADD_S32(d, 31, 28, c, 31);

  // Stage 6.
  PASS_THROUGH(c, d, 0);
  PASS_THROUGH(c, d, 1);
  PASS_THROUGH(c, d, 2);
  PASS_THROUGH(c, d, 3);

  BUTTERFLY_TWO_S32(c, 7, 4, cospi_4_64, cospi_28_64, d, 4, 7);
  BUTTERFLY_TWO_S32(c, 6, 5, cospi_20_64, cospi_12_64, d, 5, 6);

  ADD_S32(c, 8, 9, d, 8);
  SUB_S32(c, 8, 9, d, 9);
  SUB_S32(c, 11, 10, d, 10);
  ADD_S32(c, 11, 10, d, 11);
  ADD_S32(c, 12, 13, d, 12);
  SUB_S32(c, 12, 13, d, 13);
  SUB_S32(c, 15, 14, d, 14);
  ADD_S32(c, 15, 14, d, 15);

  PASS_THROUGH(c, d, 16);
  PASS_THROUGH(c, d, 19);
  PASS_THROUGH(c, d, 20);
  PASS_THROUGH(c, d, 23);
  PASS_THROUGH(c, d, 24);
  PASS_THROUGH(c, d, 27);
  PASS_THROUGH(c, d, 28);
  PASS_THROUGH(c, d, 31);

  BUTTERFLY_TWO_S32(c, 30, 17, cospi_4_64, cospi_28_64, d, 30, 17);
  BUTTERFLY_TWO_S32(c, 29, 18, cospi_28_64, -cospi_4_64, d, 29, 18);
  BUTTERFLY_TWO_S32(c, 26, 21, cospi_20_64, cospi_12_64, d, 26, 21);
  BUTTERFLY_TWO_S32(c, 25, 22, cospi_12_64, -cospi_20_64, d, 25, 22);

  // Stage 7.
  PASS_THROUGH(d, c, 0);
  PASS_THROUGH(d, c, 1);
  PASS_THROUGH(d, c, 2);
  PASS_THROUGH(d, c, 3);
  PASS_THROUGH(d, c, 4);
  PASS_THROUGH(d, c, 5);
  PASS_THROUGH(d, c, 6);
  PASS_THROUGH(d, c, 7);

  BUTTERFLY_TWO_S32(d, 15, 8, cospi_2_64, cospi_30_64, c, 8, 15);
  BUTTERFLY_TWO_S32(d, 14, 9, cospi_18_64, cospi_14_64, c, 9, 14);
  BUTTERFLY_TWO_S32(d, 13, 10, cospi_10_64, cospi_22_64, c, 10, 13);
  BUTTERFLY_TWO_S32(d, 12, 11, cospi_26_64, cospi_6_64, c, 11, 12);

  ADD_S32(d, 16, 17, c, 16);
  SUB_S32(d, 16, 17, c, 17);
  SUB_S32(d, 19, 18, c, 18);
  ADD_S32(d, 19, 18, c, 19);
  ADD_S32(d, 20, 21, c, 20);
  SUB_S32(d, 20, 21, c, 21);
  SUB_S32(d, 23, 22, c, 22);
  ADD_S32(d, 23, 22, c, 23);
  ADD_S32(d, 24, 25, c, 24);
  SUB_S32(d, 24, 25, c, 25);
  SUB_S32(d, 27, 26, c, 26);
  ADD_S32(d, 27, 26, c, 27);
  ADD_S32(d, 28, 29, c, 28);
  SUB_S32(d, 28, 29, c, 29);
  SUB_S32(d, 31, 30, c, 30);
  ADD_S32(d, 31, 30, c, 31);

  // Final stage.
  // Roll rounding into this function so we can pass back int16x8.
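  // add_round_shift_s32_narrow() applies the rounding shift in 32 bits and
  // narrows each lo/hi pair back to a single int16x8_t.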

  out[0] = add_round_shift_s32_narrow(c_lo[0], c_hi[0]);
  out[16] = add_round_shift_s32_narrow(c_lo[1], c_hi[1]);

  out[8] = add_round_shift_s32_narrow(c_lo[2], c_hi[2]);
  out[24] = add_round_shift_s32_narrow(c_lo[3], c_hi[3]);
  out[4] = add_round_shift_s32_narrow(c_lo[4], c_hi[4]);
  out[20] = add_round_shift_s32_narrow(c_lo[5], c_hi[5]);
  out[12] = add_round_shift_s32_narrow(c_lo[6], c_hi[6]);

  out[28] = add_round_shift_s32_narrow(c_lo[7], c_hi[7]);
  out[2] = add_round_shift_s32_narrow(c_lo[8], c_hi[8]);
  out[18] = add_round_shift_s32_narrow(c_lo[9], c_hi[9]);
  out[10] = add_round_shift_s32_narrow(c_lo[10], c_hi[10]);

  out[26] = add_round_shift_s32_narrow(c_lo[11], c_hi[11]);
  out[6] = add_round_shift_s32_narrow(c_lo[12], c_hi[12]);
  out[22] = add_round_shift_s32_narrow(c_lo[13], c_hi[13]);
  out[14] = add_round_shift_s32_narrow(c_lo[14], c_hi[14]);
  out[30] = add_round_shift_s32_narrow(c_lo[15], c_hi[15]);

  BUTTERFLY_TWO_S32(c, 31, 16, cospi_1_64, cospi_31_64, d, 1, 31);
  out[1] = add_round_shift_s32_narrow(d_lo[1], d_hi[1]);
  out[31] = add_round_shift_s32_narrow(d_lo[31], d_hi[31]);

  BUTTERFLY_TWO_S32(c, 30, 17, cospi_17_64, cospi_15_64, d, 17, 15);
  out[17] = add_round_shift_s32_narrow(d_lo[17], d_hi[17]);
  out[15] = add_round_shift_s32_narrow(d_lo[15], d_hi[15]);

  BUTTERFLY_TWO_S32(c, 29, 18, cospi_9_64, cospi_23_64, d, 9, 23);
  out[9] = add_round_shift_s32_narrow(d_lo[9], d_hi[9]);
  out[23] = add_round_shift_s32_narrow(d_lo[23], d_hi[23]);

  BUTTERFLY_TWO_S32(c, 28, 19, cospi_25_64, cospi_7_64, d, 25, 7);
  out[25] = add_round_shift_s32_narrow(d_lo[25], d_hi[25]);
  out[7] = add_round_shift_s32_narrow(d_lo[7], d_hi[7]);

  BUTTERFLY_TWO_S32(c, 27, 20, cospi_5_64, cospi_27_64, d, 5, 27);
  out[5] = add_round_shift_s32_narrow(d_lo[5], d_hi[5]);
  out[27] = add_round_shift_s32_narrow(d_lo[27], d_hi[27]);

  BUTTERFLY_TWO_S32(c, 26, 21, cospi_21_64, cospi_11_64, d, 21, 11);
  out[21] = add_round_shift_s32_narrow(d_lo[21], d_hi[21]);
  out[11] = add_round_shift_s32_narrow(d_lo[11], d_hi[11]);

  BUTTERFLY_TWO_S32(c, 25, 22, cospi_13_64, cospi_19_64, d, 13, 19);
  out[13] = add_round_shift_s32_narrow(d_lo[13], d_hi[13]);
  out[19] = add_round_shift_s32_narrow(d_lo[19], d_hi[19]);

  BUTTERFLY_TWO_S32(c, 24, 23, cospi_29_64, cospi_3_64, d, 29, 3);
  out[29] = add_round_shift_s32_narrow(d_lo[29], d_hi[29]);
  out[3] = add_round_shift_s32_narrow(d_lo[3], d_hi[3]);
}

static INLINE void dct_body_second_pass_rd(const int16x8_t *in,
                                           int16x8_t *out) {
  int16x8_t a[32];
  int16x8_t b[32];

  // Stage 1. Done as part of the load for the first pass.
  a[0] = vaddq_s16(in[0], in[31]);
  a[1] = vaddq_s16(in[1], in[30]);
  a[2] = vaddq_s16(in[2], in[29]);
  a[3] = vaddq_s16(in[3], in[28]);
  a[4] = vaddq_s16(in[4], in[27]);
  a[5] = vaddq_s16(in[5], in[26]);
  a[6] = vaddq_s16(in[6], in[25]);
  a[7] = vaddq_s16(in[7], in[24]);
  a[8] = vaddq_s16(in[8], in[23]);
  a[9] = vaddq_s16(in[9], in[22]);
  a[10] = vaddq_s16(in[10], in[21]);
  a[11] = vaddq_s16(in[11], in[20]);
  a[12] = vaddq_s16(in[12], in[19]);
  a[13] = vaddq_s16(in[13], in[18]);
  a[14] = vaddq_s16(in[14], in[17]);
  a[15] = vaddq_s16(in[15], in[16]);
  a[16] = vsubq_s16(in[15], in[16]);
  a[17] = vsubq_s16(in[14], in[17]);
  a[18] = vsubq_s16(in[13], in[18]);
  a[19] = vsubq_s16(in[12], in[19]);
  a[20] = vsubq_s16(in[11], in[20]);
  a[21] = vsubq_s16(in[10], in[21]);
  a[22] = vsubq_s16(in[9], in[22]);
  a[23] = vsubq_s16(in[8], in[23]);
  a[24] = vsubq_s16(in[7], in[24]);
  a[25] = vsubq_s16(in[6], in[25]);
  a[26] = vsubq_s16(in[5], in[26]);
  a[27] = vsubq_s16(in[4], in[27]);
  a[28] = vsubq_s16(in[3], in[28]);
  a[29] = vsubq_s16(in[2], in[29]);
  a[30] = vsubq_s16(in[1], in[30]);
  a[31] = vsubq_s16(in[0], in[31]);

  // Stage 2.
  // For the "rd" version, all the values are rounded down after stage 2 to
  // keep the values in 16 bits.
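  // (add_round_shift_s16() corresponds to half_round_shift() in the C
  // reference fdct32, which the rd path enables via its round argument.)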
  b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15]));
  b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14]));
  b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13]));
  b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12]));
  b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11]));
  b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10]));
  b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9]));
  b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8]));

  b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8]));
  b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9]));
  b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10]));
  b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11]));
  b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12]));
  b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13]));
  b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14]));
  b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15]));

  b[16] = add_round_shift_s16(a[16]);
  b[17] = add_round_shift_s16(a[17]);
  b[18] = add_round_shift_s16(a[18]);
  b[19] = add_round_shift_s16(a[19]);

  butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
  butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
  butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
  butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);
  b[20] = add_round_shift_s16(b[20]);
  b[21] = add_round_shift_s16(b[21]);
  b[22] = add_round_shift_s16(b[22]);
  b[23] = add_round_shift_s16(b[23]);
  b[24] = add_round_shift_s16(b[24]);
  b[25] = add_round_shift_s16(b[25]);
  b[26] = add_round_shift_s16(b[26]);
  b[27] = add_round_shift_s16(b[27]);

  b[28] = add_round_shift_s16(a[28]);
  b[29] = add_round_shift_s16(a[29]);
  b[30] = add_round_shift_s16(a[30]);
  b[31] = add_round_shift_s16(a[31]);

  // Stage 3.
  a[0] = vaddq_s16(b[0], b[7]);
  a[1] = vaddq_s16(b[1], b[6]);
  a[2] = vaddq_s16(b[2], b[5]);
  a[3] = vaddq_s16(b[3], b[4]);

  a[4] = vsubq_s16(b[3], b[4]);
  a[5] = vsubq_s16(b[2], b[5]);
  a[6] = vsubq_s16(b[1], b[6]);
  a[7] = vsubq_s16(b[0], b[7]);

  a[8] = b[8];
  a[9] = b[9];

  butterfly_one_coeff_s16_s32_narrow(b[13], b[10], cospi_16_64, &a[13], &a[10]);
  butterfly_one_coeff_s16_s32_narrow(b[12], b[11], cospi_16_64, &a[12], &a[11]);

  a[14] = b[14];
  a[15] = b[15];

  a[16] = vaddq_s16(b[16], b[23]);
  a[17] = vaddq_s16(b[17], b[22]);
  a[18] = vaddq_s16(b[18], b[21]);
  a[19] = vaddq_s16(b[19], b[20]);

  a[20] = vsubq_s16(b[19], b[20]);
  a[21] = vsubq_s16(b[18], b[21]);
  a[22] = vsubq_s16(b[17], b[22]);
  a[23] = vsubq_s16(b[16], b[23]);

  a[24] = vsubq_s16(b[31], b[24]);
  a[25] = vsubq_s16(b[30], b[25]);
  a[26] = vsubq_s16(b[29], b[26]);
  a[27] = vsubq_s16(b[28], b[27]);

  a[28] = vaddq_s16(b[28], b[27]);
  a[29] = vaddq_s16(b[29], b[26]);
  a[30] = vaddq_s16(b[30], b[25]);
  a[31] = vaddq_s16(b[31], b[24]);

  // Stage 4.
  b[0] = vaddq_s16(a[0], a[3]);
  b[1] = vaddq_s16(a[1], a[2]);
  b[2] = vsubq_s16(a[1], a[2]);
  b[3] = vsubq_s16(a[0], a[3]);

  b[4] = a[4];

  butterfly_one_coeff_s16_s32_narrow(a[6], a[5], cospi_16_64, &b[6], &b[5]);

  b[7] = a[7];

  b[8] = vaddq_s16(a[8], a[11]);
  b[9] = vaddq_s16(a[9], a[10]);
  b[10] = vsubq_s16(a[9], a[10]);
  b[11] = vsubq_s16(a[8], a[11]);
  b[12] = vsubq_s16(a[15], a[12]);
  b[13] = vsubq_s16(a[14], a[13]);
  b[14] = vaddq_s16(a[14], a[13]);
  b[15] = vaddq_s16(a[15], a[12]);

  b[16] = a[16];
  b[17] = a[17];

  butterfly_two_coeff(a[29], a[18], cospi_8_64, cospi_24_64, &b[29], &b[18]);
  butterfly_two_coeff(a[28], a[19], cospi_8_64, cospi_24_64, &b[28], &b[19]);
  butterfly_two_coeff(a[27], a[20], cospi_24_64, -cospi_8_64, &b[27], &b[20]);
  butterfly_two_coeff(a[26], a[21], cospi_24_64, -cospi_8_64, &b[26], &b[21]);

  b[22] = a[22];
  b[23] = a[23];
  b[24] = a[24];
  b[25] = a[25];

  b[30] = a[30];
  b[31] = a[31];

  // Stage 5.
  butterfly_one_coeff_s16_s32_narrow(b[0], b[1], cospi_16_64, &a[0], &a[1]);
  butterfly_two_coeff(b[3], b[2], cospi_8_64, cospi_24_64, &a[2], &a[3]);

  a[4] = vaddq_s16(b[4], b[5]);
  a[5] = vsubq_s16(b[4], b[5]);
  a[6] = vsubq_s16(b[7], b[6]);
  a[7] = vaddq_s16(b[7], b[6]);

  a[8] = b[8];

  butterfly_two_coeff(b[14], b[9], cospi_8_64, cospi_24_64, &a[14], &a[9]);
  butterfly_two_coeff(b[13], b[10], cospi_24_64, -cospi_8_64, &a[13], &a[10]);

  a[11] = b[11];
  a[12] = b[12];

  a[15] = b[15];

  a[16] = vaddq_s16(b[19], b[16]);
  a[17] = vaddq_s16(b[18], b[17]);
  a[18] = vsubq_s16(b[17], b[18]);
  a[19] = vsubq_s16(b[16], b[19]);
  a[20] = vsubq_s16(b[23], b[20]);
  a[21] = vsubq_s16(b[22], b[21]);
  a[22] = vaddq_s16(b[21], b[22]);
  a[23] = vaddq_s16(b[20], b[23]);
  a[24] = vaddq_s16(b[27], b[24]);
  a[25] = vaddq_s16(b[26], b[25]);
  a[26] = vsubq_s16(b[25], b[26]);
  a[27] = vsubq_s16(b[24], b[27]);
  a[28] = vsubq_s16(b[31], b[28]);
  a[29] = vsubq_s16(b[30], b[29]);
  a[30] = vaddq_s16(b[29], b[30]);
  a[31] = vaddq_s16(b[28], b[31]);

  // Stage 6.
  b[0] = a[0];
  b[1] = a[1];
  b[2] = a[2];
  b[3] = a[3];

  butterfly_two_coeff(a[7], a[4], cospi_4_64, cospi_28_64, &b[4], &b[7]);
  butterfly_two_coeff(a[6], a[5], cospi_20_64, cospi_12_64, &b[5], &b[6]);

  b[8] = vaddq_s16(a[8], a[9]);
  b[9] = vsubq_s16(a[8], a[9]);
  b[10] = vsubq_s16(a[11], a[10]);
  b[11] = vaddq_s16(a[11], a[10]);
  b[12] = vaddq_s16(a[12], a[13]);
  b[13] = vsubq_s16(a[12], a[13]);
  b[14] = vsubq_s16(a[15], a[14]);
  b[15] = vaddq_s16(a[15], a[14]);

  b[16] = a[16];
  b[19] = a[19];
  b[20] = a[20];
  b[23] = a[23];
  b[24] = a[24];
  b[27] = a[27];
  b[28] = a[28];
  b[31] = a[31];

  butterfly_two_coeff(a[30], a[17], cospi_4_64, cospi_28_64, &b[30], &b[17]);
  butterfly_two_coeff(a[29], a[18], cospi_28_64, -cospi_4_64, &b[29], &b[18]);

  butterfly_two_coeff(a[26], a[21], cospi_20_64, cospi_12_64, &b[26], &b[21]);
  butterfly_two_coeff(a[25], a[22], cospi_12_64, -cospi_20_64, &b[25], &b[22]);

  // Stage 7.
  a[0] = b[0];
  a[1] = b[1];
  a[2] = b[2];
  a[3] = b[3];
  a[4] = b[4];
  a[5] = b[5];
  a[6] = b[6];
  a[7] = b[7];

  butterfly_two_coeff(b[15], b[8], cospi_2_64, cospi_30_64, &a[8], &a[15]);
  butterfly_two_coeff(b[14], b[9], cospi_18_64, cospi_14_64, &a[9], &a[14]);
  butterfly_two_coeff(b[13], b[10], cospi_10_64, cospi_22_64, &a[10], &a[13]);
  butterfly_two_coeff(b[12], b[11], cospi_26_64, cospi_6_64, &a[11], &a[12]);

  a[16] = vaddq_s16(b[16], b[17]);
  a[17] = vsubq_s16(b[16], b[17]);
  a[18] = vsubq_s16(b[19], b[18]);
  a[19] = vaddq_s16(b[19], b[18]);
  a[20] = vaddq_s16(b[20], b[21]);
  a[21] = vsubq_s16(b[20], b[21]);
  a[22] = vsubq_s16(b[23], b[22]);
  a[23] = vaddq_s16(b[23], b[22]);
  a[24] = vaddq_s16(b[24], b[25]);
  a[25] = vsubq_s16(b[24], b[25]);
  a[26] = vsubq_s16(b[27], b[26]);
  a[27] = vaddq_s16(b[27], b[26]);
  a[28] = vaddq_s16(b[28], b[29]);
  a[29] = vsubq_s16(b[28], b[29]);
  a[30] = vsubq_s16(b[31], b[30]);
  a[31] = vaddq_s16(b[31], b[30]);

  // Final stage.
  out[0] = a[0];
  out[16] = a[1];
  out[8] = a[2];
  out[24] = a[3];
  out[4] = a[4];
  out[20] = a[5];
  out[12] = a[6];
  out[28] = a[7];
  out[2] = a[8];
  out[18] = a[9];
  out[10] = a[10];
  out[26] = a[11];
  out[6] = a[12];
  out[22] = a[13];
  out[14] = a[14];
  out[30] = a[15];

  butterfly_two_coeff(a[31], a[16], cospi_1_64, cospi_31_64, &out[1], &out[31]);
  butterfly_two_coeff(a[30], a[17], cospi_17_64, cospi_15_64, &out[17],
                      &out[15]);
  butterfly_two_coeff(a[29], a[18], cospi_9_64, cospi_23_64, &out[9], &out[23]);
  butterfly_two_coeff(a[28], a[19], cospi_25_64, cospi_7_64, &out[25], &out[7]);
  butterfly_two_coeff(a[27], a[20], cospi_5_64, cospi_27_64, &out[5], &out[27]);
  butterfly_two_coeff(a[26], a[21], cospi_21_64, cospi_11_64, &out[21],
                      &out[11]);
  butterfly_two_coeff(a[25], a[22], cospi_13_64, cospi_19_64, &out[13],
                      &out[19]);
  butterfly_two_coeff(a[24], a[23], cospi_29_64, cospi_3_64, &out[29], &out[3]);
}

#undef PASS_THROUGH
#undef ADD_S16_S32
#undef SUB_S16_S32
#undef ADDW_S16_S32
#undef SUBW_S16_S32
#undef ADD_S32
#undef SUB_S32
#undef BUTTERFLY_ONE_S16_S32
#undef BUTTERFLY_ONE_S32
#undef BUTTERFLY_TWO_S32

#if CONFIG_VP9_HIGHBITDEPTH

// Store 32 32x4 vectors, assuming stride == 32.
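// Each row is written as four lo/hi vector pairs: l1/r1 cover columns 0-7,
// l2/r2 columns 8-15, l3/r3 columns 16-23 and l4/r4 columns 24-31.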
static INLINE void store32x32_s32(
    tran_low_t *a, const int32x4_t *l1 /*[32]*/, const int32x4_t *r1 /*[32]*/,
    const int32x4_t *l2 /*[32]*/, const int32x4_t *r2 /*[32]*/,
    const int32x4_t *l3 /*[32]*/, const int32x4_t *r3 /*[32]*/,
    const int32x4_t *l4 /*[32]*/, const int32x4_t *r4 /*[32]*/) {
  int i;
  for (i = 0; i < 32; i++) {
    vst1q_s32(a, l1[i]);
    vst1q_s32(a + 4, r1[i]);
    vst1q_s32(a + 8, l2[i]);
    vst1q_s32(a + 12, r2[i]);
    vst1q_s32(a + 16, l3[i]);
    vst1q_s32(a + 20, r3[i]);
    vst1q_s32(a + 24, l4[i]);
    vst1q_s32(a + 28, r4[i]);
    a += 32;
  }
}

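// Widen each 8-lane input row into low (lanes 0-3) and high (lanes 4-7)
// int32x4 halves and apply the same << 2 input scaling as the non-highbd
// path.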
static INLINE void highbd_scale_input(const int16x8_t *a /*[32]*/,
                                      int32x4_t *left /*[32]*/,
                                      int32x4_t *right /*[32]*/) {
  left[0] = vshll_n_s16(vget_low_s16(a[0]), 2);
  left[1] = vshll_n_s16(vget_low_s16(a[1]), 2);
  left[2] = vshll_n_s16(vget_low_s16(a[2]), 2);
  left[3] = vshll_n_s16(vget_low_s16(a[3]), 2);
  left[4] = vshll_n_s16(vget_low_s16(a[4]), 2);
  left[5] = vshll_n_s16(vget_low_s16(a[5]), 2);
  left[6] = vshll_n_s16(vget_low_s16(a[6]), 2);
  left[7] = vshll_n_s16(vget_low_s16(a[7]), 2);
  left[8] = vshll_n_s16(vget_low_s16(a[8]), 2);
  left[9] = vshll_n_s16(vget_low_s16(a[9]), 2);
  left[10] = vshll_n_s16(vget_low_s16(a[10]), 2);
  left[11] = vshll_n_s16(vget_low_s16(a[11]), 2);
  left[12] = vshll_n_s16(vget_low_s16(a[12]), 2);
  left[13] = vshll_n_s16(vget_low_s16(a[13]), 2);
  left[14] = vshll_n_s16(vget_low_s16(a[14]), 2);
  left[15] = vshll_n_s16(vget_low_s16(a[15]), 2);
  left[16] = vshll_n_s16(vget_low_s16(a[16]), 2);
  left[17] = vshll_n_s16(vget_low_s16(a[17]), 2);
  left[18] = vshll_n_s16(vget_low_s16(a[18]), 2);
  left[19] = vshll_n_s16(vget_low_s16(a[19]), 2);
  left[20] = vshll_n_s16(vget_low_s16(a[20]), 2);
  left[21] = vshll_n_s16(vget_low_s16(a[21]), 2);
  left[22] = vshll_n_s16(vget_low_s16(a[22]), 2);
  left[23] = vshll_n_s16(vget_low_s16(a[23]), 2);
  left[24] = vshll_n_s16(vget_low_s16(a[24]), 2);
  left[25] = vshll_n_s16(vget_low_s16(a[25]), 2);
  left[26] = vshll_n_s16(vget_low_s16(a[26]), 2);
  left[27] = vshll_n_s16(vget_low_s16(a[27]), 2);
  left[28] = vshll_n_s16(vget_low_s16(a[28]), 2);
  left[29] = vshll_n_s16(vget_low_s16(a[29]), 2);
  left[30] = vshll_n_s16(vget_low_s16(a[30]), 2);
  left[31] = vshll_n_s16(vget_low_s16(a[31]), 2);

  right[0] = vshll_n_s16(vget_high_s16(a[0]), 2);
  right[1] = vshll_n_s16(vget_high_s16(a[1]), 2);
  right[2] = vshll_n_s16(vget_high_s16(a[2]), 2);
  right[3] = vshll_n_s16(vget_high_s16(a[3]), 2);
  right[4] = vshll_n_s16(vget_high_s16(a[4]), 2);
  right[5] = vshll_n_s16(vget_high_s16(a[5]), 2);
  right[6] = vshll_n_s16(vget_high_s16(a[6]), 2);
  right[7] = vshll_n_s16(vget_high_s16(a[7]), 2);
  right[8] = vshll_n_s16(vget_high_s16(a[8]), 2);
  right[9] = vshll_n_s16(vget_high_s16(a[9]), 2);
  right[10] = vshll_n_s16(vget_high_s16(a[10]), 2);
  right[11] = vshll_n_s16(vget_high_s16(a[11]), 2);
  right[12] = vshll_n_s16(vget_high_s16(a[12]), 2);
  right[13] = vshll_n_s16(vget_high_s16(a[13]), 2);
  right[14] = vshll_n_s16(vget_high_s16(a[14]), 2);
  right[15] = vshll_n_s16(vget_high_s16(a[15]), 2);
  right[16] = vshll_n_s16(vget_high_s16(a[16]), 2);
  right[17] = vshll_n_s16(vget_high_s16(a[17]), 2);
  right[18] = vshll_n_s16(vget_high_s16(a[18]), 2);
  right[19] = vshll_n_s16(vget_high_s16(a[19]), 2);
  right[20] = vshll_n_s16(vget_high_s16(a[20]), 2);
  right[21] = vshll_n_s16(vget_high_s16(a[21]), 2);
  right[22] = vshll_n_s16(vget_high_s16(a[22]), 2);
  right[23] = vshll_n_s16(vget_high_s16(a[23]), 2);
  right[24] = vshll_n_s16(vget_high_s16(a[24]), 2);
  right[25] = vshll_n_s16(vget_high_s16(a[25]), 2);
  right[26] = vshll_n_s16(vget_high_s16(a[26]), 2);
  right[27] = vshll_n_s16(vget_high_s16(a[27]), 2);
  right[28] = vshll_n_s16(vget_high_s16(a[28]), 2);
  right[29] = vshll_n_s16(vget_high_s16(a[29]), 2);
  right[30] = vshll_n_s16(vget_high_s16(a[30]), 2);
  right[31] = vshll_n_s16(vget_high_s16(a[31]), 2);
}

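// 32-bit counterpart of the stage-1 cross in load_cross(): b[i] = a[i] +
// a[31 - i] and b[31 - i] = a[i] - a[31 - i], computed separately for the
// left and right halves of each row.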
static INLINE void highbd_cross_input(const int32x4_t *a_left /*[32]*/,
                                      int32x4_t *a_right /*[32]*/,
                                      int32x4_t *b_left /*[32]*/,
                                      int32x4_t *b_right /*[32]*/) {
  // Stage 1. Done as part of the load for the first pass.
  b_left[0] = vaddq_s32(a_left[0], a_left[31]);
  b_left[1] = vaddq_s32(a_left[1], a_left[30]);
  b_left[2] = vaddq_s32(a_left[2], a_left[29]);
  b_left[3] = vaddq_s32(a_left[3], a_left[28]);
  b_left[4] = vaddq_s32(a_left[4], a_left[27]);
  b_left[5] = vaddq_s32(a_left[5], a_left[26]);
  b_left[6] = vaddq_s32(a_left[6], a_left[25]);
  b_left[7] = vaddq_s32(a_left[7], a_left[24]);
  b_left[8] = vaddq_s32(a_left[8], a_left[23]);
  b_left[9] = vaddq_s32(a_left[9], a_left[22]);
  b_left[10] = vaddq_s32(a_left[10], a_left[21]);
  b_left[11] = vaddq_s32(a_left[11], a_left[20]);
  b_left[12] = vaddq_s32(a_left[12], a_left[19]);
  b_left[13] = vaddq_s32(a_left[13], a_left[18]);
  b_left[14] = vaddq_s32(a_left[14], a_left[17]);
  b_left[15] = vaddq_s32(a_left[15], a_left[16]);

  b_right[0] = vaddq_s32(a_right[0], a_right[31]);
  b_right[1] = vaddq_s32(a_right[1], a_right[30]);
  b_right[2] = vaddq_s32(a_right[2], a_right[29]);
  b_right[3] = vaddq_s32(a_right[3], a_right[28]);
  b_right[4] = vaddq_s32(a_right[4], a_right[27]);
  b_right[5] = vaddq_s32(a_right[5], a_right[26]);
  b_right[6] = vaddq_s32(a_right[6], a_right[25]);
  b_right[7] = vaddq_s32(a_right[7], a_right[24]);
  b_right[8] = vaddq_s32(a_right[8], a_right[23]);
  b_right[9] = vaddq_s32(a_right[9], a_right[22]);
  b_right[10] = vaddq_s32(a_right[10], a_right[21]);
  b_right[11] = vaddq_s32(a_right[11], a_right[20]);
  b_right[12] = vaddq_s32(a_right[12], a_right[19]);
  b_right[13] = vaddq_s32(a_right[13], a_right[18]);
  b_right[14] = vaddq_s32(a_right[14], a_right[17]);
  b_right[15] = vaddq_s32(a_right[15], a_right[16]);

  b_left[16] = vsubq_s32(a_left[15], a_left[16]);
  b_left[17] = vsubq_s32(a_left[14], a_left[17]);
  b_left[18] = vsubq_s32(a_left[13], a_left[18]);
  b_left[19] = vsubq_s32(a_left[12], a_left[19]);
  b_left[20] = vsubq_s32(a_left[11], a_left[20]);
  b_left[21] = vsubq_s32(a_left[10], a_left[21]);
  b_left[22] = vsubq_s32(a_left[9], a_left[22]);
  b_left[23] = vsubq_s32(a_left[8], a_left[23]);
  b_left[24] = vsubq_s32(a_left[7], a_left[24]);
  b_left[25] = vsubq_s32(a_left[6], a_left[25]);
  b_left[26] = vsubq_s32(a_left[5], a_left[26]);
  b_left[27] = vsubq_s32(a_left[4], a_left[27]);
  b_left[28] = vsubq_s32(a_left[3], a_left[28]);
  b_left[29] = vsubq_s32(a_left[2], a_left[29]);
  b_left[30] = vsubq_s32(a_left[1], a_left[30]);
  b_left[31] = vsubq_s32(a_left[0], a_left[31]);

  b_right[16] = vsubq_s32(a_right[15], a_right[16]);
  b_right[17] = vsubq_s32(a_right[14], a_right[17]);
  b_right[18] = vsubq_s32(a_right[13], a_right[18]);
  b_right[19] = vsubq_s32(a_right[12], a_right[19]);
  b_right[20] = vsubq_s32(a_right[11], a_right[20]);
  b_right[21] = vsubq_s32(a_right[10], a_right[21]);
  b_right[22] = vsubq_s32(a_right[9], a_right[22]);
  b_right[23] = vsubq_s32(a_right[8], a_right[23]);
  b_right[24] = vsubq_s32(a_right[7], a_right[24]);
  b_right[25] = vsubq_s32(a_right[6], a_right[25]);
  b_right[26] = vsubq_s32(a_right[5], a_right[26]);
  b_right[27] = vsubq_s32(a_right[4], a_right[27]);
  b_right[28] = vsubq_s32(a_right[3], a_right[28]);
  b_right[29] = vsubq_s32(a_right[2], a_right[29]);
  b_right[30] = vsubq_s32(a_right[1], a_right[30]);
  b_right[31] = vsubq_s32(a_right[0], a_right[31]);
}

static INLINE void highbd_partial_add_round_shift(int32x4_t *left /*[32]*/,
                                                  int32x4_t *right /*[32]*/) {
  // Also compute partial rounding shift:
  // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;

  left[0] = add_round_shift_s32(left[0]);
  left[1] = add_round_shift_s32(left[1]);
  left[2] = add_round_shift_s32(left[2]);
  left[3] = add_round_shift_s32(left[3]);
  left[4] = add_round_shift_s32(left[4]);
  left[5] = add_round_shift_s32(left[5]);
  left[6] = add_round_shift_s32(left[6]);
  left[7] = add_round_shift_s32(left[7]);
  left[8] = add_round_shift_s32(left[8]);
  left[9] = add_round_shift_s32(left[9]);
  left[10] = add_round_shift_s32(left[10]);
  left[11] = add_round_shift_s32(left[11]);
  left[12] = add_round_shift_s32(left[12]);
  left[13] = add_round_shift_s32(left[13]);
  left[14] = add_round_shift_s32(left[14]);
  left[15] = add_round_shift_s32(left[15]);
  left[16] = add_round_shift_s32(left[16]);
  left[17] = add_round_shift_s32(left[17]);
  left[18] = add_round_shift_s32(left[18]);
  left[19] = add_round_shift_s32(left[19]);
  left[20] = add_round_shift_s32(left[20]);
  left[21] = add_round_shift_s32(left[21]);
  left[22] = add_round_shift_s32(left[22]);
  left[23] = add_round_shift_s32(left[23]);
  left[24] = add_round_shift_s32(left[24]);
  left[25] = add_round_shift_s32(left[25]);
  left[26] = add_round_shift_s32(left[26]);
  left[27] = add_round_shift_s32(left[27]);
  left[28] = add_round_shift_s32(left[28]);
  left[29] = add_round_shift_s32(left[29]);
  left[30] = add_round_shift_s32(left[30]);
  left[31] = add_round_shift_s32(left[31]);

  right[0] = add_round_shift_s32(right[0]);
  right[1] = add_round_shift_s32(right[1]);
  right[2] = add_round_shift_s32(right[2]);
  right[3] = add_round_shift_s32(right[3]);
  right[4] = add_round_shift_s32(right[4]);
  right[5] = add_round_shift_s32(right[5]);
  right[6] = add_round_shift_s32(right[6]);
  right[7] = add_round_shift_s32(right[7]);
  right[8] = add_round_shift_s32(right[8]);
  right[9] = add_round_shift_s32(right[9]);
  right[10] = add_round_shift_s32(right[10]);
  right[11] = add_round_shift_s32(right[11]);
  right[12] = add_round_shift_s32(right[12]);
  right[13] = add_round_shift_s32(right[13]);
  right[14] = add_round_shift_s32(right[14]);
  right[15] = add_round_shift_s32(right[15]);
  right[16] = add_round_shift_s32(right[16]);
  right[17] = add_round_shift_s32(right[17]);
  right[18] = add_round_shift_s32(right[18]);
  right[19] = add_round_shift_s32(right[19]);
  right[20] = add_round_shift_s32(right[20]);
  right[21] = add_round_shift_s32(right[21]);
  right[22] = add_round_shift_s32(right[22]);
  right[23] = add_round_shift_s32(right[23]);
  right[24] = add_round_shift_s32(right[24]);
  right[25] = add_round_shift_s32(right[25]);
  right[26] = add_round_shift_s32(right[26]);
  right[27] = add_round_shift_s32(right[27]);
  right[28] = add_round_shift_s32(right[28]);
  right[29] = add_round_shift_s32(right[29]);
  right[30] = add_round_shift_s32(right[30]);
  right[31] = add_round_shift_s32(right[31]);
}

static INLINE void highbd_partial_sub_round_shift(int32x4_t *left /*[32]*/,
                                                  int32x4_t *right /*[32]*/) {
  // Also compute partial rounding shift:
  // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;

  left[0] = sub_round_shift_s32(left[0]);
  left[1] = sub_round_shift_s32(left[1]);
  left[2] = sub_round_shift_s32(left[2]);
  left[3] = sub_round_shift_s32(left[3]);
  left[4] = sub_round_shift_s32(left[4]);
  left[5] = sub_round_shift_s32(left[5]);
  left[6] = sub_round_shift_s32(left[6]);
  left[7] = sub_round_shift_s32(left[7]);
  left[8] = sub_round_shift_s32(left[8]);
  left[9] = sub_round_shift_s32(left[9]);
  left[10] = sub_round_shift_s32(left[10]);
  left[11] = sub_round_shift_s32(left[11]);
  left[12] = sub_round_shift_s32(left[12]);
  left[13] = sub_round_shift_s32(left[13]);
  left[14] = sub_round_shift_s32(left[14]);
  left[15] = sub_round_shift_s32(left[15]);
  left[16] = sub_round_shift_s32(left[16]);
  left[17] = sub_round_shift_s32(left[17]);
  left[18] = sub_round_shift_s32(left[18]);
  left[19] = sub_round_shift_s32(left[19]);
  left[20] = sub_round_shift_s32(left[20]);
  left[21] = sub_round_shift_s32(left[21]);
  left[22] = sub_round_shift_s32(left[22]);
  left[23] = sub_round_shift_s32(left[23]);
  left[24] = sub_round_shift_s32(left[24]);
  left[25] = sub_round_shift_s32(left[25]);
  left[26] = sub_round_shift_s32(left[26]);
  left[27] = sub_round_shift_s32(left[27]);
  left[28] = sub_round_shift_s32(left[28]);
  left[29] = sub_round_shift_s32(left[29]);
  left[30] = sub_round_shift_s32(left[30]);
  left[31] = sub_round_shift_s32(left[31]);

  right[0] = sub_round_shift_s32(right[0]);
  right[1] = sub_round_shift_s32(right[1]);
  right[2] = sub_round_shift_s32(right[2]);
  right[3] = sub_round_shift_s32(right[3]);
  right[4] = sub_round_shift_s32(right[4]);
  right[5] = sub_round_shift_s32(right[5]);
  right[6] = sub_round_shift_s32(right[6]);
  right[7] = sub_round_shift_s32(right[7]);
  right[8] = sub_round_shift_s32(right[8]);
  right[9] = sub_round_shift_s32(right[9]);
  right[10] = sub_round_shift_s32(right[10]);
  right[11] = sub_round_shift_s32(right[11]);
  right[12] = sub_round_shift_s32(right[12]);
  right[13] = sub_round_shift_s32(right[13]);
  right[14] = sub_round_shift_s32(right[14]);
  right[15] = sub_round_shift_s32(right[15]);
  right[16] = sub_round_shift_s32(right[16]);
  right[17] = sub_round_shift_s32(right[17]);
  right[18] = sub_round_shift_s32(right[18]);
  right[19] = sub_round_shift_s32(right[19]);
  right[20] = sub_round_shift_s32(right[20]);
  right[21] = sub_round_shift_s32(right[21]);
  right[22] = sub_round_shift_s32(right[22]);
  right[23] = sub_round_shift_s32(right[23]);
  right[24] = sub_round_shift_s32(right[24]);
  right[25] = sub_round_shift_s32(right[25]);
  right[26] = sub_round_shift_s32(right[26]);
  right[27] = sub_round_shift_s32(right[27]);
  right[28] = sub_round_shift_s32(right[28]);
  right[29] = sub_round_shift_s32(right[29]);
  right[30] = sub_round_shift_s32(right[30]);
  right[31] = sub_round_shift_s32(right[31]);
}

static INLINE void highbd_dct8x32_body_first_pass(int32x4_t *left /*32*/,
                                                  int32x4_t *right /*32*/) {
  int32x4_t al[32], ar[32];
  int32x4_t bl[32], br[32];
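  // al/ar and bl/br mirror the a[]/b[] scratch arrays of the 16-bit path,
  // split into the left (lanes 0-3) and right (lanes 4-7) halves of each row.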
1420
1421 // Stage 1: Done as part of the load.
1422
1423 // Stage 2.
1424 // Mini cross. X the first 16 values and the middle 8 of the second half.
1425 al[0] = vaddq_s32(left[0], left[15]);
1426 ar[0] = vaddq_s32(right[0], right[15]);
1427 al[1] = vaddq_s32(left[1], left[14]);
1428 ar[1] = vaddq_s32(right[1], right[14]);
1429 al[2] = vaddq_s32(left[2], left[13]);
1430 ar[2] = vaddq_s32(right[2], right[13]);
1431 al[3] = vaddq_s32(left[3], left[12]);
1432 ar[3] = vaddq_s32(right[3], right[12]);
1433 al[4] = vaddq_s32(left[4], left[11]);
1434 ar[4] = vaddq_s32(right[4], right[11]);
1435 al[5] = vaddq_s32(left[5], left[10]);
1436 ar[5] = vaddq_s32(right[5], right[10]);
1437 al[6] = vaddq_s32(left[6], left[9]);
1438 ar[6] = vaddq_s32(right[6], right[9]);
1439 al[7] = vaddq_s32(left[7], left[8]);
1440 ar[7] = vaddq_s32(right[7], right[8]);
1441
1442 al[8] = vsubq_s32(left[7], left[8]);
1443 ar[8] = vsubq_s32(right[7], right[8]);
1444 al[9] = vsubq_s32(left[6], left[9]);
1445 ar[9] = vsubq_s32(right[6], right[9]);
1446 al[10] = vsubq_s32(left[5], left[10]);
1447 ar[10] = vsubq_s32(right[5], right[10]);
1448 al[11] = vsubq_s32(left[4], left[11]);
1449 ar[11] = vsubq_s32(right[4], right[11]);
1450 al[12] = vsubq_s32(left[3], left[12]);
1451 ar[12] = vsubq_s32(right[3], right[12]);
1452 al[13] = vsubq_s32(left[2], left[13]);
1453 ar[13] = vsubq_s32(right[2], right[13]);
1454 al[14] = vsubq_s32(left[1], left[14]);
1455 ar[14] = vsubq_s32(right[1], right[14]);
1456 al[15] = vsubq_s32(left[0], left[15]);
1457 ar[15] = vsubq_s32(right[0], right[15]);
1458
1459 al[16] = left[16];
1460 ar[16] = right[16];
1461 al[17] = left[17];
1462 ar[17] = right[17];
1463 al[18] = left[18];
1464 ar[18] = right[18];
1465 al[19] = left[19];
1466 ar[19] = right[19];
1467
1468 butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
1469 cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
1470 butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
1471 cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
1472 butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
1473 cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
1474 butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
1475 cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);

  al[28] = left[28];
  ar[28] = right[28];
  al[29] = left[29];
  ar[29] = right[29];
  al[30] = left[30];
  ar[30] = right[30];
  al[31] = left[31];
  ar[31] = right[31];

  // Stage 3.
  bl[0] = vaddq_s32(al[0], al[7]);
  br[0] = vaddq_s32(ar[0], ar[7]);
  bl[1] = vaddq_s32(al[1], al[6]);
  br[1] = vaddq_s32(ar[1], ar[6]);
  bl[2] = vaddq_s32(al[2], al[5]);
  br[2] = vaddq_s32(ar[2], ar[5]);
  bl[3] = vaddq_s32(al[3], al[4]);
  br[3] = vaddq_s32(ar[3], ar[4]);

  bl[4] = vsubq_s32(al[3], al[4]);
  br[4] = vsubq_s32(ar[3], ar[4]);
  bl[5] = vsubq_s32(al[2], al[5]);
  br[5] = vsubq_s32(ar[2], ar[5]);
  bl[6] = vsubq_s32(al[1], al[6]);
  br[6] = vsubq_s32(ar[1], ar[6]);
  bl[7] = vsubq_s32(al[0], al[7]);
  br[7] = vsubq_s32(ar[0], ar[7]);

  bl[8] = al[8];
  br[8] = ar[8];
  bl[9] = al[9];
  br[9] = ar[9];

  butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
                               &bl[13], &br[13], &bl[10], &br[10]);
  butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
                               &bl[12], &br[12], &bl[11], &br[11]);

  bl[14] = al[14];
  br[14] = ar[14];
  bl[15] = al[15];
  br[15] = ar[15];

  bl[16] = vaddq_s32(left[16], al[23]);
  br[16] = vaddq_s32(right[16], ar[23]);
  bl[17] = vaddq_s32(left[17], al[22]);
  br[17] = vaddq_s32(right[17], ar[22]);
  bl[18] = vaddq_s32(left[18], al[21]);
  br[18] = vaddq_s32(right[18], ar[21]);
  bl[19] = vaddq_s32(left[19], al[20]);
  br[19] = vaddq_s32(right[19], ar[20]);

  bl[20] = vsubq_s32(left[19], al[20]);
  br[20] = vsubq_s32(right[19], ar[20]);
  bl[21] = vsubq_s32(left[18], al[21]);
  br[21] = vsubq_s32(right[18], ar[21]);
  bl[22] = vsubq_s32(left[17], al[22]);
  br[22] = vsubq_s32(right[17], ar[22]);
  bl[23] = vsubq_s32(left[16], al[23]);
  br[23] = vsubq_s32(right[16], ar[23]);

  bl[24] = vsubq_s32(left[31], al[24]);
  br[24] = vsubq_s32(right[31], ar[24]);
  bl[25] = vsubq_s32(left[30], al[25]);
  br[25] = vsubq_s32(right[30], ar[25]);
  bl[26] = vsubq_s32(left[29], al[26]);
  br[26] = vsubq_s32(right[29], ar[26]);
  bl[27] = vsubq_s32(left[28], al[27]);
  br[27] = vsubq_s32(right[28], ar[27]);

  bl[28] = vaddq_s32(left[28], al[27]);
  br[28] = vaddq_s32(right[28], ar[27]);
  bl[29] = vaddq_s32(left[29], al[26]);
  br[29] = vaddq_s32(right[29], ar[26]);
  bl[30] = vaddq_s32(left[30], al[25]);
  br[30] = vaddq_s32(right[30], ar[25]);
  bl[31] = vaddq_s32(left[31], al[24]);
  br[31] = vaddq_s32(right[31], ar[24]);

  // Stage 4.
  al[0] = vaddq_s32(bl[0], bl[3]);
  ar[0] = vaddq_s32(br[0], br[3]);
  al[1] = vaddq_s32(bl[1], bl[2]);
  ar[1] = vaddq_s32(br[1], br[2]);
  al[2] = vsubq_s32(bl[1], bl[2]);
  ar[2] = vsubq_s32(br[1], br[2]);
  al[3] = vsubq_s32(bl[0], bl[3]);
  ar[3] = vsubq_s32(br[0], br[3]);

  al[4] = bl[4];
  ar[4] = br[4];

  butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
                               &ar[6], &al[5], &ar[5]);

  al[7] = bl[7];
  ar[7] = br[7];

  al[8] = vaddq_s32(bl[8], bl[11]);
  ar[8] = vaddq_s32(br[8], br[11]);
  al[9] = vaddq_s32(bl[9], bl[10]);
  ar[9] = vaddq_s32(br[9], br[10]);
  al[10] = vsubq_s32(bl[9], bl[10]);
  ar[10] = vsubq_s32(br[9], br[10]);
  al[11] = vsubq_s32(bl[8], bl[11]);
  ar[11] = vsubq_s32(br[8], br[11]);
  al[12] = vsubq_s32(bl[15], bl[12]);
  ar[12] = vsubq_s32(br[15], br[12]);
  al[13] = vsubq_s32(bl[14], bl[13]);
  ar[13] = vsubq_s32(br[14], br[13]);
  al[14] = vaddq_s32(bl[14], bl[13]);
  ar[14] = vaddq_s32(br[14], br[13]);
  al[15] = vaddq_s32(bl[15], bl[12]);
  ar[15] = vaddq_s32(br[15], br[12]);

  al[16] = bl[16];
  ar[16] = br[16];
  al[17] = bl[17];
  ar[17] = br[17];

  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64,
                                     cospi_24_64, &al[29], &ar[29], &al[18],
                                     &ar[18]);
  butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64,
                                     cospi_24_64, &al[28], &ar[28], &al[19],
                                     &ar[19]);
  butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20],
                                     cospi_24_64, -cospi_8_64, &al[27], &ar[27],
                                     &al[20], &ar[20]);
  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
                                     cospi_24_64, -cospi_8_64, &al[26], &ar[26],
                                     &al[21], &ar[21]);
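
  // butterfly_two_coeff_s32_s64_narrow(a, b, c1, c2, ...) is assumed to
  // compute fdct_round_shift(a * c1 + b * c2) and
  // fdct_round_shift(a * c2 - b * c1) with 64-bit intermediates before
  // narrowing, so the mixed-coefficient butterflies cannot overflow in the
  // high-bitdepth path.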

  al[22] = bl[22];
  ar[22] = br[22];
  al[23] = bl[23];
  ar[23] = br[23];
  al[24] = bl[24];
  ar[24] = br[24];
  al[25] = bl[25];
  ar[25] = br[25];

  al[30] = bl[30];
  ar[30] = br[30];
  al[31] = bl[31];
  ar[31] = br[31];

  // Stage 5.
  butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
                               &br[0], &bl[1], &br[1]);
  butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64,
                                     cospi_24_64, &bl[2], &br[2], &bl[3],
                                     &br[3]);
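
  // bl[0]/bl[1] and bl[2]/bl[3] pass through the remaining stages unchanged
  // and become output rows 0, 16, 8 and 24 in the final stage below.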

  bl[4] = vaddq_s32(al[4], al[5]);
  br[4] = vaddq_s32(ar[4], ar[5]);
  bl[5] = vsubq_s32(al[4], al[5]);
  br[5] = vsubq_s32(ar[4], ar[5]);
  bl[6] = vsubq_s32(al[7], al[6]);
  br[6] = vsubq_s32(ar[7], ar[6]);
  bl[7] = vaddq_s32(al[7], al[6]);
  br[7] = vaddq_s32(ar[7], ar[6]);

  bl[8] = al[8];
  br[8] = ar[8];

  butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64,
                                     cospi_24_64, &bl[14], &br[14], &bl[9],
                                     &br[9]);
  butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
                                     cospi_24_64, -cospi_8_64, &bl[13], &br[13],
                                     &bl[10], &br[10]);

  bl[11] = al[11];
  br[11] = ar[11];
  bl[12] = al[12];
  br[12] = ar[12];

  bl[15] = al[15];
  br[15] = ar[15];

  bl[16] = vaddq_s32(al[19], al[16]);
  br[16] = vaddq_s32(ar[19], ar[16]);
  bl[17] = vaddq_s32(al[18], al[17]);
  br[17] = vaddq_s32(ar[18], ar[17]);
  bl[18] = vsubq_s32(al[17], al[18]);
  br[18] = vsubq_s32(ar[17], ar[18]);
  bl[19] = vsubq_s32(al[16], al[19]);
  br[19] = vsubq_s32(ar[16], ar[19]);
  bl[20] = vsubq_s32(al[23], al[20]);
  br[20] = vsubq_s32(ar[23], ar[20]);
  bl[21] = vsubq_s32(al[22], al[21]);
  br[21] = vsubq_s32(ar[22], ar[21]);
  bl[22] = vaddq_s32(al[21], al[22]);
  br[22] = vaddq_s32(ar[21], ar[22]);
  bl[23] = vaddq_s32(al[20], al[23]);
  br[23] = vaddq_s32(ar[20], ar[23]);
  bl[24] = vaddq_s32(al[27], al[24]);
  br[24] = vaddq_s32(ar[27], ar[24]);
  bl[25] = vaddq_s32(al[26], al[25]);
  br[25] = vaddq_s32(ar[26], ar[25]);
  bl[26] = vsubq_s32(al[25], al[26]);
  br[26] = vsubq_s32(ar[25], ar[26]);
  bl[27] = vsubq_s32(al[24], al[27]);
  br[27] = vsubq_s32(ar[24], ar[27]);
  bl[28] = vsubq_s32(al[31], al[28]);
  br[28] = vsubq_s32(ar[31], ar[28]);
  bl[29] = vsubq_s32(al[30], al[29]);
  br[29] = vsubq_s32(ar[30], ar[29]);
  bl[30] = vaddq_s32(al[29], al[30]);
  br[30] = vaddq_s32(ar[29], ar[30]);
  bl[31] = vaddq_s32(al[28], al[31]);
  br[31] = vaddq_s32(ar[28], ar[31]);

  // Stage 6.
  al[0] = bl[0];
  ar[0] = br[0];
  al[1] = bl[1];
  ar[1] = br[1];
  al[2] = bl[2];
  ar[2] = br[2];
  al[3] = bl[3];
  ar[3] = br[3];

  butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64,
                                     cospi_28_64, &al[4], &ar[4], &al[7],
                                     &ar[7]);
  butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64,
                                     cospi_12_64, &al[5], &ar[5], &al[6],
                                     &ar[6]);

  al[8] = vaddq_s32(bl[8], bl[9]);
  ar[8] = vaddq_s32(br[8], br[9]);
  al[9] = vsubq_s32(bl[8], bl[9]);
  ar[9] = vsubq_s32(br[8], br[9]);
  al[10] = vsubq_s32(bl[11], bl[10]);
  ar[10] = vsubq_s32(br[11], br[10]);
  al[11] = vaddq_s32(bl[11], bl[10]);
  ar[11] = vaddq_s32(br[11], br[10]);
  al[12] = vaddq_s32(bl[12], bl[13]);
  ar[12] = vaddq_s32(br[12], br[13]);
  al[13] = vsubq_s32(bl[12], bl[13]);
  ar[13] = vsubq_s32(br[12], br[13]);
  al[14] = vsubq_s32(bl[15], bl[14]);
  ar[14] = vsubq_s32(br[15], br[14]);
  al[15] = vaddq_s32(bl[15], bl[14]);
  ar[15] = vaddq_s32(br[15], br[14]);

  al[16] = bl[16];
  ar[16] = br[16];
  al[19] = bl[19];
  ar[19] = br[19];
  al[20] = bl[20];
  ar[20] = br[20];
  al[23] = bl[23];
  ar[23] = br[23];
  al[24] = bl[24];
  ar[24] = br[24];
  al[27] = bl[27];
  ar[27] = br[27];
  al[28] = bl[28];
  ar[28] = br[28];
  al[31] = bl[31];
  ar[31] = br[31];

  butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64,
                                     cospi_28_64, &al[30], &ar[30], &al[17],
                                     &ar[17]);
  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18],
                                     cospi_28_64, -cospi_4_64, &al[29], &ar[29],
                                     &al[18], &ar[18]);
  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
                                     cospi_20_64, cospi_12_64, &al[26], &ar[26],
                                     &al[21], &ar[21]);
  butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
                                     cospi_12_64, -cospi_20_64, &al[25],
                                     &ar[25], &al[22], &ar[22]);

  // Stage 7.
  bl[0] = al[0];
  br[0] = ar[0];
  bl[1] = al[1];
  br[1] = ar[1];
  bl[2] = al[2];
  br[2] = ar[2];
  bl[3] = al[3];
  br[3] = ar[3];
  bl[4] = al[4];
  br[4] = ar[4];
  bl[5] = al[5];
  br[5] = ar[5];
  bl[6] = al[6];
  br[6] = ar[6];
  bl[7] = al[7];
  br[7] = ar[7];

  butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64,
                                     cospi_30_64, &bl[8], &br[8], &bl[15],
                                     &br[15]);
  butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64,
                                     cospi_14_64, &bl[9], &br[9], &bl[14],
                                     &br[14]);
  butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
                                     cospi_10_64, cospi_22_64, &bl[10], &br[10],
                                     &bl[13], &br[13]);
  butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11],
                                     cospi_26_64, cospi_6_64, &bl[11], &br[11],
                                     &bl[12], &br[12]);

  bl[16] = vaddq_s32(al[16], al[17]);
  br[16] = vaddq_s32(ar[16], ar[17]);
  bl[17] = vsubq_s32(al[16], al[17]);
  br[17] = vsubq_s32(ar[16], ar[17]);
  bl[18] = vsubq_s32(al[19], al[18]);
  br[18] = vsubq_s32(ar[19], ar[18]);
  bl[19] = vaddq_s32(al[19], al[18]);
  br[19] = vaddq_s32(ar[19], ar[18]);
  bl[20] = vaddq_s32(al[20], al[21]);
  br[20] = vaddq_s32(ar[20], ar[21]);
  bl[21] = vsubq_s32(al[20], al[21]);
  br[21] = vsubq_s32(ar[20], ar[21]);
  bl[22] = vsubq_s32(al[23], al[22]);
  br[22] = vsubq_s32(ar[23], ar[22]);
  bl[23] = vaddq_s32(al[23], al[22]);
  br[23] = vaddq_s32(ar[23], ar[22]);
  bl[24] = vaddq_s32(al[24], al[25]);
  br[24] = vaddq_s32(ar[24], ar[25]);
  bl[25] = vsubq_s32(al[24], al[25]);
  br[25] = vsubq_s32(ar[24], ar[25]);
  bl[26] = vsubq_s32(al[27], al[26]);
  br[26] = vsubq_s32(ar[27], ar[26]);
  bl[27] = vaddq_s32(al[27], al[26]);
  br[27] = vaddq_s32(ar[27], ar[26]);
  bl[28] = vaddq_s32(al[28], al[29]);
  br[28] = vaddq_s32(ar[28], ar[29]);
  bl[29] = vsubq_s32(al[28], al[29]);
  br[29] = vsubq_s32(ar[28], ar[29]);
  bl[30] = vsubq_s32(al[31], al[30]);
  br[30] = vsubq_s32(ar[31], ar[30]);
  bl[31] = vaddq_s32(al[31], al[30]);
  br[31] = vaddq_s32(ar[31], ar[30]);

  // Final stage.
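  // The even-indexed outputs (0, 2, ..., 30) are ready in b[0..15] and are
  // stored directly; the two-coefficient butterflies below produce the
  // odd-indexed outputs (1, 3, ..., 31).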

  left[0] = bl[0];
  right[0] = br[0];
  left[16] = bl[1];
  right[16] = br[1];
  left[8] = bl[2];
  right[8] = br[2];
  left[24] = bl[3];
  right[24] = br[3];
  left[4] = bl[4];
  right[4] = br[4];
  left[20] = bl[5];
  right[20] = br[5];
  left[12] = bl[6];
  right[12] = br[6];
  left[28] = bl[7];
  right[28] = br[7];
  left[2] = bl[8];
  right[2] = br[8];
  left[18] = bl[9];
  right[18] = br[9];
  left[10] = bl[10];
  right[10] = br[10];
  left[26] = bl[11];
  right[26] = br[11];
  left[6] = bl[12];
  right[6] = br[12];
  left[22] = bl[13];
  right[22] = br[13];
  left[14] = bl[14];
  right[14] = br[14];
  left[30] = bl[15];
  right[30] = br[15];

  butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64,
                                     cospi_31_64, &al[1], &ar[1], &al[31],
                                     &ar[31]);
  left[1] = al[1];
  right[1] = ar[1];
  left[31] = al[31];
  right[31] = ar[31];

  butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17],
                                     cospi_17_64, cospi_15_64, &al[17], &ar[17],
                                     &al[15], &ar[15]);
  left[17] = al[17];
  right[17] = ar[17];
  left[15] = al[15];
  right[15] = ar[15];

  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64,
                                     cospi_23_64, &al[9], &ar[9], &al[23],
                                     &ar[23]);
  left[9] = al[9];
  right[9] = ar[9];
  left[23] = al[23];
  right[23] = ar[23];

  butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19],
                                     cospi_25_64, cospi_7_64, &al[25], &ar[25],
                                     &al[7], &ar[7]);
  left[25] = al[25];
  right[25] = ar[25];
  left[7] = al[7];
  right[7] = ar[7];

  butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64,
                                     cospi_27_64, &al[5], &ar[5], &al[27],
                                     &ar[27]);
  left[5] = al[5];
  right[5] = ar[5];
  left[27] = al[27];
  right[27] = ar[27];

  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
                                     cospi_21_64, cospi_11_64, &al[21], &ar[21],
                                     &al[11], &ar[11]);
  left[21] = al[21];
  right[21] = ar[21];
  left[11] = al[11];
  right[11] = ar[11];

  butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
                                     cospi_13_64, cospi_19_64, &al[13], &ar[13],
                                     &al[19], &ar[19]);
  left[13] = al[13];
  right[13] = ar[13];
  left[19] = al[19];
  right[19] = ar[19];

  butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23],
                                     cospi_29_64, cospi_3_64, &al[29], &ar[29],
                                     &al[3], &ar[3]);
  left[29] = al[29];
  right[29] = ar[29];
  left[3] = al[3];
  right[3] = ar[3];
}

static INLINE void highbd_dct8x32_body_second_pass(int32x4_t *left /*32*/,
                                                   int32x4_t *right /*32*/) {
  int32x4_t al[32], ar[32];
  int32x4_t bl[32], br[32];

  // Stage 1: Done as part of the load.

  // Stage 2.
  // Mini cross. X the first 16 values and the middle 8 of the second half.
  al[0] = vaddq_s32(left[0], left[15]);
  ar[0] = vaddq_s32(right[0], right[15]);
  al[1] = vaddq_s32(left[1], left[14]);
  ar[1] = vaddq_s32(right[1], right[14]);
  al[2] = vaddq_s32(left[2], left[13]);
  ar[2] = vaddq_s32(right[2], right[13]);
  al[3] = vaddq_s32(left[3], left[12]);
  ar[3] = vaddq_s32(right[3], right[12]);
  al[4] = vaddq_s32(left[4], left[11]);
  ar[4] = vaddq_s32(right[4], right[11]);
  al[5] = vaddq_s32(left[5], left[10]);
  ar[5] = vaddq_s32(right[5], right[10]);
  al[6] = vaddq_s32(left[6], left[9]);
  ar[6] = vaddq_s32(right[6], right[9]);
  al[7] = vaddq_s32(left[7], left[8]);
  ar[7] = vaddq_s32(right[7], right[8]);

  al[8] = vsubq_s32(left[7], left[8]);
  ar[8] = vsubq_s32(right[7], right[8]);
  al[9] = vsubq_s32(left[6], left[9]);
  ar[9] = vsubq_s32(right[6], right[9]);
  al[10] = vsubq_s32(left[5], left[10]);
  ar[10] = vsubq_s32(right[5], right[10]);
  al[11] = vsubq_s32(left[4], left[11]);
  ar[11] = vsubq_s32(right[4], right[11]);
  al[12] = vsubq_s32(left[3], left[12]);
  ar[12] = vsubq_s32(right[3], right[12]);
  al[13] = vsubq_s32(left[2], left[13]);
  ar[13] = vsubq_s32(right[2], right[13]);
  al[14] = vsubq_s32(left[1], left[14]);
  ar[14] = vsubq_s32(right[1], right[14]);
  al[15] = vsubq_s32(left[0], left[15]);
  ar[15] = vsubq_s32(right[0], right[15]);

  al[16] = left[16];
  ar[16] = right[16];
  al[17] = left[17];
  ar[17] = right[17];
  al[18] = left[18];
  ar[18] = right[18];
  al[19] = left[19];
  ar[19] = right[19];

  butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
                               cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
  butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
                               cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
  butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
                               cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
  butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
                               cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);

  al[28] = left[28];
  ar[28] = right[28];
  al[29] = left[29];
  ar[29] = right[29];
  al[30] = left[30];
  ar[30] = right[30];
  al[31] = left[31];
  ar[31] = right[31];

  // Stage 3.
  bl[0] = vaddq_s32(al[0], al[7]);
  br[0] = vaddq_s32(ar[0], ar[7]);
  bl[1] = vaddq_s32(al[1], al[6]);
  br[1] = vaddq_s32(ar[1], ar[6]);
  bl[2] = vaddq_s32(al[2], al[5]);
  br[2] = vaddq_s32(ar[2], ar[5]);
  bl[3] = vaddq_s32(al[3], al[4]);
  br[3] = vaddq_s32(ar[3], ar[4]);

  bl[4] = vsubq_s32(al[3], al[4]);
  br[4] = vsubq_s32(ar[3], ar[4]);
  bl[5] = vsubq_s32(al[2], al[5]);
  br[5] = vsubq_s32(ar[2], ar[5]);
  bl[6] = vsubq_s32(al[1], al[6]);
  br[6] = vsubq_s32(ar[1], ar[6]);
  bl[7] = vsubq_s32(al[0], al[7]);
  br[7] = vsubq_s32(ar[0], ar[7]);

  bl[8] = al[8];
  br[8] = ar[8];
  bl[9] = al[9];
  br[9] = ar[9];

  butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
                               &bl[13], &br[13], &bl[10], &br[10]);
  butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
                               &bl[12], &br[12], &bl[11], &br[11]);

  bl[14] = al[14];
  br[14] = ar[14];
  bl[15] = al[15];
  br[15] = ar[15];

  bl[16] = vaddq_s32(left[16], al[23]);
  br[16] = vaddq_s32(right[16], ar[23]);
  bl[17] = vaddq_s32(left[17], al[22]);
  br[17] = vaddq_s32(right[17], ar[22]);
  bl[18] = vaddq_s32(left[18], al[21]);
  br[18] = vaddq_s32(right[18], ar[21]);
  bl[19] = vaddq_s32(left[19], al[20]);
  br[19] = vaddq_s32(right[19], ar[20]);

  bl[20] = vsubq_s32(left[19], al[20]);
  br[20] = vsubq_s32(right[19], ar[20]);
  bl[21] = vsubq_s32(left[18], al[21]);
  br[21] = vsubq_s32(right[18], ar[21]);
  bl[22] = vsubq_s32(left[17], al[22]);
  br[22] = vsubq_s32(right[17], ar[22]);
  bl[23] = vsubq_s32(left[16], al[23]);
  br[23] = vsubq_s32(right[16], ar[23]);

  bl[24] = vsubq_s32(left[31], al[24]);
  br[24] = vsubq_s32(right[31], ar[24]);
  bl[25] = vsubq_s32(left[30], al[25]);
  br[25] = vsubq_s32(right[30], ar[25]);
  bl[26] = vsubq_s32(left[29], al[26]);
  br[26] = vsubq_s32(right[29], ar[26]);
  bl[27] = vsubq_s32(left[28], al[27]);
  br[27] = vsubq_s32(right[28], ar[27]);

  bl[28] = vaddq_s32(left[28], al[27]);
  br[28] = vaddq_s32(right[28], ar[27]);
  bl[29] = vaddq_s32(left[29], al[26]);
  br[29] = vaddq_s32(right[29], ar[26]);
  bl[30] = vaddq_s32(left[30], al[25]);
  br[30] = vaddq_s32(right[30], ar[25]);
  bl[31] = vaddq_s32(left[31], al[24]);
  br[31] = vaddq_s32(right[31], ar[24]);

  // Stage 4.
  al[0] = vaddq_s32(bl[0], bl[3]);
  ar[0] = vaddq_s32(br[0], br[3]);
  al[1] = vaddq_s32(bl[1], bl[2]);
  ar[1] = vaddq_s32(br[1], br[2]);
  al[2] = vsubq_s32(bl[1], bl[2]);
  ar[2] = vsubq_s32(br[1], br[2]);
  al[3] = vsubq_s32(bl[0], bl[3]);
  ar[3] = vsubq_s32(br[0], br[3]);

  al[4] = bl[4];
  ar[4] = br[4];

  butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
                               &ar[6], &al[5], &ar[5]);

  al[7] = bl[7];
  ar[7] = br[7];

  al[8] = vaddq_s32(bl[8], bl[11]);
  ar[8] = vaddq_s32(br[8], br[11]);
  al[9] = vaddq_s32(bl[9], bl[10]);
  ar[9] = vaddq_s32(br[9], br[10]);
  al[10] = vsubq_s32(bl[9], bl[10]);
  ar[10] = vsubq_s32(br[9], br[10]);
  al[11] = vsubq_s32(bl[8], bl[11]);
  ar[11] = vsubq_s32(br[8], br[11]);
  al[12] = vsubq_s32(bl[15], bl[12]);
  ar[12] = vsubq_s32(br[15], br[12]);
  al[13] = vsubq_s32(bl[14], bl[13]);
  ar[13] = vsubq_s32(br[14], br[13]);
  al[14] = vaddq_s32(bl[14], bl[13]);
  ar[14] = vaddq_s32(br[14], br[13]);
  al[15] = vaddq_s32(bl[15], bl[12]);
  ar[15] = vaddq_s32(br[15], br[12]);

  al[16] = bl[16];
  ar[16] = br[16];
  al[17] = bl[17];
  ar[17] = br[17];

  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64,
                                     cospi_24_64, &al[29], &ar[29], &al[18],
                                     &ar[18]);
  butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64,
                                     cospi_24_64, &al[28], &ar[28], &al[19],
                                     &ar[19]);
  butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20],
                                     cospi_24_64, -cospi_8_64, &al[27], &ar[27],
                                     &al[20], &ar[20]);
  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
                                     cospi_24_64, -cospi_8_64, &al[26], &ar[26],
                                     &al[21], &ar[21]);

  al[22] = bl[22];
  ar[22] = br[22];
  al[23] = bl[23];
  ar[23] = br[23];
  al[24] = bl[24];
  ar[24] = br[24];
  al[25] = bl[25];
  ar[25] = br[25];

  al[30] = bl[30];
  ar[30] = br[30];
  al[31] = bl[31];
  ar[31] = br[31];

  // Stage 5.
  butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
                               &br[0], &bl[1], &br[1]);
  butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64,
                                     cospi_24_64, &bl[2], &br[2], &bl[3],
                                     &br[3]);

  bl[4] = vaddq_s32(al[4], al[5]);
  br[4] = vaddq_s32(ar[4], ar[5]);
  bl[5] = vsubq_s32(al[4], al[5]);
  br[5] = vsubq_s32(ar[4], ar[5]);
  bl[6] = vsubq_s32(al[7], al[6]);
  br[6] = vsubq_s32(ar[7], ar[6]);
  bl[7] = vaddq_s32(al[7], al[6]);
  br[7] = vaddq_s32(ar[7], ar[6]);

  bl[8] = al[8];
  br[8] = ar[8];

  butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64,
                                     cospi_24_64, &bl[14], &br[14], &bl[9],
                                     &br[9]);
  butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
                                     cospi_24_64, -cospi_8_64, &bl[13], &br[13],
                                     &bl[10], &br[10]);

  bl[11] = al[11];
  br[11] = ar[11];
  bl[12] = al[12];
  br[12] = ar[12];

  bl[15] = al[15];
  br[15] = ar[15];

  bl[16] = vaddq_s32(al[19], al[16]);
  br[16] = vaddq_s32(ar[19], ar[16]);
  bl[17] = vaddq_s32(al[18], al[17]);
  br[17] = vaddq_s32(ar[18], ar[17]);
  bl[18] = vsubq_s32(al[17], al[18]);
  br[18] = vsubq_s32(ar[17], ar[18]);
  bl[19] = vsubq_s32(al[16], al[19]);
  br[19] = vsubq_s32(ar[16], ar[19]);
  bl[20] = vsubq_s32(al[23], al[20]);
  br[20] = vsubq_s32(ar[23], ar[20]);
  bl[21] = vsubq_s32(al[22], al[21]);
  br[21] = vsubq_s32(ar[22], ar[21]);
  bl[22] = vaddq_s32(al[21], al[22]);
  br[22] = vaddq_s32(ar[21], ar[22]);
  bl[23] = vaddq_s32(al[20], al[23]);
  br[23] = vaddq_s32(ar[20], ar[23]);
  bl[24] = vaddq_s32(al[27], al[24]);
  br[24] = vaddq_s32(ar[27], ar[24]);
  bl[25] = vaddq_s32(al[26], al[25]);
  br[25] = vaddq_s32(ar[26], ar[25]);
  bl[26] = vsubq_s32(al[25], al[26]);
  br[26] = vsubq_s32(ar[25], ar[26]);
  bl[27] = vsubq_s32(al[24], al[27]);
  br[27] = vsubq_s32(ar[24], ar[27]);
  bl[28] = vsubq_s32(al[31], al[28]);
  br[28] = vsubq_s32(ar[31], ar[28]);
  bl[29] = vsubq_s32(al[30], al[29]);
  br[29] = vsubq_s32(ar[30], ar[29]);
  bl[30] = vaddq_s32(al[29], al[30]);
  br[30] = vaddq_s32(ar[29], ar[30]);
  bl[31] = vaddq_s32(al[28], al[31]);
  br[31] = vaddq_s32(ar[28], ar[31]);

  // Stage 6.
  al[0] = bl[0];
  ar[0] = br[0];
  al[1] = bl[1];
  ar[1] = br[1];
  al[2] = bl[2];
  ar[2] = br[2];
  al[3] = bl[3];
  ar[3] = br[3];

  butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64,
                                     cospi_28_64, &al[4], &ar[4], &al[7],
                                     &ar[7]);
  butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64,
                                     cospi_12_64, &al[5], &ar[5], &al[6],
                                     &ar[6]);

  al[8] = vaddq_s32(bl[8], bl[9]);
  ar[8] = vaddq_s32(br[8], br[9]);
  al[9] = vsubq_s32(bl[8], bl[9]);
  ar[9] = vsubq_s32(br[8], br[9]);
  al[10] = vsubq_s32(bl[11], bl[10]);
  ar[10] = vsubq_s32(br[11], br[10]);
  al[11] = vaddq_s32(bl[11], bl[10]);
  ar[11] = vaddq_s32(br[11], br[10]);
  al[12] = vaddq_s32(bl[12], bl[13]);
  ar[12] = vaddq_s32(br[12], br[13]);
  al[13] = vsubq_s32(bl[12], bl[13]);
  ar[13] = vsubq_s32(br[12], br[13]);
  al[14] = vsubq_s32(bl[15], bl[14]);
  ar[14] = vsubq_s32(br[15], br[14]);
  al[15] = vaddq_s32(bl[15], bl[14]);
  ar[15] = vaddq_s32(br[15], br[14]);

  al[16] = bl[16];
  ar[16] = br[16];
  al[19] = bl[19];
  ar[19] = br[19];
  al[20] = bl[20];
  ar[20] = br[20];
  al[23] = bl[23];
  ar[23] = br[23];
  al[24] = bl[24];
  ar[24] = br[24];
  al[27] = bl[27];
  ar[27] = br[27];
  al[28] = bl[28];
  ar[28] = br[28];
  al[31] = bl[31];
  ar[31] = br[31];

  butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64,
                                     cospi_28_64, &al[30], &ar[30], &al[17],
                                     &ar[17]);
  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18],
                                     cospi_28_64, -cospi_4_64, &al[29], &ar[29],
                                     &al[18], &ar[18]);
  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
                                     cospi_20_64, cospi_12_64, &al[26], &ar[26],
                                     &al[21], &ar[21]);
  butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
                                     cospi_12_64, -cospi_20_64, &al[25],
                                     &ar[25], &al[22], &ar[22]);

  // Stage 7.
  bl[0] = al[0];
  br[0] = ar[0];
  bl[1] = al[1];
  br[1] = ar[1];
  bl[2] = al[2];
  br[2] = ar[2];
  bl[3] = al[3];
  br[3] = ar[3];
  bl[4] = al[4];
  br[4] = ar[4];
  bl[5] = al[5];
  br[5] = ar[5];
  bl[6] = al[6];
  br[6] = ar[6];
  bl[7] = al[7];
  br[7] = ar[7];

  butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64,
                                     cospi_30_64, &bl[8], &br[8], &bl[15],
                                     &br[15]);
  butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64,
                                     cospi_14_64, &bl[9], &br[9], &bl[14],
                                     &br[14]);
  butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
                                     cospi_10_64, cospi_22_64, &bl[10], &br[10],
                                     &bl[13], &br[13]);
  butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11],
                                     cospi_26_64, cospi_6_64, &bl[11], &br[11],
                                     &bl[12], &br[12]);

  bl[16] = vaddq_s32(al[16], al[17]);
  br[16] = vaddq_s32(ar[16], ar[17]);
  bl[17] = vsubq_s32(al[16], al[17]);
  br[17] = vsubq_s32(ar[16], ar[17]);
  bl[18] = vsubq_s32(al[19], al[18]);
  br[18] = vsubq_s32(ar[19], ar[18]);
  bl[19] = vaddq_s32(al[19], al[18]);
  br[19] = vaddq_s32(ar[19], ar[18]);
  bl[20] = vaddq_s32(al[20], al[21]);
  br[20] = vaddq_s32(ar[20], ar[21]);
  bl[21] = vsubq_s32(al[20], al[21]);
  br[21] = vsubq_s32(ar[20], ar[21]);
  bl[22] = vsubq_s32(al[23], al[22]);
  br[22] = vsubq_s32(ar[23], ar[22]);
  bl[23] = vaddq_s32(al[23], al[22]);
  br[23] = vaddq_s32(ar[23], ar[22]);
  bl[24] = vaddq_s32(al[24], al[25]);
  br[24] = vaddq_s32(ar[24], ar[25]);
  bl[25] = vsubq_s32(al[24], al[25]);
  br[25] = vsubq_s32(ar[24], ar[25]);
  bl[26] = vsubq_s32(al[27], al[26]);
  br[26] = vsubq_s32(ar[27], ar[26]);
  bl[27] = vaddq_s32(al[27], al[26]);
  br[27] = vaddq_s32(ar[27], ar[26]);
  bl[28] = vaddq_s32(al[28], al[29]);
  br[28] = vaddq_s32(ar[28], ar[29]);
  bl[29] = vsubq_s32(al[28], al[29]);
  br[29] = vsubq_s32(ar[28], ar[29]);
  bl[30] = vsubq_s32(al[31], al[30]);
  br[30] = vsubq_s32(ar[31], ar[30]);
  bl[31] = vaddq_s32(al[31], al[30]);
  br[31] = vaddq_s32(ar[31], ar[30]);

  // Final stage.

  left[0] = bl[0];
  right[0] = br[0];
  left[16] = bl[1];
  right[16] = br[1];
  left[8] = bl[2];
  right[8] = br[2];
  left[24] = bl[3];
  right[24] = br[3];
  left[4] = bl[4];
  right[4] = br[4];
  left[20] = bl[5];
  right[20] = br[5];
  left[12] = bl[6];
  right[12] = br[6];
  left[28] = bl[7];
  right[28] = br[7];
  left[2] = bl[8];
  right[2] = br[8];
  left[18] = bl[9];
  right[18] = br[9];
  left[10] = bl[10];
  right[10] = br[10];
  left[26] = bl[11];
  right[26] = br[11];
  left[6] = bl[12];
  right[6] = br[12];
  left[22] = bl[13];
  right[22] = br[13];
  left[14] = bl[14];
  right[14] = br[14];
  left[30] = bl[15];
  right[30] = br[15];

  butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64,
                                     cospi_31_64, &al[1], &ar[1], &al[31],
                                     &ar[31]);
  left[1] = al[1];
  right[1] = ar[1];
  left[31] = al[31];
  right[31] = ar[31];

  butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17],
                                     cospi_17_64, cospi_15_64, &al[17], &ar[17],
                                     &al[15], &ar[15]);
  left[17] = al[17];
  right[17] = ar[17];
  left[15] = al[15];
  right[15] = ar[15];

  butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64,
                                     cospi_23_64, &al[9], &ar[9], &al[23],
                                     &ar[23]);
  left[9] = al[9];
  right[9] = ar[9];
  left[23] = al[23];
  right[23] = ar[23];

  butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19],
                                     cospi_25_64, cospi_7_64, &al[25], &ar[25],
                                     &al[7], &ar[7]);
  left[25] = al[25];
  right[25] = ar[25];
  left[7] = al[7];
  right[7] = ar[7];

  butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64,
                                     cospi_27_64, &al[5], &ar[5], &al[27],
                                     &ar[27]);
  left[5] = al[5];
  right[5] = ar[5];
  left[27] = al[27];
  right[27] = ar[27];

  butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
                                     cospi_21_64, cospi_11_64, &al[21], &ar[21],
                                     &al[11], &ar[11]);
  left[21] = al[21];
  right[21] = ar[21];
  left[11] = al[11];
  right[11] = ar[11];

  butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
                                     cospi_13_64, cospi_19_64, &al[13], &ar[13],
                                     &al[19], &ar[19]);
  left[13] = al[13];
  right[13] = ar[13];
  left[19] = al[19];
  right[19] = ar[19];

  butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23],
                                     cospi_29_64, cospi_3_64, &al[29], &ar[29],
                                     &al[3], &ar[3]);
  left[29] = al[29];
  right[29] = ar[29];
  left[3] = al[3];
  right[3] = ar[3];
}

static INLINE void highbd_dct8x32_body_second_pass_rd(int32x4_t *left /*32*/,
                                                      int32x4_t *right /*32*/) {
  int32x4_t al[32], ar[32];
  int32x4_t bl[32], br[32];

  // Stage 1: Done as part of the load.

  // Stage 2.
  // For the "rd" version, all the values are rounded down after stage 2 to
  // keep the values in 16 bits.
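  // add_round_shift_s32() is assumed to match the scalar half_round_shift(),
  // nominally (a + 1 + (a < 0)) >> 2 per lane, i.e. the round == 1 path of
  // the scalar vpx_fdct32.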
  al[0] = add_round_shift_s32(vaddq_s32(left[0], left[15]));
  ar[0] = add_round_shift_s32(vaddq_s32(right[0], right[15]));
  al[1] = add_round_shift_s32(vaddq_s32(left[1], left[14]));
  ar[1] = add_round_shift_s32(vaddq_s32(right[1], right[14]));
  al[2] = add_round_shift_s32(vaddq_s32(left[2], left[13]));
  ar[2] = add_round_shift_s32(vaddq_s32(right[2], right[13]));
  al[3] = add_round_shift_s32(vaddq_s32(left[3], left[12]));
  ar[3] = add_round_shift_s32(vaddq_s32(right[3], right[12]));
  al[4] = add_round_shift_s32(vaddq_s32(left[4], left[11]));
  ar[4] = add_round_shift_s32(vaddq_s32(right[4], right[11]));
  al[5] = add_round_shift_s32(vaddq_s32(left[5], left[10]));
  ar[5] = add_round_shift_s32(vaddq_s32(right[5], right[10]));
  al[6] = add_round_shift_s32(vaddq_s32(left[6], left[9]));
  ar[6] = add_round_shift_s32(vaddq_s32(right[6], right[9]));
  al[7] = add_round_shift_s32(vaddq_s32(left[7], left[8]));
  ar[7] = add_round_shift_s32(vaddq_s32(right[7], right[8]));

  al[8] = add_round_shift_s32(vsubq_s32(left[7], left[8]));
  ar[8] = add_round_shift_s32(vsubq_s32(right[7], right[8]));
  al[9] = add_round_shift_s32(vsubq_s32(left[6], left[9]));
  ar[9] = add_round_shift_s32(vsubq_s32(right[6], right[9]));
  al[10] = add_round_shift_s32(vsubq_s32(left[5], left[10]));
  ar[10] = add_round_shift_s32(vsubq_s32(right[5], right[10]));
  al[11] = add_round_shift_s32(vsubq_s32(left[4], left[11]));
  ar[11] = add_round_shift_s32(vsubq_s32(right[4], right[11]));
  al[12] = add_round_shift_s32(vsubq_s32(left[3], left[12]));
  ar[12] = add_round_shift_s32(vsubq_s32(right[3], right[12]));
  al[13] = add_round_shift_s32(vsubq_s32(left[2], left[13]));
  ar[13] = add_round_shift_s32(vsubq_s32(right[2], right[13]));
  al[14] = add_round_shift_s32(vsubq_s32(left[1], left[14]));
  ar[14] = add_round_shift_s32(vsubq_s32(right[1], right[14]));
  al[15] = add_round_shift_s32(vsubq_s32(left[0], left[15]));
  ar[15] = add_round_shift_s32(vsubq_s32(right[0], right[15]));

  al[16] = add_round_shift_s32(left[16]);
  ar[16] = add_round_shift_s32(right[16]);
  al[17] = add_round_shift_s32(left[17]);
  ar[17] = add_round_shift_s32(right[17]);
  al[18] = add_round_shift_s32(left[18]);
  ar[18] = add_round_shift_s32(right[18]);
  al[19] = add_round_shift_s32(left[19]);
  ar[19] = add_round_shift_s32(right[19]);

  butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
                               cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
  butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
                               cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
  butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
                               cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
  butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
                               cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);

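  // The butterfly outputs above (rows 20-27) were computed from unshifted
  // inputs, so apply the same stage 2 rounding to them here.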
  al[20] = add_round_shift_s32(al[20]);
  ar[20] = add_round_shift_s32(ar[20]);
  al[21] = add_round_shift_s32(al[21]);
  ar[21] = add_round_shift_s32(ar[21]);
  al[22] = add_round_shift_s32(al[22]);
  ar[22] = add_round_shift_s32(ar[22]);
  al[23] = add_round_shift_s32(al[23]);
  ar[23] = add_round_shift_s32(ar[23]);
  al[24] = add_round_shift_s32(al[24]);
  ar[24] = add_round_shift_s32(ar[24]);
  al[25] = add_round_shift_s32(al[25]);
  ar[25] = add_round_shift_s32(ar[25]);
  al[26] = add_round_shift_s32(al[26]);
  ar[26] = add_round_shift_s32(ar[26]);
  al[27] = add_round_shift_s32(al[27]);
  ar[27] = add_round_shift_s32(ar[27]);

  al[28] = add_round_shift_s32(left[28]);
  ar[28] = add_round_shift_s32(right[28]);
  al[29] = add_round_shift_s32(left[29]);
  ar[29] = add_round_shift_s32(right[29]);
  al[30] = add_round_shift_s32(left[30]);
  ar[30] = add_round_shift_s32(right[30]);
  al[31] = add_round_shift_s32(left[31]);
  ar[31] = add_round_shift_s32(right[31]);

  // Stage 3.
  bl[0] = vaddq_s32(al[0], al[7]);
  br[0] = vaddq_s32(ar[0], ar[7]);
  bl[1] = vaddq_s32(al[1], al[6]);
  br[1] = vaddq_s32(ar[1], ar[6]);
  bl[2] = vaddq_s32(al[2], al[5]);
  br[2] = vaddq_s32(ar[2], ar[5]);
  bl[3] = vaddq_s32(al[3], al[4]);
  br[3] = vaddq_s32(ar[3], ar[4]);

  bl[4] = vsubq_s32(al[3], al[4]);
  br[4] = vsubq_s32(ar[3], ar[4]);
  bl[5] = vsubq_s32(al[2], al[5]);
  br[5] = vsubq_s32(ar[2], ar[5]);
  bl[6] = vsubq_s32(al[1], al[6]);
  br[6] = vsubq_s32(ar[1], ar[6]);
  bl[7] = vsubq_s32(al[0], al[7]);
  br[7] = vsubq_s32(ar[0], ar[7]);

  bl[8] = al[8];
  br[8] = ar[8];
  bl[9] = al[9];
  br[9] = ar[9];

  butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
                               &bl[13], &br[13], &bl[10], &br[10]);
  butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
                               &bl[12], &br[12], &bl[11], &br[11]);

  bl[14] = al[14];
  br[14] = ar[14];
  bl[15] = al[15];
  br[15] = ar[15];

  bl[16] = vaddq_s32(al[16], al[23]);
  br[16] = vaddq_s32(ar[16], ar[23]);
  bl[17] = vaddq_s32(al[17], al[22]);
  br[17] = vaddq_s32(ar[17], ar[22]);
  bl[18] = vaddq_s32(al[18], al[21]);
  br[18] = vaddq_s32(ar[18], ar[21]);
  bl[19] = vaddq_s32(al[19], al[20]);
  br[19] = vaddq_s32(ar[19], ar[20]);

  bl[20] = vsubq_s32(al[19], al[20]);
  br[20] = vsubq_s32(ar[19], ar[20]);
  bl[21] = vsubq_s32(al[18], al[21]);
  br[21] = vsubq_s32(ar[18], ar[21]);
  bl[22] = vsubq_s32(al[17], al[22]);
  br[22] = vsubq_s32(ar[17], ar[22]);
  bl[23] = vsubq_s32(al[16], al[23]);
  br[23] = vsubq_s32(ar[16], ar[23]);

  bl[24] = vsubq_s32(al[31], al[24]);
  br[24] = vsubq_s32(ar[31], ar[24]);
  bl[25] = vsubq_s32(al[30], al[25]);
  br[25] = vsubq_s32(ar[30], ar[25]);
  bl[26] = vsubq_s32(al[29], al[26]);
  br[26] = vsubq_s32(ar[29], ar[26]);
  bl[27] = vsubq_s32(al[28], al[27]);
  br[27] = vsubq_s32(ar[28], ar[27]);

  bl[28] = vaddq_s32(al[28], al[27]);
  br[28] = vaddq_s32(ar[28], ar[27]);
  bl[29] = vaddq_s32(al[29], al[26]);
  br[29] = vaddq_s32(ar[29], ar[26]);
  bl[30] = vaddq_s32(al[30], al[25]);
  br[30] = vaddq_s32(ar[30], ar[25]);
  bl[31] = vaddq_s32(al[31], al[24]);
  br[31] = vaddq_s32(ar[31], ar[24]);

  // Stage 4.
  al[0] = vaddq_s32(bl[0], bl[3]);
  ar[0] = vaddq_s32(br[0], br[3]);
  al[1] = vaddq_s32(bl[1], bl[2]);
  ar[1] = vaddq_s32(br[1], br[2]);
  al[2] = vsubq_s32(bl[1], bl[2]);
  ar[2] = vsubq_s32(br[1], br[2]);
  al[3] = vsubq_s32(bl[0], bl[3]);
  ar[3] = vsubq_s32(br[0], br[3]);

  al[4] = bl[4];
  ar[4] = br[4];

  butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
                               &ar[6], &al[5], &ar[5]);

  al[7] = bl[7];
  ar[7] = br[7];

  al[8] = vaddq_s32(bl[8], bl[11]);
  ar[8] = vaddq_s32(br[8], br[11]);
  al[9] = vaddq_s32(bl[9], bl[10]);
  ar[9] = vaddq_s32(br[9], br[10]);
  al[10] = vsubq_s32(bl[9], bl[10]);
  ar[10] = vsubq_s32(br[9], br[10]);
  al[11] = vsubq_s32(bl[8], bl[11]);
  ar[11] = vsubq_s32(br[8], br[11]);
  al[12] = vsubq_s32(bl[15], bl[12]);
  ar[12] = vsubq_s32(br[15], br[12]);
  al[13] = vsubq_s32(bl[14], bl[13]);
  ar[13] = vsubq_s32(br[14], br[13]);
  al[14] = vaddq_s32(bl[14], bl[13]);
  ar[14] = vaddq_s32(br[14], br[13]);
  al[15] = vaddq_s32(bl[15], bl[12]);
  ar[15] = vaddq_s32(br[15], br[12]);

  al[16] = bl[16];
  ar[16] = br[16];
  al[17] = bl[17];
  ar[17] = br[17];

  butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_8_64,
                          cospi_24_64, &al[29], &ar[29], &al[18], &ar[18]);
  butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_8_64,
                          cospi_24_64, &al[28], &ar[28], &al[19], &ar[19]);
  butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_24_64,
                          -cospi_8_64, &al[27], &ar[27], &al[20], &ar[20]);
  butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_24_64,
                          -cospi_8_64, &al[26], &ar[26], &al[21], &ar[21]);
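
  // The "rd" path uses the plain 32-bit butterfly_two_coeff_s32() instead of
  // the 64-bit-widening variant: after the stage 2 round the values fit in
  // 16 bits, so the 32-bit products cannot overflow.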

  al[22] = bl[22];
  ar[22] = br[22];
  al[23] = bl[23];
  ar[23] = br[23];
  al[24] = bl[24];
  ar[24] = br[24];
  al[25] = bl[25];
  ar[25] = br[25];

  al[30] = bl[30];
  ar[30] = br[30];
  al[31] = bl[31];
  ar[31] = br[31];

  // Stage 5.
  butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
                               &br[0], &bl[1], &br[1]);
  butterfly_two_coeff_s32(al[3], ar[3], al[2], ar[2], cospi_8_64, cospi_24_64,
                          &bl[2], &br[2], &bl[3], &br[3]);

  bl[4] = vaddq_s32(al[4], al[5]);
  br[4] = vaddq_s32(ar[4], ar[5]);
  bl[5] = vsubq_s32(al[4], al[5]);
  br[5] = vsubq_s32(ar[4], ar[5]);
  bl[6] = vsubq_s32(al[7], al[6]);
  br[6] = vsubq_s32(ar[7], ar[6]);
  bl[7] = vaddq_s32(al[7], al[6]);
  br[7] = vaddq_s32(ar[7], ar[6]);

  bl[8] = al[8];
  br[8] = ar[8];

  butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_8_64, cospi_24_64,
                          &bl[14], &br[14], &bl[9], &br[9]);
  butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_24_64,
                          -cospi_8_64, &bl[13], &br[13], &bl[10], &br[10]);

  bl[11] = al[11];
  br[11] = ar[11];
  bl[12] = al[12];
  br[12] = ar[12];

  bl[15] = al[15];
  br[15] = ar[15];

  bl[16] = vaddq_s32(al[19], al[16]);
  br[16] = vaddq_s32(ar[19], ar[16]);
  bl[17] = vaddq_s32(al[18], al[17]);
  br[17] = vaddq_s32(ar[18], ar[17]);
  bl[18] = vsubq_s32(al[17], al[18]);
  br[18] = vsubq_s32(ar[17], ar[18]);
  bl[19] = vsubq_s32(al[16], al[19]);
  br[19] = vsubq_s32(ar[16], ar[19]);
  bl[20] = vsubq_s32(al[23], al[20]);
  br[20] = vsubq_s32(ar[23], ar[20]);
  bl[21] = vsubq_s32(al[22], al[21]);
  br[21] = vsubq_s32(ar[22], ar[21]);
  bl[22] = vaddq_s32(al[21], al[22]);
  br[22] = vaddq_s32(ar[21], ar[22]);
  bl[23] = vaddq_s32(al[20], al[23]);
  br[23] = vaddq_s32(ar[20], ar[23]);
  bl[24] = vaddq_s32(al[27], al[24]);
  br[24] = vaddq_s32(ar[27], ar[24]);
  bl[25] = vaddq_s32(al[26], al[25]);
  br[25] = vaddq_s32(ar[26], ar[25]);
  bl[26] = vsubq_s32(al[25], al[26]);
  br[26] = vsubq_s32(ar[25], ar[26]);
  bl[27] = vsubq_s32(al[24], al[27]);
  br[27] = vsubq_s32(ar[24], ar[27]);
  bl[28] = vsubq_s32(al[31], al[28]);
  br[28] = vsubq_s32(ar[31], ar[28]);
  bl[29] = vsubq_s32(al[30], al[29]);
  br[29] = vsubq_s32(ar[30], ar[29]);
  bl[30] = vaddq_s32(al[29], al[30]);
  br[30] = vaddq_s32(ar[29], ar[30]);
  bl[31] = vaddq_s32(al[28], al[31]);
  br[31] = vaddq_s32(ar[28], ar[31]);

  // Stage 6.
  al[0] = bl[0];
  ar[0] = br[0];
  al[1] = bl[1];
  ar[1] = br[1];
  al[2] = bl[2];
  ar[2] = br[2];
  al[3] = bl[3];
  ar[3] = br[3];

  butterfly_two_coeff_s32(bl[7], br[7], bl[4], br[4], cospi_4_64, cospi_28_64,
                          &al[4], &ar[4], &al[7], &ar[7]);
  butterfly_two_coeff_s32(bl[6], br[6], bl[5], br[5], cospi_20_64, cospi_12_64,
                          &al[5], &ar[5], &al[6], &ar[6]);

  al[8] = vaddq_s32(bl[8], bl[9]);
  ar[8] = vaddq_s32(br[8], br[9]);
  al[9] = vsubq_s32(bl[8], bl[9]);
  ar[9] = vsubq_s32(br[8], br[9]);
  al[10] = vsubq_s32(bl[11], bl[10]);
  ar[10] = vsubq_s32(br[11], br[10]);
  al[11] = vaddq_s32(bl[11], bl[10]);
  ar[11] = vaddq_s32(br[11], br[10]);
  al[12] = vaddq_s32(bl[12], bl[13]);
  ar[12] = vaddq_s32(br[12], br[13]);
  al[13] = vsubq_s32(bl[12], bl[13]);
  ar[13] = vsubq_s32(br[12], br[13]);
  al[14] = vsubq_s32(bl[15], bl[14]);
  ar[14] = vsubq_s32(br[15], br[14]);
  al[15] = vaddq_s32(bl[15], bl[14]);
  ar[15] = vaddq_s32(br[15], br[14]);

  al[16] = bl[16];
  ar[16] = br[16];
  al[19] = bl[19];
  ar[19] = br[19];
  al[20] = bl[20];
  ar[20] = br[20];
  al[23] = bl[23];
  ar[23] = br[23];
  al[24] = bl[24];
  ar[24] = br[24];
  al[27] = bl[27];
  ar[27] = br[27];
  al[28] = bl[28];
  ar[28] = br[28];
  al[31] = bl[31];
  ar[31] = br[31];

  butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_4_64,
                          cospi_28_64, &al[30], &ar[30], &al[17], &ar[17]);
  butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_28_64,
                          -cospi_4_64, &al[29], &ar[29], &al[18], &ar[18]);
  butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_20_64,
                          cospi_12_64, &al[26], &ar[26], &al[21], &ar[21]);
  butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_12_64,
                          -cospi_20_64, &al[25], &ar[25], &al[22], &ar[22]);

  // Stage 7.
  bl[0] = al[0];
  br[0] = ar[0];
  bl[1] = al[1];
  br[1] = ar[1];
  bl[2] = al[2];
  br[2] = ar[2];
  bl[3] = al[3];
  br[3] = ar[3];
  bl[4] = al[4];
  br[4] = ar[4];
  bl[5] = al[5];
  br[5] = ar[5];
  bl[6] = al[6];
  br[6] = ar[6];
  bl[7] = al[7];
  br[7] = ar[7];

  butterfly_two_coeff_s32(al[15], ar[15], al[8], ar[8], cospi_2_64, cospi_30_64,
                          &bl[8], &br[8], &bl[15], &br[15]);
  butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_18_64,
                          cospi_14_64, &bl[9], &br[9], &bl[14], &br[14]);
  butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_10_64,
                          cospi_22_64, &bl[10], &br[10], &bl[13], &br[13]);
  butterfly_two_coeff_s32(al[12], ar[12], al[11], ar[11], cospi_26_64,
                          cospi_6_64, &bl[11], &br[11], &bl[12], &br[12]);

  bl[16] = vaddq_s32(al[16], al[17]);
  br[16] = vaddq_s32(ar[16], ar[17]);
  bl[17] = vsubq_s32(al[16], al[17]);
  br[17] = vsubq_s32(ar[16], ar[17]);
  bl[18] = vsubq_s32(al[19], al[18]);
  br[18] = vsubq_s32(ar[19], ar[18]);
  bl[19] = vaddq_s32(al[19], al[18]);
  br[19] = vaddq_s32(ar[19], ar[18]);
  bl[20] = vaddq_s32(al[20], al[21]);
  br[20] = vaddq_s32(ar[20], ar[21]);
  bl[21] = vsubq_s32(al[20], al[21]);
  br[21] = vsubq_s32(ar[20], ar[21]);
  bl[22] = vsubq_s32(al[23], al[22]);
  br[22] = vsubq_s32(ar[23], ar[22]);
  bl[23] = vaddq_s32(al[23], al[22]);
  br[23] = vaddq_s32(ar[23], ar[22]);
  bl[24] = vaddq_s32(al[24], al[25]);
  br[24] = vaddq_s32(ar[24], ar[25]);
  bl[25] = vsubq_s32(al[24], al[25]);
  br[25] = vsubq_s32(ar[24], ar[25]);
  bl[26] = vsubq_s32(al[27], al[26]);
  br[26] = vsubq_s32(ar[27], ar[26]);
  bl[27] = vaddq_s32(al[27], al[26]);
  br[27] = vaddq_s32(ar[27], ar[26]);
  bl[28] = vaddq_s32(al[28], al[29]);
  br[28] = vaddq_s32(ar[28], ar[29]);
  bl[29] = vsubq_s32(al[28], al[29]);
  br[29] = vsubq_s32(ar[28], ar[29]);
  bl[30] = vsubq_s32(al[31], al[30]);
  br[30] = vsubq_s32(ar[31], ar[30]);
  bl[31] = vaddq_s32(al[31], al[30]);
  br[31] = vaddq_s32(ar[31], ar[30]);

  // Final stage.
  left[0] = bl[0];
  right[0] = br[0];
  left[16] = bl[1];
  right[16] = br[1];
  left[8] = bl[2];
  right[8] = br[2];
  left[24] = bl[3];
  right[24] = br[3];
  left[4] = bl[4];
  right[4] = br[4];
  left[20] = bl[5];
  right[20] = br[5];
  left[12] = bl[6];
  right[12] = br[6];
  left[28] = bl[7];
  right[28] = br[7];
  left[2] = bl[8];
  right[2] = br[8];
  left[18] = bl[9];
  right[18] = br[9];
  left[10] = bl[10];
  right[10] = br[10];
  left[26] = bl[11];
  right[26] = br[11];
  left[6] = bl[12];
  right[6] = br[12];
  left[22] = bl[13];
  right[22] = br[13];
  left[14] = bl[14];
  right[14] = br[14];
  left[30] = bl[15];
  right[30] = br[15];

  butterfly_two_coeff_s32(bl[31], br[31], bl[16], br[16], cospi_1_64,
                          cospi_31_64, &al[1], &ar[1], &al[31], &ar[31]);
  left[1] = al[1];
  right[1] = ar[1];
  left[31] = al[31];
  right[31] = ar[31];

  butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_17_64,
                          cospi_15_64, &al[17], &ar[17], &al[15], &ar[15]);
  left[17] = al[17];
  right[17] = ar[17];
  left[15] = al[15];
  right[15] = ar[15];

  butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_9_64,
                          cospi_23_64, &al[9], &ar[9], &al[23], &ar[23]);
  left[9] = al[9];
  right[9] = ar[9];
  left[23] = al[23];
  right[23] = ar[23];

  butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_25_64,
                          cospi_7_64, &al[25], &ar[25], &al[7], &ar[7]);
  left[25] = al[25];
  right[25] = ar[25];
  left[7] = al[7];
  right[7] = ar[7];

  butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_5_64,
                          cospi_27_64, &al[5], &ar[5], &al[27], &ar[27]);
  left[5] = al[5];
  right[5] = ar[5];
  left[27] = al[27];
  right[27] = ar[27];

  butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_21_64,
                          cospi_11_64, &al[21], &ar[21], &al[11], &ar[11]);
  left[21] = al[21];
  right[21] = ar[21];
  left[11] = al[11];
  right[11] = ar[11];

  butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_13_64,
                          cospi_19_64, &al[13], &ar[13], &al[19], &ar[19]);
  left[13] = al[13];
  right[13] = ar[13];
  left[19] = al[19];
  right[19] = ar[19];

  butterfly_two_coeff_s32(bl[24], br[24], bl[23], br[23], cospi_29_64,
                          cospi_3_64, &al[29], &ar[29], &al[3], &ar[3]);
  left[29] = al[29];
  right[29] = ar[29];
  left[3] = al[3];
  right[3] = ar[3];
}

#endif  // CONFIG_VP9_HIGHBITDEPTH

#endif  // VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_