/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
#define VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/fdct_neon.h"

// Load & cross the first 8 and last 8, then the middle
static INLINE void load_cross(const int16_t *a, int stride, int16x8_t *b) {
  b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
  b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
  b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
  b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
  b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
  b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
  b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
  b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));

  b[24] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
  b[25] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
  b[26] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
  b[27] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
  b[28] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
  b[29] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
  b[30] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
  b[31] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));

  b[8] = vaddq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
  b[9] = vaddq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
  b[10] = vaddq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
  b[11] = vaddq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
  b[12] = vaddq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
  b[13] = vaddq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
  b[14] = vaddq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
  b[15] = vaddq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));

  b[16] = vsubq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
  b[17] = vsubq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
  b[18] = vsubq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
  b[19] = vsubq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
  b[20] = vsubq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
  b[21] = vsubq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
  b[22] = vsubq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
  b[23] = vsubq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
}
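
// For illustration, a scalar sketch of the same stage-1 cross on a single
// column (load_cross_scalar_sketch is not part of libvpx): rows i and 31 - i
// are combined into a sum (first half) and a difference (mirrored into the
// second half).
static INLINE void load_cross_scalar_sketch(const int16_t *a, int stride,
                                            int16_t *b /*[32]*/) {
  int i;
  for (i = 0; i < 16; ++i) {
    b[i] = (int16_t)(a[i * stride] + a[(31 - i) * stride]);
    b[31 - i] = (int16_t)(a[i * stride] - a[(31 - i) * stride]);
  }
}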

#define STORE_S16(src, index, dest)           \
  do {                                        \
    store_s16q_to_tran_low(dest, src[index]); \
    dest += 8;                                \
  } while (0)

// Store 32 int16x8_t vectors, assuming stride == 32.
// Slight twist: store horizontally in blocks of 8.
static INLINE void store(tran_low_t *a, const int16x8_t *b) {
  STORE_S16(b, 0, a);
  STORE_S16(b, 8, a);
  STORE_S16(b, 16, a);
  STORE_S16(b, 24, a);
  STORE_S16(b, 1, a);
  STORE_S16(b, 9, a);
  STORE_S16(b, 17, a);
  STORE_S16(b, 25, a);
  STORE_S16(b, 2, a);
  STORE_S16(b, 10, a);
  STORE_S16(b, 18, a);
  STORE_S16(b, 26, a);
  STORE_S16(b, 3, a);
  STORE_S16(b, 11, a);
  STORE_S16(b, 19, a);
  STORE_S16(b, 27, a);
  STORE_S16(b, 4, a);
  STORE_S16(b, 12, a);
  STORE_S16(b, 20, a);
  STORE_S16(b, 28, a);
  STORE_S16(b, 5, a);
  STORE_S16(b, 13, a);
  STORE_S16(b, 21, a);
  STORE_S16(b, 29, a);
  STORE_S16(b, 6, a);
  STORE_S16(b, 14, a);
  STORE_S16(b, 22, a);
  STORE_S16(b, 30, a);
  STORE_S16(b, 7, a);
  STORE_S16(b, 15, a);
  STORE_S16(b, 23, a);
  STORE_S16(b, 31, a);
}
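
// The interleaving above writes vector b[row + 8 * blk] to output row `row`,
// lane block `blk`. A scalar sketch of the same index mapping (illustrative
// only, not part of libvpx):
static INLINE void store_scalar_sketch(tran_low_t *a,
                                       const int16_t *b /*[32 * 8]*/) {
  int row, blk, lane;
  for (row = 0; row < 8; ++row) {
    for (blk = 0; blk < 4; ++blk) {
      for (lane = 0; lane < 8; ++lane) {
        a[row * 32 + blk * 8 + lane] = b[(row + 8 * blk) * 8 + lane];
      }
    }
  }
}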

#undef STORE_S16

static INLINE void scale_input(const int16x8_t *in /*32*/,
                               int16x8_t *out /*32*/) {
  out[0] = vshlq_n_s16(in[0], 2);
  out[1] = vshlq_n_s16(in[1], 2);
  out[2] = vshlq_n_s16(in[2], 2);
  out[3] = vshlq_n_s16(in[3], 2);
  out[4] = vshlq_n_s16(in[4], 2);
  out[5] = vshlq_n_s16(in[5], 2);
  out[6] = vshlq_n_s16(in[6], 2);
  out[7] = vshlq_n_s16(in[7], 2);

  out[8] = vshlq_n_s16(in[8], 2);
  out[9] = vshlq_n_s16(in[9], 2);
  out[10] = vshlq_n_s16(in[10], 2);
  out[11] = vshlq_n_s16(in[11], 2);
  out[12] = vshlq_n_s16(in[12], 2);
  out[13] = vshlq_n_s16(in[13], 2);
  out[14] = vshlq_n_s16(in[14], 2);
  out[15] = vshlq_n_s16(in[15], 2);

  out[16] = vshlq_n_s16(in[16], 2);
  out[17] = vshlq_n_s16(in[17], 2);
  out[18] = vshlq_n_s16(in[18], 2);
  out[19] = vshlq_n_s16(in[19], 2);
  out[20] = vshlq_n_s16(in[20], 2);
  out[21] = vshlq_n_s16(in[21], 2);
  out[22] = vshlq_n_s16(in[22], 2);
  out[23] = vshlq_n_s16(in[23], 2);

  out[24] = vshlq_n_s16(in[24], 2);
  out[25] = vshlq_n_s16(in[25], 2);
  out[26] = vshlq_n_s16(in[26], 2);
  out[27] = vshlq_n_s16(in[27], 2);
  out[28] = vshlq_n_s16(in[28], 2);
  out[29] = vshlq_n_s16(in[29], 2);
  out[30] = vshlq_n_s16(in[30], 2);
  out[31] = vshlq_n_s16(in[31], 2);
}
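
// The << 2 in scale_input() mirrors the *4 input scaling that the C
// reference (vpx_fdct32x32_c) applies before its first pass.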

static INLINE void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
  int16x8_t a[32];
  int16x8_t b[32];

  // Stage 1: Done as part of the load.

  // Stage 2.
  // Mini cross. Cross the first 16 values and the middle 8 of the second
  // half.
  a[0] = vaddq_s16(in[0], in[15]);
  a[1] = vaddq_s16(in[1], in[14]);
  a[2] = vaddq_s16(in[2], in[13]);
  a[3] = vaddq_s16(in[3], in[12]);
  a[4] = vaddq_s16(in[4], in[11]);
  a[5] = vaddq_s16(in[5], in[10]);
  a[6] = vaddq_s16(in[6], in[9]);
  a[7] = vaddq_s16(in[7], in[8]);

  a[8] = vsubq_s16(in[7], in[8]);
  a[9] = vsubq_s16(in[6], in[9]);
  a[10] = vsubq_s16(in[5], in[10]);
  a[11] = vsubq_s16(in[4], in[11]);
  a[12] = vsubq_s16(in[3], in[12]);
  a[13] = vsubq_s16(in[2], in[13]);
  a[14] = vsubq_s16(in[1], in[14]);
  a[15] = vsubq_s16(in[0], in[15]);

  a[16] = in[16];
  a[17] = in[17];
  a[18] = in[18];
  a[19] = in[19];

  butterfly_one_coeff_s16_s32_narrow(in[27], in[20], cospi_16_64, &a[27],
                                     &a[20]);
  butterfly_one_coeff_s16_s32_narrow(in[26], in[21], cospi_16_64, &a[26],
                                     &a[21]);
  butterfly_one_coeff_s16_s32_narrow(in[25], in[22], cospi_16_64, &a[25],
                                     &a[22]);
  butterfly_one_coeff_s16_s32_narrow(in[24], in[23], cospi_16_64, &a[24],
                                     &a[23]);
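  // The calls above are one-coefficient butterflies from fdct_neon.h:
  // butterfly_one_coeff_s16_s32_narrow(x, y, c, &s, &d) computes, per lane,
  //   s = ROUND_POWER_OF_TWO((x + y) * c, DCT_CONST_BITS)
  //   d = ROUND_POWER_OF_TWO((x - y) * c, DCT_CONST_BITS)
  // using 32-bit intermediates before narrowing back to 16 bits.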

  a[28] = in[28];
  a[29] = in[29];
  a[30] = in[30];
  a[31] = in[31];

  // Stage 3.
  b[0] = vaddq_s16(a[0], a[7]);
  b[1] = vaddq_s16(a[1], a[6]);
  b[2] = vaddq_s16(a[2], a[5]);
  b[3] = vaddq_s16(a[3], a[4]);

  b[4] = vsubq_s16(a[3], a[4]);
  b[5] = vsubq_s16(a[2], a[5]);
  b[6] = vsubq_s16(a[1], a[6]);
  b[7] = vsubq_s16(a[0], a[7]);

  b[8] = a[8];
  b[9] = a[9];

  butterfly_one_coeff_s16_s32_narrow(a[13], a[10], cospi_16_64, &b[13], &b[10]);
  butterfly_one_coeff_s16_s32_narrow(a[12], a[11], cospi_16_64, &b[12], &b[11]);

  b[14] = a[14];
  b[15] = a[15];

  b[16] = vaddq_s16(in[16], a[23]);
  b[17] = vaddq_s16(in[17], a[22]);
  b[18] = vaddq_s16(in[18], a[21]);
  b[19] = vaddq_s16(in[19], a[20]);

  b[20] = vsubq_s16(in[19], a[20]);
  b[21] = vsubq_s16(in[18], a[21]);
  b[22] = vsubq_s16(in[17], a[22]);
  b[23] = vsubq_s16(in[16], a[23]);

  b[24] = vsubq_s16(in[31], a[24]);
  b[25] = vsubq_s16(in[30], a[25]);
  b[26] = vsubq_s16(in[29], a[26]);
  b[27] = vsubq_s16(in[28], a[27]);

  b[28] = vaddq_s16(in[28], a[27]);
  b[29] = vaddq_s16(in[29], a[26]);
  b[30] = vaddq_s16(in[30], a[25]);
  b[31] = vaddq_s16(in[31], a[24]);

  // Stage 4.
  a[0] = vaddq_s16(b[0], b[3]);
  a[1] = vaddq_s16(b[1], b[2]);
  a[2] = vsubq_s16(b[1], b[2]);
  a[3] = vsubq_s16(b[0], b[3]);

  a[4] = b[4];

  butterfly_one_coeff_s16_s32_narrow(b[6], b[5], cospi_16_64, &a[6], &a[5]);

  a[7] = b[7];

  a[8] = vaddq_s16(b[8], b[11]);
  a[9] = vaddq_s16(b[9], b[10]);
  a[10] = vsubq_s16(b[9], b[10]);
  a[11] = vsubq_s16(b[8], b[11]);
  a[12] = vsubq_s16(b[15], b[12]);
  a[13] = vsubq_s16(b[14], b[13]);
  a[14] = vaddq_s16(b[14], b[13]);
  a[15] = vaddq_s16(b[15], b[12]);

  a[16] = b[16];
  a[17] = b[17];

  butterfly_two_coeff(b[29], b[18], cospi_8_64, cospi_24_64, &a[29], &a[18]);
  butterfly_two_coeff(b[28], b[19], cospi_8_64, cospi_24_64, &a[28], &a[19]);
  butterfly_two_coeff(b[27], b[20], cospi_24_64, -cospi_8_64, &a[27], &a[20]);
  butterfly_two_coeff(b[26], b[21], cospi_24_64, -cospi_8_64, &a[26], &a[21]);

  a[22] = b[22];
  a[23] = b[23];
  a[24] = b[24];
  a[25] = b[25];

  a[30] = b[30];
  a[31] = b[31];

  // Stage 5.
  butterfly_one_coeff_s16_fast(a[0], a[1], cospi_16_64, &b[0], &b[1]);
  butterfly_two_coeff(a[3], a[2], cospi_8_64, cospi_24_64, &b[2], &b[3]);

  b[4] = vaddq_s16(a[4], a[5]);
  b[5] = vsubq_s16(a[4], a[5]);
  b[6] = vsubq_s16(a[7], a[6]);
  b[7] = vaddq_s16(a[7], a[6]);

  b[8] = a[8];

  butterfly_two_coeff(a[14], a[9], cospi_8_64, cospi_24_64, &b[14], &b[9]);
  butterfly_two_coeff(a[13], a[10], cospi_24_64, -cospi_8_64, &b[13], &b[10]);

  b[11] = a[11];
  b[12] = a[12];

  b[15] = a[15];

  b[16] = vaddq_s16(a[19], a[16]);
  b[17] = vaddq_s16(a[18], a[17]);
  b[18] = vsubq_s16(a[17], a[18]);
  b[19] = vsubq_s16(a[16], a[19]);
  b[20] = vsubq_s16(a[23], a[20]);
  b[21] = vsubq_s16(a[22], a[21]);
  b[22] = vaddq_s16(a[21], a[22]);
  b[23] = vaddq_s16(a[20], a[23]);
  b[24] = vaddq_s16(a[27], a[24]);
  b[25] = vaddq_s16(a[26], a[25]);
  b[26] = vsubq_s16(a[25], a[26]);
  b[27] = vsubq_s16(a[24], a[27]);
  b[28] = vsubq_s16(a[31], a[28]);
  b[29] = vsubq_s16(a[30], a[29]);
  b[30] = vaddq_s16(a[29], a[30]);
  b[31] = vaddq_s16(a[28], a[31]);

  // Stage 6.
  a[0] = b[0];
  a[1] = b[1];
  a[2] = b[2];
  a[3] = b[3];

  butterfly_two_coeff(b[7], b[4], cospi_4_64, cospi_28_64, &a[4], &a[7]);
  butterfly_two_coeff(b[6], b[5], cospi_20_64, cospi_12_64, &a[5], &a[6]);

  a[8] = vaddq_s16(b[8], b[9]);
  a[9] = vsubq_s16(b[8], b[9]);
  a[10] = vsubq_s16(b[11], b[10]);
  a[11] = vaddq_s16(b[11], b[10]);
  a[12] = vaddq_s16(b[12], b[13]);
  a[13] = vsubq_s16(b[12], b[13]);
  a[14] = vsubq_s16(b[15], b[14]);
  a[15] = vaddq_s16(b[15], b[14]);

  a[16] = b[16];
  a[19] = b[19];
  a[20] = b[20];
  a[23] = b[23];
  a[24] = b[24];
  a[27] = b[27];
  a[28] = b[28];
  a[31] = b[31];

  butterfly_two_coeff(b[30], b[17], cospi_4_64, cospi_28_64, &a[30], &a[17]);
  butterfly_two_coeff(b[29], b[18], cospi_28_64, -cospi_4_64, &a[29], &a[18]);

  butterfly_two_coeff(b[26], b[21], cospi_20_64, cospi_12_64, &a[26], &a[21]);
  butterfly_two_coeff(b[25], b[22], cospi_12_64, -cospi_20_64, &a[25], &a[22]);

  // Stage 7.
  b[0] = a[0];
  b[1] = a[1];
  b[2] = a[2];
  b[3] = a[3];
  b[4] = a[4];
  b[5] = a[5];
  b[6] = a[6];
  b[7] = a[7];

  butterfly_two_coeff(a[15], a[8], cospi_2_64, cospi_30_64, &b[8], &b[15]);
  butterfly_two_coeff(a[14], a[9], cospi_18_64, cospi_14_64, &b[9], &b[14]);
  butterfly_two_coeff(a[13], a[10], cospi_10_64, cospi_22_64, &b[10], &b[13]);
  butterfly_two_coeff(a[12], a[11], cospi_26_64, cospi_6_64, &b[11], &b[12]);

  b[16] = vaddq_s16(a[16], a[17]);
  b[17] = vsubq_s16(a[16], a[17]);
  b[18] = vsubq_s16(a[19], a[18]);
  b[19] = vaddq_s16(a[19], a[18]);
  b[20] = vaddq_s16(a[20], a[21]);
  b[21] = vsubq_s16(a[20], a[21]);
  b[22] = vsubq_s16(a[23], a[22]);
  b[23] = vaddq_s16(a[23], a[22]);
  b[24] = vaddq_s16(a[24], a[25]);
  b[25] = vsubq_s16(a[24], a[25]);
  b[26] = vsubq_s16(a[27], a[26]);
  b[27] = vaddq_s16(a[27], a[26]);
  b[28] = vaddq_s16(a[28], a[29]);
  b[29] = vsubq_s16(a[28], a[29]);
  b[30] = vsubq_s16(a[31], a[30]);
  b[31] = vaddq_s16(a[31], a[30]);

  // Final stage.
  // Also compute partial rounding shift:
  // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
  out[0] = sub_round_shift_s16(b[0]);
  out[16] = sub_round_shift_s16(b[1]);
  out[8] = sub_round_shift_s16(b[2]);
  out[24] = sub_round_shift_s16(b[3]);
  out[4] = sub_round_shift_s16(b[4]);
  out[20] = sub_round_shift_s16(b[5]);
  out[12] = sub_round_shift_s16(b[6]);
  out[28] = sub_round_shift_s16(b[7]);
  out[2] = sub_round_shift_s16(b[8]);
  out[18] = sub_round_shift_s16(b[9]);
  out[10] = sub_round_shift_s16(b[10]);
  out[26] = sub_round_shift_s16(b[11]);
  out[6] = sub_round_shift_s16(b[12]);
  out[22] = sub_round_shift_s16(b[13]);
  out[14] = sub_round_shift_s16(b[14]);
  out[30] = sub_round_shift_s16(b[15]);

  butterfly_two_coeff(b[31], b[16], cospi_1_64, cospi_31_64, &a[1], &a[31]);
  out[1] = sub_round_shift_s16(a[1]);
  out[31] = sub_round_shift_s16(a[31]);

  butterfly_two_coeff(b[30], b[17], cospi_17_64, cospi_15_64, &a[17], &a[15]);
  out[17] = sub_round_shift_s16(a[17]);
  out[15] = sub_round_shift_s16(a[15]);

  butterfly_two_coeff(b[29], b[18], cospi_9_64, cospi_23_64, &a[9], &a[23]);
  out[9] = sub_round_shift_s16(a[9]);
  out[23] = sub_round_shift_s16(a[23]);

  butterfly_two_coeff(b[28], b[19], cospi_25_64, cospi_7_64, &a[25], &a[7]);
  out[25] = sub_round_shift_s16(a[25]);
  out[7] = sub_round_shift_s16(a[7]);

  butterfly_two_coeff(b[27], b[20], cospi_5_64, cospi_27_64, &a[5], &a[27]);
  out[5] = sub_round_shift_s16(a[5]);
  out[27] = sub_round_shift_s16(a[27]);

  butterfly_two_coeff(b[26], b[21], cospi_21_64, cospi_11_64, &a[21], &a[11]);
  out[21] = sub_round_shift_s16(a[21]);
  out[11] = sub_round_shift_s16(a[11]);

  butterfly_two_coeff(b[25], b[22], cospi_13_64, cospi_19_64, &a[13], &a[19]);
  out[13] = sub_round_shift_s16(a[13]);
  out[19] = sub_round_shift_s16(a[19]);

  butterfly_two_coeff(b[24], b[23], cospi_29_64, cospi_3_64, &a[29], &a[3]);
  out[29] = sub_round_shift_s16(a[29]);
  out[3] = sub_round_shift_s16(a[3]);
}
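
// Scalar model of the pass-1 rounding used above (an illustrative sketch;
// the vector helper sub_round_shift_s16() lives in fdct_neon.h):
static INLINE int16_t sub_round_shift_scalar_sketch(int16_t t) {
  return (int16_t)((t + 1 + (t > 0)) >> 2);
}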

#define PASS_THROUGH(src, dst, element)    \
  do {                                     \
    dst##_lo[element] = src##_lo[element]; \
    dst##_hi[element] = src##_hi[element]; \
  } while (0)

#define ADD_S16_S32(a, left_index, right_index, b, b_index)                   \
  do {                                                                        \
    b##_lo[b_index] =                                                         \
        vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
    b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]),                 \
                                vget_high_s16(a[right_index]));               \
  } while (0)

#define SUB_S16_S32(a, left_index, right_index, b, b_index)                   \
  do {                                                                        \
    b##_lo[b_index] =                                                         \
        vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
    b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]),                 \
                                vget_high_s16(a[right_index]));               \
  } while (0)

#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index)                     \
  do {                                                                       \
    c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index]));  \
    c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
  } while (0)

#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
  do {                                                                     \
    temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index]));           \
    temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index]));          \
    c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]);   \
    c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]);   \
  } while (0)

#define ADD_S32(a, left_index, right_index, b, b_index)                   \
  do {                                                                    \
    b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
    b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
  } while (0)

#define SUB_S32(a, left_index, right_index, b, b_index)                   \
  do {                                                                    \
    b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
    b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
  } while (0)

#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b,   \
                              add_index, sub_index)                      \
  do {                                                                   \
    butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
                                &b##_lo[add_index], &b##_hi[add_index],  \
                                &b##_lo[sub_index], &b##_hi[sub_index]); \
  } while (0)

#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index,  \
                          sub_index)                                           \
  do {                                                                         \
    butterfly_one_coeff_s32_fast(                                              \
        a##_lo[left_index], a##_hi[left_index], a##_lo[right_index],           \
        a##_hi[right_index], constant, &b##_lo[add_index], &b##_hi[add_index], \
        &b##_lo[sub_index], &b##_hi[sub_index]);                               \
  } while (0)

#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant,           \
                          right_constant, b, add_index, sub_index)             \
  do {                                                                         \
    butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index],            \
                            a##_lo[right_index], a##_hi[right_index],          \
                            left_constant, right_constant, &b##_lo[add_index], \
                            &b##_hi[add_index], &b##_lo[sub_index],            \
                            &b##_hi[sub_index]);                               \
  } while (0)

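// For reference, ADD_S16_S32(b, 0, 7, c, 0) as used below expands to:
//   c_lo[0] = vaddl_s16(vget_low_s16(b[0]), vget_low_s16(b[7]));
//   c_hi[0] = vaddl_s16(vget_high_s16(b[0]), vget_high_s16(b[7]));
// i.e. each int16x8_t is split into halves that are widened to 32 bits as
// they are combined.
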
static INLINE void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
  int16x8_t a[32];
  int16x8_t b[32];
  int32x4_t c_lo[32];
  int32x4_t c_hi[32];
  int32x4_t d_lo[32];
  int32x4_t d_hi[32];

  // Stage 1. Done as part of the load for the first pass.
  a[0] = vaddq_s16(in[0], in[31]);
  a[1] = vaddq_s16(in[1], in[30]);
  a[2] = vaddq_s16(in[2], in[29]);
  a[3] = vaddq_s16(in[3], in[28]);
  a[4] = vaddq_s16(in[4], in[27]);
  a[5] = vaddq_s16(in[5], in[26]);
  a[6] = vaddq_s16(in[6], in[25]);
  a[7] = vaddq_s16(in[7], in[24]);
  a[8] = vaddq_s16(in[8], in[23]);
  a[9] = vaddq_s16(in[9], in[22]);
  a[10] = vaddq_s16(in[10], in[21]);
  a[11] = vaddq_s16(in[11], in[20]);
  a[12] = vaddq_s16(in[12], in[19]);
  a[13] = vaddq_s16(in[13], in[18]);
  a[14] = vaddq_s16(in[14], in[17]);
  a[15] = vaddq_s16(in[15], in[16]);
  a[16] = vsubq_s16(in[15], in[16]);
  a[17] = vsubq_s16(in[14], in[17]);
  a[18] = vsubq_s16(in[13], in[18]);
  a[19] = vsubq_s16(in[12], in[19]);
  a[20] = vsubq_s16(in[11], in[20]);
  a[21] = vsubq_s16(in[10], in[21]);
  a[22] = vsubq_s16(in[9], in[22]);
  a[23] = vsubq_s16(in[8], in[23]);
  a[24] = vsubq_s16(in[7], in[24]);
  a[25] = vsubq_s16(in[6], in[25]);
  a[26] = vsubq_s16(in[5], in[26]);
  a[27] = vsubq_s16(in[4], in[27]);
  a[28] = vsubq_s16(in[3], in[28]);
  a[29] = vsubq_s16(in[2], in[29]);
  a[30] = vsubq_s16(in[1], in[30]);
  a[31] = vsubq_s16(in[0], in[31]);

  // Stage 2.
  b[0] = vaddq_s16(a[0], a[15]);
  b[1] = vaddq_s16(a[1], a[14]);
  b[2] = vaddq_s16(a[2], a[13]);
  b[3] = vaddq_s16(a[3], a[12]);
  b[4] = vaddq_s16(a[4], a[11]);
  b[5] = vaddq_s16(a[5], a[10]);
  b[6] = vaddq_s16(a[6], a[9]);
  b[7] = vaddq_s16(a[7], a[8]);

  b[8] = vsubq_s16(a[7], a[8]);
  b[9] = vsubq_s16(a[6], a[9]);
  b[10] = vsubq_s16(a[5], a[10]);
  b[11] = vsubq_s16(a[4], a[11]);
  b[12] = vsubq_s16(a[3], a[12]);
  b[13] = vsubq_s16(a[2], a[13]);
  b[14] = vsubq_s16(a[1], a[14]);
  b[15] = vsubq_s16(a[0], a[15]);

  b[16] = a[16];
  b[17] = a[17];
  b[18] = a[18];
  b[19] = a[19];

  butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
  butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
  butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
  butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);

  b[28] = a[28];
  b[29] = a[29];
  b[30] = a[30];
  b[31] = a[31];

  // Stage 3. With extreme input values this calculation overflows int16_t.
  // The sources for b[0] are added multiple times and, through testing, have
  // been shown to overflow starting here, so widen to 32 bits.
  ADD_S16_S32(b, 0, 7, c, 0);
  ADD_S16_S32(b, 1, 6, c, 1);
  ADD_S16_S32(b, 2, 5, c, 2);
  ADD_S16_S32(b, 3, 4, c, 3);
  SUB_S16_S32(b, 3, 4, c, 4);
  SUB_S16_S32(b, 2, 5, c, 5);
  SUB_S16_S32(b, 1, 6, c, 6);
  SUB_S16_S32(b, 0, 7, c, 7);

  a[8] = b[8];
  a[9] = b[9];

  BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
  BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);

  a[14] = b[14];
  a[15] = b[15];

  ADD_S16_S32(b, 16, 23, c, 16);
  ADD_S16_S32(b, 17, 22, c, 17);
  ADD_S16_S32(b, 18, 21, c, 18);
  ADD_S16_S32(b, 19, 20, c, 19);
  SUB_S16_S32(b, 19, 20, c, 20);
  SUB_S16_S32(b, 18, 21, c, 21);
  SUB_S16_S32(b, 17, 22, c, 22);
  SUB_S16_S32(b, 16, 23, c, 23);
  SUB_S16_S32(b, 31, 24, c, 24);
  SUB_S16_S32(b, 30, 25, c, 25);
  SUB_S16_S32(b, 29, 26, c, 26);
  SUB_S16_S32(b, 28, 27, c, 27);
  ADD_S16_S32(b, 28, 27, c, 28);
  ADD_S16_S32(b, 29, 26, c, 29);
  ADD_S16_S32(b, 30, 25, c, 30);
  ADD_S16_S32(b, 31, 24, c, 31);

  // Stage 4.
  ADD_S32(c, 0, 3, d, 0);
  ADD_S32(c, 1, 2, d, 1);
  SUB_S32(c, 1, 2, d, 2);
  SUB_S32(c, 0, 3, d, 3);

  PASS_THROUGH(c, d, 4);

  BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);

  PASS_THROUGH(c, d, 7);

  ADDW_S16_S32(c, 11, a, 8, d, 8);
  ADDW_S16_S32(c, 10, a, 9, d, 9);
  SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
  SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
  SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
  SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
  ADDW_S16_S32(c, 13, b, 14, d, 14);
  ADDW_S16_S32(c, 12, b, 15, d, 15);

  PASS_THROUGH(c, d, 16);
  PASS_THROUGH(c, d, 17);

  BUTTERFLY_TWO_S32(c, 29, 18, cospi_8_64, cospi_24_64, d, 29, 18);
  BUTTERFLY_TWO_S32(c, 28, 19, cospi_8_64, cospi_24_64, d, 28, 19);
  BUTTERFLY_TWO_S32(c, 27, 20, cospi_24_64, -cospi_8_64, d, 27, 20);
  BUTTERFLY_TWO_S32(c, 26, 21, cospi_24_64, -cospi_8_64, d, 26, 21);

  PASS_THROUGH(c, d, 22);
  PASS_THROUGH(c, d, 23);
  PASS_THROUGH(c, d, 24);
  PASS_THROUGH(c, d, 25);

  PASS_THROUGH(c, d, 30);
  PASS_THROUGH(c, d, 31);

  // Stage 5.
  BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
  BUTTERFLY_TWO_S32(d, 3, 2, cospi_8_64, cospi_24_64, c, 2, 3);

  ADD_S32(d, 4, 5, c, 4);
  SUB_S32(d, 4, 5, c, 5);
  SUB_S32(d, 7, 6, c, 6);
  ADD_S32(d, 7, 6, c, 7);

  PASS_THROUGH(d, c, 8);

  BUTTERFLY_TWO_S32(d, 14, 9, cospi_8_64, cospi_24_64, c, 14, 9);
  BUTTERFLY_TWO_S32(d, 13, 10, cospi_24_64, -cospi_8_64, c, 13, 10);

  PASS_THROUGH(d, c, 11);
  PASS_THROUGH(d, c, 12);
  PASS_THROUGH(d, c, 15);

  ADD_S32(d, 16, 19, c, 16);
  ADD_S32(d, 17, 18, c, 17);
  SUB_S32(d, 17, 18, c, 18);
  SUB_S32(d, 16, 19, c, 19);
  SUB_S32(d, 23, 20, c, 20);
  SUB_S32(d, 22, 21, c, 21);
  ADD_S32(d, 22, 21, c, 22);
  ADD_S32(d, 23, 20, c, 23);
  ADD_S32(d, 24, 27, c, 24);
  ADD_S32(d, 25, 26, c, 25);
  SUB_S32(d, 25, 26, c, 26);
  SUB_S32(d, 24, 27, c, 27);
  SUB_S32(d, 31, 28, c, 28);
  SUB_S32(d, 30, 29, c, 29);
  ADD_S32(d, 30, 29, c, 30);
  ADD_S32(d, 31, 28, c, 31);

  // Stage 6.
  PASS_THROUGH(c, d, 0);
  PASS_THROUGH(c, d, 1);
  PASS_THROUGH(c, d, 2);
  PASS_THROUGH(c, d, 3);

  BUTTERFLY_TWO_S32(c, 7, 4, cospi_4_64, cospi_28_64, d, 4, 7);
  BUTTERFLY_TWO_S32(c, 6, 5, cospi_20_64, cospi_12_64, d, 5, 6);

  ADD_S32(c, 8, 9, d, 8);
  SUB_S32(c, 8, 9, d, 9);
  SUB_S32(c, 11, 10, d, 10);
  ADD_S32(c, 11, 10, d, 11);
  ADD_S32(c, 12, 13, d, 12);
  SUB_S32(c, 12, 13, d, 13);
  SUB_S32(c, 15, 14, d, 14);
  ADD_S32(c, 15, 14, d, 15);

  PASS_THROUGH(c, d, 16);
  PASS_THROUGH(c, d, 19);
  PASS_THROUGH(c, d, 20);
  PASS_THROUGH(c, d, 23);
  PASS_THROUGH(c, d, 24);
  PASS_THROUGH(c, d, 27);
  PASS_THROUGH(c, d, 28);
  PASS_THROUGH(c, d, 31);

  BUTTERFLY_TWO_S32(c, 30, 17, cospi_4_64, cospi_28_64, d, 30, 17);
  BUTTERFLY_TWO_S32(c, 29, 18, cospi_28_64, -cospi_4_64, d, 29, 18);
  BUTTERFLY_TWO_S32(c, 26, 21, cospi_20_64, cospi_12_64, d, 26, 21);
  BUTTERFLY_TWO_S32(c, 25, 22, cospi_12_64, -cospi_20_64, d, 25, 22);

  // Stage 7.
  PASS_THROUGH(d, c, 0);
  PASS_THROUGH(d, c, 1);
  PASS_THROUGH(d, c, 2);
  PASS_THROUGH(d, c, 3);
  PASS_THROUGH(d, c, 4);
  PASS_THROUGH(d, c, 5);
  PASS_THROUGH(d, c, 6);
  PASS_THROUGH(d, c, 7);

  BUTTERFLY_TWO_S32(d, 15, 8, cospi_2_64, cospi_30_64, c, 8, 15);
  BUTTERFLY_TWO_S32(d, 14, 9, cospi_18_64, cospi_14_64, c, 9, 14);
  BUTTERFLY_TWO_S32(d, 13, 10, cospi_10_64, cospi_22_64, c, 10, 13);
  BUTTERFLY_TWO_S32(d, 12, 11, cospi_26_64, cospi_6_64, c, 11, 12);

  ADD_S32(d, 16, 17, c, 16);
  SUB_S32(d, 16, 17, c, 17);
  SUB_S32(d, 19, 18, c, 18);
  ADD_S32(d, 19, 18, c, 19);
  ADD_S32(d, 20, 21, c, 20);
  SUB_S32(d, 20, 21, c, 21);
  SUB_S32(d, 23, 22, c, 22);
  ADD_S32(d, 23, 22, c, 23);
  ADD_S32(d, 24, 25, c, 24);
  SUB_S32(d, 24, 25, c, 25);
  SUB_S32(d, 27, 26, c, 26);
  ADD_S32(d, 27, 26, c, 27);
  ADD_S32(d, 28, 29, c, 28);
  SUB_S32(d, 28, 29, c, 29);
  SUB_S32(d, 31, 30, c, 30);
  ADD_S32(d, 31, 30, c, 31);

  // Final stage.
  // Roll rounding into this function so we can pass back int16x8_t vectors.

  out[0] = add_round_shift_s32_narrow(c_lo[0], c_hi[0]);
  out[16] = add_round_shift_s32_narrow(c_lo[1], c_hi[1]);

  out[8] = add_round_shift_s32_narrow(c_lo[2], c_hi[2]);
  out[24] = add_round_shift_s32_narrow(c_lo[3], c_hi[3]);
  out[4] = add_round_shift_s32_narrow(c_lo[4], c_hi[4]);
  out[20] = add_round_shift_s32_narrow(c_lo[5], c_hi[5]);
  out[12] = add_round_shift_s32_narrow(c_lo[6], c_hi[6]);

  out[28] = add_round_shift_s32_narrow(c_lo[7], c_hi[7]);
  out[2] = add_round_shift_s32_narrow(c_lo[8], c_hi[8]);
  out[18] = add_round_shift_s32_narrow(c_lo[9], c_hi[9]);
  out[10] = add_round_shift_s32_narrow(c_lo[10], c_hi[10]);

  out[26] = add_round_shift_s32_narrow(c_lo[11], c_hi[11]);
  out[6] = add_round_shift_s32_narrow(c_lo[12], c_hi[12]);
  out[22] = add_round_shift_s32_narrow(c_lo[13], c_hi[13]);
  out[14] = add_round_shift_s32_narrow(c_lo[14], c_hi[14]);
  out[30] = add_round_shift_s32_narrow(c_lo[15], c_hi[15]);

  BUTTERFLY_TWO_S32(c, 31, 16, cospi_1_64, cospi_31_64, d, 1, 31);
  out[1] = add_round_shift_s32_narrow(d_lo[1], d_hi[1]);
  out[31] = add_round_shift_s32_narrow(d_lo[31], d_hi[31]);

  BUTTERFLY_TWO_S32(c, 30, 17, cospi_17_64, cospi_15_64, d, 17, 15);
  out[17] = add_round_shift_s32_narrow(d_lo[17], d_hi[17]);
  out[15] = add_round_shift_s32_narrow(d_lo[15], d_hi[15]);

  BUTTERFLY_TWO_S32(c, 29, 18, cospi_9_64, cospi_23_64, d, 9, 23);
  out[9] = add_round_shift_s32_narrow(d_lo[9], d_hi[9]);
  out[23] = add_round_shift_s32_narrow(d_lo[23], d_hi[23]);

  BUTTERFLY_TWO_S32(c, 28, 19, cospi_25_64, cospi_7_64, d, 25, 7);
  out[25] = add_round_shift_s32_narrow(d_lo[25], d_hi[25]);
  out[7] = add_round_shift_s32_narrow(d_lo[7], d_hi[7]);

  BUTTERFLY_TWO_S32(c, 27, 20, cospi_5_64, cospi_27_64, d, 5, 27);
  out[5] = add_round_shift_s32_narrow(d_lo[5], d_hi[5]);
  out[27] = add_round_shift_s32_narrow(d_lo[27], d_hi[27]);

  BUTTERFLY_TWO_S32(c, 26, 21, cospi_21_64, cospi_11_64, d, 21, 11);
  out[21] = add_round_shift_s32_narrow(d_lo[21], d_hi[21]);
  out[11] = add_round_shift_s32_narrow(d_lo[11], d_hi[11]);

  BUTTERFLY_TWO_S32(c, 25, 22, cospi_13_64, cospi_19_64, d, 13, 19);
  out[13] = add_round_shift_s32_narrow(d_lo[13], d_hi[13]);
  out[19] = add_round_shift_s32_narrow(d_lo[19], d_hi[19]);

  BUTTERFLY_TWO_S32(c, 24, 23, cospi_29_64, cospi_3_64, d, 29, 3);
  out[29] = add_round_shift_s32_narrow(d_lo[29], d_hi[29]);
  out[3] = add_round_shift_s32_narrow(d_lo[3], d_hi[3]);
}

static INLINE void dct_body_second_pass_rd(const int16x8_t *in,
                                           int16x8_t *out) {
  int16x8_t a[32];
  int16x8_t b[32];

  // Stage 1. Done as part of the load for the first pass.
  a[0] = vaddq_s16(in[0], in[31]);
  a[1] = vaddq_s16(in[1], in[30]);
  a[2] = vaddq_s16(in[2], in[29]);
  a[3] = vaddq_s16(in[3], in[28]);
  a[4] = vaddq_s16(in[4], in[27]);
  a[5] = vaddq_s16(in[5], in[26]);
  a[6] = vaddq_s16(in[6], in[25]);
  a[7] = vaddq_s16(in[7], in[24]);
  a[8] = vaddq_s16(in[8], in[23]);
  a[9] = vaddq_s16(in[9], in[22]);
  a[10] = vaddq_s16(in[10], in[21]);
  a[11] = vaddq_s16(in[11], in[20]);
  a[12] = vaddq_s16(in[12], in[19]);
  a[13] = vaddq_s16(in[13], in[18]);
  a[14] = vaddq_s16(in[14], in[17]);
  a[15] = vaddq_s16(in[15], in[16]);
  a[16] = vsubq_s16(in[15], in[16]);
  a[17] = vsubq_s16(in[14], in[17]);
  a[18] = vsubq_s16(in[13], in[18]);
  a[19] = vsubq_s16(in[12], in[19]);
  a[20] = vsubq_s16(in[11], in[20]);
  a[21] = vsubq_s16(in[10], in[21]);
  a[22] = vsubq_s16(in[9], in[22]);
  a[23] = vsubq_s16(in[8], in[23]);
  a[24] = vsubq_s16(in[7], in[24]);
  a[25] = vsubq_s16(in[6], in[25]);
  a[26] = vsubq_s16(in[5], in[26]);
  a[27] = vsubq_s16(in[4], in[27]);
  a[28] = vsubq_s16(in[3], in[28]);
  a[29] = vsubq_s16(in[2], in[29]);
  a[30] = vsubq_s16(in[1], in[30]);
  a[31] = vsubq_s16(in[0], in[31]);

  // Stage 2.
  // For the "rd" version, all the values are rounded down after stage 2 to
  // keep the values in 16 bits.
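  // This corresponds to half_round_shift() in the C implementation:
  //   rv = (input + 1 + (input < 0)) >> 2;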
  b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15]));
  b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14]));
  b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13]));
  b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12]));
  b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11]));
  b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10]));
  b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9]));
  b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8]));

  b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8]));
  b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9]));
  b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10]));
  b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11]));
  b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12]));
  b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13]));
  b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14]));
  b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15]));

  b[16] = add_round_shift_s16(a[16]);
  b[17] = add_round_shift_s16(a[17]);
  b[18] = add_round_shift_s16(a[18]);
  b[19] = add_round_shift_s16(a[19]);

  butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
  butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
  butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
  butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);
  b[20] = add_round_shift_s16(b[20]);
  b[21] = add_round_shift_s16(b[21]);
  b[22] = add_round_shift_s16(b[22]);
  b[23] = add_round_shift_s16(b[23]);
  b[24] = add_round_shift_s16(b[24]);
  b[25] = add_round_shift_s16(b[25]);
  b[26] = add_round_shift_s16(b[26]);
  b[27] = add_round_shift_s16(b[27]);

  b[28] = add_round_shift_s16(a[28]);
  b[29] = add_round_shift_s16(a[29]);
  b[30] = add_round_shift_s16(a[30]);
  b[31] = add_round_shift_s16(a[31]);

  // Stage 3.
  a[0] = vaddq_s16(b[0], b[7]);
  a[1] = vaddq_s16(b[1], b[6]);
  a[2] = vaddq_s16(b[2], b[5]);
  a[3] = vaddq_s16(b[3], b[4]);

  a[4] = vsubq_s16(b[3], b[4]);
  a[5] = vsubq_s16(b[2], b[5]);
  a[6] = vsubq_s16(b[1], b[6]);
  a[7] = vsubq_s16(b[0], b[7]);

  a[8] = b[8];
  a[9] = b[9];

  butterfly_one_coeff_s16_s32_narrow(b[13], b[10], cospi_16_64, &a[13], &a[10]);
  butterfly_one_coeff_s16_s32_narrow(b[12], b[11], cospi_16_64, &a[12], &a[11]);

  a[14] = b[14];
  a[15] = b[15];

  a[16] = vaddq_s16(b[16], b[23]);
  a[17] = vaddq_s16(b[17], b[22]);
  a[18] = vaddq_s16(b[18], b[21]);
  a[19] = vaddq_s16(b[19], b[20]);

  a[20] = vsubq_s16(b[19], b[20]);
  a[21] = vsubq_s16(b[18], b[21]);
  a[22] = vsubq_s16(b[17], b[22]);
  a[23] = vsubq_s16(b[16], b[23]);

  a[24] = vsubq_s16(b[31], b[24]);
  a[25] = vsubq_s16(b[30], b[25]);
  a[26] = vsubq_s16(b[29], b[26]);
  a[27] = vsubq_s16(b[28], b[27]);

  a[28] = vaddq_s16(b[28], b[27]);
  a[29] = vaddq_s16(b[29], b[26]);
  a[30] = vaddq_s16(b[30], b[25]);
  a[31] = vaddq_s16(b[31], b[24]);

  // Stage 4.
  b[0] = vaddq_s16(a[0], a[3]);
  b[1] = vaddq_s16(a[1], a[2]);
  b[2] = vsubq_s16(a[1], a[2]);
  b[3] = vsubq_s16(a[0], a[3]);

  b[4] = a[4];

  butterfly_one_coeff_s16_s32_narrow(a[6], a[5], cospi_16_64, &b[6], &b[5]);

  b[7] = a[7];

  b[8] = vaddq_s16(a[8], a[11]);
  b[9] = vaddq_s16(a[9], a[10]);
  b[10] = vsubq_s16(a[9], a[10]);
  b[11] = vsubq_s16(a[8], a[11]);
  b[12] = vsubq_s16(a[15], a[12]);
  b[13] = vsubq_s16(a[14], a[13]);
  b[14] = vaddq_s16(a[14], a[13]);
  b[15] = vaddq_s16(a[15], a[12]);

  b[16] = a[16];
  b[17] = a[17];

  butterfly_two_coeff(a[29], a[18], cospi_8_64, cospi_24_64, &b[29], &b[18]);
  butterfly_two_coeff(a[28], a[19], cospi_8_64, cospi_24_64, &b[28], &b[19]);
  butterfly_two_coeff(a[27], a[20], cospi_24_64, -cospi_8_64, &b[27], &b[20]);
  butterfly_two_coeff(a[26], a[21], cospi_24_64, -cospi_8_64, &b[26], &b[21]);

  b[22] = a[22];
  b[23] = a[23];
  b[24] = a[24];
  b[25] = a[25];

  b[30] = a[30];
  b[31] = a[31];

  // Stage 5.
  butterfly_one_coeff_s16_s32_narrow(b[0], b[1], cospi_16_64, &a[0], &a[1]);
  butterfly_two_coeff(b[3], b[2], cospi_8_64, cospi_24_64, &a[2], &a[3]);

  a[4] = vaddq_s16(b[4], b[5]);
  a[5] = vsubq_s16(b[4], b[5]);
  a[6] = vsubq_s16(b[7], b[6]);
  a[7] = vaddq_s16(b[7], b[6]);

  a[8] = b[8];

  butterfly_two_coeff(b[14], b[9], cospi_8_64, cospi_24_64, &a[14], &a[9]);
  butterfly_two_coeff(b[13], b[10], cospi_24_64, -cospi_8_64, &a[13], &a[10]);

  a[11] = b[11];
  a[12] = b[12];

  a[15] = b[15];

  a[16] = vaddq_s16(b[19], b[16]);
  a[17] = vaddq_s16(b[18], b[17]);
  a[18] = vsubq_s16(b[17], b[18]);
  a[19] = vsubq_s16(b[16], b[19]);
  a[20] = vsubq_s16(b[23], b[20]);
  a[21] = vsubq_s16(b[22], b[21]);
  a[22] = vaddq_s16(b[21], b[22]);
  a[23] = vaddq_s16(b[20], b[23]);
  a[24] = vaddq_s16(b[27], b[24]);
  a[25] = vaddq_s16(b[26], b[25]);
  a[26] = vsubq_s16(b[25], b[26]);
  a[27] = vsubq_s16(b[24], b[27]);
  a[28] = vsubq_s16(b[31], b[28]);
  a[29] = vsubq_s16(b[30], b[29]);
  a[30] = vaddq_s16(b[29], b[30]);
  a[31] = vaddq_s16(b[28], b[31]);

  // Stage 6.
  b[0] = a[0];
  b[1] = a[1];
  b[2] = a[2];
  b[3] = a[3];

  butterfly_two_coeff(a[7], a[4], cospi_4_64, cospi_28_64, &b[4], &b[7]);
  butterfly_two_coeff(a[6], a[5], cospi_20_64, cospi_12_64, &b[5], &b[6]);

  b[8] = vaddq_s16(a[8], a[9]);
  b[9] = vsubq_s16(a[8], a[9]);
  b[10] = vsubq_s16(a[11], a[10]);
  b[11] = vaddq_s16(a[11], a[10]);
  b[12] = vaddq_s16(a[12], a[13]);
  b[13] = vsubq_s16(a[12], a[13]);
  b[14] = vsubq_s16(a[15], a[14]);
  b[15] = vaddq_s16(a[15], a[14]);

  b[16] = a[16];
  b[19] = a[19];
  b[20] = a[20];
  b[23] = a[23];
  b[24] = a[24];
  b[27] = a[27];
  b[28] = a[28];
  b[31] = a[31];

  butterfly_two_coeff(a[30], a[17], cospi_4_64, cospi_28_64, &b[30], &b[17]);
  butterfly_two_coeff(a[29], a[18], cospi_28_64, -cospi_4_64, &b[29], &b[18]);

  butterfly_two_coeff(a[26], a[21], cospi_20_64, cospi_12_64, &b[26], &b[21]);
  butterfly_two_coeff(a[25], a[22], cospi_12_64, -cospi_20_64, &b[25], &b[22]);

  // Stage 7.
  a[0] = b[0];
  a[1] = b[1];
  a[2] = b[2];
  a[3] = b[3];
  a[4] = b[4];
  a[5] = b[5];
  a[6] = b[6];
  a[7] = b[7];

  butterfly_two_coeff(b[15], b[8], cospi_2_64, cospi_30_64, &a[8], &a[15]);
  butterfly_two_coeff(b[14], b[9], cospi_18_64, cospi_14_64, &a[9], &a[14]);
  butterfly_two_coeff(b[13], b[10], cospi_10_64, cospi_22_64, &a[10], &a[13]);
  butterfly_two_coeff(b[12], b[11], cospi_26_64, cospi_6_64, &a[11], &a[12]);

  a[16] = vaddq_s16(b[16], b[17]);
  a[17] = vsubq_s16(b[16], b[17]);
  a[18] = vsubq_s16(b[19], b[18]);
  a[19] = vaddq_s16(b[19], b[18]);
  a[20] = vaddq_s16(b[20], b[21]);
  a[21] = vsubq_s16(b[20], b[21]);
  a[22] = vsubq_s16(b[23], b[22]);
  a[23] = vaddq_s16(b[23], b[22]);
  a[24] = vaddq_s16(b[24], b[25]);
  a[25] = vsubq_s16(b[24], b[25]);
  a[26] = vsubq_s16(b[27], b[26]);
  a[27] = vaddq_s16(b[27], b[26]);
  a[28] = vaddq_s16(b[28], b[29]);
  a[29] = vsubq_s16(b[28], b[29]);
  a[30] = vsubq_s16(b[31], b[30]);
  a[31] = vaddq_s16(b[31], b[30]);

  // Final stage.
  out[0] = a[0];
  out[16] = a[1];
  out[8] = a[2];
  out[24] = a[3];
  out[4] = a[4];
  out[20] = a[5];
  out[12] = a[6];
  out[28] = a[7];
  out[2] = a[8];
  out[18] = a[9];
  out[10] = a[10];
  out[26] = a[11];
  out[6] = a[12];
  out[22] = a[13];
  out[14] = a[14];
  out[30] = a[15];

  butterfly_two_coeff(a[31], a[16], cospi_1_64, cospi_31_64, &out[1], &out[31]);
  butterfly_two_coeff(a[30], a[17], cospi_17_64, cospi_15_64, &out[17],
                      &out[15]);
  butterfly_two_coeff(a[29], a[18], cospi_9_64, cospi_23_64, &out[9], &out[23]);
  butterfly_two_coeff(a[28], a[19], cospi_25_64, cospi_7_64, &out[25], &out[7]);
  butterfly_two_coeff(a[27], a[20], cospi_5_64, cospi_27_64, &out[5], &out[27]);
  butterfly_two_coeff(a[26], a[21], cospi_21_64, cospi_11_64, &out[21],
                      &out[11]);
  butterfly_two_coeff(a[25], a[22], cospi_13_64, cospi_19_64, &out[13],
                      &out[19]);
  butterfly_two_coeff(a[24], a[23], cospi_29_64, cospi_3_64, &out[29], &out[3]);
}

#undef PASS_THROUGH
#undef ADD_S16_S32
#undef SUB_S16_S32
#undef ADDW_S16_S32
#undef SUBW_S16_S32
#undef ADD_S32
#undef SUB_S32
#undef BUTTERFLY_ONE_S16_S32
#undef BUTTERFLY_ONE_S32
#undef BUTTERFLY_TWO_S32

#if CONFIG_VP9_HIGHBITDEPTH

// Store the 32x32 output one row at a time, eight int32x4_t vectors per row,
// assuming stride == 32.
static INLINE void store32x32_s32(
    tran_low_t *a, const int32x4_t *l1 /*[32]*/, const int32x4_t *r1 /*[32]*/,
    const int32x4_t *l2 /*[32]*/, const int32x4_t *r2 /*[32]*/,
    const int32x4_t *l3 /*[32]*/, const int32x4_t *r3 /*[32]*/,
    const int32x4_t *l4 /*[32]*/, const int32x4_t *r4 /*[32]*/) {
  int i;
  for (i = 0; i < 32; i++) {
    vst1q_s32(a, l1[i]);
    vst1q_s32(a + 4, r1[i]);
    vst1q_s32(a + 8, l2[i]);
    vst1q_s32(a + 12, r2[i]);
    vst1q_s32(a + 16, l3[i]);
    vst1q_s32(a + 20, r3[i]);
    vst1q_s32(a + 24, l4[i]);
    vst1q_s32(a + 28, r4[i]);
    a += 32;
  }
}
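
// Note: with CONFIG_VP9_HIGHBITDEPTH, tran_low_t is int32_t, which is why
// store32x32_s32() writes each row as eight int32x4_t vectors.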

static INLINE void highbd_scale_input(const int16x8_t *a /*[32]*/,
                                      int32x4_t *left /*[32]*/,
                                      int32x4_t *right /* [32] */) {
  left[0] = vshll_n_s16(vget_low_s16(a[0]), 2);
  left[1] = vshll_n_s16(vget_low_s16(a[1]), 2);
  left[2] = vshll_n_s16(vget_low_s16(a[2]), 2);
  left[3] = vshll_n_s16(vget_low_s16(a[3]), 2);
  left[4] = vshll_n_s16(vget_low_s16(a[4]), 2);
  left[5] = vshll_n_s16(vget_low_s16(a[5]), 2);
  left[6] = vshll_n_s16(vget_low_s16(a[6]), 2);
  left[7] = vshll_n_s16(vget_low_s16(a[7]), 2);
  left[8] = vshll_n_s16(vget_low_s16(a[8]), 2);
  left[9] = vshll_n_s16(vget_low_s16(a[9]), 2);
  left[10] = vshll_n_s16(vget_low_s16(a[10]), 2);
  left[11] = vshll_n_s16(vget_low_s16(a[11]), 2);
  left[12] = vshll_n_s16(vget_low_s16(a[12]), 2);
  left[13] = vshll_n_s16(vget_low_s16(a[13]), 2);
  left[14] = vshll_n_s16(vget_low_s16(a[14]), 2);
  left[15] = vshll_n_s16(vget_low_s16(a[15]), 2);
  left[16] = vshll_n_s16(vget_low_s16(a[16]), 2);
  left[17] = vshll_n_s16(vget_low_s16(a[17]), 2);
  left[18] = vshll_n_s16(vget_low_s16(a[18]), 2);
  left[19] = vshll_n_s16(vget_low_s16(a[19]), 2);
  left[20] = vshll_n_s16(vget_low_s16(a[20]), 2);
  left[21] = vshll_n_s16(vget_low_s16(a[21]), 2);
  left[22] = vshll_n_s16(vget_low_s16(a[22]), 2);
  left[23] = vshll_n_s16(vget_low_s16(a[23]), 2);
  left[24] = vshll_n_s16(vget_low_s16(a[24]), 2);
  left[25] = vshll_n_s16(vget_low_s16(a[25]), 2);
  left[26] = vshll_n_s16(vget_low_s16(a[26]), 2);
  left[27] = vshll_n_s16(vget_low_s16(a[27]), 2);
  left[28] = vshll_n_s16(vget_low_s16(a[28]), 2);
  left[29] = vshll_n_s16(vget_low_s16(a[29]), 2);
  left[30] = vshll_n_s16(vget_low_s16(a[30]), 2);
  left[31] = vshll_n_s16(vget_low_s16(a[31]), 2);

  right[0] = vshll_n_s16(vget_high_s16(a[0]), 2);
  right[1] = vshll_n_s16(vget_high_s16(a[1]), 2);
  right[2] = vshll_n_s16(vget_high_s16(a[2]), 2);
  right[3] = vshll_n_s16(vget_high_s16(a[3]), 2);
  right[4] = vshll_n_s16(vget_high_s16(a[4]), 2);
  right[5] = vshll_n_s16(vget_high_s16(a[5]), 2);
  right[6] = vshll_n_s16(vget_high_s16(a[6]), 2);
  right[7] = vshll_n_s16(vget_high_s16(a[7]), 2);
  right[8] = vshll_n_s16(vget_high_s16(a[8]), 2);
  right[9] = vshll_n_s16(vget_high_s16(a[9]), 2);
  right[10] = vshll_n_s16(vget_high_s16(a[10]), 2);
  right[11] = vshll_n_s16(vget_high_s16(a[11]), 2);
  right[12] = vshll_n_s16(vget_high_s16(a[12]), 2);
  right[13] = vshll_n_s16(vget_high_s16(a[13]), 2);
  right[14] = vshll_n_s16(vget_high_s16(a[14]), 2);
  right[15] = vshll_n_s16(vget_high_s16(a[15]), 2);
  right[16] = vshll_n_s16(vget_high_s16(a[16]), 2);
  right[17] = vshll_n_s16(vget_high_s16(a[17]), 2);
  right[18] = vshll_n_s16(vget_high_s16(a[18]), 2);
  right[19] = vshll_n_s16(vget_high_s16(a[19]), 2);
  right[20] = vshll_n_s16(vget_high_s16(a[20]), 2);
  right[21] = vshll_n_s16(vget_high_s16(a[21]), 2);
  right[22] = vshll_n_s16(vget_high_s16(a[22]), 2);
  right[23] = vshll_n_s16(vget_high_s16(a[23]), 2);
  right[24] = vshll_n_s16(vget_high_s16(a[24]), 2);
  right[25] = vshll_n_s16(vget_high_s16(a[25]), 2);
  right[26] = vshll_n_s16(vget_high_s16(a[26]), 2);
  right[27] = vshll_n_s16(vget_high_s16(a[27]), 2);
  right[28] = vshll_n_s16(vget_high_s16(a[28]), 2);
  right[29] = vshll_n_s16(vget_high_s16(a[29]), 2);
  right[30] = vshll_n_s16(vget_high_s16(a[30]), 2);
  right[31] = vshll_n_s16(vget_high_s16(a[31]), 2);
}
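
// vshll_n_s16 applies the same << 2 input scaling as scale_input() while
// widening each lane to 32 bits: high-bitdepth intermediates no longer fit
// in 16 bits, so this path carries left/right int32x4_t halves throughout.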

static INLINE void highbd_cross_input(const int32x4_t *a_left /*[32]*/,
                                      int32x4_t *a_right /*[32]*/,
                                      int32x4_t *b_left /*[32]*/,
                                      int32x4_t *b_right /*[32]*/) {
  // Stage 1. Done as part of the load for the first pass.
  b_left[0] = vaddq_s32(a_left[0], a_left[31]);
  b_left[1] = vaddq_s32(a_left[1], a_left[30]);
  b_left[2] = vaddq_s32(a_left[2], a_left[29]);
  b_left[3] = vaddq_s32(a_left[3], a_left[28]);
  b_left[4] = vaddq_s32(a_left[4], a_left[27]);
  b_left[5] = vaddq_s32(a_left[5], a_left[26]);
  b_left[6] = vaddq_s32(a_left[6], a_left[25]);
  b_left[7] = vaddq_s32(a_left[7], a_left[24]);
  b_left[8] = vaddq_s32(a_left[8], a_left[23]);
  b_left[9] = vaddq_s32(a_left[9], a_left[22]);
  b_left[10] = vaddq_s32(a_left[10], a_left[21]);
  b_left[11] = vaddq_s32(a_left[11], a_left[20]);
  b_left[12] = vaddq_s32(a_left[12], a_left[19]);
  b_left[13] = vaddq_s32(a_left[13], a_left[18]);
  b_left[14] = vaddq_s32(a_left[14], a_left[17]);
  b_left[15] = vaddq_s32(a_left[15], a_left[16]);

  b_right[0] = vaddq_s32(a_right[0], a_right[31]);
  b_right[1] = vaddq_s32(a_right[1], a_right[30]);
  b_right[2] = vaddq_s32(a_right[2], a_right[29]);
  b_right[3] = vaddq_s32(a_right[3], a_right[28]);
  b_right[4] = vaddq_s32(a_right[4], a_right[27]);
  b_right[5] = vaddq_s32(a_right[5], a_right[26]);
  b_right[6] = vaddq_s32(a_right[6], a_right[25]);
  b_right[7] = vaddq_s32(a_right[7], a_right[24]);
  b_right[8] = vaddq_s32(a_right[8], a_right[23]);
  b_right[9] = vaddq_s32(a_right[9], a_right[22]);
  b_right[10] = vaddq_s32(a_right[10], a_right[21]);
  b_right[11] = vaddq_s32(a_right[11], a_right[20]);
  b_right[12] = vaddq_s32(a_right[12], a_right[19]);
  b_right[13] = vaddq_s32(a_right[13], a_right[18]);
  b_right[14] = vaddq_s32(a_right[14], a_right[17]);
  b_right[15] = vaddq_s32(a_right[15], a_right[16]);

  b_left[16] = vsubq_s32(a_left[15], a_left[16]);
  b_left[17] = vsubq_s32(a_left[14], a_left[17]);
  b_left[18] = vsubq_s32(a_left[13], a_left[18]);
  b_left[19] = vsubq_s32(a_left[12], a_left[19]);
  b_left[20] = vsubq_s32(a_left[11], a_left[20]);
  b_left[21] = vsubq_s32(a_left[10], a_left[21]);
  b_left[22] = vsubq_s32(a_left[9], a_left[22]);
  b_left[23] = vsubq_s32(a_left[8], a_left[23]);
  b_left[24] = vsubq_s32(a_left[7], a_left[24]);
  b_left[25] = vsubq_s32(a_left[6], a_left[25]);
  b_left[26] = vsubq_s32(a_left[5], a_left[26]);
  b_left[27] = vsubq_s32(a_left[4], a_left[27]);
  b_left[28] = vsubq_s32(a_left[3], a_left[28]);
  b_left[29] = vsubq_s32(a_left[2], a_left[29]);
  b_left[30] = vsubq_s32(a_left[1], a_left[30]);
  b_left[31] = vsubq_s32(a_left[0], a_left[31]);

  b_right[16] = vsubq_s32(a_right[15], a_right[16]);
  b_right[17] = vsubq_s32(a_right[14], a_right[17]);
  b_right[18] = vsubq_s32(a_right[13], a_right[18]);
  b_right[19] = vsubq_s32(a_right[12], a_right[19]);
  b_right[20] = vsubq_s32(a_right[11], a_right[20]);
  b_right[21] = vsubq_s32(a_right[10], a_right[21]);
  b_right[22] = vsubq_s32(a_right[9], a_right[22]);
  b_right[23] = vsubq_s32(a_right[8], a_right[23]);
  b_right[24] = vsubq_s32(a_right[7], a_right[24]);
  b_right[25] = vsubq_s32(a_right[6], a_right[25]);
  b_right[26] = vsubq_s32(a_right[5], a_right[26]);
  b_right[27] = vsubq_s32(a_right[4], a_right[27]);
  b_right[28] = vsubq_s32(a_right[3], a_right[28]);
  b_right[29] = vsubq_s32(a_right[2], a_right[29]);
  b_right[30] = vsubq_s32(a_right[1], a_right[30]);
  b_right[31] = vsubq_s32(a_right[0], a_right[31]);
}

static INLINE void highbd_partial_add_round_shift(int32x4_t *left /*[32]*/,
                                                  int32x4_t *right /* [32] */) {
  // Also compute partial rounding shift:
  // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;

  left[0] = add_round_shift_s32(left[0]);
  left[1] = add_round_shift_s32(left[1]);
  left[2] = add_round_shift_s32(left[2]);
  left[3] = add_round_shift_s32(left[3]);
  left[4] = add_round_shift_s32(left[4]);
  left[5] = add_round_shift_s32(left[5]);
  left[6] = add_round_shift_s32(left[6]);
  left[7] = add_round_shift_s32(left[7]);
  left[8] = add_round_shift_s32(left[8]);
  left[9] = add_round_shift_s32(left[9]);
  left[10] = add_round_shift_s32(left[10]);
  left[11] = add_round_shift_s32(left[11]);
  left[12] = add_round_shift_s32(left[12]);
  left[13] = add_round_shift_s32(left[13]);
  left[14] = add_round_shift_s32(left[14]);
  left[15] = add_round_shift_s32(left[15]);
  left[16] = add_round_shift_s32(left[16]);
  left[17] = add_round_shift_s32(left[17]);
  left[18] = add_round_shift_s32(left[18]);
  left[19] = add_round_shift_s32(left[19]);
  left[20] = add_round_shift_s32(left[20]);
  left[21] = add_round_shift_s32(left[21]);
  left[22] = add_round_shift_s32(left[22]);
  left[23] = add_round_shift_s32(left[23]);
  left[24] = add_round_shift_s32(left[24]);
  left[25] = add_round_shift_s32(left[25]);
  left[26] = add_round_shift_s32(left[26]);
  left[27] = add_round_shift_s32(left[27]);
  left[28] = add_round_shift_s32(left[28]);
  left[29] = add_round_shift_s32(left[29]);
  left[30] = add_round_shift_s32(left[30]);
  left[31] = add_round_shift_s32(left[31]);

  right[0] = add_round_shift_s32(right[0]);
  right[1] = add_round_shift_s32(right[1]);
  right[2] = add_round_shift_s32(right[2]);
  right[3] = add_round_shift_s32(right[3]);
  right[4] = add_round_shift_s32(right[4]);
  right[5] = add_round_shift_s32(right[5]);
  right[6] = add_round_shift_s32(right[6]);
  right[7] = add_round_shift_s32(right[7]);
  right[8] = add_round_shift_s32(right[8]);
  right[9] = add_round_shift_s32(right[9]);
  right[10] = add_round_shift_s32(right[10]);
  right[11] = add_round_shift_s32(right[11]);
  right[12] = add_round_shift_s32(right[12]);
  right[13] = add_round_shift_s32(right[13]);
  right[14] = add_round_shift_s32(right[14]);
  right[15] = add_round_shift_s32(right[15]);
  right[16] = add_round_shift_s32(right[16]);
  right[17] = add_round_shift_s32(right[17]);
  right[18] = add_round_shift_s32(right[18]);
  right[19] = add_round_shift_s32(right[19]);
  right[20] = add_round_shift_s32(right[20]);
  right[21] = add_round_shift_s32(right[21]);
  right[22] = add_round_shift_s32(right[22]);
  right[23] = add_round_shift_s32(right[23]);
  right[24] = add_round_shift_s32(right[24]);
  right[25] = add_round_shift_s32(right[25]);
  right[26] = add_round_shift_s32(right[26]);
  right[27] = add_round_shift_s32(right[27]);
  right[28] = add_round_shift_s32(right[28]);
  right[29] = add_round_shift_s32(right[29]);
  right[30] = add_round_shift_s32(right[30]);
  right[31] = add_round_shift_s32(right[31]);
}
1343 
highbd_partial_sub_round_shift(int32x4_t * left,int32x4_t * right)1344 static INLINE void highbd_partial_sub_round_shift(int32x4_t *left /*[32]*/,
1345                                                   int32x4_t *right /* [32] */) {
1346   // Also compute partial rounding shift:
1347   // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
1348 
1349   left[0] = sub_round_shift_s32(left[0]);
1350   left[1] = sub_round_shift_s32(left[1]);
1351   left[2] = sub_round_shift_s32(left[2]);
1352   left[3] = sub_round_shift_s32(left[3]);
1353   left[4] = sub_round_shift_s32(left[4]);
1354   left[5] = sub_round_shift_s32(left[5]);
1355   left[6] = sub_round_shift_s32(left[6]);
1356   left[7] = sub_round_shift_s32(left[7]);
1357   left[8] = sub_round_shift_s32(left[8]);
1358   left[9] = sub_round_shift_s32(left[9]);
1359   left[10] = sub_round_shift_s32(left[10]);
1360   left[11] = sub_round_shift_s32(left[11]);
1361   left[12] = sub_round_shift_s32(left[12]);
1362   left[13] = sub_round_shift_s32(left[13]);
1363   left[14] = sub_round_shift_s32(left[14]);
1364   left[15] = sub_round_shift_s32(left[15]);
1365   left[16] = sub_round_shift_s32(left[16]);
1366   left[17] = sub_round_shift_s32(left[17]);
1367   left[18] = sub_round_shift_s32(left[18]);
1368   left[19] = sub_round_shift_s32(left[19]);
1369   left[20] = sub_round_shift_s32(left[20]);
1370   left[21] = sub_round_shift_s32(left[21]);
1371   left[22] = sub_round_shift_s32(left[22]);
1372   left[23] = sub_round_shift_s32(left[23]);
1373   left[24] = sub_round_shift_s32(left[24]);
1374   left[25] = sub_round_shift_s32(left[25]);
1375   left[26] = sub_round_shift_s32(left[26]);
1376   left[27] = sub_round_shift_s32(left[27]);
1377   left[28] = sub_round_shift_s32(left[28]);
1378   left[29] = sub_round_shift_s32(left[29]);
1379   left[30] = sub_round_shift_s32(left[30]);
1380   left[31] = sub_round_shift_s32(left[31]);
1381 
1382   right[0] = sub_round_shift_s32(right[0]);
1383   right[1] = sub_round_shift_s32(right[1]);
1384   right[2] = sub_round_shift_s32(right[2]);
1385   right[3] = sub_round_shift_s32(right[3]);
1386   right[4] = sub_round_shift_s32(right[4]);
1387   right[5] = sub_round_shift_s32(right[5]);
1388   right[6] = sub_round_shift_s32(right[6]);
1389   right[7] = sub_round_shift_s32(right[7]);
1390   right[8] = sub_round_shift_s32(right[8]);
1391   right[9] = sub_round_shift_s32(right[9]);
1392   right[10] = sub_round_shift_s32(right[10]);
1393   right[11] = sub_round_shift_s32(right[11]);
1394   right[12] = sub_round_shift_s32(right[12]);
1395   right[13] = sub_round_shift_s32(right[13]);
1396   right[14] = sub_round_shift_s32(right[14]);
1397   right[15] = sub_round_shift_s32(right[15]);
1398   right[16] = sub_round_shift_s32(right[16]);
1399   right[17] = sub_round_shift_s32(right[17]);
1400   right[18] = sub_round_shift_s32(right[18]);
1401   right[19] = sub_round_shift_s32(right[19]);
1402   right[20] = sub_round_shift_s32(right[20]);
1403   right[21] = sub_round_shift_s32(right[21]);
1404   right[22] = sub_round_shift_s32(right[22]);
1405   right[23] = sub_round_shift_s32(right[23]);
1406   right[24] = sub_round_shift_s32(right[24]);
1407   right[25] = sub_round_shift_s32(right[25]);
1408   right[26] = sub_round_shift_s32(right[26]);
1409   right[27] = sub_round_shift_s32(right[27]);
1410   right[28] = sub_round_shift_s32(right[28]);
1411   right[29] = sub_round_shift_s32(right[29]);
1412   right[30] = sub_round_shift_s32(right[30]);
1413   right[31] = sub_round_shift_s32(right[31]);
1414 }
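
// Scalar sketch of sub_round_shift_s32, taken directly from the formula in
// the comment above; the _ref name is illustrative only.
static INLINE int32_t sub_round_shift_s32_ref(int32_t a) {
  // Add 2 if positive, 1 if negative, then shift right by 2.
  return (a + 1 + (a > 0)) >> 2;
}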
1415 
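// Scalar sketches of the butterfly helpers (declared in fdct_neon.h) that the
// transform bodies below lean on. These model the rotations of the C
// reference dct32() in vpx_dsp/fwd_txfm.c, rounded by dct_32_round(); the
// exact NEON rounding path (e.g. the _fast cospi_16_64 variants) may differ
// slightly, so treat these as assumptions. The _ref names are illustrative
// only; DCT_CONST_BITS and the cospi_*_64 constants come from
// vpx_dsp/txfm_common.h.
static INLINE int32_t dct_const_round_ref(int64_t x) {
  return (int32_t)((x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

// out_add = round((a + b) * c), out_sub = round((a - b) * c)
static INLINE void butterfly_one_coeff_ref(int32_t a, int32_t b, int32_t c,
                                           int32_t *out_add,
                                           int32_t *out_sub) {
  *out_add = dct_const_round_ref((int64_t)(a + b) * c);
  *out_sub = dct_const_round_ref((int64_t)(a - b) * c);
}

// out_add = round(a * c0 + b * c1), out_sub = round(a * c1 - b * c0)
static INLINE void butterfly_two_coeff_ref(int32_t a, int32_t b, int32_t c0,
                                           int32_t c1, int32_t *out_add,
                                           int32_t *out_sub) {
  *out_add = dct_const_round_ref((int64_t)a * c0 + (int64_t)b * c1);
  *out_sub = dct_const_round_ref((int64_t)a * c1 - (int64_t)b * c0);
}
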
1416 static INLINE void highbd_dct8x32_body_first_pass(int32x4_t *left /*[32]*/,
1417                                                   int32x4_t *right /*[32]*/) {
1418   int32x4_t al[32], ar[32];
1419   int32x4_t bl[32], br[32];
1420 
1421   // Stage 1: Done as part of the load.
1422 
1423   // Stage 2.
1424   // Mini cross. Cross the first 16 values and the middle 8 of the second half.
1425   al[0] = vaddq_s32(left[0], left[15]);
1426   ar[0] = vaddq_s32(right[0], right[15]);
1427   al[1] = vaddq_s32(left[1], left[14]);
1428   ar[1] = vaddq_s32(right[1], right[14]);
1429   al[2] = vaddq_s32(left[2], left[13]);
1430   ar[2] = vaddq_s32(right[2], right[13]);
1431   al[3] = vaddq_s32(left[3], left[12]);
1432   ar[3] = vaddq_s32(right[3], right[12]);
1433   al[4] = vaddq_s32(left[4], left[11]);
1434   ar[4] = vaddq_s32(right[4], right[11]);
1435   al[5] = vaddq_s32(left[5], left[10]);
1436   ar[5] = vaddq_s32(right[5], right[10]);
1437   al[6] = vaddq_s32(left[6], left[9]);
1438   ar[6] = vaddq_s32(right[6], right[9]);
1439   al[7] = vaddq_s32(left[7], left[8]);
1440   ar[7] = vaddq_s32(right[7], right[8]);
1441 
1442   al[8] = vsubq_s32(left[7], left[8]);
1443   ar[8] = vsubq_s32(right[7], right[8]);
1444   al[9] = vsubq_s32(left[6], left[9]);
1445   ar[9] = vsubq_s32(right[6], right[9]);
1446   al[10] = vsubq_s32(left[5], left[10]);
1447   ar[10] = vsubq_s32(right[5], right[10]);
1448   al[11] = vsubq_s32(left[4], left[11]);
1449   ar[11] = vsubq_s32(right[4], right[11]);
1450   al[12] = vsubq_s32(left[3], left[12]);
1451   ar[12] = vsubq_s32(right[3], right[12]);
1452   al[13] = vsubq_s32(left[2], left[13]);
1453   ar[13] = vsubq_s32(right[2], right[13]);
1454   al[14] = vsubq_s32(left[1], left[14]);
1455   ar[14] = vsubq_s32(right[1], right[14]);
1456   al[15] = vsubq_s32(left[0], left[15]);
1457   ar[15] = vsubq_s32(right[0], right[15]);
1458 
1459   al[16] = left[16];
1460   ar[16] = right[16];
1461   al[17] = left[17];
1462   ar[17] = right[17];
1463   al[18] = left[18];
1464   ar[18] = right[18];
1465   al[19] = left[19];
1466   ar[19] = right[19];
1467 
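  // Rows 16-19 and 28-31 pass straight through stage 2; only the middle eight
  // (20-27) are rotated, pairing 27 with 20, 26 with 21, 25 with 22 and 24
  // with 23 through the cospi_16_64 butterfly (butterfly_one_coeff_ref above).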
1468   butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
1469                                cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
1470   butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
1471                                cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
1472   butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
1473                                cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
1474   butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
1475                                cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
1476 
1477   al[28] = left[28];
1478   ar[28] = right[28];
1479   al[29] = left[29];
1480   ar[29] = right[29];
1481   al[30] = left[30];
1482   ar[30] = right[30];
1483   al[31] = left[31];
1484   ar[31] = right[31];
1485 
1486   // Stage 3.
1487   bl[0] = vaddq_s32(al[0], al[7]);
1488   br[0] = vaddq_s32(ar[0], ar[7]);
1489   bl[1] = vaddq_s32(al[1], al[6]);
1490   br[1] = vaddq_s32(ar[1], ar[6]);
1491   bl[2] = vaddq_s32(al[2], al[5]);
1492   br[2] = vaddq_s32(ar[2], ar[5]);
1493   bl[3] = vaddq_s32(al[3], al[4]);
1494   br[3] = vaddq_s32(ar[3], ar[4]);
1495 
1496   bl[4] = vsubq_s32(al[3], al[4]);
1497   br[4] = vsubq_s32(ar[3], ar[4]);
1498   bl[5] = vsubq_s32(al[2], al[5]);
1499   br[5] = vsubq_s32(ar[2], ar[5]);
1500   bl[6] = vsubq_s32(al[1], al[6]);
1501   br[6] = vsubq_s32(ar[1], ar[6]);
1502   bl[7] = vsubq_s32(al[0], al[7]);
1503   br[7] = vsubq_s32(ar[0], ar[7]);
1504 
1505   bl[8] = al[8];
1506   br[8] = ar[8];
1507   bl[9] = al[9];
1508   br[9] = ar[9];
1509 
1510   butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
1511                                &bl[13], &br[13], &bl[10], &br[10]);
1512   butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
1513                                &bl[12], &br[12], &bl[11], &br[11]);
1514 
1515   bl[14] = al[14];
1516   br[14] = ar[14];
1517   bl[15] = al[15];
1518   br[15] = ar[15];
1519 
1520   bl[16] = vaddq_s32(left[16], al[23]);
1521   br[16] = vaddq_s32(right[16], ar[23]);
1522   bl[17] = vaddq_s32(left[17], al[22]);
1523   br[17] = vaddq_s32(right[17], ar[22]);
1524   bl[18] = vaddq_s32(left[18], al[21]);
1525   br[18] = vaddq_s32(right[18], ar[21]);
1526   bl[19] = vaddq_s32(left[19], al[20]);
1527   br[19] = vaddq_s32(right[19], ar[20]);
1528 
1529   bl[20] = vsubq_s32(left[19], al[20]);
1530   br[20] = vsubq_s32(right[19], ar[20]);
1531   bl[21] = vsubq_s32(left[18], al[21]);
1532   br[21] = vsubq_s32(right[18], ar[21]);
1533   bl[22] = vsubq_s32(left[17], al[22]);
1534   br[22] = vsubq_s32(right[17], ar[22]);
1535   bl[23] = vsubq_s32(left[16], al[23]);
1536   br[23] = vsubq_s32(right[16], ar[23]);
1537 
1538   bl[24] = vsubq_s32(left[31], al[24]);
1539   br[24] = vsubq_s32(right[31], ar[24]);
1540   bl[25] = vsubq_s32(left[30], al[25]);
1541   br[25] = vsubq_s32(right[30], ar[25]);
1542   bl[26] = vsubq_s32(left[29], al[26]);
1543   br[26] = vsubq_s32(right[29], ar[26]);
1544   bl[27] = vsubq_s32(left[28], al[27]);
1545   br[27] = vsubq_s32(right[28], ar[27]);
1546 
1547   bl[28] = vaddq_s32(left[28], al[27]);
1548   br[28] = vaddq_s32(right[28], ar[27]);
1549   bl[29] = vaddq_s32(left[29], al[26]);
1550   br[29] = vaddq_s32(right[29], ar[26]);
1551   bl[30] = vaddq_s32(left[30], al[25]);
1552   br[30] = vaddq_s32(right[30], ar[25]);
1553   bl[31] = vaddq_s32(left[31], al[24]);
1554   br[31] = vaddq_s32(right[31], ar[24]);
1555 
1556   // Stage 4.
1557   al[0] = vaddq_s32(bl[0], bl[3]);
1558   ar[0] = vaddq_s32(br[0], br[3]);
1559   al[1] = vaddq_s32(bl[1], bl[2]);
1560   ar[1] = vaddq_s32(br[1], br[2]);
1561   al[2] = vsubq_s32(bl[1], bl[2]);
1562   ar[2] = vsubq_s32(br[1], br[2]);
1563   al[3] = vsubq_s32(bl[0], bl[3]);
1564   ar[3] = vsubq_s32(br[0], br[3]);
1565 
1566   al[4] = bl[4];
1567   ar[4] = br[4];
1568 
1569   butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
1570                                &ar[6], &al[5], &ar[5]);
1571 
1572   al[7] = bl[7];
1573   ar[7] = br[7];
1574 
1575   al[8] = vaddq_s32(bl[8], bl[11]);
1576   ar[8] = vaddq_s32(br[8], br[11]);
1577   al[9] = vaddq_s32(bl[9], bl[10]);
1578   ar[9] = vaddq_s32(br[9], br[10]);
1579   al[10] = vsubq_s32(bl[9], bl[10]);
1580   ar[10] = vsubq_s32(br[9], br[10]);
1581   al[11] = vsubq_s32(bl[8], bl[11]);
1582   ar[11] = vsubq_s32(br[8], br[11]);
1583   al[12] = vsubq_s32(bl[15], bl[12]);
1584   ar[12] = vsubq_s32(br[15], br[12]);
1585   al[13] = vsubq_s32(bl[14], bl[13]);
1586   ar[13] = vsubq_s32(br[14], br[13]);
1587   al[14] = vaddq_s32(bl[14], bl[13]);
1588   ar[14] = vaddq_s32(br[14], br[13]);
1589   al[15] = vaddq_s32(bl[15], bl[12]);
1590   ar[15] = vaddq_s32(br[15], br[12]);
1591 
1592   al[16] = bl[16];
1593   ar[16] = br[16];
1594   al[17] = bl[17];
1595   ar[17] = br[17];
1596 
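  // Two-coefficient rotations: out_add = a * c0 + b * c1 and
  // out_sub = a * c1 - b * c0, each rounded (butterfly_two_coeff_ref above).
  // Per its name, the _s32_s64_narrow variant accumulates the products in
  // 64 bits and narrows the results back to 32.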
1597   butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64,
1598                                      cospi_24_64, &al[29], &ar[29], &al[18],
1599                                      &ar[18]);
1600   butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64,
1601                                      cospi_24_64, &al[28], &ar[28], &al[19],
1602                                      &ar[19]);
1603   butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20],
1604                                      cospi_24_64, -cospi_8_64, &al[27], &ar[27],
1605                                      &al[20], &ar[20]);
1606   butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
1607                                      cospi_24_64, -cospi_8_64, &al[26], &ar[26],
1608                                      &al[21], &ar[21]);
1609 
1610   al[22] = bl[22];
1611   ar[22] = br[22];
1612   al[23] = bl[23];
1613   ar[23] = br[23];
1614   al[24] = bl[24];
1615   ar[24] = br[24];
1616   al[25] = bl[25];
1617   ar[25] = br[25];
1618 
1619   al[30] = bl[30];
1620   ar[30] = br[30];
1621   al[31] = bl[31];
1622   ar[31] = br[31];
1623 
1624   // Stage 5.
1625   butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
1626                                &br[0], &bl[1], &br[1]);
1627   butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64,
1628                                      cospi_24_64, &bl[2], &br[2], &bl[3],
1629                                      &br[3]);
1630 
1631   bl[4] = vaddq_s32(al[4], al[5]);
1632   br[4] = vaddq_s32(ar[4], ar[5]);
1633   bl[5] = vsubq_s32(al[4], al[5]);
1634   br[5] = vsubq_s32(ar[4], ar[5]);
1635   bl[6] = vsubq_s32(al[7], al[6]);
1636   br[6] = vsubq_s32(ar[7], ar[6]);
1637   bl[7] = vaddq_s32(al[7], al[6]);
1638   br[7] = vaddq_s32(ar[7], ar[6]);
1639 
1640   bl[8] = al[8];
1641   br[8] = ar[8];
1642 
1643   butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64,
1644                                      cospi_24_64, &bl[14], &br[14], &bl[9],
1645                                      &br[9]);
1646   butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
1647                                      cospi_24_64, -cospi_8_64, &bl[13], &br[13],
1648                                      &bl[10], &br[10]);
1649 
1650   bl[11] = al[11];
1651   br[11] = ar[11];
1652   bl[12] = al[12];
1653   br[12] = ar[12];
1654 
1655   bl[15] = al[15];
1656   br[15] = ar[15];
1657 
1658   bl[16] = vaddq_s32(al[19], al[16]);
1659   br[16] = vaddq_s32(ar[19], ar[16]);
1660   bl[17] = vaddq_s32(al[18], al[17]);
1661   br[17] = vaddq_s32(ar[18], ar[17]);
1662   bl[18] = vsubq_s32(al[17], al[18]);
1663   br[18] = vsubq_s32(ar[17], ar[18]);
1664   bl[19] = vsubq_s32(al[16], al[19]);
1665   br[19] = vsubq_s32(ar[16], ar[19]);
1666   bl[20] = vsubq_s32(al[23], al[20]);
1667   br[20] = vsubq_s32(ar[23], ar[20]);
1668   bl[21] = vsubq_s32(al[22], al[21]);
1669   br[21] = vsubq_s32(ar[22], ar[21]);
1670   bl[22] = vaddq_s32(al[21], al[22]);
1671   br[22] = vaddq_s32(ar[21], ar[22]);
1672   bl[23] = vaddq_s32(al[20], al[23]);
1673   br[23] = vaddq_s32(ar[20], ar[23]);
1674   bl[24] = vaddq_s32(al[27], al[24]);
1675   br[24] = vaddq_s32(ar[27], ar[24]);
1676   bl[25] = vaddq_s32(al[26], al[25]);
1677   br[25] = vaddq_s32(ar[26], ar[25]);
1678   bl[26] = vsubq_s32(al[25], al[26]);
1679   br[26] = vsubq_s32(ar[25], ar[26]);
1680   bl[27] = vsubq_s32(al[24], al[27]);
1681   br[27] = vsubq_s32(ar[24], ar[27]);
1682   bl[28] = vsubq_s32(al[31], al[28]);
1683   br[28] = vsubq_s32(ar[31], ar[28]);
1684   bl[29] = vsubq_s32(al[30], al[29]);
1685   br[29] = vsubq_s32(ar[30], ar[29]);
1686   bl[30] = vaddq_s32(al[29], al[30]);
1687   br[30] = vaddq_s32(ar[29], ar[30]);
1688   bl[31] = vaddq_s32(al[28], al[31]);
1689   br[31] = vaddq_s32(ar[28], ar[31]);
1690 
1691   // Stage 6.
1692   al[0] = bl[0];
1693   ar[0] = br[0];
1694   al[1] = bl[1];
1695   ar[1] = br[1];
1696   al[2] = bl[2];
1697   ar[2] = br[2];
1698   al[3] = bl[3];
1699   ar[3] = br[3];
1700 
1701   butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64,
1702                                      cospi_28_64, &al[4], &ar[4], &al[7],
1703                                      &ar[7]);
1704   butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64,
1705                                      cospi_12_64, &al[5], &ar[5], &al[6],
1706                                      &ar[6]);
1707 
1708   al[8] = vaddq_s32(bl[8], bl[9]);
1709   ar[8] = vaddq_s32(br[8], br[9]);
1710   al[9] = vsubq_s32(bl[8], bl[9]);
1711   ar[9] = vsubq_s32(br[8], br[9]);
1712   al[10] = vsubq_s32(bl[11], bl[10]);
1713   ar[10] = vsubq_s32(br[11], br[10]);
1714   al[11] = vaddq_s32(bl[11], bl[10]);
1715   ar[11] = vaddq_s32(br[11], br[10]);
1716   al[12] = vaddq_s32(bl[12], bl[13]);
1717   ar[12] = vaddq_s32(br[12], br[13]);
1718   al[13] = vsubq_s32(bl[12], bl[13]);
1719   ar[13] = vsubq_s32(br[12], br[13]);
1720   al[14] = vsubq_s32(bl[15], bl[14]);
1721   ar[14] = vsubq_s32(br[15], br[14]);
1722   al[15] = vaddq_s32(bl[15], bl[14]);
1723   ar[15] = vaddq_s32(br[15], br[14]);
1724 
1725   al[16] = bl[16];
1726   ar[16] = br[16];
1727   al[19] = bl[19];
1728   ar[19] = br[19];
1729   al[20] = bl[20];
1730   ar[20] = br[20];
1731   al[23] = bl[23];
1732   ar[23] = br[23];
1733   al[24] = bl[24];
1734   ar[24] = br[24];
1735   al[27] = bl[27];
1736   ar[27] = br[27];
1737   al[28] = bl[28];
1738   ar[28] = br[28];
1739   al[31] = bl[31];
1740   ar[31] = br[31];
1741 
1742   butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64,
1743                                      cospi_28_64, &al[30], &ar[30], &al[17],
1744                                      &ar[17]);
1745   butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18],
1746                                      cospi_28_64, -cospi_4_64, &al[29], &ar[29],
1747                                      &al[18], &ar[18]);
1748   butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
1749                                      cospi_20_64, cospi_12_64, &al[26], &ar[26],
1750                                      &al[21], &ar[21]);
1751   butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
1752                                      cospi_12_64, -cospi_20_64, &al[25],
1753                                      &ar[25], &al[22], &ar[22]);
1754 
1755   // Stage 7.
1756   bl[0] = al[0];
1757   br[0] = ar[0];
1758   bl[1] = al[1];
1759   br[1] = ar[1];
1760   bl[2] = al[2];
1761   br[2] = ar[2];
1762   bl[3] = al[3];
1763   br[3] = ar[3];
1764   bl[4] = al[4];
1765   br[4] = ar[4];
1766   bl[5] = al[5];
1767   br[5] = ar[5];
1768   bl[6] = al[6];
1769   br[6] = ar[6];
1770   bl[7] = al[7];
1771   br[7] = ar[7];
1772 
1773   butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64,
1774                                      cospi_30_64, &bl[8], &br[8], &bl[15],
1775                                      &br[15]);
1776   butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64,
1777                                      cospi_14_64, &bl[9], &br[9], &bl[14],
1778                                      &br[14]);
1779   butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
1780                                      cospi_10_64, cospi_22_64, &bl[10], &br[10],
1781                                      &bl[13], &br[13]);
1782   butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11],
1783                                      cospi_26_64, cospi_6_64, &bl[11], &br[11],
1784                                      &bl[12], &br[12]);
1785 
1786   bl[16] = vaddq_s32(al[16], al[17]);
1787   br[16] = vaddq_s32(ar[16], ar[17]);
1788   bl[17] = vsubq_s32(al[16], al[17]);
1789   br[17] = vsubq_s32(ar[16], ar[17]);
1790   bl[18] = vsubq_s32(al[19], al[18]);
1791   br[18] = vsubq_s32(ar[19], ar[18]);
1792   bl[19] = vaddq_s32(al[19], al[18]);
1793   br[19] = vaddq_s32(ar[19], ar[18]);
1794   bl[20] = vaddq_s32(al[20], al[21]);
1795   br[20] = vaddq_s32(ar[20], ar[21]);
1796   bl[21] = vsubq_s32(al[20], al[21]);
1797   br[21] = vsubq_s32(ar[20], ar[21]);
1798   bl[22] = vsubq_s32(al[23], al[22]);
1799   br[22] = vsubq_s32(ar[23], ar[22]);
1800   bl[23] = vaddq_s32(al[23], al[22]);
1801   br[23] = vaddq_s32(ar[23], ar[22]);
1802   bl[24] = vaddq_s32(al[24], al[25]);
1803   br[24] = vaddq_s32(ar[24], ar[25]);
1804   bl[25] = vsubq_s32(al[24], al[25]);
1805   br[25] = vsubq_s32(ar[24], ar[25]);
1806   bl[26] = vsubq_s32(al[27], al[26]);
1807   br[26] = vsubq_s32(ar[27], ar[26]);
1808   bl[27] = vaddq_s32(al[27], al[26]);
1809   br[27] = vaddq_s32(ar[27], ar[26]);
1810   bl[28] = vaddq_s32(al[28], al[29]);
1811   br[28] = vaddq_s32(ar[28], ar[29]);
1812   bl[29] = vsubq_s32(al[28], al[29]);
1813   br[29] = vsubq_s32(ar[28], ar[29]);
1814   bl[30] = vsubq_s32(al[31], al[30]);
1815   br[30] = vsubq_s32(ar[31], ar[30]);
1816   bl[31] = vaddq_s32(al[31], al[30]);
1817   br[31] = vaddq_s32(ar[31], ar[30]);
1818 
1819   // Final stage.
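  // The even-frequency outputs are the stage 7 values written out in
  // bit-reversed order; each butterfly below then produces one pair of
  // odd-frequency outputs.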
1820 
1821   left[0] = bl[0];
1822   right[0] = br[0];
1823   left[16] = bl[1];
1824   right[16] = br[1];
1825   left[8] = bl[2];
1826   right[8] = br[2];
1827   left[24] = bl[3];
1828   right[24] = br[3];
1829   left[4] = bl[4];
1830   right[4] = br[4];
1831   left[20] = bl[5];
1832   right[20] = br[5];
1833   left[12] = bl[6];
1834   right[12] = br[6];
1835   left[28] = bl[7];
1836   right[28] = br[7];
1837   left[2] = bl[8];
1838   right[2] = br[8];
1839   left[18] = bl[9];
1840   right[18] = br[9];
1841   left[10] = bl[10];
1842   right[10] = br[10];
1843   left[26] = bl[11];
1844   right[26] = br[11];
1845   left[6] = bl[12];
1846   right[6] = br[12];
1847   left[22] = bl[13];
1848   right[22] = br[13];
1849   left[14] = bl[14];
1850   right[14] = br[14];
1851   left[30] = bl[15];
1852   right[30] = br[15];
1853 
1854   butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64,
1855                                      cospi_31_64, &al[1], &ar[1], &al[31],
1856                                      &ar[31]);
1857   left[1] = al[1];
1858   right[1] = ar[1];
1859   left[31] = al[31];
1860   right[31] = ar[31];
1861 
1862   butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17],
1863                                      cospi_17_64, cospi_15_64, &al[17], &ar[17],
1864                                      &al[15], &ar[15]);
1865   left[17] = al[17];
1866   right[17] = ar[17];
1867   left[15] = al[15];
1868   right[15] = ar[15];
1869 
1870   butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64,
1871                                      cospi_23_64, &al[9], &ar[9], &al[23],
1872                                      &ar[23]);
1873   left[9] = al[9];
1874   right[9] = ar[9];
1875   left[23] = al[23];
1876   right[23] = ar[23];
1877 
1878   butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19],
1879                                      cospi_25_64, cospi_7_64, &al[25], &ar[25],
1880                                      &al[7], &ar[7]);
1881   left[25] = al[25];
1882   right[25] = ar[25];
1883   left[7] = al[7];
1884   right[7] = ar[7];
1885 
1886   butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64,
1887                                      cospi_27_64, &al[5], &ar[5], &al[27],
1888                                      &ar[27]);
1889   left[5] = al[5];
1890   right[5] = ar[5];
1891   left[27] = al[27];
1892   right[27] = ar[27];
1893 
1894   butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
1895                                      cospi_21_64, cospi_11_64, &al[21], &ar[21],
1896                                      &al[11], &ar[11]);
1897   left[21] = al[21];
1898   right[21] = ar[21];
1899   left[11] = al[11];
1900   right[11] = ar[11];
1901 
1902   butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
1903                                      cospi_13_64, cospi_19_64, &al[13], &ar[13],
1904                                      &al[19], &ar[19]);
1905   left[13] = al[13];
1906   right[13] = ar[13];
1907   left[19] = al[19];
1908   right[19] = ar[19];
1909 
1910   butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23],
1911                                      cospi_29_64, cospi_3_64, &al[29], &ar[29],
1912                                      &al[3], &ar[3]);
1913   left[29] = al[29];
1914   right[29] = ar[29];
1915   left[3] = al[3];
1916   right[3] = ar[3];
1917 }
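
// Rough flow in the callers (a sketch only, not the exact sequence; see the
// users of this header such as vpx_dsp/arm/fdct32x32_neon.c): load plus
// stage-1 cross -> first-pass body -> rounding shift -> 32x32 transpose ->
// second-pass body (or the _rd variant) -> store.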
1918 
1919 static INLINE void highbd_dct8x32_body_second_pass(int32x4_t *left /*[32]*/,
1920                                                    int32x4_t *right /*[32]*/) {
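  // The arithmetic below is identical in structure to
  // highbd_dct8x32_body_first_pass above; only the caller-supplied
  // (post-transpose) data differs.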
1921   int32x4_t al[32], ar[32];
1922   int32x4_t bl[32], br[32];
1923 
1924   // Stage 1: Done as part of the load.
1925 
1926   // Stage 2.
1927   // Mini cross. Cross the first 16 values and the middle 8 of the second half.
1928   al[0] = vaddq_s32(left[0], left[15]);
1929   ar[0] = vaddq_s32(right[0], right[15]);
1930   al[1] = vaddq_s32(left[1], left[14]);
1931   ar[1] = vaddq_s32(right[1], right[14]);
1932   al[2] = vaddq_s32(left[2], left[13]);
1933   ar[2] = vaddq_s32(right[2], right[13]);
1934   al[3] = vaddq_s32(left[3], left[12]);
1935   ar[3] = vaddq_s32(right[3], right[12]);
1936   al[4] = vaddq_s32(left[4], left[11]);
1937   ar[4] = vaddq_s32(right[4], right[11]);
1938   al[5] = vaddq_s32(left[5], left[10]);
1939   ar[5] = vaddq_s32(right[5], right[10]);
1940   al[6] = vaddq_s32(left[6], left[9]);
1941   ar[6] = vaddq_s32(right[6], right[9]);
1942   al[7] = vaddq_s32(left[7], left[8]);
1943   ar[7] = vaddq_s32(right[7], right[8]);
1944 
1945   al[8] = vsubq_s32(left[7], left[8]);
1946   ar[8] = vsubq_s32(right[7], right[8]);
1947   al[9] = vsubq_s32(left[6], left[9]);
1948   ar[9] = vsubq_s32(right[6], right[9]);
1949   al[10] = vsubq_s32(left[5], left[10]);
1950   ar[10] = vsubq_s32(right[5], right[10]);
1951   al[11] = vsubq_s32(left[4], left[11]);
1952   ar[11] = vsubq_s32(right[4], right[11]);
1953   al[12] = vsubq_s32(left[3], left[12]);
1954   ar[12] = vsubq_s32(right[3], right[12]);
1955   al[13] = vsubq_s32(left[2], left[13]);
1956   ar[13] = vsubq_s32(right[2], right[13]);
1957   al[14] = vsubq_s32(left[1], left[14]);
1958   ar[14] = vsubq_s32(right[1], right[14]);
1959   al[15] = vsubq_s32(left[0], left[15]);
1960   ar[15] = vsubq_s32(right[0], right[15]);
1961 
1962   al[16] = left[16];
1963   ar[16] = right[16];
1964   al[17] = left[17];
1965   ar[17] = right[17];
1966   al[18] = left[18];
1967   ar[18] = right[18];
1968   al[19] = left[19];
1969   ar[19] = right[19];
1970 
1971   butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
1972                                cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
1973   butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
1974                                cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
1975   butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
1976                                cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
1977   butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
1978                                cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
1979 
1980   al[28] = left[28];
1981   ar[28] = right[28];
1982   al[29] = left[29];
1983   ar[29] = right[29];
1984   al[30] = left[30];
1985   ar[30] = right[30];
1986   al[31] = left[31];
1987   ar[31] = right[31];
1988 
1989   // Stage 3.
1990   bl[0] = vaddq_s32(al[0], al[7]);
1991   br[0] = vaddq_s32(ar[0], ar[7]);
1992   bl[1] = vaddq_s32(al[1], al[6]);
1993   br[1] = vaddq_s32(ar[1], ar[6]);
1994   bl[2] = vaddq_s32(al[2], al[5]);
1995   br[2] = vaddq_s32(ar[2], ar[5]);
1996   bl[3] = vaddq_s32(al[3], al[4]);
1997   br[3] = vaddq_s32(ar[3], ar[4]);
1998 
1999   bl[4] = vsubq_s32(al[3], al[4]);
2000   br[4] = vsubq_s32(ar[3], ar[4]);
2001   bl[5] = vsubq_s32(al[2], al[5]);
2002   br[5] = vsubq_s32(ar[2], ar[5]);
2003   bl[6] = vsubq_s32(al[1], al[6]);
2004   br[6] = vsubq_s32(ar[1], ar[6]);
2005   bl[7] = vsubq_s32(al[0], al[7]);
2006   br[7] = vsubq_s32(ar[0], ar[7]);
2007 
2008   bl[8] = al[8];
2009   br[8] = ar[8];
2010   bl[9] = al[9];
2011   br[9] = ar[9];
2012 
2013   butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
2014                                &bl[13], &br[13], &bl[10], &br[10]);
2015   butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
2016                                &bl[12], &br[12], &bl[11], &br[11]);
2017 
2018   bl[14] = al[14];
2019   br[14] = ar[14];
2020   bl[15] = al[15];
2021   br[15] = ar[15];
2022 
2023   bl[16] = vaddq_s32(left[16], al[23]);
2024   br[16] = vaddq_s32(right[16], ar[23]);
2025   bl[17] = vaddq_s32(left[17], al[22]);
2026   br[17] = vaddq_s32(right[17], ar[22]);
2027   bl[18] = vaddq_s32(left[18], al[21]);
2028   br[18] = vaddq_s32(right[18], ar[21]);
2029   bl[19] = vaddq_s32(left[19], al[20]);
2030   br[19] = vaddq_s32(right[19], ar[20]);
2031 
2032   bl[20] = vsubq_s32(left[19], al[20]);
2033   br[20] = vsubq_s32(right[19], ar[20]);
2034   bl[21] = vsubq_s32(left[18], al[21]);
2035   br[21] = vsubq_s32(right[18], ar[21]);
2036   bl[22] = vsubq_s32(left[17], al[22]);
2037   br[22] = vsubq_s32(right[17], ar[22]);
2038   bl[23] = vsubq_s32(left[16], al[23]);
2039   br[23] = vsubq_s32(right[16], ar[23]);
2040 
2041   bl[24] = vsubq_s32(left[31], al[24]);
2042   br[24] = vsubq_s32(right[31], ar[24]);
2043   bl[25] = vsubq_s32(left[30], al[25]);
2044   br[25] = vsubq_s32(right[30], ar[25]);
2045   bl[26] = vsubq_s32(left[29], al[26]);
2046   br[26] = vsubq_s32(right[29], ar[26]);
2047   bl[27] = vsubq_s32(left[28], al[27]);
2048   br[27] = vsubq_s32(right[28], ar[27]);
2049 
2050   bl[28] = vaddq_s32(left[28], al[27]);
2051   br[28] = vaddq_s32(right[28], ar[27]);
2052   bl[29] = vaddq_s32(left[29], al[26]);
2053   br[29] = vaddq_s32(right[29], ar[26]);
2054   bl[30] = vaddq_s32(left[30], al[25]);
2055   br[30] = vaddq_s32(right[30], ar[25]);
2056   bl[31] = vaddq_s32(left[31], al[24]);
2057   br[31] = vaddq_s32(right[31], ar[24]);
2058 
2059   // Stage 4.
2060   al[0] = vaddq_s32(bl[0], bl[3]);
2061   ar[0] = vaddq_s32(br[0], br[3]);
2062   al[1] = vaddq_s32(bl[1], bl[2]);
2063   ar[1] = vaddq_s32(br[1], br[2]);
2064   al[2] = vsubq_s32(bl[1], bl[2]);
2065   ar[2] = vsubq_s32(br[1], br[2]);
2066   al[3] = vsubq_s32(bl[0], bl[3]);
2067   ar[3] = vsubq_s32(br[0], br[3]);
2068 
2069   al[4] = bl[4];
2070   ar[4] = br[4];
2071 
2072   butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
2073                                &ar[6], &al[5], &ar[5]);
2074 
2075   al[7] = bl[7];
2076   ar[7] = br[7];
2077 
2078   al[8] = vaddq_s32(bl[8], bl[11]);
2079   ar[8] = vaddq_s32(br[8], br[11]);
2080   al[9] = vaddq_s32(bl[9], bl[10]);
2081   ar[9] = vaddq_s32(br[9], br[10]);
2082   al[10] = vsubq_s32(bl[9], bl[10]);
2083   ar[10] = vsubq_s32(br[9], br[10]);
2084   al[11] = vsubq_s32(bl[8], bl[11]);
2085   ar[11] = vsubq_s32(br[8], br[11]);
2086   al[12] = vsubq_s32(bl[15], bl[12]);
2087   ar[12] = vsubq_s32(br[15], br[12]);
2088   al[13] = vsubq_s32(bl[14], bl[13]);
2089   ar[13] = vsubq_s32(br[14], br[13]);
2090   al[14] = vaddq_s32(bl[14], bl[13]);
2091   ar[14] = vaddq_s32(br[14], br[13]);
2092   al[15] = vaddq_s32(bl[15], bl[12]);
2093   ar[15] = vaddq_s32(br[15], br[12]);
2094 
2095   al[16] = bl[16];
2096   ar[16] = br[16];
2097   al[17] = bl[17];
2098   ar[17] = br[17];
2099 
2100   butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64,
2101                                      cospi_24_64, &al[29], &ar[29], &al[18],
2102                                      &ar[18]);
2103   butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64,
2104                                      cospi_24_64, &al[28], &ar[28], &al[19],
2105                                      &ar[19]);
2106   butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20],
2107                                      cospi_24_64, -cospi_8_64, &al[27], &ar[27],
2108                                      &al[20], &ar[20]);
2109   butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
2110                                      cospi_24_64, -cospi_8_64, &al[26], &ar[26],
2111                                      &al[21], &ar[21]);
2112 
2113   al[22] = bl[22];
2114   ar[22] = br[22];
2115   al[23] = bl[23];
2116   ar[23] = br[23];
2117   al[24] = bl[24];
2118   ar[24] = br[24];
2119   al[25] = bl[25];
2120   ar[25] = br[25];
2121 
2122   al[30] = bl[30];
2123   ar[30] = br[30];
2124   al[31] = bl[31];
2125   ar[31] = br[31];
2126 
2127   // Stage 5.
2128   butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
2129                                &br[0], &bl[1], &br[1]);
2130   butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64,
2131                                      cospi_24_64, &bl[2], &br[2], &bl[3],
2132                                      &br[3]);
2133 
2134   bl[4] = vaddq_s32(al[4], al[5]);
2135   br[4] = vaddq_s32(ar[4], ar[5]);
2136   bl[5] = vsubq_s32(al[4], al[5]);
2137   br[5] = vsubq_s32(ar[4], ar[5]);
2138   bl[6] = vsubq_s32(al[7], al[6]);
2139   br[6] = vsubq_s32(ar[7], ar[6]);
2140   bl[7] = vaddq_s32(al[7], al[6]);
2141   br[7] = vaddq_s32(ar[7], ar[6]);
2142 
2143   bl[8] = al[8];
2144   br[8] = ar[8];
2145 
2146   butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64,
2147                                      cospi_24_64, &bl[14], &br[14], &bl[9],
2148                                      &br[9]);
2149   butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
2150                                      cospi_24_64, -cospi_8_64, &bl[13], &br[13],
2151                                      &bl[10], &br[10]);
2152 
2153   bl[11] = al[11];
2154   br[11] = ar[11];
2155   bl[12] = al[12];
2156   br[12] = ar[12];
2157 
2158   bl[15] = al[15];
2159   br[15] = ar[15];
2160 
2161   bl[16] = vaddq_s32(al[19], al[16]);
2162   br[16] = vaddq_s32(ar[19], ar[16]);
2163   bl[17] = vaddq_s32(al[18], al[17]);
2164   br[17] = vaddq_s32(ar[18], ar[17]);
2165   bl[18] = vsubq_s32(al[17], al[18]);
2166   br[18] = vsubq_s32(ar[17], ar[18]);
2167   bl[19] = vsubq_s32(al[16], al[19]);
2168   br[19] = vsubq_s32(ar[16], ar[19]);
2169   bl[20] = vsubq_s32(al[23], al[20]);
2170   br[20] = vsubq_s32(ar[23], ar[20]);
2171   bl[21] = vsubq_s32(al[22], al[21]);
2172   br[21] = vsubq_s32(ar[22], ar[21]);
2173   bl[22] = vaddq_s32(al[21], al[22]);
2174   br[22] = vaddq_s32(ar[21], ar[22]);
2175   bl[23] = vaddq_s32(al[20], al[23]);
2176   br[23] = vaddq_s32(ar[20], ar[23]);
2177   bl[24] = vaddq_s32(al[27], al[24]);
2178   br[24] = vaddq_s32(ar[27], ar[24]);
2179   bl[25] = vaddq_s32(al[26], al[25]);
2180   br[25] = vaddq_s32(ar[26], ar[25]);
2181   bl[26] = vsubq_s32(al[25], al[26]);
2182   br[26] = vsubq_s32(ar[25], ar[26]);
2183   bl[27] = vsubq_s32(al[24], al[27]);
2184   br[27] = vsubq_s32(ar[24], ar[27]);
2185   bl[28] = vsubq_s32(al[31], al[28]);
2186   br[28] = vsubq_s32(ar[31], ar[28]);
2187   bl[29] = vsubq_s32(al[30], al[29]);
2188   br[29] = vsubq_s32(ar[30], ar[29]);
2189   bl[30] = vaddq_s32(al[29], al[30]);
2190   br[30] = vaddq_s32(ar[29], ar[30]);
2191   bl[31] = vaddq_s32(al[28], al[31]);
2192   br[31] = vaddq_s32(ar[28], ar[31]);
2193 
2194   // Stage 6.
2195   al[0] = bl[0];
2196   ar[0] = br[0];
2197   al[1] = bl[1];
2198   ar[1] = br[1];
2199   al[2] = bl[2];
2200   ar[2] = br[2];
2201   al[3] = bl[3];
2202   ar[3] = br[3];
2203 
2204   butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64,
2205                                      cospi_28_64, &al[4], &ar[4], &al[7],
2206                                      &ar[7]);
2207   butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64,
2208                                      cospi_12_64, &al[5], &ar[5], &al[6],
2209                                      &ar[6]);
2210 
2211   al[8] = vaddq_s32(bl[8], bl[9]);
2212   ar[8] = vaddq_s32(br[8], br[9]);
2213   al[9] = vsubq_s32(bl[8], bl[9]);
2214   ar[9] = vsubq_s32(br[8], br[9]);
2215   al[10] = vsubq_s32(bl[11], bl[10]);
2216   ar[10] = vsubq_s32(br[11], br[10]);
2217   al[11] = vaddq_s32(bl[11], bl[10]);
2218   ar[11] = vaddq_s32(br[11], br[10]);
2219   al[12] = vaddq_s32(bl[12], bl[13]);
2220   ar[12] = vaddq_s32(br[12], br[13]);
2221   al[13] = vsubq_s32(bl[12], bl[13]);
2222   ar[13] = vsubq_s32(br[12], br[13]);
2223   al[14] = vsubq_s32(bl[15], bl[14]);
2224   ar[14] = vsubq_s32(br[15], br[14]);
2225   al[15] = vaddq_s32(bl[15], bl[14]);
2226   ar[15] = vaddq_s32(br[15], br[14]);
2227 
2228   al[16] = bl[16];
2229   ar[16] = br[16];
2230   al[19] = bl[19];
2231   ar[19] = br[19];
2232   al[20] = bl[20];
2233   ar[20] = br[20];
2234   al[23] = bl[23];
2235   ar[23] = br[23];
2236   al[24] = bl[24];
2237   ar[24] = br[24];
2238   al[27] = bl[27];
2239   ar[27] = br[27];
2240   al[28] = bl[28];
2241   ar[28] = br[28];
2242   al[31] = bl[31];
2243   ar[31] = br[31];
2244 
2245   butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64,
2246                                      cospi_28_64, &al[30], &ar[30], &al[17],
2247                                      &ar[17]);
2248   butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18],
2249                                      cospi_28_64, -cospi_4_64, &al[29], &ar[29],
2250                                      &al[18], &ar[18]);
2251   butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
2252                                      cospi_20_64, cospi_12_64, &al[26], &ar[26],
2253                                      &al[21], &ar[21]);
2254   butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
2255                                      cospi_12_64, -cospi_20_64, &al[25],
2256                                      &ar[25], &al[22], &ar[22]);
2257 
2258   // Stage 7.
2259   bl[0] = al[0];
2260   br[0] = ar[0];
2261   bl[1] = al[1];
2262   br[1] = ar[1];
2263   bl[2] = al[2];
2264   br[2] = ar[2];
2265   bl[3] = al[3];
2266   br[3] = ar[3];
2267   bl[4] = al[4];
2268   br[4] = ar[4];
2269   bl[5] = al[5];
2270   br[5] = ar[5];
2271   bl[6] = al[6];
2272   br[6] = ar[6];
2273   bl[7] = al[7];
2274   br[7] = ar[7];
2275 
2276   butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64,
2277                                      cospi_30_64, &bl[8], &br[8], &bl[15],
2278                                      &br[15]);
2279   butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64,
2280                                      cospi_14_64, &bl[9], &br[9], &bl[14],
2281                                      &br[14]);
2282   butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
2283                                      cospi_10_64, cospi_22_64, &bl[10], &br[10],
2284                                      &bl[13], &br[13]);
2285   butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11],
2286                                      cospi_26_64, cospi_6_64, &bl[11], &br[11],
2287                                      &bl[12], &br[12]);
2288 
2289   bl[16] = vaddq_s32(al[16], al[17]);
2290   br[16] = vaddq_s32(ar[16], ar[17]);
2291   bl[17] = vsubq_s32(al[16], al[17]);
2292   br[17] = vsubq_s32(ar[16], ar[17]);
2293   bl[18] = vsubq_s32(al[19], al[18]);
2294   br[18] = vsubq_s32(ar[19], ar[18]);
2295   bl[19] = vaddq_s32(al[19], al[18]);
2296   br[19] = vaddq_s32(ar[19], ar[18]);
2297   bl[20] = vaddq_s32(al[20], al[21]);
2298   br[20] = vaddq_s32(ar[20], ar[21]);
2299   bl[21] = vsubq_s32(al[20], al[21]);
2300   br[21] = vsubq_s32(ar[20], ar[21]);
2301   bl[22] = vsubq_s32(al[23], al[22]);
2302   br[22] = vsubq_s32(ar[23], ar[22]);
2303   bl[23] = vaddq_s32(al[23], al[22]);
2304   br[23] = vaddq_s32(ar[23], ar[22]);
2305   bl[24] = vaddq_s32(al[24], al[25]);
2306   br[24] = vaddq_s32(ar[24], ar[25]);
2307   bl[25] = vsubq_s32(al[24], al[25]);
2308   br[25] = vsubq_s32(ar[24], ar[25]);
2309   bl[26] = vsubq_s32(al[27], al[26]);
2310   br[26] = vsubq_s32(ar[27], ar[26]);
2311   bl[27] = vaddq_s32(al[27], al[26]);
2312   br[27] = vaddq_s32(ar[27], ar[26]);
2313   bl[28] = vaddq_s32(al[28], al[29]);
2314   br[28] = vaddq_s32(ar[28], ar[29]);
2315   bl[29] = vsubq_s32(al[28], al[29]);
2316   br[29] = vsubq_s32(ar[28], ar[29]);
2317   bl[30] = vsubq_s32(al[31], al[30]);
2318   br[30] = vsubq_s32(ar[31], ar[30]);
2319   bl[31] = vaddq_s32(al[31], al[30]);
2320   br[31] = vaddq_s32(ar[31], ar[30]);
2321 
2322   // Final stage.
2323 
2324   left[0] = bl[0];
2325   right[0] = br[0];
2326   left[16] = bl[1];
2327   right[16] = br[1];
2328   left[8] = bl[2];
2329   right[8] = br[2];
2330   left[24] = bl[3];
2331   right[24] = br[3];
2332   left[4] = bl[4];
2333   right[4] = br[4];
2334   left[20] = bl[5];
2335   right[20] = br[5];
2336   left[12] = bl[6];
2337   right[12] = br[6];
2338   left[28] = bl[7];
2339   right[28] = br[7];
2340   left[2] = bl[8];
2341   right[2] = br[8];
2342   left[18] = bl[9];
2343   right[18] = br[9];
2344   left[10] = bl[10];
2345   right[10] = br[10];
2346   left[26] = bl[11];
2347   right[26] = br[11];
2348   left[6] = bl[12];
2349   right[6] = br[12];
2350   left[22] = bl[13];
2351   right[22] = br[13];
2352   left[14] = bl[14];
2353   right[14] = br[14];
2354   left[30] = bl[15];
2355   right[30] = br[15];
2356 
2357   butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64,
2358                                      cospi_31_64, &al[1], &ar[1], &al[31],
2359                                      &ar[31]);
2360   left[1] = al[1];
2361   right[1] = ar[1];
2362   left[31] = al[31];
2363   right[31] = ar[31];
2364 
2365   butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17],
2366                                      cospi_17_64, cospi_15_64, &al[17], &ar[17],
2367                                      &al[15], &ar[15]);
2368   left[17] = al[17];
2369   right[17] = ar[17];
2370   left[15] = al[15];
2371   right[15] = ar[15];
2372 
2373   butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64,
2374                                      cospi_23_64, &al[9], &ar[9], &al[23],
2375                                      &ar[23]);
2376   left[9] = al[9];
2377   right[9] = ar[9];
2378   left[23] = al[23];
2379   right[23] = ar[23];
2380 
2381   butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19],
2382                                      cospi_25_64, cospi_7_64, &al[25], &ar[25],
2383                                      &al[7], &ar[7]);
2384   left[25] = al[25];
2385   right[25] = ar[25];
2386   left[7] = al[7];
2387   right[7] = ar[7];
2388 
2389   butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64,
2390                                      cospi_27_64, &al[5], &ar[5], &al[27],
2391                                      &ar[27]);
2392   left[5] = al[5];
2393   right[5] = ar[5];
2394   left[27] = al[27];
2395   right[27] = ar[27];
2396 
2397   butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
2398                                      cospi_21_64, cospi_11_64, &al[21], &ar[21],
2399                                      &al[11], &ar[11]);
2400   left[21] = al[21];
2401   right[21] = ar[21];
2402   left[11] = al[11];
2403   right[11] = ar[11];
2404 
2405   butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
2406                                      cospi_13_64, cospi_19_64, &al[13], &ar[13],
2407                                      &al[19], &ar[19]);
2408   left[13] = al[13];
2409   right[13] = ar[13];
2410   left[19] = al[19];
2411   right[19] = ar[19];
2412 
2413   butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23],
2414                                      cospi_29_64, cospi_3_64, &al[29], &ar[29],
2415                                      &al[3], &ar[3]);
2416   left[29] = al[29];
2417   right[29] = ar[29];
2418   left[3] = al[3];
2419   right[3] = ar[3];
2420 }
2421 
2422 static INLINE void highbd_dct8x32_body_second_pass_rd(int32x4_t *left /*[32]*/,
2423                                                       int32x4_t *right /*[32]*/) {
2424   int32x4_t al[32], ar[32];
2425   int32x4_t bl[32], br[32];
2426 
2427   // Stage 1: Done as part of the load.
2428 
2429   // Stage 2.
2430   // For the "rd" version, all values are rounded down (add_round_shift_s32)
2431   // after stage 2 so that the intermediates stay within a 16-bit range.
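  // Here that rounding is fused directly into the stage 2 outputs (and the
  // pass-through rows), rather than applied as a separate partial pass over
  // all 32 rows as in the non-rd path.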
2432   al[0] = add_round_shift_s32(vaddq_s32(left[0], left[15]));
2433   ar[0] = add_round_shift_s32(vaddq_s32(right[0], right[15]));
2434   al[1] = add_round_shift_s32(vaddq_s32(left[1], left[14]));
2435   ar[1] = add_round_shift_s32(vaddq_s32(right[1], right[14]));
2436   al[2] = add_round_shift_s32(vaddq_s32(left[2], left[13]));
2437   ar[2] = add_round_shift_s32(vaddq_s32(right[2], right[13]));
2438   al[3] = add_round_shift_s32(vaddq_s32(left[3], left[12]));
2439   ar[3] = add_round_shift_s32(vaddq_s32(right[3], right[12]));
2440   al[4] = add_round_shift_s32(vaddq_s32(left[4], left[11]));
2441   ar[4] = add_round_shift_s32(vaddq_s32(right[4], right[11]));
2442   al[5] = add_round_shift_s32(vaddq_s32(left[5], left[10]));
2443   ar[5] = add_round_shift_s32(vaddq_s32(right[5], right[10]));
2444   al[6] = add_round_shift_s32(vaddq_s32(left[6], left[9]));
2445   ar[6] = add_round_shift_s32(vaddq_s32(right[6], right[9]));
2446   al[7] = add_round_shift_s32(vaddq_s32(left[7], left[8]));
2447   ar[7] = add_round_shift_s32(vaddq_s32(right[7], right[8]));
2448 
2449   al[8] = add_round_shift_s32(vsubq_s32(left[7], left[8]));
2450   ar[8] = add_round_shift_s32(vsubq_s32(right[7], right[8]));
2451   al[9] = add_round_shift_s32(vsubq_s32(left[6], left[9]));
2452   ar[9] = add_round_shift_s32(vsubq_s32(right[6], right[9]));
2453   al[10] = add_round_shift_s32(vsubq_s32(left[5], left[10]));
2454   ar[10] = add_round_shift_s32(vsubq_s32(right[5], right[10]));
2455   al[11] = add_round_shift_s32(vsubq_s32(left[4], left[11]));
2456   ar[11] = add_round_shift_s32(vsubq_s32(right[4], right[11]));
2457   al[12] = add_round_shift_s32(vsubq_s32(left[3], left[12]));
2458   ar[12] = add_round_shift_s32(vsubq_s32(right[3], right[12]));
2459   al[13] = add_round_shift_s32(vsubq_s32(left[2], left[13]));
2460   ar[13] = add_round_shift_s32(vsubq_s32(right[2], right[13]));
2461   al[14] = add_round_shift_s32(vsubq_s32(left[1], left[14]));
2462   ar[14] = add_round_shift_s32(vsubq_s32(right[1], right[14]));
2463   al[15] = add_round_shift_s32(vsubq_s32(left[0], left[15]));
2464   ar[15] = add_round_shift_s32(vsubq_s32(right[0], right[15]));
2465 
2466   al[16] = add_round_shift_s32(left[16]);
2467   ar[16] = add_round_shift_s32(right[16]);
2468   al[17] = add_round_shift_s32(left[17]);
2469   ar[17] = add_round_shift_s32(right[17]);
2470   al[18] = add_round_shift_s32(left[18]);
2471   ar[18] = add_round_shift_s32(right[18]);
2472   al[19] = add_round_shift_s32(left[19]);
2473   ar[19] = add_round_shift_s32(right[19]);
2474 
2475   butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
2476                                cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
2477   butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
2478                                cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
2479   butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
2480                                cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
2481   butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
2482                                cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
2483 
2484   al[20] = add_round_shift_s32(al[20]);
2485   ar[20] = add_round_shift_s32(ar[20]);
2486   al[21] = add_round_shift_s32(al[21]);
2487   ar[21] = add_round_shift_s32(ar[21]);
2488   al[22] = add_round_shift_s32(al[22]);
2489   ar[22] = add_round_shift_s32(ar[22]);
2490   al[23] = add_round_shift_s32(al[23]);
2491   ar[23] = add_round_shift_s32(ar[23]);
2492   al[24] = add_round_shift_s32(al[24]);
2493   ar[24] = add_round_shift_s32(ar[24]);
2494   al[25] = add_round_shift_s32(al[25]);
2495   ar[25] = add_round_shift_s32(ar[25]);
2496   al[26] = add_round_shift_s32(al[26]);
2497   ar[26] = add_round_shift_s32(ar[26]);
2498   al[27] = add_round_shift_s32(al[27]);
2499   ar[27] = add_round_shift_s32(ar[27]);
2500 
2501   al[28] = add_round_shift_s32(left[28]);
2502   ar[28] = add_round_shift_s32(right[28]);
2503   al[29] = add_round_shift_s32(left[29]);
2504   ar[29] = add_round_shift_s32(right[29]);
2505   al[30] = add_round_shift_s32(left[30]);
2506   ar[30] = add_round_shift_s32(right[30]);
2507   al[31] = add_round_shift_s32(left[31]);
2508   ar[31] = add_round_shift_s32(right[31]);
2509 
2510   // Stage 3.
2511   bl[0] = vaddq_s32(al[0], al[7]);
2512   br[0] = vaddq_s32(ar[0], ar[7]);
2513   bl[1] = vaddq_s32(al[1], al[6]);
2514   br[1] = vaddq_s32(ar[1], ar[6]);
2515   bl[2] = vaddq_s32(al[2], al[5]);
2516   br[2] = vaddq_s32(ar[2], ar[5]);
2517   bl[3] = vaddq_s32(al[3], al[4]);
2518   br[3] = vaddq_s32(ar[3], ar[4]);
2519 
2520   bl[4] = vsubq_s32(al[3], al[4]);
2521   br[4] = vsubq_s32(ar[3], ar[4]);
2522   bl[5] = vsubq_s32(al[2], al[5]);
2523   br[5] = vsubq_s32(ar[2], ar[5]);
2524   bl[6] = vsubq_s32(al[1], al[6]);
2525   br[6] = vsubq_s32(ar[1], ar[6]);
2526   bl[7] = vsubq_s32(al[0], al[7]);
2527   br[7] = vsubq_s32(ar[0], ar[7]);
2528 
2529   bl[8] = al[8];
2530   br[8] = ar[8];
2531   bl[9] = al[9];
2532   br[9] = ar[9];
2533 
2534   butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
2535                                &bl[13], &br[13], &bl[10], &br[10]);
2536   butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
2537                                &bl[12], &br[12], &bl[11], &br[11]);
2538 
2539   bl[14] = al[14];
2540   br[14] = ar[14];
2541   bl[15] = al[15];
2542   br[15] = ar[15];
2543 
2544   bl[16] = vaddq_s32(al[16], al[23]);
2545   br[16] = vaddq_s32(ar[16], ar[23]);
2546   bl[17] = vaddq_s32(al[17], al[22]);
2547   br[17] = vaddq_s32(ar[17], ar[22]);
2548   bl[18] = vaddq_s32(al[18], al[21]);
2549   br[18] = vaddq_s32(ar[18], ar[21]);
2550   bl[19] = vaddq_s32(al[19], al[20]);
2551   br[19] = vaddq_s32(ar[19], ar[20]);
2552 
2553   bl[20] = vsubq_s32(al[19], al[20]);
2554   br[20] = vsubq_s32(ar[19], ar[20]);
2555   bl[21] = vsubq_s32(al[18], al[21]);
2556   br[21] = vsubq_s32(ar[18], ar[21]);
2557   bl[22] = vsubq_s32(al[17], al[22]);
2558   br[22] = vsubq_s32(ar[17], ar[22]);
2559   bl[23] = vsubq_s32(al[16], al[23]);
2560   br[23] = vsubq_s32(ar[16], ar[23]);
2561 
2562   bl[24] = vsubq_s32(al[31], al[24]);
2563   br[24] = vsubq_s32(ar[31], ar[24]);
2564   bl[25] = vsubq_s32(al[30], al[25]);
2565   br[25] = vsubq_s32(ar[30], ar[25]);
2566   bl[26] = vsubq_s32(al[29], al[26]);
2567   br[26] = vsubq_s32(ar[29], ar[26]);
2568   bl[27] = vsubq_s32(al[28], al[27]);
2569   br[27] = vsubq_s32(ar[28], ar[27]);
2570 
2571   bl[28] = vaddq_s32(al[28], al[27]);
2572   br[28] = vaddq_s32(ar[28], ar[27]);
2573   bl[29] = vaddq_s32(al[29], al[26]);
2574   br[29] = vaddq_s32(ar[29], ar[26]);
2575   bl[30] = vaddq_s32(al[30], al[25]);
2576   br[30] = vaddq_s32(ar[30], ar[25]);
2577   bl[31] = vaddq_s32(al[31], al[24]);
2578   br[31] = vaddq_s32(ar[31], ar[24]);
2579 
2580   // Stage 4.
2581   al[0] = vaddq_s32(bl[0], bl[3]);
2582   ar[0] = vaddq_s32(br[0], br[3]);
2583   al[1] = vaddq_s32(bl[1], bl[2]);
2584   ar[1] = vaddq_s32(br[1], br[2]);
2585   al[2] = vsubq_s32(bl[1], bl[2]);
2586   ar[2] = vsubq_s32(br[1], br[2]);
2587   al[3] = vsubq_s32(bl[0], bl[3]);
2588   ar[3] = vsubq_s32(br[0], br[3]);
2589 
2590   al[4] = bl[4];
2591   ar[4] = br[4];
2592 
2593   butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
2594                                &ar[6], &al[5], &ar[5]);
2595 
2596   al[7] = bl[7];
2597   ar[7] = br[7];
2598 
2599   al[8] = vaddq_s32(bl[8], bl[11]);
2600   ar[8] = vaddq_s32(br[8], br[11]);
2601   al[9] = vaddq_s32(bl[9], bl[10]);
2602   ar[9] = vaddq_s32(br[9], br[10]);
2603   al[10] = vsubq_s32(bl[9], bl[10]);
2604   ar[10] = vsubq_s32(br[9], br[10]);
2605   al[11] = vsubq_s32(bl[8], bl[11]);
2606   ar[11] = vsubq_s32(br[8], br[11]);
2607   al[12] = vsubq_s32(bl[15], bl[12]);
2608   ar[12] = vsubq_s32(br[15], br[12]);
2609   al[13] = vsubq_s32(bl[14], bl[13]);
2610   ar[13] = vsubq_s32(br[14], br[13]);
2611   al[14] = vaddq_s32(bl[14], bl[13]);
2612   ar[14] = vaddq_s32(br[14], br[13]);
2613   al[15] = vaddq_s32(bl[15], bl[12]);
2614   ar[15] = vaddq_s32(br[15], br[12]);
2615 
2616   al[16] = bl[16];
2617   ar[16] = br[16];
2618   al[17] = bl[17];
2619   ar[17] = br[17];
2620 
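  // From this point the rd body can use the plain 32-bit
  // butterfly_two_coeff_s32 (no 64-bit widening), presumably because the
  // stage 2 rounding above has already reduced the dynamic range.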
2621   butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_8_64,
2622                           cospi_24_64, &al[29], &ar[29], &al[18], &ar[18]);
2623   butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_8_64,
2624                           cospi_24_64, &al[28], &ar[28], &al[19], &ar[19]);
2625   butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_24_64,
2626                           -cospi_8_64, &al[27], &ar[27], &al[20], &ar[20]);
2627   butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_24_64,
2628                           -cospi_8_64, &al[26], &ar[26], &al[21], &ar[21]);
2629 
2630   al[22] = bl[22];
2631   ar[22] = br[22];
2632   al[23] = bl[23];
2633   ar[23] = br[23];
2634   al[24] = bl[24];
2635   ar[24] = br[24];
2636   al[25] = bl[25];
2637   ar[25] = br[25];
2638 
2639   al[30] = bl[30];
2640   ar[30] = br[30];
2641   al[31] = bl[31];
2642   ar[31] = br[31];
2643 
2644   // Stage 5.
2645   butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
2646                                &br[0], &bl[1], &br[1]);
2647   butterfly_two_coeff_s32(al[3], ar[3], al[2], ar[2], cospi_8_64, cospi_24_64,
2648                           &bl[2], &br[2], &bl[3], &br[3]);
2649 
2650   bl[4] = vaddq_s32(al[4], al[5]);
2651   br[4] = vaddq_s32(ar[4], ar[5]);
2652   bl[5] = vsubq_s32(al[4], al[5]);
2653   br[5] = vsubq_s32(ar[4], ar[5]);
2654   bl[6] = vsubq_s32(al[7], al[6]);
2655   br[6] = vsubq_s32(ar[7], ar[6]);
2656   bl[7] = vaddq_s32(al[7], al[6]);
2657   br[7] = vaddq_s32(ar[7], ar[6]);
2658 
2659   bl[8] = al[8];
2660   br[8] = ar[8];
2661 
2662   butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_8_64, cospi_24_64,
2663                           &bl[14], &br[14], &bl[9], &br[9]);
2664   butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_24_64,
2665                           -cospi_8_64, &bl[13], &br[13], &bl[10], &br[10]);
2666 
2667   bl[11] = al[11];
2668   br[11] = ar[11];
2669   bl[12] = al[12];
2670   br[12] = ar[12];
2671 
2672   bl[15] = al[15];
2673   br[15] = ar[15];
2674 
2675   bl[16] = vaddq_s32(al[19], al[16]);
2676   br[16] = vaddq_s32(ar[19], ar[16]);
2677   bl[17] = vaddq_s32(al[18], al[17]);
2678   br[17] = vaddq_s32(ar[18], ar[17]);
2679   bl[18] = vsubq_s32(al[17], al[18]);
2680   br[18] = vsubq_s32(ar[17], ar[18]);
2681   bl[19] = vsubq_s32(al[16], al[19]);
2682   br[19] = vsubq_s32(ar[16], ar[19]);
2683   bl[20] = vsubq_s32(al[23], al[20]);
2684   br[20] = vsubq_s32(ar[23], ar[20]);
2685   bl[21] = vsubq_s32(al[22], al[21]);
2686   br[21] = vsubq_s32(ar[22], ar[21]);
2687   bl[22] = vaddq_s32(al[21], al[22]);
2688   br[22] = vaddq_s32(ar[21], ar[22]);
2689   bl[23] = vaddq_s32(al[20], al[23]);
2690   br[23] = vaddq_s32(ar[20], ar[23]);
2691   bl[24] = vaddq_s32(al[27], al[24]);
2692   br[24] = vaddq_s32(ar[27], ar[24]);
2693   bl[25] = vaddq_s32(al[26], al[25]);
2694   br[25] = vaddq_s32(ar[26], ar[25]);
2695   bl[26] = vsubq_s32(al[25], al[26]);
2696   br[26] = vsubq_s32(ar[25], ar[26]);
2697   bl[27] = vsubq_s32(al[24], al[27]);
2698   br[27] = vsubq_s32(ar[24], ar[27]);
2699   bl[28] = vsubq_s32(al[31], al[28]);
2700   br[28] = vsubq_s32(ar[31], ar[28]);
2701   bl[29] = vsubq_s32(al[30], al[29]);
2702   br[29] = vsubq_s32(ar[30], ar[29]);
2703   bl[30] = vaddq_s32(al[29], al[30]);
2704   br[30] = vaddq_s32(ar[29], ar[30]);
2705   bl[31] = vaddq_s32(al[28], al[31]);
2706   br[31] = vaddq_s32(ar[28], ar[31]);
2707 
2708   // Stage 6.
2709   al[0] = bl[0];
2710   ar[0] = br[0];
2711   al[1] = bl[1];
2712   ar[1] = br[1];
2713   al[2] = bl[2];
2714   ar[2] = br[2];
2715   al[3] = bl[3];
2716   ar[3] = br[3];
2717 
2718   butterfly_two_coeff_s32(bl[7], br[7], bl[4], br[4], cospi_4_64, cospi_28_64,
2719                           &al[4], &ar[4], &al[7], &ar[7]);
2720   butterfly_two_coeff_s32(bl[6], br[6], bl[5], br[5], cospi_20_64, cospi_12_64,
2721                           &al[5], &ar[5], &al[6], &ar[6]);
2722 
2723   al[8] = vaddq_s32(bl[8], bl[9]);
2724   ar[8] = vaddq_s32(br[8], br[9]);
2725   al[9] = vsubq_s32(bl[8], bl[9]);
2726   ar[9] = vsubq_s32(br[8], br[9]);
2727   al[10] = vsubq_s32(bl[11], bl[10]);
2728   ar[10] = vsubq_s32(br[11], br[10]);
2729   al[11] = vaddq_s32(bl[11], bl[10]);
2730   ar[11] = vaddq_s32(br[11], br[10]);
2731   al[12] = vaddq_s32(bl[12], bl[13]);
2732   ar[12] = vaddq_s32(br[12], br[13]);
2733   al[13] = vsubq_s32(bl[12], bl[13]);
2734   ar[13] = vsubq_s32(br[12], br[13]);
2735   al[14] = vsubq_s32(bl[15], bl[14]);
2736   ar[14] = vsubq_s32(br[15], br[14]);
2737   al[15] = vaddq_s32(bl[15], bl[14]);
2738   ar[15] = vaddq_s32(br[15], br[14]);
2739 
2740   al[16] = bl[16];
2741   ar[16] = br[16];
2742   al[19] = bl[19];
2743   ar[19] = br[19];
2744   al[20] = bl[20];
2745   ar[20] = br[20];
2746   al[23] = bl[23];
2747   ar[23] = br[23];
2748   al[24] = bl[24];
2749   ar[24] = br[24];
2750   al[27] = bl[27];
2751   ar[27] = br[27];
2752   al[28] = bl[28];
2753   ar[28] = br[28];
2754   al[31] = bl[31];
2755   ar[31] = br[31];
2756 
2757   butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_4_64,
2758                           cospi_28_64, &al[30], &ar[30], &al[17], &ar[17]);
2759   butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_28_64,
2760                           -cospi_4_64, &al[29], &ar[29], &al[18], &ar[18]);
2761   butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_20_64,
2762                           cospi_12_64, &al[26], &ar[26], &al[21], &ar[21]);
2763   butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_12_64,
2764                           -cospi_20_64, &al[25], &ar[25], &al[22], &ar[22]);
2765 
2766   // Stage 7.
  bl[0] = al[0];
  br[0] = ar[0];
  bl[1] = al[1];
  br[1] = ar[1];
  bl[2] = al[2];
  br[2] = ar[2];
  bl[3] = al[3];
  br[3] = ar[3];
  bl[4] = al[4];
  br[4] = ar[4];
  bl[5] = al[5];
  br[5] = ar[5];
  bl[6] = al[6];
  br[6] = ar[6];
  bl[7] = al[7];
  br[7] = ar[7];

  butterfly_two_coeff_s32(al[15], ar[15], al[8], ar[8], cospi_2_64, cospi_30_64,
                          &bl[8], &br[8], &bl[15], &br[15]);
  butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_18_64,
                          cospi_14_64, &bl[9], &br[9], &bl[14], &br[14]);
  butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_10_64,
                          cospi_22_64, &bl[10], &br[10], &bl[13], &br[13]);
  butterfly_two_coeff_s32(al[12], ar[12], al[11], ar[11], cospi_26_64,
                          cospi_6_64, &bl[11], &br[11], &bl[12], &br[12]);

  bl[16] = vaddq_s32(al[16], al[17]);
  br[16] = vaddq_s32(ar[16], ar[17]);
  bl[17] = vsubq_s32(al[16], al[17]);
  br[17] = vsubq_s32(ar[16], ar[17]);
  bl[18] = vsubq_s32(al[19], al[18]);
  br[18] = vsubq_s32(ar[19], ar[18]);
  bl[19] = vaddq_s32(al[19], al[18]);
  br[19] = vaddq_s32(ar[19], ar[18]);
  bl[20] = vaddq_s32(al[20], al[21]);
  br[20] = vaddq_s32(ar[20], ar[21]);
  bl[21] = vsubq_s32(al[20], al[21]);
  br[21] = vsubq_s32(ar[20], ar[21]);
  bl[22] = vsubq_s32(al[23], al[22]);
  br[22] = vsubq_s32(ar[23], ar[22]);
  bl[23] = vaddq_s32(al[23], al[22]);
  br[23] = vaddq_s32(ar[23], ar[22]);
  bl[24] = vaddq_s32(al[24], al[25]);
  br[24] = vaddq_s32(ar[24], ar[25]);
  bl[25] = vsubq_s32(al[24], al[25]);
  br[25] = vsubq_s32(ar[24], ar[25]);
  bl[26] = vsubq_s32(al[27], al[26]);
  br[26] = vsubq_s32(ar[27], ar[26]);
  bl[27] = vaddq_s32(al[27], al[26]);
  br[27] = vaddq_s32(ar[27], ar[26]);
  bl[28] = vaddq_s32(al[28], al[29]);
  br[28] = vaddq_s32(ar[28], ar[29]);
  bl[29] = vsubq_s32(al[28], al[29]);
  br[29] = vsubq_s32(ar[28], ar[29]);
  bl[30] = vsubq_s32(al[31], al[30]);
  br[30] = vsubq_s32(ar[31], ar[30]);
  bl[31] = vaddq_s32(al[31], al[30]);
  br[31] = vaddq_s32(ar[31], ar[30]);

  // Final stage.
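  // The even output rows are the stage-7 values 0-15 written out in
  // bit-reversed order (0, 16, 8, 24, ...); each odd output row below takes
  // one more rotation, by an odd cosine index, of a stage-7 pair.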
  left[0] = bl[0];
  right[0] = br[0];
  left[16] = bl[1];
  right[16] = br[1];
  left[8] = bl[2];
  right[8] = br[2];
  left[24] = bl[3];
  right[24] = br[3];
  left[4] = bl[4];
  right[4] = br[4];
  left[20] = bl[5];
  right[20] = br[5];
  left[12] = bl[6];
  right[12] = br[6];
  left[28] = bl[7];
  right[28] = br[7];
  left[2] = bl[8];
  right[2] = br[8];
  left[18] = bl[9];
  right[18] = br[9];
  left[10] = bl[10];
  right[10] = br[10];
  left[26] = bl[11];
  right[26] = br[11];
  left[6] = bl[12];
  right[6] = br[12];
  left[22] = bl[13];
  right[22] = br[13];
  left[14] = bl[14];
  right[14] = br[14];
  left[30] = bl[15];
  right[30] = br[15];

  butterfly_two_coeff_s32(bl[31], br[31], bl[16], br[16], cospi_1_64,
                          cospi_31_64, &al[1], &ar[1], &al[31], &ar[31]);
  left[1] = al[1];
  right[1] = ar[1];
  left[31] = al[31];
  right[31] = ar[31];

  butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_17_64,
                          cospi_15_64, &al[17], &ar[17], &al[15], &ar[15]);
  left[17] = al[17];
  right[17] = ar[17];
  left[15] = al[15];
  right[15] = ar[15];

  butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_9_64,
                          cospi_23_64, &al[9], &ar[9], &al[23], &ar[23]);
  left[9] = al[9];
  right[9] = ar[9];
  left[23] = al[23];
  right[23] = ar[23];

  butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_25_64,
                          cospi_7_64, &al[25], &ar[25], &al[7], &ar[7]);
  left[25] = al[25];
  right[25] = ar[25];
  left[7] = al[7];
  right[7] = ar[7];

  butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_5_64,
                          cospi_27_64, &al[5], &ar[5], &al[27], &ar[27]);
  left[5] = al[5];
  right[5] = ar[5];
  left[27] = al[27];
  right[27] = ar[27];

  butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_21_64,
                          cospi_11_64, &al[21], &ar[21], &al[11], &ar[11]);
  left[21] = al[21];
  right[21] = ar[21];
  left[11] = al[11];
  right[11] = ar[11];

  butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_13_64,
                          cospi_19_64, &al[13], &ar[13], &al[19], &ar[19]);
  left[13] = al[13];
  right[13] = ar[13];
  left[19] = al[19];
  right[19] = ar[19];

  butterfly_two_coeff_s32(bl[24], br[24], bl[23], br[23], cospi_29_64,
                          cospi_3_64, &al[29], &ar[29], &al[3], &ar[3]);
  left[29] = al[29];
  right[29] = ar[29];
  left[3] = al[3];
  right[3] = ar[3];
}
#endif  // CONFIG_VP9_HIGHBITDEPTH

#endif  // VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_