1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 #include <assert.h>
12 #include <immintrin.h>
13 
14 #include "config/aom_config.h"
15 #include "config/av1_rtcd.h"
16 
17 #include "av1/common/av1_inv_txfm1d_cfg.h"
18 #include "av1/common/idct.h"
19 #include "av1/common/x86/av1_inv_txfm_ssse3.h"
20 #include "av1/common/x86/highbd_txfm_utility_sse4.h"
21 #include "aom_dsp/x86/txfm_common_avx2.h"
22 
23 // Note:
24 //  A total of 32x4 registers represent the 32x32 block of coefficients.
25 //  For high bit depth, each coefficient is 4 bytes.
26 //  Each __m256i register holds 8 coefficients.
27 //  So each "row" needs 4 registers, 32 rows in total.
28 //  Register layout:
29 //   v0,   v1,   v2,   v3,
30 //   v4,   v5,   v6,   v7,
31 //   ... ...
32 //   v124, v125, v126, v127
33 
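//
// For illustration only (a hypothetical helper, not used by the code below):
// with the layout above, the coefficient at (row, col) of the 32x32 block
// lives in register in[row * 4 + (col >> 3)], 32-bit lane (col & 7), e.g.:
//
//   static inline int32_t get_coeff_32x32(const __m256i *in, int row, int col) {
//     int32_t lane[8];
//     _mm256_storeu_si256((__m256i *)lane, in[row * 4 + (col >> 3)]);
//     return lane[col & 7];
//   }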
34 static inline __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) {
35   const __m256i zero = _mm256_setzero_si256();
36   const __m256i one = _mm256_set1_epi16(1);
37   const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
38   __m256i clamped, mask;
39 
40   mask = _mm256_cmpgt_epi16(u, max);
41   clamped = _mm256_andnot_si256(mask, u);
42   mask = _mm256_and_si256(mask, max);
43   clamped = _mm256_or_si256(mask, clamped);
44   mask = _mm256_cmpgt_epi16(clamped, zero);
45   clamped = _mm256_and_si256(clamped, mask);
46 
47   return clamped;
48 }
49 
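// Rounding right shift: adds 1 << (shift - 1) to each 32-bit lane of four
// registers and arithmetically shifts right by 'shift'; a no-op when shift
// is 0. round_shift_8x8_avx2() applies the same to 16 registers (an 8x8
// block of 32-bit values).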
50 static inline void round_shift_4x4_avx2(__m256i *in, int shift) {
51   if (shift != 0) {
52     __m256i rnding = _mm256_set1_epi32(1 << (shift - 1));
53     in[0] = _mm256_add_epi32(in[0], rnding);
54     in[1] = _mm256_add_epi32(in[1], rnding);
55     in[2] = _mm256_add_epi32(in[2], rnding);
56     in[3] = _mm256_add_epi32(in[3], rnding);
57 
58     in[0] = _mm256_srai_epi32(in[0], shift);
59     in[1] = _mm256_srai_epi32(in[1], shift);
60     in[2] = _mm256_srai_epi32(in[2], shift);
61     in[3] = _mm256_srai_epi32(in[3], shift);
62   }
63 }
64 
65 static inline void round_shift_8x8_avx2(__m256i *in, int shift) {
66   round_shift_4x4_avx2(in, shift);
67   round_shift_4x4_avx2(in + 4, shift);
68   round_shift_4x4_avx2(in + 8, shift);
69   round_shift_4x4_avx2(in + 12, shift);
70 }
71 
72 static void highbd_clamp_epi32_avx2(__m256i *in, __m256i *out,
73                                     const __m256i *clamp_lo,
74                                     const __m256i *clamp_hi, int size) {
75   __m256i a0, a1;
76   for (int i = 0; i < size; i += 4) {
77     a0 = _mm256_max_epi32(in[i], *clamp_lo);
78     out[i] = _mm256_min_epi32(a0, *clamp_hi);
79 
80     a1 = _mm256_max_epi32(in[i + 1], *clamp_lo);
81     out[i + 1] = _mm256_min_epi32(a1, *clamp_hi);
82 
83     a0 = _mm256_max_epi32(in[i + 2], *clamp_lo);
84     out[i + 2] = _mm256_min_epi32(a0, *clamp_hi);
85 
86     a1 = _mm256_max_epi32(in[i + 3], *clamp_lo);
87     out[i + 3] = _mm256_min_epi32(a1, *clamp_hi);
88   }
89 }
90 
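// Reconstruction helpers: widen 16-bit prediction pixels to 32 bits, add the
// 32-bit residuals, pack back to 16 bits with unsigned saturation, and clamp
// to [0, (1 << bd) - 1]. The highbd_write_buffer_*xn wrappers do this per
// row, walking the residual rows in reverse when flipud is set.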
91 static inline __m256i highbd_get_recon_16x8_avx2(const __m256i pred,
92                                                  __m256i res0, __m256i res1,
93                                                  const int bd) {
94   __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred));
95   __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1));
96 
97   x0 = _mm256_add_epi32(res0, x0);
98   x1 = _mm256_add_epi32(res1, x1);
99   x0 = _mm256_packus_epi32(x0, x1);
100   x0 = _mm256_permute4x64_epi64(x0, 0xd8);
101   x0 = highbd_clamp_epi16_avx2(x0, bd);
102   return x0;
103 }
104 
105 static inline void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output,
106                                                  int stride, int flipud,
107                                                  int height, const int bd) {
108   int j = flipud ? (height - 1) : 0;
109   const int step = flipud ? -1 : 1;
110   for (int i = 0; i < height; ++i, j += step) {
111     __m256i v = _mm256_loadu_si256((__m256i const *)(output + i * stride));
112     __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd);
113 
114     _mm256_storeu_si256((__m256i *)(output + i * stride), u);
115   }
116 }
117 static inline __m256i highbd_get_recon_8x8_avx2(const __m256i pred, __m256i res,
118                                                 const int bd) {
119   __m256i x0 = pred;
120   x0 = _mm256_add_epi32(res, x0);
121   x0 = _mm256_packus_epi32(x0, x0);
122   x0 = _mm256_permute4x64_epi64(x0, 0xd8);
123   x0 = highbd_clamp_epi16_avx2(x0, bd);
124   return x0;
125 }
126 
127 static inline void highbd_write_buffer_8xn_avx2(__m256i *in, uint16_t *output,
128                                                 int stride, int flipud,
129                                                 int height, const int bd) {
130   int j = flipud ? (height - 1) : 0;
131   __m128i temp;
132   const int step = flipud ? -1 : 1;
133   for (int i = 0; i < height; ++i, j += step) {
134     temp = _mm_loadu_si128((__m128i const *)(output + i * stride));
135     __m256i v = _mm256_cvtepi16_epi32(temp);
136     __m256i u = highbd_get_recon_8x8_avx2(v, in[j], bd);
137     __m128i u1 = _mm256_castsi256_si128(u);
138     _mm_storeu_si128((__m128i *)(output + i * stride), u1);
139   }
140 }
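// Rounding shift that also negates the second input:
//   *out0 = clamp((in0 + (1 << shift) / 2) >> shift)
//   *out1 = clamp(((1 << shift) / 2 - in1) >> shift)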
141 static void neg_shift_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
142                            __m256i *out1, const __m256i *clamp_lo,
143                            const __m256i *clamp_hi, int shift) {
144   __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
145   __m256i a0 = _mm256_add_epi32(offset, in0);
146   __m256i a1 = _mm256_sub_epi32(offset, in1);
147 
148   a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
149   a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
150 
151   a0 = _mm256_max_epi32(a0, *clamp_lo);
152   a0 = _mm256_min_epi32(a0, *clamp_hi);
153   a1 = _mm256_max_epi32(a1, *clamp_lo);
154   a1 = _mm256_min_epi32(a1, *clamp_hi);
155 
156   *out0 = a0;
157   *out1 = a1;
158 }
159 
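// Transpose an 8x8 tile of 32-bit coefficients held in 8 __m256i registers;
// the _flip variant reads the input rows in reverse order first.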
160 static void transpose_8x8_avx2(const __m256i *in, __m256i *out) {
161   __m256i u0, u1, u2, u3, u4, u5, u6, u7;
162   __m256i x0, x1;
163 
164   u0 = _mm256_unpacklo_epi32(in[0], in[1]);
165   u1 = _mm256_unpackhi_epi32(in[0], in[1]);
166 
167   u2 = _mm256_unpacklo_epi32(in[2], in[3]);
168   u3 = _mm256_unpackhi_epi32(in[2], in[3]);
169 
170   u4 = _mm256_unpacklo_epi32(in[4], in[5]);
171   u5 = _mm256_unpackhi_epi32(in[4], in[5]);
172 
173   u6 = _mm256_unpacklo_epi32(in[6], in[7]);
174   u7 = _mm256_unpackhi_epi32(in[6], in[7]);
175 
176   x0 = _mm256_unpacklo_epi64(u0, u2);
177   x1 = _mm256_unpacklo_epi64(u4, u6);
178   out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
179   out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);
180 
181   x0 = _mm256_unpackhi_epi64(u0, u2);
182   x1 = _mm256_unpackhi_epi64(u4, u6);
183   out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
184   out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
185 
186   x0 = _mm256_unpacklo_epi64(u1, u3);
187   x1 = _mm256_unpacklo_epi64(u5, u7);
188   out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
189   out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
190 
191   x0 = _mm256_unpackhi_epi64(u1, u3);
192   x1 = _mm256_unpackhi_epi64(u5, u7);
193   out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
194   out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
195 }
196 
197 static void transpose_8x8_flip_avx2(const __m256i *in, __m256i *out) {
198   __m256i u0, u1, u2, u3, u4, u5, u6, u7;
199   __m256i x0, x1;
200 
201   u0 = _mm256_unpacklo_epi32(in[7], in[6]);
202   u1 = _mm256_unpackhi_epi32(in[7], in[6]);
203 
204   u2 = _mm256_unpacklo_epi32(in[5], in[4]);
205   u3 = _mm256_unpackhi_epi32(in[5], in[4]);
206 
207   u4 = _mm256_unpacklo_epi32(in[3], in[2]);
208   u5 = _mm256_unpackhi_epi32(in[3], in[2]);
209 
210   u6 = _mm256_unpacklo_epi32(in[1], in[0]);
211   u7 = _mm256_unpackhi_epi32(in[1], in[0]);
212 
213   x0 = _mm256_unpacklo_epi64(u0, u2);
214   x1 = _mm256_unpacklo_epi64(u4, u6);
215   out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
216   out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);
217 
218   x0 = _mm256_unpackhi_epi64(u0, u2);
219   x1 = _mm256_unpackhi_epi64(u4, u6);
220   out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
221   out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
222 
223   x0 = _mm256_unpacklo_epi64(u1, u3);
224   x1 = _mm256_unpacklo_epi64(u5, u7);
225   out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
226   out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
227 
228   x0 = _mm256_unpackhi_epi64(u1, u3);
229   x1 = _mm256_unpackhi_epi64(u5, u7);
230   out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
231   out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
232 }
233 
234 static inline void load_buffer_32bit_input(const int32_t *in, int stride,
235                                            __m256i *out, int out_size) {
236   for (int i = 0; i < out_size; ++i) {
237     out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride));
238   }
239 }
240 
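// Half-butterfly helpers: half_btf_0_avx2() returns (w0 * n0 + rounding) >> bit
// for the case where the second butterfly input is zero; half_btf_avx2() below
// returns (w0 * n0 + w1 * n1 + rounding) >> bit.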
241 static inline __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0,
242                                       const __m256i *rounding, int bit) {
243   __m256i x;
244   x = _mm256_mullo_epi32(*w0, *n0);
245   x = _mm256_add_epi32(x, *rounding);
246   x = _mm256_srai_epi32(x, bit);
247   return x;
248 }
249 
250 static inline __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0,
251                                     const __m256i *w1, const __m256i *n1,
252                                     const __m256i *rounding, int bit) {
253   __m256i x, y;
254 
255   x = _mm256_mullo_epi32(*w0, *n0);
256   y = _mm256_mullo_epi32(*w1, *n1);
257   x = _mm256_add_epi32(x, y);
258   x = _mm256_add_epi32(x, *rounding);
259   x = _mm256_srai_epi32(x, bit);
260   return x;
261 }
262 
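// Butterfly add/sub with range clamping:
//   *out0 = clamp(in0 + in1), *out1 = clamp(in0 - in1)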
263 static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
264                         __m256i *out1, const __m256i *clamp_lo,
265                         const __m256i *clamp_hi) {
266   __m256i a0 = _mm256_add_epi32(in0, in1);
267   __m256i a1 = _mm256_sub_epi32(in0, in1);
268 
269   a0 = _mm256_max_epi32(a0, *clamp_lo);
270   a0 = _mm256_min_epi32(a0, *clamp_hi);
271   a1 = _mm256_max_epi32(a1, *clamp_lo);
272   a1 = _mm256_min_epi32(a1, *clamp_hi);
273 
274   *out0 = a0;
275   *out1 = a1;
276 }
277 
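// The idct32_stage[4-9]_avx2() helpers below implement the later stages of the
// 32-point inverse DCT butterfly network; they are shared by the
// idct32_low8_avx2() and idct32_low16_avx2() paths further down.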
278 static inline void idct32_stage4_avx2(
279     __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56,
280     const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40,
281     const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24,
282     const __m256i *rounding, int bit) {
283   __m256i temp1, temp2;
284   temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
285   bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
286   bf1[17] = temp1;
287 
288   temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
289   bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
290   bf1[18] = temp2;
291 
292   temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
293   bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
294   bf1[21] = temp1;
295 
296   temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
297   bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
298   bf1[22] = temp2;
299 }
300 
301 static inline void idct32_stage5_avx2(
302     __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48,
303     const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo,
304     const __m256i *clamp_hi, const __m256i *rounding, int bit) {
305   __m256i temp1, temp2;
306   temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
307   bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
308   bf1[9] = temp1;
309 
310   temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
311   bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
312   bf1[10] = temp2;
313 
314   addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
315   addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
316   addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
317   addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
318   addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
319   addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
320   addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
321   addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
322 }
323 
324 static inline void idct32_stage6_avx2(
325     __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32,
326     const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
327     const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
328     const __m256i *rounding, int bit) {
329   __m256i temp1, temp2;
330   temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
331   bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
332   bf1[5] = temp1;
333 
334   addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
335   addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
336   addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
337   addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
338 
339   temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
340   bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
341   bf1[18] = temp1;
342   temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
343   bf1[28] = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
344   bf1[19] = temp2;
345   temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
346   bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
347   bf1[20] = temp1;
348   temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
349   bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
350   bf1[21] = temp2;
351 }
352 
353 static inline void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32,
354                                       const __m256i *cospi32,
355                                       const __m256i *clamp_lo,
356                                       const __m256i *clamp_hi,
357                                       const __m256i *rounding, int bit) {
358   __m256i temp1, temp2;
359   addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
360   addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
361   addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
362   addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
363 
364   temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
365   bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
366   bf1[10] = temp1;
367   temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
368   bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
369   bf1[11] = temp2;
370 
371   addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
372   addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
373   addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
374   addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
375   addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
376   addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
377   addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
378   addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
379 }
380 
381 static inline void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32,
382                                       const __m256i *cospi32,
383                                       const __m256i *clamp_lo,
384                                       const __m256i *clamp_hi,
385                                       const __m256i *rounding, int bit) {
386   __m256i temp1, temp2;
387   addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
388   addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
389   addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
390   addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
391   addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
392   addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
393   addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
394   addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
395 
396   temp1 = half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
397   bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
398   bf1[20] = temp1;
399   temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
400   bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
401   bf1[21] = temp2;
402   temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
403   bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
404   bf1[22] = temp1;
405   temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
406   bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
407   bf1[23] = temp2;
408 }
409 
410 static inline void idct32_stage9_avx2(__m256i *bf1, __m256i *out,
411                                       const int do_cols, const int bd,
412                                       const int out_shift,
413                                       const __m256i *clamp_lo,
414                                       const __m256i *clamp_hi) {
415   addsub_avx2(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi);
416   addsub_avx2(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi);
417   addsub_avx2(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi);
418   addsub_avx2(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi);
419   addsub_avx2(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi);
420   addsub_avx2(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi);
421   addsub_avx2(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi);
422   addsub_avx2(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi);
423   addsub_avx2(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi);
424   addsub_avx2(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi);
425   addsub_avx2(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi);
426   addsub_avx2(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi);
427   addsub_avx2(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi);
428   addsub_avx2(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi);
429   addsub_avx2(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi);
430   addsub_avx2(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi);
431   if (!do_cols) {
432     const int log_range_out = AOMMAX(16, bd + 6);
433     const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
434     const __m256i clamp_hi_out =
435         _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
436     round_shift_8x8_avx2(out, out_shift);
437     round_shift_8x8_avx2(out + 16, out_shift);
438     highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
439   }
440 }
441 
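// Fast path for when only the DC coefficient (in[0]) is nonzero: the 32-point
// inverse DCT collapses to a single cospi32 scaling, replicated to all 32
// output registers.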
442 static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
443                              int bd, int out_shift) {
444   const int32_t *cospi = cospi_arr(bit);
445   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
446   const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
447   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
448   __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
449   __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
450   __m256i x;
451   // stage 0
452   // stage 1
453   // stage 2
454   // stage 3
455   // stage 4
456   // stage 5
457   x = _mm256_mullo_epi32(in[0], cospi32);
458   x = _mm256_add_epi32(x, rounding);
459   x = _mm256_srai_epi32(x, bit);
460 
461   // stage 6
462   // stage 7
463   // stage 8
464   // stage 9
465   if (!do_cols) {
466     const int log_range_out = AOMMAX(16, bd + 6);
467     __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
468     clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
469     clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
470     x = _mm256_add_epi32(offset, x);
471     x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
472   }
473   x = _mm256_max_epi32(x, clamp_lo);
474   x = _mm256_min_epi32(x, clamp_hi);
475   out[0] = x;
476   out[1] = x;
477   out[2] = x;
478   out[3] = x;
479   out[4] = x;
480   out[5] = x;
481   out[6] = x;
482   out[7] = x;
483   out[8] = x;
484   out[9] = x;
485   out[10] = x;
486   out[11] = x;
487   out[12] = x;
488   out[13] = x;
489   out[14] = x;
490   out[15] = x;
491   out[16] = x;
492   out[17] = x;
493   out[18] = x;
494   out[19] = x;
495   out[20] = x;
496   out[21] = x;
497   out[22] = x;
498   out[23] = x;
499   out[24] = x;
500   out[25] = x;
501   out[26] = x;
502   out[27] = x;
503   out[28] = x;
504   out[29] = x;
505   out[30] = x;
506   out[31] = x;
507 }
508 
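// Path for when only the first 8 input rows carry nonzero coefficients; the
// remaining butterfly inputs are implicitly zero, so the early stages use the
// single-input half_btf_0_avx2() form where the second input is known to be
// zero.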
509 static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
510                              int bd, int out_shift) {
511   const int32_t *cospi = cospi_arr(bit);
512   const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
513   const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
514   const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
515   const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
516   const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
517   const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
518   const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
519   const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
520   const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
521   const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
522   const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
523   const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
524   const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
525   const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
526   const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
527   const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
528   const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
529   const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
530   const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
531   const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
532   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
533   const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
534   const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
535   const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
536   const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
537   const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
538   const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
539   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
540   const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
541   const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
542   __m256i bf1[32];
543 
544   {
545     // stage 0
546     // stage 1
547     bf1[0] = in[0];
548     bf1[4] = in[4];
549     bf1[8] = in[2];
550     bf1[12] = in[6];
551     bf1[16] = in[1];
552     bf1[20] = in[5];
553     bf1[24] = in[3];
554     bf1[28] = in[7];
555 
556     // stage 2
557     bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
558     bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
559     bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
560     bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
561     bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
562     bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
563     bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
564     bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
565 
566     // stage 3
567     bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
568     bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
569 
570     bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
571     bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
572     bf1[17] = bf1[16];
573     bf1[18] = bf1[19];
574     bf1[21] = bf1[20];
575     bf1[22] = bf1[23];
576     bf1[25] = bf1[24];
577     bf1[26] = bf1[27];
578     bf1[29] = bf1[28];
579     bf1[30] = bf1[31];
580 
581     // stage 4
582     bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
583     bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
584 
585     bf1[9] = bf1[8];
586     bf1[10] = bf1[11];
587     bf1[13] = bf1[12];
588     bf1[14] = bf1[15];
589 
590     idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
591                        &cospi24, &cospi40, &cospim24, &rounding, bit);
592 
593     // stage 5
594     bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
595     bf1[1] = bf1[0];
596     bf1[5] = bf1[4];
597     bf1[6] = bf1[7];
598 
599     idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
600                        &clamp_hi, &rounding, bit);
601 
602     // stage 6
603     bf1[3] = bf1[0];
604     bf1[2] = bf1[1];
605 
606     idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
607                        &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
608 
609     // stage 7
610     idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
611                        &rounding, bit);
612 
613     // stage 8
614     idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
615                        &rounding, bit);
616 
617     // stage 9
618     idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
619   }
620 }
621 
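// Path for when only the first 16 input rows carry nonzero coefficients.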
622 static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
623                               int bd, int out_shift) {
624   const int32_t *cospi = cospi_arr(bit);
625   const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
626   const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
627   const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
628   const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
629   const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
630   const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
631   const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
632   const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
633   const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
634   const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
635   const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
636   const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
637   const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
638   const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
639   const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
640   const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
641   const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
642   const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
643   const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
644   const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
645   const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
646   const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
647   const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
648   const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
649   const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
650   const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
651   const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
652   const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
653   const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
654   const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
655   const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
656   const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
657   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
658   const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
659   const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
660   const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
661   const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
662   const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
663   const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
664   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
665   const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
666   const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
667   __m256i bf1[32];
668 
669   {
670     // stage 0
671     // stage 1
672     bf1[0] = in[0];
673     bf1[2] = in[8];
674     bf1[4] = in[4];
675     bf1[6] = in[12];
676     bf1[8] = in[2];
677     bf1[10] = in[10];
678     bf1[12] = in[6];
679     bf1[14] = in[14];
680     bf1[16] = in[1];
681     bf1[18] = in[9];
682     bf1[20] = in[5];
683     bf1[22] = in[13];
684     bf1[24] = in[3];
685     bf1[26] = in[11];
686     bf1[28] = in[7];
687     bf1[30] = in[15];
688 
689     // stage 2
690     bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
691     bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
692     bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit);
693     bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit);
694     bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit);
695     bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit);
696     bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
697     bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
698     bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
699     bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
700     bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit);
701     bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit);
702     bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit);
703     bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit);
704     bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
705     bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
706 
707     // stage 3
708     bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
709     bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
710     bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit);
711     bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit);
712     bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit);
713     bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit);
714     bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
715     bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
716 
717     addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
718     addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
719     addsub_avx2(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
720     addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
721     addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
722     addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
723     addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
724     addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
725 
726     // stage 4
727     bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
728     bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
729     bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit);
730     bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit);
731 
732     addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
733     addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
734     addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
735     addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
736 
737     idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
738                        &cospi24, &cospi40, &cospim24, &rounding, bit);
739 
740     // stage 5
741     bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
742     bf1[1] = bf1[0];
743     bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit);
744     bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit);
745 
746     addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
747     addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
748 
749     idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
750                        &clamp_hi, &rounding, bit);
751 
752     // stage 6
753     addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
754     addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
755 
756     idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
757                        &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
758 
759     // stage 7
760     idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
761                        &rounding, bit);
762 
763     // stage 8
764     idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
765                        &rounding, bit);
766 
767     // stage 9
768     idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
769   }
770 }
771 
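// Full 32-point inverse DCT over all 32 input rows (the stages are written out
// inline rather than via the stage helpers above).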
772 static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
773                         int out_shift) {
774   const int32_t *cospi = cospi_arr(bit);
775   const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
776   const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
777   const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
778   const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
779   const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
780   const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
781   const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
782   const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
783   const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
784   const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
785   const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
786   const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
787   const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
788   const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
789   const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
790   const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
791   const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
792   const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
793   const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
794   const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
795   const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
796   const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
797   const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
798   const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
799   const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
800   const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
801   const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
802   const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
803   const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
804   const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
805   const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
806   const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
807   const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
808   const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
809   const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
810   const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
811   const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
812   const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
813   const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
814   const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
815   const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
816   const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
817   const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
818   const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
819   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
820   const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
821   const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
822   const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
823   const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
824   const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
825   const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
826   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
827   const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
828   const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
829   __m256i bf1[32], bf0[32];
830 
831   {
832     // stage 0
833     // stage 1
834     bf1[0] = in[0];
835     bf1[1] = in[16];
836     bf1[2] = in[8];
837     bf1[3] = in[24];
838     bf1[4] = in[4];
839     bf1[5] = in[20];
840     bf1[6] = in[12];
841     bf1[7] = in[28];
842     bf1[8] = in[2];
843     bf1[9] = in[18];
844     bf1[10] = in[10];
845     bf1[11] = in[26];
846     bf1[12] = in[6];
847     bf1[13] = in[22];
848     bf1[14] = in[14];
849     bf1[15] = in[30];
850     bf1[16] = in[1];
851     bf1[17] = in[17];
852     bf1[18] = in[9];
853     bf1[19] = in[25];
854     bf1[20] = in[5];
855     bf1[21] = in[21];
856     bf1[22] = in[13];
857     bf1[23] = in[29];
858     bf1[24] = in[3];
859     bf1[25] = in[19];
860     bf1[26] = in[11];
861     bf1[27] = in[27];
862     bf1[28] = in[7];
863     bf1[29] = in[23];
864     bf1[30] = in[15];
865     bf1[31] = in[31];
866 
867     // stage 2
868     bf0[0] = bf1[0];
869     bf0[1] = bf1[1];
870     bf0[2] = bf1[2];
871     bf0[3] = bf1[3];
872     bf0[4] = bf1[4];
873     bf0[5] = bf1[5];
874     bf0[6] = bf1[6];
875     bf0[7] = bf1[7];
876     bf0[8] = bf1[8];
877     bf0[9] = bf1[9];
878     bf0[10] = bf1[10];
879     bf0[11] = bf1[11];
880     bf0[12] = bf1[12];
881     bf0[13] = bf1[13];
882     bf0[14] = bf1[14];
883     bf0[15] = bf1[15];
884     bf0[16] =
885         half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
886     bf0[17] =
887         half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
888     bf0[18] =
889         half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
890     bf0[19] =
891         half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
892     bf0[20] =
893         half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
894     bf0[21] =
895         half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
896     bf0[22] =
897         half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
898     bf0[23] =
899         half_btf_avx2(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
900     bf0[24] =
901         half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
902     bf0[25] =
903         half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
904     bf0[26] =
905         half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
906     bf0[27] =
907         half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
908     bf0[28] =
909         half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
910     bf0[29] =
911         half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
912     bf0[30] =
913         half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
914     bf0[31] =
915         half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
916 
917     // stage 3
918     bf1[0] = bf0[0];
919     bf1[1] = bf0[1];
920     bf1[2] = bf0[2];
921     bf1[3] = bf0[3];
922     bf1[4] = bf0[4];
923     bf1[5] = bf0[5];
924     bf1[6] = bf0[6];
925     bf1[7] = bf0[7];
926     bf1[8] =
927         half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
928     bf1[9] =
929         half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
930     bf1[10] =
931         half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
932     bf1[11] =
933         half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
934     bf1[12] =
935         half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
936     bf1[13] =
937         half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
938     bf1[14] =
939         half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
940     bf1[15] =
941         half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
942 
943     addsub_avx2(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
944     addsub_avx2(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
945     addsub_avx2(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
946     addsub_avx2(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
947     addsub_avx2(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
948     addsub_avx2(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
949     addsub_avx2(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
950     addsub_avx2(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
951 
952     // stage 4
953     bf0[0] = bf1[0];
954     bf0[1] = bf1[1];
955     bf0[2] = bf1[2];
956     bf0[3] = bf1[3];
957     bf0[4] =
958         half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
959     bf0[5] =
960         half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
961     bf0[6] =
962         half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
963     bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
964 
965     addsub_avx2(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
966     addsub_avx2(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
967     addsub_avx2(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
968     addsub_avx2(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
969 
970     bf0[16] = bf1[16];
971     bf0[17] =
972         half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
973     bf0[18] =
974         half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
975     bf0[19] = bf1[19];
976     bf0[20] = bf1[20];
977     bf0[21] =
978         half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
979     bf0[22] =
980         half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
981     bf0[23] = bf1[23];
982     bf0[24] = bf1[24];
983     bf0[25] =
984         half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
985     bf0[26] =
986         half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
987     bf0[27] = bf1[27];
988     bf0[28] = bf1[28];
989     bf0[29] =
990         half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
991     bf0[30] =
992         half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
993     bf0[31] = bf1[31];
994 
995     // stage 5
996     bf1[0] =
997         half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
998     bf1[1] =
999         half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
1000     bf1[2] =
1001         half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
1002     bf1[3] =
1003         half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
1004     addsub_avx2(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
1005     addsub_avx2(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
1006     bf1[8] = bf0[8];
1007     bf1[9] =
1008         half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
1009     bf1[10] =
1010         half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
1011     bf1[11] = bf0[11];
1012     bf1[12] = bf0[12];
1013     bf1[13] =
1014         half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
1015     bf1[14] =
1016         half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
1017     bf1[15] = bf0[15];
1018     addsub_avx2(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
1019     addsub_avx2(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
1020     addsub_avx2(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
1021     addsub_avx2(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
1022     addsub_avx2(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
1023     addsub_avx2(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
1024     addsub_avx2(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
1025     addsub_avx2(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
1026 
1027     // stage 6
1028     addsub_avx2(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
1029     addsub_avx2(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
1030     bf0[4] = bf1[4];
1031     bf0[5] =
1032         half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
1033     bf0[6] =
1034         half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
1035     bf0[7] = bf1[7];
1036     addsub_avx2(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
1037     addsub_avx2(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
1038     addsub_avx2(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
1039     addsub_avx2(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
1040     bf0[16] = bf1[16];
1041     bf0[17] = bf1[17];
1042     bf0[18] =
1043         half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
1044     bf0[19] =
1045         half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
1046     bf0[20] =
1047         half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
1048     bf0[21] =
1049         half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
1050     bf0[22] = bf1[22];
1051     bf0[23] = bf1[23];
1052     bf0[24] = bf1[24];
1053     bf0[25] = bf1[25];
1054     bf0[26] =
1055         half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
1056     bf0[27] =
1057         half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
1058     bf0[28] =
1059         half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
1060     bf0[29] =
1061         half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
1062     bf0[30] = bf1[30];
1063     bf0[31] = bf1[31];
1064 
1065     // stage 7
1066     addsub_avx2(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
1067     addsub_avx2(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
1068     addsub_avx2(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
1069     addsub_avx2(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
1070     bf1[8] = bf0[8];
1071     bf1[9] = bf0[9];
1072     bf1[10] =
1073         half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
1074     bf1[11] =
1075         half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
1076     bf1[12] =
1077         half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
1078     bf1[13] =
1079         half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
1080     bf1[14] = bf0[14];
1081     bf1[15] = bf0[15];
1082     addsub_avx2(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
1083     addsub_avx2(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
1084     addsub_avx2(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
1085     addsub_avx2(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
1086     addsub_avx2(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
1087     addsub_avx2(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
1088     addsub_avx2(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
1089     addsub_avx2(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
1090 
1091     // stage 8
1092     addsub_avx2(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
1093     addsub_avx2(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
1094     addsub_avx2(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
1095     addsub_avx2(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
1096     addsub_avx2(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
1097     addsub_avx2(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
1098     addsub_avx2(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
1099     addsub_avx2(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
1100     bf0[16] = bf1[16];
1101     bf0[17] = bf1[17];
1102     bf0[18] = bf1[18];
1103     bf0[19] = bf1[19];
1104     bf0[20] =
1105         half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
1106     bf0[21] =
1107         half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
1108     bf0[22] =
1109         half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
1110     bf0[23] =
1111         half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
1112     bf0[24] =
1113         half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
1114     bf0[25] =
1115         half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
1116     bf0[26] =
1117         half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
1118     bf0[27] =
1119         half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
1120     bf0[28] = bf1[28];
1121     bf0[29] = bf1[29];
1122     bf0[30] = bf1[30];
1123     bf0[31] = bf1[31];
1124 
1125     // stage 9
1126     addsub_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
1127     addsub_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
1128     addsub_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
1129     addsub_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
1130     addsub_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
1131     addsub_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
1132     addsub_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
1133     addsub_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
1134     addsub_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
1135     addsub_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
1136     addsub_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
1137     addsub_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
1138     addsub_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
1139     addsub_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
1140     addsub_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
1141     addsub_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
1142     if (!do_cols) {
1143       const int log_range_out = AOMMAX(16, bd + 6);
1144       const __m256i clamp_lo_out =
1145           _mm256_set1_epi32(-(1 << (log_range_out - 1)));
1146       const __m256i clamp_hi_out =
1147           _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
1148       round_shift_8x8_avx2(out, out_shift);
1149       round_shift_8x8_avx2(out + 16, out_shift);
1150       highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
1151     }
1152   }
1153 }
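// 16-point inverse DCT fast path: only the DC coefficient is nonzero, so the
// result is a single cospi32 scaling replicated to all 16 output registers.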
1154 static void idct16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
1155                              int bd, int out_shift) {
1156   const int32_t *cospi = cospi_arr(bit);
1157   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1158   const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1159   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1160   __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
1161   __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
1162 
1163   {
1164     // stage 0
1165     // stage 1
1166     // stage 2
1167     // stage 3
1168     // stage 4
1169     in[0] = _mm256_mullo_epi32(in[0], cospi32);
1170     in[0] = _mm256_add_epi32(in[0], rnding);
1171     in[0] = _mm256_srai_epi32(in[0], bit);
1172 
1173     // stage 5
1174     // stage 6
1175     // stage 7
1176     if (!do_cols) {
1177       const int log_range_out = AOMMAX(16, bd + 6);
1178       clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
1179       clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
1180       __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
1181       in[0] = _mm256_add_epi32(in[0], offset);
1182       in[0] = _mm256_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
1183     }
1184     in[0] = _mm256_max_epi32(in[0], clamp_lo);
1185     in[0] = _mm256_min_epi32(in[0], clamp_hi);
1186     out[0] = in[0];
1187     out[1] = in[0];
1188     out[2] = in[0];
1189     out[3] = in[0];
1190     out[4] = in[0];
1191     out[5] = in[0];
1192     out[6] = in[0];
1193     out[7] = in[0];
1194     out[8] = in[0];
1195     out[9] = in[0];
1196     out[10] = in[0];
1197     out[11] = in[0];
1198     out[12] = in[0];
1199     out[13] = in[0];
1200     out[14] = in[0];
1201     out[15] = in[0];
1202   }
1203 }
1204 
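// 16-point inverse DCT that reads only the first 8 input coefficients
// (in[0]..in[7]); the remaining inputs are assumed to be zero, so the early
// butterfly stages collapse into single-term half_btf_0_avx2 multiplies.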
1205 static void idct16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
1206                              int bd, int out_shift) {
1207   const int32_t *cospi = cospi_arr(bit);
1208   const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
1209   const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
1210   const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
1211   const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
1212   const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
1213   const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
1214   const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1215   const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
1216   const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
1217   const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1218   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1219   const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1220   const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1221   const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
1222   const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
1223   const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
1224   const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
1225   const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1226   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1227   const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
1228   const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
1229   __m256i u[16], x, y;
1230 
1231   {
1232     // stage 0
1233     // stage 1
1234     u[0] = in[0];
1235     u[2] = in[4];
1236     u[4] = in[2];
1237     u[6] = in[6];
1238     u[8] = in[1];
1239     u[10] = in[5];
1240     u[12] = in[3];
1241     u[14] = in[7];
1242 
1243     // stage 2
1244     u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
1245     u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
1246 
1247     u[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
1248     u[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
1249 
1250     u[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
1251     u[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
1252 
1253     u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
1254     u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
1255 
1256     // stage 3
1257     u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
1258     u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
1259     u[5] = half_btf_0_avx2(&cospim40, &u[6], &rnding, bit);
1260     u[6] = half_btf_0_avx2(&cospi24, &u[6], &rnding, bit);
1261 
1262     addsub_avx2(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
1263     addsub_avx2(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
1264     addsub_avx2(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
1265     addsub_avx2(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
1266 
1267     // stage 4
1268     x = _mm256_mullo_epi32(u[0], cospi32);
1269     u[0] = _mm256_add_epi32(x, rnding);
1270     u[0] = _mm256_srai_epi32(u[0], bit);
1271     u[1] = u[0];
1272 
1273     u[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
1274     u[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
1275 
1276     addsub_avx2(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
1277     addsub_avx2(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
1278 
1279     x = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
1280     u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
1281     u[9] = x;
1282     y = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
1283     u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
1284     u[10] = y;
1285 
1286     // stage 5
1287     addsub_avx2(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
1288     addsub_avx2(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
1289 
1290     x = _mm256_mullo_epi32(u[5], cospi32);
1291     y = _mm256_mullo_epi32(u[6], cospi32);
1292     u[5] = _mm256_sub_epi32(y, x);
1293     u[5] = _mm256_add_epi32(u[5], rnding);
1294     u[5] = _mm256_srai_epi32(u[5], bit);
1295 
1296     u[6] = _mm256_add_epi32(y, x);
1297     u[6] = _mm256_add_epi32(u[6], rnding);
1298     u[6] = _mm256_srai_epi32(u[6], bit);
1299 
1300     addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
1301     addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
1302     addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
1303     addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
1304 
1305     // stage 6
1306     addsub_avx2(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
1307     addsub_avx2(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
1308     addsub_avx2(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
1309     addsub_avx2(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
1310 
1311     x = _mm256_mullo_epi32(u[10], cospi32);
1312     y = _mm256_mullo_epi32(u[13], cospi32);
1313     u[10] = _mm256_sub_epi32(y, x);
1314     u[10] = _mm256_add_epi32(u[10], rnding);
1315     u[10] = _mm256_srai_epi32(u[10], bit);
1316 
1317     u[13] = _mm256_add_epi32(x, y);
1318     u[13] = _mm256_add_epi32(u[13], rnding);
1319     u[13] = _mm256_srai_epi32(u[13], bit);
1320 
1321     x = _mm256_mullo_epi32(u[11], cospi32);
1322     y = _mm256_mullo_epi32(u[12], cospi32);
1323     u[11] = _mm256_sub_epi32(y, x);
1324     u[11] = _mm256_add_epi32(u[11], rnding);
1325     u[11] = _mm256_srai_epi32(u[11], bit);
1326 
1327     u[12] = _mm256_add_epi32(x, y);
1328     u[12] = _mm256_add_epi32(u[12], rnding);
1329     u[12] = _mm256_srai_epi32(u[12], bit);
1330     // stage 7
1331     addsub_avx2(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
1332     addsub_avx2(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
1333     addsub_avx2(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
1334     addsub_avx2(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
1335     addsub_avx2(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
1336     addsub_avx2(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
1337     addsub_avx2(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
1338     addsub_avx2(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
1339 
1340     if (!do_cols) {
1341       const int log_range_out = AOMMAX(16, bd + 6);
1342       const __m256i clamp_lo_out =
1343           _mm256_set1_epi32(-(1 << (log_range_out - 1)));
1344       const __m256i clamp_hi_out =
1345           _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
1346       round_shift_8x8_avx2(out, out_shift);
1347       highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
1348     }
1349   }
1350 }
1351 
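// Full 16-point inverse DCT; each __m256i register carries 8 columns of
// 32-bit coefficients.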
1352 static void idct16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
1353                         int out_shift) {
1354   const int32_t *cospi = cospi_arr(bit);
1355   const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
1356   const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
1357   const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
1358   const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
1359   const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
1360   const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
1361   const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
1362   const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
1363   const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
1364   const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
1365   const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
1366   const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
1367   const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1368   const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
1369   const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
1370   const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
1371   const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
1372   const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1373   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1374   const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1375   const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1376   const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
1377   const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
1378   const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1379   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1380   const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
1381   const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
1382   __m256i u[16], v[16], x, y;
1383 
1384   {
1385     // stage 0
1386     // stage 1
1387     u[0] = in[0];
1388     u[1] = in[8];
1389     u[2] = in[4];
1390     u[3] = in[12];
1391     u[4] = in[2];
1392     u[5] = in[10];
1393     u[6] = in[6];
1394     u[7] = in[14];
1395     u[8] = in[1];
1396     u[9] = in[9];
1397     u[10] = in[5];
1398     u[11] = in[13];
1399     u[12] = in[3];
1400     u[13] = in[11];
1401     u[14] = in[7];
1402     u[15] = in[15];
1403 
1404     // stage 2
1405     v[0] = u[0];
1406     v[1] = u[1];
1407     v[2] = u[2];
1408     v[3] = u[3];
1409     v[4] = u[4];
1410     v[5] = u[5];
1411     v[6] = u[6];
1412     v[7] = u[7];
1413 
1414     v[8] = half_btf_avx2(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
1415     v[9] = half_btf_avx2(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
1416     v[10] = half_btf_avx2(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
1417     v[11] = half_btf_avx2(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
1418     v[12] = half_btf_avx2(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
1419     v[13] = half_btf_avx2(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
1420     v[14] = half_btf_avx2(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
1421     v[15] = half_btf_avx2(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
1422 
1423     // stage 3
1424     u[0] = v[0];
1425     u[1] = v[1];
1426     u[2] = v[2];
1427     u[3] = v[3];
1428     u[4] = half_btf_avx2(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
1429     u[5] = half_btf_avx2(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
1430     u[6] = half_btf_avx2(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
1431     u[7] = half_btf_avx2(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
1432     addsub_avx2(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
1433     addsub_avx2(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
1434     addsub_avx2(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
1435     addsub_avx2(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
1436 
1437     // stage 4
1438     x = _mm256_mullo_epi32(u[0], cospi32);
1439     y = _mm256_mullo_epi32(u[1], cospi32);
1440     v[0] = _mm256_add_epi32(x, y);
1441     v[0] = _mm256_add_epi32(v[0], rnding);
1442     v[0] = _mm256_srai_epi32(v[0], bit);
1443 
1444     v[1] = _mm256_sub_epi32(x, y);
1445     v[1] = _mm256_add_epi32(v[1], rnding);
1446     v[1] = _mm256_srai_epi32(v[1], bit);
1447 
1448     v[2] = half_btf_avx2(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
1449     v[3] = half_btf_avx2(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
1450     addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
1451     addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
1452     v[8] = u[8];
1453     v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
1454     v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
1455     v[11] = u[11];
1456     v[12] = u[12];
1457     v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
1458     v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
1459     v[15] = u[15];
1460 
1461     // stage 5
1462     addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
1463     addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
1464     u[4] = v[4];
1465 
1466     x = _mm256_mullo_epi32(v[5], cospi32);
1467     y = _mm256_mullo_epi32(v[6], cospi32);
1468     u[5] = _mm256_sub_epi32(y, x);
1469     u[5] = _mm256_add_epi32(u[5], rnding);
1470     u[5] = _mm256_srai_epi32(u[5], bit);
1471 
1472     u[6] = _mm256_add_epi32(y, x);
1473     u[6] = _mm256_add_epi32(u[6], rnding);
1474     u[6] = _mm256_srai_epi32(u[6], bit);
1475 
1476     u[7] = v[7];
1477     addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
1478     addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
1479     addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
1480     addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
1481 
1482     // stage 6
1483     addsub_avx2(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
1484     addsub_avx2(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
1485     addsub_avx2(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
1486     addsub_avx2(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
1487     v[8] = u[8];
1488     v[9] = u[9];
1489 
1490     x = _mm256_mullo_epi32(u[10], cospi32);
1491     y = _mm256_mullo_epi32(u[13], cospi32);
1492     v[10] = _mm256_sub_epi32(y, x);
1493     v[10] = _mm256_add_epi32(v[10], rnding);
1494     v[10] = _mm256_srai_epi32(v[10], bit);
1495 
1496     v[13] = _mm256_add_epi32(x, y);
1497     v[13] = _mm256_add_epi32(v[13], rnding);
1498     v[13] = _mm256_srai_epi32(v[13], bit);
1499 
1500     x = _mm256_mullo_epi32(u[11], cospi32);
1501     y = _mm256_mullo_epi32(u[12], cospi32);
1502     v[11] = _mm256_sub_epi32(y, x);
1503     v[11] = _mm256_add_epi32(v[11], rnding);
1504     v[11] = _mm256_srai_epi32(v[11], bit);
1505 
1506     v[12] = _mm256_add_epi32(x, y);
1507     v[12] = _mm256_add_epi32(v[12], rnding);
1508     v[12] = _mm256_srai_epi32(v[12], bit);
1509 
1510     v[14] = u[14];
1511     v[15] = u[15];
1512 
1513     // stage 7
1514     addsub_avx2(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
1515     addsub_avx2(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
1516     addsub_avx2(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
1517     addsub_avx2(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
1518     addsub_avx2(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
1519     addsub_avx2(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
1520     addsub_avx2(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
1521     addsub_avx2(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
1522 
1523     if (!do_cols) {
1524       const int log_range_out = AOMMAX(16, bd + 6);
1525       const __m256i clamp_lo_out =
1526           _mm256_set1_epi32(-(1 << (log_range_out - 1)));
1527       const __m256i clamp_hi_out =
1528           _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
1529       round_shift_8x8_avx2(out, out_shift);
1530       highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
1531     }
1532   }
1533 }
1534 
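// 16-point inverse ADST for the case where only in[0] is nonzero; butterfly
// outputs that would be zero are skipped and surviving terms are copied
// forward between stages instead of being recomputed.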
1535 static void iadst16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
1536                               int bd, int out_shift) {
1537   const int32_t *cospi = cospi_arr(bit);
1538   const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
1539   const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
1540   const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1541   const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1542   const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1543   const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1544   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1545   const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1546   const __m256i zero = _mm256_setzero_si256();
1547   __m256i v[16], x, y, temp1, temp2;
1548 
1549   // Process 8 columns at a time (one __m256i register per row of coefficients)
1550   {
1551     // stage 0
1552     // stage 1
1553     // stage 2
1554     x = _mm256_mullo_epi32(in[0], cospi62);
1555     v[0] = _mm256_add_epi32(x, rnding);
1556     v[0] = _mm256_srai_epi32(v[0], bit);
1557 
1558     x = _mm256_mullo_epi32(in[0], cospi2);
1559     v[1] = _mm256_sub_epi32(zero, x);
1560     v[1] = _mm256_add_epi32(v[1], rnding);
1561     v[1] = _mm256_srai_epi32(v[1], bit);
1562 
1563     // stage 3
1564     v[8] = v[0];
1565     v[9] = v[1];
1566 
1567     // stage 4
1568     temp1 = _mm256_mullo_epi32(v[8], cospi8);
1569     x = _mm256_mullo_epi32(v[9], cospi56);
1570     temp1 = _mm256_add_epi32(temp1, x);
1571     temp1 = _mm256_add_epi32(temp1, rnding);
1572     temp1 = _mm256_srai_epi32(temp1, bit);
1573 
1574     temp2 = _mm256_mullo_epi32(v[8], cospi56);
1575     x = _mm256_mullo_epi32(v[9], cospi8);
1576     temp2 = _mm256_sub_epi32(temp2, x);
1577     temp2 = _mm256_add_epi32(temp2, rnding);
1578     temp2 = _mm256_srai_epi32(temp2, bit);
1579     v[8] = temp1;
1580     v[9] = temp2;
1581 
1582     // stage 5
1583     v[4] = v[0];
1584     v[5] = v[1];
1585     v[12] = v[8];
1586     v[13] = v[9];
1587 
1588     // stage 6
1589     temp1 = _mm256_mullo_epi32(v[4], cospi16);
1590     x = _mm256_mullo_epi32(v[5], cospi48);
1591     temp1 = _mm256_add_epi32(temp1, x);
1592     temp1 = _mm256_add_epi32(temp1, rnding);
1593     temp1 = _mm256_srai_epi32(temp1, bit);
1594 
1595     temp2 = _mm256_mullo_epi32(v[4], cospi48);
1596     x = _mm256_mullo_epi32(v[5], cospi16);
1597     temp2 = _mm256_sub_epi32(temp2, x);
1598     temp2 = _mm256_add_epi32(temp2, rnding);
1599     temp2 = _mm256_srai_epi32(temp2, bit);
1600     v[4] = temp1;
1601     v[5] = temp2;
1602 
1603     temp1 = _mm256_mullo_epi32(v[12], cospi16);
1604     x = _mm256_mullo_epi32(v[13], cospi48);
1605     temp1 = _mm256_add_epi32(temp1, x);
1606     temp1 = _mm256_add_epi32(temp1, rnding);
1607     temp1 = _mm256_srai_epi32(temp1, bit);
1608 
1609     temp2 = _mm256_mullo_epi32(v[12], cospi48);
1610     x = _mm256_mullo_epi32(v[13], cospi16);
1611     temp2 = _mm256_sub_epi32(temp2, x);
1612     temp2 = _mm256_add_epi32(temp2, rnding);
1613     temp2 = _mm256_srai_epi32(temp2, bit);
1614     v[12] = temp1;
1615     v[13] = temp2;
1616 
1617     // stage 7
1618     v[2] = v[0];
1619     v[3] = v[1];
1620     v[6] = v[4];
1621     v[7] = v[5];
1622     v[10] = v[8];
1623     v[11] = v[9];
1624     v[14] = v[12];
1625     v[15] = v[13];
1626 
1627     // stage 8
1628     y = _mm256_mullo_epi32(v[2], cospi32);
1629     x = _mm256_mullo_epi32(v[3], cospi32);
1630     v[2] = _mm256_add_epi32(y, x);
1631     v[2] = _mm256_add_epi32(v[2], rnding);
1632     v[2] = _mm256_srai_epi32(v[2], bit);
1633 
1634     v[3] = _mm256_sub_epi32(y, x);
1635     v[3] = _mm256_add_epi32(v[3], rnding);
1636     v[3] = _mm256_srai_epi32(v[3], bit);
1637 
1638     y = _mm256_mullo_epi32(v[6], cospi32);
1639     x = _mm256_mullo_epi32(v[7], cospi32);
1640     v[6] = _mm256_add_epi32(y, x);
1641     v[6] = _mm256_add_epi32(v[6], rnding);
1642     v[6] = _mm256_srai_epi32(v[6], bit);
1643 
1644     v[7] = _mm256_sub_epi32(y, x);
1645     v[7] = _mm256_add_epi32(v[7], rnding);
1646     v[7] = _mm256_srai_epi32(v[7], bit);
1647 
1648     y = _mm256_mullo_epi32(v[10], cospi32);
1649     x = _mm256_mullo_epi32(v[11], cospi32);
1650     v[10] = _mm256_add_epi32(y, x);
1651     v[10] = _mm256_add_epi32(v[10], rnding);
1652     v[10] = _mm256_srai_epi32(v[10], bit);
1653 
1654     v[11] = _mm256_sub_epi32(y, x);
1655     v[11] = _mm256_add_epi32(v[11], rnding);
1656     v[11] = _mm256_srai_epi32(v[11], bit);
1657 
1658     y = _mm256_mullo_epi32(v[14], cospi32);
1659     x = _mm256_mullo_epi32(v[15], cospi32);
1660     v[14] = _mm256_add_epi32(y, x);
1661     v[14] = _mm256_add_epi32(v[14], rnding);
1662     v[14] = _mm256_srai_epi32(v[14], bit);
1663 
1664     v[15] = _mm256_sub_epi32(y, x);
1665     v[15] = _mm256_add_epi32(v[15], rnding);
1666     v[15] = _mm256_srai_epi32(v[15], bit);
1667 
1668     // stage 9
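    // The final ADST stage emits the outputs in a sign-alternating, permuted
    // order. For the row pass (!do_cols), neg_shift_avx2 folds the negation,
    // the out_shift rounding and the output-range clamp into a single step.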
1669     if (do_cols) {
1670       out[0] = v[0];
1671       out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
1672       out[2] = v[12];
1673       out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
1674       out[4] = v[6];
1675       out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
1676       out[6] = v[10];
1677       out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
1678       out[8] = v[3];
1679       out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
1680       out[10] = v[15];
1681       out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
1682       out[12] = v[5];
1683       out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
1684       out[14] = v[9];
1685       out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
1686     } else {
1687       const int log_range_out = AOMMAX(16, bd + 6);
1688       const __m256i clamp_lo_out =
1689           _mm256_set1_epi32(-(1 << (log_range_out - 1)));
1690       const __m256i clamp_hi_out =
1691           _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
1692 
1693       neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
1694                      out_shift);
1695       neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
1696                      &clamp_hi_out, out_shift);
1697       neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
1698                      &clamp_hi_out, out_shift);
1699       neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
1700                      &clamp_hi_out, out_shift);
1701       neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
1702                      &clamp_hi_out, out_shift);
1703       neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
1704                      &clamp_hi_out, out_shift);
1705       neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
1706                      &clamp_hi_out, out_shift);
1707       neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
1708                      &clamp_hi_out, out_shift);
1709     }
1710   }
1711 }
1712 
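// 16-point inverse ADST that reads only the first 8 input coefficients; with
// the upper half of the inputs taken to be zero, stage 2 reduces to single
// multiplies against the cospi constants.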
1713 static void iadst16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
1714                               int bd, int out_shift) {
1715   const int32_t *cospi = cospi_arr(bit);
1716   const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
1717   const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
1718   const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
1719   const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
1720   const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
1721   const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
1722   const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
1723   const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
1724   const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
1725   const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
1726   const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
1727   const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
1728   const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
1729   const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
1730   const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
1731   const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
1732   const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1733   const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1734   const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
1735   const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
1736   const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
1737   const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
1738   const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1739   const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1740   const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
1741   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1742   const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1743   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1744   const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
1745   const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
1746   __m256i u[16], x, y;
1747 
1748   {
1749     // stage 0
1750     // stage 1
1751     // stage 2
1752     __m256i zero = _mm256_setzero_si256();
1753     x = _mm256_mullo_epi32(in[0], cospi62);
1754     u[0] = _mm256_add_epi32(x, rnding);
1755     u[0] = _mm256_srai_epi32(u[0], bit);
1756 
1757     x = _mm256_mullo_epi32(in[0], cospi2);
1758     u[1] = _mm256_sub_epi32(zero, x);
1759     u[1] = _mm256_add_epi32(u[1], rnding);
1760     u[1] = _mm256_srai_epi32(u[1], bit);
1761 
1762     x = _mm256_mullo_epi32(in[2], cospi54);
1763     u[2] = _mm256_add_epi32(x, rnding);
1764     u[2] = _mm256_srai_epi32(u[2], bit);
1765 
1766     x = _mm256_mullo_epi32(in[2], cospi10);
1767     u[3] = _mm256_sub_epi32(zero, x);
1768     u[3] = _mm256_add_epi32(u[3], rnding);
1769     u[3] = _mm256_srai_epi32(u[3], bit);
1770 
1771     x = _mm256_mullo_epi32(in[4], cospi46);
1772     u[4] = _mm256_add_epi32(x, rnding);
1773     u[4] = _mm256_srai_epi32(u[4], bit);
1774 
1775     x = _mm256_mullo_epi32(in[4], cospi18);
1776     u[5] = _mm256_sub_epi32(zero, x);
1777     u[5] = _mm256_add_epi32(u[5], rnding);
1778     u[5] = _mm256_srai_epi32(u[5], bit);
1779 
1780     x = _mm256_mullo_epi32(in[6], cospi38);
1781     u[6] = _mm256_add_epi32(x, rnding);
1782     u[6] = _mm256_srai_epi32(u[6], bit);
1783 
1784     x = _mm256_mullo_epi32(in[6], cospi26);
1785     u[7] = _mm256_sub_epi32(zero, x);
1786     u[7] = _mm256_add_epi32(u[7], rnding);
1787     u[7] = _mm256_srai_epi32(u[7], bit);
1788 
1789     u[8] = _mm256_mullo_epi32(in[7], cospi34);
1790     u[8] = _mm256_add_epi32(u[8], rnding);
1791     u[8] = _mm256_srai_epi32(u[8], bit);
1792 
1793     u[9] = _mm256_mullo_epi32(in[7], cospi30);
1794     u[9] = _mm256_add_epi32(u[9], rnding);
1795     u[9] = _mm256_srai_epi32(u[9], bit);
1796 
1797     u[10] = _mm256_mullo_epi32(in[5], cospi42);
1798     u[10] = _mm256_add_epi32(u[10], rnding);
1799     u[10] = _mm256_srai_epi32(u[10], bit);
1800 
1801     u[11] = _mm256_mullo_epi32(in[5], cospi22);
1802     u[11] = _mm256_add_epi32(u[11], rnding);
1803     u[11] = _mm256_srai_epi32(u[11], bit);
1804 
1805     u[12] = _mm256_mullo_epi32(in[3], cospi50);
1806     u[12] = _mm256_add_epi32(u[12], rnding);
1807     u[12] = _mm256_srai_epi32(u[12], bit);
1808 
1809     u[13] = _mm256_mullo_epi32(in[3], cospi14);
1810     u[13] = _mm256_add_epi32(u[13], rnding);
1811     u[13] = _mm256_srai_epi32(u[13], bit);
1812 
1813     u[14] = _mm256_mullo_epi32(in[1], cospi58);
1814     u[14] = _mm256_add_epi32(u[14], rnding);
1815     u[14] = _mm256_srai_epi32(u[14], bit);
1816 
1817     u[15] = _mm256_mullo_epi32(in[1], cospi6);
1818     u[15] = _mm256_add_epi32(u[15], rnding);
1819     u[15] = _mm256_srai_epi32(u[15], bit);
1820 
1821     // stage 3
1822     addsub_avx2(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
1823     addsub_avx2(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
1824     addsub_avx2(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
1825     addsub_avx2(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
1826     addsub_avx2(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
1827     addsub_avx2(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
1828     addsub_avx2(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
1829     addsub_avx2(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
1830 
1831     // stage 4
1832     y = _mm256_mullo_epi32(u[8], cospi56);
1833     x = _mm256_mullo_epi32(u[9], cospi56);
1834     u[8] = _mm256_mullo_epi32(u[8], cospi8);
1835     u[8] = _mm256_add_epi32(u[8], x);
1836     u[8] = _mm256_add_epi32(u[8], rnding);
1837     u[8] = _mm256_srai_epi32(u[8], bit);
1838 
1839     x = _mm256_mullo_epi32(u[9], cospi8);
1840     u[9] = _mm256_sub_epi32(y, x);
1841     u[9] = _mm256_add_epi32(u[9], rnding);
1842     u[9] = _mm256_srai_epi32(u[9], bit);
1843 
1844     x = _mm256_mullo_epi32(u[11], cospi24);
1845     y = _mm256_mullo_epi32(u[10], cospi24);
1846     u[10] = _mm256_mullo_epi32(u[10], cospi40);
1847     u[10] = _mm256_add_epi32(u[10], x);
1848     u[10] = _mm256_add_epi32(u[10], rnding);
1849     u[10] = _mm256_srai_epi32(u[10], bit);
1850 
1851     x = _mm256_mullo_epi32(u[11], cospi40);
1852     u[11] = _mm256_sub_epi32(y, x);
1853     u[11] = _mm256_add_epi32(u[11], rnding);
1854     u[11] = _mm256_srai_epi32(u[11], bit);
1855 
1856     x = _mm256_mullo_epi32(u[13], cospi8);
1857     y = _mm256_mullo_epi32(u[12], cospi8);
1858     u[12] = _mm256_mullo_epi32(u[12], cospim56);
1859     u[12] = _mm256_add_epi32(u[12], x);
1860     u[12] = _mm256_add_epi32(u[12], rnding);
1861     u[12] = _mm256_srai_epi32(u[12], bit);
1862 
1863     x = _mm256_mullo_epi32(u[13], cospim56);
1864     u[13] = _mm256_sub_epi32(y, x);
1865     u[13] = _mm256_add_epi32(u[13], rnding);
1866     u[13] = _mm256_srai_epi32(u[13], bit);
1867 
1868     x = _mm256_mullo_epi32(u[15], cospi40);
1869     y = _mm256_mullo_epi32(u[14], cospi40);
1870     u[14] = _mm256_mullo_epi32(u[14], cospim24);
1871     u[14] = _mm256_add_epi32(u[14], x);
1872     u[14] = _mm256_add_epi32(u[14], rnding);
1873     u[14] = _mm256_srai_epi32(u[14], bit);
1874 
1875     x = _mm256_mullo_epi32(u[15], cospim24);
1876     u[15] = _mm256_sub_epi32(y, x);
1877     u[15] = _mm256_add_epi32(u[15], rnding);
1878     u[15] = _mm256_srai_epi32(u[15], bit);
1879 
1880     // stage 5
1881     addsub_avx2(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
1882     addsub_avx2(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
1883     addsub_avx2(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
1884     addsub_avx2(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
1885     addsub_avx2(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
1886     addsub_avx2(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
1887     addsub_avx2(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
1888     addsub_avx2(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
1889 
1890     // stage 6
1891     x = _mm256_mullo_epi32(u[5], cospi48);
1892     y = _mm256_mullo_epi32(u[4], cospi48);
1893     u[4] = _mm256_mullo_epi32(u[4], cospi16);
1894     u[4] = _mm256_add_epi32(u[4], x);
1895     u[4] = _mm256_add_epi32(u[4], rnding);
1896     u[4] = _mm256_srai_epi32(u[4], bit);
1897 
1898     x = _mm256_mullo_epi32(u[5], cospi16);
1899     u[5] = _mm256_sub_epi32(y, x);
1900     u[5] = _mm256_add_epi32(u[5], rnding);
1901     u[5] = _mm256_srai_epi32(u[5], bit);
1902 
1903     x = _mm256_mullo_epi32(u[7], cospi16);
1904     y = _mm256_mullo_epi32(u[6], cospi16);
1905     u[6] = _mm256_mullo_epi32(u[6], cospim48);
1906     u[6] = _mm256_add_epi32(u[6], x);
1907     u[6] = _mm256_add_epi32(u[6], rnding);
1908     u[6] = _mm256_srai_epi32(u[6], bit);
1909 
1910     x = _mm256_mullo_epi32(u[7], cospim48);
1911     u[7] = _mm256_sub_epi32(y, x);
1912     u[7] = _mm256_add_epi32(u[7], rnding);
1913     u[7] = _mm256_srai_epi32(u[7], bit);
1914 
1915     x = _mm256_mullo_epi32(u[13], cospi48);
1916     y = _mm256_mullo_epi32(u[12], cospi48);
1917     u[12] = _mm256_mullo_epi32(u[12], cospi16);
1918     u[12] = _mm256_add_epi32(u[12], x);
1919     u[12] = _mm256_add_epi32(u[12], rnding);
1920     u[12] = _mm256_srai_epi32(u[12], bit);
1921 
1922     x = _mm256_mullo_epi32(u[13], cospi16);
1923     u[13] = _mm256_sub_epi32(y, x);
1924     u[13] = _mm256_add_epi32(u[13], rnding);
1925     u[13] = _mm256_srai_epi32(u[13], bit);
1926 
1927     x = _mm256_mullo_epi32(u[15], cospi16);
1928     y = _mm256_mullo_epi32(u[14], cospi16);
1929     u[14] = _mm256_mullo_epi32(u[14], cospim48);
1930     u[14] = _mm256_add_epi32(u[14], x);
1931     u[14] = _mm256_add_epi32(u[14], rnding);
1932     u[14] = _mm256_srai_epi32(u[14], bit);
1933 
1934     x = _mm256_mullo_epi32(u[15], cospim48);
1935     u[15] = _mm256_sub_epi32(y, x);
1936     u[15] = _mm256_add_epi32(u[15], rnding);
1937     u[15] = _mm256_srai_epi32(u[15], bit);
1938 
1939     // stage 7
1940     addsub_avx2(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
1941     addsub_avx2(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
1942     addsub_avx2(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
1943     addsub_avx2(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
1944     addsub_avx2(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
1945     addsub_avx2(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
1946     addsub_avx2(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
1947     addsub_avx2(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
1948 
1949     // stage 8
1950     y = _mm256_mullo_epi32(u[2], cospi32);
1951     x = _mm256_mullo_epi32(u[3], cospi32);
1952     u[2] = _mm256_add_epi32(y, x);
1953     u[2] = _mm256_add_epi32(u[2], rnding);
1954     u[2] = _mm256_srai_epi32(u[2], bit);
1955 
1956     u[3] = _mm256_sub_epi32(y, x);
1957     u[3] = _mm256_add_epi32(u[3], rnding);
1958     u[3] = _mm256_srai_epi32(u[3], bit);
1959     y = _mm256_mullo_epi32(u[6], cospi32);
1960     x = _mm256_mullo_epi32(u[7], cospi32);
1961     u[6] = _mm256_add_epi32(y, x);
1962     u[6] = _mm256_add_epi32(u[6], rnding);
1963     u[6] = _mm256_srai_epi32(u[6], bit);
1964 
1965     u[7] = _mm256_sub_epi32(y, x);
1966     u[7] = _mm256_add_epi32(u[7], rnding);
1967     u[7] = _mm256_srai_epi32(u[7], bit);
1968 
1969     y = _mm256_mullo_epi32(u[10], cospi32);
1970     x = _mm256_mullo_epi32(u[11], cospi32);
1971     u[10] = _mm256_add_epi32(y, x);
1972     u[10] = _mm256_add_epi32(u[10], rnding);
1973     u[10] = _mm256_srai_epi32(u[10], bit);
1974 
1975     u[11] = _mm256_sub_epi32(y, x);
1976     u[11] = _mm256_add_epi32(u[11], rnding);
1977     u[11] = _mm256_srai_epi32(u[11], bit);
1978 
1979     y = _mm256_mullo_epi32(u[14], cospi32);
1980     x = _mm256_mullo_epi32(u[15], cospi32);
1981     u[14] = _mm256_add_epi32(y, x);
1982     u[14] = _mm256_add_epi32(u[14], rnding);
1983     u[14] = _mm256_srai_epi32(u[14], bit);
1984 
1985     u[15] = _mm256_sub_epi32(y, x);
1986     u[15] = _mm256_add_epi32(u[15], rnding);
1987     u[15] = _mm256_srai_epi32(u[15], bit);
1988 
1989     // stage 9
1990     if (do_cols) {
1991       out[0] = u[0];
1992       out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), u[8]);
1993       out[2] = u[12];
1994       out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), u[4]);
1995       out[4] = u[6];
1996       out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), u[14]);
1997       out[6] = u[10];
1998       out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), u[2]);
1999       out[8] = u[3];
2000       out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), u[11]);
2001       out[10] = u[15];
2002       out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), u[7]);
2003       out[12] = u[5];
2004       out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), u[13]);
2005       out[14] = u[9];
2006       out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), u[1]);
2007     } else {
2008       const int log_range_out = AOMMAX(16, bd + 6);
2009       const __m256i clamp_lo_out =
2010           _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2011       const __m256i clamp_hi_out =
2012           _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2013 
2014       neg_shift_avx2(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2015                      out_shift);
2016       neg_shift_avx2(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
2017                      &clamp_hi_out, out_shift);
2018       neg_shift_avx2(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
2019                      &clamp_hi_out, out_shift);
2020       neg_shift_avx2(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
2021                      &clamp_hi_out, out_shift);
2022       neg_shift_avx2(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
2023                      &clamp_hi_out, out_shift);
2024       neg_shift_avx2(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
2025                      &clamp_hi_out, out_shift);
2026       neg_shift_avx2(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
2027                      &clamp_hi_out, out_shift);
2028       neg_shift_avx2(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
2029                      &clamp_hi_out, out_shift);
2030     }
2031   }
2032 }
2033 
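// Full 16-point inverse ADST; same stage structure as iadst16_low8_avx2, but
// with all 16 input coefficients feeding stage 2.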
2034 static void iadst16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2035                          int bd, int out_shift) {
2036   const int32_t *cospi = cospi_arr(bit);
2037   const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
2038   const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
2039   const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
2040   const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
2041   const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
2042   const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
2043   const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
2044   const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
2045   const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
2046   const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
2047   const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
2048   const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
2049   const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
2050   const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
2051   const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
2052   const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
2053   const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
2054   const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
2055   const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
2056   const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
2057   const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
2058   const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
2059   const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2060   const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2061   const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
2062   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2063   const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2064   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2065   const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2066   const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2067   __m256i u[16], v[16], x, y;
2068 
2069   {
2070     // stage 0
2071     // stage 1
2072     // stage 2
2073     v[0] = _mm256_mullo_epi32(in[15], cospi2);
2074     x = _mm256_mullo_epi32(in[0], cospi62);
2075     v[0] = _mm256_add_epi32(v[0], x);
2076     v[0] = _mm256_add_epi32(v[0], rnding);
2077     v[0] = _mm256_srai_epi32(v[0], bit);
2078 
2079     v[1] = _mm256_mullo_epi32(in[15], cospi62);
2080     x = _mm256_mullo_epi32(in[0], cospi2);
2081     v[1] = _mm256_sub_epi32(v[1], x);
2082     v[1] = _mm256_add_epi32(v[1], rnding);
2083     v[1] = _mm256_srai_epi32(v[1], bit);
2084 
2085     v[2] = _mm256_mullo_epi32(in[13], cospi10);
2086     x = _mm256_mullo_epi32(in[2], cospi54);
2087     v[2] = _mm256_add_epi32(v[2], x);
2088     v[2] = _mm256_add_epi32(v[2], rnding);
2089     v[2] = _mm256_srai_epi32(v[2], bit);
2090 
2091     v[3] = _mm256_mullo_epi32(in[13], cospi54);
2092     x = _mm256_mullo_epi32(in[2], cospi10);
2093     v[3] = _mm256_sub_epi32(v[3], x);
2094     v[3] = _mm256_add_epi32(v[3], rnding);
2095     v[3] = _mm256_srai_epi32(v[3], bit);
2096 
2097     v[4] = _mm256_mullo_epi32(in[11], cospi18);
2098     x = _mm256_mullo_epi32(in[4], cospi46);
2099     v[4] = _mm256_add_epi32(v[4], x);
2100     v[4] = _mm256_add_epi32(v[4], rnding);
2101     v[4] = _mm256_srai_epi32(v[4], bit);
2102 
2103     v[5] = _mm256_mullo_epi32(in[11], cospi46);
2104     x = _mm256_mullo_epi32(in[4], cospi18);
2105     v[5] = _mm256_sub_epi32(v[5], x);
2106     v[5] = _mm256_add_epi32(v[5], rnding);
2107     v[5] = _mm256_srai_epi32(v[5], bit);
2108 
2109     v[6] = _mm256_mullo_epi32(in[9], cospi26);
2110     x = _mm256_mullo_epi32(in[6], cospi38);
2111     v[6] = _mm256_add_epi32(v[6], x);
2112     v[6] = _mm256_add_epi32(v[6], rnding);
2113     v[6] = _mm256_srai_epi32(v[6], bit);
2114 
2115     v[7] = _mm256_mullo_epi32(in[9], cospi38);
2116     x = _mm256_mullo_epi32(in[6], cospi26);
2117     v[7] = _mm256_sub_epi32(v[7], x);
2118     v[7] = _mm256_add_epi32(v[7], rnding);
2119     v[7] = _mm256_srai_epi32(v[7], bit);
2120 
2121     v[8] = _mm256_mullo_epi32(in[7], cospi34);
2122     x = _mm256_mullo_epi32(in[8], cospi30);
2123     v[8] = _mm256_add_epi32(v[8], x);
2124     v[8] = _mm256_add_epi32(v[8], rnding);
2125     v[8] = _mm256_srai_epi32(v[8], bit);
2126 
2127     v[9] = _mm256_mullo_epi32(in[7], cospi30);
2128     x = _mm256_mullo_epi32(in[8], cospi34);
2129     v[9] = _mm256_sub_epi32(v[9], x);
2130     v[9] = _mm256_add_epi32(v[9], rnding);
2131     v[9] = _mm256_srai_epi32(v[9], bit);
2132 
2133     v[10] = _mm256_mullo_epi32(in[5], cospi42);
2134     x = _mm256_mullo_epi32(in[10], cospi22);
2135     v[10] = _mm256_add_epi32(v[10], x);
2136     v[10] = _mm256_add_epi32(v[10], rnding);
2137     v[10] = _mm256_srai_epi32(v[10], bit);
2138 
2139     v[11] = _mm256_mullo_epi32(in[5], cospi22);
2140     x = _mm256_mullo_epi32(in[10], cospi42);
2141     v[11] = _mm256_sub_epi32(v[11], x);
2142     v[11] = _mm256_add_epi32(v[11], rnding);
2143     v[11] = _mm256_srai_epi32(v[11], bit);
2144 
2145     v[12] = _mm256_mullo_epi32(in[3], cospi50);
2146     x = _mm256_mullo_epi32(in[12], cospi14);
2147     v[12] = _mm256_add_epi32(v[12], x);
2148     v[12] = _mm256_add_epi32(v[12], rnding);
2149     v[12] = _mm256_srai_epi32(v[12], bit);
2150 
2151     v[13] = _mm256_mullo_epi32(in[3], cospi14);
2152     x = _mm256_mullo_epi32(in[12], cospi50);
2153     v[13] = _mm256_sub_epi32(v[13], x);
2154     v[13] = _mm256_add_epi32(v[13], rnding);
2155     v[13] = _mm256_srai_epi32(v[13], bit);
2156 
2157     v[14] = _mm256_mullo_epi32(in[1], cospi58);
2158     x = _mm256_mullo_epi32(in[14], cospi6);
2159     v[14] = _mm256_add_epi32(v[14], x);
2160     v[14] = _mm256_add_epi32(v[14], rnding);
2161     v[14] = _mm256_srai_epi32(v[14], bit);
2162 
2163     v[15] = _mm256_mullo_epi32(in[1], cospi6);
2164     x = _mm256_mullo_epi32(in[14], cospi58);
2165     v[15] = _mm256_sub_epi32(v[15], x);
2166     v[15] = _mm256_add_epi32(v[15], rnding);
2167     v[15] = _mm256_srai_epi32(v[15], bit);
2168 
2169     // stage 3
2170     addsub_avx2(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
2171     addsub_avx2(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
2172     addsub_avx2(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
2173     addsub_avx2(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
2174     addsub_avx2(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
2175     addsub_avx2(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
2176     addsub_avx2(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
2177     addsub_avx2(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
2178 
2179     // stage 4
2180     v[0] = u[0];
2181     v[1] = u[1];
2182     v[2] = u[2];
2183     v[3] = u[3];
2184     v[4] = u[4];
2185     v[5] = u[5];
2186     v[6] = u[6];
2187     v[7] = u[7];
2188 
2189     v[8] = _mm256_mullo_epi32(u[8], cospi8);
2190     x = _mm256_mullo_epi32(u[9], cospi56);
2191     v[8] = _mm256_add_epi32(v[8], x);
2192     v[8] = _mm256_add_epi32(v[8], rnding);
2193     v[8] = _mm256_srai_epi32(v[8], bit);
2194 
2195     v[9] = _mm256_mullo_epi32(u[8], cospi56);
2196     x = _mm256_mullo_epi32(u[9], cospi8);
2197     v[9] = _mm256_sub_epi32(v[9], x);
2198     v[9] = _mm256_add_epi32(v[9], rnding);
2199     v[9] = _mm256_srai_epi32(v[9], bit);
2200 
2201     v[10] = _mm256_mullo_epi32(u[10], cospi40);
2202     x = _mm256_mullo_epi32(u[11], cospi24);
2203     v[10] = _mm256_add_epi32(v[10], x);
2204     v[10] = _mm256_add_epi32(v[10], rnding);
2205     v[10] = _mm256_srai_epi32(v[10], bit);
2206 
2207     v[11] = _mm256_mullo_epi32(u[10], cospi24);
2208     x = _mm256_mullo_epi32(u[11], cospi40);
2209     v[11] = _mm256_sub_epi32(v[11], x);
2210     v[11] = _mm256_add_epi32(v[11], rnding);
2211     v[11] = _mm256_srai_epi32(v[11], bit);
2212 
2213     v[12] = _mm256_mullo_epi32(u[12], cospim56);
2214     x = _mm256_mullo_epi32(u[13], cospi8);
2215     v[12] = _mm256_add_epi32(v[12], x);
2216     v[12] = _mm256_add_epi32(v[12], rnding);
2217     v[12] = _mm256_srai_epi32(v[12], bit);
2218 
2219     v[13] = _mm256_mullo_epi32(u[12], cospi8);
2220     x = _mm256_mullo_epi32(u[13], cospim56);
2221     v[13] = _mm256_sub_epi32(v[13], x);
2222     v[13] = _mm256_add_epi32(v[13], rnding);
2223     v[13] = _mm256_srai_epi32(v[13], bit);
2224 
2225     v[14] = _mm256_mullo_epi32(u[14], cospim24);
2226     x = _mm256_mullo_epi32(u[15], cospi40);
2227     v[14] = _mm256_add_epi32(v[14], x);
2228     v[14] = _mm256_add_epi32(v[14], rnding);
2229     v[14] = _mm256_srai_epi32(v[14], bit);
2230 
2231     v[15] = _mm256_mullo_epi32(u[14], cospi40);
2232     x = _mm256_mullo_epi32(u[15], cospim24);
2233     v[15] = _mm256_sub_epi32(v[15], x);
2234     v[15] = _mm256_add_epi32(v[15], rnding);
2235     v[15] = _mm256_srai_epi32(v[15], bit);
2236 
2237     // stage 5
2238     addsub_avx2(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
2239     addsub_avx2(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
2240     addsub_avx2(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
2241     addsub_avx2(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
2242     addsub_avx2(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
2243     addsub_avx2(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
2244     addsub_avx2(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
2245     addsub_avx2(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
2246 
2247     // stage 6
2248     v[0] = u[0];
2249     v[1] = u[1];
2250     v[2] = u[2];
2251     v[3] = u[3];
2252 
2253     v[4] = _mm256_mullo_epi32(u[4], cospi16);
2254     x = _mm256_mullo_epi32(u[5], cospi48);
2255     v[4] = _mm256_add_epi32(v[4], x);
2256     v[4] = _mm256_add_epi32(v[4], rnding);
2257     v[4] = _mm256_srai_epi32(v[4], bit);
2258 
2259     v[5] = _mm256_mullo_epi32(u[4], cospi48);
2260     x = _mm256_mullo_epi32(u[5], cospi16);
2261     v[5] = _mm256_sub_epi32(v[5], x);
2262     v[5] = _mm256_add_epi32(v[5], rnding);
2263     v[5] = _mm256_srai_epi32(v[5], bit);
2264 
2265     v[6] = _mm256_mullo_epi32(u[6], cospim48);
2266     x = _mm256_mullo_epi32(u[7], cospi16);
2267     v[6] = _mm256_add_epi32(v[6], x);
2268     v[6] = _mm256_add_epi32(v[6], rnding);
2269     v[6] = _mm256_srai_epi32(v[6], bit);
2270 
2271     v[7] = _mm256_mullo_epi32(u[6], cospi16);
2272     x = _mm256_mullo_epi32(u[7], cospim48);
2273     v[7] = _mm256_sub_epi32(v[7], x);
2274     v[7] = _mm256_add_epi32(v[7], rnding);
2275     v[7] = _mm256_srai_epi32(v[7], bit);
2276 
2277     v[8] = u[8];
2278     v[9] = u[9];
2279     v[10] = u[10];
2280     v[11] = u[11];
2281 
2282     v[12] = _mm256_mullo_epi32(u[12], cospi16);
2283     x = _mm256_mullo_epi32(u[13], cospi48);
2284     v[12] = _mm256_add_epi32(v[12], x);
2285     v[12] = _mm256_add_epi32(v[12], rnding);
2286     v[12] = _mm256_srai_epi32(v[12], bit);
2287 
2288     v[13] = _mm256_mullo_epi32(u[12], cospi48);
2289     x = _mm256_mullo_epi32(u[13], cospi16);
2290     v[13] = _mm256_sub_epi32(v[13], x);
2291     v[13] = _mm256_add_epi32(v[13], rnding);
2292     v[13] = _mm256_srai_epi32(v[13], bit);
2293 
2294     v[14] = _mm256_mullo_epi32(u[14], cospim48);
2295     x = _mm256_mullo_epi32(u[15], cospi16);
2296     v[14] = _mm256_add_epi32(v[14], x);
2297     v[14] = _mm256_add_epi32(v[14], rnding);
2298     v[14] = _mm256_srai_epi32(v[14], bit);
2299 
2300     v[15] = _mm256_mullo_epi32(u[14], cospi16);
2301     x = _mm256_mullo_epi32(u[15], cospim48);
2302     v[15] = _mm256_sub_epi32(v[15], x);
2303     v[15] = _mm256_add_epi32(v[15], rnding);
2304     v[15] = _mm256_srai_epi32(v[15], bit);
2305 
2306     // stage 7
2307     addsub_avx2(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
2308     addsub_avx2(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
2309     addsub_avx2(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
2310     addsub_avx2(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
2311     addsub_avx2(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
2312     addsub_avx2(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
2313     addsub_avx2(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
2314     addsub_avx2(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
2315 
2316     // stage 8
2317     v[0] = u[0];
2318     v[1] = u[1];
2319 
2320     y = _mm256_mullo_epi32(u[2], cospi32);
2321     x = _mm256_mullo_epi32(u[3], cospi32);
2322     v[2] = _mm256_add_epi32(y, x);
2323     v[2] = _mm256_add_epi32(v[2], rnding);
2324     v[2] = _mm256_srai_epi32(v[2], bit);
2325 
2326     v[3] = _mm256_sub_epi32(y, x);
2327     v[3] = _mm256_add_epi32(v[3], rnding);
2328     v[3] = _mm256_srai_epi32(v[3], bit);
2329 
2330     v[4] = u[4];
2331     v[5] = u[5];
2332 
2333     y = _mm256_mullo_epi32(u[6], cospi32);
2334     x = _mm256_mullo_epi32(u[7], cospi32);
2335     v[6] = _mm256_add_epi32(y, x);
2336     v[6] = _mm256_add_epi32(v[6], rnding);
2337     v[6] = _mm256_srai_epi32(v[6], bit);
2338 
2339     v[7] = _mm256_sub_epi32(y, x);
2340     v[7] = _mm256_add_epi32(v[7], rnding);
2341     v[7] = _mm256_srai_epi32(v[7], bit);
2342 
2343     v[8] = u[8];
2344     v[9] = u[9];
2345 
2346     y = _mm256_mullo_epi32(u[10], cospi32);
2347     x = _mm256_mullo_epi32(u[11], cospi32);
2348     v[10] = _mm256_add_epi32(y, x);
2349     v[10] = _mm256_add_epi32(v[10], rnding);
2350     v[10] = _mm256_srai_epi32(v[10], bit);
2351 
2352     v[11] = _mm256_sub_epi32(y, x);
2353     v[11] = _mm256_add_epi32(v[11], rnding);
2354     v[11] = _mm256_srai_epi32(v[11], bit);
2355 
2356     v[12] = u[12];
2357     v[13] = u[13];
2358 
2359     y = _mm256_mullo_epi32(u[14], cospi32);
2360     x = _mm256_mullo_epi32(u[15], cospi32);
2361     v[14] = _mm256_add_epi32(y, x);
2362     v[14] = _mm256_add_epi32(v[14], rnding);
2363     v[14] = _mm256_srai_epi32(v[14], bit);
2364 
2365     v[15] = _mm256_sub_epi32(y, x);
2366     v[15] = _mm256_add_epi32(v[15], rnding);
2367     v[15] = _mm256_srai_epi32(v[15], bit);
2368 
2369     // stage 9
2370     if (do_cols) {
2371       out[0] = v[0];
2372       out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
2373       out[2] = v[12];
2374       out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
2375       out[4] = v[6];
2376       out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
2377       out[6] = v[10];
2378       out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
2379       out[8] = v[3];
2380       out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
2381       out[10] = v[15];
2382       out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
2383       out[12] = v[5];
2384       out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
2385       out[14] = v[9];
2386       out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
2387     } else {
2388       const int log_range_out = AOMMAX(16, bd + 6);
2389       const __m256i clamp_lo_out =
2390           _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2391       const __m256i clamp_hi_out =
2392           _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2393 
2394       neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2395                      out_shift);
2396       neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
2397                      &clamp_hi_out, out_shift);
2398       neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
2399                      &clamp_hi_out, out_shift);
2400       neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
2401                      &clamp_hi_out, out_shift);
2402       neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
2403                      &clamp_hi_out, out_shift);
2404       neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
2405                      &clamp_hi_out, out_shift);
2406       neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
2407                      &clamp_hi_out, out_shift);
2408       neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
2409                      &clamp_hi_out, out_shift);
2410     }
2411   }
2412 }
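// 8-point inverse DCT for a DC-only input: in[0] is scaled by cospi32,
// rounded, clamped and broadcast to all 8 outputs.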
2413 static void idct8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2414                               int bd, int out_shift) {
2415   const int32_t *cospi = cospi_arr(bit);
2416   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2417   const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2418   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2419   __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2420   __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2421   __m256i x;
2422 
2423   // stage 0
2424   // stage 1
2425   // stage 2
2426   // stage 3
2427   x = _mm256_mullo_epi32(in[0], cospi32);
2428   x = _mm256_add_epi32(x, rnding);
2429   x = _mm256_srai_epi32(x, bit);
2430 
2431   // stage 4
2432   // stage 5
2433   if (!do_cols) {
2434     const int log_range_out = AOMMAX(16, bd + 6);
2435     __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
2436     clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2437     clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2438     x = _mm256_add_epi32(x, offset);
2439     x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
2440   }
2441   x = _mm256_max_epi32(x, clamp_lo);
2442   x = _mm256_min_epi32(x, clamp_hi);
2443   out[0] = x;
2444   out[1] = x;
2445   out[2] = x;
2446   out[3] = x;
2447   out[4] = x;
2448   out[5] = x;
2449   out[6] = x;
2450   out[7] = x;
2451 }
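// Full 8-point inverse DCT on 8 columns of 32-bit coefficients per register.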
2452 static void idct8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2453                          int bd, int out_shift) {
2454   const int32_t *cospi = cospi_arr(bit);
2455   const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
2456   const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
2457   const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
2458   const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
2459   const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
2460   const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
2461   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2462   const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2463   const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
2464   const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2465   const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2466   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2467   const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2468   const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2469   __m256i u0, u1, u2, u3, u4, u5, u6, u7;
2470   __m256i v0, v1, v2, v3, v4, v5, v6, v7;
2471   __m256i x, y;
2472 
2473   // stage 0
2474   // stage 1
2475   // stage 2
2476   u0 = in[0];
2477   u1 = in[4];
2478   u2 = in[2];
2479   u3 = in[6];
2480 
2481   x = _mm256_mullo_epi32(in[1], cospi56);
2482   y = _mm256_mullo_epi32(in[7], cospim8);
2483   u4 = _mm256_add_epi32(x, y);
2484   u4 = _mm256_add_epi32(u4, rnding);
2485   u4 = _mm256_srai_epi32(u4, bit);
2486 
2487   x = _mm256_mullo_epi32(in[1], cospi8);
2488   y = _mm256_mullo_epi32(in[7], cospi56);
2489   u7 = _mm256_add_epi32(x, y);
2490   u7 = _mm256_add_epi32(u7, rnding);
2491   u7 = _mm256_srai_epi32(u7, bit);
2492 
2493   x = _mm256_mullo_epi32(in[5], cospi24);
2494   y = _mm256_mullo_epi32(in[3], cospim40);
2495   u5 = _mm256_add_epi32(x, y);
2496   u5 = _mm256_add_epi32(u5, rnding);
2497   u5 = _mm256_srai_epi32(u5, bit);
2498 
2499   x = _mm256_mullo_epi32(in[5], cospi40);
2500   y = _mm256_mullo_epi32(in[3], cospi24);
2501   u6 = _mm256_add_epi32(x, y);
2502   u6 = _mm256_add_epi32(u6, rnding);
2503   u6 = _mm256_srai_epi32(u6, bit);
2504 
2505   // stage 3
2506   x = _mm256_mullo_epi32(u0, cospi32);
2507   y = _mm256_mullo_epi32(u1, cospi32);
2508   v0 = _mm256_add_epi32(x, y);
2509   v0 = _mm256_add_epi32(v0, rnding);
2510   v0 = _mm256_srai_epi32(v0, bit);
2511 
2512   v1 = _mm256_sub_epi32(x, y);
2513   v1 = _mm256_add_epi32(v1, rnding);
2514   v1 = _mm256_srai_epi32(v1, bit);
2515 
2516   x = _mm256_mullo_epi32(u2, cospi48);
2517   y = _mm256_mullo_epi32(u3, cospim16);
2518   v2 = _mm256_add_epi32(x, y);
2519   v2 = _mm256_add_epi32(v2, rnding);
2520   v2 = _mm256_srai_epi32(v2, bit);
2521 
2522   x = _mm256_mullo_epi32(u2, cospi16);
2523   y = _mm256_mullo_epi32(u3, cospi48);
2524   v3 = _mm256_add_epi32(x, y);
2525   v3 = _mm256_add_epi32(v3, rnding);
2526   v3 = _mm256_srai_epi32(v3, bit);
2527 
2528   addsub_avx2(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
2529   addsub_avx2(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
2530 
2531   // stage 4
2532   addsub_avx2(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
2533   addsub_avx2(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
2534   u4 = v4;
2535   u7 = v7;
2536 
2537   x = _mm256_mullo_epi32(v5, cospi32);
2538   y = _mm256_mullo_epi32(v6, cospi32);
2539   u6 = _mm256_add_epi32(y, x);
2540   u6 = _mm256_add_epi32(u6, rnding);
2541   u6 = _mm256_srai_epi32(u6, bit);
2542 
2543   u5 = _mm256_sub_epi32(y, x);
2544   u5 = _mm256_add_epi32(u5, rnding);
2545   u5 = _mm256_srai_epi32(u5, bit);
2546 
2547   addsub_avx2(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
2548   addsub_avx2(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
2549   addsub_avx2(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
2550   addsub_avx2(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);
2551   // stage 5
2552   if (!do_cols) {
2553     const int log_range_out = AOMMAX(16, bd + 6);
2554     const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2555     const __m256i clamp_hi_out =
2556         _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2557 
2558     round_shift_4x4_avx2(out, out_shift);
2559     round_shift_4x4_avx2(out + 4, out_shift);
2560     highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 8);
2561   }
2562 }
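// 8-point inverse ADST, special case where only in[0] is read: all later
// stages are derived from the two products in[0]*cospi[60] and
// -in[0]*cospi[4], and the outputs keep the ADST sign pattern (odd-indexed
// outputs negated, via neg_shift_avx2 when !do_cols).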
2563 static void iadst8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2564                                int bd, int out_shift) {
2565   const int32_t *cospi = cospi_arr(bit);
2566   const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
2567   const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
2568   const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2569   const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2570   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2571   const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2572   const __m256i kZero = _mm256_setzero_si256();
2573   __m256i u[8], x;
2574 
2575   // stage 0
2576   // stage 1
2577   // stage 2
2578 
2579   x = _mm256_mullo_epi32(in[0], cospi60);
2580   u[0] = _mm256_add_epi32(x, rnding);
2581   u[0] = _mm256_srai_epi32(u[0], bit);
2582 
2583   x = _mm256_mullo_epi32(in[0], cospi4);
2584   u[1] = _mm256_sub_epi32(kZero, x);
2585   u[1] = _mm256_add_epi32(u[1], rnding);
2586   u[1] = _mm256_srai_epi32(u[1], bit);
2587 
2588   // stage 3
2589   // stage 4
2590   __m256i temp1, temp2;
2591   temp1 = _mm256_mullo_epi32(u[0], cospi16);
2592   x = _mm256_mullo_epi32(u[1], cospi48);
2593   temp1 = _mm256_add_epi32(temp1, x);
2594   temp1 = _mm256_add_epi32(temp1, rnding);
2595   temp1 = _mm256_srai_epi32(temp1, bit);
2596   u[4] = temp1;
2597 
2598   temp2 = _mm256_mullo_epi32(u[0], cospi48);
2599   x = _mm256_mullo_epi32(u[1], cospi16);
2600   u[5] = _mm256_sub_epi32(temp2, x);
2601   u[5] = _mm256_add_epi32(u[5], rnding);
2602   u[5] = _mm256_srai_epi32(u[5], bit);
2603 
2604   // stage 5
2605   // stage 6
2606   temp1 = _mm256_mullo_epi32(u[0], cospi32);
2607   x = _mm256_mullo_epi32(u[1], cospi32);
2608   u[2] = _mm256_add_epi32(temp1, x);
2609   u[2] = _mm256_add_epi32(u[2], rnding);
2610   u[2] = _mm256_srai_epi32(u[2], bit);
2611 
2612   u[3] = _mm256_sub_epi32(temp1, x);
2613   u[3] = _mm256_add_epi32(u[3], rnding);
2614   u[3] = _mm256_srai_epi32(u[3], bit);
2615 
2616   temp1 = _mm256_mullo_epi32(u[4], cospi32);
2617   x = _mm256_mullo_epi32(u[5], cospi32);
2618   u[6] = _mm256_add_epi32(temp1, x);
2619   u[6] = _mm256_add_epi32(u[6], rnding);
2620   u[6] = _mm256_srai_epi32(u[6], bit);
2621 
2622   u[7] = _mm256_sub_epi32(temp1, x);
2623   u[7] = _mm256_add_epi32(u[7], rnding);
2624   u[7] = _mm256_srai_epi32(u[7], bit);
2625 
2626   // stage 7
2627   if (do_cols) {
2628     out[0] = u[0];
2629     out[1] = _mm256_sub_epi32(kZero, u[4]);
2630     out[2] = u[6];
2631     out[3] = _mm256_sub_epi32(kZero, u[2]);
2632     out[4] = u[3];
2633     out[5] = _mm256_sub_epi32(kZero, u[7]);
2634     out[6] = u[5];
2635     out[7] = _mm256_sub_epi32(kZero, u[1]);
2636   } else {
2637     const int log_range_out = AOMMAX(16, bd + 6);
2638     const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2639     const __m256i clamp_hi_out =
2640         _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2641 
2642     neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2643                    out_shift);
2644     neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
2645                    out_shift);
2646     neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
2647                    out_shift);
2648     neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
2649                    out_shift);
2650   }
2651 }
2652 
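// Full 8-point inverse ADST over eight 32-bit lanes per register. Stage 2
// forms the initial cospi rotations from input pairs (in[7],in[0]),
// (in[5],in[2]), (in[3],in[4]), (in[1],in[6]); stages 3-6 interleave clamped
// add/sub butterflies with cospi16/cospi48 and cospi32 rotations; stage 7
// writes the sign-alternated outputs (negated and round-shifted via
// neg_shift_avx2 when !do_cols).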
2653 static void iadst8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2654                           int bd, int out_shift) {
2655   const int32_t *cospi = cospi_arr(bit);
2656   const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
2657   const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
2658   const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
2659   const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
2660   const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
2661   const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
2662   const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
2663   const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
2664   const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2665   const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2666   const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
2667   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2668   const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2669   const __m256i kZero = _mm256_setzero_si256();
2670   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2671   const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2672   const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2673   __m256i u[8], v[8], x;
2674 
2675   // stage 0
2676   // stage 1
2677   // stage 2
2678 
2679   u[0] = _mm256_mullo_epi32(in[7], cospi4);
2680   x = _mm256_mullo_epi32(in[0], cospi60);
2681   u[0] = _mm256_add_epi32(u[0], x);
2682   u[0] = _mm256_add_epi32(u[0], rnding);
2683   u[0] = _mm256_srai_epi32(u[0], bit);
2684 
2685   u[1] = _mm256_mullo_epi32(in[7], cospi60);
2686   x = _mm256_mullo_epi32(in[0], cospi4);
2687   u[1] = _mm256_sub_epi32(u[1], x);
2688   u[1] = _mm256_add_epi32(u[1], rnding);
2689   u[1] = _mm256_srai_epi32(u[1], bit);
2690 
2691   u[2] = _mm256_mullo_epi32(in[5], cospi20);
2692   x = _mm256_mullo_epi32(in[2], cospi44);
2693   u[2] = _mm256_add_epi32(u[2], x);
2694   u[2] = _mm256_add_epi32(u[2], rnding);
2695   u[2] = _mm256_srai_epi32(u[2], bit);
2696 
2697   u[3] = _mm256_mullo_epi32(in[5], cospi44);
2698   x = _mm256_mullo_epi32(in[2], cospi20);
2699   u[3] = _mm256_sub_epi32(u[3], x);
2700   u[3] = _mm256_add_epi32(u[3], rnding);
2701   u[3] = _mm256_srai_epi32(u[3], bit);
2702 
2703   u[4] = _mm256_mullo_epi32(in[3], cospi36);
2704   x = _mm256_mullo_epi32(in[4], cospi28);
2705   u[4] = _mm256_add_epi32(u[4], x);
2706   u[4] = _mm256_add_epi32(u[4], rnding);
2707   u[4] = _mm256_srai_epi32(u[4], bit);
2708 
2709   u[5] = _mm256_mullo_epi32(in[3], cospi28);
2710   x = _mm256_mullo_epi32(in[4], cospi36);
2711   u[5] = _mm256_sub_epi32(u[5], x);
2712   u[5] = _mm256_add_epi32(u[5], rnding);
2713   u[5] = _mm256_srai_epi32(u[5], bit);
2714 
2715   u[6] = _mm256_mullo_epi32(in[1], cospi52);
2716   x = _mm256_mullo_epi32(in[6], cospi12);
2717   u[6] = _mm256_add_epi32(u[6], x);
2718   u[6] = _mm256_add_epi32(u[6], rnding);
2719   u[6] = _mm256_srai_epi32(u[6], bit);
2720 
2721   u[7] = _mm256_mullo_epi32(in[1], cospi12);
2722   x = _mm256_mullo_epi32(in[6], cospi52);
2723   u[7] = _mm256_sub_epi32(u[7], x);
2724   u[7] = _mm256_add_epi32(u[7], rnding);
2725   u[7] = _mm256_srai_epi32(u[7], bit);
2726 
2727   // stage 3
2728   addsub_avx2(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
2729   addsub_avx2(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
2730   addsub_avx2(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
2731   addsub_avx2(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
2732 
2733   // stage 4
2734   u[0] = v[0];
2735   u[1] = v[1];
2736   u[2] = v[2];
2737   u[3] = v[3];
2738 
2739   u[4] = _mm256_mullo_epi32(v[4], cospi16);
2740   x = _mm256_mullo_epi32(v[5], cospi48);
2741   u[4] = _mm256_add_epi32(u[4], x);
2742   u[4] = _mm256_add_epi32(u[4], rnding);
2743   u[4] = _mm256_srai_epi32(u[4], bit);
2744 
2745   u[5] = _mm256_mullo_epi32(v[4], cospi48);
2746   x = _mm256_mullo_epi32(v[5], cospi16);
2747   u[5] = _mm256_sub_epi32(u[5], x);
2748   u[5] = _mm256_add_epi32(u[5], rnding);
2749   u[5] = _mm256_srai_epi32(u[5], bit);
2750 
2751   u[6] = _mm256_mullo_epi32(v[6], cospim48);
2752   x = _mm256_mullo_epi32(v[7], cospi16);
2753   u[6] = _mm256_add_epi32(u[6], x);
2754   u[6] = _mm256_add_epi32(u[6], rnding);
2755   u[6] = _mm256_srai_epi32(u[6], bit);
2756 
2757   u[7] = _mm256_mullo_epi32(v[6], cospi16);
2758   x = _mm256_mullo_epi32(v[7], cospim48);
2759   u[7] = _mm256_sub_epi32(u[7], x);
2760   u[7] = _mm256_add_epi32(u[7], rnding);
2761   u[7] = _mm256_srai_epi32(u[7], bit);
2762 
2763   // stage 5
2764   addsub_avx2(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
2765   addsub_avx2(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
2766   addsub_avx2(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
2767   addsub_avx2(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
2768 
2769   // stage 6
2770   u[0] = v[0];
2771   u[1] = v[1];
2772   u[4] = v[4];
2773   u[5] = v[5];
2774 
2775   v[0] = _mm256_mullo_epi32(v[2], cospi32);
2776   x = _mm256_mullo_epi32(v[3], cospi32);
2777   u[2] = _mm256_add_epi32(v[0], x);
2778   u[2] = _mm256_add_epi32(u[2], rnding);
2779   u[2] = _mm256_srai_epi32(u[2], bit);
2780 
2781   u[3] = _mm256_sub_epi32(v[0], x);
2782   u[3] = _mm256_add_epi32(u[3], rnding);
2783   u[3] = _mm256_srai_epi32(u[3], bit);
2784 
2785   v[0] = _mm256_mullo_epi32(v[6], cospi32);
2786   x = _mm256_mullo_epi32(v[7], cospi32);
2787   u[6] = _mm256_add_epi32(v[0], x);
2788   u[6] = _mm256_add_epi32(u[6], rnding);
2789   u[6] = _mm256_srai_epi32(u[6], bit);
2790 
2791   u[7] = _mm256_sub_epi32(v[0], x);
2792   u[7] = _mm256_add_epi32(u[7], rnding);
2793   u[7] = _mm256_srai_epi32(u[7], bit);
2794 
2795   // stage 7
2796   if (do_cols) {
2797     out[0] = u[0];
2798     out[1] = _mm256_sub_epi32(kZero, u[4]);
2799     out[2] = u[6];
2800     out[3] = _mm256_sub_epi32(kZero, u[2]);
2801     out[4] = u[3];
2802     out[5] = _mm256_sub_epi32(kZero, u[7]);
2803     out[6] = u[5];
2804     out[7] = _mm256_sub_epi32(kZero, u[1]);
2805   } else {
2806     const int log_range_out = AOMMAX(16, bd + 6);
2807     const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2808     const __m256i clamp_hi_out =
2809         _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2810 
2811     neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2812                    out_shift);
2813     neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
2814                    out_shift);
2815     neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
2816                    out_shift);
2817     neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
2818                    out_shift);
2819   }
2820 }
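// idct64 stage 8 helper: cospi32 rotations on u[10..13], clamped add/sub
// butterflies across u[16..31], and (cospi16, cospi48) rotations pairing
// u[36..39] with u[56..59] and u[40..43] with u[52..55].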
2821 static inline void idct64_stage8_avx2(
2822     __m256i *u, const __m256i *cospim32, const __m256i *cospi32,
2823     const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
2824     const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
2825     const __m256i *rnding, int bit) {
2826   int i;
2827   __m256i temp1, temp2, temp3, temp4;
2828   temp1 = half_btf_avx2(cospim32, &u[10], cospi32, &u[13], rnding, bit);
2829   u[13] = half_btf_avx2(cospi32, &u[10], cospi32, &u[13], rnding, bit);
2830   u[10] = temp1;
2831   temp2 = half_btf_avx2(cospim32, &u[11], cospi32, &u[12], rnding, bit);
2832   u[12] = half_btf_avx2(cospi32, &u[11], cospi32, &u[12], rnding, bit);
2833   u[11] = temp2;
2834 
2835   for (i = 16; i < 20; ++i) {
2836     addsub_avx2(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
2837     addsub_avx2(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi);
2838   }
2839 
2840   temp1 = half_btf_avx2(cospim16, &u[36], cospi48, &u[59], rnding, bit);
2841   temp2 = half_btf_avx2(cospim16, &u[37], cospi48, &u[58], rnding, bit);
2842   temp3 = half_btf_avx2(cospim16, &u[38], cospi48, &u[57], rnding, bit);
2843   temp4 = half_btf_avx2(cospim16, &u[39], cospi48, &u[56], rnding, bit);
2844   u[56] = half_btf_avx2(cospi48, &u[39], cospi16, &u[56], rnding, bit);
2845   u[57] = half_btf_avx2(cospi48, &u[38], cospi16, &u[57], rnding, bit);
2846   u[58] = half_btf_avx2(cospi48, &u[37], cospi16, &u[58], rnding, bit);
2847   u[59] = half_btf_avx2(cospi48, &u[36], cospi16, &u[59], rnding, bit);
2848   u[36] = temp1;
2849   u[37] = temp2;
2850   u[38] = temp3;
2851   u[39] = temp4;
2852 
2853   temp1 = half_btf_avx2(cospim48, &u[40], cospim16, &u[55], rnding, bit);
2854   temp2 = half_btf_avx2(cospim48, &u[41], cospim16, &u[54], rnding, bit);
2855   temp3 = half_btf_avx2(cospim48, &u[42], cospim16, &u[53], rnding, bit);
2856   temp4 = half_btf_avx2(cospim48, &u[43], cospim16, &u[52], rnding, bit);
2857   u[52] = half_btf_avx2(cospim16, &u[43], cospi48, &u[52], rnding, bit);
2858   u[53] = half_btf_avx2(cospim16, &u[42], cospi48, &u[53], rnding, bit);
2859   u[54] = half_btf_avx2(cospim16, &u[41], cospi48, &u[54], rnding, bit);
2860   u[55] = half_btf_avx2(cospim16, &u[40], cospi48, &u[55], rnding, bit);
2861   u[40] = temp1;
2862   u[41] = temp2;
2863   u[42] = temp3;
2864   u[43] = temp4;
2865 }
2866 
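// idct64 stage 9 helper: add/sub butterflies on u[0..15], cospi32 rotations
// on u[20..27], and add/sub pairs joining u[32..39] with u[40..47] and
// u[48..55] with u[56..63].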
2867 static inline void idct64_stage9_avx2(__m256i *u, const __m256i *cospim32,
2868                                       const __m256i *cospi32,
2869                                       const __m256i *clamp_lo,
2870                                       const __m256i *clamp_hi,
2871                                       const __m256i *rnding, int bit) {
2872   int i;
2873   __m256i temp1, temp2, temp3, temp4;
2874   for (i = 0; i < 8; ++i) {
2875     addsub_avx2(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
2876   }
2877 
2878   temp1 = half_btf_avx2(cospim32, &u[20], cospi32, &u[27], rnding, bit);
2879   temp2 = half_btf_avx2(cospim32, &u[21], cospi32, &u[26], rnding, bit);
2880   temp3 = half_btf_avx2(cospim32, &u[22], cospi32, &u[25], rnding, bit);
2881   temp4 = half_btf_avx2(cospim32, &u[23], cospi32, &u[24], rnding, bit);
2882   u[24] = half_btf_avx2(cospi32, &u[23], cospi32, &u[24], rnding, bit);
2883   u[25] = half_btf_avx2(cospi32, &u[22], cospi32, &u[25], rnding, bit);
2884   u[26] = half_btf_avx2(cospi32, &u[21], cospi32, &u[26], rnding, bit);
2885   u[27] = half_btf_avx2(cospi32, &u[20], cospi32, &u[27], rnding, bit);
2886   u[20] = temp1;
2887   u[21] = temp2;
2888   u[22] = temp3;
2889   u[23] = temp4;
2890   for (i = 32; i < 40; i++) {
2891     addsub_avx2(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
2892   }
2893 
2894   for (i = 48; i < 56; i++) {
2895     addsub_avx2(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
2896   }
2897 }
2898 
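// idct64 stage 10 helper: add/sub butterflies on u[0..31] (u[i] with
// u[31 - i]) followed by cospi32 rotations pairing u[40..47] with u[48..55].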
2899 static inline void idct64_stage10_avx2(__m256i *u, const __m256i *cospim32,
2900                                        const __m256i *cospi32,
2901                                        const __m256i *clamp_lo,
2902                                        const __m256i *clamp_hi,
2903                                        const __m256i *rnding, int bit) {
2904   __m256i temp1, temp2, temp3, temp4;
2905   for (int i = 0; i < 16; i++) {
2906     addsub_avx2(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
2907   }
2908 
2909   temp1 = half_btf_avx2(cospim32, &u[40], cospi32, &u[55], rnding, bit);
2910   temp2 = half_btf_avx2(cospim32, &u[41], cospi32, &u[54], rnding, bit);
2911   temp3 = half_btf_avx2(cospim32, &u[42], cospi32, &u[53], rnding, bit);
2912   temp4 = half_btf_avx2(cospim32, &u[43], cospi32, &u[52], rnding, bit);
2913   u[52] = half_btf_avx2(cospi32, &u[43], cospi32, &u[52], rnding, bit);
2914   u[53] = half_btf_avx2(cospi32, &u[42], cospi32, &u[53], rnding, bit);
2915   u[54] = half_btf_avx2(cospi32, &u[41], cospi32, &u[54], rnding, bit);
2916   u[55] = half_btf_avx2(cospi32, &u[40], cospi32, &u[55], rnding, bit);
2917   u[40] = temp1;
2918   u[41] = temp2;
2919   u[42] = temp3;
2920   u[43] = temp4;
2921 
2922   temp1 = half_btf_avx2(cospim32, &u[44], cospi32, &u[51], rnding, bit);
2923   temp2 = half_btf_avx2(cospim32, &u[45], cospi32, &u[50], rnding, bit);
2924   temp3 = half_btf_avx2(cospim32, &u[46], cospi32, &u[49], rnding, bit);
2925   temp4 = half_btf_avx2(cospim32, &u[47], cospi32, &u[48], rnding, bit);
2926   u[48] = half_btf_avx2(cospi32, &u[47], cospi32, &u[48], rnding, bit);
2927   u[49] = half_btf_avx2(cospi32, &u[46], cospi32, &u[49], rnding, bit);
2928   u[50] = half_btf_avx2(cospi32, &u[45], cospi32, &u[50], rnding, bit);
2929   u[51] = half_btf_avx2(cospi32, &u[44], cospi32, &u[51], rnding, bit);
2930   u[44] = temp1;
2931   u[45] = temp2;
2932   u[46] = temp3;
2933   u[47] = temp4;
2934 }
2935 
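// idct64 final stage: out[i] / out[63 - i] = u[i] +/- u[63 - i]. When
// !do_cols the 64 outputs are round-shifted by out_shift and clamped to the
// output range.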
2936 static inline void idct64_stage11_avx2(__m256i *u, __m256i *out, int do_cols,
2937                                        int bd, int out_shift,
2938                                        const __m256i *clamp_lo,
2939                                        const __m256i *clamp_hi) {
2940   for (int i = 0; i < 32; i++) {
2941     addsub_avx2(u[i], u[63 - i], &out[(i)], &out[(63 - i)], clamp_lo, clamp_hi);
2942   }
2943 
2944   if (!do_cols) {
2945     const int log_range_out = AOMMAX(16, bd + 6);
2946     const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2947     const __m256i clamp_hi_out =
2948         _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2949 
2950     round_shift_8x8_avx2(out, out_shift);
2951     round_shift_8x8_avx2(out + 16, out_shift);
2952     round_shift_8x8_avx2(out + 32, out_shift);
2953     round_shift_8x8_avx2(out + 48, out_shift);
2954     highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64);
2955   }
2956 }
2957 
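// 64-point inverse DCT, special case where only in[0] is read: a single
// cospi[32] multiply (half_btf_0_avx2), an optional out_shift rounding when
// !do_cols, a clamp, and the result replicated into all 64 output registers.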
2958 static void idct64_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2959                              int bd, int out_shift) {
2960   const int32_t *cospi = cospi_arr(bit);
2961   const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2962   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2963   __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2964   __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2965 
2966   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2967 
2968   {
2969     __m256i x;
2970 
2971     // stage 1
2972     // stage 2
2973     // stage 3
2974     // stage 4
2975     // stage 5
2976     // stage 6
2977     x = half_btf_0_avx2(&cospi32, &in[0], &rnding, bit);
2978 
2979     // stage 8
2980     // stage 9
2981     // stage 10
2982     // stage 11
2983     if (!do_cols) {
2984       const int log_range_out = AOMMAX(16, bd + 6);
2985       clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2986       clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2987       if (out_shift != 0) {
2988         __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
2989         x = _mm256_add_epi32(x, offset);
2990         x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
2991       }
2992     }
2993     x = _mm256_max_epi32(x, clamp_lo);
2994     x = _mm256_min_epi32(x, clamp_hi);
2995     out[0] = x;
2996     out[1] = x;
2997     out[2] = x;
2998     out[3] = x;
2999     out[4] = x;
3000     out[5] = x;
3001     out[6] = x;
3002     out[7] = x;
3003     out[8] = x;
3004     out[9] = x;
3005     out[10] = x;
3006     out[11] = x;
3007     out[12] = x;
3008     out[13] = x;
3009     out[14] = x;
3010     out[15] = x;
3011     out[16] = x;
3012     out[17] = x;
3013     out[18] = x;
3014     out[19] = x;
3015     out[20] = x;
3016     out[21] = x;
3017     out[22] = x;
3018     out[23] = x;
3019     out[24] = x;
3020     out[25] = x;
3021     out[26] = x;
3022     out[27] = x;
3023     out[28] = x;
3024     out[29] = x;
3025     out[30] = x;
3026     out[31] = x;
3027     out[32] = x;
3028     out[33] = x;
3029     out[34] = x;
3030     out[35] = x;
3031     out[36] = x;
3032     out[37] = x;
3033     out[38] = x;
3034     out[39] = x;
3035     out[40] = x;
3036     out[41] = x;
3037     out[42] = x;
3038     out[43] = x;
3039     out[44] = x;
3040     out[45] = x;
3041     out[46] = x;
3042     out[47] = x;
3043     out[48] = x;
3044     out[49] = x;
3045     out[50] = x;
3046     out[51] = x;
3047     out[52] = x;
3048     out[53] = x;
3049     out[54] = x;
3050     out[55] = x;
3051     out[56] = x;
3052     out[57] = x;
3053     out[58] = x;
3054     out[59] = x;
3055     out[60] = x;
3056     out[61] = x;
3057     out[62] = x;
3058     out[63] = x;
3059   }
3060 }
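// 64-point inverse DCT reading only in[0..7] (only the first eight input
// rows are assumed nonzero). Stages 1-7 are specialized for the zeroed
// coefficients; stages 8-11 reuse the shared idct64_stage*_avx2 helpers.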
3061 static void idct64_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
3062                              int bd, int out_shift) {
3063   int i, j;
3064   const int32_t *cospi = cospi_arr(bit);
3065   const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
3066   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3067   const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
3068   const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
3069 
3070   const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
3071   const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
3072   const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
3073   const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
3074   const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
3075   const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
3076   const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
3077   const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
3078   const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
3079   const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
3080   const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
3081   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
3082   const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
3083   const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
3084   const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
3085   const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
3086   const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
3087   const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
3088   const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
3089   const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
3090   const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
3091   const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
3092   const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
3093   const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
3094   const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
3095   const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
3096   const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
3097   const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
3098   const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
3099   const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
3100   const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
3101   const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
3102   const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
3103   const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
3104   const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
3105   const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
3106   const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
3107   const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
3108 
3109   {
3110     __m256i u[64];
3111 
3112     // stage 1
3113     u[0] = in[0];
3114     u[8] = in[4];
3115     u[16] = in[2];
3116     u[24] = in[6];
3117     u[32] = in[1];
3118     u[40] = in[5];
3119     u[48] = in[3];
3120     u[56] = in[7];
3121 
3122     // stage 2
3123     u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
3124     u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
3125     u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
3126     u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
3127     u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
3128     u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
3129     u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
3130     u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
3131 
3132     // stage 3
3133     u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
3134     u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
3135     u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
3136     u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
3137     u[33] = u[32];
3138     u[38] = u[39];
3139     u[41] = u[40];
3140     u[46] = u[47];
3141     u[49] = u[48];
3142     u[54] = u[55];
3143     u[57] = u[56];
3144     u[62] = u[63];
3145 
3146     // stage 4
3147     __m256i temp1, temp2;
3148     u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
3149     u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
3150     u[17] = u[16];
3151     u[22] = u[23];
3152     u[25] = u[24];
3153     u[30] = u[31];
3154 
3155     temp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3156     u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3157     u[33] = temp1;
3158 
3159     temp2 = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3160     u[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3161     u[57] = temp2;
3162 
3163     temp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3164     u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3165     u[41] = temp1;
3166 
3167     temp2 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3168     u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3169     u[46] = temp2;
3170 
3171     // stage 5
3172     u[9] = u[8];
3173     u[14] = u[15];
3174 
3175     temp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
3176     u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
3177     u[17] = temp1;
3178 
3179     temp2 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
3180     u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
3181     u[22] = temp2;
3182 
3183     u[35] = u[32];
3184     u[34] = u[33];
3185     u[36] = u[39];
3186     u[37] = u[38];
3187     u[43] = u[40];
3188     u[42] = u[41];
3189     u[44] = u[47];
3190     u[45] = u[46];
3191     u[51] = u[48];
3192     u[50] = u[49];
3193     u[52] = u[55];
3194     u[53] = u[54];
3195     u[59] = u[56];
3196     u[58] = u[57];
3197     u[60] = u[63];
3198     u[61] = u[62];
3199 
3200     // stage 6
3201     temp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3202     u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3203     u[0] = temp1;
3204 
3205     temp2 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3206     u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3207     u[9] = temp2;
3208     u[19] = u[16];
3209     u[18] = u[17];
3210     u[20] = u[23];
3211     u[21] = u[22];
3212     u[27] = u[24];
3213     u[26] = u[25];
3214     u[28] = u[31];
3215     u[29] = u[30];
3216 
3217     temp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
3218     u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
3219     u[34] = temp1;
3220     temp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
3221     u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
3222     u[35] = temp2;
3223     temp1 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
3224     u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
3225     u[36] = temp1;
3226     temp2 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
3227     u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
3228     u[37] = temp2;
3229     temp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
3230     u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
3231     u[42] = temp1;
3232     temp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
3233     u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
3234     u[43] = temp2;
3235     temp1 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
3236     u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
3237     u[44] = temp1;
3238     temp2 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
3239     u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
3240     u[45] = temp2;
3241 
3242     // stage 7
3243     u[3] = u[0];
3244     u[2] = u[1];
3245     u[11] = u[8];
3246     u[10] = u[9];
3247     u[12] = u[15];
3248     u[13] = u[14];
3249 
3250     temp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
3251     u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
3252     u[18] = temp1;
3253     temp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
3254     u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
3255     u[19] = temp2;
3256     temp1 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
3257     u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
3258     u[20] = temp1;
3259     temp2 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
3260     u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
3261     u[21] = temp2;
3262     for (i = 32; i < 64; i += 16) {
3263       for (j = i; j < i + 4; j++) {
3264         addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3265         addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3266                     &clamp_hi);
3267       }
3268     }
3269 
3270     // stage 8
3271     u[7] = u[0];
3272     u[6] = u[1];
3273     u[5] = u[2];
3274     u[4] = u[3];
3275 
3276     idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
3277                        &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
3278 
3279     // stage 9
3280     idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3281                        bit);
3282 
3283     // stage 10
3284     idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3285                         bit);
3286 
3287     // stage 11
3288     idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
3289   }
3290 }
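// 64-point inverse DCT reading only in[0..15]. Stages 1-7 are specialized
// for the zeroed coefficients; stages 8-11 reuse the shared
// idct64_stage*_avx2 helpers.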
3291 static void idct64_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
3292                               int bd, int out_shift) {
3293   int i, j;
3294   const int32_t *cospi = cospi_arr(bit);
3295   const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
3296   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3297   const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
3298   const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
3299 
3300   const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
3301   const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
3302   const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
3303   const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
3304   const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
3305   const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
3306   const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
3307   const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
3308   const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
3309   const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
3310   const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
3311   const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
3312   const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
3313   const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
3314   const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
3315   const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
3316   const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
3317   const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
3318   const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
3319   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
3320   const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
3321   const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
3322   const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
3323   const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
3324   const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
3325   const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
3326   const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
3327   const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
3328   const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
3329   const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
3330   const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
3331   const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
3332   const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
3333 
3334   const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
3335   const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
3336   const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
3337   const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
3338   const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
3339   const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
3340   const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
3341   const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
3342   const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
3343   const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
3344   const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
3345   const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
3346   const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
3347   const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
3348   const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
3349   const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
3350   const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
3351   const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
3352   const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
3353   const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
3354   const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
3355 
3356   {
3357     __m256i u[64];
3358     __m256i tmp1, tmp2, tmp3, tmp4;
3359     // stage 1
3360     u[0] = in[0];
3361     u[32] = in[1];
3362     u[36] = in[9];
3363     u[40] = in[5];
3364     u[44] = in[13];
3365     u[48] = in[3];
3366     u[52] = in[11];
3367     u[56] = in[7];
3368     u[60] = in[15];
3369     u[16] = in[2];
3370     u[20] = in[10];
3371     u[24] = in[6];
3372     u[28] = in[14];
3373     u[4] = in[8];
3374     u[8] = in[4];
3375     u[12] = in[12];
3376 
3377     // stage 2
3378     u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
3379     u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
3380     u[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
3381     u[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
3382     u[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
3383     u[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
3384     u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
3385     u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
3386     u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
3387     u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
3388     u[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
3389     u[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
3390     u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
3391     u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
3392     u[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
3393     u[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
3394 
3395     // stage 3
3396     u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
3397     u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
3398     u[19] = half_btf_0_avx2(&cospim50, &u[28], &rnding, bit);
3399     u[28] = half_btf_0_avx2(&cospi14, &u[28], &rnding, bit);
3400     u[27] = half_btf_0_avx2(&cospi10, &u[20], &rnding, bit);
3401     u[20] = half_btf_0_avx2(&cospi54, &u[20], &rnding, bit);
3402     u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
3403     u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
3404     u[33] = u[32];
3405     u[34] = u[35];
3406     u[37] = u[36];
3407     u[38] = u[39];
3408     u[41] = u[40];
3409     u[42] = u[43];
3410     u[45] = u[44];
3411     u[46] = u[47];
3412     u[49] = u[48];
3413     u[50] = u[51];
3414     u[53] = u[52];
3415     u[54] = u[55];
3416     u[57] = u[56];
3417     u[58] = u[59];
3418     u[61] = u[60];
3419     u[62] = u[63];
3420 
3421     // stage 4
3422     u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
3423     u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
3424     u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
3425     u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
3426 
3427     u[17] = u[16];
3428     u[18] = u[19];
3429     u[21] = u[20];
3430     u[22] = u[23];
3431     u[25] = u[24];
3432     u[26] = u[27];
3433     u[29] = u[28];
3434     u[30] = u[31];
3435 
3436     tmp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3437     tmp2 = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
3438     tmp3 = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
3439     tmp4 = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3440     u[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3441     u[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
3442     u[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
3443     u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3444     u[33] = tmp1;
3445     u[34] = tmp2;
3446     u[37] = tmp3;
3447     u[38] = tmp4;
3448 
3449     tmp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3450     tmp2 = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
3451     tmp3 = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
3452     tmp4 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3453     u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3454     u[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
3455     u[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
3456     u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3457     u[41] = tmp1;
3458     u[42] = tmp2;
3459     u[45] = tmp3;
3460     u[46] = tmp4;
3461 
3462     // stage 5
3463     u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
3464     u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
3465 
3466     u[9] = u[8];
3467     u[10] = u[11];
3468     u[13] = u[12];
3469     u[14] = u[15];
3470 
3471     tmp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
3472     tmp2 = half_btf_avx2(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
3473     tmp3 = half_btf_avx2(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
3474     tmp4 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
3475     u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
3476     u[26] = half_btf_avx2(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
3477     u[29] = half_btf_avx2(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
3478     u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
3479     u[17] = tmp1;
3480     u[18] = tmp2;
3481     u[21] = tmp3;
3482     u[22] = tmp4;
3483 
3484     for (i = 32; i < 64; i += 8) {
3485       addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3486                   &clamp_hi);
3487       addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3488                   &clamp_hi);
3489 
3490       addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3491                   &clamp_hi);
3492       addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3493                   &clamp_hi);
3494     }
3495 
3496     // stage 6
3497     tmp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3498     u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3499     u[0] = tmp1;
3500     u[5] = u[4];
3501     u[6] = u[7];
3502 
3503     tmp1 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3504     u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3505     u[9] = tmp1;
3506     tmp2 = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
3507     u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
3508     u[10] = tmp2;
3509 
3510     for (i = 16; i < 32; i += 8) {
3511       addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3512                   &clamp_hi);
3513       addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3514                   &clamp_hi);
3515 
3516       addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3517                   &clamp_hi);
3518       addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3519                   &clamp_hi);
3520     }
3521 
3522     tmp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
3523     tmp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
3524     tmp3 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
3525     tmp4 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
3526     u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
3527     u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
3528     u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
3529     u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
3530     u[34] = tmp1;
3531     u[35] = tmp2;
3532     u[36] = tmp3;
3533     u[37] = tmp4;
3534 
3535     tmp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
3536     tmp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
3537     tmp3 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
3538     tmp4 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
3539     u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
3540     u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
3541     u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
3542     u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
3543     u[42] = tmp1;
3544     u[43] = tmp2;
3545     u[44] = tmp3;
3546     u[45] = tmp4;
3547 
3548     // stage 7
3549     u[3] = u[0];
3550     u[2] = u[1];
3551     tmp1 = half_btf_avx2(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
3552     u[6] = half_btf_avx2(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
3553     u[5] = tmp1;
3554     addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
3555     addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
3556     addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
3557     addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
3558 
3559     tmp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
3560     tmp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
3561     tmp3 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
3562     tmp4 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
3563     u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
3564     u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
3565     u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
3566     u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
3567     u[18] = tmp1;
3568     u[19] = tmp2;
3569     u[20] = tmp3;
3570     u[21] = tmp4;
3571 
3572     for (i = 32; i < 64; i += 16) {
3573       for (j = i; j < i + 4; j++) {
3574         addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3575         addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3576                     &clamp_hi);
3577       }
3578     }
3579 
3580     // stage 8
3581     for (i = 0; i < 4; ++i) {
3582       addsub_avx2(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
3583     }
3584 
3585     idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
3586                        &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
3587 
3588     // stage 9
3589     idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3590                        bit);
3591 
3592     // stage 10
3593     idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3594                         bit);
3595 
3596     // stage 11
3597     idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
3598   }
3599 }
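// General path of the 64-point inverse DCT: stage 1 reads the 32 retained
// input coefficients (in[0..31]; the upper half of a 64-point AV1 transform
// is zeroed), and the remaining stages compute the full butterfly network
// with the same bd-dependent intermediate clamping as the reduced variants.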
3600 static void idct64_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
3601                         int out_shift) {
3602   int i, j;
3603   const int32_t *cospi = cospi_arr(bit);
3604   const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
3605   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3606   const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
3607   const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
3608 
3609   const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
3610   const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
3611   const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
3612   const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
3613   const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
3614   const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
3615   const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
3616   const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
3617   const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
3618   const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
3619   const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
3620   const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
3621   const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
3622   const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
3623   const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
3624   const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
3625   const __m256i cospi17 = _mm256_set1_epi32(cospi[17]);
3626   const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
3627   const __m256i cospi19 = _mm256_set1_epi32(cospi[19]);
3628   const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
3629   const __m256i cospi21 = _mm256_set1_epi32(cospi[21]);
3630   const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
3631   const __m256i cospi23 = _mm256_set1_epi32(cospi[23]);
3632   const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
3633   const __m256i cospi25 = _mm256_set1_epi32(cospi[25]);
3634   const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
3635   const __m256i cospi27 = _mm256_set1_epi32(cospi[27]);
3636   const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
3637   const __m256i cospi29 = _mm256_set1_epi32(cospi[29]);
3638   const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
3639   const __m256i cospi31 = _mm256_set1_epi32(cospi[31]);
3640   const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
3641   const __m256i cospi35 = _mm256_set1_epi32(cospi[35]);
3642   const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
3643   const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
3644   const __m256i cospi39 = _mm256_set1_epi32(cospi[39]);
3645   const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
3646   const __m256i cospi43 = _mm256_set1_epi32(cospi[43]);
3647   const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
3648   const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
3649   const __m256i cospi47 = _mm256_set1_epi32(cospi[47]);
3650   const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
3651   const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
3652   const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
3653   const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
3654   const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
3655   const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
3656   const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
3657   const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
3658   const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
3659   const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
3660 
3661   const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
3662   const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
3663   const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
3664   const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
3665   const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
3666   const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
3667   const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
3668   const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
3669   const __m256i cospim33 = _mm256_set1_epi32(-cospi[33]);
3670   const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
3671   const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
3672   const __m256i cospim37 = _mm256_set1_epi32(-cospi[37]);
3673   const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
3674   const __m256i cospim41 = _mm256_set1_epi32(-cospi[41]);
3675   const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
3676   const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
3677   const __m256i cospim45 = _mm256_set1_epi32(-cospi[45]);
3678   const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
3679   const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
3680   const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
3681   const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
3682   const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
3683   const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
3684   const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
3685   const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
3686   const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
3687   const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
3688 
3689   {
3690     __m256i u[64], v[64];
3691 
3692     // stage 1
3693     u[32] = in[1];
3694     u[34] = in[17];
3695     u[36] = in[9];
3696     u[38] = in[25];
3697     u[40] = in[5];
3698     u[42] = in[21];
3699     u[44] = in[13];
3700     u[46] = in[29];
3701     u[48] = in[3];
3702     u[50] = in[19];
3703     u[52] = in[11];
3704     u[54] = in[27];
3705     u[56] = in[7];
3706     u[58] = in[23];
3707     u[60] = in[15];
3708     u[62] = in[31];
3709 
3710     v[16] = in[2];
3711     v[18] = in[18];
3712     v[20] = in[10];
3713     v[22] = in[26];
3714     v[24] = in[6];
3715     v[26] = in[22];
3716     v[28] = in[14];
3717     v[30] = in[30];
3718 
3719     u[8] = in[4];
3720     u[10] = in[20];
3721     u[12] = in[12];
3722     u[14] = in[28];
3723 
3724     v[4] = in[8];
3725     v[6] = in[24];
3726 
3727     u[0] = in[0];
3728     u[2] = in[16];
3729 
3730     // stage 2
3731     v[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
3732     v[33] = half_btf_0_avx2(&cospim33, &u[62], &rnding, bit);
3733     v[34] = half_btf_0_avx2(&cospi47, &u[34], &rnding, bit);
3734     v[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
3735     v[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
3736     v[37] = half_btf_0_avx2(&cospim41, &u[58], &rnding, bit);
3737     v[38] = half_btf_0_avx2(&cospi39, &u[38], &rnding, bit);
3738     v[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
3739     v[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
3740     v[41] = half_btf_0_avx2(&cospim37, &u[54], &rnding, bit);
3741     v[42] = half_btf_0_avx2(&cospi43, &u[42], &rnding, bit);
3742     v[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
3743     v[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
3744     v[45] = half_btf_0_avx2(&cospim45, &u[50], &rnding, bit);
3745     v[46] = half_btf_0_avx2(&cospi35, &u[46], &rnding, bit);
3746     v[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
3747     v[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
3748     v[49] = half_btf_0_avx2(&cospi29, &u[46], &rnding, bit);
3749     v[50] = half_btf_0_avx2(&cospi19, &u[50], &rnding, bit);
3750     v[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
3751     v[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
3752     v[53] = half_btf_0_avx2(&cospi21, &u[42], &rnding, bit);
3753     v[54] = half_btf_0_avx2(&cospi27, &u[54], &rnding, bit);
3754     v[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
3755     v[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
3756     v[57] = half_btf_0_avx2(&cospi25, &u[38], &rnding, bit);
3757     v[58] = half_btf_0_avx2(&cospi23, &u[58], &rnding, bit);
3758     v[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
3759     v[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
3760     v[61] = half_btf_0_avx2(&cospi17, &u[34], &rnding, bit);
3761     v[62] = half_btf_0_avx2(&cospi31, &u[62], &rnding, bit);
3762     v[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
3763 
3764     // stage 3
3765     u[16] = half_btf_0_avx2(&cospi62, &v[16], &rnding, bit);
3766     u[17] = half_btf_0_avx2(&cospim34, &v[30], &rnding, bit);
3767     u[18] = half_btf_0_avx2(&cospi46, &v[18], &rnding, bit);
3768     u[19] = half_btf_0_avx2(&cospim50, &v[28], &rnding, bit);
3769     u[20] = half_btf_0_avx2(&cospi54, &v[20], &rnding, bit);
3770     u[21] = half_btf_0_avx2(&cospim42, &v[26], &rnding, bit);
3771     u[22] = half_btf_0_avx2(&cospi38, &v[22], &rnding, bit);
3772     u[23] = half_btf_0_avx2(&cospim58, &v[24], &rnding, bit);
3773     u[24] = half_btf_0_avx2(&cospi6, &v[24], &rnding, bit);
3774     u[25] = half_btf_0_avx2(&cospi26, &v[22], &rnding, bit);
3775     u[26] = half_btf_0_avx2(&cospi22, &v[26], &rnding, bit);
3776     u[27] = half_btf_0_avx2(&cospi10, &v[20], &rnding, bit);
3777     u[28] = half_btf_0_avx2(&cospi14, &v[28], &rnding, bit);
3778     u[29] = half_btf_0_avx2(&cospi18, &v[18], &rnding, bit);
3779     u[30] = half_btf_0_avx2(&cospi30, &v[30], &rnding, bit);
3780     u[31] = half_btf_0_avx2(&cospi2, &v[16], &rnding, bit);
3781 
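    // addsub_avx2(a, b, out0, out1, lo, hi) is the butterfly add/sub step: it
    // stores a + b and a - b, each clamped to [*lo, *hi] to bound the
    // intermediate dynamic range.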
3782     for (i = 32; i < 64; i += 4) {
3783       addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
3784                   &clamp_hi);
3785       addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
3786                   &clamp_hi);
3787     }
3788 
3789     // stage 4
3790     v[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
3791     v[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
3792     v[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
3793     v[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
3794     v[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
3795     v[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
3796     v[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
3797     v[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
3798 
3799     for (i = 16; i < 32; i += 4) {
3800       addsub_avx2(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
3801                   &clamp_hi);
3802       addsub_avx2(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
3803                   &clamp_hi);
3804     }
3805 
3806     for (i = 32; i < 64; i += 4) {
3807       v[i + 0] = u[i + 0];
3808       v[i + 3] = u[i + 3];
3809     }
3810 
3811     v[33] = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3812     v[34] = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
3813     v[37] = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
3814     v[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3815     v[41] = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3816     v[42] = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
3817     v[45] = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
3818     v[46] = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3819     v[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3820     v[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
3821     v[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
3822     v[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3823     v[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3824     v[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
3825     v[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
3826     v[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3827 
3828     // stage 5
3829     u[4] = half_btf_0_avx2(&cospi56, &v[4], &rnding, bit);
3830     u[5] = half_btf_0_avx2(&cospim40, &v[6], &rnding, bit);
3831     u[6] = half_btf_0_avx2(&cospi24, &v[6], &rnding, bit);
3832     u[7] = half_btf_0_avx2(&cospi8, &v[4], &rnding, bit);
3833 
3834     for (i = 8; i < 16; i += 4) {
3835       addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
3836                   &clamp_hi);
3837       addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
3838                   &clamp_hi);
3839     }
3840 
3841     for (i = 16; i < 32; i += 4) {
3842       u[i + 0] = v[i + 0];
3843       u[i + 3] = v[i + 3];
3844     }
3845 
3846     u[17] = half_btf_avx2(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
3847     u[18] = half_btf_avx2(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
3848     u[21] = half_btf_avx2(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
3849     u[22] = half_btf_avx2(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
3850     u[25] = half_btf_avx2(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
3851     u[26] = half_btf_avx2(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
3852     u[29] = half_btf_avx2(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
3853     u[30] = half_btf_avx2(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
3854 
3855     for (i = 32; i < 64; i += 8) {
3856       addsub_avx2(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3857                   &clamp_hi);
3858       addsub_avx2(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3859                   &clamp_hi);
3860 
3861       addsub_avx2(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3862                   &clamp_hi);
3863       addsub_avx2(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3864                   &clamp_hi);
3865     }
3866 
3867     // stage 6
3868     v[0] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3869     v[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3870     v[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
3871     v[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
3872 
3873     addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
3874     addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
3875 
3876     for (i = 8; i < 16; i += 4) {
3877       v[i + 0] = u[i + 0];
3878       v[i + 3] = u[i + 3];
3879     }
3880 
3881     v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3882     v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
3883     v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
3884     v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3885 
3886     for (i = 16; i < 32; i += 8) {
3887       addsub_avx2(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
3888                   &clamp_hi);
3889       addsub_avx2(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
3890                   &clamp_hi);
3891 
3892       addsub_avx2(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
3893                   &clamp_hi);
3894       addsub_avx2(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
3895                   &clamp_hi);
3896     }
3897 
3898     for (i = 32; i < 64; i += 8) {
3899       v[i + 0] = u[i + 0];
3900       v[i + 1] = u[i + 1];
3901       v[i + 6] = u[i + 6];
3902       v[i + 7] = u[i + 7];
3903     }
3904 
3905     v[34] = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
3906     v[35] = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
3907     v[36] = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
3908     v[37] = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
3909     v[42] = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
3910     v[43] = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
3911     v[44] = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
3912     v[45] = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
3913     v[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
3914     v[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
3915     v[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
3916     v[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
3917     v[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
3918     v[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
3919     v[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
3920     v[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
3921 
3922     // stage 7
3923     addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
3924     addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
3925 
3926     u[4] = v[4];
3927     u[7] = v[7];
3928     u[5] = half_btf_avx2(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
3929     u[6] = half_btf_avx2(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
3930 
3931     addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
3932     addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
3933     addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
3934     addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
3935 
3936     for (i = 16; i < 32; i += 8) {
3937       u[i + 0] = v[i + 0];
3938       u[i + 1] = v[i + 1];
3939       u[i + 6] = v[i + 6];
3940       u[i + 7] = v[i + 7];
3941     }
3942 
3943     u[18] = half_btf_avx2(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
3944     u[19] = half_btf_avx2(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
3945     u[20] = half_btf_avx2(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
3946     u[21] = half_btf_avx2(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
3947     u[26] = half_btf_avx2(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
3948     u[27] = half_btf_avx2(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
3949     u[28] = half_btf_avx2(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
3950     u[29] = half_btf_avx2(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
3951 
3952     for (i = 32; i < 64; i += 16) {
3953       for (j = i; j < i + 4; j++) {
3954         addsub_avx2(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3955         addsub_avx2(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3956                     &clamp_hi);
3957       }
3958     }
3959 
3960     // stage 8
3961     for (i = 0; i < 4; ++i) {
3962       addsub_avx2(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
3963     }
3964 
3965     v[8] = u[8];
3966     v[9] = u[9];
3967     v[14] = u[14];
3968     v[15] = u[15];
3969 
3970     v[10] = half_btf_avx2(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
3971     v[11] = half_btf_avx2(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
3972     v[12] = half_btf_avx2(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
3973     v[13] = half_btf_avx2(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
3974 
3975     for (i = 16; i < 20; ++i) {
3976       addsub_avx2(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
3977       addsub_avx2(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
3978                   &clamp_hi);
3979     }
3980 
3981     for (i = 32; i < 36; ++i) {
3982       v[i] = u[i];
3983       v[i + 12] = u[i + 12];
3984       v[i + 16] = u[i + 16];
3985       v[i + 28] = u[i + 28];
3986     }
3987 
3988     v[36] = half_btf_avx2(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
3989     v[37] = half_btf_avx2(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
3990     v[38] = half_btf_avx2(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
3991     v[39] = half_btf_avx2(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
3992     v[40] = half_btf_avx2(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
3993     v[41] = half_btf_avx2(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
3994     v[42] = half_btf_avx2(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
3995     v[43] = half_btf_avx2(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
3996     v[52] = half_btf_avx2(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
3997     v[53] = half_btf_avx2(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
3998     v[54] = half_btf_avx2(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
3999     v[55] = half_btf_avx2(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
4000     v[56] = half_btf_avx2(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
4001     v[57] = half_btf_avx2(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
4002     v[58] = half_btf_avx2(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
4003     v[59] = half_btf_avx2(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
4004 
4005     // stage 9
4006     for (i = 0; i < 8; ++i) {
4007       addsub_avx2(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
4008     }
4009 
4010     for (i = 16; i < 20; ++i) {
4011       u[i] = v[i];
4012       u[i + 12] = v[i + 12];
4013     }
4014 
4015     u[20] = half_btf_avx2(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
4016     u[21] = half_btf_avx2(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
4017     u[22] = half_btf_avx2(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
4018     u[23] = half_btf_avx2(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
4019     u[24] = half_btf_avx2(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
4020     u[25] = half_btf_avx2(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
4021     u[26] = half_btf_avx2(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
4022     u[27] = half_btf_avx2(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
4023 
4024     for (i = 32; i < 40; i++) {
4025       addsub_avx2(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
4026     }
4027 
4028     for (i = 48; i < 56; i++) {
4029       addsub_avx2(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
4030     }
4031 
4032     // stage 10
4033     for (i = 0; i < 16; i++) {
4034       addsub_avx2(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
4035     }
4036 
4037     for (i = 32; i < 40; i++) v[i] = u[i];
4038 
4039     v[40] = half_btf_avx2(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
4040     v[41] = half_btf_avx2(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
4041     v[42] = half_btf_avx2(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
4042     v[43] = half_btf_avx2(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
4043     v[44] = half_btf_avx2(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
4044     v[45] = half_btf_avx2(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
4045     v[46] = half_btf_avx2(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
4046     v[47] = half_btf_avx2(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
4047     v[48] = half_btf_avx2(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
4048     v[49] = half_btf_avx2(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
4049     v[50] = half_btf_avx2(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
4050     v[51] = half_btf_avx2(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
4051     v[52] = half_btf_avx2(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
4052     v[53] = half_btf_avx2(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
4053     v[54] = half_btf_avx2(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
4054     v[55] = half_btf_avx2(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
4055 
4056     for (i = 56; i < 64; i++) v[i] = u[i];
4057 
4058     // stage 11
4059     for (i = 0; i < 32; i++) {
4060       addsub_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo,
4061                   &clamp_hi);
4062     }
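    // In the row pass (do_cols == 0) the results are additionally
    // round-shifted and clamped to the intermediate range below; in the
    // column pass the caller applies the final shift instead.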
4063     if (!do_cols) {
4064       const int log_range_out = AOMMAX(16, bd + 6);
4065       const __m256i clamp_lo_out =
4066           _mm256_set1_epi32(-(1 << (log_range_out - 1)));
4067       const __m256i clamp_hi_out =
4068           _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
4069 
4070       round_shift_8x8_avx2(out, out_shift);
4071       round_shift_8x8_avx2(out + 16, out_shift);
4072       round_shift_8x8_avx2(out + 32, out_shift);
4073       round_shift_8x8_avx2(out + 48, out_shift);
4074       highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64);
4075     }
4076   }
4077 }
4078 typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit,
4079                                   int do_cols, int bd, int out_shift);
4080 
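// Lookup table indexed as [tx size][1-D transform type][eob bucket]: the
// first index selects the transform length (4-point transforms are handled by
// the SSE4.1 path, hence the NULL row), the second selects DCT / ADST /
// identity, and the third picks the low1/low8/low16/full variant from the
// eob-derived count of potentially non-zero inputs.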
4081 static const transform_1d_avx2
4082     highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
4083       {
4084           { NULL, NULL, NULL, NULL },
4085           { NULL, NULL, NULL, NULL },
4086           { NULL, NULL, NULL, NULL },
4087       },
4088       {
4089           { idct8x8_low1_avx2, idct8x8_avx2, NULL, NULL },
4090           { iadst8x8_low1_avx2, iadst8x8_avx2, NULL, NULL },
4091           { NULL, NULL, NULL, NULL },
4092       },
4093       {
4094           { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
4095           { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
4096           { NULL, NULL, NULL, NULL },
4097       },
4098       { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
4099         { NULL, NULL, NULL, NULL },
4100         { NULL, NULL, NULL, NULL } },
4101 
4102       { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, idct64_avx2 },
4103         { NULL, NULL, NULL, NULL },
4104         { NULL, NULL, NULL, NULL } }
4105     };
4106 
4107 static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,
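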
4108                                                    uint16_t *output, int stride,
4109                                                    TX_TYPE tx_type,
4110                                                    TX_SIZE tx_size, int eob,
4111                                                    const int bd) {
4112   __m256i buf1[64 * 8];
4113   int eobx, eoby;
4114   get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
4115   const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
4116   const int txw_idx = get_txw_idx(tx_size);
4117   const int txh_idx = get_txh_idx(tx_size);
4118   const int txfm_size_col = tx_size_wide[tx_size];
4119   const int txfm_size_row = tx_size_high[tx_size];
4120   const int buf_size_w_div8 = txfm_size_col >> 3;
4121   const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
4122   const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
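  // The two sizes above round the eob-bounded coefficient region up to whole
  // 8-lane tiles, so only tiles that can contain non-zero values are loaded
  // and transformed.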
4123   const int input_stride = AOMMIN(32, txfm_size_row);
4124   const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
4125   const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
4126   const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
4127   const transform_1d_avx2 row_txfm =
4128       highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
4129   const transform_1d_avx2 col_txfm =
4130       highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
4131 
4132   assert(col_txfm != NULL);
4133   assert(row_txfm != NULL);
4134   int ud_flip, lr_flip;
4135   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4136 
4137   // 1st stage: row transform
4138   for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
4139     __m256i buf0[64];
4140     load_buffer_32bit_input(input + i * 8, input_stride, buf0,
4141                             buf_size_nonzero_w);
4142     if (rect_type == 1 || rect_type == -1) {
4143       round_shift_rect_array_32_avx2(buf0, buf0, buf_size_nonzero_w, 0,
4144                                      NewInvSqrt2);
4145     }
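    // The rescale above compensates for the extra sqrt(2) gain of rectangular
    // (2:1 / 1:2) transforms; NewInvSqrt2 is approximately 1/sqrt(2) in fixed
    // point.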
4146     row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
4147 
4148     __m256i *_buf1 = buf1 + i * 8;
4149     if (lr_flip) {
4150       for (int j = 0; j < buf_size_w_div8; ++j) {
4151         transpose_8x8_flip_avx2(
4152             &buf0[j * 8], &_buf1[(buf_size_w_div8 - 1 - j) * txfm_size_row]);
4153       }
4154     } else {
4155       for (int j = 0; j < buf_size_w_div8; ++j) {
4156         transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]);
4157       }
4158     }
4159   }
4160   // 2nd stage: column transform
4161   for (int i = 0; i < buf_size_w_div8; i++) {
4162     col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
4163              bd, 0);
4164 
4165     round_shift_array_32_avx2(buf1 + i * txfm_size_row,
4166                               buf1 + i * txfm_size_row, txfm_size_row,
4167                               -shift[1]);
4168   }
4169 
4170   // write to buffer
4171   if (txfm_size_col >= 16) {
4172     for (int i = 0; i < (txfm_size_col >> 4); i++) {
4173       highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
4174                                     output + 16 * i, stride, ud_flip,
4175                                     txfm_size_row, bd);
4176     }
4177   } else if (txfm_size_col == 8) {
4178     highbd_write_buffer_8xn_avx2(buf1, output, stride, ud_flip, txfm_size_row,
4179                                  bd);
4180   }
4181 }
4182 
4183 static void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input,
4184                                                     uint8_t *output, int stride,
4185                                                     TX_TYPE tx_type,
4186                                                     TX_SIZE tx_size, int eob,
4187                                                     const int bd) {
4188   switch (tx_type) {
4189     case DCT_DCT:
4190     case ADST_DCT:
4191     case DCT_ADST:
4192     case ADST_ADST:
4193     case FLIPADST_DCT:
4194     case DCT_FLIPADST:
4195     case FLIPADST_FLIPADST:
4196     case ADST_FLIPADST:
4197     case FLIPADST_ADST:
4198       highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output),
4199                                              stride, tx_type, tx_size, eob, bd);
4200       break;
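    // Identity and half-identity transform types have no AVX2 specialization
    // here, so they fall back to the SSE4.1 implementation.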
4201     case IDTX:
4202     case H_DCT:
4203     case H_ADST:
4204     case H_FLIPADST:
4205     case V_DCT:
4206     case V_ADST:
4207     case V_FLIPADST:
4208       av1_highbd_inv_txfm2d_add_universe_sse4_1(input, output, stride, tx_type,
4209                                                 tx_size, eob, bd);
4210       break;
4211     default: assert(0); break;
4212   }
4213 }
4214 void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
4215                                   int stride, const TxfmParam *txfm_param) {
4216   assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
4217   const TX_SIZE tx_size = txfm_param->tx_size;
4218   switch (tx_size) {
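    // Blocks with a dimension of 4 fit in half-width vectors and are handled
    // by the SSE4.1 code path.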
4219     case TX_4X8:
4220     case TX_8X4:
4221     case TX_4X4:
4222     case TX_16X4:
4223     case TX_4X16:
4224       av1_highbd_inv_txfm_add_sse4_1(input, dest, stride, txfm_param);
4225       break;
4226     default:
4227       av1_highbd_inv_txfm2d_add_universe_avx2(
4228           input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
4229           txfm_param->eob, txfm_param->bd);
4230       break;
4231   }
4232 }
4233
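// Usage sketch (illustrative only, not part of the library): decoders
// normally reach this entry point through the av1_highbd_inv_txfm_add() RTCD
// dispatch with a TxfmParam describing the block.  Assuming dqcoeff, dest,
// stride and eob come from the decode loop (and with other TxfmParam fields
// omitted), a direct call would look roughly like:
//
//   TxfmParam param;
//   param.tx_type = DCT_DCT;                 // 2-D transform type
//   param.tx_size = TX_32X32;                // block size
//   param.tx_set_type = EXT_TX_SET_DCTONLY;  // transform set containing tx_type
//   param.eob = eob;                         // index of last non-zero coeff + 1
//   param.bd = 10;                           // bitstream bit depth
//   av1_highbd_inv_txfm_add_avx2(dqcoeff, dest, stride, &param);
//
// dest is expected to be a CONVERT_TO_BYTEPTR()-wrapped uint16_t
// reconstruction buffer, matching the CONVERT_TO_SHORTPTR() unwrapping above.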