1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11 #include <assert.h>
12 #include <immintrin.h>
13
14 #include "config/aom_config.h"
15 #include "config/av1_rtcd.h"
16
17 #include "av1/common/av1_inv_txfm1d_cfg.h"
18 #include "av1/common/idct.h"
19 #include "av1/common/x86/av1_inv_txfm_ssse3.h"
20 #include "av1/common/x86/highbd_txfm_utility_sse4.h"
21 #include "aom_dsp/x86/txfm_common_avx2.h"
22
// Note:
// A 32x32 block of coefficients needs 32x4 = 128 registers in total.
// For high bit depth, each coefficient is 4 bytes, and each __m256i
// register holds 8 coefficients, so each row needs 4 registers and
// there are 32 rows.
// Register layout:
//  v0,   v1,   v2,   v3,
//  v4,   v5,   v6,   v7,
//  ... ...
//  v124, v125, v126, v127
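//
// As an illustrative sketch of this layout (not used anywhere in this file),
// coefficient (row, col) of the 32x32 block maps to register row * 4 + col / 8,
// lane col % 8; e.g. coefficient (3, 17) is lane 1 of v14.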
33
static inline __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) {
35 const __m256i zero = _mm256_setzero_si256();
36 const __m256i one = _mm256_set1_epi16(1);
37 const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);
38 __m256i clamped, mask;
39
40 mask = _mm256_cmpgt_epi16(u, max);
41 clamped = _mm256_andnot_si256(mask, u);
42 mask = _mm256_and_si256(mask, max);
43 clamped = _mm256_or_si256(mask, clamped);
44 mask = _mm256_cmpgt_epi16(clamped, zero);
45 clamped = _mm256_and_si256(clamped, mask);
46
47 return clamped;
48 }
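
// Illustrative scalar sketch of the lane-wise behavior of
// highbd_clamp_epi16_avx2 (not part of the implementation): each signed
// 16-bit lane is clamped to the valid pixel range [0, (1 << bd) - 1].
#if 0
static inline int16_t clamp_pixel_sketch(int16_t v, int bd) {
  const int16_t max = (int16_t)((1 << bd) - 1);
  if (v > max) return max;
  if (v < 0) return 0;
  return v;
}
#endif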
49
static inline void round_shift_4x4_avx2(__m256i *in, int shift) {
51 if (shift != 0) {
52 __m256i rnding = _mm256_set1_epi32(1 << (shift - 1));
53 in[0] = _mm256_add_epi32(in[0], rnding);
54 in[1] = _mm256_add_epi32(in[1], rnding);
55 in[2] = _mm256_add_epi32(in[2], rnding);
56 in[3] = _mm256_add_epi32(in[3], rnding);
57
58 in[0] = _mm256_srai_epi32(in[0], shift);
59 in[1] = _mm256_srai_epi32(in[1], shift);
60 in[2] = _mm256_srai_epi32(in[2], shift);
61 in[3] = _mm256_srai_epi32(in[3], shift);
62 }
63 }
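
// Lane-wise, round_shift_4x4_avx2 applies the usual rounding right shift
//   x = (x + (1 << (shift - 1))) >> shift
// to each 32-bit coefficient; a shift of 0 leaves the values untouched.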
64
static inline void round_shift_8x8_avx2(__m256i *in, int shift) {
66 round_shift_4x4_avx2(in, shift);
67 round_shift_4x4_avx2(in + 4, shift);
68 round_shift_4x4_avx2(in + 8, shift);
69 round_shift_4x4_avx2(in + 12, shift);
70 }
71
static void highbd_clamp_epi32_avx2(__m256i *in, __m256i *out,
73 const __m256i *clamp_lo,
74 const __m256i *clamp_hi, int size) {
75 __m256i a0, a1;
76 for (int i = 0; i < size; i += 4) {
77 a0 = _mm256_max_epi32(in[i], *clamp_lo);
78 out[i] = _mm256_min_epi32(a0, *clamp_hi);
79
80 a1 = _mm256_max_epi32(in[i + 1], *clamp_lo);
81 out[i + 1] = _mm256_min_epi32(a1, *clamp_hi);
82
83 a0 = _mm256_max_epi32(in[i + 2], *clamp_lo);
84 out[i + 2] = _mm256_min_epi32(a0, *clamp_hi);
85
86 a1 = _mm256_max_epi32(in[i + 3], *clamp_lo);
87 out[i + 3] = _mm256_min_epi32(a1, *clamp_hi);
88 }
89 }
90
static inline __m256i highbd_get_recon_16x8_avx2(const __m256i pred,
92 __m256i res0, __m256i res1,
93 const int bd) {
94 __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred));
95 __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1));
96
97 x0 = _mm256_add_epi32(res0, x0);
98 x1 = _mm256_add_epi32(res1, x1);
99 x0 = _mm256_packus_epi32(x0, x1);
100 x0 = _mm256_permute4x64_epi64(x0, 0xd8);
101 x0 = highbd_clamp_epi16_avx2(x0, bd);
102 return x0;
103 }
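
// Per-pixel sketch of the reconstruction above (illustrative only, assuming
// the sum stays within the unsigned 16-bit range): widen the 16-bit
// prediction, add the 32-bit residue, and clamp to [0, (1 << bd) - 1].
#if 0
static inline uint16_t recon_pixel_sketch(uint16_t pred, int32_t res, int bd) {
  int32_t v = (int32_t)pred + res;
  const int32_t max = (1 << bd) - 1;
  if (v < 0) v = 0;
  if (v > max) v = max;
  return (uint16_t)v;
}
#endif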
104
static inline void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output,
106 int stride, int flipud,
107 int height, const int bd) {
108 int j = flipud ? (height - 1) : 0;
109 const int step = flipud ? -1 : 1;
110 for (int i = 0; i < height; ++i, j += step) {
111 __m256i v = _mm256_loadu_si256((__m256i const *)(output + i * stride));
112 __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd);
113
114 _mm256_storeu_si256((__m256i *)(output + i * stride), u);
115 }
116 }

static inline __m256i highbd_get_recon_8x8_avx2(const __m256i pred, __m256i res,
118 const int bd) {
119 __m256i x0 = pred;
120 x0 = _mm256_add_epi32(res, x0);
121 x0 = _mm256_packus_epi32(x0, x0);
122 x0 = _mm256_permute4x64_epi64(x0, 0xd8);
123 x0 = highbd_clamp_epi16_avx2(x0, bd);
124 return x0;
125 }
126
static inline void highbd_write_buffer_8xn_avx2(__m256i *in, uint16_t *output,
128 int stride, int flipud,
129 int height, const int bd) {
130 int j = flipud ? (height - 1) : 0;
131 __m128i temp;
132 const int step = flipud ? -1 : 1;
133 for (int i = 0; i < height; ++i, j += step) {
134 temp = _mm_loadu_si128((__m128i const *)(output + i * stride));
135 __m256i v = _mm256_cvtepi16_epi32(temp);
136 __m256i u = highbd_get_recon_8x8_avx2(v, in[j], bd);
137 __m128i u1 = _mm256_castsi256_si128(u);
138 _mm_storeu_si128((__m128i *)(output + i * stride), u1);
139 }
140 }
static void neg_shift_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
142 __m256i *out1, const __m256i *clamp_lo,
143 const __m256i *clamp_hi, int shift) {
144 __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);
145 __m256i a0 = _mm256_add_epi32(offset, in0);
146 __m256i a1 = _mm256_sub_epi32(offset, in1);
147
148 a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));
149 a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
150
151 a0 = _mm256_max_epi32(a0, *clamp_lo);
152 a0 = _mm256_min_epi32(a0, *clamp_hi);
153 a1 = _mm256_max_epi32(a1, *clamp_lo);
154 a1 = _mm256_min_epi32(a1, *clamp_hi);
155
156 *out0 = a0;
157 *out1 = a1;
158 }
159
static void transpose_8x8_avx2(const __m256i *in, __m256i *out) {
161 __m256i u0, u1, u2, u3, u4, u5, u6, u7;
162 __m256i x0, x1;
163
164 u0 = _mm256_unpacklo_epi32(in[0], in[1]);
165 u1 = _mm256_unpackhi_epi32(in[0], in[1]);
166
167 u2 = _mm256_unpacklo_epi32(in[2], in[3]);
168 u3 = _mm256_unpackhi_epi32(in[2], in[3]);
169
170 u4 = _mm256_unpacklo_epi32(in[4], in[5]);
171 u5 = _mm256_unpackhi_epi32(in[4], in[5]);
172
173 u6 = _mm256_unpacklo_epi32(in[6], in[7]);
174 u7 = _mm256_unpackhi_epi32(in[6], in[7]);
175
176 x0 = _mm256_unpacklo_epi64(u0, u2);
177 x1 = _mm256_unpacklo_epi64(u4, u6);
178 out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
179 out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);
180
181 x0 = _mm256_unpackhi_epi64(u0, u2);
182 x1 = _mm256_unpackhi_epi64(u4, u6);
183 out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
184 out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
185
186 x0 = _mm256_unpacklo_epi64(u1, u3);
187 x1 = _mm256_unpacklo_epi64(u5, u7);
188 out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
189 out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
190
191 x0 = _mm256_unpackhi_epi64(u1, u3);
192 x1 = _mm256_unpackhi_epi64(u5, u7);
193 out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
194 out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
195 }
196
static void transpose_8x8_flip_avx2(const __m256i *in, __m256i *out) {
198 __m256i u0, u1, u2, u3, u4, u5, u6, u7;
199 __m256i x0, x1;
200
201 u0 = _mm256_unpacklo_epi32(in[7], in[6]);
202 u1 = _mm256_unpackhi_epi32(in[7], in[6]);
203
204 u2 = _mm256_unpacklo_epi32(in[5], in[4]);
205 u3 = _mm256_unpackhi_epi32(in[5], in[4]);
206
207 u4 = _mm256_unpacklo_epi32(in[3], in[2]);
208 u5 = _mm256_unpackhi_epi32(in[3], in[2]);
209
210 u6 = _mm256_unpacklo_epi32(in[1], in[0]);
211 u7 = _mm256_unpackhi_epi32(in[1], in[0]);
212
213 x0 = _mm256_unpacklo_epi64(u0, u2);
214 x1 = _mm256_unpacklo_epi64(u4, u6);
215 out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);
216 out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);
217
218 x0 = _mm256_unpackhi_epi64(u0, u2);
219 x1 = _mm256_unpackhi_epi64(u4, u6);
220 out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
221 out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
222
223 x0 = _mm256_unpacklo_epi64(u1, u3);
224 x1 = _mm256_unpacklo_epi64(u5, u7);
225 out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
226 out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
227
228 x0 = _mm256_unpackhi_epi64(u1, u3);
229 x1 = _mm256_unpackhi_epi64(u5, u7);
230 out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
231 out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
232 }
233
static inline void load_buffer_32bit_input(const int32_t *in, int stride,
235 __m256i *out, int out_size) {
236 for (int i = 0; i < out_size; ++i) {
237 out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride));
238 }
239 }
240
static inline __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0,
242 const __m256i *rounding, int bit) {
243 __m256i x;
244 x = _mm256_mullo_epi32(*w0, *n0);
245 x = _mm256_add_epi32(x, *rounding);
246 x = _mm256_srai_epi32(x, bit);
247 return x;
248 }
249
static inline __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0,
251 const __m256i *w1, const __m256i *n1,
252 const __m256i *rounding, int bit) {
253 __m256i x, y;
254
255 x = _mm256_mullo_epi32(*w0, *n0);
256 y = _mm256_mullo_epi32(*w1, *n1);
257 x = _mm256_add_epi32(x, y);
258 x = _mm256_add_epi32(x, *rounding);
259 x = _mm256_srai_epi32(x, bit);
260 return x;
261 }
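
// Lane-wise, the two butterfly helpers above compute
//   half_btf_0: (w0 * n0 + (1 << (bit - 1))) >> bit
//   half_btf:   (w0 * n0 + w1 * n1 + (1 << (bit - 1))) >> bit
// in 32-bit arithmetic. An illustrative scalar equivalent (not part of the
// implementation):
#if 0
static inline int32_t half_btf_sketch(int32_t w0, int32_t n0, int32_t w1,
                                      int32_t n1, int bit) {
  return (w0 * n0 + w1 * n1 + (1 << (bit - 1))) >> bit;
}
#endif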
262
static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0,
264 __m256i *out1, const __m256i *clamp_lo,
265 const __m256i *clamp_hi) {
266 __m256i a0 = _mm256_add_epi32(in0, in1);
267 __m256i a1 = _mm256_sub_epi32(in0, in1);
268
269 a0 = _mm256_max_epi32(a0, *clamp_lo);
270 a0 = _mm256_min_epi32(a0, *clamp_hi);
271 a1 = _mm256_max_epi32(a1, *clamp_lo);
272 a1 = _mm256_min_epi32(a1, *clamp_hi);
273
274 *out0 = a0;
275 *out1 = a1;
276 }
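
// addsub_avx2 is the butterfly step: lane-wise it produces
//   *out0 = clamp(in0 + in1)  and  *out1 = clamp(in0 - in1),
// with clamp() limiting the result to [*clamp_lo, *clamp_hi].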
277
static inline void idct32_stage4_avx2(
279 __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56,
280 const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40,
281 const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24,
282 const __m256i *rounding, int bit) {
283 __m256i temp1, temp2;
284 temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
285 bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
286 bf1[17] = temp1;
287
288 temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
289 bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
290 bf1[18] = temp2;
291
292 temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
293 bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
294 bf1[21] = temp1;
295
296 temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
297 bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
298 bf1[22] = temp2;
299 }
300
static inline void idct32_stage5_avx2(
302 __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48,
303 const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo,
304 const __m256i *clamp_hi, const __m256i *rounding, int bit) {
305 __m256i temp1, temp2;
306 temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
307 bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
308 bf1[9] = temp1;
309
310 temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
311 bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
312 bf1[10] = temp2;
313
314 addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
315 addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
316 addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
317 addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
318 addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
319 addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
320 addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
321 addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
322 }
323
static inline void idct32_stage6_avx2(
325 __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32,
326 const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
327 const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
328 const __m256i *rounding, int bit) {
329 __m256i temp1, temp2;
330 temp1 = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
331 bf1[6] = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
332 bf1[5] = temp1;
333
334 addsub_avx2(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
335 addsub_avx2(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
336 addsub_avx2(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
337 addsub_avx2(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
338
339 temp1 = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
340 bf1[29] = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
341 bf1[18] = temp1;
342 temp2 = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
343 bf1[28] = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
344 bf1[19] = temp2;
345 temp1 = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
346 bf1[27] = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
347 bf1[20] = temp1;
348 temp2 = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
349 bf1[26] = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
350 bf1[21] = temp2;
351 }
352
static inline void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32,
354 const __m256i *cospi32,
355 const __m256i *clamp_lo,
356 const __m256i *clamp_hi,
357 const __m256i *rounding, int bit) {
358 __m256i temp1, temp2;
359 addsub_avx2(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
360 addsub_avx2(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
361 addsub_avx2(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
362 addsub_avx2(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
363
364 temp1 = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
365 bf1[13] = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
366 bf1[10] = temp1;
367 temp2 = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
368 bf1[12] = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
369 bf1[11] = temp2;
370
371 addsub_avx2(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
372 addsub_avx2(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
373 addsub_avx2(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
374 addsub_avx2(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
375 addsub_avx2(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
376 addsub_avx2(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
377 addsub_avx2(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
378 addsub_avx2(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
379 }
380
static inline void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32,
382 const __m256i *cospi32,
383 const __m256i *clamp_lo,
384 const __m256i *clamp_hi,
385 const __m256i *rounding, int bit) {
386 __m256i temp1, temp2;
387 addsub_avx2(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
388 addsub_avx2(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
389 addsub_avx2(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
390 addsub_avx2(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
391 addsub_avx2(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
392 addsub_avx2(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
393 addsub_avx2(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
394 addsub_avx2(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
395
396 temp1 = half_btf_avx2(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
397 bf1[27] = half_btf_avx2(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
398 bf1[20] = temp1;
399 temp2 = half_btf_avx2(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
400 bf1[26] = half_btf_avx2(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
401 bf1[21] = temp2;
402 temp1 = half_btf_avx2(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
403 bf1[25] = half_btf_avx2(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
404 bf1[22] = temp1;
405 temp2 = half_btf_avx2(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
406 bf1[24] = half_btf_avx2(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
407 bf1[23] = temp2;
408 }
409
static inline void idct32_stage9_avx2(__m256i *bf1, __m256i *out,
411 const int do_cols, const int bd,
412 const int out_shift,
413 const __m256i *clamp_lo,
414 const __m256i *clamp_hi) {
415 addsub_avx2(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi);
416 addsub_avx2(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi);
417 addsub_avx2(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi);
418 addsub_avx2(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi);
419 addsub_avx2(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi);
420 addsub_avx2(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi);
421 addsub_avx2(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi);
422 addsub_avx2(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi);
423 addsub_avx2(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi);
424 addsub_avx2(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi);
425 addsub_avx2(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi);
426 addsub_avx2(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi);
427 addsub_avx2(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi);
428 addsub_avx2(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi);
429 addsub_avx2(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi);
430 addsub_avx2(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi);
431 if (!do_cols) {
432 const int log_range_out = AOMMAX(16, bd + 6);
433 const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
434 const __m256i clamp_hi_out =
435 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
436 round_shift_8x8_avx2(out, out_shift);
437 round_shift_8x8_avx2(out + 16, out_shift);
438 highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
439 }
440 }
441
static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
443 int bd, int out_shift) {
444 const int32_t *cospi = cospi_arr(bit);
445 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
446 const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
447 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
448 __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
449 __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
450 __m256i x;
451 // stage 0
452 // stage 1
453 // stage 2
454 // stage 3
455 // stage 4
456 // stage 5
457 x = _mm256_mullo_epi32(in[0], cospi32);
458 x = _mm256_add_epi32(x, rounding);
459 x = _mm256_srai_epi32(x, bit);
460
461 // stage 6
462 // stage 7
463 // stage 8
464 // stage 9
465 if (!do_cols) {
466 const int log_range_out = AOMMAX(16, bd + 6);
467 __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
468 clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
469 clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
470 x = _mm256_add_epi32(offset, x);
471 x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
472 }
473 x = _mm256_max_epi32(x, clamp_lo);
474 x = _mm256_min_epi32(x, clamp_hi);
475 out[0] = x;
476 out[1] = x;
477 out[2] = x;
478 out[3] = x;
479 out[4] = x;
480 out[5] = x;
481 out[6] = x;
482 out[7] = x;
483 out[8] = x;
484 out[9] = x;
485 out[10] = x;
486 out[11] = x;
487 out[12] = x;
488 out[13] = x;
489 out[14] = x;
490 out[15] = x;
491 out[16] = x;
492 out[17] = x;
493 out[18] = x;
494 out[19] = x;
495 out[20] = x;
496 out[21] = x;
497 out[22] = x;
498 out[23] = x;
499 out[24] = x;
500 out[25] = x;
501 out[26] = x;
502 out[27] = x;
503 out[28] = x;
504 out[29] = x;
505 out[30] = x;
506 out[31] = x;
507 }
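
// Sketch of the low1 path above (illustrative only): with only the DC
// coefficient nonzero, every output register holds the same value,
//   out[i] = clamp((in[0] * cospi[32] + (1 << (bit - 1))) >> bit),
// with the additional out_shift rounding applied in the row pass (!do_cols).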
508
static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
510 int bd, int out_shift) {
511 const int32_t *cospi = cospi_arr(bit);
512 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
513 const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
514 const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
515 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
516 const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
517 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
518 const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
519 const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
520 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
521 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
522 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
523 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
524 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
525 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
526 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
527 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
528 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
529 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
530 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
531 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
532 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
533 const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
534 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
535 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
536 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
537 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
538 const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
539 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
540 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
541 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
542 __m256i bf1[32];
543
544 {
545 // stage 0
546 // stage 1
547 bf1[0] = in[0];
548 bf1[4] = in[4];
549 bf1[8] = in[2];
550 bf1[12] = in[6];
551 bf1[16] = in[1];
552 bf1[20] = in[5];
553 bf1[24] = in[3];
554 bf1[28] = in[7];
555
556 // stage 2
557 bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
558 bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
559 bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
560 bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
561 bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
562 bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
563 bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
564 bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
565
566 // stage 3
567 bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
568 bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
569
570 bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
571 bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
572 bf1[17] = bf1[16];
573 bf1[18] = bf1[19];
574 bf1[21] = bf1[20];
575 bf1[22] = bf1[23];
576 bf1[25] = bf1[24];
577 bf1[26] = bf1[27];
578 bf1[29] = bf1[28];
579 bf1[30] = bf1[31];
580
581 // stage 4
582 bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
583 bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
584
585 bf1[9] = bf1[8];
586 bf1[10] = bf1[11];
587 bf1[13] = bf1[12];
588 bf1[14] = bf1[15];
589
590 idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
591 &cospi24, &cospi40, &cospim24, &rounding, bit);
592
593 // stage 5
594 bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
595 bf1[1] = bf1[0];
596 bf1[5] = bf1[4];
597 bf1[6] = bf1[7];
598
599 idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
600 &clamp_hi, &rounding, bit);
601
602 // stage 6
603 bf1[3] = bf1[0];
604 bf1[2] = bf1[1];
605
606 idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
607 &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
608
609 // stage 7
610 idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
611 &rounding, bit);
612
613 // stage 8
614 idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
615 &rounding, bit);
616
617 // stage 9
618 idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
619 }
620 }
621
static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
623 int bd, int out_shift) {
624 const int32_t *cospi = cospi_arr(bit);
625 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
626 const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
627 const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
628 const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
629 const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
630 const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
631 const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
632 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
633 const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
634 const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
635 const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
636 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
637 const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
638 const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
639 const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
640 const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
641 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
642 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
643 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
644 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
645 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
646 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
647 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
648 const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
649 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
650 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
651 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
652 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
653 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
654 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
655 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
656 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
657 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
658 const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
659 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
660 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
661 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
662 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
663 const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
664 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
665 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
666 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
667 __m256i bf1[32];
668
669 {
670 // stage 0
671 // stage 1
672 bf1[0] = in[0];
673 bf1[2] = in[8];
674 bf1[4] = in[4];
675 bf1[6] = in[12];
676 bf1[8] = in[2];
677 bf1[10] = in[10];
678 bf1[12] = in[6];
679 bf1[14] = in[14];
680 bf1[16] = in[1];
681 bf1[18] = in[9];
682 bf1[20] = in[5];
683 bf1[22] = in[13];
684 bf1[24] = in[3];
685 bf1[26] = in[11];
686 bf1[28] = in[7];
687 bf1[30] = in[15];
688
689 // stage 2
690 bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
691 bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
692 bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit);
693 bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit);
694 bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit);
695 bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit);
696 bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
697 bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
698 bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
699 bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
700 bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit);
701 bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit);
702 bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit);
703 bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit);
704 bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
705 bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
706
707 // stage 3
708 bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
709 bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
710 bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit);
711 bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit);
712 bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit);
713 bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit);
714 bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
715 bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
716
717 addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
718 addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
719 addsub_avx2(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
720 addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
721 addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
722 addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
723 addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
724 addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
725
726 // stage 4
727 bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
728 bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
729 bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit);
730 bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit);
731
732 addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
733 addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
734 addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
735 addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
736
737 idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
738 &cospi24, &cospi40, &cospim24, &rounding, bit);
739
740 // stage 5
741 bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
742 bf1[1] = bf1[0];
743 bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit);
744 bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit);
745
746 addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
747 addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
748
749 idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
750 &clamp_hi, &rounding, bit);
751
752 // stage 6
753 addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
754 addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
755
756 idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
757 &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
758
759 // stage 7
760 idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
761 &rounding, bit);
762
763 // stage 8
764 idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
765 &rounding, bit);
766
767 // stage 9
768 idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
769 }
770 }
771
static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
773 int out_shift) {
774 const int32_t *cospi = cospi_arr(bit);
775 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
776 const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
777 const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
778 const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
779 const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
780 const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
781 const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
782 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
783 const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
784 const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
785 const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
786 const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
787 const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
788 const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
789 const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
790 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
791 const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
792 const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
793 const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
794 const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
795 const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
796 const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
797 const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
798 const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
799 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
800 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
801 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
802 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
803 const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
804 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
805 const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
806 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
807 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
808 const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
809 const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
810 const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
811 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
812 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
813 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
814 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
815 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
816 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
817 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
818 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
819 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
820 const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
821 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
822 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
823 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
824 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
825 const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
826 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
827 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
828 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
829 __m256i bf1[32], bf0[32];
830
831 {
832 // stage 0
833 // stage 1
834 bf1[0] = in[0];
835 bf1[1] = in[16];
836 bf1[2] = in[8];
837 bf1[3] = in[24];
838 bf1[4] = in[4];
839 bf1[5] = in[20];
840 bf1[6] = in[12];
841 bf1[7] = in[28];
842 bf1[8] = in[2];
843 bf1[9] = in[18];
844 bf1[10] = in[10];
845 bf1[11] = in[26];
846 bf1[12] = in[6];
847 bf1[13] = in[22];
848 bf1[14] = in[14];
849 bf1[15] = in[30];
850 bf1[16] = in[1];
851 bf1[17] = in[17];
852 bf1[18] = in[9];
853 bf1[19] = in[25];
854 bf1[20] = in[5];
855 bf1[21] = in[21];
856 bf1[22] = in[13];
857 bf1[23] = in[29];
858 bf1[24] = in[3];
859 bf1[25] = in[19];
860 bf1[26] = in[11];
861 bf1[27] = in[27];
862 bf1[28] = in[7];
863 bf1[29] = in[23];
864 bf1[30] = in[15];
865 bf1[31] = in[31];
866
867 // stage 2
868 bf0[0] = bf1[0];
869 bf0[1] = bf1[1];
870 bf0[2] = bf1[2];
871 bf0[3] = bf1[3];
872 bf0[4] = bf1[4];
873 bf0[5] = bf1[5];
874 bf0[6] = bf1[6];
875 bf0[7] = bf1[7];
876 bf0[8] = bf1[8];
877 bf0[9] = bf1[9];
878 bf0[10] = bf1[10];
879 bf0[11] = bf1[11];
880 bf0[12] = bf1[12];
881 bf0[13] = bf1[13];
882 bf0[14] = bf1[14];
883 bf0[15] = bf1[15];
884 bf0[16] =
885 half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
886 bf0[17] =
887 half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
888 bf0[18] =
889 half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
890 bf0[19] =
891 half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
892 bf0[20] =
893 half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
894 bf0[21] =
895 half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
896 bf0[22] =
897 half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
898 bf0[23] =
899 half_btf_avx2(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
900 bf0[24] =
901 half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
902 bf0[25] =
903 half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
904 bf0[26] =
905 half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
906 bf0[27] =
907 half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
908 bf0[28] =
909 half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
910 bf0[29] =
911 half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
912 bf0[30] =
913 half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
914 bf0[31] =
915 half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
916
917 // stage 3
918 bf1[0] = bf0[0];
919 bf1[1] = bf0[1];
920 bf1[2] = bf0[2];
921 bf1[3] = bf0[3];
922 bf1[4] = bf0[4];
923 bf1[5] = bf0[5];
924 bf1[6] = bf0[6];
925 bf1[7] = bf0[7];
926 bf1[8] =
927 half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
928 bf1[9] =
929 half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
930 bf1[10] =
931 half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
932 bf1[11] =
933 half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
934 bf1[12] =
935 half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
936 bf1[13] =
937 half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
938 bf1[14] =
939 half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
940 bf1[15] =
941 half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
942
943 addsub_avx2(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
944 addsub_avx2(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
945 addsub_avx2(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
946 addsub_avx2(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
947 addsub_avx2(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
948 addsub_avx2(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
949 addsub_avx2(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
950 addsub_avx2(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
951
952 // stage 4
953 bf0[0] = bf1[0];
954 bf0[1] = bf1[1];
955 bf0[2] = bf1[2];
956 bf0[3] = bf1[3];
957 bf0[4] =
958 half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
959 bf0[5] =
960 half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
961 bf0[6] =
962 half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
963 bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
964
965 addsub_avx2(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
966 addsub_avx2(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
967 addsub_avx2(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
968 addsub_avx2(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
969
970 bf0[16] = bf1[16];
971 bf0[17] =
972 half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
973 bf0[18] =
974 half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
975 bf0[19] = bf1[19];
976 bf0[20] = bf1[20];
977 bf0[21] =
978 half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
979 bf0[22] =
980 half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
981 bf0[23] = bf1[23];
982 bf0[24] = bf1[24];
983 bf0[25] =
984 half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
985 bf0[26] =
986 half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
987 bf0[27] = bf1[27];
988 bf0[28] = bf1[28];
989 bf0[29] =
990 half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
991 bf0[30] =
992 half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
993 bf0[31] = bf1[31];
994
995 // stage 5
996 bf1[0] =
997 half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
998 bf1[1] =
999 half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
1000 bf1[2] =
1001 half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
1002 bf1[3] =
1003 half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
1004 addsub_avx2(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
1005 addsub_avx2(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
1006 bf1[8] = bf0[8];
1007 bf1[9] =
1008 half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
1009 bf1[10] =
1010 half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
1011 bf1[11] = bf0[11];
1012 bf1[12] = bf0[12];
1013 bf1[13] =
1014 half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
1015 bf1[14] =
1016 half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
1017 bf1[15] = bf0[15];
1018 addsub_avx2(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
1019 addsub_avx2(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
1020 addsub_avx2(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
1021 addsub_avx2(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
1022 addsub_avx2(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
1023 addsub_avx2(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
1024 addsub_avx2(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
1025 addsub_avx2(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
1026
1027 // stage 6
1028 addsub_avx2(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
1029 addsub_avx2(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
1030 bf0[4] = bf1[4];
1031 bf0[5] =
1032 half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
1033 bf0[6] =
1034 half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
1035 bf0[7] = bf1[7];
1036 addsub_avx2(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
1037 addsub_avx2(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
1038 addsub_avx2(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
1039 addsub_avx2(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
1040 bf0[16] = bf1[16];
1041 bf0[17] = bf1[17];
1042 bf0[18] =
1043 half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
1044 bf0[19] =
1045 half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
1046 bf0[20] =
1047 half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
1048 bf0[21] =
1049 half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
1050 bf0[22] = bf1[22];
1051 bf0[23] = bf1[23];
1052 bf0[24] = bf1[24];
1053 bf0[25] = bf1[25];
1054 bf0[26] =
1055 half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
1056 bf0[27] =
1057 half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
1058 bf0[28] =
1059 half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
1060 bf0[29] =
1061 half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
1062 bf0[30] = bf1[30];
1063 bf0[31] = bf1[31];
1064
1065 // stage 7
1066 addsub_avx2(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
1067 addsub_avx2(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
1068 addsub_avx2(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
1069 addsub_avx2(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
1070 bf1[8] = bf0[8];
1071 bf1[9] = bf0[9];
1072 bf1[10] =
1073 half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
1074 bf1[11] =
1075 half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
1076 bf1[12] =
1077 half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
1078 bf1[13] =
1079 half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
1080 bf1[14] = bf0[14];
1081 bf1[15] = bf0[15];
1082 addsub_avx2(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
1083 addsub_avx2(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
1084 addsub_avx2(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
1085 addsub_avx2(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
1086 addsub_avx2(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
1087 addsub_avx2(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
1088 addsub_avx2(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
1089 addsub_avx2(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
1090
1091 // stage 8
1092 addsub_avx2(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
1093 addsub_avx2(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
1094 addsub_avx2(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
1095 addsub_avx2(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
1096 addsub_avx2(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
1097 addsub_avx2(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
1098 addsub_avx2(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
1099 addsub_avx2(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
1100 bf0[16] = bf1[16];
1101 bf0[17] = bf1[17];
1102 bf0[18] = bf1[18];
1103 bf0[19] = bf1[19];
1104 bf0[20] =
1105 half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
1106 bf0[21] =
1107 half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
1108 bf0[22] =
1109 half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
1110 bf0[23] =
1111 half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
1112 bf0[24] =
1113 half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
1114 bf0[25] =
1115 half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
1116 bf0[26] =
1117 half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
1118 bf0[27] =
1119 half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
1120 bf0[28] = bf1[28];
1121 bf0[29] = bf1[29];
1122 bf0[30] = bf1[30];
1123 bf0[31] = bf1[31];
1124
1125 // stage 9
1126 addsub_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
1127 addsub_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
1128 addsub_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
1129 addsub_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
1130 addsub_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
1131 addsub_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
1132 addsub_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
1133 addsub_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
1134 addsub_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
1135 addsub_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
1136 addsub_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
1137 addsub_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
1138 addsub_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
1139 addsub_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
1140 addsub_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
1141 addsub_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
1142 if (!do_cols) {
1143 const int log_range_out = AOMMAX(16, bd + 6);
1144 const __m256i clamp_lo_out =
1145 _mm256_set1_epi32(-(1 << (log_range_out - 1)));
1146 const __m256i clamp_hi_out =
1147 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
1148 round_shift_8x8_avx2(out, out_shift);
1149 round_shift_8x8_avx2(out + 16, out_shift);
1150 highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 32);
1151 }
1152 }
1153 }

static void idct16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
1155 int bd, int out_shift) {
1156 const int32_t *cospi = cospi_arr(bit);
1157 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1158 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1159 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1160 __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
1161 __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
1162
1163 {
1164 // stage 0
1165 // stage 1
1166 // stage 2
1167 // stage 3
1168 // stage 4
1169 in[0] = _mm256_mullo_epi32(in[0], cospi32);
1170 in[0] = _mm256_add_epi32(in[0], rnding);
1171 in[0] = _mm256_srai_epi32(in[0], bit);
1172
1173 // stage 5
1174 // stage 6
1175 // stage 7
1176 if (!do_cols) {
1177 const int log_range_out = AOMMAX(16, bd + 6);
1178 clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
1179 clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
1180 __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
1181 in[0] = _mm256_add_epi32(in[0], offset);
1182 in[0] = _mm256_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
1183 }
1184 in[0] = _mm256_max_epi32(in[0], clamp_lo);
1185 in[0] = _mm256_min_epi32(in[0], clamp_hi);
1186 out[0] = in[0];
1187 out[1] = in[0];
1188 out[2] = in[0];
1189 out[3] = in[0];
1190 out[4] = in[0];
1191 out[5] = in[0];
1192 out[6] = in[0];
1193 out[7] = in[0];
1194 out[8] = in[0];
1195 out[9] = in[0];
1196 out[10] = in[0];
1197 out[11] = in[0];
1198 out[12] = in[0];
1199 out[13] = in[0];
1200 out[14] = in[0];
1201 out[15] = in[0];
1202 }
1203 }
1204
static void idct16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
1206 int bd, int out_shift) {
1207 const int32_t *cospi = cospi_arr(bit);
1208 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
1209 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
1210 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
1211 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
1212 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
1213 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
1214 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1215 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
1216 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
1217 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1218 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1219 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1220 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1221 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
1222 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
1223 const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
1224 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
1225 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1226 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1227 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
1228 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
1229 __m256i u[16], x, y;
1230
1231 {
1232 // stage 0
1233 // stage 1
1234 u[0] = in[0];
1235 u[2] = in[4];
1236 u[4] = in[2];
1237 u[6] = in[6];
1238 u[8] = in[1];
1239 u[10] = in[5];
1240 u[12] = in[3];
1241 u[14] = in[7];
1242
1243 // stage 2
1244 u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
1245 u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
1246
1247 u[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
1248 u[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
1249
1250 u[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
1251 u[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
1252
1253 u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
1254 u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
1255
1256 // stage 3
1257 u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
1258 u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
1259 u[5] = half_btf_0_avx2(&cospim40, &u[6], &rnding, bit);
1260 u[6] = half_btf_0_avx2(&cospi24, &u[6], &rnding, bit);
1261
1262 addsub_avx2(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
1263 addsub_avx2(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
1264 addsub_avx2(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
1265 addsub_avx2(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
1266
1267 // stage 4
1268 x = _mm256_mullo_epi32(u[0], cospi32);
1269 u[0] = _mm256_add_epi32(x, rnding);
1270 u[0] = _mm256_srai_epi32(u[0], bit);
1271 u[1] = u[0];
1272
1273 u[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
1274 u[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
1275
1276 addsub_avx2(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
1277 addsub_avx2(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
1278
1279 x = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
1280 u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
1281 u[9] = x;
1282 y = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
1283 u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
1284 u[10] = y;
1285
1286 // stage 5
1287 addsub_avx2(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
1288 addsub_avx2(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
1289
1290 x = _mm256_mullo_epi32(u[5], cospi32);
1291 y = _mm256_mullo_epi32(u[6], cospi32);
1292 u[5] = _mm256_sub_epi32(y, x);
1293 u[5] = _mm256_add_epi32(u[5], rnding);
1294 u[5] = _mm256_srai_epi32(u[5], bit);
1295
1296 u[6] = _mm256_add_epi32(y, x);
1297 u[6] = _mm256_add_epi32(u[6], rnding);
1298 u[6] = _mm256_srai_epi32(u[6], bit);
1299
1300 addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
1301 addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
1302 addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
1303 addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
1304
1305 // stage 6
1306 addsub_avx2(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
1307 addsub_avx2(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
1308 addsub_avx2(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
1309 addsub_avx2(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
1310
1311 x = _mm256_mullo_epi32(u[10], cospi32);
1312 y = _mm256_mullo_epi32(u[13], cospi32);
1313 u[10] = _mm256_sub_epi32(y, x);
1314 u[10] = _mm256_add_epi32(u[10], rnding);
1315 u[10] = _mm256_srai_epi32(u[10], bit);
1316
1317 u[13] = _mm256_add_epi32(x, y);
1318 u[13] = _mm256_add_epi32(u[13], rnding);
1319 u[13] = _mm256_srai_epi32(u[13], bit);
1320
1321 x = _mm256_mullo_epi32(u[11], cospi32);
1322 y = _mm256_mullo_epi32(u[12], cospi32);
1323 u[11] = _mm256_sub_epi32(y, x);
1324 u[11] = _mm256_add_epi32(u[11], rnding);
1325 u[11] = _mm256_srai_epi32(u[11], bit);
1326
1327 u[12] = _mm256_add_epi32(x, y);
1328 u[12] = _mm256_add_epi32(u[12], rnding);
1329 u[12] = _mm256_srai_epi32(u[12], bit);
1330 // stage 7
1331 addsub_avx2(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
1332 addsub_avx2(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
1333 addsub_avx2(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
1334 addsub_avx2(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
1335 addsub_avx2(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
1336 addsub_avx2(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
1337 addsub_avx2(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
1338 addsub_avx2(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
1339
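// Row pass only: apply the per-pass rounding shift and clamp the results to
// the intermediate range of AOMMAX(16, bd + 6) bits.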
1340 if (!do_cols) {
1341 const int log_range_out = AOMMAX(16, bd + 6);
1342 const __m256i clamp_lo_out =
1343 _mm256_set1_epi32(-(1 << (log_range_out - 1)));
1344 const __m256i clamp_hi_out =
1345 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
1346 round_shift_8x8_avx2(out, out_shift);
1347 highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
1348 }
1349 }
1350 }
1351
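// Full 16-point inverse DCT over 8 lanes at a time. Stage 1 loads the inputs
// in bit-reversed order; stages 2-7 are the usual butterfly network built
// from half_btf_avx2 rotations and clamped add/sub pairs; the row pass
// (!do_cols) finishes with a rounding shift and a tighter clamp.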
1352 static void idct16_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
1353 int out_shift) {
1354 const int32_t *cospi = cospi_arr(bit);
1355 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
1356 const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
1357 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
1358 const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
1359 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
1360 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
1361 const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
1362 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
1363 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
1364 const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
1365 const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
1366 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
1367 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1368 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
1369 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
1370 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
1371 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
1372 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1373 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1374 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1375 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1376 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
1377 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
1378 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1379 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1380 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
1381 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
1382 __m256i u[16], v[16], x, y;
1383
1384 {
1385 // stage 0
1386 // stage 1
1387 u[0] = in[0];
1388 u[1] = in[8];
1389 u[2] = in[4];
1390 u[3] = in[12];
1391 u[4] = in[2];
1392 u[5] = in[10];
1393 u[6] = in[6];
1394 u[7] = in[14];
1395 u[8] = in[1];
1396 u[9] = in[9];
1397 u[10] = in[5];
1398 u[11] = in[13];
1399 u[12] = in[3];
1400 u[13] = in[11];
1401 u[14] = in[7];
1402 u[15] = in[15];
1403
1404 // stage 2
1405 v[0] = u[0];
1406 v[1] = u[1];
1407 v[2] = u[2];
1408 v[3] = u[3];
1409 v[4] = u[4];
1410 v[5] = u[5];
1411 v[6] = u[6];
1412 v[7] = u[7];
1413
1414 v[8] = half_btf_avx2(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
1415 v[9] = half_btf_avx2(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
1416 v[10] = half_btf_avx2(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
1417 v[11] = half_btf_avx2(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
1418 v[12] = half_btf_avx2(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
1419 v[13] = half_btf_avx2(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
1420 v[14] = half_btf_avx2(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
1421 v[15] = half_btf_avx2(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
1422
1423 // stage 3
1424 u[0] = v[0];
1425 u[1] = v[1];
1426 u[2] = v[2];
1427 u[3] = v[3];
1428 u[4] = half_btf_avx2(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
1429 u[5] = half_btf_avx2(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
1430 u[6] = half_btf_avx2(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
1431 u[7] = half_btf_avx2(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
1432 addsub_avx2(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
1433 addsub_avx2(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
1434 addsub_avx2(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
1435 addsub_avx2(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
1436
1437 // stage 4
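// Even-part butterfly: v[0] = (u[0] + u[1]) * cospi32 and
// v[1] = (u[0] - u[1]) * cospi32, each rounded and shifted by 'bit'.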
1438 x = _mm256_mullo_epi32(u[0], cospi32);
1439 y = _mm256_mullo_epi32(u[1], cospi32);
1440 v[0] = _mm256_add_epi32(x, y);
1441 v[0] = _mm256_add_epi32(v[0], rnding);
1442 v[0] = _mm256_srai_epi32(v[0], bit);
1443
1444 v[1] = _mm256_sub_epi32(x, y);
1445 v[1] = _mm256_add_epi32(v[1], rnding);
1446 v[1] = _mm256_srai_epi32(v[1], bit);
1447
1448 v[2] = half_btf_avx2(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
1449 v[3] = half_btf_avx2(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
1450 addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
1451 addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
1452 v[8] = u[8];
1453 v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
1454 v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
1455 v[11] = u[11];
1456 v[12] = u[12];
1457 v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
1458 v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
1459 v[15] = u[15];
1460
1461 // stage 5
1462 addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
1463 addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
1464 u[4] = v[4];
1465
1466 x = _mm256_mullo_epi32(v[5], cospi32);
1467 y = _mm256_mullo_epi32(v[6], cospi32);
1468 u[5] = _mm256_sub_epi32(y, x);
1469 u[5] = _mm256_add_epi32(u[5], rnding);
1470 u[5] = _mm256_srai_epi32(u[5], bit);
1471
1472 u[6] = _mm256_add_epi32(y, x);
1473 u[6] = _mm256_add_epi32(u[6], rnding);
1474 u[6] = _mm256_srai_epi32(u[6], bit);
1475
1476 u[7] = v[7];
1477 addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
1478 addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
1479 addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
1480 addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
1481
1482 // stage 6
1483 addsub_avx2(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
1484 addsub_avx2(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
1485 addsub_avx2(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
1486 addsub_avx2(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
1487 v[8] = u[8];
1488 v[9] = u[9];
1489
1490 x = _mm256_mullo_epi32(u[10], cospi32);
1491 y = _mm256_mullo_epi32(u[13], cospi32);
1492 v[10] = _mm256_sub_epi32(y, x);
1493 v[10] = _mm256_add_epi32(v[10], rnding);
1494 v[10] = _mm256_srai_epi32(v[10], bit);
1495
1496 v[13] = _mm256_add_epi32(x, y);
1497 v[13] = _mm256_add_epi32(v[13], rnding);
1498 v[13] = _mm256_srai_epi32(v[13], bit);
1499
1500 x = _mm256_mullo_epi32(u[11], cospi32);
1501 y = _mm256_mullo_epi32(u[12], cospi32);
1502 v[11] = _mm256_sub_epi32(y, x);
1503 v[11] = _mm256_add_epi32(v[11], rnding);
1504 v[11] = _mm256_srai_epi32(v[11], bit);
1505
1506 v[12] = _mm256_add_epi32(x, y);
1507 v[12] = _mm256_add_epi32(v[12], rnding);
1508 v[12] = _mm256_srai_epi32(v[12], bit);
1509
1510 v[14] = u[14];
1511 v[15] = u[15];
1512
1513 // stage 7
1514 addsub_avx2(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
1515 addsub_avx2(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
1516 addsub_avx2(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
1517 addsub_avx2(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
1518 addsub_avx2(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
1519 addsub_avx2(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
1520 addsub_avx2(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
1521 addsub_avx2(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
1522
1523 if (!do_cols) {
1524 const int log_range_out = AOMMAX(16, bd + 6);
1525 const __m256i clamp_lo_out =
1526 _mm256_set1_epi32(-(1 << (log_range_out - 1)));
1527 const __m256i clamp_hi_out =
1528 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
1529 round_shift_8x8_avx2(out, out_shift);
1530 highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 16);
1531 }
1532 }
1533 }
1534
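// 16-point inverse ADST for the case where only the first input coefficient
// is nonzero: the stage-2 products reduce to scalings of in[0], and the
// later stages only copy or rotate those two values.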
1535 static void iadst16_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
1536 int bd, int out_shift) {
1537 const int32_t *cospi = cospi_arr(bit);
1538 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
1539 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
1540 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1541 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1542 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1543 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1544 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1545 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1546 const __m256i zero = _mm256_setzero_si256();
1547 __m256i v[16], x, y, temp1, temp2;
1548
1549 // Only in[0] is nonzero in this path, so a single pass produces all 16 outputs
1550 {
1551 // stage 0
1552 // stage 1
1553 // stage 2
1554 x = _mm256_mullo_epi32(in[0], cospi62);
1555 v[0] = _mm256_add_epi32(x, rnding);
1556 v[0] = _mm256_srai_epi32(v[0], bit);
1557
1558 x = _mm256_mullo_epi32(in[0], cospi2);
1559 v[1] = _mm256_sub_epi32(zero, x);
1560 v[1] = _mm256_add_epi32(v[1], rnding);
1561 v[1] = _mm256_srai_epi32(v[1], bit);
1562
1563 // stage 3
1564 v[8] = v[0];
1565 v[9] = v[1];
1566
1567 // stage 4
1568 temp1 = _mm256_mullo_epi32(v[8], cospi8);
1569 x = _mm256_mullo_epi32(v[9], cospi56);
1570 temp1 = _mm256_add_epi32(temp1, x);
1571 temp1 = _mm256_add_epi32(temp1, rnding);
1572 temp1 = _mm256_srai_epi32(temp1, bit);
1573
1574 temp2 = _mm256_mullo_epi32(v[8], cospi56);
1575 x = _mm256_mullo_epi32(v[9], cospi8);
1576 temp2 = _mm256_sub_epi32(temp2, x);
1577 temp2 = _mm256_add_epi32(temp2, rnding);
1578 temp2 = _mm256_srai_epi32(temp2, bit);
1579 v[8] = temp1;
1580 v[9] = temp2;
1581
1582 // stage 5
1583 v[4] = v[0];
1584 v[5] = v[1];
1585 v[12] = v[8];
1586 v[13] = v[9];
1587
1588 // stage 6
1589 temp1 = _mm256_mullo_epi32(v[4], cospi16);
1590 x = _mm256_mullo_epi32(v[5], cospi48);
1591 temp1 = _mm256_add_epi32(temp1, x);
1592 temp1 = _mm256_add_epi32(temp1, rnding);
1593 temp1 = _mm256_srai_epi32(temp1, bit);
1594
1595 temp2 = _mm256_mullo_epi32(v[4], cospi48);
1596 x = _mm256_mullo_epi32(v[5], cospi16);
1597 temp2 = _mm256_sub_epi32(temp2, x);
1598 temp2 = _mm256_add_epi32(temp2, rnding);
1599 temp2 = _mm256_srai_epi32(temp2, bit);
1600 v[4] = temp1;
1601 v[5] = temp2;
1602
1603 temp1 = _mm256_mullo_epi32(v[12], cospi16);
1604 x = _mm256_mullo_epi32(v[13], cospi48);
1605 temp1 = _mm256_add_epi32(temp1, x);
1606 temp1 = _mm256_add_epi32(temp1, rnding);
1607 temp1 = _mm256_srai_epi32(temp1, bit);
1608
1609 temp2 = _mm256_mullo_epi32(v[12], cospi48);
1610 x = _mm256_mullo_epi32(v[13], cospi16);
1611 temp2 = _mm256_sub_epi32(temp2, x);
1612 temp2 = _mm256_add_epi32(temp2, rnding);
1613 temp2 = _mm256_srai_epi32(temp2, bit);
1614 v[12] = temp1;
1615 v[13] = temp2;
1616
1617 // stage 7
1618 v[2] = v[0];
1619 v[3] = v[1];
1620 v[6] = v[4];
1621 v[7] = v[5];
1622 v[10] = v[8];
1623 v[11] = v[9];
1624 v[14] = v[12];
1625 v[15] = v[13];
1626
1627 // stage 8
1628 y = _mm256_mullo_epi32(v[2], cospi32);
1629 x = _mm256_mullo_epi32(v[3], cospi32);
1630 v[2] = _mm256_add_epi32(y, x);
1631 v[2] = _mm256_add_epi32(v[2], rnding);
1632 v[2] = _mm256_srai_epi32(v[2], bit);
1633
1634 v[3] = _mm256_sub_epi32(y, x);
1635 v[3] = _mm256_add_epi32(v[3], rnding);
1636 v[3] = _mm256_srai_epi32(v[3], bit);
1637
1638 y = _mm256_mullo_epi32(v[6], cospi32);
1639 x = _mm256_mullo_epi32(v[7], cospi32);
1640 v[6] = _mm256_add_epi32(y, x);
1641 v[6] = _mm256_add_epi32(v[6], rnding);
1642 v[6] = _mm256_srai_epi32(v[6], bit);
1643
1644 v[7] = _mm256_sub_epi32(y, x);
1645 v[7] = _mm256_add_epi32(v[7], rnding);
1646 v[7] = _mm256_srai_epi32(v[7], bit);
1647
1648 y = _mm256_mullo_epi32(v[10], cospi32);
1649 x = _mm256_mullo_epi32(v[11], cospi32);
1650 v[10] = _mm256_add_epi32(y, x);
1651 v[10] = _mm256_add_epi32(v[10], rnding);
1652 v[10] = _mm256_srai_epi32(v[10], bit);
1653
1654 v[11] = _mm256_sub_epi32(y, x);
1655 v[11] = _mm256_add_epi32(v[11], rnding);
1656 v[11] = _mm256_srai_epi32(v[11], bit);
1657
1658 y = _mm256_mullo_epi32(v[14], cospi32);
1659 x = _mm256_mullo_epi32(v[15], cospi32);
1660 v[14] = _mm256_add_epi32(y, x);
1661 v[14] = _mm256_add_epi32(v[14], rnding);
1662 v[14] = _mm256_srai_epi32(v[14], bit);
1663
1664 v[15] = _mm256_sub_epi32(y, x);
1665 v[15] = _mm256_add_epi32(v[15], rnding);
1666 v[15] = _mm256_srai_epi32(v[15], bit);
1667
1668 // stage 9
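// Final ADST output stage: the intermediate values are written out in a
// fixed permutation with every other output negated. The row pass (else
// branch) uses neg_shift_avx2 to fold that negation into the rounding shift
// and output clamp, mirroring the do_cols assignments above.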
1669 if (do_cols) {
1670 out[0] = v[0];
1671 out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
1672 out[2] = v[12];
1673 out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
1674 out[4] = v[6];
1675 out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
1676 out[6] = v[10];
1677 out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
1678 out[8] = v[3];
1679 out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
1680 out[10] = v[15];
1681 out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
1682 out[12] = v[5];
1683 out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
1684 out[14] = v[9];
1685 out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
1686 } else {
1687 const int log_range_out = AOMMAX(16, bd + 6);
1688 const __m256i clamp_lo_out =
1689 _mm256_set1_epi32(-(1 << (log_range_out - 1)));
1690 const __m256i clamp_hi_out =
1691 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
1692
1693 neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
1694 out_shift);
1695 neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
1696 &clamp_hi_out, out_shift);
1697 neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
1698 &clamp_hi_out, out_shift);
1699 neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
1700 &clamp_hi_out, out_shift);
1701 neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
1702 &clamp_hi_out, out_shift);
1703 neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
1704 &clamp_hi_out, out_shift);
1705 neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
1706 &clamp_hi_out, out_shift);
1707 neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
1708 &clamp_hi_out, out_shift);
1709 }
1710 }
1711 }
1712
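// 16-point inverse ADST when only the first eight input coefficients are
// nonzero: each stage-2 rotation drops the term that would come from
// in[8..15], leaving single products, and the remaining stages match the
// full iadst16 below.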
1713 static void iadst16_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
1714 int bd, int out_shift) {
1715 const int32_t *cospi = cospi_arr(bit);
1716 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
1717 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
1718 const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
1719 const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
1720 const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
1721 const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
1722 const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
1723 const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
1724 const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
1725 const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
1726 const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
1727 const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
1728 const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
1729 const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
1730 const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
1731 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
1732 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
1733 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
1734 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
1735 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
1736 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
1737 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
1738 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
1739 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
1740 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
1741 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
1742 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
1743 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
1744 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
1745 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
1746 __m256i u[16], x, y;
1747
1748 {
1749 // stage 0
1750 // stage 1
1751 // stage 2
1752 __m256i zero = _mm256_setzero_si256();
1753 x = _mm256_mullo_epi32(in[0], cospi62);
1754 u[0] = _mm256_add_epi32(x, rnding);
1755 u[0] = _mm256_srai_epi32(u[0], bit);
1756
1757 x = _mm256_mullo_epi32(in[0], cospi2);
1758 u[1] = _mm256_sub_epi32(zero, x);
1759 u[1] = _mm256_add_epi32(u[1], rnding);
1760 u[1] = _mm256_srai_epi32(u[1], bit);
1761
1762 x = _mm256_mullo_epi32(in[2], cospi54);
1763 u[2] = _mm256_add_epi32(x, rnding);
1764 u[2] = _mm256_srai_epi32(u[2], bit);
1765
1766 x = _mm256_mullo_epi32(in[2], cospi10);
1767 u[3] = _mm256_sub_epi32(zero, x);
1768 u[3] = _mm256_add_epi32(u[3], rnding);
1769 u[3] = _mm256_srai_epi32(u[3], bit);
1770
1771 x = _mm256_mullo_epi32(in[4], cospi46);
1772 u[4] = _mm256_add_epi32(x, rnding);
1773 u[4] = _mm256_srai_epi32(u[4], bit);
1774
1775 x = _mm256_mullo_epi32(in[4], cospi18);
1776 u[5] = _mm256_sub_epi32(zero, x);
1777 u[5] = _mm256_add_epi32(u[5], rnding);
1778 u[5] = _mm256_srai_epi32(u[5], bit);
1779
1780 x = _mm256_mullo_epi32(in[6], cospi38);
1781 u[6] = _mm256_add_epi32(x, rnding);
1782 u[6] = _mm256_srai_epi32(u[6], bit);
1783
1784 x = _mm256_mullo_epi32(in[6], cospi26);
1785 u[7] = _mm256_sub_epi32(zero, x);
1786 u[7] = _mm256_add_epi32(u[7], rnding);
1787 u[7] = _mm256_srai_epi32(u[7], bit);
1788
1789 u[8] = _mm256_mullo_epi32(in[7], cospi34);
1790 u[8] = _mm256_add_epi32(u[8], rnding);
1791 u[8] = _mm256_srai_epi32(u[8], bit);
1792
1793 u[9] = _mm256_mullo_epi32(in[7], cospi30);
1794 u[9] = _mm256_add_epi32(u[9], rnding);
1795 u[9] = _mm256_srai_epi32(u[9], bit);
1796
1797 u[10] = _mm256_mullo_epi32(in[5], cospi42);
1798 u[10] = _mm256_add_epi32(u[10], rnding);
1799 u[10] = _mm256_srai_epi32(u[10], bit);
1800
1801 u[11] = _mm256_mullo_epi32(in[5], cospi22);
1802 u[11] = _mm256_add_epi32(u[11], rnding);
1803 u[11] = _mm256_srai_epi32(u[11], bit);
1804
1805 u[12] = _mm256_mullo_epi32(in[3], cospi50);
1806 u[12] = _mm256_add_epi32(u[12], rnding);
1807 u[12] = _mm256_srai_epi32(u[12], bit);
1808
1809 u[13] = _mm256_mullo_epi32(in[3], cospi14);
1810 u[13] = _mm256_add_epi32(u[13], rnding);
1811 u[13] = _mm256_srai_epi32(u[13], bit);
1812
1813 u[14] = _mm256_mullo_epi32(in[1], cospi58);
1814 u[14] = _mm256_add_epi32(u[14], rnding);
1815 u[14] = _mm256_srai_epi32(u[14], bit);
1816
1817 u[15] = _mm256_mullo_epi32(in[1], cospi6);
1818 u[15] = _mm256_add_epi32(u[15], rnding);
1819 u[15] = _mm256_srai_epi32(u[15], bit);
1820
1821 // stage 3
1822 addsub_avx2(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
1823 addsub_avx2(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
1824 addsub_avx2(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
1825 addsub_avx2(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
1826 addsub_avx2(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
1827 addsub_avx2(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
1828 addsub_avx2(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
1829 addsub_avx2(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
1830
1831 // stage 4
1832 y = _mm256_mullo_epi32(u[8], cospi56);
1833 x = _mm256_mullo_epi32(u[9], cospi56);
1834 u[8] = _mm256_mullo_epi32(u[8], cospi8);
1835 u[8] = _mm256_add_epi32(u[8], x);
1836 u[8] = _mm256_add_epi32(u[8], rnding);
1837 u[8] = _mm256_srai_epi32(u[8], bit);
1838
1839 x = _mm256_mullo_epi32(u[9], cospi8);
1840 u[9] = _mm256_sub_epi32(y, x);
1841 u[9] = _mm256_add_epi32(u[9], rnding);
1842 u[9] = _mm256_srai_epi32(u[9], bit);
1843
1844 x = _mm256_mullo_epi32(u[11], cospi24);
1845 y = _mm256_mullo_epi32(u[10], cospi24);
1846 u[10] = _mm256_mullo_epi32(u[10], cospi40);
1847 u[10] = _mm256_add_epi32(u[10], x);
1848 u[10] = _mm256_add_epi32(u[10], rnding);
1849 u[10] = _mm256_srai_epi32(u[10], bit);
1850
1851 x = _mm256_mullo_epi32(u[11], cospi40);
1852 u[11] = _mm256_sub_epi32(y, x);
1853 u[11] = _mm256_add_epi32(u[11], rnding);
1854 u[11] = _mm256_srai_epi32(u[11], bit);
1855
1856 x = _mm256_mullo_epi32(u[13], cospi8);
1857 y = _mm256_mullo_epi32(u[12], cospi8);
1858 u[12] = _mm256_mullo_epi32(u[12], cospim56);
1859 u[12] = _mm256_add_epi32(u[12], x);
1860 u[12] = _mm256_add_epi32(u[12], rnding);
1861 u[12] = _mm256_srai_epi32(u[12], bit);
1862
1863 x = _mm256_mullo_epi32(u[13], cospim56);
1864 u[13] = _mm256_sub_epi32(y, x);
1865 u[13] = _mm256_add_epi32(u[13], rnding);
1866 u[13] = _mm256_srai_epi32(u[13], bit);
1867
1868 x = _mm256_mullo_epi32(u[15], cospi40);
1869 y = _mm256_mullo_epi32(u[14], cospi40);
1870 u[14] = _mm256_mullo_epi32(u[14], cospim24);
1871 u[14] = _mm256_add_epi32(u[14], x);
1872 u[14] = _mm256_add_epi32(u[14], rnding);
1873 u[14] = _mm256_srai_epi32(u[14], bit);
1874
1875 x = _mm256_mullo_epi32(u[15], cospim24);
1876 u[15] = _mm256_sub_epi32(y, x);
1877 u[15] = _mm256_add_epi32(u[15], rnding);
1878 u[15] = _mm256_srai_epi32(u[15], bit);
1879
1880 // stage 5
1881 addsub_avx2(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
1882 addsub_avx2(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
1883 addsub_avx2(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
1884 addsub_avx2(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
1885 addsub_avx2(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
1886 addsub_avx2(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
1887 addsub_avx2(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
1888 addsub_avx2(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
1889
1890 // stage 6
1891 x = _mm256_mullo_epi32(u[5], cospi48);
1892 y = _mm256_mullo_epi32(u[4], cospi48);
1893 u[4] = _mm256_mullo_epi32(u[4], cospi16);
1894 u[4] = _mm256_add_epi32(u[4], x);
1895 u[4] = _mm256_add_epi32(u[4], rnding);
1896 u[4] = _mm256_srai_epi32(u[4], bit);
1897
1898 x = _mm256_mullo_epi32(u[5], cospi16);
1899 u[5] = _mm256_sub_epi32(y, x);
1900 u[5] = _mm256_add_epi32(u[5], rnding);
1901 u[5] = _mm256_srai_epi32(u[5], bit);
1902
1903 x = _mm256_mullo_epi32(u[7], cospi16);
1904 y = _mm256_mullo_epi32(u[6], cospi16);
1905 u[6] = _mm256_mullo_epi32(u[6], cospim48);
1906 u[6] = _mm256_add_epi32(u[6], x);
1907 u[6] = _mm256_add_epi32(u[6], rnding);
1908 u[6] = _mm256_srai_epi32(u[6], bit);
1909
1910 x = _mm256_mullo_epi32(u[7], cospim48);
1911 u[7] = _mm256_sub_epi32(y, x);
1912 u[7] = _mm256_add_epi32(u[7], rnding);
1913 u[7] = _mm256_srai_epi32(u[7], bit);
1914
1915 x = _mm256_mullo_epi32(u[13], cospi48);
1916 y = _mm256_mullo_epi32(u[12], cospi48);
1917 u[12] = _mm256_mullo_epi32(u[12], cospi16);
1918 u[12] = _mm256_add_epi32(u[12], x);
1919 u[12] = _mm256_add_epi32(u[12], rnding);
1920 u[12] = _mm256_srai_epi32(u[12], bit);
1921
1922 x = _mm256_mullo_epi32(u[13], cospi16);
1923 u[13] = _mm256_sub_epi32(y, x);
1924 u[13] = _mm256_add_epi32(u[13], rnding);
1925 u[13] = _mm256_srai_epi32(u[13], bit);
1926
1927 x = _mm256_mullo_epi32(u[15], cospi16);
1928 y = _mm256_mullo_epi32(u[14], cospi16);
1929 u[14] = _mm256_mullo_epi32(u[14], cospim48);
1930 u[14] = _mm256_add_epi32(u[14], x);
1931 u[14] = _mm256_add_epi32(u[14], rnding);
1932 u[14] = _mm256_srai_epi32(u[14], bit);
1933
1934 x = _mm256_mullo_epi32(u[15], cospim48);
1935 u[15] = _mm256_sub_epi32(y, x);
1936 u[15] = _mm256_add_epi32(u[15], rnding);
1937 u[15] = _mm256_srai_epi32(u[15], bit);
1938
1939 // stage 7
1940 addsub_avx2(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
1941 addsub_avx2(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
1942 addsub_avx2(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
1943 addsub_avx2(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
1944 addsub_avx2(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
1945 addsub_avx2(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
1946 addsub_avx2(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
1947 addsub_avx2(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
1948
1949 // stage 8
1950 y = _mm256_mullo_epi32(u[2], cospi32);
1951 x = _mm256_mullo_epi32(u[3], cospi32);
1952 u[2] = _mm256_add_epi32(y, x);
1953 u[2] = _mm256_add_epi32(u[2], rnding);
1954 u[2] = _mm256_srai_epi32(u[2], bit);
1955
1956 u[3] = _mm256_sub_epi32(y, x);
1957 u[3] = _mm256_add_epi32(u[3], rnding);
1958 u[3] = _mm256_srai_epi32(u[3], bit);
1959 y = _mm256_mullo_epi32(u[6], cospi32);
1960 x = _mm256_mullo_epi32(u[7], cospi32);
1961 u[6] = _mm256_add_epi32(y, x);
1962 u[6] = _mm256_add_epi32(u[6], rnding);
1963 u[6] = _mm256_srai_epi32(u[6], bit);
1964
1965 u[7] = _mm256_sub_epi32(y, x);
1966 u[7] = _mm256_add_epi32(u[7], rnding);
1967 u[7] = _mm256_srai_epi32(u[7], bit);
1968
1969 y = _mm256_mullo_epi32(u[10], cospi32);
1970 x = _mm256_mullo_epi32(u[11], cospi32);
1971 u[10] = _mm256_add_epi32(y, x);
1972 u[10] = _mm256_add_epi32(u[10], rnding);
1973 u[10] = _mm256_srai_epi32(u[10], bit);
1974
1975 u[11] = _mm256_sub_epi32(y, x);
1976 u[11] = _mm256_add_epi32(u[11], rnding);
1977 u[11] = _mm256_srai_epi32(u[11], bit);
1978
1979 y = _mm256_mullo_epi32(u[14], cospi32);
1980 x = _mm256_mullo_epi32(u[15], cospi32);
1981 u[14] = _mm256_add_epi32(y, x);
1982 u[14] = _mm256_add_epi32(u[14], rnding);
1983 u[14] = _mm256_srai_epi32(u[14], bit);
1984
1985 u[15] = _mm256_sub_epi32(y, x);
1986 u[15] = _mm256_add_epi32(u[15], rnding);
1987 u[15] = _mm256_srai_epi32(u[15], bit);
1988
1989 // stage 9
1990 if (do_cols) {
1991 out[0] = u[0];
1992 out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), u[8]);
1993 out[2] = u[12];
1994 out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), u[4]);
1995 out[4] = u[6];
1996 out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), u[14]);
1997 out[6] = u[10];
1998 out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), u[2]);
1999 out[8] = u[3];
2000 out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), u[11]);
2001 out[10] = u[15];
2002 out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), u[7]);
2003 out[12] = u[5];
2004 out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), u[13]);
2005 out[14] = u[9];
2006 out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), u[1]);
2007 } else {
2008 const int log_range_out = AOMMAX(16, bd + 6);
2009 const __m256i clamp_lo_out =
2010 _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2011 const __m256i clamp_hi_out =
2012 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2013
2014 neg_shift_avx2(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2015 out_shift);
2016 neg_shift_avx2(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
2017 &clamp_hi_out, out_shift);
2018 neg_shift_avx2(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
2019 &clamp_hi_out, out_shift);
2020 neg_shift_avx2(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
2021 &clamp_hi_out, out_shift);
2022 neg_shift_avx2(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
2023 &clamp_hi_out, out_shift);
2024 neg_shift_avx2(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
2025 &clamp_hi_out, out_shift);
2026 neg_shift_avx2(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
2027 &clamp_hi_out, out_shift);
2028 neg_shift_avx2(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
2029 &clamp_hi_out, out_shift);
2030 }
2031 }
2032 }
2033
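// Full 16-point inverse ADST over 8 lanes at a time: stage 2 rotates the
// input pairs (in[15], in[0]), (in[13], in[2]), ... by the odd cospi angles;
// stages 3-8 alternate clamped add/sub butterflies with cospi8/56, cospi40/24,
// cospi16/48 and cospi32 rotations; stage 9 permutes and sign-flips the outputs.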
2034 static void iadst16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2035 int bd, int out_shift) {
2036 const int32_t *cospi = cospi_arr(bit);
2037 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
2038 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
2039 const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
2040 const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
2041 const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
2042 const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
2043 const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
2044 const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
2045 const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
2046 const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
2047 const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
2048 const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
2049 const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
2050 const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
2051 const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
2052 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
2053 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
2054 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
2055 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
2056 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
2057 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
2058 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
2059 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2060 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2061 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
2062 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2063 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2064 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2065 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2066 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2067 __m256i u[16], v[16], x, y;
2068
2069 {
2070 // stage 0
2071 // stage 1
2072 // stage 2
2073 v[0] = _mm256_mullo_epi32(in[15], cospi2);
2074 x = _mm256_mullo_epi32(in[0], cospi62);
2075 v[0] = _mm256_add_epi32(v[0], x);
2076 v[0] = _mm256_add_epi32(v[0], rnding);
2077 v[0] = _mm256_srai_epi32(v[0], bit);
2078
2079 v[1] = _mm256_mullo_epi32(in[15], cospi62);
2080 x = _mm256_mullo_epi32(in[0], cospi2);
2081 v[1] = _mm256_sub_epi32(v[1], x);
2082 v[1] = _mm256_add_epi32(v[1], rnding);
2083 v[1] = _mm256_srai_epi32(v[1], bit);
2084
2085 v[2] = _mm256_mullo_epi32(in[13], cospi10);
2086 x = _mm256_mullo_epi32(in[2], cospi54);
2087 v[2] = _mm256_add_epi32(v[2], x);
2088 v[2] = _mm256_add_epi32(v[2], rnding);
2089 v[2] = _mm256_srai_epi32(v[2], bit);
2090
2091 v[3] = _mm256_mullo_epi32(in[13], cospi54);
2092 x = _mm256_mullo_epi32(in[2], cospi10);
2093 v[3] = _mm256_sub_epi32(v[3], x);
2094 v[3] = _mm256_add_epi32(v[3], rnding);
2095 v[3] = _mm256_srai_epi32(v[3], bit);
2096
2097 v[4] = _mm256_mullo_epi32(in[11], cospi18);
2098 x = _mm256_mullo_epi32(in[4], cospi46);
2099 v[4] = _mm256_add_epi32(v[4], x);
2100 v[4] = _mm256_add_epi32(v[4], rnding);
2101 v[4] = _mm256_srai_epi32(v[4], bit);
2102
2103 v[5] = _mm256_mullo_epi32(in[11], cospi46);
2104 x = _mm256_mullo_epi32(in[4], cospi18);
2105 v[5] = _mm256_sub_epi32(v[5], x);
2106 v[5] = _mm256_add_epi32(v[5], rnding);
2107 v[5] = _mm256_srai_epi32(v[5], bit);
2108
2109 v[6] = _mm256_mullo_epi32(in[9], cospi26);
2110 x = _mm256_mullo_epi32(in[6], cospi38);
2111 v[6] = _mm256_add_epi32(v[6], x);
2112 v[6] = _mm256_add_epi32(v[6], rnding);
2113 v[6] = _mm256_srai_epi32(v[6], bit);
2114
2115 v[7] = _mm256_mullo_epi32(in[9], cospi38);
2116 x = _mm256_mullo_epi32(in[6], cospi26);
2117 v[7] = _mm256_sub_epi32(v[7], x);
2118 v[7] = _mm256_add_epi32(v[7], rnding);
2119 v[7] = _mm256_srai_epi32(v[7], bit);
2120
2121 v[8] = _mm256_mullo_epi32(in[7], cospi34);
2122 x = _mm256_mullo_epi32(in[8], cospi30);
2123 v[8] = _mm256_add_epi32(v[8], x);
2124 v[8] = _mm256_add_epi32(v[8], rnding);
2125 v[8] = _mm256_srai_epi32(v[8], bit);
2126
2127 v[9] = _mm256_mullo_epi32(in[7], cospi30);
2128 x = _mm256_mullo_epi32(in[8], cospi34);
2129 v[9] = _mm256_sub_epi32(v[9], x);
2130 v[9] = _mm256_add_epi32(v[9], rnding);
2131 v[9] = _mm256_srai_epi32(v[9], bit);
2132
2133 v[10] = _mm256_mullo_epi32(in[5], cospi42);
2134 x = _mm256_mullo_epi32(in[10], cospi22);
2135 v[10] = _mm256_add_epi32(v[10], x);
2136 v[10] = _mm256_add_epi32(v[10], rnding);
2137 v[10] = _mm256_srai_epi32(v[10], bit);
2138
2139 v[11] = _mm256_mullo_epi32(in[5], cospi22);
2140 x = _mm256_mullo_epi32(in[10], cospi42);
2141 v[11] = _mm256_sub_epi32(v[11], x);
2142 v[11] = _mm256_add_epi32(v[11], rnding);
2143 v[11] = _mm256_srai_epi32(v[11], bit);
2144
2145 v[12] = _mm256_mullo_epi32(in[3], cospi50);
2146 x = _mm256_mullo_epi32(in[12], cospi14);
2147 v[12] = _mm256_add_epi32(v[12], x);
2148 v[12] = _mm256_add_epi32(v[12], rnding);
2149 v[12] = _mm256_srai_epi32(v[12], bit);
2150
2151 v[13] = _mm256_mullo_epi32(in[3], cospi14);
2152 x = _mm256_mullo_epi32(in[12], cospi50);
2153 v[13] = _mm256_sub_epi32(v[13], x);
2154 v[13] = _mm256_add_epi32(v[13], rnding);
2155 v[13] = _mm256_srai_epi32(v[13], bit);
2156
2157 v[14] = _mm256_mullo_epi32(in[1], cospi58);
2158 x = _mm256_mullo_epi32(in[14], cospi6);
2159 v[14] = _mm256_add_epi32(v[14], x);
2160 v[14] = _mm256_add_epi32(v[14], rnding);
2161 v[14] = _mm256_srai_epi32(v[14], bit);
2162
2163 v[15] = _mm256_mullo_epi32(in[1], cospi6);
2164 x = _mm256_mullo_epi32(in[14], cospi58);
2165 v[15] = _mm256_sub_epi32(v[15], x);
2166 v[15] = _mm256_add_epi32(v[15], rnding);
2167 v[15] = _mm256_srai_epi32(v[15], bit);
2168
2169 // stage 3
2170 addsub_avx2(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
2171 addsub_avx2(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
2172 addsub_avx2(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
2173 addsub_avx2(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
2174 addsub_avx2(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
2175 addsub_avx2(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
2176 addsub_avx2(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
2177 addsub_avx2(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
2178
2179 // stage 4
2180 v[0] = u[0];
2181 v[1] = u[1];
2182 v[2] = u[2];
2183 v[3] = u[3];
2184 v[4] = u[4];
2185 v[5] = u[5];
2186 v[6] = u[6];
2187 v[7] = u[7];
2188
2189 v[8] = _mm256_mullo_epi32(u[8], cospi8);
2190 x = _mm256_mullo_epi32(u[9], cospi56);
2191 v[8] = _mm256_add_epi32(v[8], x);
2192 v[8] = _mm256_add_epi32(v[8], rnding);
2193 v[8] = _mm256_srai_epi32(v[8], bit);
2194
2195 v[9] = _mm256_mullo_epi32(u[8], cospi56);
2196 x = _mm256_mullo_epi32(u[9], cospi8);
2197 v[9] = _mm256_sub_epi32(v[9], x);
2198 v[9] = _mm256_add_epi32(v[9], rnding);
2199 v[9] = _mm256_srai_epi32(v[9], bit);
2200
2201 v[10] = _mm256_mullo_epi32(u[10], cospi40);
2202 x = _mm256_mullo_epi32(u[11], cospi24);
2203 v[10] = _mm256_add_epi32(v[10], x);
2204 v[10] = _mm256_add_epi32(v[10], rnding);
2205 v[10] = _mm256_srai_epi32(v[10], bit);
2206
2207 v[11] = _mm256_mullo_epi32(u[10], cospi24);
2208 x = _mm256_mullo_epi32(u[11], cospi40);
2209 v[11] = _mm256_sub_epi32(v[11], x);
2210 v[11] = _mm256_add_epi32(v[11], rnding);
2211 v[11] = _mm256_srai_epi32(v[11], bit);
2212
2213 v[12] = _mm256_mullo_epi32(u[12], cospim56);
2214 x = _mm256_mullo_epi32(u[13], cospi8);
2215 v[12] = _mm256_add_epi32(v[12], x);
2216 v[12] = _mm256_add_epi32(v[12], rnding);
2217 v[12] = _mm256_srai_epi32(v[12], bit);
2218
2219 v[13] = _mm256_mullo_epi32(u[12], cospi8);
2220 x = _mm256_mullo_epi32(u[13], cospim56);
2221 v[13] = _mm256_sub_epi32(v[13], x);
2222 v[13] = _mm256_add_epi32(v[13], rnding);
2223 v[13] = _mm256_srai_epi32(v[13], bit);
2224
2225 v[14] = _mm256_mullo_epi32(u[14], cospim24);
2226 x = _mm256_mullo_epi32(u[15], cospi40);
2227 v[14] = _mm256_add_epi32(v[14], x);
2228 v[14] = _mm256_add_epi32(v[14], rnding);
2229 v[14] = _mm256_srai_epi32(v[14], bit);
2230
2231 v[15] = _mm256_mullo_epi32(u[14], cospi40);
2232 x = _mm256_mullo_epi32(u[15], cospim24);
2233 v[15] = _mm256_sub_epi32(v[15], x);
2234 v[15] = _mm256_add_epi32(v[15], rnding);
2235 v[15] = _mm256_srai_epi32(v[15], bit);
2236
2237 // stage 5
2238 addsub_avx2(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
2239 addsub_avx2(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
2240 addsub_avx2(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
2241 addsub_avx2(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
2242 addsub_avx2(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
2243 addsub_avx2(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
2244 addsub_avx2(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
2245 addsub_avx2(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
2246
2247 // stage 6
2248 v[0] = u[0];
2249 v[1] = u[1];
2250 v[2] = u[2];
2251 v[3] = u[3];
2252
2253 v[4] = _mm256_mullo_epi32(u[4], cospi16);
2254 x = _mm256_mullo_epi32(u[5], cospi48);
2255 v[4] = _mm256_add_epi32(v[4], x);
2256 v[4] = _mm256_add_epi32(v[4], rnding);
2257 v[4] = _mm256_srai_epi32(v[4], bit);
2258
2259 v[5] = _mm256_mullo_epi32(u[4], cospi48);
2260 x = _mm256_mullo_epi32(u[5], cospi16);
2261 v[5] = _mm256_sub_epi32(v[5], x);
2262 v[5] = _mm256_add_epi32(v[5], rnding);
2263 v[5] = _mm256_srai_epi32(v[5], bit);
2264
2265 v[6] = _mm256_mullo_epi32(u[6], cospim48);
2266 x = _mm256_mullo_epi32(u[7], cospi16);
2267 v[6] = _mm256_add_epi32(v[6], x);
2268 v[6] = _mm256_add_epi32(v[6], rnding);
2269 v[6] = _mm256_srai_epi32(v[6], bit);
2270
2271 v[7] = _mm256_mullo_epi32(u[6], cospi16);
2272 x = _mm256_mullo_epi32(u[7], cospim48);
2273 v[7] = _mm256_sub_epi32(v[7], x);
2274 v[7] = _mm256_add_epi32(v[7], rnding);
2275 v[7] = _mm256_srai_epi32(v[7], bit);
2276
2277 v[8] = u[8];
2278 v[9] = u[9];
2279 v[10] = u[10];
2280 v[11] = u[11];
2281
2282 v[12] = _mm256_mullo_epi32(u[12], cospi16);
2283 x = _mm256_mullo_epi32(u[13], cospi48);
2284 v[12] = _mm256_add_epi32(v[12], x);
2285 v[12] = _mm256_add_epi32(v[12], rnding);
2286 v[12] = _mm256_srai_epi32(v[12], bit);
2287
2288 v[13] = _mm256_mullo_epi32(u[12], cospi48);
2289 x = _mm256_mullo_epi32(u[13], cospi16);
2290 v[13] = _mm256_sub_epi32(v[13], x);
2291 v[13] = _mm256_add_epi32(v[13], rnding);
2292 v[13] = _mm256_srai_epi32(v[13], bit);
2293
2294 v[14] = _mm256_mullo_epi32(u[14], cospim48);
2295 x = _mm256_mullo_epi32(u[15], cospi16);
2296 v[14] = _mm256_add_epi32(v[14], x);
2297 v[14] = _mm256_add_epi32(v[14], rnding);
2298 v[14] = _mm256_srai_epi32(v[14], bit);
2299
2300 v[15] = _mm256_mullo_epi32(u[14], cospi16);
2301 x = _mm256_mullo_epi32(u[15], cospim48);
2302 v[15] = _mm256_sub_epi32(v[15], x);
2303 v[15] = _mm256_add_epi32(v[15], rnding);
2304 v[15] = _mm256_srai_epi32(v[15], bit);
2305
2306 // stage 7
2307 addsub_avx2(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
2308 addsub_avx2(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
2309 addsub_avx2(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
2310 addsub_avx2(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
2311 addsub_avx2(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
2312 addsub_avx2(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
2313 addsub_avx2(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
2314 addsub_avx2(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
2315
2316 // stage 8
2317 v[0] = u[0];
2318 v[1] = u[1];
2319
2320 y = _mm256_mullo_epi32(u[2], cospi32);
2321 x = _mm256_mullo_epi32(u[3], cospi32);
2322 v[2] = _mm256_add_epi32(y, x);
2323 v[2] = _mm256_add_epi32(v[2], rnding);
2324 v[2] = _mm256_srai_epi32(v[2], bit);
2325
2326 v[3] = _mm256_sub_epi32(y, x);
2327 v[3] = _mm256_add_epi32(v[3], rnding);
2328 v[3] = _mm256_srai_epi32(v[3], bit);
2329
2330 v[4] = u[4];
2331 v[5] = u[5];
2332
2333 y = _mm256_mullo_epi32(u[6], cospi32);
2334 x = _mm256_mullo_epi32(u[7], cospi32);
2335 v[6] = _mm256_add_epi32(y, x);
2336 v[6] = _mm256_add_epi32(v[6], rnding);
2337 v[6] = _mm256_srai_epi32(v[6], bit);
2338
2339 v[7] = _mm256_sub_epi32(y, x);
2340 v[7] = _mm256_add_epi32(v[7], rnding);
2341 v[7] = _mm256_srai_epi32(v[7], bit);
2342
2343 v[8] = u[8];
2344 v[9] = u[9];
2345
2346 y = _mm256_mullo_epi32(u[10], cospi32);
2347 x = _mm256_mullo_epi32(u[11], cospi32);
2348 v[10] = _mm256_add_epi32(y, x);
2349 v[10] = _mm256_add_epi32(v[10], rnding);
2350 v[10] = _mm256_srai_epi32(v[10], bit);
2351
2352 v[11] = _mm256_sub_epi32(y, x);
2353 v[11] = _mm256_add_epi32(v[11], rnding);
2354 v[11] = _mm256_srai_epi32(v[11], bit);
2355
2356 v[12] = u[12];
2357 v[13] = u[13];
2358
2359 y = _mm256_mullo_epi32(u[14], cospi32);
2360 x = _mm256_mullo_epi32(u[15], cospi32);
2361 v[14] = _mm256_add_epi32(y, x);
2362 v[14] = _mm256_add_epi32(v[14], rnding);
2363 v[14] = _mm256_srai_epi32(v[14], bit);
2364
2365 v[15] = _mm256_sub_epi32(y, x);
2366 v[15] = _mm256_add_epi32(v[15], rnding);
2367 v[15] = _mm256_srai_epi32(v[15], bit);
2368
2369 // stage 9
2370 if (do_cols) {
2371 out[0] = v[0];
2372 out[1] = _mm256_sub_epi32(_mm256_setzero_si256(), v[8]);
2373 out[2] = v[12];
2374 out[3] = _mm256_sub_epi32(_mm256_setzero_si256(), v[4]);
2375 out[4] = v[6];
2376 out[5] = _mm256_sub_epi32(_mm256_setzero_si256(), v[14]);
2377 out[6] = v[10];
2378 out[7] = _mm256_sub_epi32(_mm256_setzero_si256(), v[2]);
2379 out[8] = v[3];
2380 out[9] = _mm256_sub_epi32(_mm256_setzero_si256(), v[11]);
2381 out[10] = v[15];
2382 out[11] = _mm256_sub_epi32(_mm256_setzero_si256(), v[7]);
2383 out[12] = v[5];
2384 out[13] = _mm256_sub_epi32(_mm256_setzero_si256(), v[13]);
2385 out[14] = v[9];
2386 out[15] = _mm256_sub_epi32(_mm256_setzero_si256(), v[1]);
2387 } else {
2388 const int log_range_out = AOMMAX(16, bd + 6);
2389 const __m256i clamp_lo_out =
2390 _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2391 const __m256i clamp_hi_out =
2392 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2393
2394 neg_shift_avx2(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2395 out_shift);
2396 neg_shift_avx2(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
2397 &clamp_hi_out, out_shift);
2398 neg_shift_avx2(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
2399 &clamp_hi_out, out_shift);
2400 neg_shift_avx2(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
2401 &clamp_hi_out, out_shift);
2402 neg_shift_avx2(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
2403 &clamp_hi_out, out_shift);
2404 neg_shift_avx2(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
2405 &clamp_hi_out, out_shift);
2406 neg_shift_avx2(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
2407 &clamp_hi_out, out_shift);
2408 neg_shift_avx2(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
2409 &clamp_hi_out, out_shift);
2410 }
2411 }
2412 }
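// 8-point inverse DCT when only the DC coefficient is nonzero: the transform
// collapses to a single cospi32 scaling of in[0], optionally rounded for the
// row pass, and the result is broadcast to all eight outputs.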
2413 static void idct8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2414 int bd, int out_shift) {
2415 const int32_t *cospi = cospi_arr(bit);
2416 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2417 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2418 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2419 __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2420 __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2421 __m256i x;
2422
2423 // stage 0
2424 // stage 1
2425 // stage 2
2426 // stage 3
2427 x = _mm256_mullo_epi32(in[0], cospi32);
2428 x = _mm256_add_epi32(x, rnding);
2429 x = _mm256_srai_epi32(x, bit);
2430
2431 // stage 4
2432 // stage 5
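// Row pass: add the rounding offset and arithmetic-shift by out_shift inline
// (instead of calling the round_shift helpers); the clamp below then uses the
// row-pass range.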
2433 if (!do_cols) {
2434 const int log_range_out = AOMMAX(16, bd + 6);
2435 __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
2436 clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2437 clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2438 x = _mm256_add_epi32(x, offset);
2439 x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
2440 }
2441 x = _mm256_max_epi32(x, clamp_lo);
2442 x = _mm256_min_epi32(x, clamp_hi);
2443 out[0] = x;
2444 out[1] = x;
2445 out[2] = x;
2446 out[3] = x;
2447 out[4] = x;
2448 out[5] = x;
2449 out[6] = x;
2450 out[7] = x;
2451 }
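// Full 8-point inverse DCT over 8 lanes at a time: stage 2 rotates the odd
// inputs, stage 3 handles the even half and the first odd butterflies,
// stage 4 merges the two halves, and the row pass rounds and re-clamps at
// the end.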
2452 static void idct8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2453 int bd, int out_shift) {
2454 const int32_t *cospi = cospi_arr(bit);
2455 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
2456 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
2457 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
2458 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
2459 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
2460 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
2461 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2462 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2463 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
2464 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2465 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2466 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2467 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2468 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2469 __m256i u0, u1, u2, u3, u4, u5, u6, u7;
2470 __m256i v0, v1, v2, v3, v4, v5, v6, v7;
2471 __m256i x, y;
2472
2473 // stage 0
2474 // stage 1
2475 // stage 2
2476 u0 = in[0];
2477 u1 = in[4];
2478 u2 = in[2];
2479 u3 = in[6];
2480
2481 x = _mm256_mullo_epi32(in[1], cospi56);
2482 y = _mm256_mullo_epi32(in[7], cospim8);
2483 u4 = _mm256_add_epi32(x, y);
2484 u4 = _mm256_add_epi32(u4, rnding);
2485 u4 = _mm256_srai_epi32(u4, bit);
2486
2487 x = _mm256_mullo_epi32(in[1], cospi8);
2488 y = _mm256_mullo_epi32(in[7], cospi56);
2489 u7 = _mm256_add_epi32(x, y);
2490 u7 = _mm256_add_epi32(u7, rnding);
2491 u7 = _mm256_srai_epi32(u7, bit);
2492
2493 x = _mm256_mullo_epi32(in[5], cospi24);
2494 y = _mm256_mullo_epi32(in[3], cospim40);
2495 u5 = _mm256_add_epi32(x, y);
2496 u5 = _mm256_add_epi32(u5, rnding);
2497 u5 = _mm256_srai_epi32(u5, bit);
2498
2499 x = _mm256_mullo_epi32(in[5], cospi40);
2500 y = _mm256_mullo_epi32(in[3], cospi24);
2501 u6 = _mm256_add_epi32(x, y);
2502 u6 = _mm256_add_epi32(u6, rnding);
2503 u6 = _mm256_srai_epi32(u6, bit);
2504
2505 // stage 3
2506 x = _mm256_mullo_epi32(u0, cospi32);
2507 y = _mm256_mullo_epi32(u1, cospi32);
2508 v0 = _mm256_add_epi32(x, y);
2509 v0 = _mm256_add_epi32(v0, rnding);
2510 v0 = _mm256_srai_epi32(v0, bit);
2511
2512 v1 = _mm256_sub_epi32(x, y);
2513 v1 = _mm256_add_epi32(v1, rnding);
2514 v1 = _mm256_srai_epi32(v1, bit);
2515
2516 x = _mm256_mullo_epi32(u2, cospi48);
2517 y = _mm256_mullo_epi32(u3, cospim16);
2518 v2 = _mm256_add_epi32(x, y);
2519 v2 = _mm256_add_epi32(v2, rnding);
2520 v2 = _mm256_srai_epi32(v2, bit);
2521
2522 x = _mm256_mullo_epi32(u2, cospi16);
2523 y = _mm256_mullo_epi32(u3, cospi48);
2524 v3 = _mm256_add_epi32(x, y);
2525 v3 = _mm256_add_epi32(v3, rnding);
2526 v3 = _mm256_srai_epi32(v3, bit);
2527
2528 addsub_avx2(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
2529 addsub_avx2(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
2530
2531 // stage 4
2532 addsub_avx2(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
2533 addsub_avx2(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
2534 u4 = v4;
2535 u7 = v7;
2536
2537 x = _mm256_mullo_epi32(v5, cospi32);
2538 y = _mm256_mullo_epi32(v6, cospi32);
2539 u6 = _mm256_add_epi32(y, x);
2540 u6 = _mm256_add_epi32(u6, rnding);
2541 u6 = _mm256_srai_epi32(u6, bit);
2542
2543 u5 = _mm256_sub_epi32(y, x);
2544 u5 = _mm256_add_epi32(u5, rnding);
2545 u5 = _mm256_srai_epi32(u5, bit);
2546
2547 addsub_avx2(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
2548 addsub_avx2(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
2549 addsub_avx2(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
2550 addsub_avx2(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);
2551 // stage 5
2552 if (!do_cols) {
2553 const int log_range_out = AOMMAX(16, bd + 6);
2554 const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2555 const __m256i clamp_hi_out =
2556 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2557
2558 round_shift_4x4_avx2(out, out_shift);
2559 round_shift_4x4_avx2(out + 4, out_shift);
2560 highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 8);
2561 }
2562 }
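// 8-point inverse ADST when only the first input coefficient is nonzero:
// stage 2 reduces to two scalings of in[0], and the later stages rotate and
// copy those values before the usual permuted, sign-flipped output stage.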
2563 static void iadst8x8_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2564 int bd, int out_shift) {
2565 const int32_t *cospi = cospi_arr(bit);
2566 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
2567 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
2568 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2569 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2570 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2571 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2572 const __m256i kZero = _mm256_setzero_si256();
2573 __m256i u[8], x;
2574
2575 // stage 0
2576 // stage 1
2577 // stage 2
2578
2579 x = _mm256_mullo_epi32(in[0], cospi60);
2580 u[0] = _mm256_add_epi32(x, rnding);
2581 u[0] = _mm256_srai_epi32(u[0], bit);
2582
2583 x = _mm256_mullo_epi32(in[0], cospi4);
2584 u[1] = _mm256_sub_epi32(kZero, x);
2585 u[1] = _mm256_add_epi32(u[1], rnding);
2586 u[1] = _mm256_srai_epi32(u[1], bit);
2587
2588 // stage 3
2589 // stage 4
2590 __m256i temp1, temp2;
2591 temp1 = _mm256_mullo_epi32(u[0], cospi16);
2592 x = _mm256_mullo_epi32(u[1], cospi48);
2593 temp1 = _mm256_add_epi32(temp1, x);
2594 temp1 = _mm256_add_epi32(temp1, rnding);
2595 temp1 = _mm256_srai_epi32(temp1, bit);
2596 u[4] = temp1;
2597
2598 temp2 = _mm256_mullo_epi32(u[0], cospi48);
2599 x = _mm256_mullo_epi32(u[1], cospi16);
2600 u[5] = _mm256_sub_epi32(temp2, x);
2601 u[5] = _mm256_add_epi32(u[5], rnding);
2602 u[5] = _mm256_srai_epi32(u[5], bit);
2603
2604 // stage 5
2605 // stage 6
2606 temp1 = _mm256_mullo_epi32(u[0], cospi32);
2607 x = _mm256_mullo_epi32(u[1], cospi32);
2608 u[2] = _mm256_add_epi32(temp1, x);
2609 u[2] = _mm256_add_epi32(u[2], rnding);
2610 u[2] = _mm256_srai_epi32(u[2], bit);
2611
2612 u[3] = _mm256_sub_epi32(temp1, x);
2613 u[3] = _mm256_add_epi32(u[3], rnding);
2614 u[3] = _mm256_srai_epi32(u[3], bit);
2615
2616 temp1 = _mm256_mullo_epi32(u[4], cospi32);
2617 x = _mm256_mullo_epi32(u[5], cospi32);
2618 u[6] = _mm256_add_epi32(temp1, x);
2619 u[6] = _mm256_add_epi32(u[6], rnding);
2620 u[6] = _mm256_srai_epi32(u[6], bit);
2621
2622 u[7] = _mm256_sub_epi32(temp1, x);
2623 u[7] = _mm256_add_epi32(u[7], rnding);
2624 u[7] = _mm256_srai_epi32(u[7], bit);
2625
2626 // stage 7
2627 if (do_cols) {
2628 out[0] = u[0];
2629 out[1] = _mm256_sub_epi32(kZero, u[4]);
2630 out[2] = u[6];
2631 out[3] = _mm256_sub_epi32(kZero, u[2]);
2632 out[4] = u[3];
2633 out[5] = _mm256_sub_epi32(kZero, u[7]);
2634 out[6] = u[5];
2635 out[7] = _mm256_sub_epi32(kZero, u[1]);
2636 } else {
2637 const int log_range_out = AOMMAX(16, bd + 6);
2638 const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2639 const __m256i clamp_hi_out =
2640 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2641
2642 neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2643 out_shift);
2644 neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
2645 out_shift);
2646 neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
2647 out_shift);
2648 neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
2649 out_shift);
2650 }
2651 }
2652
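// Full 8-point inverse ADST over 8 lanes at a time: stage 2 rotates the input
// pairs (in[7], in[0]), (in[5], in[2]), (in[3], in[4]), (in[1], in[6]) by the
// odd cospi angles; stages 3-6 are clamped butterflies with cospi16/48 and
// cospi32 rotations; stage 7 permutes and sign-flips the outputs.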
2653 static void iadst8x8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2654 int bd, int out_shift) {
2655 const int32_t *cospi = cospi_arr(bit);
2656 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
2657 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
2658 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
2659 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
2660 const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
2661 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
2662 const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
2663 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
2664 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
2665 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
2666 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
2667 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2668 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2669 const __m256i kZero = _mm256_setzero_si256();
2670 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2671 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2672 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2673 __m256i u[8], v[8], x;
2674
2675 // stage 0
2676 // stage 1
2677 // stage 2
2678
2679 u[0] = _mm256_mullo_epi32(in[7], cospi4);
2680 x = _mm256_mullo_epi32(in[0], cospi60);
2681 u[0] = _mm256_add_epi32(u[0], x);
2682 u[0] = _mm256_add_epi32(u[0], rnding);
2683 u[0] = _mm256_srai_epi32(u[0], bit);
2684
2685 u[1] = _mm256_mullo_epi32(in[7], cospi60);
2686 x = _mm256_mullo_epi32(in[0], cospi4);
2687 u[1] = _mm256_sub_epi32(u[1], x);
2688 u[1] = _mm256_add_epi32(u[1], rnding);
2689 u[1] = _mm256_srai_epi32(u[1], bit);
2690
2691 u[2] = _mm256_mullo_epi32(in[5], cospi20);
2692 x = _mm256_mullo_epi32(in[2], cospi44);
2693 u[2] = _mm256_add_epi32(u[2], x);
2694 u[2] = _mm256_add_epi32(u[2], rnding);
2695 u[2] = _mm256_srai_epi32(u[2], bit);
2696
2697 u[3] = _mm256_mullo_epi32(in[5], cospi44);
2698 x = _mm256_mullo_epi32(in[2], cospi20);
2699 u[3] = _mm256_sub_epi32(u[3], x);
2700 u[3] = _mm256_add_epi32(u[3], rnding);
2701 u[3] = _mm256_srai_epi32(u[3], bit);
2702
2703 u[4] = _mm256_mullo_epi32(in[3], cospi36);
2704 x = _mm256_mullo_epi32(in[4], cospi28);
2705 u[4] = _mm256_add_epi32(u[4], x);
2706 u[4] = _mm256_add_epi32(u[4], rnding);
2707 u[4] = _mm256_srai_epi32(u[4], bit);
2708
2709 u[5] = _mm256_mullo_epi32(in[3], cospi28);
2710 x = _mm256_mullo_epi32(in[4], cospi36);
2711 u[5] = _mm256_sub_epi32(u[5], x);
2712 u[5] = _mm256_add_epi32(u[5], rnding);
2713 u[5] = _mm256_srai_epi32(u[5], bit);
2714
2715 u[6] = _mm256_mullo_epi32(in[1], cospi52);
2716 x = _mm256_mullo_epi32(in[6], cospi12);
2717 u[6] = _mm256_add_epi32(u[6], x);
2718 u[6] = _mm256_add_epi32(u[6], rnding);
2719 u[6] = _mm256_srai_epi32(u[6], bit);
2720
2721 u[7] = _mm256_mullo_epi32(in[1], cospi12);
2722 x = _mm256_mullo_epi32(in[6], cospi52);
2723 u[7] = _mm256_sub_epi32(u[7], x);
2724 u[7] = _mm256_add_epi32(u[7], rnding);
2725 u[7] = _mm256_srai_epi32(u[7], bit);
2726
2727 // stage 3
2728 addsub_avx2(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
2729 addsub_avx2(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
2730 addsub_avx2(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
2731 addsub_avx2(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
2732
2733 // stage 4
2734 u[0] = v[0];
2735 u[1] = v[1];
2736 u[2] = v[2];
2737 u[3] = v[3];
2738
2739 u[4] = _mm256_mullo_epi32(v[4], cospi16);
2740 x = _mm256_mullo_epi32(v[5], cospi48);
2741 u[4] = _mm256_add_epi32(u[4], x);
2742 u[4] = _mm256_add_epi32(u[4], rnding);
2743 u[4] = _mm256_srai_epi32(u[4], bit);
2744
2745 u[5] = _mm256_mullo_epi32(v[4], cospi48);
2746 x = _mm256_mullo_epi32(v[5], cospi16);
2747 u[5] = _mm256_sub_epi32(u[5], x);
2748 u[5] = _mm256_add_epi32(u[5], rnding);
2749 u[5] = _mm256_srai_epi32(u[5], bit);
2750
2751 u[6] = _mm256_mullo_epi32(v[6], cospim48);
2752 x = _mm256_mullo_epi32(v[7], cospi16);
2753 u[6] = _mm256_add_epi32(u[6], x);
2754 u[6] = _mm256_add_epi32(u[6], rnding);
2755 u[6] = _mm256_srai_epi32(u[6], bit);
2756
2757 u[7] = _mm256_mullo_epi32(v[6], cospi16);
2758 x = _mm256_mullo_epi32(v[7], cospim48);
2759 u[7] = _mm256_sub_epi32(u[7], x);
2760 u[7] = _mm256_add_epi32(u[7], rnding);
2761 u[7] = _mm256_srai_epi32(u[7], bit);
2762
2763 // stage 5
2764 addsub_avx2(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
2765 addsub_avx2(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
2766 addsub_avx2(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
2767 addsub_avx2(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
2768
2769 // stage 6
2770 u[0] = v[0];
2771 u[1] = v[1];
2772 u[4] = v[4];
2773 u[5] = v[5];
2774
2775 v[0] = _mm256_mullo_epi32(v[2], cospi32);
2776 x = _mm256_mullo_epi32(v[3], cospi32);
2777 u[2] = _mm256_add_epi32(v[0], x);
2778 u[2] = _mm256_add_epi32(u[2], rnding);
2779 u[2] = _mm256_srai_epi32(u[2], bit);
2780
2781 u[3] = _mm256_sub_epi32(v[0], x);
2782 u[3] = _mm256_add_epi32(u[3], rnding);
2783 u[3] = _mm256_srai_epi32(u[3], bit);
2784
2785 v[0] = _mm256_mullo_epi32(v[6], cospi32);
2786 x = _mm256_mullo_epi32(v[7], cospi32);
2787 u[6] = _mm256_add_epi32(v[0], x);
2788 u[6] = _mm256_add_epi32(u[6], rnding);
2789 u[6] = _mm256_srai_epi32(u[6], bit);
2790
2791 u[7] = _mm256_sub_epi32(v[0], x);
2792 u[7] = _mm256_add_epi32(u[7], rnding);
2793 u[7] = _mm256_srai_epi32(u[7], bit);
2794
2795 // stage 7
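  // The last stage writes the results in ADST output order with alternating
  // sign flips. For the column pass (do_cols) values are stored directly; for
  // the row pass neg_shift_avx2() also applies the out_shift rounding and
  // clamps to the narrower intermediate range.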
2796 if (do_cols) {
2797 out[0] = u[0];
2798 out[1] = _mm256_sub_epi32(kZero, u[4]);
2799 out[2] = u[6];
2800 out[3] = _mm256_sub_epi32(kZero, u[2]);
2801 out[4] = u[3];
2802 out[5] = _mm256_sub_epi32(kZero, u[7]);
2803 out[6] = u[5];
2804 out[7] = _mm256_sub_epi32(kZero, u[1]);
2805 } else {
2806 const int log_range_out = AOMMAX(16, bd + 6);
2807 const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2808 const __m256i clamp_hi_out =
2809 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2810
2811 neg_shift_avx2(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
2812 out_shift);
2813 neg_shift_avx2(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
2814 out_shift);
2815 neg_shift_avx2(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
2816 out_shift);
2817 neg_shift_avx2(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
2818 out_shift);
2819 }
2820 }
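// Stage 8 of the 64-point inverse DCT, shared by the idct64_* kernels below:
// cospi32 rotations on u[10..13], add/sub butterflies across u[16..31] (the
// XOR indexing pairs each element with its mirror), and cospi16/cospi48
// rotations that pair u[36..43] with u[52..59].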
2821 static inline void idct64_stage8_avx2(
2822 __m256i *u, const __m256i *cospim32, const __m256i *cospi32,
2823 const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
2824 const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
2825 const __m256i *rnding, int bit) {
2826 int i;
2827 __m256i temp1, temp2, temp3, temp4;
2828 temp1 = half_btf_avx2(cospim32, &u[10], cospi32, &u[13], rnding, bit);
2829 u[13] = half_btf_avx2(cospi32, &u[10], cospi32, &u[13], rnding, bit);
2830 u[10] = temp1;
2831 temp2 = half_btf_avx2(cospim32, &u[11], cospi32, &u[12], rnding, bit);
2832 u[12] = half_btf_avx2(cospi32, &u[11], cospi32, &u[12], rnding, bit);
2833 u[11] = temp2;
2834
2835 for (i = 16; i < 20; ++i) {
2836 addsub_avx2(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
2837 addsub_avx2(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo, clamp_hi);
2838 }
2839
2840 temp1 = half_btf_avx2(cospim16, &u[36], cospi48, &u[59], rnding, bit);
2841 temp2 = half_btf_avx2(cospim16, &u[37], cospi48, &u[58], rnding, bit);
2842 temp3 = half_btf_avx2(cospim16, &u[38], cospi48, &u[57], rnding, bit);
2843 temp4 = half_btf_avx2(cospim16, &u[39], cospi48, &u[56], rnding, bit);
2844 u[56] = half_btf_avx2(cospi48, &u[39], cospi16, &u[56], rnding, bit);
2845 u[57] = half_btf_avx2(cospi48, &u[38], cospi16, &u[57], rnding, bit);
2846 u[58] = half_btf_avx2(cospi48, &u[37], cospi16, &u[58], rnding, bit);
2847 u[59] = half_btf_avx2(cospi48, &u[36], cospi16, &u[59], rnding, bit);
2848 u[36] = temp1;
2849 u[37] = temp2;
2850 u[38] = temp3;
2851 u[39] = temp4;
2852
2853 temp1 = half_btf_avx2(cospim48, &u[40], cospim16, &u[55], rnding, bit);
2854 temp2 = half_btf_avx2(cospim48, &u[41], cospim16, &u[54], rnding, bit);
2855 temp3 = half_btf_avx2(cospim48, &u[42], cospim16, &u[53], rnding, bit);
2856 temp4 = half_btf_avx2(cospim48, &u[43], cospim16, &u[52], rnding, bit);
2857 u[52] = half_btf_avx2(cospim16, &u[43], cospi48, &u[52], rnding, bit);
2858 u[53] = half_btf_avx2(cospim16, &u[42], cospi48, &u[53], rnding, bit);
2859 u[54] = half_btf_avx2(cospim16, &u[41], cospi48, &u[54], rnding, bit);
2860 u[55] = half_btf_avx2(cospim16, &u[40], cospi48, &u[55], rnding, bit);
2861 u[40] = temp1;
2862 u[41] = temp2;
2863 u[42] = temp3;
2864 u[43] = temp4;
2865 }
2866
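// Stage 9: add/sub butterflies on u[0..15], cospi32 rotations pairing
// u[20..23] with u[24..27], and mirror butterflies across u[32..47] and
// u[48..63].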
2867 static inline void idct64_stage9_avx2(__m256i *u, const __m256i *cospim32,
2868 const __m256i *cospi32,
2869 const __m256i *clamp_lo,
2870 const __m256i *clamp_hi,
2871 const __m256i *rnding, int bit) {
2872 int i;
2873 __m256i temp1, temp2, temp3, temp4;
2874 for (i = 0; i < 8; ++i) {
2875 addsub_avx2(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
2876 }
2877
2878 temp1 = half_btf_avx2(cospim32, &u[20], cospi32, &u[27], rnding, bit);
2879 temp2 = half_btf_avx2(cospim32, &u[21], cospi32, &u[26], rnding, bit);
2880 temp3 = half_btf_avx2(cospim32, &u[22], cospi32, &u[25], rnding, bit);
2881 temp4 = half_btf_avx2(cospim32, &u[23], cospi32, &u[24], rnding, bit);
2882 u[24] = half_btf_avx2(cospi32, &u[23], cospi32, &u[24], rnding, bit);
2883 u[25] = half_btf_avx2(cospi32, &u[22], cospi32, &u[25], rnding, bit);
2884 u[26] = half_btf_avx2(cospi32, &u[21], cospi32, &u[26], rnding, bit);
2885 u[27] = half_btf_avx2(cospi32, &u[20], cospi32, &u[27], rnding, bit);
2886 u[20] = temp1;
2887 u[21] = temp2;
2888 u[22] = temp3;
2889 u[23] = temp4;
2890 for (i = 32; i < 40; i++) {
2891 addsub_avx2(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
2892 }
2893
2894 for (i = 48; i < 56; i++) {
2895 addsub_avx2(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
2896 }
2897 }
2898
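// Stage 10: add/sub butterflies on u[0..31] and cospi32 rotations pairing
// u[40..47] with u[48..55]; u[32..39] and u[56..63] pass through unchanged.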
2899 static inline void idct64_stage10_avx2(__m256i *u, const __m256i *cospim32,
2900 const __m256i *cospi32,
2901 const __m256i *clamp_lo,
2902 const __m256i *clamp_hi,
2903 const __m256i *rnding, int bit) {
2904 __m256i temp1, temp2, temp3, temp4;
2905 for (int i = 0; i < 16; i++) {
2906 addsub_avx2(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
2907 }
2908
2909 temp1 = half_btf_avx2(cospim32, &u[40], cospi32, &u[55], rnding, bit);
2910 temp2 = half_btf_avx2(cospim32, &u[41], cospi32, &u[54], rnding, bit);
2911 temp3 = half_btf_avx2(cospim32, &u[42], cospi32, &u[53], rnding, bit);
2912 temp4 = half_btf_avx2(cospim32, &u[43], cospi32, &u[52], rnding, bit);
2913 u[52] = half_btf_avx2(cospi32, &u[43], cospi32, &u[52], rnding, bit);
2914 u[53] = half_btf_avx2(cospi32, &u[42], cospi32, &u[53], rnding, bit);
2915 u[54] = half_btf_avx2(cospi32, &u[41], cospi32, &u[54], rnding, bit);
2916 u[55] = half_btf_avx2(cospi32, &u[40], cospi32, &u[55], rnding, bit);
2917 u[40] = temp1;
2918 u[41] = temp2;
2919 u[42] = temp3;
2920 u[43] = temp4;
2921
2922 temp1 = half_btf_avx2(cospim32, &u[44], cospi32, &u[51], rnding, bit);
2923 temp2 = half_btf_avx2(cospim32, &u[45], cospi32, &u[50], rnding, bit);
2924 temp3 = half_btf_avx2(cospim32, &u[46], cospi32, &u[49], rnding, bit);
2925 temp4 = half_btf_avx2(cospim32, &u[47], cospi32, &u[48], rnding, bit);
2926 u[48] = half_btf_avx2(cospi32, &u[47], cospi32, &u[48], rnding, bit);
2927 u[49] = half_btf_avx2(cospi32, &u[46], cospi32, &u[49], rnding, bit);
2928 u[50] = half_btf_avx2(cospi32, &u[45], cospi32, &u[50], rnding, bit);
2929 u[51] = half_btf_avx2(cospi32, &u[44], cospi32, &u[51], rnding, bit);
2930 u[44] = temp1;
2931 u[45] = temp2;
2932 u[46] = temp3;
2933 u[47] = temp4;
2934 }
2935
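// Stage 11: the final add/sub pass that produces all 64 outputs. For row
// transforms (!do_cols) the results are additionally round-shifted by
// out_shift and clamped to the intermediate-precision range.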
2936 static inline void idct64_stage11_avx2(__m256i *u, __m256i *out, int do_cols,
2937 int bd, int out_shift,
2938 const __m256i *clamp_lo,
2939 const __m256i *clamp_hi) {
2940 for (int i = 0; i < 32; i++) {
2941 addsub_avx2(u[i], u[63 - i], &out[(i)], &out[(63 - i)], clamp_lo, clamp_hi);
2942 }
2943
2944 if (!do_cols) {
2945 const int log_range_out = AOMMAX(16, bd + 6);
2946 const __m256i clamp_lo_out = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2947 const __m256i clamp_hi_out =
2948 _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2949
2950 round_shift_8x8_avx2(out, out_shift);
2951 round_shift_8x8_avx2(out + 16, out_shift);
2952 round_shift_8x8_avx2(out + 32, out_shift);
2953 round_shift_8x8_avx2(out + 48, out_shift);
2954 highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64);
2955 }
2956 }
2957
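// DC-only 64-point inverse DCT: when only in[0] is nonzero every later stage
// just propagates the same value, so the transform reduces to one cospi32
// scaling followed by a broadcast to all 64 outputs (plus the optional
// round-shift/clamp for the row pass).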
2958 static void idct64_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
2959 int bd, int out_shift) {
2960 const int32_t *cospi = cospi_arr(bit);
2961 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
2962 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2963 __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
2964 __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
2965
2966 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
2967
2968 {
2969 __m256i x;
2970
2971 // stage 1
2972 // stage 2
2973 // stage 3
2974 // stage 4
2975 // stage 5
2976 // stage 6
2977 x = half_btf_0_avx2(&cospi32, &in[0], &rnding, bit);
2978
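    // stage 7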
2979 // stage 8
2980 // stage 9
2981 // stage 10
2982 // stage 11
2983 if (!do_cols) {
2984 const int log_range_out = AOMMAX(16, bd + 6);
2985 clamp_lo = _mm256_set1_epi32(-(1 << (log_range_out - 1)));
2986 clamp_hi = _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);
2987 if (out_shift != 0) {
2988 __m256i offset = _mm256_set1_epi32((1 << out_shift) >> 1);
2989 x = _mm256_add_epi32(x, offset);
2990 x = _mm256_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
2991 }
2992 }
2993 x = _mm256_max_epi32(x, clamp_lo);
2994 x = _mm256_min_epi32(x, clamp_hi);
2995 for (int i = 0; i < 64; ++i) out[i] = x;
3059 }
3060 }
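// 64-point inverse DCT specialized for at most 8 nonzero input rows/columns
// (in[0..7]). With so few inputs the early butterflies reduce to single-input
// scalings (half_btf_0_avx2) and copies; stages 8-11 fall through to the
// shared idct64_stage*_avx2 helpers above.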
3061 static void idct64_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
3062 int bd, int out_shift) {
3063 int i, j;
3064 const int32_t *cospi = cospi_arr(bit);
3065 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
3066 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3067 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
3068 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
3069
3070 const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
3071 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
3072 const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
3073 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
3074 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
3075 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
3076 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
3077 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
3078 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
3079 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
3080 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
3081 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
3082 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
3083 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
3084 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
3085 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
3086 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
3087 const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
3088 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
3089 const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
3090 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
3091 const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
3092 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
3093 const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
3094 const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
3095 const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
3096 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
3097 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
3098 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
3099 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
3100 const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
3101 const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
3102 const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
3103 const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
3104 const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
3105 const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
3106 const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
3107 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
3108
3109 {
3110 __m256i u[64];
3111
3112 // stage 1
3113 u[0] = in[0];
3114 u[8] = in[4];
3115 u[16] = in[2];
3116 u[24] = in[6];
3117 u[32] = in[1];
3118 u[40] = in[5];
3119 u[48] = in[3];
3120 u[56] = in[7];
3121
3122 // stage 2
3123 u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
3124 u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
3125 u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
3126 u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
3127 u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
3128 u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
3129 u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
3130 u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
3131
3132 // stage 3
3133 u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
3134 u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
3135 u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
3136 u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
3137 u[33] = u[32];
3138 u[38] = u[39];
3139 u[41] = u[40];
3140 u[46] = u[47];
3141 u[49] = u[48];
3142 u[54] = u[55];
3143 u[57] = u[56];
3144 u[62] = u[63];
3145
3146 // stage 4
3147 __m256i temp1, temp2;
3148 u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
3149 u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
3150 u[17] = u[16];
3151 u[22] = u[23];
3152 u[25] = u[24];
3153 u[30] = u[31];
3154
3155 temp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3156 u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3157 u[33] = temp1;
3158
3159 temp2 = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3160 u[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3161 u[57] = temp2;
3162
3163 temp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3164 u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3165 u[41] = temp1;
3166
3167 temp2 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3168 u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3169 u[46] = temp2;
3170
3171 // stage 5
3172 u[9] = u[8];
3173 u[14] = u[15];
3174
3175 temp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
3176 u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
3177 u[17] = temp1;
3178
3179 temp2 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
3180 u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
3181 u[22] = temp2;
3182
3183 u[35] = u[32];
3184 u[34] = u[33];
3185 u[36] = u[39];
3186 u[37] = u[38];
3187 u[43] = u[40];
3188 u[42] = u[41];
3189 u[44] = u[47];
3190 u[45] = u[46];
3191 u[51] = u[48];
3192 u[50] = u[49];
3193 u[52] = u[55];
3194 u[53] = u[54];
3195 u[59] = u[56];
3196 u[58] = u[57];
3197 u[60] = u[63];
3198 u[61] = u[62];
3199
3200 // stage 6
3201 temp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3202 u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3203 u[0] = temp1;
3204
3205 temp2 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3206 u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3207 u[9] = temp2;
3208 u[19] = u[16];
3209 u[18] = u[17];
3210 u[20] = u[23];
3211 u[21] = u[22];
3212 u[27] = u[24];
3213 u[26] = u[25];
3214 u[28] = u[31];
3215 u[29] = u[30];
3216
3217 temp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
3218 u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
3219 u[34] = temp1;
3220 temp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
3221 u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
3222 u[35] = temp2;
3223 temp1 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
3224 u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
3225 u[36] = temp1;
3226 temp2 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
3227 u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
3228 u[37] = temp2;
3229 temp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
3230 u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
3231 u[42] = temp1;
3232 temp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
3233 u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
3234 u[43] = temp2;
3235 temp1 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
3236 u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
3237 u[44] = temp1;
3238 temp2 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
3239 u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
3240 u[45] = temp2;
3241
3242 // stage 7
3243 u[3] = u[0];
3244 u[2] = u[1];
3245 u[11] = u[8];
3246 u[10] = u[9];
3247 u[12] = u[15];
3248 u[13] = u[14];
3249
3250 temp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
3251 u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
3252 u[18] = temp1;
3253 temp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
3254 u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
3255 u[19] = temp2;
3256 temp1 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
3257 u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
3258 u[20] = temp1;
3259 temp2 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
3260 u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
3261 u[21] = temp2;
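    // Butterfly u[32..63] in two groups of 16: j ^ 7 pairs an element with its
    // mirror in the lower eight of the group, while j ^ 15 / j ^ 8 pair the
    // corresponding elements in the upper eight.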
3262 for (i = 32; i < 64; i += 16) {
3263 for (j = i; j < i + 4; j++) {
3264 addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3265 addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3266 &clamp_hi);
3267 }
3268 }
3269
3270 // stage 8
3271 u[7] = u[0];
3272 u[6] = u[1];
3273 u[5] = u[2];
3274 u[4] = u[3];
3275
3276 idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
3277 &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
3278
3279 // stage 9
3280 idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3281 bit);
3282
3283 // stage 10
3284 idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3285 bit);
3286
3287 // stage 11
3288 idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
3289 }
3290 }
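// 64-point inverse DCT specialized for at most 16 nonzero input rows/columns
// (in[0..15]); same structure as idct64_low8_avx2 but with twice as many
// active lanes in the early stages.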
3291 static void idct64_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
3292 int bd, int out_shift) {
3293 int i, j;
3294 const int32_t *cospi = cospi_arr(bit);
3295 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
3296 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3297 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
3298 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
3299
3300 const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
3301 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
3302 const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
3303 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
3304 const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
3305 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
3306 const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
3307 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
3308 const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
3309 const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
3310 const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
3311 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
3312 const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
3313 const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
3314 const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
3315 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
3316 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
3317 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
3318 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
3319 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
3320 const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
3321 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
3322 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
3323 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
3324 const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
3325 const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
3326 const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
3327 const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
3328 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
3329 const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
3330 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
3331 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
3332 const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
3333
3334 const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
3335 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
3336 const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
3337 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
3338 const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
3339 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
3340 const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
3341 const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
3342 const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
3343 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
3344 const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
3345 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
3346 const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
3347 const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
3348 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
3349 const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
3350 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
3351 const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
3352 const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
3353 const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
3354 const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
3355
3356 {
3357 __m256i u[64];
3358 __m256i tmp1, tmp2, tmp3, tmp4;
3359 // stage 1
3360 u[0] = in[0];
3361 u[32] = in[1];
3362 u[36] = in[9];
3363 u[40] = in[5];
3364 u[44] = in[13];
3365 u[48] = in[3];
3366 u[52] = in[11];
3367 u[56] = in[7];
3368 u[60] = in[15];
3369 u[16] = in[2];
3370 u[20] = in[10];
3371 u[24] = in[6];
3372 u[28] = in[14];
3373 u[4] = in[8];
3374 u[8] = in[4];
3375 u[12] = in[12];
3376
3377 // stage 2
3378 u[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
3379 u[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
3380 u[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
3381 u[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
3382 u[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
3383 u[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
3384 u[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
3385 u[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
3386 u[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
3387 u[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
3388 u[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
3389 u[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
3390 u[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
3391 u[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
3392 u[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
3393 u[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
3394
3395 // stage 3
3396 u[31] = half_btf_0_avx2(&cospi2, &u[16], &rnding, bit);
3397 u[16] = half_btf_0_avx2(&cospi62, &u[16], &rnding, bit);
3398 u[19] = half_btf_0_avx2(&cospim50, &u[28], &rnding, bit);
3399 u[28] = half_btf_0_avx2(&cospi14, &u[28], &rnding, bit);
3400 u[27] = half_btf_0_avx2(&cospi10, &u[20], &rnding, bit);
3401 u[20] = half_btf_0_avx2(&cospi54, &u[20], &rnding, bit);
3402 u[23] = half_btf_0_avx2(&cospim58, &u[24], &rnding, bit);
3403 u[24] = half_btf_0_avx2(&cospi6, &u[24], &rnding, bit);
3404 u[33] = u[32];
3405 u[34] = u[35];
3406 u[37] = u[36];
3407 u[38] = u[39];
3408 u[41] = u[40];
3409 u[42] = u[43];
3410 u[45] = u[44];
3411 u[46] = u[47];
3412 u[49] = u[48];
3413 u[50] = u[51];
3414 u[53] = u[52];
3415 u[54] = u[55];
3416 u[57] = u[56];
3417 u[58] = u[59];
3418 u[61] = u[60];
3419 u[62] = u[63];
3420
3421 // stage 4
3422 u[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
3423 u[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
3424 u[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
3425 u[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
3426
3427 u[17] = u[16];
3428 u[18] = u[19];
3429 u[21] = u[20];
3430 u[22] = u[23];
3431 u[25] = u[24];
3432 u[26] = u[27];
3433 u[29] = u[28];
3434 u[30] = u[31];
3435
3436 tmp1 = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3437 tmp2 = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
3438 tmp3 = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
3439 tmp4 = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3440 u[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3441 u[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
3442 u[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
3443 u[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3444 u[33] = tmp1;
3445 u[34] = tmp2;
3446 u[37] = tmp3;
3447 u[38] = tmp4;
3448
3449 tmp1 = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3450 tmp2 = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
3451 tmp3 = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
3452 tmp4 = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3453 u[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3454 u[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
3455 u[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
3456 u[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3457 u[41] = tmp1;
3458 u[42] = tmp2;
3459 u[45] = tmp3;
3460 u[46] = tmp4;
3461
3462 // stage 5
3463 u[7] = half_btf_0_avx2(&cospi8, &u[4], &rnding, bit);
3464 u[4] = half_btf_0_avx2(&cospi56, &u[4], &rnding, bit);
3465
3466 u[9] = u[8];
3467 u[10] = u[11];
3468 u[13] = u[12];
3469 u[14] = u[15];
3470
3471 tmp1 = half_btf_avx2(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
3472 tmp2 = half_btf_avx2(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
3473 tmp3 = half_btf_avx2(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
3474 tmp4 = half_btf_avx2(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
3475 u[25] = half_btf_avx2(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
3476 u[26] = half_btf_avx2(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
3477 u[29] = half_btf_avx2(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
3478 u[30] = half_btf_avx2(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
3479 u[17] = tmp1;
3480 u[18] = tmp2;
3481 u[21] = tmp3;
3482 u[22] = tmp4;
3483
3484 for (i = 32; i < 64; i += 8) {
3485 addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3486 &clamp_hi);
3487 addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3488 &clamp_hi);
3489
3490 addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3491 &clamp_hi);
3492 addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3493 &clamp_hi);
3494 }
3495
3496 // stage 6
3497 tmp1 = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3498 u[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3499 u[0] = tmp1;
3500 u[5] = u[4];
3501 u[6] = u[7];
3502
3503 tmp1 = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3504 u[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3505 u[9] = tmp1;
3506 tmp2 = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
3507 u[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
3508 u[10] = tmp2;
3509
3510 for (i = 16; i < 32; i += 8) {
3511 addsub_avx2(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3512 &clamp_hi);
3513 addsub_avx2(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3514 &clamp_hi);
3515
3516 addsub_avx2(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3517 &clamp_hi);
3518 addsub_avx2(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3519 &clamp_hi);
3520 }
3521
3522 tmp1 = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
3523 tmp2 = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
3524 tmp3 = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
3525 tmp4 = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
3526 u[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
3527 u[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
3528 u[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
3529 u[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
3530 u[34] = tmp1;
3531 u[35] = tmp2;
3532 u[36] = tmp3;
3533 u[37] = tmp4;
3534
3535 tmp1 = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
3536 tmp2 = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
3537 tmp3 = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
3538 tmp4 = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
3539 u[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
3540 u[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
3541 u[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
3542 u[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
3543 u[42] = tmp1;
3544 u[43] = tmp2;
3545 u[44] = tmp3;
3546 u[45] = tmp4;
3547
3548 // stage 7
3549 u[3] = u[0];
3550 u[2] = u[1];
3551 tmp1 = half_btf_avx2(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
3552 u[6] = half_btf_avx2(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
3553 u[5] = tmp1;
3554 addsub_avx2(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
3555 addsub_avx2(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
3556 addsub_avx2(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
3557 addsub_avx2(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
3558
3559 tmp1 = half_btf_avx2(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
3560 tmp2 = half_btf_avx2(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
3561 tmp3 = half_btf_avx2(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
3562 tmp4 = half_btf_avx2(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
3563 u[26] = half_btf_avx2(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
3564 u[27] = half_btf_avx2(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
3565 u[28] = half_btf_avx2(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
3566 u[29] = half_btf_avx2(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
3567 u[18] = tmp1;
3568 u[19] = tmp2;
3569 u[20] = tmp3;
3570 u[21] = tmp4;
3571
3572 for (i = 32; i < 64; i += 16) {
3573 for (j = i; j < i + 4; j++) {
3574 addsub_avx2(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3575 addsub_avx2(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3576 &clamp_hi);
3577 }
3578 }
3579
3580 // stage 8
3581 for (i = 0; i < 4; ++i) {
3582 addsub_avx2(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
3583 }
3584
3585 idct64_stage8_avx2(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
3586 &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
3587
3588 // stage 9
3589 idct64_stage9_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3590 bit);
3591
3592 // stage 10
3593 idct64_stage10_avx2(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3594 bit);
3595
3596 // stage 11
3597 idct64_stage11_avx2(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
3598 }
3599 }
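// Full 64-point inverse DCT. Only in[0..31] are read here (the upper half of
// a 64-point transform's coefficients is zeroed in AV1); stage 1 scatters
// them across the 64-entry working arrays in the order the butterfly network
// expects.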
3600 static void idct64_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
3601 int out_shift) {
3602 int i, j;
3603 const int32_t *cospi = cospi_arr(bit);
3604 const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
3605 const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3606 const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
3607 const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
3608
3609 const __m256i cospi1 = _mm256_set1_epi32(cospi[1]);
3610 const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
3611 const __m256i cospi3 = _mm256_set1_epi32(cospi[3]);
3612 const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
3613 const __m256i cospi5 = _mm256_set1_epi32(cospi[5]);
3614 const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
3615 const __m256i cospi7 = _mm256_set1_epi32(cospi[7]);
3616 const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
3617 const __m256i cospi9 = _mm256_set1_epi32(cospi[9]);
3618 const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
3619 const __m256i cospi11 = _mm256_set1_epi32(cospi[11]);
3620 const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
3621 const __m256i cospi13 = _mm256_set1_epi32(cospi[13]);
3622 const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
3623 const __m256i cospi15 = _mm256_set1_epi32(cospi[15]);
3624 const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
3625 const __m256i cospi17 = _mm256_set1_epi32(cospi[17]);
3626 const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
3627 const __m256i cospi19 = _mm256_set1_epi32(cospi[19]);
3628 const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
3629 const __m256i cospi21 = _mm256_set1_epi32(cospi[21]);
3630 const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
3631 const __m256i cospi23 = _mm256_set1_epi32(cospi[23]);
3632 const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
3633 const __m256i cospi25 = _mm256_set1_epi32(cospi[25]);
3634 const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
3635 const __m256i cospi27 = _mm256_set1_epi32(cospi[27]);
3636 const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
3637 const __m256i cospi29 = _mm256_set1_epi32(cospi[29]);
3638 const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
3639 const __m256i cospi31 = _mm256_set1_epi32(cospi[31]);
3640 const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
3641 const __m256i cospi35 = _mm256_set1_epi32(cospi[35]);
3642 const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
3643 const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
3644 const __m256i cospi39 = _mm256_set1_epi32(cospi[39]);
3645 const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
3646 const __m256i cospi43 = _mm256_set1_epi32(cospi[43]);
3647 const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
3648 const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
3649 const __m256i cospi47 = _mm256_set1_epi32(cospi[47]);
3650 const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
3651 const __m256i cospi51 = _mm256_set1_epi32(cospi[51]);
3652 const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
3653 const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
3654 const __m256i cospi55 = _mm256_set1_epi32(cospi[55]);
3655 const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
3656 const __m256i cospi59 = _mm256_set1_epi32(cospi[59]);
3657 const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
3658 const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
3659 const __m256i cospi63 = _mm256_set1_epi32(cospi[63]);
3660
3661 const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
3662 const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
3663 const __m256i cospim12 = _mm256_set1_epi32(-cospi[12]);
3664 const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
3665 const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
3666 const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
3667 const __m256i cospim28 = _mm256_set1_epi32(-cospi[28]);
3668 const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
3669 const __m256i cospim33 = _mm256_set1_epi32(-cospi[33]);
3670 const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
3671 const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
3672 const __m256i cospim37 = _mm256_set1_epi32(-cospi[37]);
3673 const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
3674 const __m256i cospim41 = _mm256_set1_epi32(-cospi[41]);
3675 const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
3676 const __m256i cospim44 = _mm256_set1_epi32(-cospi[44]);
3677 const __m256i cospim45 = _mm256_set1_epi32(-cospi[45]);
3678 const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
3679 const __m256i cospim49 = _mm256_set1_epi32(-cospi[49]);
3680 const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
3681 const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
3682 const __m256i cospim53 = _mm256_set1_epi32(-cospi[53]);
3683 const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
3684 const __m256i cospim57 = _mm256_set1_epi32(-cospi[57]);
3685 const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
3686 const __m256i cospim60 = _mm256_set1_epi32(-cospi[60]);
3687 const __m256i cospim61 = _mm256_set1_epi32(-cospi[61]);
3688
3689 {
3690 __m256i u[64], v[64];
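    // u[] and v[] serve as alternating scratch buffers between stages; lanes a
    // stage leaves untouched are copied across explicitly.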
3691
3692 // stage 1
3693 u[32] = in[1];
3694 u[34] = in[17];
3695 u[36] = in[9];
3696 u[38] = in[25];
3697 u[40] = in[5];
3698 u[42] = in[21];
3699 u[44] = in[13];
3700 u[46] = in[29];
3701 u[48] = in[3];
3702 u[50] = in[19];
3703 u[52] = in[11];
3704 u[54] = in[27];
3705 u[56] = in[7];
3706 u[58] = in[23];
3707 u[60] = in[15];
3708 u[62] = in[31];
3709
3710 v[16] = in[2];
3711 v[18] = in[18];
3712 v[20] = in[10];
3713 v[22] = in[26];
3714 v[24] = in[6];
3715 v[26] = in[22];
3716 v[28] = in[14];
3717 v[30] = in[30];
3718
3719 u[8] = in[4];
3720 u[10] = in[20];
3721 u[12] = in[12];
3722 u[14] = in[28];
3723
3724 v[4] = in[8];
3725 v[6] = in[24];
3726
3727 u[0] = in[0];
3728 u[2] = in[16];
3729
3730 // stage 2
3731 v[32] = half_btf_0_avx2(&cospi63, &u[32], &rnding, bit);
3732 v[33] = half_btf_0_avx2(&cospim33, &u[62], &rnding, bit);
3733 v[34] = half_btf_0_avx2(&cospi47, &u[34], &rnding, bit);
3734 v[35] = half_btf_0_avx2(&cospim49, &u[60], &rnding, bit);
3735 v[36] = half_btf_0_avx2(&cospi55, &u[36], &rnding, bit);
3736 v[37] = half_btf_0_avx2(&cospim41, &u[58], &rnding, bit);
3737 v[38] = half_btf_0_avx2(&cospi39, &u[38], &rnding, bit);
3738 v[39] = half_btf_0_avx2(&cospim57, &u[56], &rnding, bit);
3739 v[40] = half_btf_0_avx2(&cospi59, &u[40], &rnding, bit);
3740 v[41] = half_btf_0_avx2(&cospim37, &u[54], &rnding, bit);
3741 v[42] = half_btf_0_avx2(&cospi43, &u[42], &rnding, bit);
3742 v[43] = half_btf_0_avx2(&cospim53, &u[52], &rnding, bit);
3743 v[44] = half_btf_0_avx2(&cospi51, &u[44], &rnding, bit);
3744 v[45] = half_btf_0_avx2(&cospim45, &u[50], &rnding, bit);
3745 v[46] = half_btf_0_avx2(&cospi35, &u[46], &rnding, bit);
3746 v[47] = half_btf_0_avx2(&cospim61, &u[48], &rnding, bit);
3747 v[48] = half_btf_0_avx2(&cospi3, &u[48], &rnding, bit);
3748 v[49] = half_btf_0_avx2(&cospi29, &u[46], &rnding, bit);
3749 v[50] = half_btf_0_avx2(&cospi19, &u[50], &rnding, bit);
3750 v[51] = half_btf_0_avx2(&cospi13, &u[44], &rnding, bit);
3751 v[52] = half_btf_0_avx2(&cospi11, &u[52], &rnding, bit);
3752 v[53] = half_btf_0_avx2(&cospi21, &u[42], &rnding, bit);
3753 v[54] = half_btf_0_avx2(&cospi27, &u[54], &rnding, bit);
3754 v[55] = half_btf_0_avx2(&cospi5, &u[40], &rnding, bit);
3755 v[56] = half_btf_0_avx2(&cospi7, &u[56], &rnding, bit);
3756 v[57] = half_btf_0_avx2(&cospi25, &u[38], &rnding, bit);
3757 v[58] = half_btf_0_avx2(&cospi23, &u[58], &rnding, bit);
3758 v[59] = half_btf_0_avx2(&cospi9, &u[36], &rnding, bit);
3759 v[60] = half_btf_0_avx2(&cospi15, &u[60], &rnding, bit);
3760 v[61] = half_btf_0_avx2(&cospi17, &u[34], &rnding, bit);
3761 v[62] = half_btf_0_avx2(&cospi31, &u[62], &rnding, bit);
3762 v[63] = half_btf_0_avx2(&cospi1, &u[32], &rnding, bit);
3763
3764 // stage 3
3765 u[16] = half_btf_0_avx2(&cospi62, &v[16], &rnding, bit);
3766 u[17] = half_btf_0_avx2(&cospim34, &v[30], &rnding, bit);
3767 u[18] = half_btf_0_avx2(&cospi46, &v[18], &rnding, bit);
3768 u[19] = half_btf_0_avx2(&cospim50, &v[28], &rnding, bit);
3769 u[20] = half_btf_0_avx2(&cospi54, &v[20], &rnding, bit);
3770 u[21] = half_btf_0_avx2(&cospim42, &v[26], &rnding, bit);
3771 u[22] = half_btf_0_avx2(&cospi38, &v[22], &rnding, bit);
3772 u[23] = half_btf_0_avx2(&cospim58, &v[24], &rnding, bit);
3773 u[24] = half_btf_0_avx2(&cospi6, &v[24], &rnding, bit);
3774 u[25] = half_btf_0_avx2(&cospi26, &v[22], &rnding, bit);
3775 u[26] = half_btf_0_avx2(&cospi22, &v[26], &rnding, bit);
3776 u[27] = half_btf_0_avx2(&cospi10, &v[20], &rnding, bit);
3777 u[28] = half_btf_0_avx2(&cospi14, &v[28], &rnding, bit);
3778 u[29] = half_btf_0_avx2(&cospi18, &v[18], &rnding, bit);
3779 u[30] = half_btf_0_avx2(&cospi30, &v[30], &rnding, bit);
3780 u[31] = half_btf_0_avx2(&cospi2, &v[16], &rnding, bit);
3781
3782 for (i = 32; i < 64; i += 4) {
3783 addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
3784 &clamp_hi);
3785 addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
3786 &clamp_hi);
3787 }
3788
3789 // stage 4
3790 v[8] = half_btf_0_avx2(&cospi60, &u[8], &rnding, bit);
3791 v[9] = half_btf_0_avx2(&cospim36, &u[14], &rnding, bit);
3792 v[10] = half_btf_0_avx2(&cospi44, &u[10], &rnding, bit);
3793 v[11] = half_btf_0_avx2(&cospim52, &u[12], &rnding, bit);
3794 v[12] = half_btf_0_avx2(&cospi12, &u[12], &rnding, bit);
3795 v[13] = half_btf_0_avx2(&cospi20, &u[10], &rnding, bit);
3796 v[14] = half_btf_0_avx2(&cospi28, &u[14], &rnding, bit);
3797 v[15] = half_btf_0_avx2(&cospi4, &u[8], &rnding, bit);
3798
3799 for (i = 16; i < 32; i += 4) {
3800 addsub_avx2(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
3801 &clamp_hi);
3802 addsub_avx2(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
3803 &clamp_hi);
3804 }
3805
3806 for (i = 32; i < 64; i += 4) {
3807 v[i + 0] = u[i + 0];
3808 v[i + 3] = u[i + 3];
3809 }
3810
3811 v[33] = half_btf_avx2(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3812 v[34] = half_btf_avx2(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
3813 v[37] = half_btf_avx2(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
3814 v[38] = half_btf_avx2(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3815 v[41] = half_btf_avx2(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3816 v[42] = half_btf_avx2(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
3817 v[45] = half_btf_avx2(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
3818 v[46] = half_btf_avx2(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3819 v[49] = half_btf_avx2(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3820 v[50] = half_btf_avx2(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
3821 v[53] = half_btf_avx2(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
3822 v[54] = half_btf_avx2(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3823 v[57] = half_btf_avx2(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3824 v[58] = half_btf_avx2(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
3825 v[61] = half_btf_avx2(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
3826 v[62] = half_btf_avx2(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3827
3828 // stage 5
3829 u[4] = half_btf_0_avx2(&cospi56, &v[4], &rnding, bit);
3830 u[5] = half_btf_0_avx2(&cospim40, &v[6], &rnding, bit);
3831 u[6] = half_btf_0_avx2(&cospi24, &v[6], &rnding, bit);
3832 u[7] = half_btf_0_avx2(&cospi8, &v[4], &rnding, bit);
3833
3834 for (i = 8; i < 16; i += 4) {
3835 addsub_avx2(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
3836 &clamp_hi);
3837 addsub_avx2(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
3838 &clamp_hi);
3839 }
3840
3841 for (i = 16; i < 32; i += 4) {
3842 u[i + 0] = v[i + 0];
3843 u[i + 3] = v[i + 3];
3844 }
3845
3846 u[17] = half_btf_avx2(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
3847 u[18] = half_btf_avx2(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
3848 u[21] = half_btf_avx2(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
3849 u[22] = half_btf_avx2(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
3850 u[25] = half_btf_avx2(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
3851 u[26] = half_btf_avx2(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
3852 u[29] = half_btf_avx2(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
3853 u[30] = half_btf_avx2(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
3854
3855 for (i = 32; i < 64; i += 8) {
3856 addsub_avx2(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3857 &clamp_hi);
3858 addsub_avx2(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3859 &clamp_hi);
3860
3861 addsub_avx2(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3862 &clamp_hi);
3863 addsub_avx2(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3864 &clamp_hi);
3865 }
3866
3867 // stage 6
3868 v[0] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3869 v[1] = half_btf_0_avx2(&cospi32, &u[0], &rnding, bit);
3870 v[2] = half_btf_0_avx2(&cospi48, &u[2], &rnding, bit);
3871 v[3] = half_btf_0_avx2(&cospi16, &u[2], &rnding, bit);
3872
3873 addsub_avx2(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
3874 addsub_avx2(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
3875
3876 for (i = 8; i < 16; i += 4) {
3877 v[i + 0] = u[i + 0];
3878 v[i + 3] = u[i + 3];
3879 }
3880
3881 v[9] = half_btf_avx2(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3882 v[10] = half_btf_avx2(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
3883 v[13] = half_btf_avx2(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
3884 v[14] = half_btf_avx2(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3885
3886 for (i = 16; i < 32; i += 8) {
3887 addsub_avx2(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
3888 &clamp_hi);
3889 addsub_avx2(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
3890 &clamp_hi);
3891
3892 addsub_avx2(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
3893 &clamp_hi);
3894 addsub_avx2(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
3895 &clamp_hi);
3896 }
3897
3898 for (i = 32; i < 64; i += 8) {
3899 v[i + 0] = u[i + 0];
3900 v[i + 1] = u[i + 1];
3901 v[i + 6] = u[i + 6];
3902 v[i + 7] = u[i + 7];
3903 }
3904
3905 v[34] = half_btf_avx2(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
3906 v[35] = half_btf_avx2(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
3907 v[36] = half_btf_avx2(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
3908 v[37] = half_btf_avx2(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
3909 v[42] = half_btf_avx2(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
3910 v[43] = half_btf_avx2(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
3911 v[44] = half_btf_avx2(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
3912 v[45] = half_btf_avx2(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
3913 v[50] = half_btf_avx2(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
3914 v[51] = half_btf_avx2(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
3915 v[52] = half_btf_avx2(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
3916 v[53] = half_btf_avx2(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
3917 v[58] = half_btf_avx2(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
3918 v[59] = half_btf_avx2(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
3919 v[60] = half_btf_avx2(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
3920 v[61] = half_btf_avx2(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
3921
3922 // stage 7
3923 addsub_avx2(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
3924 addsub_avx2(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
3925
3926 u[4] = v[4];
3927 u[7] = v[7];
3928 u[5] = half_btf_avx2(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
3929 u[6] = half_btf_avx2(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
3930
3931 addsub_avx2(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
3932 addsub_avx2(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
3933 addsub_avx2(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
3934 addsub_avx2(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
3935
3936 for (i = 16; i < 32; i += 8) {
3937 u[i + 0] = v[i + 0];
3938 u[i + 1] = v[i + 1];
3939 u[i + 6] = v[i + 6];
3940 u[i + 7] = v[i + 7];
3941 }
3942
3943 u[18] = half_btf_avx2(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
3944 u[19] = half_btf_avx2(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
3945 u[20] = half_btf_avx2(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
3946 u[21] = half_btf_avx2(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
3947 u[26] = half_btf_avx2(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
3948 u[27] = half_btf_avx2(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
3949 u[28] = half_btf_avx2(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
3950 u[29] = half_btf_avx2(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
3951
3952 for (i = 32; i < 64; i += 16) {
3953 for (j = i; j < i + 4; j++) {
3954 addsub_avx2(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3955 addsub_avx2(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3956 &clamp_hi);
3957 }
3958 }
3959
3960 // stage 8
3961 for (i = 0; i < 4; ++i) {
3962 addsub_avx2(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
3963 }
3964
3965 v[8] = u[8];
3966 v[9] = u[9];
3967 v[14] = u[14];
3968 v[15] = u[15];
3969
3970 v[10] = half_btf_avx2(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
3971 v[11] = half_btf_avx2(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
3972 v[12] = half_btf_avx2(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
3973 v[13] = half_btf_avx2(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
3974
3975 for (i = 16; i < 20; ++i) {
3976 addsub_avx2(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
3977 addsub_avx2(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
3978 &clamp_hi);
3979 }
3980
3981 for (i = 32; i < 36; ++i) {
3982 v[i] = u[i];
3983 v[i + 12] = u[i + 12];
3984 v[i + 16] = u[i + 16];
3985 v[i + 28] = u[i + 28];
3986 }
3987
3988 v[36] = half_btf_avx2(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
3989 v[37] = half_btf_avx2(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
3990 v[38] = half_btf_avx2(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
3991 v[39] = half_btf_avx2(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
3992 v[40] = half_btf_avx2(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
3993 v[41] = half_btf_avx2(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
3994 v[42] = half_btf_avx2(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
3995 v[43] = half_btf_avx2(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
3996 v[52] = half_btf_avx2(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
3997 v[53] = half_btf_avx2(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
3998 v[54] = half_btf_avx2(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
3999 v[55] = half_btf_avx2(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
4000 v[56] = half_btf_avx2(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
4001 v[57] = half_btf_avx2(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
4002 v[58] = half_btf_avx2(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
4003 v[59] = half_btf_avx2(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
4004
4005 // stage 9
4006 for (i = 0; i < 8; ++i) {
4007 addsub_avx2(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
4008 }
4009
4010 for (i = 16; i < 20; ++i) {
4011 u[i] = v[i];
4012 u[i + 12] = v[i + 12];
4013 }
4014
4015 u[20] = half_btf_avx2(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
4016 u[21] = half_btf_avx2(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
4017 u[22] = half_btf_avx2(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
4018 u[23] = half_btf_avx2(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
4019 u[24] = half_btf_avx2(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
4020 u[25] = half_btf_avx2(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
4021 u[26] = half_btf_avx2(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
4022 u[27] = half_btf_avx2(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
4023
4024 for (i = 32; i < 40; i++) {
4025 addsub_avx2(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
4026 }
4027
4028 for (i = 48; i < 56; i++) {
4029 addsub_avx2(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
4030 }
4031
4032 // stage 10
4033 for (i = 0; i < 16; i++) {
4034 addsub_avx2(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
4035 }
4036
4037 for (i = 32; i < 40; i++) v[i] = u[i];
4038
    v[40] = half_btf_avx2(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
    v[41] = half_btf_avx2(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
    v[42] = half_btf_avx2(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
    v[43] = half_btf_avx2(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
    v[44] = half_btf_avx2(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
    v[45] = half_btf_avx2(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
    v[46] = half_btf_avx2(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
    v[47] = half_btf_avx2(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
    v[48] = half_btf_avx2(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
    v[49] = half_btf_avx2(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
    v[50] = half_btf_avx2(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
    v[51] = half_btf_avx2(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
    v[52] = half_btf_avx2(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
    v[53] = half_btf_avx2(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
    v[54] = half_btf_avx2(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
    v[55] = half_btf_avx2(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);

    for (i = 56; i < 64; i++) v[i] = u[i];

    // stage 11
    for (i = 0; i < 32; i++) {
      addsub_avx2(v[i], v[63 - i], &out[(i)], &out[(63 - i)], &clamp_lo,
                  &clamp_hi);
    }
    if (!do_cols) {
      const int log_range_out = AOMMAX(16, bd + 6);
      const __m256i clamp_lo_out =
          _mm256_set1_epi32(-(1 << (log_range_out - 1)));
      const __m256i clamp_hi_out =
          _mm256_set1_epi32((1 << (log_range_out - 1)) - 1);

      round_shift_8x8_avx2(out, out_shift);
      round_shift_8x8_avx2(out + 16, out_shift);
      round_shift_8x8_avx2(out + 32, out_shift);
      round_shift_8x8_avx2(out + 48, out_shift);
      highbd_clamp_epi32_avx2(out, out, &clamp_lo_out, &clamp_hi_out, 64);
    }
  }
}
typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit,
                                  int do_cols, int bd, int out_shift);

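// The table below is indexed as [tx size index][1-D transform type][kernel],
// where the last index (derived from the eob position via
// lowbd_txfm_all_1d_zeros_idx) selects a kernel that assumes only the first
// 1, 8, 16, or all coefficients in that dimension are non-zero. NULL entries
// are combinations that either cannot occur in AV1 (e.g. ADST above length 16)
// or are handled by a different path (4-wide sizes and identity transforms use
// the SSE4.1 functions).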
static const transform_1d_avx2
    highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
      },
      {
          { idct8x8_low1_avx2, idct8x8_avx2, NULL, NULL },
          { iadst8x8_low1_avx2, iadst8x8_avx2, NULL, NULL },
          { NULL, NULL, NULL, NULL },
      },
      {
          { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
          { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },

      { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2, idct64_avx2 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };

static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,
                                                    uint16_t *output, int stride,
                                                    TX_TYPE tx_type,
                                                    TX_SIZE tx_size, int eob,
                                                    const int bd) {
  __m256i buf1[64 * 8];
  int eobx, eoby;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 3;
  // Width of the non-zero coefficient region, rounded up to a multiple of 8.
  const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
  // Number of 8-row groups that contain non-zero coefficients.
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int input_stride = AOMMIN(32, txfm_size_row);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_avx2 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_1d_avx2 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
    __m256i buf0[64];
    load_buffer_32bit_input(input + i * 8, input_stride, buf0,
                            buf_size_nonzero_w);
    if (rect_type == 1 || rect_type == -1) {
      round_shift_rect_array_32_avx2(buf0, buf0, buf_size_nonzero_w, 0,
                                     NewInvSqrt2);
    }
    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);

    __m256i *_buf1 = buf1 + i * 8;
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        transpose_8x8_flip_avx2(
            &buf0[j * 8], &_buf1[(buf_size_w_div8 - 1 - j) * txfm_size_row]);
      }
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]);
      }
    }
  }
  // 2nd stage: column transform
  for (int i = 0; i < buf_size_w_div8; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
             bd, 0);

    round_shift_array_32_avx2(buf1 + i * txfm_size_row,
                              buf1 + i * txfm_size_row, txfm_size_row,
                              -shift[1]);
  }

  // write to buffer
  if (txfm_size_col >= 16) {
    for (int i = 0; i < (txfm_size_col >> 4); i++) {
      highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,
                                    output + 16 * i, stride, ud_flip,
                                    txfm_size_row, bd);
    }
  } else if (txfm_size_col == 8) {
    highbd_write_buffer_8xn_avx2(buf1, output, stride, ud_flip, txfm_size_row,
                                 bd);
  }
}

static void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input,
                                                     uint8_t *output, int stride,
                                                     TX_TYPE tx_type,
                                                     TX_SIZE tx_size, int eob,
                                                     const int bd) {
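  // Only the non-identity transform types have AVX2 kernels here; identity
  // and mixed identity/1-D types reuse the SSE4.1 implementation.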
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
    case DCT_ADST:
    case ADST_ADST:
    case FLIPADST_DCT:
    case DCT_FLIPADST:
    case FLIPADST_FLIPADST:
    case ADST_FLIPADST:
    case FLIPADST_ADST:
      highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output),
                                             stride, tx_type, tx_size, eob, bd);
      break;
    case IDTX:
    case H_DCT:
    case H_ADST:
    case H_FLIPADST:
    case V_DCT:
    case V_ADST:
    case V_FLIPADST:
      av1_highbd_inv_txfm2d_add_universe_sse4_1(input, output, stride, tx_type,
                                                tx_size, eob, bd);
      break;
    default: assert(0); break;
  }
}
void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
                                  int stride, const TxfmParam *txfm_param) {
  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
  const TX_SIZE tx_size = txfm_param->tx_size;
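  // Sizes with a dimension of 4 are too narrow for the 8-wide AVX2 kernels
  // and are routed to the SSE4.1 implementation instead.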
  switch (tx_size) {
    case TX_4X8:
    case TX_8X4:
    case TX_4X4:
    case TX_16X4:
    case TX_4X16:
      av1_highbd_inv_txfm_add_sse4_1(input, dest, stride, txfm_param);
      break;
    default:
      av1_highbd_inv_txfm2d_add_universe_avx2(
          input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
          txfm_param->eob, txfm_param->bd);
      break;
  }
}
