1 /*
2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include "config/aom_config.h"
13
14 #include "config/av1_rtcd.h"
15
16 #include "av1/common/av1_inv_txfm1d_cfg.h"
17 #include "av1/common/x86/av1_txfm_sse2.h"
18 #include "av1/common/x86/av1_inv_txfm_avx2.h"
19 #include "av1/common/x86/av1_inv_txfm_ssse3.h"
20
21 // TODO([email protected]): move this to header file
22
23 // Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5
24 static int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
25 4 * 5793 };
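// These entries are Q12 fixed point: 5793 == round(Sqrt2 * 2^12), so entry n
// holds Sqrt2^(n + 1) scaled by 2^12 (cf. NewSqrt2/NewSqrt2Bits in
// av1_txfm.h); they are used when rescaling rectangular transform sizes.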
26
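// Helper conventions used throughout this file (the exact definitions live in
// av1_txfm_sse2.h / av1_inv_txfm_avx2.h): each __m256i holds 16 int16_t
// values, i.e. the same coefficient position across 16 columns processed in
// parallel. In essence,
//   btf_16_adds_subs_avx2(&x, &y):  x' = sat16(x + y), y' = sat16(x - y)
//   btf_16_w16_avx2(pair(a, b), pair(c, d), &x, &y, _r, bit):
//     x' = (a * x + b * y + _r) >> bit,  y' = (c * x + d * y + _r) >> bit
//   btf_16_w16_0_avx2(a, b, in, x, y):  fast path for a single non-zero
//     input, x ~= (a * in) >> INV_COS_BIT and y ~= (b * in) >> INV_COS_BIT
// where pair(a, b) stands for pair_set_w16_epi16(a, b).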
static inline void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
29 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
30 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
31 btf_16_adds_subs_avx2(&x1[0], &x1[3]);
32 btf_16_adds_subs_avx2(&x1[1], &x1[2]);
33 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
34
35 btf_16_adds_subs_avx2(&x1[8], &x1[11]);
36 btf_16_adds_subs_avx2(&x1[9], &x1[10]);
37 btf_16_adds_subs_avx2(&x1[15], &x1[12]);
38 btf_16_adds_subs_avx2(&x1[14], &x1[13]);
39 }
40
static inline void idct16_stage6_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
43 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
44 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
45 btf_16_adds_subs_avx2(&x[0], &x[7]);
46 btf_16_adds_subs_avx2(&x[1], &x[6]);
47 btf_16_adds_subs_avx2(&x[2], &x[5]);
48 btf_16_adds_subs_avx2(&x[3], &x[4]);
49 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
50 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
51 }
52
static inline void idct16_stage7_avx2(__m256i *output, __m256i *x1) {
54 btf_16_adds_subs_out_avx2(&output[0], &output[15], x1[0], x1[15]);
55 btf_16_adds_subs_out_avx2(&output[1], &output[14], x1[1], x1[14]);
56 btf_16_adds_subs_out_avx2(&output[2], &output[13], x1[2], x1[13]);
57 btf_16_adds_subs_out_avx2(&output[3], &output[12], x1[3], x1[12]);
58 btf_16_adds_subs_out_avx2(&output[4], &output[11], x1[4], x1[11]);
59 btf_16_adds_subs_out_avx2(&output[5], &output[10], x1[5], x1[10]);
60 btf_16_adds_subs_out_avx2(&output[6], &output[9], x1[6], x1[9]);
61 btf_16_adds_subs_out_avx2(&output[7], &output[8], x1[7], x1[8]);
62 }
63
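// Full 16-point inverse DCT over 16 columns at a time. Stage 1 loads the
// inputs in bit-reversed order for the butterfly network; stages 2-7 then
// follow the reference 1-D idct16 flow using the helpers above.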
static void idct16_avx2(const __m256i *input, __m256i *output) {
65 const int32_t *cospi = cospi_arr(INV_COS_BIT);
66 const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
67
68 __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
69 __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
70 __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
71 __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
72 __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
73 __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
74 __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
75 __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
76 __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
77 __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
78 __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
79 __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
80 __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
81 __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
82 __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
83 __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
84 __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
85 __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
86 __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
87
88 // stage 1
89 __m256i x1[16];
90 x1[0] = input[0];
91 x1[1] = input[8];
92 x1[2] = input[4];
93 x1[3] = input[12];
94 x1[4] = input[2];
95 x1[5] = input[10];
96 x1[6] = input[6];
97 x1[7] = input[14];
98 x1[8] = input[1];
99 x1[9] = input[9];
100 x1[10] = input[5];
101 x1[11] = input[13];
102 x1[12] = input[3];
103 x1[13] = input[11];
104 x1[14] = input[7];
105 x1[15] = input[15];
106
107 // stage 2
108 btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r,
109 INV_COS_BIT);
110 btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r,
111 INV_COS_BIT);
112 btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r,
113 INV_COS_BIT);
114 btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r,
115 INV_COS_BIT);
116
117 // stage 3
118 btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r,
119 INV_COS_BIT);
120 btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r,
121 INV_COS_BIT);
122 btf_16_adds_subs_avx2(&x1[8], &x1[9]);
123 btf_16_adds_subs_avx2(&x1[11], &x1[10]);
124 btf_16_adds_subs_avx2(&x1[12], &x1[13]);
125 btf_16_adds_subs_avx2(&x1[15], &x1[14]);
126
127 // stage 4
128 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r,
129 INV_COS_BIT);
130 btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r,
131 INV_COS_BIT);
132 btf_16_adds_subs_avx2(&x1[4], &x1[5]);
133 btf_16_adds_subs_avx2(&x1[7], &x1[6]);
134 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r,
135 INV_COS_BIT);
136 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r,
137 INV_COS_BIT);
138
139 idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
140 idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
141 idct16_stage7_avx2(output, x1);
142 }
143
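// The *_low8/_low1 variants below assume that only the first 8 (or 1) input
// vectors are non-zero; the caller selects them based on the eob. This lets
// the early stages use the single-input btf_16_w16_0_avx2 form and skip the
// butterflies whose operands are known to be zero.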
static void idct16_low8_avx2(const __m256i *input, __m256i *output) {
145 const int32_t *cospi = cospi_arr(INV_COS_BIT);
146 const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
147
148 const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
149 const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
150 const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
151
152 // stage 1
153 __m256i x1[16];
154 x1[0] = input[0];
155 x1[2] = input[4];
156 x1[4] = input[2];
157 x1[6] = input[6];
158 x1[8] = input[1];
159 x1[10] = input[5];
160 x1[12] = input[3];
161 x1[14] = input[7];
162
163 // stage 2
164 btf_16_w16_0_avx2(cospi[60], cospi[4], x1[8], x1[8], x1[15]);
165 btf_16_w16_0_avx2(-cospi[36], cospi[28], x1[14], x1[9], x1[14]);
166 btf_16_w16_0_avx2(cospi[44], cospi[20], x1[10], x1[10], x1[13]);
167 btf_16_w16_0_avx2(-cospi[52], cospi[12], x1[12], x1[11], x1[12]);
168
169 // stage 3
170 btf_16_w16_0_avx2(cospi[56], cospi[8], x1[4], x1[4], x1[7]);
171 btf_16_w16_0_avx2(-cospi[40], cospi[24], x1[6], x1[5], x1[6]);
172 btf_16_adds_subs_avx2(&x1[8], &x1[9]);
173 btf_16_adds_subs_avx2(&x1[11], &x1[10]);
174 btf_16_adds_subs_avx2(&x1[12], &x1[13]);
175 btf_16_adds_subs_avx2(&x1[15], &x1[14]);
176
177 // stage 4
178 btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
179 btf_16_w16_0_avx2(cospi[48], cospi[16], x1[2], x1[2], x1[3]);
180 btf_16_adds_subs_avx2(&x1[4], &x1[5]);
181 btf_16_adds_subs_avx2(&x1[7], &x1[6]);
182 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r,
183 INV_COS_BIT);
184 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r,
185 INV_COS_BIT);
186
187 idct16_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
188 idct16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
189 idct16_stage7_avx2(output, x1);
190 }
191
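// DC-only path: with just input[0] non-zero, the single cospi[32] scale in
// stage 4 leaves x1[0] == x1[1], so all 16 outputs below carry the same
// constant and stages 5-7 collapse into plain copies.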
static void idct16_low1_avx2(const __m256i *input, __m256i *output) {
193 const int32_t *cospi = cospi_arr(INV_COS_BIT);
194
195 // stage 1
196 __m256i x1[2];
197 x1[0] = input[0];
198
199 // stage 2
200 // stage 3
201 // stage 4
202 btf_16_w16_0_avx2(cospi[32], cospi[32], x1[0], x1[0], x1[1]);
203
204 // stage 5
205 // stage 6
206 output[0] = x1[0];
207 output[1] = x1[1];
208 output[2] = x1[1];
209 output[3] = x1[0];
210 output[4] = x1[0];
211 output[5] = x1[1];
212 output[6] = x1[1];
213 output[7] = x1[0];
214 output[8] = x1[0];
215 output[9] = x1[1];
216 output[10] = x1[1];
217 output[11] = x1[0];
218 output[12] = x1[0];
219 output[13] = x1[1];
220 output[14] = x1[1];
221 output[15] = x1[0];
222 }
223
static inline void iadst16_stage3_avx2(__m256i *x) {
225 btf_16_adds_subs_avx2(&x[0], &x[8]);
226 btf_16_adds_subs_avx2(&x[1], &x[9]);
227 btf_16_adds_subs_avx2(&x[2], &x[10]);
228 btf_16_adds_subs_avx2(&x[3], &x[11]);
229 btf_16_adds_subs_avx2(&x[4], &x[12]);
230 btf_16_adds_subs_avx2(&x[5], &x[13]);
231 btf_16_adds_subs_avx2(&x[6], &x[14]);
232 btf_16_adds_subs_avx2(&x[7], &x[15]);
233 }
234
static inline void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi,
                                       const __m256i _r, int8_t cos_bit) {
237 const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
238 const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
239 const __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
240 const __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
241 const __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
242 const __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
243 btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x[8], &x[9], _r, cos_bit);
244 btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x[10], &x[11], _r, cos_bit);
245 btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x[12], &x[13], _r, cos_bit);
246 btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x[14], &x[15], _r, cos_bit);
247 }
248
static inline void iadst16_stage5_avx2(__m256i *x) {
250 btf_16_adds_subs_avx2(&x[0], &x[4]);
251 btf_16_adds_subs_avx2(&x[1], &x[5]);
252 btf_16_adds_subs_avx2(&x[2], &x[6]);
253 btf_16_adds_subs_avx2(&x[3], &x[7]);
254 btf_16_adds_subs_avx2(&x[8], &x[12]);
255 btf_16_adds_subs_avx2(&x[9], &x[13]);
256 btf_16_adds_subs_avx2(&x[10], &x[14]);
257 btf_16_adds_subs_avx2(&x[11], &x[15]);
258 }
259
static inline void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi,
                                       const __m256i _r, int8_t cos_bit) {
262 const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
263 const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
264 const __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
265 btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[4], &x[5], _r, cos_bit);
266 btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[6], &x[7], _r, cos_bit);
267 btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x[12], &x[13], _r, cos_bit);
268 btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x[14], &x[15], _r, cos_bit);
269 }
270
static inline void iadst16_stage7_avx2(__m256i *x) {
272 btf_16_adds_subs_avx2(&x[0], &x[2]);
273 btf_16_adds_subs_avx2(&x[1], &x[3]);
274 btf_16_adds_subs_avx2(&x[4], &x[6]);
275 btf_16_adds_subs_avx2(&x[5], &x[7]);
276 btf_16_adds_subs_avx2(&x[8], &x[10]);
277 btf_16_adds_subs_avx2(&x[9], &x[11]);
278 btf_16_adds_subs_avx2(&x[12], &x[14]);
279 btf_16_adds_subs_avx2(&x[13], &x[15]);
280 }
281
static inline void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi,
                                       const __m256i _r, int8_t cos_bit) {
284 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
285 const __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
286 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
287 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
288 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
289 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
290 }
291
static inline void iadst16_stage9_avx2(__m256i *output, __m256i *x1) {
293 const __m256i __zero = _mm256_setzero_si256();
294 output[0] = x1[0];
295 output[1] = _mm256_subs_epi16(__zero, x1[8]);
296 output[2] = x1[12];
297 output[3] = _mm256_subs_epi16(__zero, x1[4]);
298 output[4] = x1[6];
299 output[5] = _mm256_subs_epi16(__zero, x1[14]);
300 output[6] = x1[10];
301 output[7] = _mm256_subs_epi16(__zero, x1[2]);
302 output[8] = x1[3];
303 output[9] = _mm256_subs_epi16(__zero, x1[11]);
304 output[10] = x1[15];
305 output[11] = _mm256_subs_epi16(__zero, x1[7]);
306 output[12] = x1[5];
307 output[13] = _mm256_subs_epi16(__zero, x1[13]);
308 output[14] = x1[9];
309 output[15] = _mm256_subs_epi16(__zero, x1[1]);
310 }
311
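// Full 16-point inverse ADST over 16 columns. Stage 1 is a reordering load,
// stage 2 applies the odd-cosine rotations, and the shared stage 3-9 helpers
// finish the flow, ending with the alternating-sign output permutation.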
static void iadst16_avx2(const __m256i *input, __m256i *output) {
313 const int32_t *cospi = cospi_arr(INV_COS_BIT);
314
315 const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
316
317 __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
318 __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
319 __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
320 __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
321 __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
322 __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
323 __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
324 __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
325 __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
326 __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
327 __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
328 __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
329 __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
330 __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
331 __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
332 __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
333
334 // stage 1
335 __m256i x1[16];
336 x1[0] = input[15];
337 x1[1] = input[0];
338 x1[2] = input[13];
339 x1[3] = input[2];
340 x1[4] = input[11];
341 x1[5] = input[4];
342 x1[6] = input[9];
343 x1[7] = input[6];
344 x1[8] = input[7];
345 x1[9] = input[8];
346 x1[10] = input[5];
347 x1[11] = input[10];
348 x1[12] = input[3];
349 x1[13] = input[12];
350 x1[14] = input[1];
351 x1[15] = input[14];
352
353 // stage 2
354 btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r,
355 INV_COS_BIT);
356 btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r,
357 INV_COS_BIT);
358 btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r,
359 INV_COS_BIT);
360 btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r,
361 INV_COS_BIT);
362 btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r,
363 INV_COS_BIT);
364 btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r,
365 INV_COS_BIT);
366 btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r,
367 INV_COS_BIT);
368 btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r,
369 INV_COS_BIT);
370
371 iadst16_stage3_avx2(x1);
372 iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
373 iadst16_stage5_avx2(x1);
374 iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
375 iadst16_stage7_avx2(x1);
376 iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
377 iadst16_stage9_avx2(output, x1);
378 }
379
static void iadst16_low8_avx2(const __m256i *input, __m256i *output) {
381 const int32_t *cospi = cospi_arr(INV_COS_BIT);
382 const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
383
384 // stage 1
385 __m256i x1[16];
386 x1[1] = input[0];
387 x1[3] = input[2];
388 x1[5] = input[4];
389 x1[7] = input[6];
390 x1[8] = input[7];
391 x1[10] = input[5];
392 x1[12] = input[3];
393 x1[14] = input[1];
394
395 // stage 2
396 btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
397 btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]);
398 btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]);
399 btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]);
400 btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]);
401 btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]);
402 btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]);
  btf_16_w16_0_avx2(cospi[58], cospi[6], x1[14], x1[14], x1[15]);
404
405 iadst16_stage3_avx2(x1);
406 iadst16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
407 iadst16_stage5_avx2(x1);
408 iadst16_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
409 iadst16_stage7_avx2(x1);
410 iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
411 iadst16_stage9_avx2(output, x1);
412 }
413
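// Only input[0] is non-zero: stage 2 produces x1[0]/x1[1] directly, and the
// copy-only "stages" simply fan those values out to the lanes still needed
// by the rotations in stages 4, 6 and 8.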
static void iadst16_low1_avx2(const __m256i *input, __m256i *output) {
415 const int32_t *cospi = cospi_arr(INV_COS_BIT);
416 const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
417
418 const __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
419 const __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
420 const __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
421 const __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
422
423 // stage 1
424 __m256i x1[16];
425 x1[1] = input[0];
426
427 // stage 2
428 btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
429
430 // stage 3
431 x1[8] = x1[0];
432 x1[9] = x1[1];
433
434 // stage 4
435 btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r,
436 INV_COS_BIT);
437
438 // stage 5
439 x1[4] = x1[0];
440 x1[5] = x1[1];
441
442 x1[12] = x1[8];
443 x1[13] = x1[9];
444
445 // stage 6
446 btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r,
447 INV_COS_BIT);
448 btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r,
449 INV_COS_BIT);
450
451 // stage 7
452 x1[2] = x1[0];
453 x1[3] = x1[1];
454 x1[6] = x1[4];
455 x1[7] = x1[5];
456 x1[10] = x1[8];
457 x1[11] = x1[9];
458 x1[14] = x1[12];
459 x1[15] = x1[13];
460
461 iadst16_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
462 iadst16_stage9_avx2(output, x1);
463 }
464
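// idct32 stage helpers. The high16/high24/high28 suffix indicates that a
// helper only updates the top 16/24/28 entries of x[], so the pruned
// idct32_low* paths below can share them with the full idct32_avx2.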
static inline void idct32_high16_stage3_avx2(__m256i *x) {
466 btf_16_adds_subs_avx2(&x[16], &x[17]);
467 btf_16_adds_subs_avx2(&x[19], &x[18]);
468 btf_16_adds_subs_avx2(&x[20], &x[21]);
469 btf_16_adds_subs_avx2(&x[23], &x[22]);
470 btf_16_adds_subs_avx2(&x[24], &x[25]);
471 btf_16_adds_subs_avx2(&x[27], &x[26]);
472 btf_16_adds_subs_avx2(&x[28], &x[29]);
473 btf_16_adds_subs_avx2(&x[31], &x[30]);
474 }
475
static inline void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
478 const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
479 const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
480 const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
481 const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
482 const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
483 const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
484 btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
485 btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
486 btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
487 btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
488 }
489
static inline void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
492 const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
493 const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
494 const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
495 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
496 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
497 btf_16_adds_subs_avx2(&x[16], &x[19]);
498 btf_16_adds_subs_avx2(&x[17], &x[18]);
499 btf_16_adds_subs_avx2(&x[23], &x[20]);
500 btf_16_adds_subs_avx2(&x[22], &x[21]);
501 btf_16_adds_subs_avx2(&x[24], &x[27]);
502 btf_16_adds_subs_avx2(&x[25], &x[26]);
503 btf_16_adds_subs_avx2(&x[31], &x[28]);
504 btf_16_adds_subs_avx2(&x[30], &x[29]);
505 }
506
static inline void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
509 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
510 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
511 const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
512 const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
513 const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
514 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
515 btf_16_adds_subs_avx2(&x[8], &x[11]);
516 btf_16_adds_subs_avx2(&x[9], &x[10]);
517 btf_16_adds_subs_avx2(&x[15], &x[12]);
518 btf_16_adds_subs_avx2(&x[14], &x[13]);
519 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
520 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
521 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
522 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
523 }
524
static inline void idct32_stage7_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
527 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
528 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
529 btf_16_adds_subs_avx2(&x[0], &x[7]);
530 btf_16_adds_subs_avx2(&x[1], &x[6]);
531 btf_16_adds_subs_avx2(&x[2], &x[5]);
532 btf_16_adds_subs_avx2(&x[3], &x[4]);
533 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
534 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
535 btf_16_adds_subs_avx2(&x[16], &x[23]);
536 btf_16_adds_subs_avx2(&x[17], &x[22]);
537 btf_16_adds_subs_avx2(&x[18], &x[21]);
538 btf_16_adds_subs_avx2(&x[19], &x[20]);
539 btf_16_adds_subs_avx2(&x[31], &x[24]);
540 btf_16_adds_subs_avx2(&x[30], &x[25]);
541 btf_16_adds_subs_avx2(&x[29], &x[26]);
542 btf_16_adds_subs_avx2(&x[28], &x[27]);
543 }
544
static inline void idct32_stage8_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
547 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
548 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
549 btf_16_adds_subs_avx2(&x[0], &x[15]);
550 btf_16_adds_subs_avx2(&x[1], &x[14]);
551 btf_16_adds_subs_avx2(&x[2], &x[13]);
552 btf_16_adds_subs_avx2(&x[3], &x[12]);
553 btf_16_adds_subs_avx2(&x[4], &x[11]);
554 btf_16_adds_subs_avx2(&x[5], &x[10]);
555 btf_16_adds_subs_avx2(&x[6], &x[9]);
556 btf_16_adds_subs_avx2(&x[7], &x[8]);
557 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
558 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
559 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
560 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
561 }
562
static inline void idct32_stage9_avx2(__m256i *output, __m256i *x) {
564 btf_16_adds_subs_out_avx2(&output[0], &output[31], x[0], x[31]);
565 btf_16_adds_subs_out_avx2(&output[1], &output[30], x[1], x[30]);
566 btf_16_adds_subs_out_avx2(&output[2], &output[29], x[2], x[29]);
567 btf_16_adds_subs_out_avx2(&output[3], &output[28], x[3], x[28]);
568 btf_16_adds_subs_out_avx2(&output[4], &output[27], x[4], x[27]);
569 btf_16_adds_subs_out_avx2(&output[5], &output[26], x[5], x[26]);
570 btf_16_adds_subs_out_avx2(&output[6], &output[25], x[6], x[25]);
571 btf_16_adds_subs_out_avx2(&output[7], &output[24], x[7], x[24]);
572 btf_16_adds_subs_out_avx2(&output[8], &output[23], x[8], x[23]);
573 btf_16_adds_subs_out_avx2(&output[9], &output[22], x[9], x[22]);
574 btf_16_adds_subs_out_avx2(&output[10], &output[21], x[10], x[21]);
575 btf_16_adds_subs_out_avx2(&output[11], &output[20], x[11], x[20]);
576 btf_16_adds_subs_out_avx2(&output[12], &output[19], x[12], x[19]);
577 btf_16_adds_subs_out_avx2(&output[13], &output[18], x[13], x[18]);
578 btf_16_adds_subs_out_avx2(&output[14], &output[17], x[14], x[17]);
579 btf_16_adds_subs_out_avx2(&output[15], &output[16], x[15], x[16]);
580 }
581
static void idct32_low1_avx2(const __m256i *input, __m256i *output) {
583 const int32_t *cospi = cospi_arr(INV_COS_BIT);
584
585 // stage 1
586 __m256i x[2];
587 x[0] = input[0];
588
589 // stage 2
590 // stage 3
591 // stage 4
592 // stage 5
593 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
594
595 // stage 6
596 // stage 7
597 // stage 8
598 // stage 9
599 output[0] = x[0];
600 output[31] = x[0];
601 output[1] = x[1];
602 output[30] = x[1];
603 output[2] = x[1];
604 output[29] = x[1];
605 output[3] = x[0];
606 output[28] = x[0];
607 output[4] = x[0];
608 output[27] = x[0];
609 output[5] = x[1];
610 output[26] = x[1];
611 output[6] = x[1];
612 output[25] = x[1];
613 output[7] = x[0];
614 output[24] = x[0];
615 output[8] = x[0];
616 output[23] = x[0];
617 output[9] = x[1];
618 output[22] = x[1];
619 output[10] = x[1];
620 output[21] = x[1];
621 output[11] = x[0];
622 output[20] = x[0];
623 output[12] = x[0];
624 output[19] = x[0];
625 output[13] = x[1];
626 output[18] = x[1];
627 output[14] = x[1];
628 output[17] = x[1];
629 output[15] = x[0];
630 output[16] = x[0];
631 }
632
static void idct32_low8_avx2(const __m256i *input, __m256i *output) {
634 const int32_t *cospi = cospi_arr(INV_COS_BIT);
635 const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
636
637 // stage 1
638 __m256i x[32];
639 x[0] = input[0];
640 x[4] = input[4];
641 x[8] = input[2];
642 x[12] = input[6];
643 x[16] = input[1];
644 x[20] = input[5];
645 x[24] = input[3];
646 x[28] = input[7];
647
648 // stage 2
649 btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
650 btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
651 btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
652 btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
653
654 // stage 3
655 btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
656 btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
657 x[17] = x[16];
658 x[18] = x[19];
659 x[21] = x[20];
660 x[22] = x[23];
661 x[25] = x[24];
662 x[26] = x[27];
663 x[29] = x[28];
664 x[30] = x[31];
665
666 // stage 4
667 btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
668 x[9] = x[8];
669 x[10] = x[11];
670 x[13] = x[12];
671 x[14] = x[15];
672 idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT);
673
674 // stage 5
675 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
676 x[5] = x[4];
677 x[6] = x[7];
678 idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT);
679 // stage 6
680 x[3] = x[0];
681 x[2] = x[1];
682 idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT);
683
684 idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT);
685 idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT);
686 idct32_stage9_avx2(output, x);
687 }
688
static void idct32_low16_avx2(const __m256i *input, __m256i *output) {
690 const int32_t *cospi = cospi_arr(INV_COS_BIT);
691 const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
692
693 // stage 1
694 __m256i x[32];
695 x[0] = input[0];
696 x[2] = input[8];
697 x[4] = input[4];
698 x[6] = input[12];
699 x[8] = input[2];
700 x[10] = input[10];
701 x[12] = input[6];
702 x[14] = input[14];
703 x[16] = input[1];
704 x[18] = input[9];
705 x[20] = input[5];
706 x[22] = input[13];
707 x[24] = input[3];
708 x[26] = input[11];
709 x[28] = input[7];
710 x[30] = input[15];
711
712 // stage 2
713 btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
714 btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
715 btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
716 btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
717 btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
718 btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
719 btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
720 btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
721
722 // stage 3
723 btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
724 btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
725 btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
726 btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
727 idct32_high16_stage3_avx2(x);
728
729 // stage 4
730 btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
731 btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
732 btf_16_adds_subs_avx2(&x[8], &x[9]);
733 btf_16_adds_subs_avx2(&x[11], &x[10]);
734 btf_16_adds_subs_avx2(&x[12], &x[13]);
735 btf_16_adds_subs_avx2(&x[15], &x[14]);
736 idct32_high16_stage4_avx2(x, cospi, _r, INV_COS_BIT);
737
738 // stage 5
739 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
740 btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
741 btf_16_adds_subs_avx2(&x[4], &x[5]);
742 btf_16_adds_subs_avx2(&x[7], &x[6]);
743 idct32_high24_stage5_avx2(x, cospi, _r, INV_COS_BIT);
744
745 btf_16_adds_subs_avx2(&x[0], &x[3]);
746 btf_16_adds_subs_avx2(&x[1], &x[2]);
747 idct32_high28_stage6_avx2(x, cospi, _r, INV_COS_BIT);
748
749 idct32_stage7_avx2(x, cospi, _r, INV_COS_BIT);
750 idct32_stage8_avx2(x, cospi, _r, INV_COS_BIT);
751 idct32_stage9_avx2(output, x);
752 }
753
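// Full 32-point inverse DCT over 16 columns. Stage 1 is the bit-reversed
// load; each later stage handles the low-index butterflies inline and
// delegates the upper part to the shared high* helpers above.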
static void idct32_avx2(const __m256i *input, __m256i *output) {
755 const int32_t *cospi = cospi_arr(INV_COS_BIT);
756 const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
757
758 __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
759 __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
760 __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
761 __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
762 __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
763 __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
764 __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
765 __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
766 __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
767 __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
768 __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
769 __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
770 __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
771 __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
772 __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
773 __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
774 __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
775 __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
776 __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
777 __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
778 __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
779 __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
780 __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
781 __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
782 __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
783 __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
784 __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
785 __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
786 __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
787 __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
788 __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
789 __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
790
791 // stage 1
792 __m256i x1[32];
793 x1[0] = input[0];
794 x1[1] = input[16];
795 x1[2] = input[8];
796 x1[3] = input[24];
797 x1[4] = input[4];
798 x1[5] = input[20];
799 x1[6] = input[12];
800 x1[7] = input[28];
801 x1[8] = input[2];
802 x1[9] = input[18];
803 x1[10] = input[10];
804 x1[11] = input[26];
805 x1[12] = input[6];
806 x1[13] = input[22];
807 x1[14] = input[14];
808 x1[15] = input[30];
809 x1[16] = input[1];
810 x1[17] = input[17];
811 x1[18] = input[9];
812 x1[19] = input[25];
813 x1[20] = input[5];
814 x1[21] = input[21];
815 x1[22] = input[13];
816 x1[23] = input[29];
817 x1[24] = input[3];
818 x1[25] = input[19];
819 x1[26] = input[11];
820 x1[27] = input[27];
821 x1[28] = input[7];
822 x1[29] = input[23];
823 x1[30] = input[15];
824 x1[31] = input[31];
825
826 // stage 2
827 btf_16_w16_avx2(cospi_p62_m02, cospi_p02_p62, &x1[16], &x1[31], _r,
828 INV_COS_BIT);
829 btf_16_w16_avx2(cospi_p30_m34, cospi_p34_p30, &x1[17], &x1[30], _r,
830 INV_COS_BIT);
831 btf_16_w16_avx2(cospi_p46_m18, cospi_p18_p46, &x1[18], &x1[29], _r,
832 INV_COS_BIT);
833 btf_16_w16_avx2(cospi_p14_m50, cospi_p50_p14, &x1[19], &x1[28], _r,
834 INV_COS_BIT);
835 btf_16_w16_avx2(cospi_p54_m10, cospi_p10_p54, &x1[20], &x1[27], _r,
836 INV_COS_BIT);
837 btf_16_w16_avx2(cospi_p22_m42, cospi_p42_p22, &x1[21], &x1[26], _r,
838 INV_COS_BIT);
839 btf_16_w16_avx2(cospi_p38_m26, cospi_p26_p38, &x1[22], &x1[25], _r,
840 INV_COS_BIT);
841 btf_16_w16_avx2(cospi_p06_m58, cospi_p58_p06, &x1[23], &x1[24], _r,
842 INV_COS_BIT);
843
844 // stage 3
845 btf_16_w16_avx2(cospi_p60_m04, cospi_p04_p60, &x1[8], &x1[15], _r,
846 INV_COS_BIT);
847 btf_16_w16_avx2(cospi_p28_m36, cospi_p36_p28, &x1[9], &x1[14], _r,
848 INV_COS_BIT);
849 btf_16_w16_avx2(cospi_p44_m20, cospi_p20_p44, &x1[10], &x1[13], _r,
850 INV_COS_BIT);
851 btf_16_w16_avx2(cospi_p12_m52, cospi_p52_p12, &x1[11], &x1[12], _r,
852 INV_COS_BIT);
853 idct32_high16_stage3_avx2(x1);
854
855 // stage 4
856 btf_16_w16_avx2(cospi_p56_m08, cospi_p08_p56, &x1[4], &x1[7], _r,
857 INV_COS_BIT);
858 btf_16_w16_avx2(cospi_p24_m40, cospi_p40_p24, &x1[5], &x1[6], _r,
859 INV_COS_BIT);
860 btf_16_adds_subs_avx2(&x1[8], &x1[9]);
861 btf_16_adds_subs_avx2(&x1[11], &x1[10]);
862 btf_16_adds_subs_avx2(&x1[12], &x1[13]);
863 btf_16_adds_subs_avx2(&x1[15], &x1[14]);
864 idct32_high16_stage4_avx2(x1, cospi, _r, INV_COS_BIT);
865
866 // stage 5
867 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r,
868 INV_COS_BIT);
869 btf_16_w16_avx2(cospi_p48_m16, cospi_p16_p48, &x1[2], &x1[3], _r,
870 INV_COS_BIT);
871 btf_16_adds_subs_avx2(&x1[4], &x1[5]);
872 btf_16_adds_subs_avx2(&x1[7], &x1[6]);
873 idct32_high24_stage5_avx2(x1, cospi, _r, INV_COS_BIT);
874
875 // stage 6
876 btf_16_adds_subs_avx2(&x1[0], &x1[3]);
877 btf_16_adds_subs_avx2(&x1[1], &x1[2]);
878 idct32_high28_stage6_avx2(x1, cospi, _r, INV_COS_BIT);
879
880 idct32_stage7_avx2(x1, cospi, _r, INV_COS_BIT);
881 idct32_stage8_avx2(x1, cospi, _r, INV_COS_BIT);
882 idct32_stage9_avx2(output, x1);
883 }
884
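// idct64 stage helpers, shared by the pruned idct64_low1/low8/low16 paths
// below. As with idct32, the highN suffix names how many of the top entries
// of x[] a helper updates.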
static inline void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
887 (void)cos_bit;
888 const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
889 const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
890 const __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
891 const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
892 const __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
893 const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
894 const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
895 const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
896 const __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
897 const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
898 const __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
899 const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
900 btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
901 btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x[34], &x[61], _r, cos_bit);
902 btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x[37], &x[58], _r, cos_bit);
903 btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
904 btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
905 btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x[42], &x[53], _r, cos_bit);
906 btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x[45], &x[50], _r, cos_bit);
907 btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);
908 }
909
static inline void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
912 (void)cos_bit;
913 const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
914 const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
915 const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
916 const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
917 const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
918 const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
919 btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
920 btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[18], &x[29], _r, cos_bit);
921 btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[21], &x[26], _r, cos_bit);
922 btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
923 btf_16_adds_subs_avx2(&x[32], &x[35]);
924 btf_16_adds_subs_avx2(&x[33], &x[34]);
925 btf_16_adds_subs_avx2(&x[39], &x[36]);
926 btf_16_adds_subs_avx2(&x[38], &x[37]);
927 btf_16_adds_subs_avx2(&x[40], &x[43]);
928 btf_16_adds_subs_avx2(&x[41], &x[42]);
929 btf_16_adds_subs_avx2(&x[47], &x[44]);
930 btf_16_adds_subs_avx2(&x[46], &x[45]);
931 btf_16_adds_subs_avx2(&x[48], &x[51]);
932 btf_16_adds_subs_avx2(&x[49], &x[50]);
933 btf_16_adds_subs_avx2(&x[55], &x[52]);
934 btf_16_adds_subs_avx2(&x[54], &x[53]);
935 btf_16_adds_subs_avx2(&x[56], &x[59]);
936 btf_16_adds_subs_avx2(&x[57], &x[58]);
937 btf_16_adds_subs_avx2(&x[63], &x[60]);
938 btf_16_adds_subs_avx2(&x[62], &x[61]);
939 }
940
static inline void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
943 (void)cos_bit;
944 const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
945 const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
946 const __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
947 const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
948 const __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
949 const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
950 btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[34], &x[61], _r, cos_bit);
951 btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[35], &x[60], _r, cos_bit);
952 btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[36], &x[59], _r, cos_bit);
953 btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x[37], &x[58], _r, cos_bit);
954 btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[42], &x[53], _r, cos_bit);
955 btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x[43], &x[52], _r, cos_bit);
956 btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[44], &x[51], _r, cos_bit);
957 btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[45], &x[50], _r, cos_bit);
958 }
959
static inline void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
962 btf_16_adds_subs_avx2(&x[16], &x[19]);
963 btf_16_adds_subs_avx2(&x[17], &x[18]);
964 btf_16_adds_subs_avx2(&x[23], &x[20]);
965 btf_16_adds_subs_avx2(&x[22], &x[21]);
966 btf_16_adds_subs_avx2(&x[24], &x[27]);
967 btf_16_adds_subs_avx2(&x[25], &x[26]);
968 btf_16_adds_subs_avx2(&x[31], &x[28]);
969 btf_16_adds_subs_avx2(&x[30], &x[29]);
970 idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
971 }
972
static inline void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
975 (void)cos_bit;
976 const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
977 const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
978 const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
979 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[18], &x[29], _r, cos_bit);
980 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[19], &x[28], _r, cos_bit);
981 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[20], &x[27], _r, cos_bit);
982 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[21], &x[26], _r, cos_bit);
983 btf_16_adds_subs_avx2(&x[32], &x[39]);
984 btf_16_adds_subs_avx2(&x[33], &x[38]);
985 btf_16_adds_subs_avx2(&x[34], &x[37]);
986 btf_16_adds_subs_avx2(&x[35], &x[36]);
987 btf_16_adds_subs_avx2(&x[47], &x[40]);
988 btf_16_adds_subs_avx2(&x[46], &x[41]);
989 btf_16_adds_subs_avx2(&x[45], &x[42]);
990 btf_16_adds_subs_avx2(&x[44], &x[43]);
991 btf_16_adds_subs_avx2(&x[48], &x[55]);
992 btf_16_adds_subs_avx2(&x[49], &x[54]);
993 btf_16_adds_subs_avx2(&x[50], &x[53]);
994 btf_16_adds_subs_avx2(&x[51], &x[52]);
995 btf_16_adds_subs_avx2(&x[63], &x[56]);
996 btf_16_adds_subs_avx2(&x[62], &x[57]);
997 btf_16_adds_subs_avx2(&x[61], &x[58]);
998 btf_16_adds_subs_avx2(&x[60], &x[59]);
999 }
1000
static inline void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi,
                                             const __m256i _r, int8_t cos_bit) {
1003 (void)cos_bit;
1004 const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
1005 const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
1006 const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
1007 btf_16_adds_subs_avx2(&x[16], &x[23]);
1008 btf_16_adds_subs_avx2(&x[17], &x[22]);
1009 btf_16_adds_subs_avx2(&x[18], &x[21]);
1010 btf_16_adds_subs_avx2(&x[19], &x[20]);
1011 btf_16_adds_subs_avx2(&x[31], &x[24]);
1012 btf_16_adds_subs_avx2(&x[30], &x[25]);
1013 btf_16_adds_subs_avx2(&x[29], &x[26]);
1014 btf_16_adds_subs_avx2(&x[28], &x[27]);
1015 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[36], &x[59], _r, cos_bit);
1016 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[37], &x[58], _r, cos_bit);
1017 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[38], &x[57], _r, cos_bit);
1018 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[39], &x[56], _r, cos_bit);
1019 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[40], &x[55], _r, cos_bit);
1020 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[41], &x[54], _r, cos_bit);
1021 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[42], &x[53], _r, cos_bit);
1022 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[43], &x[52], _r, cos_bit);
1023 }
1024
static inline void idct64_stage9_avx2(__m256i *x, const int32_t *cospi,
                                      const __m256i _r, int8_t cos_bit) {
1027 (void)cos_bit;
1028 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
1029 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
1030 btf_16_adds_subs_avx2(&x[0], &x[15]);
1031 btf_16_adds_subs_avx2(&x[1], &x[14]);
1032 btf_16_adds_subs_avx2(&x[2], &x[13]);
1033 btf_16_adds_subs_avx2(&x[3], &x[12]);
1034 btf_16_adds_subs_avx2(&x[4], &x[11]);
1035 btf_16_adds_subs_avx2(&x[5], &x[10]);
1036 btf_16_adds_subs_avx2(&x[6], &x[9]);
1037 btf_16_adds_subs_avx2(&x[7], &x[8]);
1038 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[20], &x[27], _r, cos_bit);
1039 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[21], &x[26], _r, cos_bit);
1040 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[22], &x[25], _r, cos_bit);
1041 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[23], &x[24], _r, cos_bit);
1042 btf_16_adds_subs_avx2(&x[32], &x[47]);
1043 btf_16_adds_subs_avx2(&x[33], &x[46]);
1044 btf_16_adds_subs_avx2(&x[34], &x[45]);
1045 btf_16_adds_subs_avx2(&x[35], &x[44]);
1046 btf_16_adds_subs_avx2(&x[36], &x[43]);
1047 btf_16_adds_subs_avx2(&x[37], &x[42]);
1048 btf_16_adds_subs_avx2(&x[38], &x[41]);
1049 btf_16_adds_subs_avx2(&x[39], &x[40]);
1050 btf_16_adds_subs_avx2(&x[63], &x[48]);
1051 btf_16_adds_subs_avx2(&x[62], &x[49]);
1052 btf_16_adds_subs_avx2(&x[61], &x[50]);
1053 btf_16_adds_subs_avx2(&x[60], &x[51]);
1054 btf_16_adds_subs_avx2(&x[59], &x[52]);
1055 btf_16_adds_subs_avx2(&x[58], &x[53]);
1056 btf_16_adds_subs_avx2(&x[57], &x[54]);
1057 btf_16_adds_subs_avx2(&x[56], &x[55]);
1058 }
1059
static inline void idct64_stage10_avx2(__m256i *x, const int32_t *cospi,
                                       const __m256i _r, int8_t cos_bit) {
1062 (void)cos_bit;
1063 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
1064 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
1065 btf_16_adds_subs_avx2(&x[0], &x[31]);
1066 btf_16_adds_subs_avx2(&x[1], &x[30]);
1067 btf_16_adds_subs_avx2(&x[2], &x[29]);
1068 btf_16_adds_subs_avx2(&x[3], &x[28]);
1069 btf_16_adds_subs_avx2(&x[4], &x[27]);
1070 btf_16_adds_subs_avx2(&x[5], &x[26]);
1071 btf_16_adds_subs_avx2(&x[6], &x[25]);
1072 btf_16_adds_subs_avx2(&x[7], &x[24]);
1073 btf_16_adds_subs_avx2(&x[8], &x[23]);
1074 btf_16_adds_subs_avx2(&x[9], &x[22]);
1075 btf_16_adds_subs_avx2(&x[10], &x[21]);
1076 btf_16_adds_subs_avx2(&x[11], &x[20]);
1077 btf_16_adds_subs_avx2(&x[12], &x[19]);
1078 btf_16_adds_subs_avx2(&x[13], &x[18]);
1079 btf_16_adds_subs_avx2(&x[14], &x[17]);
1080 btf_16_adds_subs_avx2(&x[15], &x[16]);
1081 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[40], &x[55], _r, cos_bit);
1082 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[41], &x[54], _r, cos_bit);
1083 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[42], &x[53], _r, cos_bit);
1084 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[43], &x[52], _r, cos_bit);
1085 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[44], &x[51], _r, cos_bit);
1086 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[45], &x[50], _r, cos_bit);
1087 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[46], &x[49], _r, cos_bit);
1088 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[47], &x[48], _r, cos_bit);
1089 }
1090
static inline void idct64_stage11_avx2(__m256i *output, __m256i *x) {
1092 btf_16_adds_subs_out_avx2(&output[0], &output[63], x[0], x[63]);
1093 btf_16_adds_subs_out_avx2(&output[1], &output[62], x[1], x[62]);
1094 btf_16_adds_subs_out_avx2(&output[2], &output[61], x[2], x[61]);
1095 btf_16_adds_subs_out_avx2(&output[3], &output[60], x[3], x[60]);
1096 btf_16_adds_subs_out_avx2(&output[4], &output[59], x[4], x[59]);
1097 btf_16_adds_subs_out_avx2(&output[5], &output[58], x[5], x[58]);
1098 btf_16_adds_subs_out_avx2(&output[6], &output[57], x[6], x[57]);
1099 btf_16_adds_subs_out_avx2(&output[7], &output[56], x[7], x[56]);
1100 btf_16_adds_subs_out_avx2(&output[8], &output[55], x[8], x[55]);
1101 btf_16_adds_subs_out_avx2(&output[9], &output[54], x[9], x[54]);
1102 btf_16_adds_subs_out_avx2(&output[10], &output[53], x[10], x[53]);
1103 btf_16_adds_subs_out_avx2(&output[11], &output[52], x[11], x[52]);
1104 btf_16_adds_subs_out_avx2(&output[12], &output[51], x[12], x[51]);
1105 btf_16_adds_subs_out_avx2(&output[13], &output[50], x[13], x[50]);
1106 btf_16_adds_subs_out_avx2(&output[14], &output[49], x[14], x[49]);
1107 btf_16_adds_subs_out_avx2(&output[15], &output[48], x[15], x[48]);
1108 btf_16_adds_subs_out_avx2(&output[16], &output[47], x[16], x[47]);
1109 btf_16_adds_subs_out_avx2(&output[17], &output[46], x[17], x[46]);
1110 btf_16_adds_subs_out_avx2(&output[18], &output[45], x[18], x[45]);
1111 btf_16_adds_subs_out_avx2(&output[19], &output[44], x[19], x[44]);
1112 btf_16_adds_subs_out_avx2(&output[20], &output[43], x[20], x[43]);
1113 btf_16_adds_subs_out_avx2(&output[21], &output[42], x[21], x[42]);
1114 btf_16_adds_subs_out_avx2(&output[22], &output[41], x[22], x[41]);
1115 btf_16_adds_subs_out_avx2(&output[23], &output[40], x[23], x[40]);
1116 btf_16_adds_subs_out_avx2(&output[24], &output[39], x[24], x[39]);
1117 btf_16_adds_subs_out_avx2(&output[25], &output[38], x[25], x[38]);
1118 btf_16_adds_subs_out_avx2(&output[26], &output[37], x[26], x[37]);
1119 btf_16_adds_subs_out_avx2(&output[27], &output[36], x[27], x[36]);
1120 btf_16_adds_subs_out_avx2(&output[28], &output[35], x[28], x[35]);
1121 btf_16_adds_subs_out_avx2(&output[29], &output[34], x[29], x[34]);
1122 btf_16_adds_subs_out_avx2(&output[30], &output[33], x[30], x[33]);
1123 btf_16_adds_subs_out_avx2(&output[31], &output[32], x[31], x[32]);
1124 }
1125
static void idct64_low1_avx2(const __m256i *input, __m256i *output) {
1127 const int32_t *cospi = cospi_arr(INV_COS_BIT);
1128
1129 // stage 1
1130 __m256i x[32];
1131 x[0] = input[0];
1132
1133 // stage 2
1134 // stage 3
1135 // stage 4
1136 // stage 5
1137 // stage 6
1138 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
1139
1140 // stage 7
1141 // stage 8
1142 // stage 9
1143 // stage 10
1144 // stage 11
1145 output[0] = x[0];
1146 output[63] = x[0];
1147 output[1] = x[1];
1148 output[62] = x[1];
1149 output[2] = x[1];
1150 output[61] = x[1];
1151 output[3] = x[0];
1152 output[60] = x[0];
1153 output[4] = x[0];
1154 output[59] = x[0];
1155 output[5] = x[1];
1156 output[58] = x[1];
1157 output[6] = x[1];
1158 output[57] = x[1];
1159 output[7] = x[0];
1160 output[56] = x[0];
1161 output[8] = x[0];
1162 output[55] = x[0];
1163 output[9] = x[1];
1164 output[54] = x[1];
1165 output[10] = x[1];
1166 output[53] = x[1];
1167 output[11] = x[0];
1168 output[52] = x[0];
1169 output[12] = x[0];
1170 output[51] = x[0];
1171 output[13] = x[1];
1172 output[50] = x[1];
1173 output[14] = x[1];
1174 output[49] = x[1];
1175 output[15] = x[0];
1176 output[48] = x[0];
1177 output[16] = x[0];
1178 output[47] = x[0];
1179 output[17] = x[1];
1180 output[46] = x[1];
1181 output[18] = x[1];
1182 output[45] = x[1];
1183 output[19] = x[0];
1184 output[44] = x[0];
1185 output[20] = x[0];
1186 output[43] = x[0];
1187 output[21] = x[1];
1188 output[42] = x[1];
1189 output[22] = x[1];
1190 output[41] = x[1];
1191 output[23] = x[0];
1192 output[40] = x[0];
1193 output[24] = x[0];
1194 output[39] = x[0];
1195 output[25] = x[1];
1196 output[38] = x[1];
1197 output[26] = x[1];
1198 output[37] = x[1];
1199 output[27] = x[0];
1200 output[36] = x[0];
1201 output[28] = x[0];
1202 output[35] = x[0];
1203 output[29] = x[1];
1204 output[34] = x[1];
1205 output[30] = x[1];
1206 output[33] = x[1];
1207 output[31] = x[0];
1208 output[32] = x[0];
1209 }
1210
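// Only the first 8 input vectors are non-zero, so the early stages replace
// most butterflies with single-input multiplies and duplicated copies
// (x[33] = x[32], ...) before the shared stage helpers finish the transform.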
static void idct64_low8_avx2(const __m256i *input, __m256i *output) {
1212 const int32_t *cospi = cospi_arr(INV_COS_BIT);
1213 const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
1214 const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
1215 const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
1216 const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
1217 const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
1218 const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
1219 const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
1220 const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
1221 const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
1222 const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
1223 const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
1224 const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
1225 const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
1226 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
1227 const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
1228 const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
1229 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
1230
1231 // stage 1
1232 __m256i x[64];
1233 x[0] = input[0];
1234 x[8] = input[4];
1235 x[16] = input[2];
1236 x[24] = input[6];
1237 x[32] = input[1];
1238 x[40] = input[5];
1239 x[48] = input[3];
1240 x[56] = input[7];
1241
1242 // stage 2
1243 btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
1244 btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
1245 btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
1246 btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
1247
1248 // stage 3
1249 btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
1250 btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
1251 x[33] = x[32];
1252 x[38] = x[39];
1253 x[41] = x[40];
1254 x[46] = x[47];
1255 x[49] = x[48];
1256 x[54] = x[55];
1257 x[57] = x[56];
1258 x[62] = x[63];
1259
1260 // stage 4
1261 btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
1262 x[17] = x[16];
1263 x[22] = x[23];
1264 x[25] = x[24];
1265 x[30] = x[31];
1266 btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r,
1267 INV_COS_BIT);
1268 btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r,
1269 INV_COS_BIT);
1270 btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r,
1271 INV_COS_BIT);
1272 btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r,
1273 INV_COS_BIT);
1274
1275 // stage 5
1276 x[9] = x[8];
1277 x[14] = x[15];
1278 btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r,
1279 INV_COS_BIT);
1280 btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r,
1281 INV_COS_BIT);
1282 x[35] = x[32];
1283 x[34] = x[33];
1284 x[36] = x[39];
1285 x[37] = x[38];
1286 x[43] = x[40];
1287 x[42] = x[41];
1288 x[44] = x[47];
1289 x[45] = x[46];
1290 x[51] = x[48];
1291 x[50] = x[49];
1292 x[52] = x[55];
1293 x[53] = x[54];
1294 x[59] = x[56];
1295 x[58] = x[57];
1296 x[60] = x[63];
1297 x[61] = x[62];
1298
1299 // stage 6
1300 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
1301 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
1302 x[19] = x[16];
1303 x[18] = x[17];
1304 x[20] = x[23];
1305 x[21] = x[22];
1306 x[27] = x[24];
1307 x[26] = x[25];
1308 x[28] = x[31];
1309 x[29] = x[30];
1310 idct64_stage6_high32_avx2(x, cospi, _r, INV_COS_BIT);
1311
1312 // stage 7
1313 x[3] = x[0];
1314 x[2] = x[1];
1315 x[11] = x[8];
1316 x[10] = x[9];
1317 x[12] = x[15];
1318 x[13] = x[14];
1319 idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);
1320
1321 // stage 8
1322 x[7] = x[0];
1323 x[6] = x[1];
1324 x[5] = x[2];
1325 x[4] = x[3];
1326 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
1327 INV_COS_BIT);
1328 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
1329 INV_COS_BIT);
1330 idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);
1331
1332 idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
1333 idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
1334 idct64_stage11_avx2(output, x);
1335 }
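
// In the reduced "low8" path above, each btf_16_w16_0_avx2(w0, w1, in, out0,
// out1) call handles a butterfly whose second input is known to be zero, so
// it degenerates into two scaled copies of the surviving coefficient. A
// scalar sketch of the effective per-lane arithmetic follows, assuming the
// usual round-to-nearest convention with INV_COS_BIT fractional bits; the
// helper name is hypothetical and the real macro (defined in a shared header)
// additionally saturates its results to 16 bits.
static inline void half_btf_0_sketch(int32_t w0, int32_t w1, int16_t in,
                                     int16_t *out0, int16_t *out1) {
  const int32_t r = 1 << (INV_COS_BIT - 1);  // round-to-nearest offset
  *out0 = (int16_t)((w0 * in + r) >> INV_COS_BIT);
  *out1 = (int16_t)((w1 * in + r) >> INV_COS_BIT);
}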
1336
1337 static void idct64_low16_avx2(const __m256i *input, __m256i *output) {
1338 const int32_t *cospi = cospi_arr(INV_COS_BIT);
1339 const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
1340
1341 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
1342 const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
1343 const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
1344 const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
1345 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
1346
1347 // stage 1
1348 __m256i x[64];
1349 x[0] = input[0];
1350 x[4] = input[8];
1351 x[8] = input[4];
1352 x[12] = input[12];
1353 x[16] = input[2];
1354 x[20] = input[10];
1355 x[24] = input[6];
1356 x[28] = input[14];
1357 x[32] = input[1];
1358 x[36] = input[9];
1359 x[40] = input[5];
1360 x[44] = input[13];
1361 x[48] = input[3];
1362 x[52] = input[11];
1363 x[56] = input[7];
1364 x[60] = input[15];
1365
1366 // stage 2
1367 btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
1368 btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
1369 btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
1370 btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
1371 btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
1372 btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
1373 btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
1374 btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
1375
1376 // stage 3
1377 btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
1378 btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
1379 btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
1380 btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
1381 x[33] = x[32];
1382 x[34] = x[35];
1383 x[37] = x[36];
1384 x[38] = x[39];
1385 x[41] = x[40];
1386 x[42] = x[43];
1387 x[45] = x[44];
1388 x[46] = x[47];
1389 x[49] = x[48];
1390 x[50] = x[51];
1391 x[53] = x[52];
1392 x[54] = x[55];
1393 x[57] = x[56];
1394 x[58] = x[59];
1395 x[61] = x[60];
1396 x[62] = x[63];
1397
1398 // stage 4
1399 btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
1400 btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
1401 x[17] = x[16];
1402 x[18] = x[19];
1403 x[21] = x[20];
1404 x[22] = x[23];
1405 x[25] = x[24];
1406 x[26] = x[27];
1407 x[29] = x[28];
1408 x[30] = x[31];
1409 idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT);
1410
1411 // stage 5
1412 btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
1413 x[9] = x[8];
1414 x[10] = x[11];
1415 x[13] = x[12];
1416 x[14] = x[15];
1417 idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT);
1418
1419 // stage 6
1420 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
1421 x[5] = x[4];
1422 x[6] = x[7];
1423 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
1424 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r,
1425 INV_COS_BIT);
1426 idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT);
1427
1428 // stage 7
1429 x[3] = x[0];
1430 x[2] = x[1];
1431 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT);
1432 btf_16_adds_subs_avx2(&x[8], &x[11]);
1433 btf_16_adds_subs_avx2(&x[9], &x[10]);
1434 btf_16_adds_subs_avx2(&x[15], &x[12]);
1435 btf_16_adds_subs_avx2(&x[14], &x[13]);
1436 idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);
1437
1438 // stage 8
1439 btf_16_adds_subs_avx2(&x[0], &x[7]);
1440 btf_16_adds_subs_avx2(&x[1], &x[6]);
1441 btf_16_adds_subs_avx2(&x[2], &x[5]);
1442 btf_16_adds_subs_avx2(&x[3], &x[4]);
1443 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
1444 INV_COS_BIT);
1445 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
1446 INV_COS_BIT);
1447 idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);
1448
1449 idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
1450 idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
1451 idct64_stage11_avx2(output, x);
1452 }
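
// Where both butterfly inputs can be nonzero, btf_16_w16_avx2(w0, w1, &a, &b,
// _r, bit) applies a full rotation: with w0 = pair_set_w16_epi16(wa, wb) and
// w1 = pair_set_w16_epi16(wc, wd), each lane pair (a, b) is replaced by
// (wa*a + wb*b) and (wc*a + wd*b), rounded by _r and shifted down by bit. A
// scalar sketch of that per-lane arithmetic is below (hypothetical helper
// name; the SIMD version packs its 32-bit intermediates back to 16 bits with
// saturation):
static inline void full_btf_sketch(int32_t wa, int32_t wb, int32_t wc,
                                   int32_t wd, int16_t *a, int16_t *b,
                                   int bit) {
  const int32_t r = 1 << (bit - 1);
  const int32_t t0 = (wa * *a + wb * *b + r) >> bit;
  const int32_t t1 = (wc * *a + wd * *b + r) >> bit;
  *a = (int16_t)t0;
  *b = (int16_t)t1;
}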
1453
1454 static void idct64_low32_avx2(const __m256i *input, __m256i *output) {
1455 const int32_t *cospi = cospi_arr(INV_COS_BIT);
1456 const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
1457
1458 const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
1459 const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
1460 const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
1461 const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
1462 const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
1463
1464 // stage 1
1465 __m256i x[64];
1466 x[0] = input[0];
1467 x[2] = input[16];
1468 x[4] = input[8];
1469 x[6] = input[24];
1470 x[8] = input[4];
1471 x[10] = input[20];
1472 x[12] = input[12];
1473 x[14] = input[28];
1474 x[16] = input[2];
1475 x[18] = input[18];
1476 x[20] = input[10];
1477 x[22] = input[26];
1478 x[24] = input[6];
1479 x[26] = input[22];
1480 x[28] = input[14];
1481 x[30] = input[30];
1482 x[32] = input[1];
1483 x[34] = input[17];
1484 x[36] = input[9];
1485 x[38] = input[25];
1486 x[40] = input[5];
1487 x[42] = input[21];
1488 x[44] = input[13];
1489 x[46] = input[29];
1490 x[48] = input[3];
1491 x[50] = input[19];
1492 x[52] = input[11];
1493 x[54] = input[27];
1494 x[56] = input[7];
1495 x[58] = input[23];
1496 x[60] = input[15];
1497 x[62] = input[31];
1498
1499 // stage 2
1500 btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
1501 btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]);
1502 btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]);
1503 btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
1504 btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
1505 btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]);
1506 btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]);
1507 btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
1508 btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
1509 btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]);
1510 btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]);
1511 btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
1512 btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
1513 btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]);
1514 btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]);
1515 btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
1516
1517 // stage 3
1518 btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
1519 btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
1520 btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
1521 btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
1522 btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
1523 btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
1524 btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
1525 btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
1526 btf_16_adds_subs_avx2(&x[32], &x[33]);
1527 btf_16_adds_subs_avx2(&x[35], &x[34]);
1528 btf_16_adds_subs_avx2(&x[36], &x[37]);
1529 btf_16_adds_subs_avx2(&x[39], &x[38]);
1530 btf_16_adds_subs_avx2(&x[40], &x[41]);
1531 btf_16_adds_subs_avx2(&x[43], &x[42]);
1532 btf_16_adds_subs_avx2(&x[44], &x[45]);
1533 btf_16_adds_subs_avx2(&x[47], &x[46]);
1534 btf_16_adds_subs_avx2(&x[48], &x[49]);
1535 btf_16_adds_subs_avx2(&x[51], &x[50]);
1536 btf_16_adds_subs_avx2(&x[52], &x[53]);
1537 btf_16_adds_subs_avx2(&x[55], &x[54]);
1538 btf_16_adds_subs_avx2(&x[56], &x[57]);
1539 btf_16_adds_subs_avx2(&x[59], &x[58]);
1540 btf_16_adds_subs_avx2(&x[60], &x[61]);
1541 btf_16_adds_subs_avx2(&x[63], &x[62]);
1542
1543 // stage 4
1544 btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
1545 btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
1546 btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
1547 btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
1548 btf_16_adds_subs_avx2(&x[16], &x[17]);
1549 btf_16_adds_subs_avx2(&x[19], &x[18]);
1550 btf_16_adds_subs_avx2(&x[20], &x[21]);
1551 btf_16_adds_subs_avx2(&x[23], &x[22]);
1552 btf_16_adds_subs_avx2(&x[24], &x[25]);
1553 btf_16_adds_subs_avx2(&x[27], &x[26]);
1554 btf_16_adds_subs_avx2(&x[28], &x[29]);
1555 btf_16_adds_subs_avx2(&x[31], &x[30]);
1556 idct64_stage4_high32_avx2(x, cospi, _r, INV_COS_BIT);
1557
1558 // stage 5
1559 btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
1560 btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
1561 btf_16_adds_subs_avx2(&x[8], &x[9]);
1562 btf_16_adds_subs_avx2(&x[11], &x[10]);
1563 btf_16_adds_subs_avx2(&x[12], &x[13]);
1564 btf_16_adds_subs_avx2(&x[15], &x[14]);
1565 idct64_stage5_high48_avx2(x, cospi, _r, INV_COS_BIT);
1566
1567 // stage 6
1568 btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
1569 btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
1570 btf_16_adds_subs_avx2(&x[4], &x[5]);
1571 btf_16_adds_subs_avx2(&x[7], &x[6]);
1572 btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, INV_COS_BIT);
1573 btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r,
1574 INV_COS_BIT);
1575 idct64_stage6_high48_avx2(x, cospi, _r, INV_COS_BIT);
1576
1577 // stage 7
1578 btf_16_adds_subs_avx2(&x[0], &x[3]);
1579 btf_16_adds_subs_avx2(&x[1], &x[2]);
1580 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, INV_COS_BIT);
1581 btf_16_adds_subs_avx2(&x[8], &x[11]);
1582 btf_16_adds_subs_avx2(&x[9], &x[10]);
1583 btf_16_adds_subs_avx2(&x[15], &x[12]);
1584 btf_16_adds_subs_avx2(&x[14], &x[13]);
1585 idct64_stage7_high48_avx2(x, cospi, _r, INV_COS_BIT);
1586
1587 // stage 8
1588 btf_16_adds_subs_avx2(&x[0], &x[7]);
1589 btf_16_adds_subs_avx2(&x[1], &x[6]);
1590 btf_16_adds_subs_avx2(&x[2], &x[5]);
1591 btf_16_adds_subs_avx2(&x[3], &x[4]);
1592 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r,
1593 INV_COS_BIT);
1594 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r,
1595 INV_COS_BIT);
1596 idct64_stage8_high48_avx2(x, cospi, _r, INV_COS_BIT);
1597
1598 // stage 9~11
1599 idct64_stage9_avx2(x, cospi, _r, INV_COS_BIT);
1600 idct64_stage10_avx2(x, cospi, _r, INV_COS_BIT);
1601 idct64_stage11_avx2(output, x);
1602 }
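
// The later stages above are built almost entirely from
// btf_16_adds_subs_avx2(&a, &b), the plain add/sub butterfly. Per lane it
// amounts to the sketch below, assuming the helper is the saturating add/sub
// its name suggests (hypothetical helper name):
static inline void adds_subs_sketch(int16_t *a, int16_t *b) {
  const int32_t sum = (int32_t)*a + *b;
  const int32_t dif = (int32_t)*a - *b;
  *a = (int16_t)(sum > 32767 ? 32767 : (sum < -32768 ? -32768 : sum));
  *b = (int16_t)(dif > 32767 ? 32767 : (dif < -32768 ? -32768 : dif));
}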
1603
1604 typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output);
1605
1606 // Each 1D function processes 16 pixels at a time.
1607 static const transform_1d_avx2
1608 lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = {
1609 {
1610 { NULL, NULL, NULL, NULL },
1611 { NULL, NULL, NULL, NULL },
1612 { NULL, NULL, NULL, NULL },
1613 },
1614 { { NULL, NULL, NULL, NULL },
1615 { NULL, NULL, NULL, NULL },
1616 { NULL, NULL, NULL, NULL } },
1617 {
1618 { idct16_low1_avx2, idct16_low8_avx2, idct16_avx2, NULL },
1619 { iadst16_low1_avx2, iadst16_low8_avx2, iadst16_avx2, NULL },
1620 { NULL, NULL, NULL, NULL },
1621 },
1622 { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
1623 { NULL, NULL, NULL, NULL },
1624 { NULL, NULL, NULL, NULL } },
1625 { { idct64_low1_avx2, idct64_low8_avx2, idct64_low16_avx2,
1626 idct64_low32_avx2 },
1627 { NULL, NULL, NULL, NULL },
1628 { NULL, NULL, NULL, NULL } }
1629 };
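
// Lookup sketch: the table is indexed as [tx-size index][1D transform type]
// [eob class]. The 2D functions later in this file consume it as shown below
// (the eob-to-class mapping itself lives in lowbd_txfm_all_1d_zeros_idx and
// is not repeated here):
//   const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
//   const transform_1d_avx2 row_txfm =
//       lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
//   row_txfm(buf0, buf0);  // e.g. idct32_low8_avx2 when eobx is small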
1630
1631 // Only handles transforms with width >= 16 and height >= 16.
1632 static inline void lowbd_inv_txfm2d_add_no_identity_avx2(
1633 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
1634 TX_SIZE tx_size, int eob) {
1635 __m256i buf1[64 * 16];
1636 int eobx, eoby;
1637 get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
1638 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
1639 const int txw_idx = get_txw_idx(tx_size);
1640 const int txh_idx = get_txh_idx(tx_size);
1641 const int txfm_size_col = tx_size_wide[tx_size];
1642 const int txfm_size_row = tx_size_high[tx_size];
1643 const int buf_size_w_div16 = txfm_size_col >> 4;
1644 const int buf_size_nonzero_w = ((eobx + 16) >> 4) << 4;
1645 const int buf_size_nonzero_h_div16 = (eoby + 16) >> 4;
1646 const int input_stride = AOMMIN(32, txfm_size_row);
1647 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
1648
1649 const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
1650 const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
1651 const transform_1d_avx2 row_txfm =
1652 lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
1653 const transform_1d_avx2 col_txfm =
1654 lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
1655
1656 assert(col_txfm != NULL);
1657 assert(row_txfm != NULL);
1658 int ud_flip, lr_flip;
1659 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
1660 const __m256i scale0 = _mm256_set1_epi16(1 << (15 + shift[0]));
1661 for (int i = 0; i < buf_size_nonzero_h_div16; i++) {
1662 __m256i buf0[64];
1663 load_buffer_32bit_to_16bit_w16_avx2(input + 16 * i, input_stride, buf0,
1664 buf_size_nonzero_w);
1665 if (rect_type == 1 || rect_type == -1) {
1666 round_shift_avx2(buf0, buf0, buf_size_nonzero_w); // rect special code
1667 }
1668 row_txfm(buf0, buf0);
1669 for (int j = 0; j < txfm_size_col; ++j) {
1670 buf0[j] = _mm256_mulhrs_epi16(buf0[j], scale0);
1671 }
1672
1673 __m256i *buf1_cur = buf1 + (i << 4);
1674 if (lr_flip) {
1675 for (int j = 0; j < buf_size_w_div16; ++j) {
1676 __m256i temp[16];
1677 flip_buf_avx2(buf0 + 16 * j, temp, 16);
1678 int offset = txfm_size_row * (buf_size_w_div16 - 1 - j);
1679 transpose_16bit_16x16_avx2(temp, buf1_cur + offset);
1680 }
1681 } else {
1682 for (int j = 0; j < buf_size_w_div16; ++j) {
1683 transpose_16bit_16x16_avx2(buf0 + 16 * j, buf1_cur + txfm_size_row * j);
1684 }
1685 }
1686 }
1687 const __m256i scale1 = _mm256_set1_epi16(1 << (15 + shift[1]));
1688 for (int i = 0; i < buf_size_w_div16; i++) {
1689 __m256i *buf1_cur = buf1 + i * txfm_size_row;
1690 col_txfm(buf1_cur, buf1_cur);
1691 for (int j = 0; j < txfm_size_row; ++j) {
1692 buf1_cur[j] = _mm256_mulhrs_epi16(buf1_cur[j], scale1);
1693 }
1694 }
1695 for (int i = 0; i < buf_size_w_div16; i++) {
1696 lowbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row, output + 16 * i,
1697 stride, ud_flip, txfm_size_row);
1698 }
1699 }
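
// The scale0/scale1 constants above rely on a mulhrs idiom: multiplying by
// 1 << (15 + shift) (shift is negative here) and taking the rounded high half
// with _mm256_mulhrs_epi16 is equivalent to a rounded right shift by -shift.
// A scalar sketch of the per-lane effect (hypothetical helper name; the SIMD
// instruction also saturates the 0x8000 corner case):
static inline int16_t mulhrs_round_shift_sketch(int16_t x, int shift) {
  assert(shift < 0);
  return (int16_t)((x + (1 << (-shift - 1))) >> -shift);
}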
1700
1701 static inline void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input,
1702 int stride, int shift, int height,
1703 int txw_idx, int rect_type) {
1704 const int32_t *input_row = input;
1705 const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]);
1706 const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) +
1707 (1 << (NewSqrt2Bits - shift - 1)));
1708 const __m256i one = _mm256_set1_epi16(1);
1709 const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r);
1710 if (rect_type != 1 && rect_type != -1) {
1711 for (int i = 0; i < height; ++i) {
1712 const __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
1713 input_row += stride;
1714 __m256i lo = _mm256_unpacklo_epi16(src, one);
1715 __m256i hi = _mm256_unpackhi_epi16(src, one);
1716 lo = _mm256_madd_epi16(lo, scale__r);
1717 hi = _mm256_madd_epi16(hi, scale__r);
1718 lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
1719 hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
1720 out[i] = _mm256_packs_epi32(lo, hi);
1721 }
1722 } else {
1723 const __m256i rect_scale =
1724 _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
1725 for (int i = 0; i < height; ++i) {
1726 __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
1727 src = _mm256_mulhrs_epi16(src, rect_scale);
1728 input_row += stride;
1729 __m256i lo = _mm256_unpacklo_epi16(src, one);
1730 __m256i hi = _mm256_unpackhi_epi16(src, one);
1731 lo = _mm256_madd_epi16(lo, scale__r);
1732 hi = _mm256_madd_epi16(hi, scale__r);
1733 lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
1734 hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
1735 out[i] = _mm256_packs_epi32(lo, hi);
1736 }
1737 }
1738 }
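
// The unpack-with-one trick above folds the scale multiply and the rounding
// offset into a single _mm256_madd_epi16: interleaving (src, 1) with
// (scale, _r) makes every 32-bit product equal src * scale + _r. In the
// rectangular branch the source is first scaled by NewInvSqrt2 (roughly
// 1/sqrt(2)) with the same mulhrs idiom. A scalar sketch of the per-
// coefficient row arithmetic follows (hypothetical helper name; packs_epi32
// in the SIMD code saturates the result to 16 bits):
static inline int16_t iidentity_row_sketch(int16_t src, int16_t scale,
                                           int shift) {
  const int32_t r =
      (1 << (NewSqrt2Bits - 1)) + (1 << (NewSqrt2Bits - shift - 1));
  return (int16_t)((src * scale + r) >> (NewSqrt2Bits - shift));
}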
1739
1740 static inline void iidentity_col_16xn_avx2(uint8_t *output, int stride,
1741 __m256i *buf, int shift, int height,
1742 int txh_idx) {
1743 const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txh_idx]);
1744 const __m256i scale__r = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1));
1745 const __m256i shift__r = _mm256_set1_epi32(1 << (-shift - 1));
1746 const __m256i one = _mm256_set1_epi16(1);
1747 const __m256i scale_coeff = _mm256_unpacklo_epi16(scale, scale__r);
1748 for (int h = 0; h < height; ++h) {
1749 __m256i lo = _mm256_unpacklo_epi16(buf[h], one);
1750 __m256i hi = _mm256_unpackhi_epi16(buf[h], one);
1751 lo = _mm256_madd_epi16(lo, scale_coeff);
1752 hi = _mm256_madd_epi16(hi, scale_coeff);
1753 lo = _mm256_srai_epi32(lo, NewSqrt2Bits);
1754 hi = _mm256_srai_epi32(hi, NewSqrt2Bits);
1755 lo = _mm256_add_epi32(lo, shift__r);
1756 hi = _mm256_add_epi32(hi, shift__r);
1757 lo = _mm256_srai_epi32(lo, -shift);
1758 hi = _mm256_srai_epi32(hi, -shift);
1759 const __m256i x = _mm256_packs_epi32(lo, hi);
1760 write_recon_w16_avx2(x, output);
1761 output += stride;
1762 }
1763 }
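
// Per-pixel shape of the column pass above, written out as a sketch. The
// clip and add-to-prediction happen inside write_recon_w16_avx2, whose body
// lives elsewhere; clip_to_uint8 below is only illustrative shorthand:
//   v = (buf * NewSqrt2list[txh_idx] + (1 << (NewSqrt2Bits - 1))) >> NewSqrt2Bits;
//   v = (v + (1 << (-shift - 1))) >> -shift;        // shift is negative here
//   output[x] = clip_to_uint8(output[x] + v);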
1764
1765 static inline void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input,
1766 uint8_t *output, int stride,
1767 TX_SIZE tx_size,
1768 int32_t eob) {
1769 (void)eob;
1770 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
1771 const int txw_idx = get_txw_idx(tx_size);
1772 const int txh_idx = get_txh_idx(tx_size);
1773 const int txfm_size_col = tx_size_wide[tx_size];
1774 const int txfm_size_row = tx_size_high[tx_size];
1775 const int col_max = AOMMIN(32, txfm_size_col);
1776 const int row_max = AOMMIN(32, txfm_size_row);
1777 const int input_stride = row_max;
1778 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
1779 __m256i buf[32];
1780
1781 for (int i = 0; i < (col_max >> 4); ++i) {
1782 for (int j = 0; j < (row_max >> 4); j++) {
1783 iidentity_row_16xn_avx2(buf, input + j * 16 + i * 16 * input_stride,
1784 row_max, shift[0], 16, txw_idx, rect_type);
1785 transpose_16bit_16x16_avx2(buf, buf);
1786 iidentity_col_16xn_avx2(output + i * 16 + j * 16 * stride, stride, buf,
1787 shift[1], 16, txh_idx);
1788 }
1789 }
1790 }
1791
1792 static inline void lowbd_inv_txfm2d_add_h_identity_avx2(
1793 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
1794 TX_SIZE tx_size, int eob) {
1795 int eobx, eoby;
1796 get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
1797 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
1798 const int txw_idx = get_txw_idx(tx_size);
1799 const int txh_idx = get_txh_idx(tx_size);
1800 const int txfm_size_col = tx_size_wide[tx_size];
1801 const int txfm_size_row = tx_size_high[tx_size];
1802 const int txfm_size_row_notzero = AOMMIN(32, txfm_size_row);
1803 const int input_stride = txfm_size_row_notzero;
1804 const int buf_size_w_div16 = (eobx + 16) >> 4;
1805 const int buf_size_h_div16 = (eoby + 16) >> 4;
1806 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
1807
1808 const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
1809 const transform_1d_avx2 col_txfm =
1810 lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
1811
1812 assert(col_txfm != NULL);
1813
1814 int ud_flip, lr_flip;
1815 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
1816 for (int i = 0; i < buf_size_w_div16; i++) {
1817 __m256i buf0[64];
1818 for (int j = 0; j < buf_size_h_div16; j++) {
1819 __m256i *buf0_cur = buf0 + j * 16;
1820 const int32_t *input_cur = input + i * 16 * input_stride + j * 16;
1821 iidentity_row_16xn_avx2(buf0_cur, input_cur, input_stride, shift[0], 16,
1822 txw_idx, rect_type);
1823 transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
1824 }
1825 col_txfm(buf0, buf0);
1826 __m256i mshift = _mm256_set1_epi16(1 << (15 + shift[1]));
1827 int k = ud_flip ? (txfm_size_row - 1) : 0;
1828 const int step = ud_flip ? -1 : 1;
1829 for (int j = 0; j < txfm_size_row; ++j, k += step) {
1830 __m256i res = _mm256_mulhrs_epi16(buf0[k], mshift);
1831 write_recon_w16_avx2(res, output + (i << 4) + j * stride);
1832 }
1833 }
1834 }
1835
1836 static inline void lowbd_inv_txfm2d_add_v_identity_avx2(
1837 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
1838 TX_SIZE tx_size, int eob) {
1839 __m256i buf1[64];
1840 int eobx, eoby;
1841 get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
1842 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
1843 const int txw_idx = get_txw_idx(tx_size);
1844 const int txh_idx = get_txh_idx(tx_size);
1845 const int txfm_size_col = tx_size_wide[tx_size];
1846 const int txfm_size_row = tx_size_high[tx_size];
1847 const int buf_size_w_div16 = txfm_size_col >> 4;
1848 const int buf_size_h_div16 = (eoby + 16) >> 4;
1849 const int buf_size_nonzero_w = ((eobx + 8) >> 3) << 3;
1850 const int input_stride = AOMMIN(32, txfm_size_row);
1851 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
1852
1853 const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
1854 const transform_1d_avx2 row_txfm =
1855 lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
1856
1857 assert(row_txfm != NULL);
1858
1859 int ud_flip, lr_flip;
1860 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
1861 for (int i = 0; i < buf_size_h_div16; i++) {
1862 __m256i buf0[64];
1863 load_buffer_32bit_to_16bit_w16_avx2(input + i * 16, input_stride, buf0,
1864 buf_size_nonzero_w);
1865 if (rect_type == 1 || rect_type == -1) {
1866 round_shift_avx2(buf0, buf0, buf_size_nonzero_w); // rect special code
1867 }
1868 row_txfm(buf0, buf0);
1869 round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
1870 __m256i *_buf1 = buf1;
1871 if (lr_flip) {
1872 for (int j = 0; j < buf_size_w_div16; ++j) {
1873 __m256i temp[16];
1874 flip_buf_avx2(buf0 + 16 * j, temp, 16);
1875 transpose_16bit_16x16_avx2(temp,
1876 _buf1 + 16 * (buf_size_w_div16 - 1 - j));
1877 }
1878 } else {
1879 for (int j = 0; j < buf_size_w_div16; ++j) {
1880 transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j);
1881 }
1882 }
1883 for (int j = 0; j < buf_size_w_div16; ++j) {
1884 iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride,
1885 buf1 + j * 16, shift[1], 16, txh_idx);
1886 }
1887 }
1888 }
1889
1890 static const transform_1d_ssse3 lowbd_txfm_all_1d_zeros_8x8_arr[2][2] = {
1891 { av1_idct8_low1_ssse3, av1_idct8_sse2 },
1892 { av1_iadst8_low1_ssse3, av1_iadst8_sse2 }
1893 };
1894
1895 static inline void load_buffer_avx2(const int32_t *in, int stride,
1896 __m128i *out) {
1897 const __m256i a = _mm256_load_si256((const __m256i *)in);
1898 const __m256i b = _mm256_load_si256((const __m256i *)(in + stride * 1));
1899 const __m256i c = _mm256_load_si256((const __m256i *)(in + stride * 2));
1900 const __m256i d = _mm256_load_si256((const __m256i *)(in + stride * 3));
1901 const __m256i e = _mm256_load_si256((const __m256i *)(in + stride * 4));
1902 const __m256i f = _mm256_load_si256((const __m256i *)(in + stride * 5));
1903 const __m256i g = _mm256_load_si256((const __m256i *)(in + stride * 6));
1904 const __m256i h = _mm256_load_si256((const __m256i *)(in + stride * 7));
1905
1906 // a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7
1907 const __m256i ab_16bit = _mm256_packs_epi32(a, b);
1908 // c0 c1 c2 c3 d0 d1 d2 d3 c4 c5 c6 c7 d4 d5 d6 d7
1909 const __m256i cd_16bit = _mm256_packs_epi32(c, d);
1910 // e0 e1 e2 e3 f0 f1 f2 f3 e4 e5 e6 e7 f4 f5 f6 f7
1911 const __m256i ef_16bit = _mm256_packs_epi32(e, f);
1912 // g0 g1 g2 g3 h0 h1 h2 h3 g4 g5 g6 g7 h4 h5 h6 h7
1913 const __m256i gh_16bit = _mm256_packs_epi32(g, h);
1914
1915 // a0 a1 a2 a3 a4 a5 a6 a7 b0 b1 b2 b3 b4 b5 b6 b7
1916 const __m256i ab = _mm256_permute4x64_epi64(ab_16bit, 0xd8);
1917 // c0 c1 c2 c3 c4 c5 c6 c7 d0 d1 d2 d3 d4 d5 d6 d7
1918 const __m256i cd = _mm256_permute4x64_epi64(cd_16bit, 0xd8);
1919 // e0 e1 e2 e3 e4 e5 e6 e7 f0 f1 f2 f3 f4 f5 f6 f7
1920 const __m256i ef = _mm256_permute4x64_epi64(ef_16bit, 0xd8);
1921 // g0 g1 g2 g3 g4 g5 g6 g7 h0 h1 h2 h3 h4 h5 h6 h7
1922 const __m256i gh = _mm256_permute4x64_epi64(gh_16bit, 0xd8);
1923
1924 out[0] = _mm256_castsi256_si128(ab);
1925 out[1] = _mm256_extractf128_si256(ab, 1);
1926 out[2] = _mm256_castsi256_si128(cd);
1927 out[3] = _mm256_extractf128_si256(cd, 1);
1928 out[4] = _mm256_castsi256_si128(ef);
1929 out[5] = _mm256_extractf128_si256(ef, 1);
1930 out[6] = _mm256_castsi256_si128(gh);
1931 out[7] = _mm256_extractf128_si256(gh, 1);
1932 }
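
// Note on the 0xd8 selector used above: _mm256_permute4x64_epi64 takes an
// immediate whose 2-bit fields pick source 64-bit lanes, and 0xd8 is binary
// 11 01 10 00, i.e. lane order {0, 2, 1, 3}. That swap undoes the per-128-bit
// interleaving introduced by _mm256_packs_epi32 so each row ends up
// contiguous. The same selector written with the conventional shuffle macro
// (equivalent value, shown only for illustration):
//   _mm256_permute4x64_epi64(ab_16bit, _MM_SHUFFLE(3, 1, 2, 0));  // == 0xd8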
1933
1934 static inline void round_and_transpose_avx2(const __m128i *const in,
1935 __m128i *const out, int bit,
1936 int *lr_flip) {
1937 __m256i buf_temp[4];
1938 const __m256i scale = _mm256_set1_epi16(1 << (15 + bit));
1939 int j = *lr_flip ? 7 : 0;
1940 const int step = *lr_flip ? -1 : 1;
1941
1942 // 70 71 72 73 74 75 76 77 | 30 31 32 33 34 35 36 37
1943 buf_temp[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
1944 in[j + 4 * step], 1);
1945 j += step;
1946 // 60 61 62 63 64 65 66 67 | 20 21 22 23 24 25 26 27
1947 buf_temp[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
1948 in[j + 4 * step], 1);
1949 j += step;
1950 // 50 51 52 53 54 55 56 57 | 10 11 12 13 14 15 16 17
1951 buf_temp[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
1952 in[j + 4 * step], 1);
1953 j += step;
1954 // 40 41 42 43 44 45 46 47 | 00 01 02 03 04 05 06 07
1955 buf_temp[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]),
1956 in[j + 4 * step], 1);
1957
1958 // 70 71 72 73 74 75 76 77 | 30 31 32 33 34 35 36 37
1959 buf_temp[0] = _mm256_mulhrs_epi16(buf_temp[0], scale);
1960 // 60 61 62 63 64 65 66 67 | 20 21 22 23 24 25 26 27
1961 buf_temp[1] = _mm256_mulhrs_epi16(buf_temp[1], scale);
1962 // 50 51 52 53 54 55 56 57 | 10 11 12 13 14 15 16 17
1963 buf_temp[2] = _mm256_mulhrs_epi16(buf_temp[2], scale);
1964 // 40 41 42 43 44 45 46 47 | 00 01 02 03 04 05 06 07
1965 buf_temp[3] = _mm256_mulhrs_epi16(buf_temp[3], scale);
1966
1967 // 70 60 71 61 72 62 73 63 | 30 20 31 21 32 22 33 23
1968 const __m256i unpcklo0 = _mm256_unpacklo_epi16(buf_temp[0], buf_temp[1]);
1969 // 74 64 75 65 76 66 77 67 | 34 24 35 25 36 26 37 27
1970 const __m256i unpckhi0 = _mm256_unpackhi_epi16(buf_temp[0], buf_temp[1]);
1971 // 50 40 51 41 52 42 53 43 | 10 00 11 01 12 02 13 03
1972 const __m256i unpcklo1 = _mm256_unpacklo_epi16(buf_temp[2], buf_temp[3]);
1973 // 54 44 55 45 56 46 57 47 | 14 04 15 05 16 06 17 07
1974 const __m256i unpckhi1 = _mm256_unpackhi_epi16(buf_temp[2], buf_temp[3]);
1975
1976 // 70 60 50 40 71 61 51 41 | 30 20 10 00 31 21 11 01
1977 const __m256i unpcklo00 = _mm256_unpacklo_epi32(unpcklo0, unpcklo1);
1978 // 72 62 52 42 73 63 53 43 | 32 22 12 02 33 23 13 03
1979 const __m256i unpckhi00 = _mm256_unpackhi_epi32(unpcklo0, unpcklo1);
1980 // 74 64 54 44 75 65 55 45 | 34 24 14 04 35 25 15 05
1981 const __m256i unpcklo01 = _mm256_unpacklo_epi32(unpckhi0, unpckhi1);
1982 // 76 66 56 46 77 67 57 47 | 36 26 16 06 37 27 17 07
1983 const __m256i unpckhi01 = _mm256_unpackhi_epi32(unpckhi0, unpckhi1);
1984
1985 // 70 60 50 40 30 20 10 00 | 71 61 51 41 31 21 11 01
1986 const __m256i reg_00 = _mm256_permute4x64_epi64(unpcklo00, 0xd8);
1987 // 72 62 52 42 32 22 12 02 | 73 63 53 43 33 23 13 03
1988 const __m256i reg_01 = _mm256_permute4x64_epi64(unpckhi00, 0xd8);
1989 // 74 64 54 44 34 24 14 04 | 75 65 55 45 35 25 15 05
1990 const __m256i reg_10 = _mm256_permute4x64_epi64(unpcklo01, 0xd8);
1991 // 76 66 56 46 36 26 16 06 | 77 67 57 47 37 27 17 07
1992 const __m256i reg_11 = _mm256_permute4x64_epi64(unpckhi01, 0xd8);
1993
1994 // 70 60 50 40 30 20 10 00
1995 out[0] = _mm256_castsi256_si128(reg_00);
1996 // 71 61 51 41 31 21 11 01
1997 out[1] = _mm256_extracti128_si256(reg_00, 1);
1998 // 72 62 52 42 32 22 12 02
1999 out[2] = _mm256_castsi256_si128(reg_01);
2000 // 73 63 53 43 33 23 13 03
2001 out[3] = _mm256_extracti128_si256(reg_01, 1);
2002 // 74 64 54 44 34 24 14 04
2003 out[4] = _mm256_castsi256_si128(reg_10);
2004 // 75 65 55 45 35 25 15 05
2005 out[5] = _mm256_extracti128_si256(reg_10, 1);
2006 // 76 66 56 46 36 26 16 06
2007 out[6] = _mm256_castsi256_si128(reg_11);
2008 // 77 67 57 47 37 27 17 07
2009 out[7] = _mm256_extracti128_si256(reg_11, 1);
2010 }
2011
2012 static inline void round_shift_lowbd_write_buffer_avx2(__m128i *in, int bit,
2013 uint8_t *output,
2014 int stride, int flipud) {
2015 __m256i in_256[4], v_256[4];
2016 int j = flipud ? 7 : 0;
2017 const int step = flipud ? -1 : 1;
2018 const __m256i scale = _mm256_set1_epi16(1 << (15 + bit));
2019 const __m256i zero = _mm256_setzero_si256();
2020 // in[0], in[1]
2021 in_256[0] =
2022 _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
2023 j += 2 * step;
2024 // in[2], in[3]
2025 in_256[1] =
2026 _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
2027 j += 2 * step;
2028 // in[4], in[5]
2029 in_256[2] =
2030 _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
2031 j += 2 * step;
2032 // in[6], in[7]
2033 in_256[3] =
2034 _mm256_inserti128_si256(_mm256_castsi128_si256(in[j]), in[j + step], 1);
2035
2036 // i00 i01 i02 i03 i04 i05 i06 i07 i10 i11 i12 i13 i14 i15 i16 i17
2037 in_256[0] = _mm256_mulhrs_epi16(in_256[0], scale);
2038 // i20 i21 i22 i23 i24 i25 i26 i27 i30 i31 i32 i33 i34 i35 i36 i37
2039 in_256[1] = _mm256_mulhrs_epi16(in_256[1], scale);
2040 // i40 i41 i42 i43 i44 i45 i46 i47 i50 i51 i52 i53 i54 i55 i56 i57
2041 in_256[2] = _mm256_mulhrs_epi16(in_256[2], scale);
2042 // i60 i61 i62 i63 i64 i65 i66 i67 i70 i71 i72 i73 i74 i75 i76 i77
2043 in_256[3] = _mm256_mulhrs_epi16(in_256[3], scale);
2044
2045 const __m128i v0 = _mm_loadl_epi64((__m128i const *)(output));
2046 const __m128i v1 = _mm_loadl_epi64((__m128i const *)(output + stride));
2047 const __m128i v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
2048 const __m128i v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));
2049 const __m128i v4 = _mm_loadl_epi64((__m128i const *)(output + 4 * stride));
2050 const __m128i v5 = _mm_loadl_epi64((__m128i const *)(output + 5 * stride));
2051 const __m128i v6 = _mm_loadl_epi64((__m128i const *)(output + 6 * stride));
2052 const __m128i v7 = _mm_loadl_epi64((__m128i const *)(output + 7 * stride));
2053
2054 v_256[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(v0), v1, 1);
2055 v_256[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(v2), v3, 1);
2056 v_256[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(v4), v5, 1);
2057 v_256[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(v6), v7, 1);
2058
2059 const __m256i unpcklo0 = _mm256_unpacklo_epi8(v_256[0], zero);
2060 const __m256i unpcklo1 = _mm256_unpacklo_epi8(v_256[1], zero);
2061 const __m256i unpcklo2 = _mm256_unpacklo_epi8(v_256[2], zero);
2062 const __m256i unpcklo3 = _mm256_unpacklo_epi8(v_256[3], zero);
2063 // 00 01 10 11
2064 const __m256i x0 = _mm256_adds_epi16(in_256[0], unpcklo0);
2065 // 20 21 30 31
2066 const __m256i x1 = _mm256_adds_epi16(in_256[1], unpcklo1);
2067 // 40 41 50 51
2068 const __m256i x2 = _mm256_adds_epi16(in_256[2], unpcklo2);
2069 // 60 61 70 71
2070 const __m256i x3 = _mm256_adds_epi16(in_256[3], unpcklo3);
2071
2072 // 00 01 20 21 10 11 30 31
2073 const __m256i res_0123 = _mm256_packus_epi16(x0, x1);
2074 // 40 41 60 61 50 51 70 71
2075 const __m256i res_4567 = _mm256_packus_epi16(x2, x3);
2076
2077 // 00 01 20 21
2078 const __m128i res_02 = _mm256_castsi256_si128(res_0123);
2079 // 10 11 30 31
2080 const __m128i res_13 = _mm256_extracti128_si256(res_0123, 1);
2081 // 40 41 60 61
2082 const __m128i res_46 = _mm256_castsi256_si128(res_4567);
2083 // 50 51 70 71
2084 const __m128i res_57 = _mm256_extracti128_si256(res_4567, 1);
2085
2086 // 00 01
2087 _mm_storel_epi64((__m128i *)(output), res_02);
2088 // 10 11
2089 _mm_storel_epi64((__m128i *)(output + stride), res_13);
2090 // 20 21
2091 _mm_storel_epi64((__m128i *)(output + 2 * stride),
2092 _mm_unpackhi_epi64(res_02, res_02));
2093 // 30 31
2094 _mm_storel_epi64((__m128i *)(output + 3 * stride),
2095 _mm_unpackhi_epi64(res_13, res_13));
2096 // 40 41
2097 _mm_storel_epi64((__m128i *)(output + 4 * stride), res_46);
2098 // 50 51
2099 _mm_storel_epi64((__m128i *)(output + 5 * stride), res_57);
2100 // 60 61
2101 _mm_storel_epi64((__m128i *)(output + 6 * stride),
2102 _mm_unpackhi_epi64(res_46, res_46));
2103 // 70 71
2104 _mm_storel_epi64((__m128i *)(output + 7 * stride),
2105 _mm_unpackhi_epi64(res_57, res_57));
2106 }
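
// Per pixel, the routine above performs ordinary lowbd reconstruction:
// round-shift the residual by -bit, add it to the prediction already stored
// in output, and clamp to 8 bits (the packus step). A scalar sketch
// (hypothetical helper name):
static inline uint8_t recon_pixel_sketch(uint8_t pred, int16_t resid,
                                         int bit) {
  const int32_t shifted = (resid + (1 << (-bit - 1))) >> -bit;  // bit < 0
  const int32_t sum = (int32_t)pred + shifted;
  return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
}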
2107
2108 // The AVX2 implementation has an advantage when multiple operations are
2109 // combined.
2110 static inline void lowbd_inv_txfm2d_8x8_no_identity_avx2(
2111 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
2112 TX_SIZE tx_size, int eob) {
2113 __m128i buf1[8];
2114 const int input_stride = 8;
2115 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
2116 assert(hitx_1d_tab[tx_type] < 2);
2117 assert(vitx_1d_tab[tx_type] < 2);
2118 const transform_1d_ssse3 row_txfm =
2119 lowbd_txfm_all_1d_zeros_8x8_arr[hitx_1d_tab[tx_type]][eob != 1];
2120 const transform_1d_ssse3 col_txfm =
2121 lowbd_txfm_all_1d_zeros_8x8_arr[vitx_1d_tab[tx_type]][eob != 1];
2122
2123 assert(col_txfm != NULL);
2124 assert(row_txfm != NULL);
2125 int ud_flip, lr_flip;
2126 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2127
2128 __m128i buf0[8];
2129 __m128i *buf0_cur = buf0;
2130 load_buffer_avx2(input, input_stride, buf0_cur);
2131 row_txfm(buf0, buf0);
2132
2133 assert(shift[0] < 0);
2134 __m128i *_buf1 = buf1;
2135 round_and_transpose_avx2(buf0, _buf1, shift[0], &lr_flip);
2136 assert(shift[1] < 0);
2137 col_txfm(buf1, buf1);
2138 round_shift_lowbd_write_buffer_avx2(buf1, shift[1], output, stride, ud_flip);
2139 }
2140
2141 // AVX2 implementation of the 8x8 inverse transform. It was observed that an
2142 // AVX2 path for tx_types with identity in either direction gives no advantage.
2143 static void lowbd_inv_txfm2d_add_8x8_avx2(const int32_t *input, uint8_t *output,
2144 int stride, TX_TYPE tx_type,
2145 TX_SIZE tx_size, int eob) {
2146 switch (tx_type) {
2147 case IDTX:
2148 av1_lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size);
2149
2150 break;
2151 case V_DCT:
2152 case V_ADST:
2153 case V_FLIPADST:
2154 av1_lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type,
2155 tx_size, eob);
2156 break;
2157 case H_DCT:
2158 case H_ADST:
2159 case H_FLIPADST:
2160 av1_lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type,
2161 tx_size, eob);
2162 break;
2163 default:
2164 lowbd_inv_txfm2d_8x8_no_identity_avx2(input, output, stride, tx_type,
2165 tx_size, eob);
2166 }
2167 }
2168
2169 // For 16x16, 32x32, 32x64, 64x32, 64x64, 16x32, 32x16, 64x16 and 16x64 blocks.
2170 static inline void lowbd_inv_txfm2d_add_universe_avx2(
2171 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
2172 TX_SIZE tx_size, int eob) {
2173 (void)eob;
2174 switch (tx_type) {
2175 case DCT_DCT:
2176 case ADST_DCT: // ADST in vertical, DCT in horizontal
2177 case DCT_ADST: // DCT in vertical, ADST in horizontal
2178 case ADST_ADST: // ADST in both directions
2179 case FLIPADST_DCT:
2180 case DCT_FLIPADST:
2181 case FLIPADST_FLIPADST:
2182 case ADST_FLIPADST:
2183 case FLIPADST_ADST:
2184 lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type,
2185 tx_size, eob);
2186 break;
2187 case IDTX:
2188 lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob);
2189 break;
2190 case V_DCT:
2191 case V_ADST:
2192 case V_FLIPADST:
2193 lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type,
2194 tx_size, eob);
2195 break;
2196 case H_DCT:
2197 case H_ADST:
2198 case H_FLIPADST:
2199 lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type,
2200 tx_size, eob);
2201 break;
2202 default:
2203 av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
2204 eob);
2205 break;
2206 }
2207 }
2208
2209 void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
2210 int stride, TX_TYPE tx_type, TX_SIZE tx_size,
2211 int eob) {
2212 switch (tx_size) {
2213 case TX_4X4:
2214 case TX_4X8:
2215 case TX_8X4:
2216 case TX_8X16:
2217 case TX_16X8:
2218 case TX_4X16:
2219 case TX_16X4:
2220 case TX_8X32:
2221 case TX_32X8:
2222 av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
2223 eob);
2224 break;
2225 case TX_8X8:
2226 lowbd_inv_txfm2d_add_8x8_avx2(input, output, stride, tx_type, tx_size,
2227 eob);
2228 break;
2229 case TX_16X16:
2230 case TX_32X32:
2231 case TX_64X64:
2232 case TX_16X32:
2233 case TX_32X16:
2234 case TX_32X64:
2235 case TX_64X32:
2236 case TX_16X64:
2237 case TX_64X16:
2238 default:
2239 lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type,
2240 tx_size, eob);
2241 break;
2242 }
2243 }
2244
2245 void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
2246 const TxfmParam *txfm_param) {
2247 const TX_TYPE tx_type = txfm_param->tx_type;
2248 if (!txfm_param->lossless) {
2249 av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type,
2250 txfm_param->tx_size, txfm_param->eob);
2251 } else {
2252 av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
2253 }
2254 }
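
// Hedged usage sketch for the entry point above (illustration only, not part
// of the library): a decoder-side caller passes the dequantized coefficients
// and a TxfmParam. Only the fields this file reads (tx_type, tx_size, eob,
// lossless) are shown; anything else TxfmParam carries must be filled in by
// the real caller.
//   TxfmParam param = { 0 };
//   param.tx_type = DCT_DCT;
//   param.tx_size = TX_16X16;
//   param.eob = eob;      // coeff count up to and including the last nonzero one
//   param.lossless = 0;
//   av1_inv_txfm_add_avx2(dqcoeff, dst, dst_stride, &param);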
2255