/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "config/av1_rtcd.h"

#include "av1/common/enums.h"
#include "av1/common/av1_txfm.h"
#include "av1/encoder/x86/av1_fwd_txfm_avx2.h"
#include "av1/common/x86/av1_txfm_sse2.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
#include "av1/encoder/x86/av1_txfm1d_sse4.h"
#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
#include "aom_dsp/x86/txfm_common_avx2.h"

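// Forward 16-point DCT on 16-bit coefficients. Each __m256i carries 16
// int16 lanes, so one call transforms 16 independent columns in parallel.
// The butterfly stages use the cosine pairs pre-packed below, and the final
// stage writes the results in bit-reversed order.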
static inline void fdct16x16_new_avx2(const __m256i *input, __m256i *output,
                                      int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));

  __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
  __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
  __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
  __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
  __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
  __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
  __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
  __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);

  // stage 1
  __m256i x1[16];
  btf_16_adds_subs_out_avx2(&x1[0], &x1[15], input[0], input[15]);
  btf_16_adds_subs_out_avx2(&x1[1], &x1[14], input[1], input[14]);
  btf_16_adds_subs_out_avx2(&x1[2], &x1[13], input[2], input[13]);
  btf_16_adds_subs_out_avx2(&x1[3], &x1[12], input[3], input[12]);
  btf_16_adds_subs_out_avx2(&x1[4], &x1[11], input[4], input[11]);
  btf_16_adds_subs_out_avx2(&x1[5], &x1[10], input[5], input[10]);
  btf_16_adds_subs_out_avx2(&x1[6], &x1[9], input[6], input[9]);
  btf_16_adds_subs_out_avx2(&x1[7], &x1[8], input[7], input[8]);

  // stage 2
  btf_16_adds_subs_avx2(&x1[0], &x1[7]);
  btf_16_adds_subs_avx2(&x1[1], &x1[6]);
  btf_16_adds_subs_avx2(&x1[2], &x1[5]);
  btf_16_adds_subs_avx2(&x1[3], &x1[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);

  // stage 3
  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[8], &x1[11]);
  btf_16_adds_subs_avx2(&x1[9], &x1[10]);
  btf_16_adds_subs_avx2(&x1[15], &x1[12]);
  btf_16_adds_subs_avx2(&x1[14], &x1[13]);

  // stage 4
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
  btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);

  // stage 5
  btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
  btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  btf_16_adds_subs_avx2(&x1[15], &x1[14]);

  // stage 6
  btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
  btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);

  // stage 7
  output[0] = x1[0];
  output[1] = x1[8];
  output[2] = x1[4];
  output[3] = x1[12];
  output[4] = x1[2];
  output[5] = x1[10];
  output[6] = x1[6];
  output[7] = x1[14];
  output[8] = x1[1];
  output[9] = x1[9];
  output[10] = x1[5];
  output[11] = x1[13];
  output[12] = x1[3];
  output[13] = x1[11];
  output[14] = x1[7];
  output[15] = x1[15];
}

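// Forward 32-point DCT on 16-bit coefficients, 16 columns per call. Same
// butterfly structure as the 16-point version with two extra stages; stage 9
// emits the outputs in bit-reversed order.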
static inline void fdct16x32_avx2(const __m256i *input, __m256i *output,
                                  int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));

  __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
  __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
  __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
  __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
  __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
  __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
  __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
  __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
  __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
  __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
  __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
  __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
  __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
  __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
  __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
  __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
  __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
  __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
  __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
  __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
  __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
  __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
  __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
  __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);

  // stage 1
  __m256i x1[32];
  btf_16_adds_subs_out_avx2(&x1[0], &x1[31], input[0], input[31]);
  btf_16_adds_subs_out_avx2(&x1[1], &x1[30], input[1], input[30]);
  btf_16_adds_subs_out_avx2(&x1[2], &x1[29], input[2], input[29]);
  btf_16_adds_subs_out_avx2(&x1[3], &x1[28], input[3], input[28]);
  btf_16_adds_subs_out_avx2(&x1[4], &x1[27], input[4], input[27]);
  btf_16_adds_subs_out_avx2(&x1[5], &x1[26], input[5], input[26]);
  btf_16_adds_subs_out_avx2(&x1[6], &x1[25], input[6], input[25]);
  btf_16_adds_subs_out_avx2(&x1[7], &x1[24], input[7], input[24]);
  btf_16_adds_subs_out_avx2(&x1[8], &x1[23], input[8], input[23]);
  btf_16_adds_subs_out_avx2(&x1[9], &x1[22], input[9], input[22]);
  btf_16_adds_subs_out_avx2(&x1[10], &x1[21], input[10], input[21]);
  btf_16_adds_subs_out_avx2(&x1[11], &x1[20], input[11], input[20]);
  btf_16_adds_subs_out_avx2(&x1[12], &x1[19], input[12], input[19]);
  btf_16_adds_subs_out_avx2(&x1[13], &x1[18], input[13], input[18]);
  btf_16_adds_subs_out_avx2(&x1[14], &x1[17], input[14], input[17]);
  btf_16_adds_subs_out_avx2(&x1[15], &x1[16], input[15], input[16]);

  // stage 2
  btf_16_adds_subs_avx2(&x1[0], &x1[15]);
  btf_16_adds_subs_avx2(&x1[1], &x1[14]);
  btf_16_adds_subs_avx2(&x1[2], &x1[13]);
  btf_16_adds_subs_avx2(&x1[3], &x1[12]);
  btf_16_adds_subs_avx2(&x1[4], &x1[11]);
  btf_16_adds_subs_avx2(&x1[5], &x1[10]);
  btf_16_adds_subs_avx2(&x1[6], &x1[9]);
  btf_16_adds_subs_avx2(&x1[7], &x1[8]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);

  // stage 3
  btf_16_adds_subs_avx2(&x1[0], &x1[7]);
  btf_16_adds_subs_avx2(&x1[1], &x1[6]);
  btf_16_adds_subs_avx2(&x1[2], &x1[5]);
  btf_16_adds_subs_avx2(&x1[3], &x1[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[16], &x1[23]);
  btf_16_adds_subs_avx2(&x1[17], &x1[22]);
  btf_16_adds_subs_avx2(&x1[18], &x1[21]);
  btf_16_adds_subs_avx2(&x1[19], &x1[20]);
  btf_16_adds_subs_avx2(&x1[31], &x1[24]);
  btf_16_adds_subs_avx2(&x1[30], &x1[25]);
  btf_16_adds_subs_avx2(&x1[29], &x1[26]);
  btf_16_adds_subs_avx2(&x1[28], &x1[27]);

  // stage 4
  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[8], &x1[11]);
  btf_16_adds_subs_avx2(&x1[9], &x1[10]);
  btf_16_adds_subs_avx2(&x1[15], &x1[12]);
  btf_16_adds_subs_avx2(&x1[14], &x1[13]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);

  // stage 5
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
  btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[16], &x1[19]);
  btf_16_adds_subs_avx2(&x1[17], &x1[18]);
  btf_16_adds_subs_avx2(&x1[23], &x1[20]);
  btf_16_adds_subs_avx2(&x1[22], &x1[21]);
  btf_16_adds_subs_avx2(&x1[24], &x1[27]);
  btf_16_adds_subs_avx2(&x1[25], &x1[26]);
  btf_16_adds_subs_avx2(&x1[31], &x1[28]);
  btf_16_adds_subs_avx2(&x1[30], &x1[29]);

  // stage 6
  btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
  btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);

  // stage 7
  btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
  btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[16], &x1[17]);
  btf_16_adds_subs_avx2(&x1[19], &x1[18]);
  btf_16_adds_subs_avx2(&x1[20], &x1[21]);
  btf_16_adds_subs_avx2(&x1[23], &x1[22]);
  btf_16_adds_subs_avx2(&x1[24], &x1[25]);
  btf_16_adds_subs_avx2(&x1[27], &x1[26]);
  btf_16_adds_subs_avx2(&x1[28], &x1[29]);
  btf_16_adds_subs_avx2(&x1[31], &x1[30]);

  // stage 8
  btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
  btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
  btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
  btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
  btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);

  // stage 9
  output[0] = x1[0];
  output[1] = x1[16];
  output[2] = x1[8];
  output[3] = x1[24];
  output[4] = x1[4];
  output[5] = x1[20];
  output[6] = x1[12];
  output[7] = x1[28];
  output[8] = x1[2];
  output[9] = x1[18];
  output[10] = x1[10];
  output[11] = x1[26];
  output[12] = x1[6];
  output[13] = x1[22];
  output[14] = x1[14];
  output[15] = x1[30];
  output[16] = x1[1];
  output[17] = x1[17];
  output[18] = x1[9];
  output[19] = x1[25];
  output[20] = x1[5];
  output[21] = x1[21];
  output[22] = x1[13];
  output[23] = x1[29];
  output[24] = x1[3];
  output[25] = x1[19];
  output[26] = x1[11];
  output[27] = x1[27];
  output[28] = x1[7];
  output[29] = x1[23];
  output[30] = x1[15];
  output[31] = x1[31];
}

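// Forward 64-point DCT on 16-bit coefficients, 16 columns per call. Ten
// butterfly stages feed the bit-reversed write-out in stage 11.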
static inline void fdct16x64_new_avx2(const __m256i *input, __m256i *output,
                                      int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));

  __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
  __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
  __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
  __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
  __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
  __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
  __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
  __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
  __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
  __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
  __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
  __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
  __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
  __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
  __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
  __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
  __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
  __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
  __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
  __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
  __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
  __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
  __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
  __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
  __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
  __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
  __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
  __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
  __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
  __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
  __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
  __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
  __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
  __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
  __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
  __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
  __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
  __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
  __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
  __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);
  __m256i cospi_p63_p01 = pair_set_w16_epi16(cospi[63], cospi[1]);
  __m256i cospi_m01_p63 = pair_set_w16_epi16(-cospi[1], cospi[63]);
  __m256i cospi_p31_p33 = pair_set_w16_epi16(cospi[31], cospi[33]);
  __m256i cospi_m33_p31 = pair_set_w16_epi16(-cospi[33], cospi[31]);
  __m256i cospi_p47_p17 = pair_set_w16_epi16(cospi[47], cospi[17]);
  __m256i cospi_m17_p47 = pair_set_w16_epi16(-cospi[17], cospi[47]);
  __m256i cospi_p15_p49 = pair_set_w16_epi16(cospi[15], cospi[49]);
  __m256i cospi_m49_p15 = pair_set_w16_epi16(-cospi[49], cospi[15]);
  __m256i cospi_p55_p09 = pair_set_w16_epi16(cospi[55], cospi[9]);
  __m256i cospi_m09_p55 = pair_set_w16_epi16(-cospi[9], cospi[55]);
  __m256i cospi_p23_p41 = pair_set_w16_epi16(cospi[23], cospi[41]);
  __m256i cospi_m41_p23 = pair_set_w16_epi16(-cospi[41], cospi[23]);
  __m256i cospi_p39_p25 = pair_set_w16_epi16(cospi[39], cospi[25]);
  __m256i cospi_m25_p39 = pair_set_w16_epi16(-cospi[25], cospi[39]);
  __m256i cospi_p07_p57 = pair_set_w16_epi16(cospi[7], cospi[57]);
  __m256i cospi_m57_p07 = pair_set_w16_epi16(-cospi[57], cospi[7]);
  __m256i cospi_p59_p05 = pair_set_w16_epi16(cospi[59], cospi[5]);
  __m256i cospi_m05_p59 = pair_set_w16_epi16(-cospi[5], cospi[59]);
  __m256i cospi_p27_p37 = pair_set_w16_epi16(cospi[27], cospi[37]);
  __m256i cospi_m37_p27 = pair_set_w16_epi16(-cospi[37], cospi[27]);
  __m256i cospi_p43_p21 = pair_set_w16_epi16(cospi[43], cospi[21]);
  __m256i cospi_m21_p43 = pair_set_w16_epi16(-cospi[21], cospi[43]);
  __m256i cospi_p11_p53 = pair_set_w16_epi16(cospi[11], cospi[53]);
  __m256i cospi_m53_p11 = pair_set_w16_epi16(-cospi[53], cospi[11]);
  __m256i cospi_p51_p13 = pair_set_w16_epi16(cospi[51], cospi[13]);
  __m256i cospi_m13_p51 = pair_set_w16_epi16(-cospi[13], cospi[51]);
  __m256i cospi_p19_p45 = pair_set_w16_epi16(cospi[19], cospi[45]);
  __m256i cospi_m45_p19 = pair_set_w16_epi16(-cospi[45], cospi[19]);
  __m256i cospi_p35_p29 = pair_set_w16_epi16(cospi[35], cospi[29]);
  __m256i cospi_m29_p35 = pair_set_w16_epi16(-cospi[29], cospi[35]);
  __m256i cospi_p03_p61 = pair_set_w16_epi16(cospi[3], cospi[61]);
  __m256i cospi_m61_p03 = pair_set_w16_epi16(-cospi[61], cospi[3]);

  // stage 1
  __m256i x1[64];
  btf_16_adds_subs_out_avx2(&x1[0], &x1[63], input[0], input[63]);
  btf_16_adds_subs_out_avx2(&x1[1], &x1[62], input[1], input[62]);
  btf_16_adds_subs_out_avx2(&x1[2], &x1[61], input[2], input[61]);
  btf_16_adds_subs_out_avx2(&x1[3], &x1[60], input[3], input[60]);
  btf_16_adds_subs_out_avx2(&x1[4], &x1[59], input[4], input[59]);
  btf_16_adds_subs_out_avx2(&x1[5], &x1[58], input[5], input[58]);
  btf_16_adds_subs_out_avx2(&x1[6], &x1[57], input[6], input[57]);
  btf_16_adds_subs_out_avx2(&x1[7], &x1[56], input[7], input[56]);
  btf_16_adds_subs_out_avx2(&x1[8], &x1[55], input[8], input[55]);
  btf_16_adds_subs_out_avx2(&x1[9], &x1[54], input[9], input[54]);
  btf_16_adds_subs_out_avx2(&x1[10], &x1[53], input[10], input[53]);
  btf_16_adds_subs_out_avx2(&x1[11], &x1[52], input[11], input[52]);
  btf_16_adds_subs_out_avx2(&x1[12], &x1[51], input[12], input[51]);
  btf_16_adds_subs_out_avx2(&x1[13], &x1[50], input[13], input[50]);
  btf_16_adds_subs_out_avx2(&x1[14], &x1[49], input[14], input[49]);
  btf_16_adds_subs_out_avx2(&x1[15], &x1[48], input[15], input[48]);
  btf_16_adds_subs_out_avx2(&x1[16], &x1[47], input[16], input[47]);
  btf_16_adds_subs_out_avx2(&x1[17], &x1[46], input[17], input[46]);
  btf_16_adds_subs_out_avx2(&x1[18], &x1[45], input[18], input[45]);
  btf_16_adds_subs_out_avx2(&x1[19], &x1[44], input[19], input[44]);
  btf_16_adds_subs_out_avx2(&x1[20], &x1[43], input[20], input[43]);
  btf_16_adds_subs_out_avx2(&x1[21], &x1[42], input[21], input[42]);
  btf_16_adds_subs_out_avx2(&x1[22], &x1[41], input[22], input[41]);
  btf_16_adds_subs_out_avx2(&x1[23], &x1[40], input[23], input[40]);
  btf_16_adds_subs_out_avx2(&x1[24], &x1[39], input[24], input[39]);
  btf_16_adds_subs_out_avx2(&x1[25], &x1[38], input[25], input[38]);
  btf_16_adds_subs_out_avx2(&x1[26], &x1[37], input[26], input[37]);
  btf_16_adds_subs_out_avx2(&x1[27], &x1[36], input[27], input[36]);
  btf_16_adds_subs_out_avx2(&x1[28], &x1[35], input[28], input[35]);
  btf_16_adds_subs_out_avx2(&x1[29], &x1[34], input[29], input[34]);
  btf_16_adds_subs_out_avx2(&x1[30], &x1[33], input[30], input[33]);
  btf_16_adds_subs_out_avx2(&x1[31], &x1[32], input[31], input[32]);

  // stage 2
  btf_16_adds_subs_avx2(&x1[0], &x1[31]);
  btf_16_adds_subs_avx2(&x1[1], &x1[30]);
  btf_16_adds_subs_avx2(&x1[2], &x1[29]);
  btf_16_adds_subs_avx2(&x1[3], &x1[28]);
  btf_16_adds_subs_avx2(&x1[4], &x1[27]);
  btf_16_adds_subs_avx2(&x1[5], &x1[26]);
  btf_16_adds_subs_avx2(&x1[6], &x1[25]);
  btf_16_adds_subs_avx2(&x1[7], &x1[24]);
  btf_16_adds_subs_avx2(&x1[8], &x1[23]);
  btf_16_adds_subs_avx2(&x1[9], &x1[22]);
  btf_16_adds_subs_avx2(&x1[10], &x1[21]);
  btf_16_adds_subs_avx2(&x1[11], &x1[20]);
  btf_16_adds_subs_avx2(&x1[12], &x1[19]);
  btf_16_adds_subs_avx2(&x1[13], &x1[18]);
  btf_16_adds_subs_avx2(&x1[14], &x1[17]);
  btf_16_adds_subs_avx2(&x1[15], &x1[16]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[40], &x1[55], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[41], &x1[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[42], &x1[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[43], &x1[52], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[44], &x1[51], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[45], &x1[50], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[46], &x1[49], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[47], &x1[48], _r, cos_bit);

  // stage 3
  btf_16_adds_subs_avx2(&x1[0], &x1[15]);
  btf_16_adds_subs_avx2(&x1[1], &x1[14]);
  btf_16_adds_subs_avx2(&x1[2], &x1[13]);
  btf_16_adds_subs_avx2(&x1[3], &x1[12]);
  btf_16_adds_subs_avx2(&x1[4], &x1[11]);
  btf_16_adds_subs_avx2(&x1[5], &x1[10]);
  btf_16_adds_subs_avx2(&x1[6], &x1[9]);
  btf_16_adds_subs_avx2(&x1[7], &x1[8]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[32], &x1[47]);
  btf_16_adds_subs_avx2(&x1[33], &x1[46]);
  btf_16_adds_subs_avx2(&x1[34], &x1[45]);
  btf_16_adds_subs_avx2(&x1[35], &x1[44]);
  btf_16_adds_subs_avx2(&x1[36], &x1[43]);
  btf_16_adds_subs_avx2(&x1[37], &x1[42]);
  btf_16_adds_subs_avx2(&x1[38], &x1[41]);
  btf_16_adds_subs_avx2(&x1[39], &x1[40]);
  btf_16_adds_subs_avx2(&x1[63], &x1[48]);
  btf_16_adds_subs_avx2(&x1[62], &x1[49]);
  btf_16_adds_subs_avx2(&x1[61], &x1[50]);
  btf_16_adds_subs_avx2(&x1[60], &x1[51]);
  btf_16_adds_subs_avx2(&x1[59], &x1[52]);
  btf_16_adds_subs_avx2(&x1[58], &x1[53]);
  btf_16_adds_subs_avx2(&x1[57], &x1[54]);
  btf_16_adds_subs_avx2(&x1[56], &x1[55]);

  // stage 4
  btf_16_adds_subs_avx2(&x1[0], &x1[7]);
  btf_16_adds_subs_avx2(&x1[1], &x1[6]);
  btf_16_adds_subs_avx2(&x1[2], &x1[5]);
  btf_16_adds_subs_avx2(&x1[3], &x1[4]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[16], &x1[23]);
  btf_16_adds_subs_avx2(&x1[17], &x1[22]);
  btf_16_adds_subs_avx2(&x1[18], &x1[21]);
  btf_16_adds_subs_avx2(&x1[19], &x1[20]);
  btf_16_adds_subs_avx2(&x1[31], &x1[24]);
  btf_16_adds_subs_avx2(&x1[30], &x1[25]);
  btf_16_adds_subs_avx2(&x1[29], &x1[26]);
  btf_16_adds_subs_avx2(&x1[28], &x1[27]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[36], &x1[59], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[37], &x1[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[38], &x1[57], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[39], &x1[56], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[40], &x1[55], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[41], &x1[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[42], &x1[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[43], &x1[52], _r, cos_bit);

  // stage 5
  btf_16_adds_subs_avx2(&x1[0], &x1[3]);
  btf_16_adds_subs_avx2(&x1[1], &x1[2]);
  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[8], &x1[11]);
  btf_16_adds_subs_avx2(&x1[9], &x1[10]);
  btf_16_adds_subs_avx2(&x1[15], &x1[12]);
  btf_16_adds_subs_avx2(&x1[14], &x1[13]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[32], &x1[39]);
  btf_16_adds_subs_avx2(&x1[33], &x1[38]);
  btf_16_adds_subs_avx2(&x1[34], &x1[37]);
  btf_16_adds_subs_avx2(&x1[35], &x1[36]);
  btf_16_adds_subs_avx2(&x1[47], &x1[40]);
  btf_16_adds_subs_avx2(&x1[46], &x1[41]);
  btf_16_adds_subs_avx2(&x1[45], &x1[42]);
  btf_16_adds_subs_avx2(&x1[44], &x1[43]);
  btf_16_adds_subs_avx2(&x1[48], &x1[55]);
  btf_16_adds_subs_avx2(&x1[49], &x1[54]);
  btf_16_adds_subs_avx2(&x1[50], &x1[53]);
  btf_16_adds_subs_avx2(&x1[51], &x1[52]);
  btf_16_adds_subs_avx2(&x1[63], &x1[56]);
  btf_16_adds_subs_avx2(&x1[62], &x1[57]);
  btf_16_adds_subs_avx2(&x1[61], &x1[58]);
  btf_16_adds_subs_avx2(&x1[60], &x1[59]);

  // stage 6
  btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
  btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[4], &x1[5]);
  btf_16_adds_subs_avx2(&x1[7], &x1[6]);
  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[16], &x1[19]);
  btf_16_adds_subs_avx2(&x1[17], &x1[18]);
  btf_16_adds_subs_avx2(&x1[23], &x1[20]);
  btf_16_adds_subs_avx2(&x1[22], &x1[21]);
  btf_16_adds_subs_avx2(&x1[24], &x1[27]);
  btf_16_adds_subs_avx2(&x1[25], &x1[26]);
  btf_16_adds_subs_avx2(&x1[31], &x1[28]);
  btf_16_adds_subs_avx2(&x1[30], &x1[29]);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[34], &x1[61], _r, cos_bit);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[35], &x1[60], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[36], &x1[59], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[37], &x1[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[42], &x1[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[43], &x1[52], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[44], &x1[51], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[45], &x1[50], _r, cos_bit);

  // stage 7
  btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
  btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[8], &x1[9]);
  btf_16_adds_subs_avx2(&x1[11], &x1[10]);
  btf_16_adds_subs_avx2(&x1[12], &x1[13]);
  btf_16_adds_subs_avx2(&x1[15], &x1[14]);
  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
  btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[32], &x1[35]);
  btf_16_adds_subs_avx2(&x1[33], &x1[34]);
  btf_16_adds_subs_avx2(&x1[39], &x1[36]);
  btf_16_adds_subs_avx2(&x1[38], &x1[37]);
  btf_16_adds_subs_avx2(&x1[40], &x1[43]);
  btf_16_adds_subs_avx2(&x1[41], &x1[42]);
  btf_16_adds_subs_avx2(&x1[47], &x1[44]);
  btf_16_adds_subs_avx2(&x1[46], &x1[45]);
  btf_16_adds_subs_avx2(&x1[48], &x1[51]);
  btf_16_adds_subs_avx2(&x1[49], &x1[50]);
  btf_16_adds_subs_avx2(&x1[55], &x1[52]);
  btf_16_adds_subs_avx2(&x1[54], &x1[53]);
  btf_16_adds_subs_avx2(&x1[56], &x1[59]);
  btf_16_adds_subs_avx2(&x1[57], &x1[58]);
  btf_16_adds_subs_avx2(&x1[63], &x1[60]);
  btf_16_adds_subs_avx2(&x1[62], &x1[61]);

  // stage 8
  btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
  btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
  btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
  btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[16], &x1[17]);
  btf_16_adds_subs_avx2(&x1[19], &x1[18]);
  btf_16_adds_subs_avx2(&x1[20], &x1[21]);
  btf_16_adds_subs_avx2(&x1[23], &x1[22]);
  btf_16_adds_subs_avx2(&x1[24], &x1[25]);
  btf_16_adds_subs_avx2(&x1[27], &x1[26]);
  btf_16_adds_subs_avx2(&x1[28], &x1[29]);
  btf_16_adds_subs_avx2(&x1[31], &x1[30]);
  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x1[33], &x1[62], _r, cos_bit);
  btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x1[34], &x1[61], _r, cos_bit);
  btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x1[37], &x1[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x1[38], &x1[57], _r, cos_bit);
  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x1[41], &x1[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x1[42], &x1[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x1[45], &x1[50], _r, cos_bit);
  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x1[46], &x1[49], _r, cos_bit);

  // stage 9
  btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
  btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
  btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
  btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
  btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
  btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
  btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
  btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);
  btf_16_adds_subs_avx2(&x1[32], &x1[33]);
  btf_16_adds_subs_avx2(&x1[35], &x1[34]);
  btf_16_adds_subs_avx2(&x1[36], &x1[37]);
  btf_16_adds_subs_avx2(&x1[39], &x1[38]);
  btf_16_adds_subs_avx2(&x1[40], &x1[41]);
  btf_16_adds_subs_avx2(&x1[43], &x1[42]);
  btf_16_adds_subs_avx2(&x1[44], &x1[45]);
  btf_16_adds_subs_avx2(&x1[47], &x1[46]);
  btf_16_adds_subs_avx2(&x1[48], &x1[49]);
  btf_16_adds_subs_avx2(&x1[51], &x1[50]);
  btf_16_adds_subs_avx2(&x1[52], &x1[53]);
  btf_16_adds_subs_avx2(&x1[55], &x1[54]);
  btf_16_adds_subs_avx2(&x1[56], &x1[57]);
  btf_16_adds_subs_avx2(&x1[59], &x1[58]);
  btf_16_adds_subs_avx2(&x1[60], &x1[61]);
  btf_16_adds_subs_avx2(&x1[63], &x1[62]);

  // stage 10
  btf_16_w16_avx2(cospi_p63_p01, cospi_m01_p63, &x1[32], &x1[63], _r, cos_bit);
  btf_16_w16_avx2(cospi_p31_p33, cospi_m33_p31, &x1[33], &x1[62], _r, cos_bit);
  btf_16_w16_avx2(cospi_p47_p17, cospi_m17_p47, &x1[34], &x1[61], _r, cos_bit);
  btf_16_w16_avx2(cospi_p15_p49, cospi_m49_p15, &x1[35], &x1[60], _r, cos_bit);
  btf_16_w16_avx2(cospi_p55_p09, cospi_m09_p55, &x1[36], &x1[59], _r, cos_bit);
  btf_16_w16_avx2(cospi_p23_p41, cospi_m41_p23, &x1[37], &x1[58], _r, cos_bit);
  btf_16_w16_avx2(cospi_p39_p25, cospi_m25_p39, &x1[38], &x1[57], _r, cos_bit);
  btf_16_w16_avx2(cospi_p07_p57, cospi_m57_p07, &x1[39], &x1[56], _r, cos_bit);
  btf_16_w16_avx2(cospi_p59_p05, cospi_m05_p59, &x1[40], &x1[55], _r, cos_bit);
  btf_16_w16_avx2(cospi_p27_p37, cospi_m37_p27, &x1[41], &x1[54], _r, cos_bit);
  btf_16_w16_avx2(cospi_p43_p21, cospi_m21_p43, &x1[42], &x1[53], _r, cos_bit);
  btf_16_w16_avx2(cospi_p11_p53, cospi_m53_p11, &x1[43], &x1[52], _r, cos_bit);
  btf_16_w16_avx2(cospi_p51_p13, cospi_m13_p51, &x1[44], &x1[51], _r, cos_bit);
  btf_16_w16_avx2(cospi_p19_p45, cospi_m45_p19, &x1[45], &x1[50], _r, cos_bit);
  btf_16_w16_avx2(cospi_p35_p29, cospi_m29_p35, &x1[46], &x1[49], _r, cos_bit);
  btf_16_w16_avx2(cospi_p03_p61, cospi_m61_p03, &x1[47], &x1[48], _r, cos_bit);

  // stage 11
  output[0] = x1[0];
  output[1] = x1[32];
  output[2] = x1[16];
  output[3] = x1[48];
  output[4] = x1[8];
  output[5] = x1[40];
  output[6] = x1[24];
  output[7] = x1[56];
  output[8] = x1[4];
  output[9] = x1[36];
  output[10] = x1[20];
  output[11] = x1[52];
  output[12] = x1[12];
  output[13] = x1[44];
  output[14] = x1[28];
  output[15] = x1[60];
  output[16] = x1[2];
  output[17] = x1[34];
  output[18] = x1[18];
  output[19] = x1[50];
  output[20] = x1[10];
  output[21] = x1[42];
  output[22] = x1[26];
  output[23] = x1[58];
  output[24] = x1[6];
  output[25] = x1[38];
  output[26] = x1[22];
  output[27] = x1[54];
  output[28] = x1[14];
  output[29] = x1[46];
  output[30] = x1[30];
  output[31] = x1[62];
  output[32] = x1[1];
  output[33] = x1[33];
  output[34] = x1[17];
  output[35] = x1[49];
  output[36] = x1[9];
  output[37] = x1[41];
  output[38] = x1[25];
  output[39] = x1[57];
  output[40] = x1[5];
  output[41] = x1[37];
  output[42] = x1[21];
  output[43] = x1[53];
  output[44] = x1[13];
  output[45] = x1[45];
  output[46] = x1[29];
  output[47] = x1[61];
  output[48] = x1[3];
  output[49] = x1[35];
  output[50] = x1[19];
  output[51] = x1[51];
  output[52] = x1[11];
  output[53] = x1[43];
  output[54] = x1[27];
  output[55] = x1[59];
  output[56] = x1[7];
  output[57] = x1[39];
  output[58] = x1[23];
  output[59] = x1[55];
  output[60] = x1[15];
  output[61] = x1[47];
  output[62] = x1[31];
  output[63] = x1[63];
}

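// Forward 32-point DCT on 32-bit intermediates, with 8 int32 lanes per
// __m256i (eight columns per call). Uses the btf_32_* helpers instead of the
// 16-bit butterflies; the output permutation is the same bit-reversed order.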
static inline void fdct32_avx2(const __m256i *input, __m256i *output,
                               int8_t cos_bit) {
  __m256i x1[32];
  const int32_t *cospi = cospi_arr(cos_bit);
  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
  // stage 0
  // stage 1
  btf_32_add_sub_out_avx2(&x1[0], &x1[31], input[0], input[31]);
  btf_32_add_sub_out_avx2(&x1[1], &x1[30], input[1], input[30]);
  btf_32_add_sub_out_avx2(&x1[2], &x1[29], input[2], input[29]);
  btf_32_add_sub_out_avx2(&x1[3], &x1[28], input[3], input[28]);
  btf_32_add_sub_out_avx2(&x1[4], &x1[27], input[4], input[27]);
  btf_32_add_sub_out_avx2(&x1[5], &x1[26], input[5], input[26]);
  btf_32_add_sub_out_avx2(&x1[6], &x1[25], input[6], input[25]);
  btf_32_add_sub_out_avx2(&x1[7], &x1[24], input[7], input[24]);
  btf_32_add_sub_out_avx2(&x1[8], &x1[23], input[8], input[23]);
  btf_32_add_sub_out_avx2(&x1[9], &x1[22], input[9], input[22]);
  btf_32_add_sub_out_avx2(&x1[10], &x1[21], input[10], input[21]);
  btf_32_add_sub_out_avx2(&x1[11], &x1[20], input[11], input[20]);
  btf_32_add_sub_out_avx2(&x1[12], &x1[19], input[12], input[19]);
  btf_32_add_sub_out_avx2(&x1[13], &x1[18], input[13], input[18]);
  btf_32_add_sub_out_avx2(&x1[14], &x1[17], input[14], input[17]);
  btf_32_add_sub_out_avx2(&x1[15], &x1[16], input[15], input[16]);

  // stage 2
  btf_32_add_sub_avx2(&x1[0], &x1[15]);
  btf_32_add_sub_avx2(&x1[1], &x1[14]);
  btf_32_add_sub_avx2(&x1[2], &x1[13]);
  btf_32_add_sub_avx2(&x1[3], &x1[12]);
  btf_32_add_sub_avx2(&x1[4], &x1[11]);
  btf_32_add_sub_avx2(&x1[5], &x1[10]);
  btf_32_add_sub_avx2(&x1[6], &x1[9]);
  btf_32_add_sub_avx2(&x1[7], &x1[8]);
  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[20], &x1[27], _r, cos_bit);
  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[21], &x1[26], _r, cos_bit);
  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[22], &x1[25], _r, cos_bit);
  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[23], &x1[24], _r, cos_bit);

  // stage 3
  btf_32_add_sub_avx2(&x1[0], &x1[7]);
  btf_32_add_sub_avx2(&x1[1], &x1[6]);
  btf_32_add_sub_avx2(&x1[2], &x1[5]);
  btf_32_add_sub_avx2(&x1[3], &x1[4]);
  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[10], &x1[13], _r, cos_bit);
  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[11], &x1[12], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[16], &x1[23]);
  btf_32_add_sub_avx2(&x1[17], &x1[22]);
  btf_32_add_sub_avx2(&x1[18], &x1[21]);
  btf_32_add_sub_avx2(&x1[19], &x1[20]);
  btf_32_add_sub_avx2(&x1[31], &x1[24]);
  btf_32_add_sub_avx2(&x1[30], &x1[25]);
  btf_32_add_sub_avx2(&x1[29], &x1[26]);
  btf_32_add_sub_avx2(&x1[28], &x1[27]);

  // stage 4
  btf_32_add_sub_avx2(&x1[0], &x1[3]);
  btf_32_add_sub_avx2(&x1[1], &x1[2]);
  btf_32_avx2_type0(-cospi[32], cospi[32], &x1[5], &x1[6], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[8], &x1[11]);
  btf_32_add_sub_avx2(&x1[9], &x1[10]);
  btf_32_add_sub_avx2(&x1[15], &x1[12]);
  btf_32_add_sub_avx2(&x1[14], &x1[13]);
  btf_32_avx2_type0(-cospi[16], cospi[48], &x1[18], &x1[29], _r, cos_bit);
  btf_32_avx2_type0(-cospi[16], cospi[48], &x1[19], &x1[28], _r, cos_bit);
  btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[20], &x1[27], _r, cos_bit);
  btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[21], &x1[26], _r, cos_bit);

  // stage 5
  btf_32_avx2_type0(cospi[32], cospi[32], &x1[0], &x1[1], _r, cos_bit);
  btf_32_avx2_type1(cospi[48], cospi[16], &x1[2], &x1[3], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[4], &x1[5]);
  btf_32_add_sub_avx2(&x1[7], &x1[6]);
  btf_32_avx2_type0(-cospi[16], cospi[48], &x1[9], &x1[14], _r, cos_bit);
  btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[10], &x1[13], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[16], &x1[19]);
  btf_32_add_sub_avx2(&x1[17], &x1[18]);
  btf_32_add_sub_avx2(&x1[23], &x1[20]);
  btf_32_add_sub_avx2(&x1[22], &x1[21]);
  btf_32_add_sub_avx2(&x1[24], &x1[27]);
  btf_32_add_sub_avx2(&x1[25], &x1[26]);
  btf_32_add_sub_avx2(&x1[31], &x1[28]);
  btf_32_add_sub_avx2(&x1[30], &x1[29]);

  // stage 6
  btf_32_avx2_type1(cospi[56], cospi[8], &x1[4], &x1[7], _r, cos_bit);
  btf_32_avx2_type1(cospi[24], cospi[40], &x1[5], &x1[6], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[8], &x1[9]);
  btf_32_add_sub_avx2(&x1[11], &x1[10]);
  btf_32_add_sub_avx2(&x1[12], &x1[13]);
  btf_32_add_sub_avx2(&x1[15], &x1[14]);
  btf_32_avx2_type0(-cospi[8], cospi[56], &x1[17], &x1[30], _r, cos_bit);
  btf_32_avx2_type0(-cospi[56], -cospi[8], &x1[18], &x1[29], _r, cos_bit);
  btf_32_avx2_type0(-cospi[40], cospi[24], &x1[21], &x1[26], _r, cos_bit);
  btf_32_avx2_type0(-cospi[24], -cospi[40], &x1[22], &x1[25], _r, cos_bit);

  // stage 7
  btf_32_avx2_type1(cospi[60], cospi[4], &x1[8], &x1[15], _r, cos_bit);
  btf_32_avx2_type1(cospi[28], cospi[36], &x1[9], &x1[14], _r, cos_bit);
  btf_32_avx2_type1(cospi[44], cospi[20], &x1[10], &x1[13], _r, cos_bit);
  btf_32_avx2_type1(cospi[12], cospi[52], &x1[11], &x1[12], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[16], &x1[17]);
  btf_32_add_sub_avx2(&x1[19], &x1[18]);
  btf_32_add_sub_avx2(&x1[20], &x1[21]);
  btf_32_add_sub_avx2(&x1[23], &x1[22]);
  btf_32_add_sub_avx2(&x1[24], &x1[25]);
  btf_32_add_sub_avx2(&x1[27], &x1[26]);
  btf_32_add_sub_avx2(&x1[28], &x1[29]);
  btf_32_add_sub_avx2(&x1[31], &x1[30]);

  // stage 8
  btf_32_avx2_type1(cospi[62], cospi[2], &x1[16], &x1[31], _r, cos_bit);
  btf_32_avx2_type1(cospi[30], cospi[34], &x1[17], &x1[30], _r, cos_bit);
  btf_32_avx2_type1(cospi[46], cospi[18], &x1[18], &x1[29], _r, cos_bit);
  btf_32_avx2_type1(cospi[14], cospi[50], &x1[19], &x1[28], _r, cos_bit);
  btf_32_avx2_type1(cospi[54], cospi[10], &x1[20], &x1[27], _r, cos_bit);
  btf_32_avx2_type1(cospi[22], cospi[42], &x1[21], &x1[26], _r, cos_bit);
  btf_32_avx2_type1(cospi[38], cospi[26], &x1[22], &x1[25], _r, cos_bit);
  btf_32_avx2_type1(cospi[6], cospi[58], &x1[23], &x1[24], _r, cos_bit);

  // stage 9
  output[0] = x1[0];
  output[1] = x1[16];
  output[2] = x1[8];
  output[3] = x1[24];
  output[4] = x1[4];
  output[5] = x1[20];
  output[6] = x1[12];
  output[7] = x1[28];
  output[8] = x1[2];
  output[9] = x1[18];
  output[10] = x1[10];
  output[11] = x1[26];
  output[12] = x1[6];
  output[13] = x1[22];
  output[14] = x1[14];
  output[15] = x1[30];
  output[16] = x1[1];
  output[17] = x1[17];
  output[18] = x1[9];
  output[19] = x1[25];
  output[20] = x1[5];
  output[21] = x1[21];
  output[22] = x1[13];
  output[23] = x1[29];
  output[24] = x1[3];
  output[25] = x1[19];
  output[26] = x1[11];
  output[27] = x1[27];
  output[28] = x1[7];
  output[29] = x1[23];
  output[30] = x1[15];
  output[31] = x1[31];
}

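// Forward 64-point DCT on 32-bit intermediates, eight columns per call. The
// cosine factors are broadcast once as 32-bit splats and consumed by the
// btf_32_avx2_type0_new/type1_new butterflies.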
static inline void fdct64_new_avx2(const __m256i *input, __m256i *output,
                                   int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));

  __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
  __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
  __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
  __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
  __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
  __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
  __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
  __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
  __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
  __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
  __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
  __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
  __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
  __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
  __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
  __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
  __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
  __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
  __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
  __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
  __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
  __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
  __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
  __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
  __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
  __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
  __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
  __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
  __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
  __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);
  __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
  __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
  __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
  __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
  __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
  __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
  __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
  __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
  __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
  __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
  __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
  __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
  __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
  __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
  __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
  __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
  __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
  __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
  __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
  __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]);
  __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
  __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
  __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
  __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]);
  __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
  __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
  __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
  __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]);
  __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
  __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
  __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
  __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]);
  __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
  __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
  __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
  __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]);
  __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
  __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
  __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
  __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]);
  __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
  __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
  __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
  __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]);
  __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
  __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
  __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
  __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]);

  // stage 1
  __m256i x1[64];
  btf_32_add_sub_out_avx2(&x1[0], &x1[63], input[0], input[63]);
  btf_32_add_sub_out_avx2(&x1[1], &x1[62], input[1], input[62]);
  btf_32_add_sub_out_avx2(&x1[2], &x1[61], input[2], input[61]);
  btf_32_add_sub_out_avx2(&x1[3], &x1[60], input[3], input[60]);
  btf_32_add_sub_out_avx2(&x1[4], &x1[59], input[4], input[59]);
  btf_32_add_sub_out_avx2(&x1[5], &x1[58], input[5], input[58]);
  btf_32_add_sub_out_avx2(&x1[6], &x1[57], input[6], input[57]);
  btf_32_add_sub_out_avx2(&x1[7], &x1[56], input[7], input[56]);
  btf_32_add_sub_out_avx2(&x1[8], &x1[55], input[8], input[55]);
  btf_32_add_sub_out_avx2(&x1[9], &x1[54], input[9], input[54]);
  btf_32_add_sub_out_avx2(&x1[10], &x1[53], input[10], input[53]);
  btf_32_add_sub_out_avx2(&x1[11], &x1[52], input[11], input[52]);
  btf_32_add_sub_out_avx2(&x1[12], &x1[51], input[12], input[51]);
  btf_32_add_sub_out_avx2(&x1[13], &x1[50], input[13], input[50]);
  btf_32_add_sub_out_avx2(&x1[14], &x1[49], input[14], input[49]);
  btf_32_add_sub_out_avx2(&x1[15], &x1[48], input[15], input[48]);
  btf_32_add_sub_out_avx2(&x1[16], &x1[47], input[16], input[47]);
  btf_32_add_sub_out_avx2(&x1[17], &x1[46], input[17], input[46]);
  btf_32_add_sub_out_avx2(&x1[18], &x1[45], input[18], input[45]);
  btf_32_add_sub_out_avx2(&x1[19], &x1[44], input[19], input[44]);
  btf_32_add_sub_out_avx2(&x1[20], &x1[43], input[20], input[43]);
  btf_32_add_sub_out_avx2(&x1[21], &x1[42], input[21], input[42]);
  btf_32_add_sub_out_avx2(&x1[22], &x1[41], input[22], input[41]);
  btf_32_add_sub_out_avx2(&x1[23], &x1[40], input[23], input[40]);
  btf_32_add_sub_out_avx2(&x1[24], &x1[39], input[24], input[39]);
  btf_32_add_sub_out_avx2(&x1[25], &x1[38], input[25], input[38]);
  btf_32_add_sub_out_avx2(&x1[26], &x1[37], input[26], input[37]);
  btf_32_add_sub_out_avx2(&x1[27], &x1[36], input[27], input[36]);
  btf_32_add_sub_out_avx2(&x1[28], &x1[35], input[28], input[35]);
  btf_32_add_sub_out_avx2(&x1[29], &x1[34], input[29], input[34]);
  btf_32_add_sub_out_avx2(&x1[30], &x1[33], input[30], input[33]);
  btf_32_add_sub_out_avx2(&x1[31], &x1[32], input[31], input[32]);

  // stage 2
  btf_32_add_sub_avx2(&x1[0], &x1[31]);
  btf_32_add_sub_avx2(&x1[1], &x1[30]);
  btf_32_add_sub_avx2(&x1[2], &x1[29]);
  btf_32_add_sub_avx2(&x1[3], &x1[28]);
  btf_32_add_sub_avx2(&x1[4], &x1[27]);
  btf_32_add_sub_avx2(&x1[5], &x1[26]);
  btf_32_add_sub_avx2(&x1[6], &x1[25]);
  btf_32_add_sub_avx2(&x1[7], &x1[24]);
  btf_32_add_sub_avx2(&x1[8], &x1[23]);
  btf_32_add_sub_avx2(&x1[9], &x1[22]);
  btf_32_add_sub_avx2(&x1[10], &x1[21]);
  btf_32_add_sub_avx2(&x1[11], &x1[20]);
  btf_32_add_sub_avx2(&x1[12], &x1[19]);
  btf_32_add_sub_avx2(&x1[13], &x1[18]);
  btf_32_add_sub_avx2(&x1[14], &x1[17]);
  btf_32_add_sub_avx2(&x1[15], &x1[16]);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[40], &x1[55], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[41], &x1[54], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[42], &x1[53], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[43], &x1[52], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[44], &x1[51], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[45], &x1[50], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[46], &x1[49], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[47], &x1[48], _r, cos_bit);

  // stage 3
  btf_32_add_sub_avx2(&x1[0], &x1[15]);
  btf_32_add_sub_avx2(&x1[1], &x1[14]);
  btf_32_add_sub_avx2(&x1[2], &x1[13]);
  btf_32_add_sub_avx2(&x1[3], &x1[12]);
  btf_32_add_sub_avx2(&x1[4], &x1[11]);
  btf_32_add_sub_avx2(&x1[5], &x1[10]);
  btf_32_add_sub_avx2(&x1[6], &x1[9]);
  btf_32_add_sub_avx2(&x1[7], &x1[8]);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[20], &x1[27], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[21], &x1[26], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[22], &x1[25], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[23], &x1[24], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[32], &x1[47]);
  btf_32_add_sub_avx2(&x1[33], &x1[46]);
  btf_32_add_sub_avx2(&x1[34], &x1[45]);
  btf_32_add_sub_avx2(&x1[35], &x1[44]);
  btf_32_add_sub_avx2(&x1[36], &x1[43]);
  btf_32_add_sub_avx2(&x1[37], &x1[42]);
  btf_32_add_sub_avx2(&x1[38], &x1[41]);
  btf_32_add_sub_avx2(&x1[39], &x1[40]);
  btf_32_add_sub_avx2(&x1[63], &x1[48]);
  btf_32_add_sub_avx2(&x1[62], &x1[49]);
  btf_32_add_sub_avx2(&x1[61], &x1[50]);
  btf_32_add_sub_avx2(&x1[60], &x1[51]);
  btf_32_add_sub_avx2(&x1[59], &x1[52]);
  btf_32_add_sub_avx2(&x1[58], &x1[53]);
  btf_32_add_sub_avx2(&x1[57], &x1[54]);
  btf_32_add_sub_avx2(&x1[56], &x1[55]);

  // stage 4
  btf_32_add_sub_avx2(&x1[0], &x1[7]);
  btf_32_add_sub_avx2(&x1[1], &x1[6]);
  btf_32_add_sub_avx2(&x1[2], &x1[5]);
  btf_32_add_sub_avx2(&x1[3], &x1[4]);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[10], &x1[13], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[11], &x1[12], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[16], &x1[23]);
  btf_32_add_sub_avx2(&x1[17], &x1[22]);
  btf_32_add_sub_avx2(&x1[18], &x1[21]);
  btf_32_add_sub_avx2(&x1[19], &x1[20]);
  btf_32_add_sub_avx2(&x1[31], &x1[24]);
  btf_32_add_sub_avx2(&x1[30], &x1[25]);
  btf_32_add_sub_avx2(&x1[29], &x1[26]);
  btf_32_add_sub_avx2(&x1[28], &x1[27]);
  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[36], &x1[59], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[37], &x1[58], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[38], &x1[57], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[39], &x1[56], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[40], &x1[55], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[41], &x1[54], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[42], &x1[53], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[43], &x1[52], _r, cos_bit);

  // stage 5
  btf_32_add_sub_avx2(&x1[0], &x1[3]);
  btf_32_add_sub_avx2(&x1[1], &x1[2]);
  btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[5], &x1[6], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[8], &x1[11]);
  btf_32_add_sub_avx2(&x1[9], &x1[10]);
  btf_32_add_sub_avx2(&x1[15], &x1[12]);
  btf_32_add_sub_avx2(&x1[14], &x1[13]);
  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[18], &x1[29], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[19], &x1[28], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[20], &x1[27], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[21], &x1[26], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[32], &x1[39]);
  btf_32_add_sub_avx2(&x1[33], &x1[38]);
  btf_32_add_sub_avx2(&x1[34], &x1[37]);
  btf_32_add_sub_avx2(&x1[35], &x1[36]);
  btf_32_add_sub_avx2(&x1[47], &x1[40]);
  btf_32_add_sub_avx2(&x1[46], &x1[41]);
  btf_32_add_sub_avx2(&x1[45], &x1[42]);
  btf_32_add_sub_avx2(&x1[44], &x1[43]);
  btf_32_add_sub_avx2(&x1[48], &x1[55]);
  btf_32_add_sub_avx2(&x1[49], &x1[54]);
  btf_32_add_sub_avx2(&x1[50], &x1[53]);
  btf_32_add_sub_avx2(&x1[51], &x1[52]);
  btf_32_add_sub_avx2(&x1[63], &x1[56]);
  btf_32_add_sub_avx2(&x1[62], &x1[57]);
  btf_32_add_sub_avx2(&x1[61], &x1[58]);
  btf_32_add_sub_avx2(&x1[60], &x1[59]);

  // stage 6
  btf_32_avx2_type0_new(cospi_p32, cospi_p32, &x1[0], &x1[1], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p48, cospi_p16, &x1[2], &x1[3], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[4], &x1[5]);
  btf_32_add_sub_avx2(&x1[7], &x1[6]);
  btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[9], &x1[14], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[10], &x1[13], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[16], &x1[19]);
  btf_32_add_sub_avx2(&x1[17], &x1[18]);
  btf_32_add_sub_avx2(&x1[23], &x1[20]);
  btf_32_add_sub_avx2(&x1[22], &x1[21]);
  btf_32_add_sub_avx2(&x1[24], &x1[27]);
  btf_32_add_sub_avx2(&x1[25], &x1[26]);
  btf_32_add_sub_avx2(&x1[31], &x1[28]);
  btf_32_add_sub_avx2(&x1[30], &x1[29]);
  btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[34], &x1[61], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[35], &x1[60], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[36], &x1[59], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[37], &x1[58], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[42], &x1[53], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[43], &x1[52], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[44], &x1[51], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[45], &x1[50], _r, cos_bit);

  // stage 7
  btf_32_avx2_type1_new(cospi_p56, cospi_p08, &x1[4], &x1[7], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p24, cospi_p40, &x1[5], &x1[6], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[8], &x1[9]);
  btf_32_add_sub_avx2(&x1[11], &x1[10]);
  btf_32_add_sub_avx2(&x1[12], &x1[13]);
  btf_32_add_sub_avx2(&x1[15], &x1[14]);
  btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[17], &x1[30], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[18], &x1[29], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[21], &x1[26], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[22], &x1[25], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[32], &x1[35]);
  btf_32_add_sub_avx2(&x1[33], &x1[34]);
  btf_32_add_sub_avx2(&x1[39], &x1[36]);
  btf_32_add_sub_avx2(&x1[38], &x1[37]);
  btf_32_add_sub_avx2(&x1[40], &x1[43]);
  btf_32_add_sub_avx2(&x1[41], &x1[42]);
  btf_32_add_sub_avx2(&x1[47], &x1[44]);
  btf_32_add_sub_avx2(&x1[46], &x1[45]);
  btf_32_add_sub_avx2(&x1[48], &x1[51]);
  btf_32_add_sub_avx2(&x1[49], &x1[50]);
  btf_32_add_sub_avx2(&x1[55], &x1[52]);
  btf_32_add_sub_avx2(&x1[54], &x1[53]);
  btf_32_add_sub_avx2(&x1[56], &x1[59]);
  btf_32_add_sub_avx2(&x1[57], &x1[58]);
  btf_32_add_sub_avx2(&x1[63], &x1[60]);
  btf_32_add_sub_avx2(&x1[62], &x1[61]);

  // stage 8
  btf_32_avx2_type1_new(cospi_p60, cospi_p04, &x1[8], &x1[15], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p28, cospi_p36, &x1[9], &x1[14], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p44, cospi_p20, &x1[10], &x1[13], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p12, cospi_p52, &x1[11], &x1[12], _r, cos_bit);
  btf_32_add_sub_avx2(&x1[16], &x1[17]);
  btf_32_add_sub_avx2(&x1[19], &x1[18]);
  btf_32_add_sub_avx2(&x1[20], &x1[21]);
  btf_32_add_sub_avx2(&x1[23], &x1[22]);
  btf_32_add_sub_avx2(&x1[24], &x1[25]);
  btf_32_add_sub_avx2(&x1[27], &x1[26]);
  btf_32_add_sub_avx2(&x1[28], &x1[29]);
  btf_32_add_sub_avx2(&x1[31], &x1[30]);
  btf_32_avx2_type0_new(cospi_m04, cospi_p60, &x1[33], &x1[62], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m60, cospi_m04, &x1[34], &x1[61], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m36, cospi_p28, &x1[37], &x1[58], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m28, cospi_m36, &x1[38], &x1[57], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m20, cospi_p44, &x1[41], &x1[54], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m44, cospi_m20, &x1[42], &x1[53], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m52, cospi_p12, &x1[45], &x1[50], _r, cos_bit);
  btf_32_avx2_type0_new(cospi_m12, cospi_m52, &x1[46], &x1[49], _r, cos_bit);

  // stage 9
  btf_32_avx2_type1_new(cospi_p62, cospi_p02, &x1[16], &x1[31], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p30, cospi_p34, &x1[17], &x1[30], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p46, cospi_p18, &x1[18], &x1[29], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p14, cospi_p50, &x1[19], &x1[28], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p54, cospi_p10, &x1[20], &x1[27], _r, cos_bit);
  btf_32_avx2_type1_new(cospi_p22, cospi_p42, &x1[21], &x1[26], _r, cos_bit);
1177 btf_32_avx2_type1_new(cospi_p38, cospi_p26, &x1[22], &x1[25], _r, cos_bit);
1178 btf_32_avx2_type1_new(cospi_p06, cospi_p58, &x1[23], &x1[24], _r, cos_bit);
1179 btf_32_add_sub_avx2(&x1[32], &x1[33]);
1180 btf_32_add_sub_avx2(&x1[35], &x1[34]);
1181 btf_32_add_sub_avx2(&x1[36], &x1[37]);
1182 btf_32_add_sub_avx2(&x1[39], &x1[38]);
1183 btf_32_add_sub_avx2(&x1[40], &x1[41]);
1184 btf_32_add_sub_avx2(&x1[43], &x1[42]);
1185 btf_32_add_sub_avx2(&x1[44], &x1[45]);
1186 btf_32_add_sub_avx2(&x1[47], &x1[46]);
1187 btf_32_add_sub_avx2(&x1[48], &x1[49]);
1188 btf_32_add_sub_avx2(&x1[51], &x1[50]);
1189 btf_32_add_sub_avx2(&x1[52], &x1[53]);
1190 btf_32_add_sub_avx2(&x1[55], &x1[54]);
1191 btf_32_add_sub_avx2(&x1[56], &x1[57]);
1192 btf_32_add_sub_avx2(&x1[59], &x1[58]);
1193 btf_32_add_sub_avx2(&x1[60], &x1[61]);
1194 btf_32_add_sub_avx2(&x1[63], &x1[62]);
1195
1196 // stage 10
1197 btf_32_avx2_type1_new(cospi_p63, cospi_p01, &x1[32], &x1[63], _r, cos_bit);
1198 btf_32_avx2_type1_new(cospi_p31, cospi_p33, &x1[33], &x1[62], _r, cos_bit);
1199 btf_32_avx2_type1_new(cospi_p47, cospi_p17, &x1[34], &x1[61], _r, cos_bit);
1200 btf_32_avx2_type1_new(cospi_p15, cospi_p49, &x1[35], &x1[60], _r, cos_bit);
1201 btf_32_avx2_type1_new(cospi_p55, cospi_p09, &x1[36], &x1[59], _r, cos_bit);
1202 btf_32_avx2_type1_new(cospi_p23, cospi_p41, &x1[37], &x1[58], _r, cos_bit);
1203 btf_32_avx2_type1_new(cospi_p39, cospi_p25, &x1[38], &x1[57], _r, cos_bit);
1204 btf_32_avx2_type1_new(cospi_p07, cospi_p57, &x1[39], &x1[56], _r, cos_bit);
1205 btf_32_avx2_type1_new(cospi_p59, cospi_p05, &x1[40], &x1[55], _r, cos_bit);
1206 btf_32_avx2_type1_new(cospi_p27, cospi_p37, &x1[41], &x1[54], _r, cos_bit);
1207 btf_32_avx2_type1_new(cospi_p43, cospi_p21, &x1[42], &x1[53], _r, cos_bit);
1208 btf_32_avx2_type1_new(cospi_p11, cospi_p53, &x1[43], &x1[52], _r, cos_bit);
1209 btf_32_avx2_type1_new(cospi_p51, cospi_p13, &x1[44], &x1[51], _r, cos_bit);
1210 btf_32_avx2_type1_new(cospi_p19, cospi_p45, &x1[45], &x1[50], _r, cos_bit);
1211 btf_32_avx2_type1_new(cospi_p35, cospi_p29, &x1[46], &x1[49], _r, cos_bit);
1212 btf_32_avx2_type1_new(cospi_p03, cospi_p61, &x1[47], &x1[48], _r, cos_bit);
1213
1214 // stage 11
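  // The 64 outputs are written in bit-reversed index order: output[k] takes
  // x1[] at the 6-bit bit-reversal of k (0, 32, 16, 48, 8, 40, 24, 56, ...).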
1215 output[0] = x1[0];
1216 output[1] = x1[32];
1217 output[2] = x1[16];
1218 output[3] = x1[48];
1219 output[4] = x1[8];
1220 output[5] = x1[40];
1221 output[6] = x1[24];
1222 output[7] = x1[56];
1223 output[8] = x1[4];
1224 output[9] = x1[36];
1225 output[10] = x1[20];
1226 output[11] = x1[52];
1227 output[12] = x1[12];
1228 output[13] = x1[44];
1229 output[14] = x1[28];
1230 output[15] = x1[60];
1231 output[16] = x1[2];
1232 output[17] = x1[34];
1233 output[18] = x1[18];
1234 output[19] = x1[50];
1235 output[20] = x1[10];
1236 output[21] = x1[42];
1237 output[22] = x1[26];
1238 output[23] = x1[58];
1239 output[24] = x1[6];
1240 output[25] = x1[38];
1241 output[26] = x1[22];
1242 output[27] = x1[54];
1243 output[28] = x1[14];
1244 output[29] = x1[46];
1245 output[30] = x1[30];
1246 output[31] = x1[62];
1247 output[32] = x1[1];
1248 output[33] = x1[33];
1249 output[34] = x1[17];
1250 output[35] = x1[49];
1251 output[36] = x1[9];
1252 output[37] = x1[41];
1253 output[38] = x1[25];
1254 output[39] = x1[57];
1255 output[40] = x1[5];
1256 output[41] = x1[37];
1257 output[42] = x1[21];
1258 output[43] = x1[53];
1259 output[44] = x1[13];
1260 output[45] = x1[45];
1261 output[46] = x1[29];
1262 output[47] = x1[61];
1263 output[48] = x1[3];
1264 output[49] = x1[35];
1265 output[50] = x1[19];
1266 output[51] = x1[51];
1267 output[52] = x1[11];
1268 output[53] = x1[43];
1269 output[54] = x1[27];
1270 output[55] = x1[59];
1271 output[56] = x1[7];
1272 output[57] = x1[39];
1273 output[58] = x1[23];
1274 output[59] = x1[55];
1275 output[60] = x1[15];
1276 output[61] = x1[47];
1277 output[62] = x1[31];
1278 output[63] = x1[63];
1279 }
1280
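// 1-D 16-point forward ADST applied to 16 lanes of 16-bit data in parallel
// (one __m256i register per row of a 16x16 block).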
1281 static inline void fadst16x16_new_avx2(const __m256i *input, __m256i *output,
1282 int8_t cos_bit) {
1283 const int32_t *cospi = cospi_arr(cos_bit);
1284 const __m256i __zero = _mm256_setzero_si256();
1285 const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
1286
1287 __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
1288 __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
1289 __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
1290 __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
1291 __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
1292 __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
1293 __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
1294 __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
1295 __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
1296 __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
1297 __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
1298 __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
1299 __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
1300 __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
1301 __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
1302 __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
1303 __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
1304 __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
1305 __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
1306 __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
1307 __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
1308 __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
1309 __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
1310 __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
1311 __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
1312 __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
1313 __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
1314
1315 // stage 1
1316 __m256i x1[16];
1317 x1[0] = input[0];
1318 x1[1] = _mm256_subs_epi16(__zero, input[15]);
1319 x1[2] = _mm256_subs_epi16(__zero, input[7]);
1320 x1[3] = input[8];
1321 x1[4] = _mm256_subs_epi16(__zero, input[3]);
1322 x1[5] = input[12];
1323 x1[6] = input[4];
1324 x1[7] = _mm256_subs_epi16(__zero, input[11]);
1325 x1[8] = _mm256_subs_epi16(__zero, input[1]);
1326 x1[9] = input[14];
1327 x1[10] = input[6];
1328 x1[11] = _mm256_subs_epi16(__zero, input[9]);
1329 x1[12] = input[2];
1330 x1[13] = _mm256_subs_epi16(__zero, input[13]);
1331 x1[14] = _mm256_subs_epi16(__zero, input[5]);
1332 x1[15] = input[10];
1333
1334 // stage 2
1335 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
1336 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
1337 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
1338 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
1339
1340 // stage 3
1341 btf_16_adds_subs_avx2(&x1[0], &x1[2]);
1342 btf_16_adds_subs_avx2(&x1[1], &x1[3]);
1343 btf_16_adds_subs_avx2(&x1[4], &x1[6]);
1344 btf_16_adds_subs_avx2(&x1[5], &x1[7]);
1345 btf_16_adds_subs_avx2(&x1[8], &x1[10]);
1346 btf_16_adds_subs_avx2(&x1[9], &x1[11]);
1347 btf_16_adds_subs_avx2(&x1[12], &x1[14]);
1348 btf_16_adds_subs_avx2(&x1[13], &x1[15]);
1349
1350 // stage 4
1351 btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit);
1352 btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[6], &x1[7], _r, cos_bit);
1353 btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit);
1354 btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[14], &x1[15], _r, cos_bit);
1355
1356 // stage 5
1357 btf_16_adds_subs_avx2(&x1[0], &x1[4]);
1358 btf_16_adds_subs_avx2(&x1[1], &x1[5]);
1359 btf_16_adds_subs_avx2(&x1[2], &x1[6]);
1360 btf_16_adds_subs_avx2(&x1[3], &x1[7]);
1361 btf_16_adds_subs_avx2(&x1[8], &x1[12]);
1362 btf_16_adds_subs_avx2(&x1[9], &x1[13]);
1363 btf_16_adds_subs_avx2(&x1[10], &x1[14]);
1364 btf_16_adds_subs_avx2(&x1[11], &x1[15]);
1365
1366 // stage 6
1367 btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit);
1368 btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x1[10], &x1[11], _r, cos_bit);
1369 btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x1[12], &x1[13], _r, cos_bit);
1370 btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x1[14], &x1[15], _r, cos_bit);
1371
1372 // stage 7
1373 btf_16_adds_subs_avx2(&x1[0], &x1[8]);
1374 btf_16_adds_subs_avx2(&x1[1], &x1[9]);
1375 btf_16_adds_subs_avx2(&x1[2], &x1[10]);
1376 btf_16_adds_subs_avx2(&x1[3], &x1[11]);
1377 btf_16_adds_subs_avx2(&x1[4], &x1[12]);
1378 btf_16_adds_subs_avx2(&x1[5], &x1[13]);
1379 btf_16_adds_subs_avx2(&x1[6], &x1[14]);
1380 btf_16_adds_subs_avx2(&x1[7], &x1[15]);
1381
1382 // stage 8
1383 btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit);
1384 btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit);
1385 btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit);
1386 btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit);
1387 btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit);
1388 btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit);
1389 btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit);
1390 btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit);
1391
1392 // stage 9
1393 output[0] = x1[1];
1394 output[1] = x1[14];
1395 output[2] = x1[3];
1396 output[3] = x1[12];
1397 output[4] = x1[5];
1398 output[5] = x1[10];
1399 output[6] = x1[7];
1400 output[7] = x1[8];
1401 output[8] = x1[9];
1402 output[9] = x1[6];
1403 output[10] = x1[11];
1404 output[11] = x1[4];
1405 output[12] = x1[13];
1406 output[13] = x1[2];
1407 output[14] = x1[15];
1408 output[15] = x1[0];
1409 }
1410
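// 16-point identity transform: each coefficient is scaled by 2 * sqrt(2) with
// rounding (NewSqrt2 is sqrt(2) in Q12 fixed point). Interleaving with 1 lets
// the rounding constant be folded into the multiply-add inside
// scale_round_avx2.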
1411 static inline void fidentity16x16_new_avx2(const __m256i *input,
1412 __m256i *output, int8_t cos_bit) {
1413 (void)cos_bit;
1414 const __m256i one = _mm256_set1_epi16(1);
1415
1416 for (int i = 0; i < 16; ++i) {
1417 const __m256i a_lo = _mm256_unpacklo_epi16(input[i], one);
1418 const __m256i a_hi = _mm256_unpackhi_epi16(input[i], one);
1419 const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2);
1420 const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2);
1421 output[i] = _mm256_packs_epi32(b_lo, b_hi);
1422 }
1423 }
1424
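// 32-point identity transform: AV1 scales IDTX32 coefficients by 4, which is a
// plain left shift by 2 here.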
1425 static inline void fidentity16x32_avx2(const __m256i *input, __m256i *output,
1426 int8_t cos_bit) {
1427 (void)cos_bit;
1428 for (int i = 0; i < 32; ++i) {
1429 output[i] = _mm256_slli_epi16(input[i], 2);
1430 }
1431 }
1432
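// Stores one row of 16 32-bit coefficients per iteration: in1[i] holds columns
// 0-7 and in2[i] holds columns 8-15.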
1433 static inline void store_output_32bit_w16(int32_t *const out,
1434 const __m256i *const in1,
1435 const __m256i *const in2,
1436 const int stride,
1437 const int out_size) {
1438 for (int i = 0; i < out_size; ++i) {
1439 _mm256_store_si256((__m256i *)(out + stride * i), in1[i]);
1440 _mm256_store_si256((__m256i *)(out + stride * i + 8), in2[i]);
1441 }
1442 }
1443
1444 // Store 16 16-bit values per row, sign extended to 32-bit.
1445 static inline void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in,
1446 int32_t *out,
1447 const int stride,
1448 const int out_size) {
1449 for (int i = 0; i < out_size; ++i) {
1450 _mm256_store_si256((__m256i *)(out),
1451 _mm256_cvtepi16_epi32(_mm256_castsi256_si128(in[i])));
1452 _mm256_store_si256(
1453 (__m256i *)(out + 8),
1454 _mm256_cvtepi16_epi32(_mm256_extracti128_si256(in[i], 1)));
1455 out += stride;
1456 }
1457 }
1458
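// Widens 16-bit coefficients to 32 bits while multiplying by sqrt(2)
// (NewSqrt2 in Q12), the extra scaling used for rectangular transform sizes.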
1459 static inline void store_rect_16bit_to_32bit_avx2(const __m256i a,
1460 int32_t *const b) {
1461 const __m256i one = _mm256_set1_epi16(1);
1462 const __m256i a_reorder = _mm256_permute4x64_epi64(a, 0xd8);
1463 const __m256i a_lo = _mm256_unpacklo_epi16(a_reorder, one);
1464 const __m256i a_hi = _mm256_unpackhi_epi16(a_reorder, one);
1465 const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2);
1466 const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2);
1467 _mm256_store_si256((__m256i *)b, b_lo);
1468 _mm256_store_si256((__m256i *)(b + 8), b_hi);
1469 }
1470
1471 static inline void store_rect_buffer_16bit_to_32bit_w16_avx2(
1472 const __m256i *const in, int32_t *const out, const int stride,
1473 const int out_size) {
1474 for (int i = 0; i < out_size; ++i) {
1475 store_rect_16bit_to_32bit_avx2(in[i], out + i * stride);
1476 }
1477 }
1478
1479 typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
1480 int8_t cos_bit);
1481
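// Per-TX_TYPE lookup tables of 1-D column/row transforms. NULL entries mark
// transform types that are not handled by the corresponding block-size path.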
1482 static const transform_1d_avx2 col_txfm16x32_arr[TX_TYPES] = {
1483 fdct16x32_avx2, // DCT_DCT
1484 NULL, // ADST_DCT
1485 NULL, // DCT_ADST
1486 NULL, // ADST_ADST
1487 NULL, // FLIPADST_DCT
1488 NULL, // DCT_FLIPADST
1489 NULL, // FLIPADST_FLIPADST
1490 NULL, // ADST_FLIPADST
1491 NULL, // FLIPADST_ADST
1492 fidentity16x32_avx2, // IDTX
1493 fdct16x32_avx2, // V_DCT
1494 fidentity16x32_avx2, // H_DCT
1495 NULL, // V_ADST
1496 NULL, // H_ADST
1497 NULL, // V_FLIPADST
1498 NULL // H_FLIPADST
1499 };
1500
1501 static const transform_1d_avx2 row_txfm16x32_arr[TX_TYPES] = {
1502 fdct16x32_avx2, // DCT_DCT
1503 NULL, // ADST_DCT
1504 NULL, // DCT_ADST
1505 NULL, // ADST_ADST
1506 NULL, // FLIPADST_DCT
1507 NULL, // DCT_FLIPADST
1508 NULL, // FLIPADST_FLIPADST
1509 NULL, // ADST_FLIPADST
1510 NULL, // FLIPADST_ADST
1511 fidentity16x32_avx2, // IDTX
1512 fidentity16x32_avx2, // V_DCT
1513 fdct16x32_avx2, // H_DCT
1514 NULL, // V_ADST
1515 NULL, // H_ADST
1516 NULL, // V_FLIPADST
1517 NULL // H_FLIPADST
1518 };
1519
1520 static const transform_1d_avx2 col_txfm16x16_arr[TX_TYPES] = {
1521 fdct16x16_new_avx2, // DCT_DCT
1522 fadst16x16_new_avx2, // ADST_DCT
1523 fdct16x16_new_avx2, // DCT_ADST
1524 fadst16x16_new_avx2, // ADST_ADST
1525 fadst16x16_new_avx2, // FLIPADST_DCT
1526 fdct16x16_new_avx2, // DCT_FLIPADST
1527 fadst16x16_new_avx2, // FLIPADST_FLIPADST
1528 fadst16x16_new_avx2, // ADST_FLIPADST
1529 fadst16x16_new_avx2, // FLIPADST_ADST
1530 fidentity16x16_new_avx2, // IDTX
1531 fdct16x16_new_avx2, // V_DCT
1532 fidentity16x16_new_avx2, // H_DCT
1533 fadst16x16_new_avx2, // V_ADST
1534 fidentity16x16_new_avx2, // H_ADST
1535 fadst16x16_new_avx2, // V_FLIPADST
1536 fidentity16x16_new_avx2 // H_FLIPADST
1537 };
1538
1539 static const transform_1d_avx2 row_txfm16x16_arr[TX_TYPES] = {
1540 fdct16x16_new_avx2, // DCT_DCT
1541 fdct16x16_new_avx2, // ADST_DCT
1542 fadst16x16_new_avx2, // DCT_ADST
1543 fadst16x16_new_avx2, // ADST_ADST
1544 fdct16x16_new_avx2, // FLIPADST_DCT
1545 fadst16x16_new_avx2, // DCT_FLIPADST
1546 fadst16x16_new_avx2, // FLIPADST_FLIPADST
1547 fadst16x16_new_avx2, // ADST_FLIPADST
1548 fadst16x16_new_avx2, // FLIPADST_ADST
1549 fidentity16x16_new_avx2, // IDTX
1550 fidentity16x16_new_avx2, // V_DCT
1551 fdct16x16_new_avx2, // H_DCT
1552 fidentity16x16_new_avx2, // V_ADST
1553 fadst16x16_new_avx2, // H_ADST
1554 fidentity16x16_new_avx2, // V_FLIPADST
1555 fadst16x16_new_avx2 // H_FLIPADST
1556 };
1557
1558 static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = {
1559 fdct8x8_new_sse2, // DCT_DCT
1560 fadst8x8_new_sse2, // ADST_DCT
1561 fdct8x8_new_sse2, // DCT_ADST
1562 fadst8x8_new_sse2, // ADST_ADST
1563 fadst8x8_new_sse2, // FLIPADST_DCT
1564 fdct8x8_new_sse2, // DCT_FLIPADST
1565 fadst8x8_new_sse2, // FLIPADST_FLIPADST
1566 fadst8x8_new_sse2, // ADST_FLIPADST
1567 fadst8x8_new_sse2, // FLIPADST_ADST
1568 fidentity8x8_new_sse2, // IDTX
1569 fdct8x8_new_sse2, // V_DCT
1570 fidentity8x8_new_sse2, // H_DCT
1571 fadst8x8_new_sse2, // V_ADST
1572 fidentity8x8_new_sse2, // H_ADST
1573 fadst8x8_new_sse2, // V_FLIPADST
1574 fidentity8x8_new_sse2, // H_FLIPADST
1575 };
1576
1577 static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = {
1578 fdct8x8_new_sse2, // DCT_DCT
1579 fdct8x8_new_sse2, // ADST_DCT
1580 fadst8x8_new_sse2, // DCT_ADST
1581 fadst8x8_new_sse2, // ADST_ADST
1582 fdct8x8_new_sse2, // FLIPADST_DCT
1583 fadst8x8_new_sse2, // DCT_FLIPADST
1584 fadst8x8_new_sse2, // FLIPADST_FLIPADST
1585 fadst8x8_new_sse2, // ADST_FLIPADST
1586 fadst8x8_new_sse2, // FLIPADST_ADST
1587 fidentity8x8_new_sse2, // IDTX
1588 fidentity8x8_new_sse2, // V_DCT
1589 fdct8x8_new_sse2, // H_DCT
1590 fidentity8x8_new_sse2, // V_ADST
1591 fadst8x8_new_sse2, // H_ADST
1592 fidentity8x8_new_sse2, // V_FLIPADST
1593 fadst8x8_new_sse2 // H_FLIPADST
1594 };
1595
1596 static inline void load_buffer_and_round_shift(const int16_t *in, int stride,
1597 __m128i *out, int bit) {
1598 out[0] = _mm_load_si128((const __m128i *)(in + 0 * stride));
1599 out[1] = _mm_load_si128((const __m128i *)(in + 1 * stride));
1600 out[2] = _mm_load_si128((const __m128i *)(in + 2 * stride));
1601 out[3] = _mm_load_si128((const __m128i *)(in + 3 * stride));
1602 out[4] = _mm_load_si128((const __m128i *)(in + 4 * stride));
1603 out[5] = _mm_load_si128((const __m128i *)(in + 5 * stride));
1604 out[6] = _mm_load_si128((const __m128i *)(in + 6 * stride));
1605 out[7] = _mm_load_si128((const __m128i *)(in + 7 * stride));
1606 out[0] = _mm_slli_epi16(out[0], bit);
1607 out[1] = _mm_slli_epi16(out[1], bit);
1608 out[2] = _mm_slli_epi16(out[2], bit);
1609 out[3] = _mm_slli_epi16(out[3], bit);
1610 out[4] = _mm_slli_epi16(out[4], bit);
1611 out[5] = _mm_slli_epi16(out[5], bit);
1612 out[6] = _mm_slli_epi16(out[6], bit);
1613 out[7] = _mm_slli_epi16(out[7], bit);
1614 }
1615
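// Same as load_buffer_and_round_shift(), but loads the rows in reverse order
// to apply the up/down flip needed by FLIPADST column transforms.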
1616 static inline void load_buffer_and_flip_round_shift(const int16_t *in,
1617 int stride, __m128i *out,
1618 int bit) {
1619 out[7] = load_16bit_to_16bit(in + 0 * stride);
1620 out[6] = load_16bit_to_16bit(in + 1 * stride);
1621 out[5] = load_16bit_to_16bit(in + 2 * stride);
1622 out[4] = load_16bit_to_16bit(in + 3 * stride);
1623 out[3] = load_16bit_to_16bit(in + 4 * stride);
1624 out[2] = load_16bit_to_16bit(in + 5 * stride);
1625 out[1] = load_16bit_to_16bit(in + 6 * stride);
1626 out[0] = load_16bit_to_16bit(in + 7 * stride);
1627 out[7] = _mm_slli_epi16(out[7], bit);
1628 out[6] = _mm_slli_epi16(out[6], bit);
1629 out[5] = _mm_slli_epi16(out[5], bit);
1630 out[4] = _mm_slli_epi16(out[4], bit);
1631 out[3] = _mm_slli_epi16(out[3], bit);
1632 out[2] = _mm_slli_epi16(out[2], bit);
1633 out[1] = _mm_slli_epi16(out[1], bit);
1634 out[0] = _mm_slli_epi16(out[0], bit);
1635 }
1636
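// Transposes an 8x8 block of 16-bit values held in b0..b3 (two rows per
// __m256i register), leaving the result in c0..c3.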
1637 #define TRANSPOSE_8X8_AVX2() \
1638 { \
1639 /* aa0: 00 10 01 11 02 12 03 13 | 40 50 41 51 42 52 43 53*/ \
1640 /* aa1: 04 14 05 15 06 16 07 17 | 44 54 45 55 46 56 47 57*/ \
1641 /* aa2: 20 30 21 31 22 32 23 33 | 60 70 61 71 62 72 63 73*/ \
1642 /* aa3: 24 34 25 35 26 36 27 37 | 64 74 65 75 66 76 67 77*/ \
1643 const __m256i aa0 = _mm256_unpacklo_epi16(b0, b1); \
1644 const __m256i aa1 = _mm256_unpackhi_epi16(b0, b1); \
1645 const __m256i aa2 = _mm256_unpacklo_epi16(b2, b3); \
1646 const __m256i aa3 = _mm256_unpackhi_epi16(b2, b3); \
1647 /* Unpack 32 bit elements resulting in: */ \
1648 /* bb0: 00 10 20 30 01 11 21 31 | 40 50 60 70 41 51 61 71*/ \
1649 /* bb1: 02 12 22 32 03 13 23 33 | 42 52 62 72 43 53 63 73*/ \
1650 /* bb2: 04 14 24 34 05 15 25 35 | 44 54 64 74 45 55 65 75*/ \
1651 /* bb3: 06 16 26 36 07 17 27 37 | 46 56 66 76 47 57 67 77*/ \
1652 const __m256i bb0 = _mm256_unpacklo_epi32(aa0, aa2); \
1653 const __m256i bb1 = _mm256_unpackhi_epi32(aa0, aa2); \
1654 const __m256i bb2 = _mm256_unpacklo_epi32(aa1, aa3); \
1655 const __m256i bb3 = _mm256_unpackhi_epi32(aa1, aa3); \
1656 /* c0: 00 10 20 30 40 50 60 70 | 01 11 21 31 41 51 61 71*/ \
1657 /* c1: 02 12 22 32 42 52 62 72 | 03 13 23 33 43 53 63 73*/ \
1658 /* c2: 04 14 24 34 44 54 64 74 | 05 15 25 35 45 55 65 75*/ \
1659 /* c3: 06 16 26 36 46 56 66 76 | 07 17 27 37 47 57 67 77*/ \
1660 c0 = _mm256_permute4x64_epi64(bb0, 0xd8); \
1661 c1 = _mm256_permute4x64_epi64(bb1, 0xd8); \
1662 c2 = _mm256_permute4x64_epi64(bb2, 0xd8); \
1663 c3 = _mm256_permute4x64_epi64(bb3, 0xd8); \
1664 }
1665
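// Rounds, right-shifts, and transposes the 8x8 block, writing the transposed
// rows out in reverse order to realize the left/right flip (lr_flip) case.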
1666 static inline void transpose_round_shift_flip_8x8(__m128i *const in,
1667 __m128i *const out, int bit) {
1668 __m256i c0, c1, c2, c3;
1669 bit = -bit;
1670 const __m256i rounding = _mm256_set1_epi16(1 << (bit - 1));
1671 const __m256i s04 =
1672 _mm256_insertf128_si256(_mm256_castsi128_si256(in[0]), in[4], 0x1);
1673 const __m256i s15 =
1674 _mm256_insertf128_si256(_mm256_castsi128_si256(in[1]), in[5], 0x1);
1675 const __m256i s26 =
1676 _mm256_insertf128_si256(_mm256_castsi128_si256(in[2]), in[6], 0x1);
1677 const __m256i s37 =
1678 _mm256_insertf128_si256(_mm256_castsi128_si256(in[3]), in[7], 0x1);
1679
1680 const __m256i a0 = _mm256_adds_epi16(s04, rounding);
1681 const __m256i a1 = _mm256_adds_epi16(s15, rounding);
1682 const __m256i a2 = _mm256_adds_epi16(s26, rounding);
1683 const __m256i a3 = _mm256_adds_epi16(s37, rounding);
1684
1685 // b0: 00 01 02 03 04 05 06 07 | 40 41 42 43 44 45 46 47
1686 // b1: 10 11 12 13 14 15 16 17 | 50 51 52 53 54 55 56 57
1687 // b2: 20 21 22 23 24 25 26 27 | 60 61 62 63 64 65 66 67
1688 // b3: 30 31 32 33 34 35 36 37 | 70 71 72 73 74 75 76 77
1689 const __m256i b0 = _mm256_srai_epi16(a0, bit);
1690 const __m256i b1 = _mm256_srai_epi16(a1, bit);
1691 const __m256i b2 = _mm256_srai_epi16(a2, bit);
1692 const __m256i b3 = _mm256_srai_epi16(a3, bit);
1693
1694 TRANSPOSE_8X8_AVX2()
1695
1696 // Unpack 64 bit elements resulting in:
1697 // out[7]: 00 10 20 30 40 50 60 70
1698 // out[6]: 01 11 21 31 41 51 61 71
1699 // out[5]: 02 12 22 32 42 52 62 72
1700 // out[4]: 03 13 23 33 43 53 63 73
1701 // out[3]: 04 14 24 34 44 54 64 74
1702 // out[2]: 05 15 25 35 45 55 65 75
1703 // out[1]: 06 16 26 36 46 56 66 76
1704 // out[0]: 07 17 27 37 47 57 67 77
1705 out[7] = _mm256_castsi256_si128(c0);
1706 out[6] = _mm256_extractf128_si256(c0, 1);
1707 out[5] = _mm256_castsi256_si128(c1);
1708 out[4] = _mm256_extractf128_si256(c1, 1);
1709 out[3] = _mm256_castsi256_si128(c2);
1710 out[2] = _mm256_extractf128_si256(c2, 1);
1711 out[1] = _mm256_castsi256_si128(c3);
1712 out[0] = _mm256_extractf128_si256(c3, 1);
1713 }
1714
1715 static inline void transpose_round_shift_8x8(__m128i *const in,
1716 __m128i *const out, int bit) {
1717 __m256i c0, c1, c2, c3;
1718 bit = -bit;
1719 const __m256i rounding = _mm256_set1_epi16(1 << (bit - 1));
1720 const __m256i s04 =
1721 _mm256_insertf128_si256(_mm256_castsi128_si256(in[0]), in[4], 0x1);
1722 const __m256i s15 =
1723 _mm256_insertf128_si256(_mm256_castsi128_si256(in[1]), in[5], 0x1);
1724 const __m256i s26 =
1725 _mm256_insertf128_si256(_mm256_castsi128_si256(in[2]), in[6], 0x1);
1726 const __m256i s37 =
1727 _mm256_insertf128_si256(_mm256_castsi128_si256(in[3]), in[7], 0x1);
1728
1729 const __m256i a0 = _mm256_adds_epi16(s04, rounding);
1730 const __m256i a1 = _mm256_adds_epi16(s15, rounding);
1731 const __m256i a2 = _mm256_adds_epi16(s26, rounding);
1732 const __m256i a3 = _mm256_adds_epi16(s37, rounding);
1733
1734 // b0: 00 01 02 03 04 05 06 07 | 40 41 42 43 44 45 46 47
1735 // b1: 10 11 12 13 14 15 16 17 | 50 51 52 53 54 55 56 57
1736 // b2: 20 21 22 23 24 25 26 27 | 60 61 62 63 64 65 66 67
1737 // b3: 30 31 32 33 34 35 36 37 | 70 71 72 73 74 75 76 77
1738 const __m256i b0 = _mm256_srai_epi16(a0, bit);
1739 const __m256i b1 = _mm256_srai_epi16(a1, bit);
1740 const __m256i b2 = _mm256_srai_epi16(a2, bit);
1741 const __m256i b3 = _mm256_srai_epi16(a3, bit);
1742
1743 TRANSPOSE_8X8_AVX2()
1744 // Unpack 64 bit elements resulting in:
1745 // out[7]: 00 10 20 30 40 50 60 70
1746 // out[6]: 01 11 21 31 41 51 61 71
1747 // out[5]: 02 12 22 32 42 52 62 72
1748 // out[4]: 03 13 23 33 43 53 63 73
1749 // out[3]: 04 14 24 34 44 54 64 74
1750 // out[2]: 05 15 25 35 45 55 65 75
1751 // out[1]: 06 16 26 36 46 56 66 76
1752 // out[0]: 07 17 27 37 47 57 67 77
1753 out[0] = _mm256_castsi256_si128(c0);
1754 out[1] = _mm256_extractf128_si256(c0, 1);
1755 out[2] = _mm256_castsi256_si128(c1);
1756 out[3] = _mm256_extractf128_si256(c1, 1);
1757 out[4] = _mm256_castsi256_si128(c2);
1758 out[5] = _mm256_extractf128_si256(c2, 1);
1759 out[6] = _mm256_castsi256_si128(c3);
1760 out[7] = _mm256_extractf128_si256(c3, 1);
1761 }
1762
1763 static inline void store_buffer_16bit_to_32bit_w8_avx2(const __m128i *const in,
1764 int32_t *const out,
1765 const int stride,
1766 const int out_size) {
1767 for (int i = 0; i < out_size; ++i) {
1768 _mm256_store_si256((__m256i *)(out + i * stride),
1769 _mm256_cvtepi16_epi32(in[i]));
1770 }
1771 }
1772
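// 2-D 8x8 lowbd forward transform: load + shift[0], column transform,
// transpose + shift[1], row transform, then store (shift[2] is expected to be
// zero, see the asserts below).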
1773 static void av1_lowbd_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *output,
1774 int stride, TX_TYPE tx_type, int bd) {
1775 (void)bd;
1776 __m128i buf0[8], buf1[8], *buf;
1777 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
1778 const int txw_idx = get_txw_idx(TX_8X8);
1779 const int txh_idx = get_txh_idx(TX_8X8);
1780 const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
1781 const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
1782 const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
1783 const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
1784 int ud_flip, lr_flip;
1785
1786 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
1787 // The check on the sign of the shift value is skipped during the round shift,
1788 // since shift[0] is assumed to always be positive.
1789 assert(shift[0] > 0);
1790 if (ud_flip)
1791 load_buffer_and_flip_round_shift(input, stride, buf0, shift[0]);
1792 else
1793 load_buffer_and_round_shift(input, stride, buf0, shift[0]);
1794
1795 col_txfm(buf0, buf0, cos_bit_col);
1796 // The check on the sign of the shift value is skipped during the round shift,
1797 // since shift[1] is assumed to always be negative.
1798 assert(shift[1] < 0);
1799
1800 if (lr_flip) {
1801 transpose_round_shift_flip_8x8(buf0, buf1, shift[1]);
1802 } else {
1803 transpose_round_shift_8x8(buf0, buf1, shift[1]);
1804 }
1805
1806 buf = buf1;
1807 row_txfm(buf, buf, cos_bit_row);
1808
1809 // The round-and-shift step is skipped here since shift[2] is assumed to
1810 // always be zero.
1811 assert(shift[2] == 0);
1812 store_buffer_16bit_to_32bit_w8_avx2(buf, output, 8, 8);
1813 }
1814
1815 static void lowbd_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *output,
1816 int stride, TX_TYPE tx_type, int bd) {
1817 (void)bd;
1818 const TX_SIZE tx_size = TX_16X16;
1819 __m256i buf0[16], buf1[16];
1820 const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
1821 const int txw_idx = get_txw_idx(tx_size);
1822 const int txh_idx = get_txh_idx(tx_size);
1823 const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
1824 const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
1825 const int width = tx_size_wide[tx_size];
1826 const int height = tx_size_high[tx_size];
1827 const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
1828 const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
1829 int ud_flip, lr_flip;
1830
1831 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
1832 const int32_t i = 0;
1833 if (ud_flip) {
1834 load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, height);
1835 } else {
1836 load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
1837 }
1838 round_shift_16bit_w16_avx2(buf0, height, shift[0]);
1839 col_txfm(buf0, buf0, cos_bit_col);
1840 round_shift_16bit_w16_avx2(buf0, height, shift[1]);
1841 transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i);
1842
1843 __m256i *buf;
1844 if (lr_flip) {
1845 buf = buf0;
1846 flip_buf_avx2(buf1 + width * i, buf, width);
1847 } else {
1848 buf = buf1 + width * i;
1849 }
1850 row_txfm(buf, buf, cos_bit_row);
1851 round_shift_16bit_w16_avx2(buf, width, shift[2]);
1852 store_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height, width);
1853 }
1854
1855 static void lowbd_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
1856 int stride, TX_TYPE tx_type, int bd) {
1857 (void)bd;
1858 const TX_SIZE tx_size = TX_32X32;
1859 __m256i buf0[32], buf1[128];
1860 const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
1861 const int txw_idx = get_txw_idx(tx_size);
1862 const int txh_idx = get_txh_idx(tx_size);
1863 const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
1864 const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
1865 const int width = tx_size_wide[tx_size];
1866 const int height = tx_size_high[tx_size];
1867 const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
1868 const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
1869
1870 int ud_flip, lr_flip;
1871 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
1872
1873 for (int i = 0; i < 2; i++) {
1874 if (ud_flip) {
1875 load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0,
1876 height);
1877 } else {
1878 load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
1879 }
1880 round_shift_16bit_w16_avx2(buf0, height, shift[0]);
1881 col_txfm(buf0, buf0, cos_bit_col);
1882 round_shift_16bit_w16_avx2(buf0, height, shift[1]);
1883 transpose_16bit_16x16_avx2(buf0 + 0 * 16, buf1 + 0 * width + 16 * i);
1884 transpose_16bit_16x16_avx2(buf0 + 1 * 16, buf1 + 1 * width + 16 * i);
1885 }
1886
1887 for (int i = 0; i < 2; i++) {
1888 __m256i *buf;
1889 if (lr_flip) {
1890 buf = buf0;
1891 flip_buf_avx2(buf1 + width * i, buf, width);
1892 } else {
1893 buf = buf1 + width * i;
1894 }
1895 row_txfm(buf, buf, cos_bit_row);
1896 round_shift_16bit_w16_avx2(buf, width, shift[2]);
1897 store_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height, width);
1898 }
1899 }
1900
1901 static void lowbd_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output,
1902 int stride, TX_TYPE tx_type, int bd) {
1903 (void)bd;
1904 (void)tx_type;
1905 assert(tx_type == DCT_DCT);
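  // Only the lowest 32x32 block of frequency coefficients is computed and
  // stored; AV1 keeps just that corner of the 64-point transform outputs.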
1906 const TX_SIZE tx_size = TX_64X64;
1907 __m256i buf0[64], buf1[256];
1908 const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
1909 const int txw_idx = get_txw_idx(tx_size);
1910 const int txh_idx = get_txh_idx(tx_size);
1911 const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
1912 const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
1913 const int width = tx_size_wide[tx_size];
1914 const int height = tx_size_high[tx_size];
1915 const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
1916 const int width_div16 = (width >> 4);
1917 const int height_div16 = (height >> 4);
1918
1919 for (int i = 0; i < width_div16; i++) {
1920 load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
1921 round_shift_16bit_w16_avx2(buf0, height, shift[0]);
1922 col_txfm(buf0, buf0, cos_bit_col);
1923 round_shift_16bit_w16_avx2(buf0, height, shift[1]);
1924 for (int j = 0; j < AOMMIN(2, height_div16); ++j) {
1925 transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
1926 }
1927 }
1928
1929 for (int i = 0; i < AOMMIN(2, height_div16); i++) {
1930 __m256i bufA[64];
1931 __m256i bufB[64];
1932 __m128i *buf = (__m128i *)(buf1 + width * i);
1933 for (int j = 0; j < width; ++j) {
1934 bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
1935 bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
1936 }
1937 fdct64_new_avx2(bufA, bufA, cos_bit_row);
1938 fdct64_new_avx2(bufB, bufB, cos_bit_row);
1939 round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]);
1940 round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]);
1941 store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32);
1942 }
1943 }
1944
1945 static void lowbd_fwd_txfm2d_16x32_avx2(const int16_t *input, int32_t *output,
1946 int stride, TX_TYPE tx_type, int bd) {
1947 (void)bd;
1948 const TX_SIZE tx_size = TX_16X32;
1949 __m256i buf0[32], buf1[32];
1950 const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
1951 const int txw_idx = get_txw_idx(tx_size);
1952 const int txh_idx = get_txh_idx(tx_size);
1953 const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
1954 const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
1955 const int width = tx_size_wide[tx_size];
1956 const int height = tx_size_high[tx_size];
1957 const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
1958 const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
1959
1960 int ud_flip, lr_flip;
1961 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
1962
1963 if (ud_flip) {
1964 load_buffer_16bit_to_16bit_flip_avx2(input, stride, buf0, height);
1965 } else {
1966 load_buffer_16bit_to_16bit_avx2(input, stride, buf0, height);
1967 }
1968 round_shift_16bit_w16_avx2(buf0, height, shift[0]);
1969 col_txfm(buf0, buf0, cos_bit_col);
1970 round_shift_16bit_w16_avx2(buf0, height, shift[1]);
1971 transpose_16bit_16x16_avx2(buf0, buf1);
1972 transpose_16bit_16x16_avx2(buf0 + 16, buf1 + 16);
1973
1974 for (int i = 0; i < 2; i++) {
1975 __m256i *buf;
1976 if (lr_flip) {
1977 buf = buf0;
1978 flip_buf_avx2(buf1 + width * i, buf, width);
1979 } else {
1980 buf = buf1 + width * i;
1981 }
1982 row_txfm(buf, buf, cos_bit_row);
1983 round_shift_16bit_w16_avx2(buf, width, shift[2]);
1984 store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height,
1985 width);
1986 }
1987 }
1988
1989 static void lowbd_fwd_txfm2d_32x16_avx2(const int16_t *input, int32_t *output,
1990 int stride, TX_TYPE tx_type, int bd) {
1991 (void)bd;
1992 __m256i buf0[32], buf1[64];
1993 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
1994 const int txw_idx = get_txw_idx(TX_32X16);
1995 const int txh_idx = get_txh_idx(TX_32X16);
1996 const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
1997 const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
1998 const int width = 32;
1999 const int height = 16;
2000 const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
2001 const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
2002
2003 int ud_flip, lr_flip;
2004 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2005
2006 for (int i = 0; i < 2; i++) {
2007 if (ud_flip) {
2008 load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0,
2009 height);
2010 } else {
2011 load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
2012 }
2013 round_shift_16bit_w16_avx2(buf0, height, shift[0]);
2014 col_txfm(buf0, buf0, cos_bit_col);
2015 round_shift_16bit_w16_avx2(buf0, height, shift[1]);
2016 transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i);
2017 }
2018
2019 __m256i *buf;
2020 if (lr_flip) {
2021 buf = buf0;
2022 flip_buf_avx2(buf1, buf, width);
2023 } else {
2024 buf = buf1;
2025 }
2026 row_txfm(buf, buf, cos_bit_row);
2027 round_shift_16bit_w16_avx2(buf, width, shift[2]);
2028 store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output, height, width);
2029 }
2030
2031 static void lowbd_fwd_txfm2d_64x32_avx2(const int16_t *input, int32_t *output,
2032 int stride, TX_TYPE tx_type, int bd) {
2033 (void)bd;
2034 const TX_SIZE tx_size = TX_64X32;
2035 __m256i buf0[64], buf1[256];
2036 const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
2037 const int txw_idx = get_txw_idx(tx_size);
2038 const int txh_idx = get_txh_idx(tx_size);
2039 const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2040 const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2041 const int width = tx_size_wide[tx_size];
2042 const int height = tx_size_high[tx_size];
2043 const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
2044 const int width_div16 = (width >> 4);
2045 const int height_div16 = (height >> 4);
2046
2047 for (int i = 0; i < width_div16; i++) {
2048 load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
2049 round_shift_16bit_w16_avx2(buf0, height, shift[0]);
2050 col_txfm(buf0, buf0, cos_bit_col);
2051 round_shift_16bit_w16_avx2(buf0, height, shift[1]);
2052 for (int j = 0; j < AOMMIN(4, height_div16); ++j) {
2053 transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
2054 }
2055 }
2056 assert(tx_type == DCT_DCT);
2057 for (int i = 0; i < AOMMIN(2, height_div16); i++) {
2058 __m256i bufA[64];
2059 __m256i bufB[64];
2060 __m128i *buf = (__m128i *)(buf1 + width * i);
2061 for (int j = 0; j < width; ++j) {
2062 bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
2063 bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
2064 }
2065 fdct64_new_avx2(bufA, bufA, cos_bit_row);
2066 fdct64_new_avx2(bufB, bufB, cos_bit_row);
2067 round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
2068 round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
2069
2070 store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32);
2071 }
2072 }
2073
2074 static void lowbd_fwd_txfm2d_32x64_avx2(const int16_t *input, int32_t *output,
2075 int stride, TX_TYPE tx_type, int bd) {
2076 (void)bd;
2077 (void)tx_type;
2078 assert(tx_type == DCT_DCT);
2079 const TX_SIZE tx_size = TX_32X64;
2080 __m256i buf0[64], buf1[256];
2081 const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
2082 const int txw_idx = get_txw_idx(tx_size);
2083 const int txh_idx = get_txh_idx(tx_size);
2084 const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2085 const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2086 const int width = tx_size_wide[tx_size];
2087 const int height = tx_size_high[tx_size];
2088 const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
2089 const int width_div16 = (width >> 4);
2090 const int height_div16 = (height >> 4);
2091
2092 for (int i = 0; i < width_div16; i++) {
2093 load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
2094 round_shift_16bit_w16_avx2(buf0, height, shift[0]);
2095 col_txfm(buf0, buf0, cos_bit_col);
2096 round_shift_16bit_w16_avx2(buf0, height, shift[1]);
2097 for (int j = 0; j < AOMMIN(2, height_div16); ++j) {
2098 transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
2099 }
2100 }
2101
2102 for (int i = 0; i < AOMMIN(2, height_div16); i++) {
2103 __m256i bufA[32];
2104 __m256i bufB[32];
2105 __m128i *buf = (__m128i *)(buf1 + width * i);
2106 for (int j = 0; j < width; ++j) {
2107 bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
2108 bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
2109 }
2110 fdct32_avx2(bufA, bufA, cos_bit_row);
2111 fdct32_avx2(bufB, bufB, cos_bit_row);
2112 round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
2113 round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
2114
2115 store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32);
2116 }
2117 }
2118
2119 static void lowbd_fwd_txfm2d_16x64_avx2(const int16_t *input, int32_t *output,
2120 int stride, TX_TYPE tx_type, int bd) {
2121 (void)bd;
2122 (void)tx_type;
2123 assert(tx_type == DCT_DCT);
2124 const TX_SIZE tx_size = TX_16X64;
2125 __m256i buf0[64], buf1[64];
2126 const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
2127 const int txw_idx = get_txw_idx(tx_size);
2128 const int txh_idx = get_txh_idx(tx_size);
2129 const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2130 const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2131 const int width = tx_size_wide[tx_size];
2132 const int height = tx_size_high[tx_size];
2133 const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
2134 const transform_1d_avx2 row_txfm = fdct16x16_new_avx2;
2135 const int width_div16 = (width >> 4);
2136 const int height_div16 = (height >> 4);
2137
2138 for (int i = 0; i < width_div16; i++) {
2139 load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
2140 round_shift_16bit_w16_avx2(buf0, height, shift[0]);
2141 col_txfm(buf0, buf0, cos_bit_col);
2142 round_shift_16bit_w16_avx2(buf0, height, shift[1]);
2143 for (int j = 0; j < height_div16; ++j) {
2144 transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
2145 }
2146 }
2147
2148 for (int i = 0; i < AOMMIN(2, height_div16); i++) {
2149 __m256i *buf = buf1 + width * i;
2150 row_txfm(buf, buf, cos_bit_row);
2151 round_shift_16bit_w16_avx2(buf, width, shift[2]);
2152 store_buffer_16bit_to_32bit_w16_avx2(buf, output + width * i, 32, width);
2153 }
2154 }
2155
2156 static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output,
2157 int stride, TX_TYPE tx_type, int bd) {
2158 (void)bd;
2159 (void)tx_type;
2160 assert(tx_type == DCT_DCT);
2161 const TX_SIZE tx_size = TX_64X16;
2162 __m256i buf0[64], buf1[64];
2163 const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
2164 const int txw_idx = get_txw_idx(tx_size);
2165 const int txh_idx = get_txh_idx(tx_size);
2166 const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2167 const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2168 const int width = tx_size_wide[tx_size];
2169 const int height = tx_size_high[tx_size];
2170 const transform_1d_avx2 col_txfm = fdct16x16_new_avx2;
2171 const transform_1d_avx2 row_txfm = fdct16x64_new_avx2;
2172 const int width_div16 = (width >> 4);
2173 const int height_div16 = (height >> 4);
2174
2175 for (int i = 0; i < width_div16; i++) {
2176 load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
2177 round_shift_16bit_w16_avx2(buf0, height, shift[0]);
2178 col_txfm(buf0, buf0, cos_bit_col);
2179 round_shift_16bit_w16_avx2(buf0, height, shift[1]);
2180 for (int j = 0; j < height_div16; ++j) {
2181 transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
2182 }
2183 }
2184
2185 for (int i = 0; i < height_div16; i++) {
2186 __m256i *buf = buf1 + width * i;
2187 row_txfm(buf, buf, cos_bit_row);
2188 round_shift_16bit_w16_avx2(buf, width, shift[2]);
2189 store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * i, 16, 32);
2190 }
2191 // Zero out the bottom 16x32 area.
2192 memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
2193 }
2194
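// 16-bit butterfly helper: interleaves in0/in1 into pairs, takes dot products
// with the weight pairs w0 and w1, rounds and shifts by cos_bit, then packs
// the 32-bit results back to 16 bits split across four 128-bit outputs.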
2195 static inline void btf_16_avx2(__m256i *w0, __m256i *w1, __m256i *in0,
2196 __m256i *in1, __m128i *out0, __m128i *out1,
2197 __m128i *out2, __m128i *out3,
2198 const __m256i *__rounding, int8_t *cos_bit) {
2199 __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1);
2200 __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1);
2201 __m256i u0 = _mm256_madd_epi16(t0, *w0);
2202 __m256i u1 = _mm256_madd_epi16(t1, *w0);
2203 __m256i v0 = _mm256_madd_epi16(t0, *w1);
2204 __m256i v1 = _mm256_madd_epi16(t1, *w1);
2205
2206 __m256i a0 = _mm256_add_epi32(u0, *__rounding);
2207 __m256i a1 = _mm256_add_epi32(u1, *__rounding);
2208 __m256i b0 = _mm256_add_epi32(v0, *__rounding);
2209 __m256i b1 = _mm256_add_epi32(v1, *__rounding);
2210
2211 __m256i c0 = _mm256_srai_epi32(a0, *cos_bit);
2212 __m256i c1 = _mm256_srai_epi32(a1, *cos_bit);
2213 __m256i d0 = _mm256_srai_epi32(b0, *cos_bit);
2214 __m256i d1 = _mm256_srai_epi32(b1, *cos_bit);
2215
2216 __m256i temp0 = _mm256_packs_epi32(c0, c1);
2217 __m256i temp1 = _mm256_packs_epi32(d0, d1);
2218
2219 *out0 = _mm256_castsi256_si128(temp0);
2220 *out1 = _mm256_castsi256_si128(temp1);
2221 *out2 = _mm256_extracti128_si256(temp0, 0x01);
2222 *out3 = _mm256_extracti128_si256(temp1, 0x01);
2223 }
2224
2225 static inline void fdct8x8_new_avx2(const __m256i *input, __m256i *output,
2226 int8_t cos_bit) {
2227 const int32_t *cospi = cospi_arr(cos_bit);
2228 const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
2229
2230 __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
2231 __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
2232 __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
2233 __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
2234 __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
2235 __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
2236 __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
2237 __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
2238 __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
2239
2240 // stage 1
2241 __m256i x1[8];
2242 x1[0] = _mm256_adds_epi16(input[0], input[7]);
2243 x1[7] = _mm256_subs_epi16(input[0], input[7]);
2244 x1[1] = _mm256_adds_epi16(input[1], input[6]);
2245 x1[6] = _mm256_subs_epi16(input[1], input[6]);
2246 x1[2] = _mm256_adds_epi16(input[2], input[5]);
2247 x1[5] = _mm256_subs_epi16(input[2], input[5]);
2248 x1[3] = _mm256_adds_epi16(input[3], input[4]);
2249 x1[4] = _mm256_subs_epi16(input[3], input[4]);
2250
2251 // stage 2
2252 __m256i x2[8];
2253 x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
2254 x2[3] = _mm256_subs_epi16(x1[0], x1[3]);
2255 x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
2256 x2[2] = _mm256_subs_epi16(x1[1], x1[2]);
2257 x2[4] = x1[4];
2258 btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], __rounding,
2259 cos_bit);
2260 x2[5] = x1[5];
2261 x2[6] = x1[6];
2262 x2[7] = x1[7];
2263
2264 // stage 3
2265 __m256i x3[8];
2266 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x2[0], &x2[1], __rounding,
2267 cos_bit);
2268 x3[0] = x2[0];
2269 x3[1] = x2[1];
2270 btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x2[2], &x2[3], __rounding,
2271 cos_bit);
2272 x3[2] = x2[2];
2273 x3[3] = x2[3];
2274 x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
2275 x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
2276 x3[6] = _mm256_subs_epi16(x2[7], x2[6]);
2277 x3[7] = _mm256_adds_epi16(x2[7], x2[6]);
2278
2279 // stage 4
2280 __m256i x4[8];
2281 x4[0] = x3[0];
2282 x4[1] = x3[1];
2283 x4[2] = x3[2];
2284 x4[3] = x3[3];
2285 btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x3[4], &x3[7], __rounding,
2286 cos_bit);
2287 x4[4] = x3[4];
2288 x4[7] = x3[7];
2289 btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x3[5], &x3[6], __rounding,
2290 cos_bit);
2291 x4[5] = x3[5];
2292 x4[6] = x3[6];
2293 // stage 5
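  // Outputs are written in bit-reversed order: x4[0], x4[4], x4[2], x4[6],
  // x4[1], x4[5], x4[3], x4[7].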
2294 output[0] = x4[0];
2295 output[1] = x4[4];
2296 output[2] = x4[2];
2297 output[3] = x4[6];
2298 output[4] = x4[1];
2299 output[5] = x4[5];
2300 output[6] = x4[3];
2301 output[7] = x4[7];
2302 }
2303
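// 1-D 8-point forward ADST applied to 16 lanes of 16-bit data in parallel
// (one __m256i register per row).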
2304 static inline void fadst8x8_new_avx2(const __m256i *input, __m256i *output,
2305 int8_t cos_bit) {
2306 const int32_t *cospi = cospi_arr(cos_bit);
2307 const __m256i __zero = _mm256_setzero_si256();
2308 const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
2309
2310 __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
2311 __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
2312 __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
2313 __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
2314 __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
2315 __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
2316 __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
2317 __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
2318 __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
2319 __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
2320 __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
2321 __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
2322 __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
2323
2324 // stage 1
2325 __m256i x1[8];
2326 x1[0] = input[0];
2327 x1[1] = _mm256_subs_epi16(__zero, input[7]);
2328 x1[2] = _mm256_subs_epi16(__zero, input[3]);
2329 x1[3] = input[4];
2330 x1[4] = _mm256_subs_epi16(__zero, input[1]);
2331 x1[5] = input[6];
2332 x1[6] = input[2];
2333 x1[7] = _mm256_subs_epi16(__zero, input[5]);
2334
2335 // stage 2
2336 __m256i x2[8];
2337 x2[0] = x1[0];
2338 x2[1] = x1[1];
2339 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], __rounding,
2340 cos_bit);
2341 x2[2] = x1[2];
2342 x2[3] = x1[3];
2343 x2[4] = x1[4];
2344 x2[5] = x1[5];
2345 btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], __rounding,
2346 cos_bit);
2347 x2[6] = x1[6];
2348 x2[7] = x1[7];
2349
2350 // stage 3
2351 __m256i x3[8];
2352 x3[0] = _mm256_adds_epi16(x2[0], x2[2]);
2353 x3[2] = _mm256_subs_epi16(x2[0], x2[2]);
2354 x3[1] = _mm256_adds_epi16(x2[1], x2[3]);
2355 x3[3] = _mm256_subs_epi16(x2[1], x2[3]);
2356 x3[4] = _mm256_adds_epi16(x2[4], x2[6]);
2357 x3[6] = _mm256_subs_epi16(x2[4], x2[6]);
2358 x3[5] = _mm256_adds_epi16(x2[5], x2[7]);
2359 x3[7] = _mm256_subs_epi16(x2[5], x2[7]);
2360
2361 // stage 4
2362 __m256i x4[8];
2363 x4[0] = x3[0];
2364 x4[1] = x3[1];
2365 x4[2] = x3[2];
2366 x4[3] = x3[3];
2367 btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x3[4], &x3[5], __rounding,
2368 cos_bit);
2369 x4[4] = x3[4];
2370 x4[5] = x3[5];
2371 btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x3[6], &x3[7], __rounding,
2372 cos_bit);
2373 x4[6] = x3[6];
2374 x4[7] = x3[7];
2375
2376 // stage 5
2377 __m256i x5[8];
2378 x5[0] = _mm256_adds_epi16(x4[0], x4[4]);
2379 x5[4] = _mm256_subs_epi16(x4[0], x4[4]);
2380 x5[1] = _mm256_adds_epi16(x4[1], x4[5]);
2381 x5[5] = _mm256_subs_epi16(x4[1], x4[5]);
2382 x5[2] = _mm256_adds_epi16(x4[2], x4[6]);
2383 x5[6] = _mm256_subs_epi16(x4[2], x4[6]);
2384 x5[3] = _mm256_adds_epi16(x4[3], x4[7]);
2385 x5[7] = _mm256_subs_epi16(x4[3], x4[7]);
2386
2387 // stage 6
2388 __m256i x6[8];
2389 btf_16_w16_avx2(cospi_p04_p60, cospi_p60_m04, &x5[0], &x5[1], __rounding,
2390 cos_bit);
2391 x6[0] = x5[0];
2392 x6[1] = x5[1];
2393 btf_16_w16_avx2(cospi_p20_p44, cospi_p44_m20, &x5[2], &x5[3], __rounding,
2394 cos_bit);
2395 x6[2] = x5[2];
2396 x6[3] = x5[3];
2397 btf_16_w16_avx2(cospi_p36_p28, cospi_p28_m36, &x5[4], &x5[5], __rounding,
2398 cos_bit);
2399 x6[4] = x5[4];
2400 x6[5] = x5[5];
2401 btf_16_w16_avx2(cospi_p52_p12, cospi_p12_m52, &x5[6], &x5[7], __rounding,
2402 cos_bit);
2403 x6[6] = x5[6];
2404 x6[7] = x5[7];
2405
2406 // stage 7
2407 output[0] = x6[1];
2408 output[1] = x6[6];
2409 output[2] = x6[3];
2410 output[3] = x6[4];
2411 output[4] = x6[5];
2412 output[5] = x6[2];
2413 output[6] = x6[7];
2414 output[7] = x6[0];
2415 }
2416
2417 static inline void fidentity8x8_new_avx2(const __m256i *input, __m256i *output,
2418 int8_t cos_bit) {
2419 (void)cos_bit;
2420
2421 output[0] = _mm256_adds_epi16(input[0], input[0]);
2422 output[1] = _mm256_adds_epi16(input[1], input[1]);
2423 output[2] = _mm256_adds_epi16(input[2], input[2]);
2424 output[3] = _mm256_adds_epi16(input[3], input[3]);
2425 output[4] = _mm256_adds_epi16(input[4], input[4]);
2426 output[5] = _mm256_adds_epi16(input[5], input[5]);
2427 output[6] = _mm256_adds_epi16(input[6], input[6]);
2428 output[7] = _mm256_adds_epi16(input[7], input[7]);
2429 }
2430
2431 static inline void fdct8x16_new_avx2(const __m128i *input, __m128i *output,
2432 int8_t cos_bit) {
2433 const int32_t *cospi = cospi_arr(cos_bit);
2434 const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));
2435 const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
2436 __m128i temp0, temp1, temp2, temp3;
2437 __m256i in0, in1;
2438 __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
2439 __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
2440 __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
2441 __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
2442 __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
2443 __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
2444 __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
2445 __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
2446 __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
2447 __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
2448 __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
2449 __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
2450 __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
2451 __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
2452 __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
2453 __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
2454 __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
2455 __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
2456
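// Each 256-bit constant below packs two different 128-bit cosine pairs, one
// per lane, so a single btf_16_avx2 call can apply two distinct butterflies
// (one to each packed row) at once.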
2457 __m256i cospi_arr[12];
2458
2459 cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m32_p32),
2460 cospi_m32_p32, 0x1);
2461 cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
2462 cospi_p32_p32, 0x1);
2463 cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
2464 cospi_p48_p16, 0x1);
2465 cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
2466 cospi_m16_p48, 0x1);
2467 cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m16_p48),
2468 cospi_m48_m16, 0x1);
2469 cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_p16),
2470 cospi_m16_p48, 0x1);
2471 cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_p08),
2472 cospi_p24_p40, 0x1);
2473 cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m08_p56),
2474 cospi_m40_p24, 0x1);
2475 cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p60_p04),
2476 cospi_p28_p36, 0x1);
2477 cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m04_p60),
2478 cospi_m36_p28, 0x1);
2479 cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p44_p20),
2480 cospi_p12_p52, 0x1);
2481 cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m20_p44),
2482 cospi_m52_p12, 0x1);
2483
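// Pack the 16 input rows two per 256-bit register, arranged so the stage-1
// butterflies in[k] +/- in[15-k] line up lane by lane.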
2484 __m256i x[8];
2485 x[0] =
2486 _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[1], 0x1);
2487 x[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[15]), input[14],
2488 0x1);
2489 x[2] =
2490 _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[3], 0x1);
2491 x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[13]), input[12],
2492 0x1);
2493 x[4] =
2494 _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[4], 0x1);
2495 x[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[11],
2496 0x1);
2497 x[6] =
2498 _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[6], 0x1);
2499 x[7] =
2500 _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[9], 0x1);
2501
2502 // stage 1
2503 __m256i x1[8];
2504 x1[0] = _mm256_adds_epi16(x[0], x[1]);
2505 x1[7] = _mm256_subs_epi16(x[0], x[1]);
2506 x1[1] = _mm256_adds_epi16(x[2], x[3]);
2507 x1[6] = _mm256_subs_epi16(x[2], x[3]);
2508 x1[2] = _mm256_adds_epi16(x[4], x[5]);
2509 x1[5] = _mm256_subs_epi16(x[4], x[5]);
2510 x1[3] = _mm256_adds_epi16(x[6], x[7]);
2511 x1[4] = _mm256_subs_epi16(x[6], x[7]);
2512
2513 // stage 2
2514 __m256i x2[8];
2515 x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
2516 x2[7] = _mm256_subs_epi16(x1[0], x1[3]);
2517 x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
2518 x2[6] = _mm256_subs_epi16(x1[1], x1[2]);
2519 x2[2] = x1[4];
2520 x2[3] = x1[7];
2521 btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &x1[5], &x1[6], &temp0, &temp1,
2522 &temp2, &temp3, &__rounding_256, &cos_bit);
2523 x2[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp0, 0x1);
2524 x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);
2525
2526 // stage 3
2527 __m256i x3[8];
2528 x2[1] = _mm256_permute4x64_epi64(x2[1], 0x4e);
2529 x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
2530 x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
2531 x3[2] = _mm256_blend_epi32(x2[7], x2[6], 0xf0);
2532 btf_16_sse2(cospi_m32_p32, cospi_p32_p32, _mm256_castsi256_si128(x2[6]),
2533 _mm256_extractf128_si256(x2[7], 0x01), temp0, temp1);
2534 x3[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp1), temp0, 0x1);
2535 x3[3] = _mm256_adds_epi16(x2[2], x2[4]);
2536 x3[4] = _mm256_subs_epi16(x2[2], x2[4]);
2537 x3[5] = _mm256_adds_epi16(x2[3], x2[5]);
2538 x3[6] = _mm256_subs_epi16(x2[3], x2[5]);
2539
2540 // stage 4
2541 __m256i x4[8];
2542 x4[0] = _mm256_blend_epi32(x3[0], x3[1], 0xf0);
2543 x4[1] = _mm256_permute2f128_si256(x3[0], x3[1], 0x21);
2544 btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &x4[0], &x4[1], &output[0],
2545 &output[8], &output[4], &output[12], &__rounding_256, &cos_bit);
2546 x4[2] = _mm256_adds_epi16(x3[2], x3[7]);
2547 x4[3] = _mm256_subs_epi16(x3[2], x3[7]);
2548 x4[4] = _mm256_permute2f128_si256(x3[3], x3[4], 0x20);
2549 x4[5] = _mm256_permute2f128_si256(x3[6], x3[5], 0x20);
2550 in0 = _mm256_permute2f128_si256(x3[3], x3[4], 0x31);
2551 in1 = _mm256_permute2f128_si256(x3[5], x3[6], 0x31);
2552 btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2,
2553 &temp3, &__rounding_256, &cos_bit);
2554
2555 x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp2, 0x1);
2556 x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);
2557
2558 // stage 5
2559 __m256i x5[4];
2560 in0 = _mm256_permute2f128_si256(x4[2], x4[3], 0x31);
2561 in1 = _mm256_permute2f128_si256(x4[2], x4[3], 0x20);
2562 btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &output[2], &output[14],
2563 &output[10], &output[6], &__rounding_256, &cos_bit);
2564 x5[0] = _mm256_adds_epi16(x4[4], x4[6]);
2565 x5[1] = _mm256_subs_epi16(x4[4], x4[6]);
2566 x5[2] = _mm256_adds_epi16(x4[5], x4[7]);
2567 x5[3] = _mm256_subs_epi16(x4[5], x4[7]);
2568
2569 // stage 6
2570 in0 = _mm256_permute2f128_si256(x5[0], x5[1], 0x20);
2571 in1 = _mm256_permute2f128_si256(x5[2], x5[3], 0x31);
2572 btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &output[1], &output[15],
2573 &output[9], &output[7], &__rounding_256, &cos_bit);
2574 in0 = _mm256_permute2f128_si256(x5[1], x5[0], 0x31);
2575 in1 = _mm256_permute2f128_si256(x5[3], x5[2], 0x20);
2576 btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &output[5],
2577 &output[11], &output[13], &output[3], &__rounding_256, &cos_bit);
2578 }
2579
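// Length-16 forward ADST over 8 columns, using the same two-rows-per-register
// packing as fdct8x16_new_avx2. Stage 1 applies the ADST sign flips, and the
// final btf stages write directly into the output array in ADST order.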
2580 static inline void fadst8x16_new_avx2(const __m128i *input, __m128i *output,
2581 int8_t cos_bit) {
2582 const int32_t *cospi = cospi_arr(cos_bit);
2583 const __m256i __zero = _mm256_setzero_si256();
2584 const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));
2585 __m256i in0, in1;
2586 __m128i temp0, temp1, temp2, temp3;
2587
2588 __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
2589 __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
2590 __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
2591 __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
2592 __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
2593 __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
2594 __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
2595 __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
2596 __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
2597 __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
2598 __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
2599 __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
2600 __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
2601 __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
2602 __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
2603 __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
2604 __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
2605 __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
2606 __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
2607 __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
2608 __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
2609 __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
2610 __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
2611 __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
2612 __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
2613 __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
2614 __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
2615
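// As in the DCT above, each 256-bit constant carries two 128-bit cosine pairs
// (one per lane) so btf_16_avx2 can run two different butterflies per call.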
2616 __m256i cospi_arr[20];
2617
2618 cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
2619 cospi_p32_p32, 0x1);
2620 cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
2621 cospi_p32_m32, 0x1);
2622 cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
2623 cospi_p32_p32, 0x1);
2624 cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
2625 cospi_p32_m32, 0x1);
2626 cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
2627 cospi_m48_p16, 0x1);
2628 cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
2629 cospi_p16_p48, 0x1);
2630 cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
2631 cospi_m48_p16, 0x1);
2632 cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
2633 cospi_p16_p48, 0x1);
2634 cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
2635 cospi_p40_p24, 0x1);
2636 cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_m08),
2637 cospi_p24_m40, 0x1);
2638 cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m56_p08),
2639 cospi_m24_p40, 0x1);
2640 cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
2641 cospi_p40_p24, 0x1);
2642 cospi_arr[12] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p02_p62),
2643 cospi_p10_p54, 0x1);
2644 cospi_arr[13] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p62_m02),
2645 cospi_p54_m10, 0x1);
2646 cospi_arr[14] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p18_p46),
2647 cospi_p26_p38, 0x1);
2648 cospi_arr[15] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p46_m18),
2649 cospi_p38_m26, 0x1);
2650 cospi_arr[16] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p34_p30),
2651 cospi_p42_p22, 0x1);
2652 cospi_arr[17] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p30_m34),
2653 cospi_p22_m42, 0x1);
2654 cospi_arr[18] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p50_p14),
2655 cospi_p58_p06, 0x1);
2656 cospi_arr[19] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p14_m50),
2657 cospi_p06_m58, 0x1);
2658
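// Gather the 16 input rows two per register, in the order required by the
// stage-1 sign flips and butterflies below.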
2659 __m256i x[8];
2660 x[0] =
2661 _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[4], 0x1);
2662 x[1] =
2663 _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[6], 0x1);
2664 x[2] =
2665 _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[12], 0x1);
2666 x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[14],
2667 0x1);
2668 x[4] =
2669 _mm256_insertf128_si256(_mm256_castsi128_si256(input[1]), input[9], 0x1);
2670 x[5] =
2671 _mm256_insertf128_si256(_mm256_castsi128_si256(input[3]), input[11], 0x1);
2672 x[6] =
2673 _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[13], 0x1);
2674 x[7] =
2675 _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[15], 0x1);
2676
2677 // stage 1
2678 __m256i x1[8];
2679 x1[0] = x[0];
2680 x1[1] = _mm256_subs_epi16(__zero, x[7]);
2681 x1[2] = x[2];
2682 x1[3] = _mm256_subs_epi16(__zero, x[5]);
2683 x1[4] = _mm256_subs_epi16(__zero, x[4]);
2684 x1[5] = x[3];
2685 x1[6] = _mm256_subs_epi16(__zero, x[6]);
2686 x1[7] = x[1];
2687
2688 // stage 2
2689 __m256i x2[8];
2690 x2[0] = _mm256_blend_epi32(x1[0], x1[1], 0xf0);
2691 x2[3] = _mm256_blend_epi32(x1[3], x1[2], 0xf0);
2692 x2[4] = _mm256_blend_epi32(x1[4], x1[5], 0xf0);
2693 x2[7] = _mm256_blend_epi32(x1[7], x1[6], 0xf0);
2694 in0 = _mm256_blend_epi32(x1[1], x1[0], 0xf0);
2695 in1 = _mm256_blend_epi32(x1[2], x1[3], 0xf0);
2696 btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &in0, &in1, &temp0, &temp1, &temp2,
2697 &temp3, &__rounding_256, &cos_bit);
2698 x2[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
2699 x2[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
2700 in0 = _mm256_permute2f128_si256(x1[7], x1[6], 0x21);
2701 in1 = _mm256_permute2f128_si256(x1[4], x1[5], 0x21);
2702 btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &in0, &in1, &temp0, &temp1, &temp2,
2703 &temp3, &__rounding_256, &cos_bit);
2704 x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
2705 x2[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
2706
2707 // stage 3
2708 __m256i x3[8];
2709 x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
2710 x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
2711 x3[2] = _mm256_adds_epi16(x2[3], x2[2]);
2712 x3[3] = _mm256_subs_epi16(x2[3], x2[2]);
2713 x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
2714 x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
2715 x3[6] = _mm256_adds_epi16(x2[7], x2[6]);
2716 x3[7] = _mm256_subs_epi16(x2[7], x2[6]);
2717
2718 // stage 4
2719 __m256i x4[8];
2720 x4[0] = x3[0];
2721 x4[1] = x3[1];
2722 x4[4] = x3[4];
2723 x4[5] = x3[5];
2724 in0 = _mm256_permute2f128_si256(x3[2], x3[3], 0x20);
2725 in1 = _mm256_permute2f128_si256(x3[2], x3[3], 0x31);
2726 btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2,
2727 &temp3, &__rounding_256, &cos_bit);
2728 x4[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
2729 x4[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
2730 in0 = _mm256_permute2f128_si256(x3[6], x3[7], 0x20);
2731 in1 = _mm256_permute2f128_si256(x3[6], x3[7], 0x31);
2732 btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &temp0, &temp1, &temp2,
2733 &temp3, &__rounding_256, &cos_bit);
2734 x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
2735 x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
2736
2737 // stage 5
2738 __m256i x5[8];
2739 x5[0] = _mm256_adds_epi16(x4[0], x4[2]);
2740 x5[1] = _mm256_subs_epi16(x4[0], x4[2]);
2741 x5[2] = _mm256_adds_epi16(x4[1], x4[3]);
2742 x5[3] = _mm256_subs_epi16(x4[1], x4[3]);
2743 x5[4] = _mm256_adds_epi16(x4[4], x4[6]);
2744 x5[5] = _mm256_subs_epi16(x4[4], x4[6]);
2745 x5[6] = _mm256_adds_epi16(x4[5], x4[7]);
2746 x5[7] = _mm256_subs_epi16(x4[5], x4[7]);
2747
2748 // stage 6
2749 __m256i x6[8];
2750 x6[0] = x5[0];
2751 x6[1] = x5[2];
2752 x6[2] = x5[1];
2753 x6[3] = x5[3];
2754 in0 = _mm256_permute2f128_si256(x5[4], x5[6], 0x20);
2755 in1 = _mm256_permute2f128_si256(x5[4], x5[6], 0x31);
2756 btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &temp0, &temp1, &temp2,
2757 &temp3, &__rounding_256, &cos_bit);
2758 x6[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
2759 x6[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
2760 in0 = _mm256_permute2f128_si256(x5[5], x5[7], 0x20);
2761 in1 = _mm256_permute2f128_si256(x5[5], x5[7], 0x31);
2762 btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &temp0, &temp1,
2763 &temp2, &temp3, &__rounding_256, &cos_bit);
2764 x6[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
2765 x6[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
2766
2767 // stage 7
2768 __m256i x7[8];
2769 x7[0] = _mm256_adds_epi16(x6[0], x6[4]);
2770 x7[1] = _mm256_subs_epi16(x6[0], x6[4]);
2771 x7[2] = _mm256_adds_epi16(x6[1], x6[5]);
2772 x7[3] = _mm256_subs_epi16(x6[1], x6[5]);
2773 x7[4] = _mm256_adds_epi16(x6[2], x6[6]);
2774 x7[5] = _mm256_subs_epi16(x6[2], x6[6]);
2775 x7[6] = _mm256_adds_epi16(x6[3], x6[7]);
2776 x7[7] = _mm256_subs_epi16(x6[3], x6[7]);
2777
2778 // stage 8
2779 in0 = _mm256_permute2f128_si256(x7[0], x7[2], 0x20);
2780 in1 = _mm256_permute2f128_si256(x7[0], x7[2], 0x31);
2781 btf_16_avx2(&cospi_arr[12], &cospi_arr[13], &in0, &in1, &output[15],
2782 &output[0], &output[13], &output[2], &__rounding_256, &cos_bit);
2783 in0 = _mm256_permute2f128_si256(x7[4], x7[6], 0x20);
2784 in1 = _mm256_permute2f128_si256(x7[4], x7[6], 0x31);
2785 btf_16_avx2(&cospi_arr[14], &cospi_arr[15], &in0, &in1, &output[11],
2786 &output[4], &output[9], &output[6], &__rounding_256, &cos_bit);
2787 in0 = _mm256_permute2f128_si256(x7[1], x7[3], 0x20);
2788 in1 = _mm256_permute2f128_si256(x7[1], x7[3], 0x31);
2789 btf_16_avx2(&cospi_arr[16], &cospi_arr[17], &in0, &in1, &output[7],
2790 &output[8], &output[5], &output[10], &__rounding_256, &cos_bit);
2791 in0 = _mm256_permute2f128_si256(x7[5], x7[7], 0x20);
2792 in1 = _mm256_permute2f128_si256(x7[5], x7[7], 0x31);
2793 btf_16_avx2(&cospi_arr[18], &cospi_arr[19], &in0, &in1, &output[3],
2794 &output[12], &output[1], &output[14], &__rounding_256, &cos_bit);
2795 }
2796
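// Length-16 identity transform: each coefficient is scaled by 2 * NewSqrt2
// (2*sqrt(2) in fixed point) with rounding. Interleaving the coefficients
// with 1s lets scale_round_avx2 fold the rounding offset into its
// multiply-add before the final shift; cos_bit is unused.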
2797 static inline void fidentity8x16_new_avx2(const __m128i *input, __m128i *output,
2798 int8_t cos_bit) {
2799 (void)cos_bit;
2800 const __m256i one = _mm256_set1_epi16(1);
2801 __m256i temp;
2802 for (int i = 0; i < 16; i += 2) {
2803 temp = _mm256_insertf128_si256(_mm256_castsi128_si256(input[i]),
2804 input[i + 1], 0x1);
2805 const __m256i a_lo = _mm256_unpacklo_epi16(temp, one);
2806 const __m256i a_hi = _mm256_unpackhi_epi16(temp, one);
2807 const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2);
2808 const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2);
2809 temp = _mm256_packs_epi32(b_lo, b_hi);
2810 output[i] = _mm256_castsi256_si128(temp);
2811 output[i + 1] = _mm256_extractf128_si256(temp, 0x1);
2812 }
2813 }
2814
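// Per-TX_TYPE dispatch tables for the 8x16 and 16x8 blocks: the length-16
// dimension uses the 8x16 kernels above and the length-8 dimension the 8x8
// kernels. FLIPADST entries reuse the ADST kernels; the flips themselves are
// handled by the buffer load/flip logic in the 2-D wrappers below.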
2815 static const transform_1d_avx2 row_txfm8x16_arr[TX_TYPES] = {
2816 fdct8x8_new_avx2, // DCT_DCT
2817 fdct8x8_new_avx2, // ADST_DCT
2818 fadst8x8_new_avx2, // DCT_ADST
2819 fadst8x8_new_avx2, // ADST_ADST
2820 fdct8x8_new_avx2, // FLIPADST_DCT
2821 fadst8x8_new_avx2, // DCT_FLIPADST
2822 fadst8x8_new_avx2, // FLIPADST_FLIPADST
2823 fadst8x8_new_avx2, // ADST_FLIPADST
2824 fadst8x8_new_avx2, // FLIPADST_ADST
2825 fidentity8x8_new_avx2, // IDTX
2826 fidentity8x8_new_avx2, // V_DCT
2827 fdct8x8_new_avx2, // H_DCT
2828 fidentity8x8_new_avx2, // V_ADST
2829 fadst8x8_new_avx2, // H_ADST
2830 fidentity8x8_new_avx2, // V_FLIPADST
2831 fadst8x8_new_avx2 // H_FLIPADST
2832 };
2833
2834 static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = {
2835 fdct8x16_new_avx2, // DCT_DCT
2836 fadst8x16_new_avx2, // ADST_DCT
2837 fdct8x16_new_avx2, // DCT_ADST
2838 fadst8x16_new_avx2, // ADST_ADST
2839 fadst8x16_new_avx2, // FLIPADST_DCT
2840 fdct8x16_new_avx2, // DCT_FLIPADST
2841 fadst8x16_new_avx2, // FLIPADST_FLIPADST
2842 fadst8x16_new_avx2, // ADST_FLIPADST
2843 fadst8x16_new_avx2, // FLIPADST_ADST
2844 fidentity8x16_new_avx2, // IDTX
2845 fdct8x16_new_avx2, // V_DCT
2846 fidentity8x16_new_avx2, // H_DCT
2847 fadst8x16_new_avx2, // V_ADST
2848 fidentity8x16_new_avx2, // H_ADST
2849 fadst8x16_new_avx2, // V_FLIPADST
2850 fidentity8x16_new_avx2 // H_FLIPADST
2851 };
2852
2853 static const transform_1d_avx2 col_txfm16x8_arr[TX_TYPES] = {
2854 fdct8x8_new_avx2, // DCT_DCT
2855 fadst8x8_new_avx2, // ADST_DCT
2856 fdct8x8_new_avx2, // DCT_ADST
2857 fadst8x8_new_avx2, // ADST_ADST
2858 fadst8x8_new_avx2, // FLIPADST_DCT
2859 fdct8x8_new_avx2, // DCT_FLIPADST
2860 fadst8x8_new_avx2, // FLIPADST_FLIPADST
2861 fadst8x8_new_avx2, // ADST_FLIPADST
2862 fadst8x8_new_avx2, // FLIPADST_ADST
2863 fidentity8x8_new_avx2, // IDTX
2864 fdct8x8_new_avx2, // V_DCT
2865 fidentity8x8_new_avx2, // H_DCT
2866 fadst8x8_new_avx2, // V_ADST
2867 fidentity8x8_new_avx2, // H_ADST
2868 fadst8x8_new_avx2, // V_FLIPADST
2869 fidentity8x8_new_avx2, // H_FLIPADST
2870 };
2871
2872 static const transform_1d_sse2 row_txfm16x8_arr[TX_TYPES] = {
2873 fdct8x16_new_avx2, // DCT_DCT
2874 fdct8x16_new_avx2, // ADST_DCT
2875 fadst8x16_new_avx2, // DCT_ADST
2876 fadst8x16_new_avx2, // ADST_ADST
2877 fdct8x16_new_avx2, // FLIPADST_DCT
2878 fadst8x16_new_avx2, // DCT_FLIPADST
2879 fadst8x16_new_avx2, // FLIPADST_FLIPADST
2880 fadst8x16_new_avx2, // ADST_FLIPADST
2881 fadst8x16_new_avx2, // FLIPADST_ADST
2882 fidentity8x16_new_avx2, // IDTX
2883 fidentity8x16_new_avx2, // V_DCT
2884 fdct8x16_new_avx2, // H_DCT
2885 fidentity8x16_new_avx2, // V_ADST
2886 fadst8x16_new_avx2, // H_ADST
2887 fidentity8x16_new_avx2, // V_FLIPADST
2888 fadst8x16_new_avx2 // H_FLIPADST
2889 };
2890
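// 2-D 8x16 forward transform: load (with optional vertical flip), pre-shift,
// length-16 column transform on 128-bit registers, mid-shift, transpose the
// two 8x8 halves (optionally flipping left/right), pack both halves into
// 256-bit registers for the length-8 row transform, post-shift, and store
// with the rectangular-transform scaling.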
2891 static void lowbd_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *output,
2892 int stride, TX_TYPE tx_type, int bd) {
2893 (void)bd;
2894 __m128i buf0[16], buf1[16];
2895 __m256i buf2[8];
2896 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
2897 const int txw_idx = get_txw_idx(TX_8X16);
2898 const int txh_idx = get_txh_idx(TX_8X16);
2899 const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2900 const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2901 const int width = 8;
2902 const int height = 16;
2903 const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
2904 const transform_1d_avx2 row_txfm = row_txfm8x16_arr[tx_type];
2905 int ud_flip, lr_flip;
2906
2907 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2908 if (ud_flip) {
2909 load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
2910 } else {
2911 load_buffer_16bit_to_16bit(input, stride, buf0, height);
2912 }
2913 round_shift_16bit(buf0, height, shift[0]);
2914 col_txfm(buf0, buf0, cos_bit_col);
2915 round_shift_16bit(buf0, height, shift[1]);
2916 transpose_16bit_8x8(buf0, buf1);
2917 transpose_16bit_8x8(buf0 + 8, buf1 + 8);
2918
2919 __m128i *bufl, *bufu;
2920 if (lr_flip) {
2921 bufl = buf0;
2922 bufu = buf0 + 8;
2923 flip_buf_sse2(buf1 + width * 0, bufl, width);
2924 flip_buf_sse2(buf1 + width * 1, bufu, width);
2925 } else {
2926 bufl = buf1 + width * 0;
2927 bufu = buf1 + width * 1;
2928 }
2929 pack_reg(bufl, bufu, buf2);
2930 row_txfm(buf2, buf2, cos_bit_row);
2931 round_shift_16bit_w16_avx2(buf2, width, shift[2]);
2932 store_rect_buffer_16bit_to_32bit_w16_avx2(buf2, output, height, width);
2933 }
2934
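// 2-D 16x8 forward transform, mirroring the 8x16 case: the two 8-column
// halves are packed into 256-bit registers up front so the length-8 column
// transform covers all 16 columns at once; after the transpose, the length-16
// row transform runs on 128-bit registers.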
2935 static void lowbd_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *output,
2936 int stride, TX_TYPE tx_type, int bd) {
2937 (void)bd;
2938 __m128i buf0[16], buf1[16];
2939 __m256i buf2[8];
2940 const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
2941 const int txw_idx = get_txw_idx(TX_16X8);
2942 const int txh_idx = get_txh_idx(TX_16X8);
2943 const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2944 const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2945 const int width = 16;
2946 const int height = 8;
2947 const transform_1d_avx2 col_txfm = col_txfm16x8_arr[tx_type];
2948 const transform_1d_sse2 row_txfm = row_txfm16x8_arr[tx_type];
2949 __m128i *buf;
2950 int ud_flip, lr_flip;
2951
2952 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2953
2954 if (ud_flip) {
2955 load_buffer_16bit_to_16bit_flip(input + 8 * 0, stride, buf0, height);
2956 load_buffer_16bit_to_16bit_flip(input + 8 * 1, stride, &buf0[8], height);
2957 } else {
2958 load_buffer_16bit_to_16bit(input + 8 * 0, stride, buf0, height);
2959 load_buffer_16bit_to_16bit(input + 8 * 1, stride, &buf0[8], height);
2960 }
2961 pack_reg(buf0, &buf0[8], buf2);
2962 round_shift_16bit_w16_avx2(buf2, height, shift[0]);
2963 col_txfm(buf2, buf2, cos_bit_col);
2964 round_shift_16bit_w16_avx2(buf2, height, shift[1]);
2965 transpose_16bit_16x8_avx2(buf2, buf2);
2966 extract_reg(buf2, buf1);
2967
2968 if (lr_flip) {
2969 buf = buf0;
2970 flip_buf_sse2(buf1, buf, width);
2971 } else {
2972 buf = buf1;
2973 }
2974 row_txfm(buf, buf, cos_bit_row);
2975 round_shift_16bit(buf, width, shift[2]);
2976 store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width);
2977 }
2978
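// Dispatch table indexed by TX_SIZE. Sizes without a dedicated AVX2 path fall
// back to the SSE2 implementations.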
2979 static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
2980 av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform
2981 av1_lowbd_fwd_txfm2d_8x8_avx2, // 8x8 transform
2982 lowbd_fwd_txfm2d_16x16_avx2, // 16x16 transform
2983 lowbd_fwd_txfm2d_32x32_avx2, // 32x32 transform
2984 lowbd_fwd_txfm2d_64x64_avx2, // 64x64 transform
2985 av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform
2986 av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform
2987 lowbd_fwd_txfm2d_8x16_avx2, // 8x16 transform
2988 lowbd_fwd_txfm2d_16x8_avx2, // 16x8 transform
2989 lowbd_fwd_txfm2d_16x32_avx2, // 16x32 transform
2990 lowbd_fwd_txfm2d_32x16_avx2, // 32x16 transform
2991 lowbd_fwd_txfm2d_32x64_avx2, // 32x64 transform
2992 lowbd_fwd_txfm2d_64x32_avx2, // 64x32 transform
2993 av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform
2994 av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform
2995 av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform
2996 av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform
2997 lowbd_fwd_txfm2d_16x64_avx2, // 16x64 transform
2998 lowbd_fwd_txfm2d_64x16_avx2, // 64x16 transform
2999 };
3000
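// Top-level low-bitdepth entry point: lossless 4x4 blocks take the generic C
// path (which uses the 4x4 Walsh-Hadamard transform); all other cases go
// through the dispatch table above.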
3001 void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff,
3002 int diff_stride, TxfmParam *txfm_param) {
3003 FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
3004 if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) {
3005 av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
3006 } else {
3007 fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
3008 txfm_param->bd);
3009 }
3010 }
3011