1 /*
2  * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include "config/av1_rtcd.h"
13 
14 #include "av1/common/enums.h"
15 #include "av1/common/av1_txfm.h"
16 #include "av1/encoder/x86/av1_fwd_txfm_avx2.h"
17 #include "av1/common/x86/av1_txfm_sse2.h"
18 #include "av1/encoder/av1_fwd_txfm1d_cfg.h"
19 #include "av1/encoder/x86/av1_txfm1d_sse4.h"
20 #include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
21 #include "aom_dsp/x86/txfm_common_avx2.h"
22 
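// 16-point forward DCT kernel on 16-bit lanes: each __m256i carries sixteen
// int16 coefficients, so one call transforms 16 independent columns/rows in
// parallel. btf_16_adds_subs_(out_)avx2 is the saturating add/sub butterfly;
// btf_16_w16_avx2(pair(p, q), pair(r, s), &a, &b, _r, cos_bit) applies the
// cosine rotation, roughly (per lane, with rounding by _r before the shift):
//   a' = (p * a + q * b) >> cos_bit,  b' = (r * a + s * b) >> cos_bit
// (a sketch of the helpers defined in av1_fwd_txfm_avx2.h). The last stage
// writes the results back in bit-reversed index order.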
23 static inline void fdct16x16_new_avx2(const __m256i *input, __m256i *output,
24                                       int8_t cos_bit) {
25   const int32_t *cospi = cospi_arr(cos_bit);
26   const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
27 
28   __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
29   __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
30   __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
31   __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
32   __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
33   __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
34   __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
35   __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
36   __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
37   __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
38   __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
39   __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
40   __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
41   __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
42   __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
43   __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
44   __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
45   __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
46 
47   // stage 1
48   __m256i x1[16];
49   btf_16_adds_subs_out_avx2(&x1[0], &x1[15], input[0], input[15]);
50   btf_16_adds_subs_out_avx2(&x1[1], &x1[14], input[1], input[14]);
51   btf_16_adds_subs_out_avx2(&x1[2], &x1[13], input[2], input[13]);
52   btf_16_adds_subs_out_avx2(&x1[3], &x1[12], input[3], input[12]);
53   btf_16_adds_subs_out_avx2(&x1[4], &x1[11], input[4], input[11]);
54   btf_16_adds_subs_out_avx2(&x1[5], &x1[10], input[5], input[10]);
55   btf_16_adds_subs_out_avx2(&x1[6], &x1[9], input[6], input[9]);
56   btf_16_adds_subs_out_avx2(&x1[7], &x1[8], input[7], input[8]);
57 
58   // stage 2
59   btf_16_adds_subs_avx2(&x1[0], &x1[7]);
60   btf_16_adds_subs_avx2(&x1[1], &x1[6]);
61   btf_16_adds_subs_avx2(&x1[2], &x1[5]);
62   btf_16_adds_subs_avx2(&x1[3], &x1[4]);
63   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
64   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
65 
66   // stage 3
67   btf_16_adds_subs_avx2(&x1[0], &x1[3]);
68   btf_16_adds_subs_avx2(&x1[1], &x1[2]);
69   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
70   btf_16_adds_subs_avx2(&x1[8], &x1[11]);
71   btf_16_adds_subs_avx2(&x1[9], &x1[10]);
72   btf_16_adds_subs_avx2(&x1[15], &x1[12]);
73   btf_16_adds_subs_avx2(&x1[14], &x1[13]);
74 
75   // stage 4
76   btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
77   btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
78   btf_16_adds_subs_avx2(&x1[4], &x1[5]);
79   btf_16_adds_subs_avx2(&x1[7], &x1[6]);
80   btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
81   btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
82 
83   // stage 5
84   btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
85   btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
86   btf_16_adds_subs_avx2(&x1[8], &x1[9]);
87   btf_16_adds_subs_avx2(&x1[11], &x1[10]);
88   btf_16_adds_subs_avx2(&x1[12], &x1[13]);
89   btf_16_adds_subs_avx2(&x1[15], &x1[14]);
90 
91   // stage 6
92   btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
93   btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
94   btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
95   btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
96 
97   // stage 7
98   output[0] = x1[0];
99   output[1] = x1[8];
100   output[2] = x1[4];
101   output[3] = x1[12];
102   output[4] = x1[2];
103   output[5] = x1[10];
104   output[6] = x1[6];
105   output[7] = x1[14];
106   output[8] = x1[1];
107   output[9] = x1[9];
108   output[10] = x1[5];
109   output[11] = x1[13];
110   output[12] = x1[3];
111   output[13] = x1[11];
112   output[14] = x1[7];
113   output[15] = x1[15];
114 }
115 
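// 32-point forward DCT on 16-bit lanes (16 columns per call). Same butterfly
// and rotation helpers as fdct16x16_new_avx2, extended to nine stages; stage 9
// emits the 32 outputs in bit-reversed order.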
116 static inline void fdct16x32_avx2(const __m256i *input, __m256i *output,
117                                   int8_t cos_bit) {
118   const int32_t *cospi = cospi_arr(cos_bit);
119   const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
120 
121   __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
122   __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
123   __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
124   __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
125   __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
126   __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
127   __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
128   __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
129   __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
130   __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
131   __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
132   __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
133   __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
134   __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
135   __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
136   __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
137   __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
138   __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
139   __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
140   __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
141   __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
142   __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
143   __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
144   __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
145   __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
146   __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
147   __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
148   __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
149   __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
150   __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
151   __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
152   __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
153   __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
154   __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
155   __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
156   __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);
157 
158   // stage 1
159   __m256i x1[32];
160   btf_16_adds_subs_out_avx2(&x1[0], &x1[31], input[0], input[31]);
161   btf_16_adds_subs_out_avx2(&x1[1], &x1[30], input[1], input[30]);
162   btf_16_adds_subs_out_avx2(&x1[2], &x1[29], input[2], input[29]);
163   btf_16_adds_subs_out_avx2(&x1[3], &x1[28], input[3], input[28]);
164   btf_16_adds_subs_out_avx2(&x1[4], &x1[27], input[4], input[27]);
165   btf_16_adds_subs_out_avx2(&x1[5], &x1[26], input[5], input[26]);
166   btf_16_adds_subs_out_avx2(&x1[6], &x1[25], input[6], input[25]);
167   btf_16_adds_subs_out_avx2(&x1[7], &x1[24], input[7], input[24]);
168   btf_16_adds_subs_out_avx2(&x1[8], &x1[23], input[8], input[23]);
169   btf_16_adds_subs_out_avx2(&x1[9], &x1[22], input[9], input[22]);
170   btf_16_adds_subs_out_avx2(&x1[10], &x1[21], input[10], input[21]);
171   btf_16_adds_subs_out_avx2(&x1[11], &x1[20], input[11], input[20]);
172   btf_16_adds_subs_out_avx2(&x1[12], &x1[19], input[12], input[19]);
173   btf_16_adds_subs_out_avx2(&x1[13], &x1[18], input[13], input[18]);
174   btf_16_adds_subs_out_avx2(&x1[14], &x1[17], input[14], input[17]);
175   btf_16_adds_subs_out_avx2(&x1[15], &x1[16], input[15], input[16]);
176 
177   // stage 2
178   btf_16_adds_subs_avx2(&x1[0], &x1[15]);
179   btf_16_adds_subs_avx2(&x1[1], &x1[14]);
180   btf_16_adds_subs_avx2(&x1[2], &x1[13]);
181   btf_16_adds_subs_avx2(&x1[3], &x1[12]);
182   btf_16_adds_subs_avx2(&x1[4], &x1[11]);
183   btf_16_adds_subs_avx2(&x1[5], &x1[10]);
184   btf_16_adds_subs_avx2(&x1[6], &x1[9]);
185   btf_16_adds_subs_avx2(&x1[7], &x1[8]);
186   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
187   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
188   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
189   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);
190 
191   // stage 3
192   btf_16_adds_subs_avx2(&x1[0], &x1[7]);
193   btf_16_adds_subs_avx2(&x1[1], &x1[6]);
194   btf_16_adds_subs_avx2(&x1[2], &x1[5]);
195   btf_16_adds_subs_avx2(&x1[3], &x1[4]);
196   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
197   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
198   btf_16_adds_subs_avx2(&x1[16], &x1[23]);
199   btf_16_adds_subs_avx2(&x1[17], &x1[22]);
200   btf_16_adds_subs_avx2(&x1[18], &x1[21]);
201   btf_16_adds_subs_avx2(&x1[19], &x1[20]);
202   btf_16_adds_subs_avx2(&x1[31], &x1[24]);
203   btf_16_adds_subs_avx2(&x1[30], &x1[25]);
204   btf_16_adds_subs_avx2(&x1[29], &x1[26]);
205   btf_16_adds_subs_avx2(&x1[28], &x1[27]);
206 
207   // stage 4
208   btf_16_adds_subs_avx2(&x1[0], &x1[3]);
209   btf_16_adds_subs_avx2(&x1[1], &x1[2]);
210   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
211   btf_16_adds_subs_avx2(&x1[8], &x1[11]);
212   btf_16_adds_subs_avx2(&x1[9], &x1[10]);
213   btf_16_adds_subs_avx2(&x1[15], &x1[12]);
214   btf_16_adds_subs_avx2(&x1[14], &x1[13]);
215   btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
216   btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
217   btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
218   btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);
219 
220   // stage 5
221   btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
222   btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
223   btf_16_adds_subs_avx2(&x1[4], &x1[5]);
224   btf_16_adds_subs_avx2(&x1[7], &x1[6]);
225   btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
226   btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
227   btf_16_adds_subs_avx2(&x1[16], &x1[19]);
228   btf_16_adds_subs_avx2(&x1[17], &x1[18]);
229   btf_16_adds_subs_avx2(&x1[23], &x1[20]);
230   btf_16_adds_subs_avx2(&x1[22], &x1[21]);
231   btf_16_adds_subs_avx2(&x1[24], &x1[27]);
232   btf_16_adds_subs_avx2(&x1[25], &x1[26]);
233   btf_16_adds_subs_avx2(&x1[31], &x1[28]);
234   btf_16_adds_subs_avx2(&x1[30], &x1[29]);
235 
236   // stage 6
237   btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
238   btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
239   btf_16_adds_subs_avx2(&x1[8], &x1[9]);
240   btf_16_adds_subs_avx2(&x1[11], &x1[10]);
241   btf_16_adds_subs_avx2(&x1[12], &x1[13]);
242   btf_16_adds_subs_avx2(&x1[15], &x1[14]);
243   btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
244   btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
245   btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
246   btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);
247 
248   // stage 7
249   btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
250   btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
251   btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
252   btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
253   btf_16_adds_subs_avx2(&x1[16], &x1[17]);
254   btf_16_adds_subs_avx2(&x1[19], &x1[18]);
255   btf_16_adds_subs_avx2(&x1[20], &x1[21]);
256   btf_16_adds_subs_avx2(&x1[23], &x1[22]);
257   btf_16_adds_subs_avx2(&x1[24], &x1[25]);
258   btf_16_adds_subs_avx2(&x1[27], &x1[26]);
259   btf_16_adds_subs_avx2(&x1[28], &x1[29]);
260   btf_16_adds_subs_avx2(&x1[31], &x1[30]);
261 
262   // stage 8
263   btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
264   btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
265   btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
266   btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
267   btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
268   btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
269   btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
270   btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);
271 
272   // stage 9
273   output[0] = x1[0];
274   output[1] = x1[16];
275   output[2] = x1[8];
276   output[3] = x1[24];
277   output[4] = x1[4];
278   output[5] = x1[20];
279   output[6] = x1[12];
280   output[7] = x1[28];
281   output[8] = x1[2];
282   output[9] = x1[18];
283   output[10] = x1[10];
284   output[11] = x1[26];
285   output[12] = x1[6];
286   output[13] = x1[22];
287   output[14] = x1[14];
288   output[15] = x1[30];
289   output[16] = x1[1];
290   output[17] = x1[17];
291   output[18] = x1[9];
292   output[19] = x1[25];
293   output[20] = x1[5];
294   output[21] = x1[21];
295   output[22] = x1[13];
296   output[23] = x1[29];
297   output[24] = x1[3];
298   output[25] = x1[19];
299   output[26] = x1[11];
300   output[27] = x1[27];
301   output[28] = x1[7];
302   output[29] = x1[23];
303   output[30] = x1[15];
304   output[31] = x1[31];
305 }
306 
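// 64-point forward DCT on 16-bit lanes (16 columns per call), eleven stages.
// As above, the final stage stores the outputs in bit-reversed order.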
307 static inline void fdct16x64_new_avx2(const __m256i *input, __m256i *output,
308                                       int8_t cos_bit) {
309   const int32_t *cospi = cospi_arr(cos_bit);
310   const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
311 
312   __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
313   __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
314   __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
315   __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
316   __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
317   __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
318   __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
319   __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
320   __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
321   __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
322   __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
323   __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
324   __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
325   __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
326   __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
327   __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
328   __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
329   __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
330   __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
331   __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
332   __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
333   __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
334   __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
335   __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
336   __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
337   __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
338   __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
339   __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
340   __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
341   __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
342   __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
343   __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
344   __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
345   __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
346   __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
347   __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
348   __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
349   __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
350   __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
351   __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);
352   __m256i cospi_p63_p01 = pair_set_w16_epi16(cospi[63], cospi[1]);
353   __m256i cospi_m01_p63 = pair_set_w16_epi16(-cospi[1], cospi[63]);
354   __m256i cospi_p31_p33 = pair_set_w16_epi16(cospi[31], cospi[33]);
355   __m256i cospi_m33_p31 = pair_set_w16_epi16(-cospi[33], cospi[31]);
356   __m256i cospi_p47_p17 = pair_set_w16_epi16(cospi[47], cospi[17]);
357   __m256i cospi_m17_p47 = pair_set_w16_epi16(-cospi[17], cospi[47]);
358   __m256i cospi_p15_p49 = pair_set_w16_epi16(cospi[15], cospi[49]);
359   __m256i cospi_m49_p15 = pair_set_w16_epi16(-cospi[49], cospi[15]);
360   __m256i cospi_p55_p09 = pair_set_w16_epi16(cospi[55], cospi[9]);
361   __m256i cospi_m09_p55 = pair_set_w16_epi16(-cospi[9], cospi[55]);
362   __m256i cospi_p23_p41 = pair_set_w16_epi16(cospi[23], cospi[41]);
363   __m256i cospi_m41_p23 = pair_set_w16_epi16(-cospi[41], cospi[23]);
364   __m256i cospi_p39_p25 = pair_set_w16_epi16(cospi[39], cospi[25]);
365   __m256i cospi_m25_p39 = pair_set_w16_epi16(-cospi[25], cospi[39]);
366   __m256i cospi_p07_p57 = pair_set_w16_epi16(cospi[7], cospi[57]);
367   __m256i cospi_m57_p07 = pair_set_w16_epi16(-cospi[57], cospi[7]);
368   __m256i cospi_p59_p05 = pair_set_w16_epi16(cospi[59], cospi[5]);
369   __m256i cospi_m05_p59 = pair_set_w16_epi16(-cospi[5], cospi[59]);
370   __m256i cospi_p27_p37 = pair_set_w16_epi16(cospi[27], cospi[37]);
371   __m256i cospi_m37_p27 = pair_set_w16_epi16(-cospi[37], cospi[27]);
372   __m256i cospi_p43_p21 = pair_set_w16_epi16(cospi[43], cospi[21]);
373   __m256i cospi_m21_p43 = pair_set_w16_epi16(-cospi[21], cospi[43]);
374   __m256i cospi_p11_p53 = pair_set_w16_epi16(cospi[11], cospi[53]);
375   __m256i cospi_m53_p11 = pair_set_w16_epi16(-cospi[53], cospi[11]);
376   __m256i cospi_p51_p13 = pair_set_w16_epi16(cospi[51], cospi[13]);
377   __m256i cospi_m13_p51 = pair_set_w16_epi16(-cospi[13], cospi[51]);
378   __m256i cospi_p19_p45 = pair_set_w16_epi16(cospi[19], cospi[45]);
379   __m256i cospi_m45_p19 = pair_set_w16_epi16(-cospi[45], cospi[19]);
380   __m256i cospi_p35_p29 = pair_set_w16_epi16(cospi[35], cospi[29]);
381   __m256i cospi_m29_p35 = pair_set_w16_epi16(-cospi[29], cospi[35]);
382   __m256i cospi_p03_p61 = pair_set_w16_epi16(cospi[3], cospi[61]);
383   __m256i cospi_m61_p03 = pair_set_w16_epi16(-cospi[61], cospi[3]);
384 
385   // stage 1
386   __m256i x1[64];
387   btf_16_adds_subs_out_avx2(&x1[0], &x1[63], input[0], input[63]);
388   btf_16_adds_subs_out_avx2(&x1[1], &x1[62], input[1], input[62]);
389   btf_16_adds_subs_out_avx2(&x1[2], &x1[61], input[2], input[61]);
390   btf_16_adds_subs_out_avx2(&x1[3], &x1[60], input[3], input[60]);
391   btf_16_adds_subs_out_avx2(&x1[4], &x1[59], input[4], input[59]);
392   btf_16_adds_subs_out_avx2(&x1[5], &x1[58], input[5], input[58]);
393   btf_16_adds_subs_out_avx2(&x1[6], &x1[57], input[6], input[57]);
394   btf_16_adds_subs_out_avx2(&x1[7], &x1[56], input[7], input[56]);
395   btf_16_adds_subs_out_avx2(&x1[8], &x1[55], input[8], input[55]);
396   btf_16_adds_subs_out_avx2(&x1[9], &x1[54], input[9], input[54]);
397   btf_16_adds_subs_out_avx2(&x1[10], &x1[53], input[10], input[53]);
398   btf_16_adds_subs_out_avx2(&x1[11], &x1[52], input[11], input[52]);
399   btf_16_adds_subs_out_avx2(&x1[12], &x1[51], input[12], input[51]);
400   btf_16_adds_subs_out_avx2(&x1[13], &x1[50], input[13], input[50]);
401   btf_16_adds_subs_out_avx2(&x1[14], &x1[49], input[14], input[49]);
402   btf_16_adds_subs_out_avx2(&x1[15], &x1[48], input[15], input[48]);
403   btf_16_adds_subs_out_avx2(&x1[16], &x1[47], input[16], input[47]);
404   btf_16_adds_subs_out_avx2(&x1[17], &x1[46], input[17], input[46]);
405   btf_16_adds_subs_out_avx2(&x1[18], &x1[45], input[18], input[45]);
406   btf_16_adds_subs_out_avx2(&x1[19], &x1[44], input[19], input[44]);
407   btf_16_adds_subs_out_avx2(&x1[20], &x1[43], input[20], input[43]);
408   btf_16_adds_subs_out_avx2(&x1[21], &x1[42], input[21], input[42]);
409   btf_16_adds_subs_out_avx2(&x1[22], &x1[41], input[22], input[41]);
410   btf_16_adds_subs_out_avx2(&x1[23], &x1[40], input[23], input[40]);
411   btf_16_adds_subs_out_avx2(&x1[24], &x1[39], input[24], input[39]);
412   btf_16_adds_subs_out_avx2(&x1[25], &x1[38], input[25], input[38]);
413   btf_16_adds_subs_out_avx2(&x1[26], &x1[37], input[26], input[37]);
414   btf_16_adds_subs_out_avx2(&x1[27], &x1[36], input[27], input[36]);
415   btf_16_adds_subs_out_avx2(&x1[28], &x1[35], input[28], input[35]);
416   btf_16_adds_subs_out_avx2(&x1[29], &x1[34], input[29], input[34]);
417   btf_16_adds_subs_out_avx2(&x1[30], &x1[33], input[30], input[33]);
418   btf_16_adds_subs_out_avx2(&x1[31], &x1[32], input[31], input[32]);
419 
420   // stage 2
421   btf_16_adds_subs_avx2(&x1[0], &x1[31]);
422   btf_16_adds_subs_avx2(&x1[1], &x1[30]);
423   btf_16_adds_subs_avx2(&x1[2], &x1[29]);
424   btf_16_adds_subs_avx2(&x1[3], &x1[28]);
425   btf_16_adds_subs_avx2(&x1[4], &x1[27]);
426   btf_16_adds_subs_avx2(&x1[5], &x1[26]);
427   btf_16_adds_subs_avx2(&x1[6], &x1[25]);
428   btf_16_adds_subs_avx2(&x1[7], &x1[24]);
429   btf_16_adds_subs_avx2(&x1[8], &x1[23]);
430   btf_16_adds_subs_avx2(&x1[9], &x1[22]);
431   btf_16_adds_subs_avx2(&x1[10], &x1[21]);
432   btf_16_adds_subs_avx2(&x1[11], &x1[20]);
433   btf_16_adds_subs_avx2(&x1[12], &x1[19]);
434   btf_16_adds_subs_avx2(&x1[13], &x1[18]);
435   btf_16_adds_subs_avx2(&x1[14], &x1[17]);
436   btf_16_adds_subs_avx2(&x1[15], &x1[16]);
437   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[40], &x1[55], _r, cos_bit);
438   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[41], &x1[54], _r, cos_bit);
439   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[42], &x1[53], _r, cos_bit);
440   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[43], &x1[52], _r, cos_bit);
441   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[44], &x1[51], _r, cos_bit);
442   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[45], &x1[50], _r, cos_bit);
443   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[46], &x1[49], _r, cos_bit);
444   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[47], &x1[48], _r, cos_bit);
445 
446   // stage 3
447   btf_16_adds_subs_avx2(&x1[0], &x1[15]);
448   btf_16_adds_subs_avx2(&x1[1], &x1[14]);
449   btf_16_adds_subs_avx2(&x1[2], &x1[13]);
450   btf_16_adds_subs_avx2(&x1[3], &x1[12]);
451   btf_16_adds_subs_avx2(&x1[4], &x1[11]);
452   btf_16_adds_subs_avx2(&x1[5], &x1[10]);
453   btf_16_adds_subs_avx2(&x1[6], &x1[9]);
454   btf_16_adds_subs_avx2(&x1[7], &x1[8]);
455   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
456   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
457   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
458   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);
459   btf_16_adds_subs_avx2(&x1[32], &x1[47]);
460   btf_16_adds_subs_avx2(&x1[33], &x1[46]);
461   btf_16_adds_subs_avx2(&x1[34], &x1[45]);
462   btf_16_adds_subs_avx2(&x1[35], &x1[44]);
463   btf_16_adds_subs_avx2(&x1[36], &x1[43]);
464   btf_16_adds_subs_avx2(&x1[37], &x1[42]);
465   btf_16_adds_subs_avx2(&x1[38], &x1[41]);
466   btf_16_adds_subs_avx2(&x1[39], &x1[40]);
467   btf_16_adds_subs_avx2(&x1[63], &x1[48]);
468   btf_16_adds_subs_avx2(&x1[62], &x1[49]);
469   btf_16_adds_subs_avx2(&x1[61], &x1[50]);
470   btf_16_adds_subs_avx2(&x1[60], &x1[51]);
471   btf_16_adds_subs_avx2(&x1[59], &x1[52]);
472   btf_16_adds_subs_avx2(&x1[58], &x1[53]);
473   btf_16_adds_subs_avx2(&x1[57], &x1[54]);
474   btf_16_adds_subs_avx2(&x1[56], &x1[55]);
475 
476   // stage 4
477   btf_16_adds_subs_avx2(&x1[0], &x1[7]);
478   btf_16_adds_subs_avx2(&x1[1], &x1[6]);
479   btf_16_adds_subs_avx2(&x1[2], &x1[5]);
480   btf_16_adds_subs_avx2(&x1[3], &x1[4]);
481   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
482   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
483   btf_16_adds_subs_avx2(&x1[16], &x1[23]);
484   btf_16_adds_subs_avx2(&x1[17], &x1[22]);
485   btf_16_adds_subs_avx2(&x1[18], &x1[21]);
486   btf_16_adds_subs_avx2(&x1[19], &x1[20]);
487   btf_16_adds_subs_avx2(&x1[31], &x1[24]);
488   btf_16_adds_subs_avx2(&x1[30], &x1[25]);
489   btf_16_adds_subs_avx2(&x1[29], &x1[26]);
490   btf_16_adds_subs_avx2(&x1[28], &x1[27]);
491   btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[36], &x1[59], _r, cos_bit);
492   btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[37], &x1[58], _r, cos_bit);
493   btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[38], &x1[57], _r, cos_bit);
494   btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[39], &x1[56], _r, cos_bit);
495   btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[40], &x1[55], _r, cos_bit);
496   btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[41], &x1[54], _r, cos_bit);
497   btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[42], &x1[53], _r, cos_bit);
498   btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[43], &x1[52], _r, cos_bit);
499 
500   // stage 5
501   btf_16_adds_subs_avx2(&x1[0], &x1[3]);
502   btf_16_adds_subs_avx2(&x1[1], &x1[2]);
503   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
504   btf_16_adds_subs_avx2(&x1[8], &x1[11]);
505   btf_16_adds_subs_avx2(&x1[9], &x1[10]);
506   btf_16_adds_subs_avx2(&x1[15], &x1[12]);
507   btf_16_adds_subs_avx2(&x1[14], &x1[13]);
508   btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
509   btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
510   btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
511   btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);
512   btf_16_adds_subs_avx2(&x1[32], &x1[39]);
513   btf_16_adds_subs_avx2(&x1[33], &x1[38]);
514   btf_16_adds_subs_avx2(&x1[34], &x1[37]);
515   btf_16_adds_subs_avx2(&x1[35], &x1[36]);
516   btf_16_adds_subs_avx2(&x1[47], &x1[40]);
517   btf_16_adds_subs_avx2(&x1[46], &x1[41]);
518   btf_16_adds_subs_avx2(&x1[45], &x1[42]);
519   btf_16_adds_subs_avx2(&x1[44], &x1[43]);
520   btf_16_adds_subs_avx2(&x1[48], &x1[55]);
521   btf_16_adds_subs_avx2(&x1[49], &x1[54]);
522   btf_16_adds_subs_avx2(&x1[50], &x1[53]);
523   btf_16_adds_subs_avx2(&x1[51], &x1[52]);
524   btf_16_adds_subs_avx2(&x1[63], &x1[56]);
525   btf_16_adds_subs_avx2(&x1[62], &x1[57]);
526   btf_16_adds_subs_avx2(&x1[61], &x1[58]);
527   btf_16_adds_subs_avx2(&x1[60], &x1[59]);
528 
529   // stage 6
530   btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
531   btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
532   btf_16_adds_subs_avx2(&x1[4], &x1[5]);
533   btf_16_adds_subs_avx2(&x1[7], &x1[6]);
534   btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
535   btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
536   btf_16_adds_subs_avx2(&x1[16], &x1[19]);
537   btf_16_adds_subs_avx2(&x1[17], &x1[18]);
538   btf_16_adds_subs_avx2(&x1[23], &x1[20]);
539   btf_16_adds_subs_avx2(&x1[22], &x1[21]);
540   btf_16_adds_subs_avx2(&x1[24], &x1[27]);
541   btf_16_adds_subs_avx2(&x1[25], &x1[26]);
542   btf_16_adds_subs_avx2(&x1[31], &x1[28]);
543   btf_16_adds_subs_avx2(&x1[30], &x1[29]);
544   btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[34], &x1[61], _r, cos_bit);
545   btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[35], &x1[60], _r, cos_bit);
546   btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[36], &x1[59], _r, cos_bit);
547   btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[37], &x1[58], _r, cos_bit);
548   btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[42], &x1[53], _r, cos_bit);
549   btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[43], &x1[52], _r, cos_bit);
550   btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[44], &x1[51], _r, cos_bit);
551   btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[45], &x1[50], _r, cos_bit);
552 
553   // stage 7
554   btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
555   btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
556   btf_16_adds_subs_avx2(&x1[8], &x1[9]);
557   btf_16_adds_subs_avx2(&x1[11], &x1[10]);
558   btf_16_adds_subs_avx2(&x1[12], &x1[13]);
559   btf_16_adds_subs_avx2(&x1[15], &x1[14]);
560   btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
561   btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
562   btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
563   btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);
564   btf_16_adds_subs_avx2(&x1[32], &x1[35]);
565   btf_16_adds_subs_avx2(&x1[33], &x1[34]);
566   btf_16_adds_subs_avx2(&x1[39], &x1[36]);
567   btf_16_adds_subs_avx2(&x1[38], &x1[37]);
568   btf_16_adds_subs_avx2(&x1[40], &x1[43]);
569   btf_16_adds_subs_avx2(&x1[41], &x1[42]);
570   btf_16_adds_subs_avx2(&x1[47], &x1[44]);
571   btf_16_adds_subs_avx2(&x1[46], &x1[45]);
572   btf_16_adds_subs_avx2(&x1[48], &x1[51]);
573   btf_16_adds_subs_avx2(&x1[49], &x1[50]);
574   btf_16_adds_subs_avx2(&x1[55], &x1[52]);
575   btf_16_adds_subs_avx2(&x1[54], &x1[53]);
576   btf_16_adds_subs_avx2(&x1[56], &x1[59]);
577   btf_16_adds_subs_avx2(&x1[57], &x1[58]);
578   btf_16_adds_subs_avx2(&x1[63], &x1[60]);
579   btf_16_adds_subs_avx2(&x1[62], &x1[61]);
580 
581   // stage 8
582   btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
583   btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
584   btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
585   btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
586   btf_16_adds_subs_avx2(&x1[16], &x1[17]);
587   btf_16_adds_subs_avx2(&x1[19], &x1[18]);
588   btf_16_adds_subs_avx2(&x1[20], &x1[21]);
589   btf_16_adds_subs_avx2(&x1[23], &x1[22]);
590   btf_16_adds_subs_avx2(&x1[24], &x1[25]);
591   btf_16_adds_subs_avx2(&x1[27], &x1[26]);
592   btf_16_adds_subs_avx2(&x1[28], &x1[29]);
593   btf_16_adds_subs_avx2(&x1[31], &x1[30]);
594   btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x1[33], &x1[62], _r, cos_bit);
595   btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x1[34], &x1[61], _r, cos_bit);
596   btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x1[37], &x1[58], _r, cos_bit);
597   btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x1[38], &x1[57], _r, cos_bit);
598   btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x1[41], &x1[54], _r, cos_bit);
599   btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x1[42], &x1[53], _r, cos_bit);
600   btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x1[45], &x1[50], _r, cos_bit);
601   btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x1[46], &x1[49], _r, cos_bit);
602 
603   // stage 9
604   btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
605   btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
606   btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
607   btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
608   btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
609   btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
610   btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
611   btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);
612   btf_16_adds_subs_avx2(&x1[32], &x1[33]);
613   btf_16_adds_subs_avx2(&x1[35], &x1[34]);
614   btf_16_adds_subs_avx2(&x1[36], &x1[37]);
615   btf_16_adds_subs_avx2(&x1[39], &x1[38]);
616   btf_16_adds_subs_avx2(&x1[40], &x1[41]);
617   btf_16_adds_subs_avx2(&x1[43], &x1[42]);
618   btf_16_adds_subs_avx2(&x1[44], &x1[45]);
619   btf_16_adds_subs_avx2(&x1[47], &x1[46]);
620   btf_16_adds_subs_avx2(&x1[48], &x1[49]);
621   btf_16_adds_subs_avx2(&x1[51], &x1[50]);
622   btf_16_adds_subs_avx2(&x1[52], &x1[53]);
623   btf_16_adds_subs_avx2(&x1[55], &x1[54]);
624   btf_16_adds_subs_avx2(&x1[56], &x1[57]);
625   btf_16_adds_subs_avx2(&x1[59], &x1[58]);
626   btf_16_adds_subs_avx2(&x1[60], &x1[61]);
627   btf_16_adds_subs_avx2(&x1[63], &x1[62]);
628 
629   // stage 10
630   btf_16_w16_avx2(cospi_p63_p01, cospi_m01_p63, &x1[32], &x1[63], _r, cos_bit);
631   btf_16_w16_avx2(cospi_p31_p33, cospi_m33_p31, &x1[33], &x1[62], _r, cos_bit);
632   btf_16_w16_avx2(cospi_p47_p17, cospi_m17_p47, &x1[34], &x1[61], _r, cos_bit);
633   btf_16_w16_avx2(cospi_p15_p49, cospi_m49_p15, &x1[35], &x1[60], _r, cos_bit);
634   btf_16_w16_avx2(cospi_p55_p09, cospi_m09_p55, &x1[36], &x1[59], _r, cos_bit);
635   btf_16_w16_avx2(cospi_p23_p41, cospi_m41_p23, &x1[37], &x1[58], _r, cos_bit);
636   btf_16_w16_avx2(cospi_p39_p25, cospi_m25_p39, &x1[38], &x1[57], _r, cos_bit);
637   btf_16_w16_avx2(cospi_p07_p57, cospi_m57_p07, &x1[39], &x1[56], _r, cos_bit);
638   btf_16_w16_avx2(cospi_p59_p05, cospi_m05_p59, &x1[40], &x1[55], _r, cos_bit);
639   btf_16_w16_avx2(cospi_p27_p37, cospi_m37_p27, &x1[41], &x1[54], _r, cos_bit);
640   btf_16_w16_avx2(cospi_p43_p21, cospi_m21_p43, &x1[42], &x1[53], _r, cos_bit);
641   btf_16_w16_avx2(cospi_p11_p53, cospi_m53_p11, &x1[43], &x1[52], _r, cos_bit);
642   btf_16_w16_avx2(cospi_p51_p13, cospi_m13_p51, &x1[44], &x1[51], _r, cos_bit);
643   btf_16_w16_avx2(cospi_p19_p45, cospi_m45_p19, &x1[45], &x1[50], _r, cos_bit);
644   btf_16_w16_avx2(cospi_p35_p29, cospi_m29_p35, &x1[46], &x1[49], _r, cos_bit);
645   btf_16_w16_avx2(cospi_p03_p61, cospi_m61_p03, &x1[47], &x1[48], _r, cos_bit);
646 
647   // stage 11
648   output[0] = x1[0];
649   output[1] = x1[32];
650   output[2] = x1[16];
651   output[3] = x1[48];
652   output[4] = x1[8];
653   output[5] = x1[40];
654   output[6] = x1[24];
655   output[7] = x1[56];
656   output[8] = x1[4];
657   output[9] = x1[36];
658   output[10] = x1[20];
659   output[11] = x1[52];
660   output[12] = x1[12];
661   output[13] = x1[44];
662   output[14] = x1[28];
663   output[15] = x1[60];
664   output[16] = x1[2];
665   output[17] = x1[34];
666   output[18] = x1[18];
667   output[19] = x1[50];
668   output[20] = x1[10];
669   output[21] = x1[42];
670   output[22] = x1[26];
671   output[23] = x1[58];
672   output[24] = x1[6];
673   output[25] = x1[38];
674   output[26] = x1[22];
675   output[27] = x1[54];
676   output[28] = x1[14];
677   output[29] = x1[46];
678   output[30] = x1[30];
679   output[31] = x1[62];
680   output[32] = x1[1];
681   output[33] = x1[33];
682   output[34] = x1[17];
683   output[35] = x1[49];
684   output[36] = x1[9];
685   output[37] = x1[41];
686   output[38] = x1[25];
687   output[39] = x1[57];
688   output[40] = x1[5];
689   output[41] = x1[37];
690   output[42] = x1[21];
691   output[43] = x1[53];
692   output[44] = x1[13];
693   output[45] = x1[45];
694   output[46] = x1[29];
695   output[47] = x1[61];
696   output[48] = x1[3];
697   output[49] = x1[35];
698   output[50] = x1[19];
699   output[51] = x1[51];
700   output[52] = x1[11];
701   output[53] = x1[43];
702   output[54] = x1[27];
703   output[55] = x1[59];
704   output[56] = x1[7];
705   output[57] = x1[39];
706   output[58] = x1[23];
707   output[59] = x1[55];
708   output[60] = x1[15];
709   output[61] = x1[47];
710   output[62] = x1[31];
711   output[63] = x1[63];
712 }
713 
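// 32-point forward DCT that keeps intermediates in 32-bit lanes (8 columns per
// __m256i). btf_32_add_sub_avx2 is the plain add/sub butterfly; the
// btf_32_avx2_type0/type1 helpers apply the cosine rotations with 32-bit
// multiplies, rounding by _r and shifting by cos_bit.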
714 static inline void fdct32_avx2(const __m256i *input, __m256i *output,
715                                int8_t cos_bit) {
716   __m256i x1[32];
717   const int32_t *cospi = cospi_arr(cos_bit);
718   const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
719   // stage 0
720   // stage 1
721   btf_32_add_sub_out_avx2(&x1[0], &x1[31], input[0], input[31]);
722   btf_32_add_sub_out_avx2(&x1[1], &x1[30], input[1], input[30]);
723   btf_32_add_sub_out_avx2(&x1[2], &x1[29], input[2], input[29]);
724   btf_32_add_sub_out_avx2(&x1[3], &x1[28], input[3], input[28]);
725   btf_32_add_sub_out_avx2(&x1[4], &x1[27], input[4], input[27]);
726   btf_32_add_sub_out_avx2(&x1[5], &x1[26], input[5], input[26]);
727   btf_32_add_sub_out_avx2(&x1[6], &x1[25], input[6], input[25]);
728   btf_32_add_sub_out_avx2(&x1[7], &x1[24], input[7], input[24]);
729   btf_32_add_sub_out_avx2(&x1[8], &x1[23], input[8], input[23]);
730   btf_32_add_sub_out_avx2(&x1[9], &x1[22], input[9], input[22]);
731   btf_32_add_sub_out_avx2(&x1[10], &x1[21], input[10], input[21]);
732   btf_32_add_sub_out_avx2(&x1[11], &x1[20], input[11], input[20]);
733   btf_32_add_sub_out_avx2(&x1[12], &x1[19], input[12], input[19]);
734   btf_32_add_sub_out_avx2(&x1[13], &x1[18], input[13], input[18]);
735   btf_32_add_sub_out_avx2(&x1[14], &x1[17], input[14], input[17]);
736   btf_32_add_sub_out_avx2(&x1[15], &x1[16], input[15], input[16]);
737 
738   // stage 2
739   btf_32_add_sub_avx2(&x1[0], &x1[15]);
740   btf_32_add_sub_avx2(&x1[1], &x1[14]);
741   btf_32_add_sub_avx2(&x1[2], &x1[13]);
742   btf_32_add_sub_avx2(&x1[3], &x1[12]);
743   btf_32_add_sub_avx2(&x1[4], &x1[11]);
744   btf_32_add_sub_avx2(&x1[5], &x1[10]);
745   btf_32_add_sub_avx2(&x1[6], &x1[9]);
746   btf_32_add_sub_avx2(&x1[7], &x1[8]);
747   btf_32_avx2_type0(-cospi[32], cospi[32], &x1[20], &x1[27], _r, cos_bit);
748   btf_32_avx2_type0(-cospi[32], cospi[32], &x1[21], &x1[26], _r, cos_bit);
749   btf_32_avx2_type0(-cospi[32], cospi[32], &x1[22], &x1[25], _r, cos_bit);
750   btf_32_avx2_type0(-cospi[32], cospi[32], &x1[23], &x1[24], _r, cos_bit);
751 
752   // stage 3
753   btf_32_add_sub_avx2(&x1[0], &x1[7]);
754   btf_32_add_sub_avx2(&x1[1], &x1[6]);
755   btf_32_add_sub_avx2(&x1[2], &x1[5]);
756   btf_32_add_sub_avx2(&x1[3], &x1[4]);
757   btf_32_avx2_type0(-cospi[32], cospi[32], &x1[10], &x1[13], _r, cos_bit);
758   btf_32_avx2_type0(-cospi[32], cospi[32], &x1[11], &x1[12], _r, cos_bit);
759   btf_32_add_sub_avx2(&x1[16], &x1[23]);
760   btf_32_add_sub_avx2(&x1[17], &x1[22]);
761   btf_32_add_sub_avx2(&x1[18], &x1[21]);
762   btf_32_add_sub_avx2(&x1[19], &x1[20]);
763   btf_32_add_sub_avx2(&x1[31], &x1[24]);
764   btf_32_add_sub_avx2(&x1[30], &x1[25]);
765   btf_32_add_sub_avx2(&x1[29], &x1[26]);
766   btf_32_add_sub_avx2(&x1[28], &x1[27]);
767 
768   // stage 4
769   btf_32_add_sub_avx2(&x1[0], &x1[3]);
770   btf_32_add_sub_avx2(&x1[1], &x1[2]);
771   btf_32_avx2_type0(-cospi[32], cospi[32], &x1[5], &x1[6], _r, cos_bit);
772   btf_32_add_sub_avx2(&x1[8], &x1[11]);
773   btf_32_add_sub_avx2(&x1[9], &x1[10]);
774   btf_32_add_sub_avx2(&x1[15], &x1[12]);
775   btf_32_add_sub_avx2(&x1[14], &x1[13]);
776   btf_32_avx2_type0(-cospi[16], cospi[48], &x1[18], &x1[29], _r, cos_bit);
777   btf_32_avx2_type0(-cospi[16], cospi[48], &x1[19], &x1[28], _r, cos_bit);
778   btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[20], &x1[27], _r, cos_bit);
779   btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[21], &x1[26], _r, cos_bit);
780 
781   // stage 5
782   btf_32_avx2_type0(cospi[32], cospi[32], &x1[0], &x1[1], _r, cos_bit);
783   btf_32_avx2_type1(cospi[48], cospi[16], &x1[2], &x1[3], _r, cos_bit);
784   btf_32_add_sub_avx2(&x1[4], &x1[5]);
785   btf_32_add_sub_avx2(&x1[7], &x1[6]);
786   btf_32_avx2_type0(-cospi[16], cospi[48], &x1[9], &x1[14], _r, cos_bit);
787   btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[10], &x1[13], _r, cos_bit);
788   btf_32_add_sub_avx2(&x1[16], &x1[19]);
789   btf_32_add_sub_avx2(&x1[17], &x1[18]);
790   btf_32_add_sub_avx2(&x1[23], &x1[20]);
791   btf_32_add_sub_avx2(&x1[22], &x1[21]);
792   btf_32_add_sub_avx2(&x1[24], &x1[27]);
793   btf_32_add_sub_avx2(&x1[25], &x1[26]);
794   btf_32_add_sub_avx2(&x1[31], &x1[28]);
795   btf_32_add_sub_avx2(&x1[30], &x1[29]);
796 
797   // stage 6
798   btf_32_avx2_type1(cospi[56], cospi[8], &x1[4], &x1[7], _r, cos_bit);
799   btf_32_avx2_type1(cospi[24], cospi[40], &x1[5], &x1[6], _r, cos_bit);
800   btf_32_add_sub_avx2(&x1[8], &x1[9]);
801   btf_32_add_sub_avx2(&x1[11], &x1[10]);
802   btf_32_add_sub_avx2(&x1[12], &x1[13]);
803   btf_32_add_sub_avx2(&x1[15], &x1[14]);
804   btf_32_avx2_type0(-cospi[8], cospi[56], &x1[17], &x1[30], _r, cos_bit);
805   btf_32_avx2_type0(-cospi[56], -cospi[8], &x1[18], &x1[29], _r, cos_bit);
806   btf_32_avx2_type0(-cospi[40], cospi[24], &x1[21], &x1[26], _r, cos_bit);
807   btf_32_avx2_type0(-cospi[24], -cospi[40], &x1[22], &x1[25], _r, cos_bit);
808 
809   // stage 7
810   btf_32_avx2_type1(cospi[60], cospi[4], &x1[8], &x1[15], _r, cos_bit);
811   btf_32_avx2_type1(cospi[28], cospi[36], &x1[9], &x1[14], _r, cos_bit);
812   btf_32_avx2_type1(cospi[44], cospi[20], &x1[10], &x1[13], _r, cos_bit);
813   btf_32_avx2_type1(cospi[12], cospi[52], &x1[11], &x1[12], _r, cos_bit);
814   btf_32_add_sub_avx2(&x1[16], &x1[17]);
815   btf_32_add_sub_avx2(&x1[19], &x1[18]);
816   btf_32_add_sub_avx2(&x1[20], &x1[21]);
817   btf_32_add_sub_avx2(&x1[23], &x1[22]);
818   btf_32_add_sub_avx2(&x1[24], &x1[25]);
819   btf_32_add_sub_avx2(&x1[27], &x1[26]);
820   btf_32_add_sub_avx2(&x1[28], &x1[29]);
821   btf_32_add_sub_avx2(&x1[31], &x1[30]);
822 
823   // stage 8
824   btf_32_avx2_type1(cospi[62], cospi[2], &x1[16], &x1[31], _r, cos_bit);
825   btf_32_avx2_type1(cospi[30], cospi[34], &x1[17], &x1[30], _r, cos_bit);
826   btf_32_avx2_type1(cospi[46], cospi[18], &x1[18], &x1[29], _r, cos_bit);
827   btf_32_avx2_type1(cospi[14], cospi[50], &x1[19], &x1[28], _r, cos_bit);
828   btf_32_avx2_type1(cospi[54], cospi[10], &x1[20], &x1[27], _r, cos_bit);
829   btf_32_avx2_type1(cospi[22], cospi[42], &x1[21], &x1[26], _r, cos_bit);
830   btf_32_avx2_type1(cospi[38], cospi[26], &x1[22], &x1[25], _r, cos_bit);
831   btf_32_avx2_type1(cospi[6], cospi[58], &x1[23], &x1[24], _r, cos_bit);
832 
833   // stage 9
834   output[0] = x1[0];
835   output[1] = x1[16];
836   output[2] = x1[8];
837   output[3] = x1[24];
838   output[4] = x1[4];
839   output[5] = x1[20];
840   output[6] = x1[12];
841   output[7] = x1[28];
842   output[8] = x1[2];
843   output[9] = x1[18];
844   output[10] = x1[10];
845   output[11] = x1[26];
846   output[12] = x1[6];
847   output[13] = x1[22];
848   output[14] = x1[14];
849   output[15] = x1[30];
850   output[16] = x1[1];
851   output[17] = x1[17];
852   output[18] = x1[9];
853   output[19] = x1[25];
854   output[20] = x1[5];
855   output[21] = x1[21];
856   output[22] = x1[13];
857   output[23] = x1[29];
858   output[24] = x1[3];
859   output[25] = x1[19];
860   output[26] = x1[11];
861   output[27] = x1[27];
862   output[28] = x1[7];
863   output[29] = x1[23];
864   output[30] = x1[15];
865   output[31] = x1[31];
866 }
867 
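// 64-point forward DCT on 32-bit lanes (8 columns per __m256i). The cosine
// constants are broadcast with _mm256_set1_epi32 and passed to the *_new
// rotation helpers as vector operands; the stage structure otherwise mirrors
// fdct16x64_new_avx2.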
868 static inline void fdct64_new_avx2(const __m256i *input, __m256i *output,
869                                    int8_t cos_bit) {
870   const int32_t *cospi = cospi_arr(cos_bit);
871   const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
872 
873   __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
874   __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
875   __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
876   __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
877   __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
878   __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
879   __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
880   __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
881   __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
882   __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
883   __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
884   __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
885   __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
886   __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
887   __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
888   __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
889   __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
890   __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
891   __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
892   __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
893   __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
894   __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
895   __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
896   __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
897   __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
898   __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
899   __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
900   __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
901   __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
902   __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);
903   __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
904   __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
905   __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
906   __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
907   __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
908   __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
909   __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
910   __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
911   __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
912   __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
913   __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
914   __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
915   __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
916   __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
917   __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
918   __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
919   __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
920   __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
921   __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
922   __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]);
923   __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
924   __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
925   __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
926   __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]);
927   __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
928   __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
929   __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
930   __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]);
931   __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
932   __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
933   __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
934   __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]);
935   __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
936   __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
937   __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
938   __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]);
939   __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
940   __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
941   __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
942   __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]);
943   __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
944   __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
945   __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
946   __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]);
947   __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
948   __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
949   __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
950   __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]);
951 
952   // stage 1
953   __m256i x1[64];
954   btf_32_add_sub_out_avx2(&x1[0], &x1[63], input[0], input[63]);
955   btf_32_add_sub_out_avx2(&x1[1], &x1[62], input[1], input[62]);
956   btf_32_add_sub_out_avx2(&x1[2], &x1[61], input[2], input[61]);
957   btf_32_add_sub_out_avx2(&x1[3], &x1[60], input[3], input[60]);
958   btf_32_add_sub_out_avx2(&x1[4], &x1[59], input[4], input[59]);
959   btf_32_add_sub_out_avx2(&x1[5], &x1[58], input[5], input[58]);
960   btf_32_add_sub_out_avx2(&x1[6], &x1[57], input[6], input[57]);
961   btf_32_add_sub_out_avx2(&x1[7], &x1[56], input[7], input[56]);
962   btf_32_add_sub_out_avx2(&x1[8], &x1[55], input[8], input[55]);
963   btf_32_add_sub_out_avx2(&x1[9], &x1[54], input[9], input[54]);
964   btf_32_add_sub_out_avx2(&x1[10], &x1[53], input[10], input[53]);
965   btf_32_add_sub_out_avx2(&x1[11], &x1[52], input[11], input[52]);
966   btf_32_add_sub_out_avx2(&x1[12], &x1[51], input[12], input[51]);
967   btf_32_add_sub_out_avx2(&x1[13], &x1[50], input[13], input[50]);
968   btf_32_add_sub_out_avx2(&x1[14], &x1[49], input[14], input[49]);
969   btf_32_add_sub_out_avx2(&x1[15], &x1[48], input[15], input[48]);
970   btf_32_add_sub_out_avx2(&x1[16], &x1[47], input[16], input[47]);
971   btf_32_add_sub_out_avx2(&x1[17], &x1[46], input[17], input[46]);
972   btf_32_add_sub_out_avx2(&x1[18], &x1[45], input[18], input[45]);
973   btf_32_add_sub_out_avx2(&x1[19], &x1[44], input[19], input[44]);
974   btf_32_add_sub_out_avx2(&x1[20], &x1[43], input[20], input[43]);
975   btf_32_add_sub_out_avx2(&x1[21], &x1[42], input[21], input[42]);
976   btf_32_add_sub_out_avx2(&x1[22], &x1[41], input[22], input[41]);
977   btf_32_add_sub_out_avx2(&x1[23], &x1[40], input[23], input[40]);
978   btf_32_add_sub_out_avx2(&x1[24], &x1[39], input[24], input[39]);
979   btf_32_add_sub_out_avx2(&x1[25], &x1[38], input[25], input[38]);
980   btf_32_add_sub_out_avx2(&x1[26], &x1[37], input[26], input[37]);
981   btf_32_add_sub_out_avx2(&x1[27], &x1[36], input[27], input[36]);
982   btf_32_add_sub_out_avx2(&x1[28], &x1[35], input[28], input[35]);
983   btf_32_add_sub_out_avx2(&x1[29], &x1[34], input[29], input[34]);
984   btf_32_add_sub_out_avx2(&x1[30], &x1[33], input[30], input[33]);
985   btf_32_add_sub_out_avx2(&x1[31], &x1[32], input[31], input[32]);
986 
987   // stage 2
988   btf_32_add_sub_avx2(&x1[0], &x1[31]);
989   btf_32_add_sub_avx2(&x1[1], &x1[30]);
990   btf_32_add_sub_avx2(&x1[2], &x1[29]);
991   btf_32_add_sub_avx2(&x1[3], &x1[28]);
992   btf_32_add_sub_avx2(&x1[4], &x1[27]);
993   btf_32_add_sub_avx2(&x1[5], &x1[26]);
994   btf_32_add_sub_avx2(&x1[6], &x1[25]);
995   btf_32_add_sub_avx2(&x1[7], &x1[24]);
996   btf_32_add_sub_avx2(&x1[8], &x1[23]);
997   btf_32_add_sub_avx2(&x1[9], &x1[22]);
998   btf_32_add_sub_avx2(&x1[10], &x1[21]);
999   btf_32_add_sub_avx2(&x1[11], &x1[20]);
1000   btf_32_add_sub_avx2(&x1[12], &x1[19]);
1001   btf_32_add_sub_avx2(&x1[13], &x1[18]);
1002   btf_32_add_sub_avx2(&x1[14], &x1[17]);
1003   btf_32_add_sub_avx2(&x1[15], &x1[16]);
1004   btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[40], &x1[55], _r, cos_bit);
1005   btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[41], &x1[54], _r, cos_bit);
1006   btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[42], &x1[53], _r, cos_bit);
1007   btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[43], &x1[52], _r, cos_bit);
1008   btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[44], &x1[51], _r, cos_bit);
1009   btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[45], &x1[50], _r, cos_bit);
1010   btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[46], &x1[49], _r, cos_bit);
1011   btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[47], &x1[48], _r, cos_bit);
1012 
1013   // stage 3
1014   btf_32_add_sub_avx2(&x1[0], &x1[15]);
1015   btf_32_add_sub_avx2(&x1[1], &x1[14]);
1016   btf_32_add_sub_avx2(&x1[2], &x1[13]);
1017   btf_32_add_sub_avx2(&x1[3], &x1[12]);
1018   btf_32_add_sub_avx2(&x1[4], &x1[11]);
1019   btf_32_add_sub_avx2(&x1[5], &x1[10]);
1020   btf_32_add_sub_avx2(&x1[6], &x1[9]);
1021   btf_32_add_sub_avx2(&x1[7], &x1[8]);
1022   btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[20], &x1[27], _r, cos_bit);
1023   btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[21], &x1[26], _r, cos_bit);
1024   btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[22], &x1[25], _r, cos_bit);
1025   btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[23], &x1[24], _r, cos_bit);
1026   btf_32_add_sub_avx2(&x1[32], &x1[47]);
1027   btf_32_add_sub_avx2(&x1[33], &x1[46]);
1028   btf_32_add_sub_avx2(&x1[34], &x1[45]);
1029   btf_32_add_sub_avx2(&x1[35], &x1[44]);
1030   btf_32_add_sub_avx2(&x1[36], &x1[43]);
1031   btf_32_add_sub_avx2(&x1[37], &x1[42]);
1032   btf_32_add_sub_avx2(&x1[38], &x1[41]);
1033   btf_32_add_sub_avx2(&x1[39], &x1[40]);
1034   btf_32_add_sub_avx2(&x1[63], &x1[48]);
1035   btf_32_add_sub_avx2(&x1[62], &x1[49]);
1036   btf_32_add_sub_avx2(&x1[61], &x1[50]);
1037   btf_32_add_sub_avx2(&x1[60], &x1[51]);
1038   btf_32_add_sub_avx2(&x1[59], &x1[52]);
1039   btf_32_add_sub_avx2(&x1[58], &x1[53]);
1040   btf_32_add_sub_avx2(&x1[57], &x1[54]);
1041   btf_32_add_sub_avx2(&x1[56], &x1[55]);
1042 
1043   // stage 4
1044   btf_32_add_sub_avx2(&x1[0], &x1[7]);
1045   btf_32_add_sub_avx2(&x1[1], &x1[6]);
1046   btf_32_add_sub_avx2(&x1[2], &x1[5]);
1047   btf_32_add_sub_avx2(&x1[3], &x1[4]);
1048   btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[10], &x1[13], _r, cos_bit);
1049   btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[11], &x1[12], _r, cos_bit);
1050   btf_32_add_sub_avx2(&x1[16], &x1[23]);
1051   btf_32_add_sub_avx2(&x1[17], &x1[22]);
1052   btf_32_add_sub_avx2(&x1[18], &x1[21]);
1053   btf_32_add_sub_avx2(&x1[19], &x1[20]);
1054   btf_32_add_sub_avx2(&x1[31], &x1[24]);
1055   btf_32_add_sub_avx2(&x1[30], &x1[25]);
1056   btf_32_add_sub_avx2(&x1[29], &x1[26]);
1057   btf_32_add_sub_avx2(&x1[28], &x1[27]);
1058   btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[36], &x1[59], _r, cos_bit);
1059   btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[37], &x1[58], _r, cos_bit);
1060   btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[38], &x1[57], _r, cos_bit);
1061   btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[39], &x1[56], _r, cos_bit);
1062   btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[40], &x1[55], _r, cos_bit);
1063   btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[41], &x1[54], _r, cos_bit);
1064   btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[42], &x1[53], _r, cos_bit);
1065   btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[43], &x1[52], _r, cos_bit);
1066 
1067   // stage 5
1068   btf_32_add_sub_avx2(&x1[0], &x1[3]);
1069   btf_32_add_sub_avx2(&x1[1], &x1[2]);
1070   btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[5], &x1[6], _r, cos_bit);
1071   btf_32_add_sub_avx2(&x1[8], &x1[11]);
1072   btf_32_add_sub_avx2(&x1[9], &x1[10]);
1073   btf_32_add_sub_avx2(&x1[15], &x1[12]);
1074   btf_32_add_sub_avx2(&x1[14], &x1[13]);
1075   btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[18], &x1[29], _r, cos_bit);
1076   btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[19], &x1[28], _r, cos_bit);
1077   btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[20], &x1[27], _r, cos_bit);
1078   btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[21], &x1[26], _r, cos_bit);
1079   btf_32_add_sub_avx2(&x1[32], &x1[39]);
1080   btf_32_add_sub_avx2(&x1[33], &x1[38]);
1081   btf_32_add_sub_avx2(&x1[34], &x1[37]);
1082   btf_32_add_sub_avx2(&x1[35], &x1[36]);
1083   btf_32_add_sub_avx2(&x1[47], &x1[40]);
1084   btf_32_add_sub_avx2(&x1[46], &x1[41]);
1085   btf_32_add_sub_avx2(&x1[45], &x1[42]);
1086   btf_32_add_sub_avx2(&x1[44], &x1[43]);
1087   btf_32_add_sub_avx2(&x1[48], &x1[55]);
1088   btf_32_add_sub_avx2(&x1[49], &x1[54]);
1089   btf_32_add_sub_avx2(&x1[50], &x1[53]);
1090   btf_32_add_sub_avx2(&x1[51], &x1[52]);
1091   btf_32_add_sub_avx2(&x1[63], &x1[56]);
1092   btf_32_add_sub_avx2(&x1[62], &x1[57]);
1093   btf_32_add_sub_avx2(&x1[61], &x1[58]);
1094   btf_32_add_sub_avx2(&x1[60], &x1[59]);
1095 
1096   // stage 6
1097   btf_32_avx2_type0_new(cospi_p32, cospi_p32, &x1[0], &x1[1], _r, cos_bit);
1098   btf_32_avx2_type1_new(cospi_p48, cospi_p16, &x1[2], &x1[3], _r, cos_bit);
1099   btf_32_add_sub_avx2(&x1[4], &x1[5]);
1100   btf_32_add_sub_avx2(&x1[7], &x1[6]);
1101   btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[9], &x1[14], _r, cos_bit);
1102   btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[10], &x1[13], _r, cos_bit);
1103   btf_32_add_sub_avx2(&x1[16], &x1[19]);
1104   btf_32_add_sub_avx2(&x1[17], &x1[18]);
1105   btf_32_add_sub_avx2(&x1[23], &x1[20]);
1106   btf_32_add_sub_avx2(&x1[22], &x1[21]);
1107   btf_32_add_sub_avx2(&x1[24], &x1[27]);
1108   btf_32_add_sub_avx2(&x1[25], &x1[26]);
1109   btf_32_add_sub_avx2(&x1[31], &x1[28]);
1110   btf_32_add_sub_avx2(&x1[30], &x1[29]);
1111   btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[34], &x1[61], _r, cos_bit);
1112   btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[35], &x1[60], _r, cos_bit);
1113   btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[36], &x1[59], _r, cos_bit);
1114   btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[37], &x1[58], _r, cos_bit);
1115   btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[42], &x1[53], _r, cos_bit);
1116   btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[43], &x1[52], _r, cos_bit);
1117   btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[44], &x1[51], _r, cos_bit);
1118   btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[45], &x1[50], _r, cos_bit);
1119 
1120   // stage 7
1121   btf_32_avx2_type1_new(cospi_p56, cospi_p08, &x1[4], &x1[7], _r, cos_bit);
1122   btf_32_avx2_type1_new(cospi_p24, cospi_p40, &x1[5], &x1[6], _r, cos_bit);
1123   btf_32_add_sub_avx2(&x1[8], &x1[9]);
1124   btf_32_add_sub_avx2(&x1[11], &x1[10]);
1125   btf_32_add_sub_avx2(&x1[12], &x1[13]);
1126   btf_32_add_sub_avx2(&x1[15], &x1[14]);
1127   btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[17], &x1[30], _r, cos_bit);
1128   btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[18], &x1[29], _r, cos_bit);
1129   btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[21], &x1[26], _r, cos_bit);
1130   btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[22], &x1[25], _r, cos_bit);
1131   btf_32_add_sub_avx2(&x1[32], &x1[35]);
1132   btf_32_add_sub_avx2(&x1[33], &x1[34]);
1133   btf_32_add_sub_avx2(&x1[39], &x1[36]);
1134   btf_32_add_sub_avx2(&x1[38], &x1[37]);
1135   btf_32_add_sub_avx2(&x1[40], &x1[43]);
1136   btf_32_add_sub_avx2(&x1[41], &x1[42]);
1137   btf_32_add_sub_avx2(&x1[47], &x1[44]);
1138   btf_32_add_sub_avx2(&x1[46], &x1[45]);
1139   btf_32_add_sub_avx2(&x1[48], &x1[51]);
1140   btf_32_add_sub_avx2(&x1[49], &x1[50]);
1141   btf_32_add_sub_avx2(&x1[55], &x1[52]);
1142   btf_32_add_sub_avx2(&x1[54], &x1[53]);
1143   btf_32_add_sub_avx2(&x1[56], &x1[59]);
1144   btf_32_add_sub_avx2(&x1[57], &x1[58]);
1145   btf_32_add_sub_avx2(&x1[63], &x1[60]);
1146   btf_32_add_sub_avx2(&x1[62], &x1[61]);
1147 
1148   // stage 8
1149   btf_32_avx2_type1_new(cospi_p60, cospi_p04, &x1[8], &x1[15], _r, cos_bit);
1150   btf_32_avx2_type1_new(cospi_p28, cospi_p36, &x1[9], &x1[14], _r, cos_bit);
1151   btf_32_avx2_type1_new(cospi_p44, cospi_p20, &x1[10], &x1[13], _r, cos_bit);
1152   btf_32_avx2_type1_new(cospi_p12, cospi_p52, &x1[11], &x1[12], _r, cos_bit);
1153   btf_32_add_sub_avx2(&x1[16], &x1[17]);
1154   btf_32_add_sub_avx2(&x1[19], &x1[18]);
1155   btf_32_add_sub_avx2(&x1[20], &x1[21]);
1156   btf_32_add_sub_avx2(&x1[23], &x1[22]);
1157   btf_32_add_sub_avx2(&x1[24], &x1[25]);
1158   btf_32_add_sub_avx2(&x1[27], &x1[26]);
1159   btf_32_add_sub_avx2(&x1[28], &x1[29]);
1160   btf_32_add_sub_avx2(&x1[31], &x1[30]);
1161   btf_32_avx2_type0_new(cospi_m04, cospi_p60, &x1[33], &x1[62], _r, cos_bit);
1162   btf_32_avx2_type0_new(cospi_m60, cospi_m04, &x1[34], &x1[61], _r, cos_bit);
1163   btf_32_avx2_type0_new(cospi_m36, cospi_p28, &x1[37], &x1[58], _r, cos_bit);
1164   btf_32_avx2_type0_new(cospi_m28, cospi_m36, &x1[38], &x1[57], _r, cos_bit);
1165   btf_32_avx2_type0_new(cospi_m20, cospi_p44, &x1[41], &x1[54], _r, cos_bit);
1166   btf_32_avx2_type0_new(cospi_m44, cospi_m20, &x1[42], &x1[53], _r, cos_bit);
1167   btf_32_avx2_type0_new(cospi_m52, cospi_p12, &x1[45], &x1[50], _r, cos_bit);
1168   btf_32_avx2_type0_new(cospi_m12, cospi_m52, &x1[46], &x1[49], _r, cos_bit);
1169 
1170   // stage 9
1171   btf_32_avx2_type1_new(cospi_p62, cospi_p02, &x1[16], &x1[31], _r, cos_bit);
1172   btf_32_avx2_type1_new(cospi_p30, cospi_p34, &x1[17], &x1[30], _r, cos_bit);
1173   btf_32_avx2_type1_new(cospi_p46, cospi_p18, &x1[18], &x1[29], _r, cos_bit);
1174   btf_32_avx2_type1_new(cospi_p14, cospi_p50, &x1[19], &x1[28], _r, cos_bit);
1175   btf_32_avx2_type1_new(cospi_p54, cospi_p10, &x1[20], &x1[27], _r, cos_bit);
1176   btf_32_avx2_type1_new(cospi_p22, cospi_p42, &x1[21], &x1[26], _r, cos_bit);
1177   btf_32_avx2_type1_new(cospi_p38, cospi_p26, &x1[22], &x1[25], _r, cos_bit);
1178   btf_32_avx2_type1_new(cospi_p06, cospi_p58, &x1[23], &x1[24], _r, cos_bit);
1179   btf_32_add_sub_avx2(&x1[32], &x1[33]);
1180   btf_32_add_sub_avx2(&x1[35], &x1[34]);
1181   btf_32_add_sub_avx2(&x1[36], &x1[37]);
1182   btf_32_add_sub_avx2(&x1[39], &x1[38]);
1183   btf_32_add_sub_avx2(&x1[40], &x1[41]);
1184   btf_32_add_sub_avx2(&x1[43], &x1[42]);
1185   btf_32_add_sub_avx2(&x1[44], &x1[45]);
1186   btf_32_add_sub_avx2(&x1[47], &x1[46]);
1187   btf_32_add_sub_avx2(&x1[48], &x1[49]);
1188   btf_32_add_sub_avx2(&x1[51], &x1[50]);
1189   btf_32_add_sub_avx2(&x1[52], &x1[53]);
1190   btf_32_add_sub_avx2(&x1[55], &x1[54]);
1191   btf_32_add_sub_avx2(&x1[56], &x1[57]);
1192   btf_32_add_sub_avx2(&x1[59], &x1[58]);
1193   btf_32_add_sub_avx2(&x1[60], &x1[61]);
1194   btf_32_add_sub_avx2(&x1[63], &x1[62]);
1195 
1196   // stage 10
1197   btf_32_avx2_type1_new(cospi_p63, cospi_p01, &x1[32], &x1[63], _r, cos_bit);
1198   btf_32_avx2_type1_new(cospi_p31, cospi_p33, &x1[33], &x1[62], _r, cos_bit);
1199   btf_32_avx2_type1_new(cospi_p47, cospi_p17, &x1[34], &x1[61], _r, cos_bit);
1200   btf_32_avx2_type1_new(cospi_p15, cospi_p49, &x1[35], &x1[60], _r, cos_bit);
1201   btf_32_avx2_type1_new(cospi_p55, cospi_p09, &x1[36], &x1[59], _r, cos_bit);
1202   btf_32_avx2_type1_new(cospi_p23, cospi_p41, &x1[37], &x1[58], _r, cos_bit);
1203   btf_32_avx2_type1_new(cospi_p39, cospi_p25, &x1[38], &x1[57], _r, cos_bit);
1204   btf_32_avx2_type1_new(cospi_p07, cospi_p57, &x1[39], &x1[56], _r, cos_bit);
1205   btf_32_avx2_type1_new(cospi_p59, cospi_p05, &x1[40], &x1[55], _r, cos_bit);
1206   btf_32_avx2_type1_new(cospi_p27, cospi_p37, &x1[41], &x1[54], _r, cos_bit);
1207   btf_32_avx2_type1_new(cospi_p43, cospi_p21, &x1[42], &x1[53], _r, cos_bit);
1208   btf_32_avx2_type1_new(cospi_p11, cospi_p53, &x1[43], &x1[52], _r, cos_bit);
1209   btf_32_avx2_type1_new(cospi_p51, cospi_p13, &x1[44], &x1[51], _r, cos_bit);
1210   btf_32_avx2_type1_new(cospi_p19, cospi_p45, &x1[45], &x1[50], _r, cos_bit);
1211   btf_32_avx2_type1_new(cospi_p35, cospi_p29, &x1[46], &x1[49], _r, cos_bit);
1212   btf_32_avx2_type1_new(cospi_p03, cospi_p61, &x1[47], &x1[48], _r, cos_bit);
1213 
1214   // stage 11
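  // The final stage writes the 64 coefficients in bit-reversed index order,
  // i.e. output[k] = x1[bitrev(k)] for the 6-bit index k.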
1215   output[0] = x1[0];
1216   output[1] = x1[32];
1217   output[2] = x1[16];
1218   output[3] = x1[48];
1219   output[4] = x1[8];
1220   output[5] = x1[40];
1221   output[6] = x1[24];
1222   output[7] = x1[56];
1223   output[8] = x1[4];
1224   output[9] = x1[36];
1225   output[10] = x1[20];
1226   output[11] = x1[52];
1227   output[12] = x1[12];
1228   output[13] = x1[44];
1229   output[14] = x1[28];
1230   output[15] = x1[60];
1231   output[16] = x1[2];
1232   output[17] = x1[34];
1233   output[18] = x1[18];
1234   output[19] = x1[50];
1235   output[20] = x1[10];
1236   output[21] = x1[42];
1237   output[22] = x1[26];
1238   output[23] = x1[58];
1239   output[24] = x1[6];
1240   output[25] = x1[38];
1241   output[26] = x1[22];
1242   output[27] = x1[54];
1243   output[28] = x1[14];
1244   output[29] = x1[46];
1245   output[30] = x1[30];
1246   output[31] = x1[62];
1247   output[32] = x1[1];
1248   output[33] = x1[33];
1249   output[34] = x1[17];
1250   output[35] = x1[49];
1251   output[36] = x1[9];
1252   output[37] = x1[41];
1253   output[38] = x1[25];
1254   output[39] = x1[57];
1255   output[40] = x1[5];
1256   output[41] = x1[37];
1257   output[42] = x1[21];
1258   output[43] = x1[53];
1259   output[44] = x1[13];
1260   output[45] = x1[45];
1261   output[46] = x1[29];
1262   output[47] = x1[61];
1263   output[48] = x1[3];
1264   output[49] = x1[35];
1265   output[50] = x1[19];
1266   output[51] = x1[51];
1267   output[52] = x1[11];
1268   output[53] = x1[43];
1269   output[54] = x1[27];
1270   output[55] = x1[59];
1271   output[56] = x1[7];
1272   output[57] = x1[39];
1273   output[58] = x1[23];
1274   output[59] = x1[55];
1275   output[60] = x1[15];
1276   output[61] = x1[47];
1277   output[62] = x1[31];
1278   output[63] = x1[63];
1279 }
1280 
1281 static inline void fadst16x16_new_avx2(const __m256i *input, __m256i *output,
1282                                        int8_t cos_bit) {
1283   const int32_t *cospi = cospi_arr(cos_bit);
1284   const __m256i __zero = _mm256_setzero_si256();
1285   const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
1286 
1287   __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
1288   __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
1289   __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
1290   __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
1291   __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
1292   __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
1293   __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
1294   __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
1295   __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
1296   __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
1297   __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
1298   __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
1299   __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
1300   __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
1301   __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
1302   __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
1303   __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
1304   __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
1305   __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
1306   __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
1307   __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
1308   __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
1309   __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
1310   __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
1311   __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
1312   __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
1313   __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
1314 
1315   // stage 1
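  // Stage 1 applies the ADST input permutation; negations are performed as
  // saturating subtractions from zero so the values stay within 16-bit range.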
1316   __m256i x1[16];
1317   x1[0] = input[0];
1318   x1[1] = _mm256_subs_epi16(__zero, input[15]);
1319   x1[2] = _mm256_subs_epi16(__zero, input[7]);
1320   x1[3] = input[8];
1321   x1[4] = _mm256_subs_epi16(__zero, input[3]);
1322   x1[5] = input[12];
1323   x1[6] = input[4];
1324   x1[7] = _mm256_subs_epi16(__zero, input[11]);
1325   x1[8] = _mm256_subs_epi16(__zero, input[1]);
1326   x1[9] = input[14];
1327   x1[10] = input[6];
1328   x1[11] = _mm256_subs_epi16(__zero, input[9]);
1329   x1[12] = input[2];
1330   x1[13] = _mm256_subs_epi16(__zero, input[13]);
1331   x1[14] = _mm256_subs_epi16(__zero, input[5]);
1332   x1[15] = input[10];
1333 
1334   // stage 2
1335   btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
1336   btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
1337   btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
1338   btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
1339 
1340   // stage 3
1341   btf_16_adds_subs_avx2(&x1[0], &x1[2]);
1342   btf_16_adds_subs_avx2(&x1[1], &x1[3]);
1343   btf_16_adds_subs_avx2(&x1[4], &x1[6]);
1344   btf_16_adds_subs_avx2(&x1[5], &x1[7]);
1345   btf_16_adds_subs_avx2(&x1[8], &x1[10]);
1346   btf_16_adds_subs_avx2(&x1[9], &x1[11]);
1347   btf_16_adds_subs_avx2(&x1[12], &x1[14]);
1348   btf_16_adds_subs_avx2(&x1[13], &x1[15]);
1349 
1350   // stage 4
1351   btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit);
1352   btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[6], &x1[7], _r, cos_bit);
1353   btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit);
1354   btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[14], &x1[15], _r, cos_bit);
1355 
1356   // stage 5
1357   btf_16_adds_subs_avx2(&x1[0], &x1[4]);
1358   btf_16_adds_subs_avx2(&x1[1], &x1[5]);
1359   btf_16_adds_subs_avx2(&x1[2], &x1[6]);
1360   btf_16_adds_subs_avx2(&x1[3], &x1[7]);
1361   btf_16_adds_subs_avx2(&x1[8], &x1[12]);
1362   btf_16_adds_subs_avx2(&x1[9], &x1[13]);
1363   btf_16_adds_subs_avx2(&x1[10], &x1[14]);
1364   btf_16_adds_subs_avx2(&x1[11], &x1[15]);
1365 
1366   // stage 6
1367   btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit);
1368   btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x1[10], &x1[11], _r, cos_bit);
1369   btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x1[12], &x1[13], _r, cos_bit);
1370   btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x1[14], &x1[15], _r, cos_bit);
1371 
1372   // stage 7
1373   btf_16_adds_subs_avx2(&x1[0], &x1[8]);
1374   btf_16_adds_subs_avx2(&x1[1], &x1[9]);
1375   btf_16_adds_subs_avx2(&x1[2], &x1[10]);
1376   btf_16_adds_subs_avx2(&x1[3], &x1[11]);
1377   btf_16_adds_subs_avx2(&x1[4], &x1[12]);
1378   btf_16_adds_subs_avx2(&x1[5], &x1[13]);
1379   btf_16_adds_subs_avx2(&x1[6], &x1[14]);
1380   btf_16_adds_subs_avx2(&x1[7], &x1[15]);
1381 
1382   // stage 8
1383   btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit);
1384   btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit);
1385   btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit);
1386   btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit);
1387   btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit);
1388   btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit);
1389   btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit);
1390   btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit);
1391 
1392   // stage 9
1393   output[0] = x1[1];
1394   output[1] = x1[14];
1395   output[2] = x1[3];
1396   output[3] = x1[12];
1397   output[4] = x1[5];
1398   output[5] = x1[10];
1399   output[6] = x1[7];
1400   output[7] = x1[8];
1401   output[8] = x1[9];
1402   output[9] = x1[6];
1403   output[10] = x1[11];
1404   output[11] = x1[4];
1405   output[12] = x1[13];
1406   output[13] = x1[2];
1407   output[14] = x1[15];
1408   output[15] = x1[0];
1409 }
1410 
1411 static inline void fidentity16x16_new_avx2(const __m256i *input,
1412                                            __m256i *output, int8_t cos_bit) {
1413   (void)cos_bit;
1414   const __m256i one = _mm256_set1_epi16(1);
1415 
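  // The 16-point identity transform scales each coefficient by 2 * sqrt(2),
  // applied here as the 2 * NewSqrt2 fixed-point factor inside scale_round_avx2.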
1416   for (int i = 0; i < 16; ++i) {
1417     const __m256i a_lo = _mm256_unpacklo_epi16(input[i], one);
1418     const __m256i a_hi = _mm256_unpackhi_epi16(input[i], one);
1419     const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2);
1420     const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2);
1421     output[i] = _mm256_packs_epi32(b_lo, b_hi);
1422   }
1423 }
1424 
1425 static inline void fidentity16x32_avx2(const __m256i *input, __m256i *output,
1426                                        int8_t cos_bit) {
1427   (void)cos_bit;
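  // The 32-point identity transform scales by 4, implemented as a left shift
  // by two bits.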
1428   for (int i = 0; i < 32; ++i) {
1429     output[i] = _mm256_slli_epi16(input[i], 2);
1430   }
1431 }
1432 
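// Store one row of 16 32-bit coefficients per iteration, taking the first
// eight values from in1[i] and the next eight from in2[i].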
1433 static inline void store_output_32bit_w16(int32_t *const out,
1434                                           const __m256i *const in1,
1435                                           const __m256i *const in2,
1436                                           const int stride,
1437                                           const int out_size) {
1438   for (int i = 0; i < out_size; ++i) {
1439     _mm256_store_si256((__m256i *)(out + stride * i), in1[i]);
1440     _mm256_store_si256((__m256i *)(out + stride * i + 8), in2[i]);
1441   }
1442 }
1443 
1444 // Store rows of 16 16-bit values, sign extending them to 32 bits.
1445 static inline void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in,
1446                                                         int32_t *out,
1447                                                         const int stride,
1448                                                         const int out_size) {
1449   for (int i = 0; i < out_size; ++i) {
1450     _mm256_store_si256((__m256i *)(out),
1451                        _mm256_cvtepi16_epi32(_mm256_castsi256_si128(in[i])));
1452     _mm256_store_si256(
1453         (__m256i *)(out + 8),
1454         _mm256_cvtepi16_epi32(_mm256_extracti128_si256(in[i], 1)));
1455     out += stride;
1456   }
1457 }
1458 
1459 static inline void store_rect_16bit_to_32bit_avx2(const __m256i a,
1460                                                   int32_t *const b) {
1461   const __m256i one = _mm256_set1_epi16(1);
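  // Rectangular transform sizes carry an extra sqrt(2) scale (NewSqrt2); the
  // 64-bit lane permute keeps the 16 widened outputs in storage order.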
1462   const __m256i a_reorder = _mm256_permute4x64_epi64(a, 0xd8);
1463   const __m256i a_lo = _mm256_unpacklo_epi16(a_reorder, one);
1464   const __m256i a_hi = _mm256_unpackhi_epi16(a_reorder, one);
1465   const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2);
1466   const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2);
1467   _mm256_store_si256((__m256i *)b, b_lo);
1468   _mm256_store_si256((__m256i *)(b + 8), b_hi);
1469 }
1470 
1471 static inline void store_rect_buffer_16bit_to_32bit_w16_avx2(
1472     const __m256i *const in, int32_t *const out, const int stride,
1473     const int out_size) {
1474   for (int i = 0; i < out_size; ++i) {
1475     store_rect_16bit_to_32bit_avx2(in[i], out + i * stride);
1476   }
1477 }
1478 
1479 typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
1480                                   int8_t cos_bit);
1481 
1482 static const transform_1d_avx2 col_txfm16x32_arr[TX_TYPES] = {
1483   fdct16x32_avx2,       // DCT_DCT
1484   NULL,                 // ADST_DCT
1485   NULL,                 // DCT_ADST
1486   NULL,                 // ADST_ADST
1487   NULL,                 // FLIPADST_DCT
1488   NULL,                 // DCT_FLIPADST
1489   NULL,                 // FLIPADST_FLIPADST
1490   NULL,                 // ADST_FLIPADST
1491   NULL,                 // FLIPADST_ADST
1492   fidentity16x32_avx2,  // IDTX
1493   fdct16x32_avx2,       // V_DCT
1494   fidentity16x32_avx2,  // H_DCT
1495   NULL,                 // V_ADST
1496   NULL,                 // H_ADST
1497   NULL,                 // V_FLIPADST
1498   NULL                  // H_FLIPADST
1499 };
1500 
1501 static const transform_1d_avx2 row_txfm16x32_arr[TX_TYPES] = {
1502   fdct16x32_avx2,       // DCT_DCT
1503   NULL,                 // ADST_DCT
1504   NULL,                 // DCT_ADST
1505   NULL,                 // ADST_ADST
1506   NULL,                 // FLIPADST_DCT
1507   NULL,                 // DCT_FLIPADST
1508   NULL,                 // FLIPADST_FLIPADST
1509   NULL,                 // ADST_FLIPADST
1510   NULL,                 // FLIPADST_ADST
1511   fidentity16x32_avx2,  // IDTX
1512   fidentity16x32_avx2,  // V_DCT
1513   fdct16x32_avx2,       // H_DCT
1514   NULL,                 // V_ADST
1515   NULL,                 // H_ADST
1516   NULL,                 // V_FLIPADST
1517   NULL                  // H_FLIPADST
1518 };
1519 
1520 static const transform_1d_avx2 col_txfm16x16_arr[TX_TYPES] = {
1521   fdct16x16_new_avx2,       // DCT_DCT
1522   fadst16x16_new_avx2,      // ADST_DCT
1523   fdct16x16_new_avx2,       // DCT_ADST
1524   fadst16x16_new_avx2,      // ADST_ADST
1525   fadst16x16_new_avx2,      // FLIPADST_DCT
1526   fdct16x16_new_avx2,       // DCT_FLIPADST
1527   fadst16x16_new_avx2,      // FLIPADST_FLIPADST
1528   fadst16x16_new_avx2,      // ADST_FLIPADST
1529   fadst16x16_new_avx2,      // FLIPADST_ADST
1530   fidentity16x16_new_avx2,  // IDTX
1531   fdct16x16_new_avx2,       // V_DCT
1532   fidentity16x16_new_avx2,  // H_DCT
1533   fadst16x16_new_avx2,      // V_ADST
1534   fidentity16x16_new_avx2,  // H_ADST
1535   fadst16x16_new_avx2,      // V_FLIPADST
1536   fidentity16x16_new_avx2   // H_FLIPADST
1537 };
1538 
1539 static const transform_1d_avx2 row_txfm16x16_arr[TX_TYPES] = {
1540   fdct16x16_new_avx2,       // DCT_DCT
1541   fdct16x16_new_avx2,       // ADST_DCT
1542   fadst16x16_new_avx2,      // DCT_ADST
1543   fadst16x16_new_avx2,      // ADST_ADST
1544   fdct16x16_new_avx2,       // FLIPADST_DCT
1545   fadst16x16_new_avx2,      // DCT_FLIPADST
1546   fadst16x16_new_avx2,      // FLIPADST_FLIPADST
1547   fadst16x16_new_avx2,      // ADST_FLIPADST
1548   fadst16x16_new_avx2,      // FLIPADST_ADST
1549   fidentity16x16_new_avx2,  // IDTX
1550   fidentity16x16_new_avx2,  // V_DCT
1551   fdct16x16_new_avx2,       // H_DCT
1552   fidentity16x16_new_avx2,  // V_ADST
1553   fadst16x16_new_avx2,      // H_ADST
1554   fidentity16x16_new_avx2,  // V_FLIPADST
1555   fadst16x16_new_avx2       // H_FLIPADST
1556 };
1557 
1558 static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = {
1559   fdct8x8_new_sse2,       // DCT_DCT
1560   fadst8x8_new_sse2,      // ADST_DCT
1561   fdct8x8_new_sse2,       // DCT_ADST
1562   fadst8x8_new_sse2,      // ADST_ADST
1563   fadst8x8_new_sse2,      // FLIPADST_DCT
1564   fdct8x8_new_sse2,       // DCT_FLIPADST
1565   fadst8x8_new_sse2,      // FLIPADST_FLIPADST
1566   fadst8x8_new_sse2,      // ADST_FLIPADST
1567   fadst8x8_new_sse2,      // FLIPADST_ADST
1568   fidentity8x8_new_sse2,  // IDTX
1569   fdct8x8_new_sse2,       // V_DCT
1570   fidentity8x8_new_sse2,  // H_DCT
1571   fadst8x8_new_sse2,      // V_ADST
1572   fidentity8x8_new_sse2,  // H_ADST
1573   fadst8x8_new_sse2,      // V_FLIPADST
1574   fidentity8x8_new_sse2,  // H_FLIPADST
1575 };
1576 
1577 static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = {
1578   fdct8x8_new_sse2,       // DCT_DCT
1579   fdct8x8_new_sse2,       // ADST_DCT
1580   fadst8x8_new_sse2,      // DCT_ADST
1581   fadst8x8_new_sse2,      // ADST_ADST
1582   fdct8x8_new_sse2,       // FLIPADST_DCT
1583   fadst8x8_new_sse2,      // DCT_FLIPADST
1584   fadst8x8_new_sse2,      // FLIPADST_FLIPADST
1585   fadst8x8_new_sse2,      // ADST_FLIPADST
1586   fadst8x8_new_sse2,      // FLIPADST_ADST
1587   fidentity8x8_new_sse2,  // IDTX
1588   fidentity8x8_new_sse2,  // V_DCT
1589   fdct8x8_new_sse2,       // H_DCT
1590   fidentity8x8_new_sse2,  // V_ADST
1591   fadst8x8_new_sse2,      // H_ADST
1592   fidentity8x8_new_sse2,  // V_FLIPADST
1593   fadst8x8_new_sse2       // H_FLIPADST
1594 };
1595 
1596 static inline void load_buffer_and_round_shift(const int16_t *in, int stride,
1597                                                __m128i *out, int bit) {
1598   out[0] = _mm_load_si128((const __m128i *)(in + 0 * stride));
1599   out[1] = _mm_load_si128((const __m128i *)(in + 1 * stride));
1600   out[2] = _mm_load_si128((const __m128i *)(in + 2 * stride));
1601   out[3] = _mm_load_si128((const __m128i *)(in + 3 * stride));
1602   out[4] = _mm_load_si128((const __m128i *)(in + 4 * stride));
1603   out[5] = _mm_load_si128((const __m128i *)(in + 5 * stride));
1604   out[6] = _mm_load_si128((const __m128i *)(in + 6 * stride));
1605   out[7] = _mm_load_si128((const __m128i *)(in + 7 * stride));
1606   out[0] = _mm_slli_epi16(out[0], bit);
1607   out[1] = _mm_slli_epi16(out[1], bit);
1608   out[2] = _mm_slli_epi16(out[2], bit);
1609   out[3] = _mm_slli_epi16(out[3], bit);
1610   out[4] = _mm_slli_epi16(out[4], bit);
1611   out[5] = _mm_slli_epi16(out[5], bit);
1612   out[6] = _mm_slli_epi16(out[6], bit);
1613   out[7] = _mm_slli_epi16(out[7], bit);
1614 }
1615 
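// Load eight rows in reverse (bottom-to-top) order to vertically flip the
// block, then apply the same up-shift as load_buffer_and_round_shift.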
1616 static inline void load_buffer_and_flip_round_shift(const int16_t *in,
1617                                                     int stride, __m128i *out,
1618                                                     int bit) {
1619   out[7] = load_16bit_to_16bit(in + 0 * stride);
1620   out[6] = load_16bit_to_16bit(in + 1 * stride);
1621   out[5] = load_16bit_to_16bit(in + 2 * stride);
1622   out[4] = load_16bit_to_16bit(in + 3 * stride);
1623   out[3] = load_16bit_to_16bit(in + 4 * stride);
1624   out[2] = load_16bit_to_16bit(in + 5 * stride);
1625   out[1] = load_16bit_to_16bit(in + 6 * stride);
1626   out[0] = load_16bit_to_16bit(in + 7 * stride);
1627   out[7] = _mm_slli_epi16(out[7], bit);
1628   out[6] = _mm_slli_epi16(out[6], bit);
1629   out[5] = _mm_slli_epi16(out[5], bit);
1630   out[4] = _mm_slli_epi16(out[4], bit);
1631   out[3] = _mm_slli_epi16(out[3], bit);
1632   out[2] = _mm_slli_epi16(out[2], bit);
1633   out[1] = _mm_slli_epi16(out[1], bit);
1634   out[0] = _mm_slli_epi16(out[0], bit);
1635 }
1636 
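// Transpose an 8x8 block of 16-bit values held two rows per register in
// b0..b3; the transposed rows are left two per register in c0..c3.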
1637 #define TRANSPOSE_8X8_AVX2()                                         \
1638   {                                                                  \
1639     /* aa0:    00 10 01 11  02 12 03 13 | 40 50 41 51  42 52 43 53*/ \
1640     /* aa1:    04 14 05 15  06 16 07 17 | 44 54 45 55  46 56 47 57*/ \
1641     /* aa2:    20 30 21 31  22 32 23 33 | 60 70 61 71  62 72 63 73*/ \
1642     /* aa3:    24 34 25 35  26 36 27 37 | 64 74 65 75  66 76 67 77*/ \
1643     const __m256i aa0 = _mm256_unpacklo_epi16(b0, b1);               \
1644     const __m256i aa1 = _mm256_unpackhi_epi16(b0, b1);               \
1645     const __m256i aa2 = _mm256_unpacklo_epi16(b2, b3);               \
1646     const __m256i aa3 = _mm256_unpackhi_epi16(b2, b3);               \
1647     /* Unpack 32 bit elements resulting in: */                       \
1648     /* bb0: 00 10 20 30  01 11 21 31 | 40 50 60 70  41 51 61 71*/    \
1649     /* bb1: 02 12 22 32  03 13 23 33 | 42 52 62 72  43 53 63 73*/    \
1650     /* bb2: 04 14 24 34  05 15 25 35 | 44 54 64 74  45 55 65 75*/    \
1651     /* bb3: 06 16 26 36  07 17 27 37 | 46 56 66 76  47 57 67 77*/    \
1652     const __m256i bb0 = _mm256_unpacklo_epi32(aa0, aa2);             \
1653     const __m256i bb1 = _mm256_unpackhi_epi32(aa0, aa2);             \
1654     const __m256i bb2 = _mm256_unpacklo_epi32(aa1, aa3);             \
1655     const __m256i bb3 = _mm256_unpackhi_epi32(aa1, aa3);             \
1656     /* bb0: 00 10 20 30 40 50 60 70| 01 11 21 31 41 51 61 71*/       \
1657     /* bb1: 02 12 22 32 42 52 62 72| 03 13 23 33 43 53 63 73*/       \
1658     /* bb2: 04 14 24 34 44 54 64 74| 05 15 25 35 45 55 65 75*/       \
1659     /* bb3: 06 16 26 36 46 56 66 76| 07 17 27 37 47 57 67 77*/       \
1660     c0 = _mm256_permute4x64_epi64(bb0, 0xd8);                        \
1661     c1 = _mm256_permute4x64_epi64(bb1, 0xd8);                        \
1662     c2 = _mm256_permute4x64_epi64(bb2, 0xd8);                        \
1663     c3 = _mm256_permute4x64_epi64(bb3, 0xd8);                        \
1664   }
1665 
1666 static inline void transpose_round_shift_flip_8x8(__m128i *const in,
1667                                                   __m128i *const out, int bit) {
1668   __m256i c0, c1, c2, c3;
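  // shift[1] is negative (asserted by the caller), so negate it to obtain the
  // right-shift amount and the matching rounding offset.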
1669   bit = -bit;
1670   const __m256i rounding = _mm256_set1_epi16(1 << (bit - 1));
1671   const __m256i s04 =
1672       _mm256_insertf128_si256(_mm256_castsi128_si256(in[0]), in[4], 0x1);
1673   const __m256i s15 =
1674       _mm256_insertf128_si256(_mm256_castsi128_si256(in[1]), in[5], 0x1);
1675   const __m256i s26 =
1676       _mm256_insertf128_si256(_mm256_castsi128_si256(in[2]), in[6], 0x1);
1677   const __m256i s37 =
1678       _mm256_insertf128_si256(_mm256_castsi128_si256(in[3]), in[7], 0x1);
1679 
1680   const __m256i a0 = _mm256_adds_epi16(s04, rounding);
1681   const __m256i a1 = _mm256_adds_epi16(s15, rounding);
1682   const __m256i a2 = _mm256_adds_epi16(s26, rounding);
1683   const __m256i a3 = _mm256_adds_epi16(s37, rounding);
1684 
1685   // b0: 00 01 02 03  04 05 06 07 | 40 41 42 43  44 45 46 47
1686   // b1: 10 11 12 13  14 15 16 17 | 50 51 52 53  54 55 56 57
1687   // b2: 20 21 22 23  24 25 26 27 | 60 61 62 63  64 65 66 67
1688   // b3: 30 31 32 33  34 35 36 37 | 70 71 72 73  74 75 76 77
1689   const __m256i b0 = _mm256_srai_epi16(a0, bit);
1690   const __m256i b1 = _mm256_srai_epi16(a1, bit);
1691   const __m256i b2 = _mm256_srai_epi16(a2, bit);
1692   const __m256i b3 = _mm256_srai_epi16(a3, bit);
1693 
1694   TRANSPOSE_8X8_AVX2()
1695 
1696   // Unpack 64 bit elements resulting in:
1697   // out[7]: 00 10 20 30  40 50 60 70
1698   // out[6]: 01 11 21 31  41 51 61 71
1699   // out[5]: 02 12 22 32  42 52 62 72
1700   // out[4]: 03 13 23 33  43 53 63 73
1701   // out[3]: 04 14 24 34  44 54 64 74
1702   // out[2]: 05 15 25 35  45 55 65 75
1703   // out[1]: 06 16 26 36  46 56 66 76
1704   // out[0]: 07 17 27 37  47 57 67 77
1705   out[7] = _mm256_castsi256_si128(c0);
1706   out[6] = _mm256_extractf128_si256(c0, 1);
1707   out[5] = _mm256_castsi256_si128(c1);
1708   out[4] = _mm256_extractf128_si256(c1, 1);
1709   out[3] = _mm256_castsi256_si128(c2);
1710   out[2] = _mm256_extractf128_si256(c2, 1);
1711   out[1] = _mm256_castsi256_si128(c3);
1712   out[0] = _mm256_extractf128_si256(c3, 1);
1713 }
1714 
1715 static inline void transpose_round_shift_8x8(__m128i *const in,
1716                                              __m128i *const out, int bit) {
1717   __m256i c0, c1, c2, c3;
1718   bit = -bit;
1719   const __m256i rounding = _mm256_set1_epi16(1 << (bit - 1));
1720   const __m256i s04 =
1721       _mm256_insertf128_si256(_mm256_castsi128_si256(in[0]), in[4], 0x1);
1722   const __m256i s15 =
1723       _mm256_insertf128_si256(_mm256_castsi128_si256(in[1]), in[5], 0x1);
1724   const __m256i s26 =
1725       _mm256_insertf128_si256(_mm256_castsi128_si256(in[2]), in[6], 0x1);
1726   const __m256i s37 =
1727       _mm256_insertf128_si256(_mm256_castsi128_si256(in[3]), in[7], 0x1);
1728 
1729   const __m256i a0 = _mm256_adds_epi16(s04, rounding);
1730   const __m256i a1 = _mm256_adds_epi16(s15, rounding);
1731   const __m256i a2 = _mm256_adds_epi16(s26, rounding);
1732   const __m256i a3 = _mm256_adds_epi16(s37, rounding);
1733 
1734   // b0: 00 01 02 03  04 05 06 07 | 40 41 42 43  44 45 46 47
1735   // b1: 10 11 12 13  14 15 16 17 | 50 51 52 53  54 55 56 57
1736   // b2: 20 21 22 23  24 25 26 27 | 60 61 62 63  64 65 66 67
1737   // b3: 30 31 32 33  34 35 36 37 | 70 71 72 73  74 75 76 77
1738   const __m256i b0 = _mm256_srai_epi16(a0, bit);
1739   const __m256i b1 = _mm256_srai_epi16(a1, bit);
1740   const __m256i b2 = _mm256_srai_epi16(a2, bit);
1741   const __m256i b3 = _mm256_srai_epi16(a3, bit);
1742 
1743   TRANSPOSE_8X8_AVX2()
1744   // Unpack 64 bit elements resulting in:
1745   // out[7]: 00 10 20 30  40 50 60 70
1746   // out[6]: 01 11 21 31  41 51 61 71
1747   // out[5]: 02 12 22 32  42 52 62 72
1748   // out[4]: 03 13 23 33  43 53 63 73
1749   // out[3]: 04 14 24 34  44 54 64 74
1750   // out[2]: 05 15 25 35  45 55 65 75
1751   // out[1]: 06 16 26 36  46 56 66 76
1752   // out[0]: 07 17 27 37  47 57 67 77
1753   out[0] = _mm256_castsi256_si128(c0);
1754   out[1] = _mm256_extractf128_si256(c0, 1);
1755   out[2] = _mm256_castsi256_si128(c1);
1756   out[3] = _mm256_extractf128_si256(c1, 1);
1757   out[4] = _mm256_castsi256_si128(c2);
1758   out[5] = _mm256_extractf128_si256(c2, 1);
1759   out[6] = _mm256_castsi256_si128(c3);
1760   out[7] = _mm256_extractf128_si256(c3, 1);
1761 }
1762 
1763 static inline void store_buffer_16bit_to_32bit_w8_avx2(const __m128i *const in,
1764                                                        int32_t *const out,
1765                                                        const int stride,
1766                                                        const int out_size) {
1767   for (int i = 0; i < out_size; ++i) {
1768     _mm256_store_si256((__m256i *)(out + i * stride),
1769                        _mm256_cvtepi16_epi32(in[i]));
1770   }
1771 }
1772 
1773 static void av1_lowbd_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *output,
1774                                           int stride, TX_TYPE tx_type, int bd) {
1775   (void)bd;
1776   __m128i buf0[8], buf1[8], *buf;
1777   const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X8];
1778   const int txw_idx = get_txw_idx(TX_8X8);
1779   const int txh_idx = get_txh_idx(TX_8X8);
1780   const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
1781   const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
1782   const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
1783   const transform_1d_sse2 row_txfm = row_txfm8x8_arr[tx_type];
1784   int ud_flip, lr_flip;
1785 
1786   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
1787   // The check on the sign of the shift value is skipped during round shifting
1788   // because shift[0] is assumed to always be positive.
1789   assert(shift[0] > 0);
1790   if (ud_flip)
1791     load_buffer_and_flip_round_shift(input, stride, buf0, shift[0]);
1792   else
1793     load_buffer_and_round_shift(input, stride, buf0, shift[0]);
1794 
1795   col_txfm(buf0, buf0, cos_bit_col);
1796   // The check on the sign of the shift value is skipped during round shifting
1797   // because shift[1] is assumed to always be negative.
1798   assert(shift[1] < 0);
1799 
1800   if (lr_flip) {
1801     transpose_round_shift_flip_8x8(buf0, buf1, shift[1]);
1802   } else {
1803     transpose_round_shift_8x8(buf0, buf1, shift[1]);
1804   }
1805 
1806   buf = buf1;
1807   row_txfm(buf, buf, cos_bit_row);
1808 
1809   // The round-and-shift operation is skipped here because shift[2] is assumed
1810   // to always be zero.
1811   assert(shift[2] == 0);
1812   store_buffer_16bit_to_32bit_w8_avx2(buf, output, 8, 8);
1813 }
1814 
1815 static void lowbd_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *output,
1816                                         int stride, TX_TYPE tx_type, int bd) {
1817   (void)bd;
1818   const TX_SIZE tx_size = TX_16X16;
1819   __m256i buf0[16], buf1[16];
1820   const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
1821   const int txw_idx = get_txw_idx(tx_size);
1822   const int txh_idx = get_txh_idx(tx_size);
1823   const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
1824   const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
1825   const int width = tx_size_wide[tx_size];
1826   const int height = tx_size_high[tx_size];
1827   const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
1828   const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
1829   int ud_flip, lr_flip;
1830 
1831   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
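  // 16x16 uses a single 16-wide column block, so the block index i is fixed at
  // 0; the larger sizes below iterate this same pattern over multiple blocks.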
1832   const int32_t i = 0;
1833   if (ud_flip) {
1834     load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0, height);
1835   } else {
1836     load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
1837   }
1838   round_shift_16bit_w16_avx2(buf0, height, shift[0]);
1839   col_txfm(buf0, buf0, cos_bit_col);
1840   round_shift_16bit_w16_avx2(buf0, height, shift[1]);
1841   transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i);
1842 
1843   __m256i *buf;
1844   if (lr_flip) {
1845     buf = buf0;
1846     flip_buf_avx2(buf1 + width * i, buf, width);
1847   } else {
1848     buf = buf1 + width * i;
1849   }
1850   row_txfm(buf, buf, cos_bit_row);
1851   round_shift_16bit_w16_avx2(buf, width, shift[2]);
1852   store_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height, width);
1853 }
1854 
1855 static void lowbd_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
1856                                         int stride, TX_TYPE tx_type, int bd) {
1857   (void)bd;
1858   const TX_SIZE tx_size = TX_32X32;
1859   __m256i buf0[32], buf1[128];
1860   const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
1861   const int txw_idx = get_txw_idx(tx_size);
1862   const int txh_idx = get_txh_idx(tx_size);
1863   const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
1864   const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
1865   const int width = tx_size_wide[tx_size];
1866   const int height = tx_size_high[tx_size];
1867   const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
1868   const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
1869 
1870   int ud_flip, lr_flip;
1871   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
1872 
1873   for (int i = 0; i < 2; i++) {
1874     if (ud_flip) {
1875       load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0,
1876                                            height);
1877     } else {
1878       load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
1879     }
1880     round_shift_16bit_w16_avx2(buf0, height, shift[0]);
1881     col_txfm(buf0, buf0, cos_bit_col);
1882     round_shift_16bit_w16_avx2(buf0, height, shift[1]);
1883     transpose_16bit_16x16_avx2(buf0 + 0 * 16, buf1 + 0 * width + 16 * i);
1884     transpose_16bit_16x16_avx2(buf0 + 1 * 16, buf1 + 1 * width + 16 * i);
1885   }
1886 
1887   for (int i = 0; i < 2; i++) {
1888     __m256i *buf;
1889     if (lr_flip) {
1890       buf = buf0;
1891       flip_buf_avx2(buf1 + width * i, buf, width);
1892     } else {
1893       buf = buf1 + width * i;
1894     }
1895     row_txfm(buf, buf, cos_bit_row);
1896     round_shift_16bit_w16_avx2(buf, width, shift[2]);
1897     store_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height, width);
1898   }
1899 }
1900 
1901 static void lowbd_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output,
1902                                         int stride, TX_TYPE tx_type, int bd) {
1903   (void)bd;
1904   (void)tx_type;
1905   assert(tx_type == DCT_DCT);
1906   const TX_SIZE tx_size = TX_64X64;
1907   __m256i buf0[64], buf1[256];
1908   const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
1909   const int txw_idx = get_txw_idx(tx_size);
1910   const int txh_idx = get_txh_idx(tx_size);
1911   const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
1912   const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
1913   const int width = tx_size_wide[tx_size];
1914   const int height = tx_size_high[tx_size];
1915   const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
1916   const int width_div16 = (width >> 4);
1917   const int height_div16 = (height >> 4);
1918 
1919   for (int i = 0; i < width_div16; i++) {
1920     load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
1921     round_shift_16bit_w16_avx2(buf0, height, shift[0]);
1922     col_txfm(buf0, buf0, cos_bit_col);
1923     round_shift_16bit_w16_avx2(buf0, height, shift[1]);
1924     for (int j = 0; j < AOMMIN(2, height_div16); ++j) {
1925       transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
1926     }
1927   }
1928 
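  // Only the top-left 32x32 block of coefficients is kept for the 64x64
  // transform; the higher-frequency 64-point outputs are discarded.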
1929   for (int i = 0; i < AOMMIN(2, height_div16); i++) {
1930     __m256i bufA[64];
1931     __m256i bufB[64];
1932     __m128i *buf = (__m128i *)(buf1 + width * i);
1933     for (int j = 0; j < width; ++j) {
1934       bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
1935       bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
1936     }
1937     fdct64_new_avx2(bufA, bufA, cos_bit_row);
1938     fdct64_new_avx2(bufB, bufB, cos_bit_row);
1939     round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]);
1940     round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]);
1941     store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32);
1942   }
1943 }
1944 
1945 static void lowbd_fwd_txfm2d_16x32_avx2(const int16_t *input, int32_t *output,
1946                                         int stride, TX_TYPE tx_type, int bd) {
1947   (void)bd;
1948   const TX_SIZE tx_size = TX_16X32;
1949   __m256i buf0[32], buf1[32];
1950   const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
1951   const int txw_idx = get_txw_idx(tx_size);
1952   const int txh_idx = get_txh_idx(tx_size);
1953   const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
1954   const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
1955   const int width = tx_size_wide[tx_size];
1956   const int height = tx_size_high[tx_size];
1957   const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
1958   const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
1959 
1960   int ud_flip, lr_flip;
1961   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
1962 
1963   if (ud_flip) {
1964     load_buffer_16bit_to_16bit_flip_avx2(input, stride, buf0, height);
1965   } else {
1966     load_buffer_16bit_to_16bit_avx2(input, stride, buf0, height);
1967   }
1968   round_shift_16bit_w16_avx2(buf0, height, shift[0]);
1969   col_txfm(buf0, buf0, cos_bit_col);
1970   round_shift_16bit_w16_avx2(buf0, height, shift[1]);
1971   transpose_16bit_16x16_avx2(buf0, buf1);
1972   transpose_16bit_16x16_avx2(buf0 + 16, buf1 + 16);
1973 
1974   for (int i = 0; i < 2; i++) {
1975     __m256i *buf;
1976     if (lr_flip) {
1977       buf = buf0;
1978       flip_buf_avx2(buf1 + width * i, buf, width);
1979     } else {
1980       buf = buf1 + width * i;
1981     }
1982     row_txfm(buf, buf, cos_bit_row);
1983     round_shift_16bit_w16_avx2(buf, width, shift[2]);
1984     store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output + i * 16, height,
1985                                               width);
1986   }
1987 }
1988 
1989 static void lowbd_fwd_txfm2d_32x16_avx2(const int16_t *input, int32_t *output,
1990                                         int stride, TX_TYPE tx_type, int bd) {
1991   (void)bd;
1992   __m256i buf0[32], buf1[64];
1993   const int8_t *shift = av1_fwd_txfm_shift_ls[TX_32X16];
1994   const int txw_idx = get_txw_idx(TX_32X16);
1995   const int txh_idx = get_txh_idx(TX_32X16);
1996   const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
1997   const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
1998   const int width = 32;
1999   const int height = 16;
2000   const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
2001   const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
2002 
2003   int ud_flip, lr_flip;
2004   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2005 
2006   for (int i = 0; i < 2; i++) {
2007     if (ud_flip) {
2008       load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0,
2009                                            height);
2010     } else {
2011       load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
2012     }
2013     round_shift_16bit_w16_avx2(buf0, height, shift[0]);
2014     col_txfm(buf0, buf0, cos_bit_col);
2015     round_shift_16bit_w16_avx2(buf0, height, shift[1]);
2016     transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i);
2017   }
2018 
2019   __m256i *buf;
2020   if (lr_flip) {
2021     buf = buf0;
2022     flip_buf_avx2(buf1, buf, width);
2023   } else {
2024     buf = buf1;
2025   }
2026   row_txfm(buf, buf, cos_bit_row);
2027   round_shift_16bit_w16_avx2(buf, width, shift[2]);
2028   store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output, height, width);
2029 }
2030 
2031 static void lowbd_fwd_txfm2d_64x32_avx2(const int16_t *input, int32_t *output,
2032                                         int stride, TX_TYPE tx_type, int bd) {
2033   (void)bd;
2034   const TX_SIZE tx_size = TX_64X32;
2035   __m256i buf0[64], buf1[256];
2036   const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
2037   const int txw_idx = get_txw_idx(tx_size);
2038   const int txh_idx = get_txh_idx(tx_size);
2039   const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2040   const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2041   const int width = tx_size_wide[tx_size];
2042   const int height = tx_size_high[tx_size];
2043   const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
2044   const int width_div16 = (width >> 4);
2045   const int height_div16 = (height >> 4);
2046 
2047   for (int i = 0; i < width_div16; i++) {
2048     load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
2049     round_shift_16bit_w16_avx2(buf0, height, shift[0]);
2050     col_txfm(buf0, buf0, cos_bit_col);
2051     round_shift_16bit_w16_avx2(buf0, height, shift[1]);
2052     for (int j = 0; j < AOMMIN(4, height_div16); ++j) {
2053       transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
2054     }
2055   }
2056   assert(tx_type == DCT_DCT);
2057   for (int i = 0; i < AOMMIN(2, height_div16); i++) {
2058     __m256i bufA[64];
2059     __m256i bufB[64];
2060     __m128i *buf = (__m128i *)(buf1 + width * i);
2061     for (int j = 0; j < width; ++j) {
2062       bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
2063       bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
2064     }
2065     fdct64_new_avx2(bufA, bufA, cos_bit_row);
2066     fdct64_new_avx2(bufB, bufB, cos_bit_row);
2067     round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
2068     round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
2069 
2070     store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32);
2071   }
2072 }
2073 
2074 static void lowbd_fwd_txfm2d_32x64_avx2(const int16_t *input, int32_t *output,
2075                                         int stride, TX_TYPE tx_type, int bd) {
2076   (void)bd;
2077   (void)tx_type;
2078   assert(tx_type == DCT_DCT);
2079   const TX_SIZE tx_size = TX_32X64;
2080   __m256i buf0[64], buf1[256];
2081   const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
2082   const int txw_idx = get_txw_idx(tx_size);
2083   const int txh_idx = get_txh_idx(tx_size);
2084   const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2085   const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2086   const int width = tx_size_wide[tx_size];
2087   const int height = tx_size_high[tx_size];
2088   const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
2089   const int width_div16 = (width >> 4);
2090   const int height_div16 = (height >> 4);
2091 
2092   for (int i = 0; i < width_div16; i++) {
2093     load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
2094     round_shift_16bit_w16_avx2(buf0, height, shift[0]);
2095     col_txfm(buf0, buf0, cos_bit_col);
2096     round_shift_16bit_w16_avx2(buf0, height, shift[1]);
2097     for (int j = 0; j < AOMMIN(2, height_div16); ++j) {
2098       transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
2099     }
2100   }
2101 
2102   for (int i = 0; i < AOMMIN(2, height_div16); i++) {
2103     __m256i bufA[32];
2104     __m256i bufB[32];
2105     __m128i *buf = (__m128i *)(buf1 + width * i);
2106     for (int j = 0; j < width; ++j) {
2107       bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
2108       bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
2109     }
2110     fdct32_avx2(bufA, bufA, cos_bit_row);
2111     fdct32_avx2(bufB, bufB, cos_bit_row);
2112     round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2], NewSqrt2);
2113     round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2], NewSqrt2);
2114 
2115     store_output_32bit_w16(output + i * 16, bufA, bufB, 32, 32);
2116   }
2117 }
2118 
2119 static void lowbd_fwd_txfm2d_16x64_avx2(const int16_t *input, int32_t *output,
2120                                         int stride, TX_TYPE tx_type, int bd) {
2121   (void)bd;
2122   (void)tx_type;
2123   assert(tx_type == DCT_DCT);
2124   const TX_SIZE tx_size = TX_16X64;
2125   __m256i buf0[64], buf1[64];
2126   const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
2127   const int txw_idx = get_txw_idx(tx_size);
2128   const int txh_idx = get_txh_idx(tx_size);
2129   const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2130   const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2131   const int width = tx_size_wide[tx_size];
2132   const int height = tx_size_high[tx_size];
2133   const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
2134   const transform_1d_avx2 row_txfm = fdct16x16_new_avx2;
2135   const int width_div16 = (width >> 4);
2136   const int height_div16 = (height >> 4);
2137 
2138   for (int i = 0; i < width_div16; i++) {
2139     load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
2140     round_shift_16bit_w16_avx2(buf0, height, shift[0]);
2141     col_txfm(buf0, buf0, cos_bit_col);
2142     round_shift_16bit_w16_avx2(buf0, height, shift[1]);
2143     for (int j = 0; j < height_div16; ++j) {
2144       transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
2145     }
2146   }
2147 
2148   for (int i = 0; i < AOMMIN(2, height_div16); i++) {
2149     __m256i *buf = buf1 + width * i;
2150     row_txfm(buf, buf, cos_bit_row);
2151     round_shift_16bit_w16_avx2(buf, width, shift[2]);
2152     store_buffer_16bit_to_32bit_w16_avx2(buf, output + width * i, 32, width);
2153   }
2154 }
2155 
2156 static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output,
2157                                         int stride, TX_TYPE tx_type, int bd) {
2158   (void)bd;
2159   (void)tx_type;
2160   assert(tx_type == DCT_DCT);
2161   const TX_SIZE tx_size = TX_64X16;
2162   __m256i buf0[64], buf1[64];
2163   const int8_t *shift = av1_fwd_txfm_shift_ls[tx_size];
2164   const int txw_idx = get_txw_idx(tx_size);
2165   const int txh_idx = get_txh_idx(tx_size);
2166   const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2167   const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2168   const int width = tx_size_wide[tx_size];
2169   const int height = tx_size_high[tx_size];
2170   const transform_1d_avx2 col_txfm = fdct16x16_new_avx2;
2171   const transform_1d_avx2 row_txfm = fdct16x64_new_avx2;
2172   const int width_div16 = (width >> 4);
2173   const int height_div16 = (height >> 4);
2174 
2175   for (int i = 0; i < width_div16; i++) {
2176     load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
2177     round_shift_16bit_w16_avx2(buf0, height, shift[0]);
2178     col_txfm(buf0, buf0, cos_bit_col);
2179     round_shift_16bit_w16_avx2(buf0, height, shift[1]);
2180     for (int j = 0; j < height_div16; ++j) {
2181       transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
2182     }
2183   }
2184 
2185   for (int i = 0; i < height_div16; i++) {
2186     __m256i *buf = buf1 + width * i;
2187     row_txfm(buf, buf, cos_bit_row);
2188     round_shift_16bit_w16_avx2(buf, width, shift[2]);
2189     store_buffer_16bit_to_32bit_w16_avx2(buf, output + 16 * i, 16, 32);
2190   }
2191   // Zero out the bottom 16x32 area.
2192   memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
2193 }
2194 
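// 16-bit butterfly helper: interleave in0/in1, take dot products with the
// weight pairs w0 and w1, round by cos_bit, and return the packed results
// split into 128-bit halves (w0 products in out0/out2, w1 products in
// out1/out3).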
2195 static inline void btf_16_avx2(__m256i *w0, __m256i *w1, __m256i *in0,
2196                                __m256i *in1, __m128i *out0, __m128i *out1,
2197                                __m128i *out2, __m128i *out3,
2198                                const __m256i *__rounding, int8_t *cos_bit) {
2199   __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1);
2200   __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1);
2201   __m256i u0 = _mm256_madd_epi16(t0, *w0);
2202   __m256i u1 = _mm256_madd_epi16(t1, *w0);
2203   __m256i v0 = _mm256_madd_epi16(t0, *w1);
2204   __m256i v1 = _mm256_madd_epi16(t1, *w1);
2205 
2206   __m256i a0 = _mm256_add_epi32(u0, *__rounding);
2207   __m256i a1 = _mm256_add_epi32(u1, *__rounding);
2208   __m256i b0 = _mm256_add_epi32(v0, *__rounding);
2209   __m256i b1 = _mm256_add_epi32(v1, *__rounding);
2210 
2211   __m256i c0 = _mm256_srai_epi32(a0, *cos_bit);
2212   __m256i c1 = _mm256_srai_epi32(a1, *cos_bit);
2213   __m256i d0 = _mm256_srai_epi32(b0, *cos_bit);
2214   __m256i d1 = _mm256_srai_epi32(b1, *cos_bit);
2215 
2216   __m256i temp0 = _mm256_packs_epi32(c0, c1);
2217   __m256i temp1 = _mm256_packs_epi32(d0, d1);
2218 
2219   *out0 = _mm256_castsi256_si128(temp0);
2220   *out1 = _mm256_castsi256_si128(temp1);
2221   *out2 = _mm256_extracti128_si256(temp0, 0x01);
2222   *out3 = _mm256_extracti128_si256(temp1, 0x01);
2223 }
2224 
2225 static inline void fdct8x8_new_avx2(const __m256i *input, __m256i *output,
2226                                     int8_t cos_bit) {
2227   const int32_t *cospi = cospi_arr(cos_bit);
2228   const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
2229 
2230   __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
2231   __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
2232   __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
2233   __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
2234   __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
2235   __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
2236   __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
2237   __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
2238   __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
2239 
2240   // stage 1
2241   __m256i x1[8];
2242   x1[0] = _mm256_adds_epi16(input[0], input[7]);
2243   x1[7] = _mm256_subs_epi16(input[0], input[7]);
2244   x1[1] = _mm256_adds_epi16(input[1], input[6]);
2245   x1[6] = _mm256_subs_epi16(input[1], input[6]);
2246   x1[2] = _mm256_adds_epi16(input[2], input[5]);
2247   x1[5] = _mm256_subs_epi16(input[2], input[5]);
2248   x1[3] = _mm256_adds_epi16(input[3], input[4]);
2249   x1[4] = _mm256_subs_epi16(input[3], input[4]);
2250 
2251   // stage 2
2252   __m256i x2[8];
2253   x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
2254   x2[3] = _mm256_subs_epi16(x1[0], x1[3]);
2255   x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
2256   x2[2] = _mm256_subs_epi16(x1[1], x1[2]);
2257   x2[4] = x1[4];
2258   btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], __rounding,
2259                   cos_bit);
2260   x2[5] = x1[5];
2261   x2[6] = x1[6];
2262   x2[7] = x1[7];
2263 
2264   // stage 3
2265   __m256i x3[8];
2266   btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x2[0], &x2[1], __rounding,
2267                   cos_bit);
2268   x3[0] = x2[0];
2269   x3[1] = x2[1];
2270   btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x2[2], &x2[3], __rounding,
2271                   cos_bit);
2272   x3[2] = x2[2];
2273   x3[3] = x2[3];
2274   x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
2275   x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
2276   x3[6] = _mm256_subs_epi16(x2[7], x2[6]);
2277   x3[7] = _mm256_adds_epi16(x2[7], x2[6]);
2278 
2279   // stage 4
2280   __m256i x4[8];
2281   x4[0] = x3[0];
2282   x4[1] = x3[1];
2283   x4[2] = x3[2];
2284   x4[3] = x3[3];
2285   btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x3[4], &x3[7], __rounding,
2286                   cos_bit);
2287   x4[4] = x3[4];
2288   x4[7] = x3[7];
2289   btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x3[5], &x3[6], __rounding,
2290                   cos_bit);
2291   x4[5] = x3[5];
2292   x4[6] = x3[6];
2293   // stage 5
2294   output[0] = x4[0];
2295   output[1] = x4[4];
2296   output[2] = x4[2];
2297   output[3] = x4[6];
2298   output[4] = x4[1];
2299   output[5] = x4[5];
2300   output[6] = x4[3];
2301   output[7] = x4[7];
2302 }
2303 
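// fadst8x8_new_avx2: length-8 forward ADST over 16 independent lanes. Stage 1
// negates and reorders the inputs, rotations at stages 2, 4 and 6 alternate
// with add/subtract butterflies at stages 3 and 5, and stage 7 writes the
// results in the ADST output order.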
2304 static inline void fadst8x8_new_avx2(const __m256i *input, __m256i *output,
2305                                      int8_t cos_bit) {
2306   const int32_t *cospi = cospi_arr(cos_bit);
2307   const __m256i __zero = _mm256_setzero_si256();
2308   const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
2309 
2310   __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
2311   __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
2312   __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
2313   __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
2314   __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
2315   __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
2316   __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
2317   __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
2318   __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
2319   __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
2320   __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
2321   __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
2322   __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
2323 
2324   // stage 1
2325   __m256i x1[8];
2326   x1[0] = input[0];
2327   x1[1] = _mm256_subs_epi16(__zero, input[7]);
2328   x1[2] = _mm256_subs_epi16(__zero, input[3]);
2329   x1[3] = input[4];
2330   x1[4] = _mm256_subs_epi16(__zero, input[1]);
2331   x1[5] = input[6];
2332   x1[6] = input[2];
2333   x1[7] = _mm256_subs_epi16(__zero, input[5]);
2334 
2335   // stage 2
2336   __m256i x2[8];
2337   x2[0] = x1[0];
2338   x2[1] = x1[1];
2339   btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], __rounding,
2340                   cos_bit);
2341   x2[2] = x1[2];
2342   x2[3] = x1[3];
2343   x2[4] = x1[4];
2344   x2[5] = x1[5];
2345   btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], __rounding,
2346                   cos_bit);
2347   x2[6] = x1[6];
2348   x2[7] = x1[7];
2349 
2350   // stage 3
2351   __m256i x3[8];
2352   x3[0] = _mm256_adds_epi16(x2[0], x2[2]);
2353   x3[2] = _mm256_subs_epi16(x2[0], x2[2]);
2354   x3[1] = _mm256_adds_epi16(x2[1], x2[3]);
2355   x3[3] = _mm256_subs_epi16(x2[1], x2[3]);
2356   x3[4] = _mm256_adds_epi16(x2[4], x2[6]);
2357   x3[6] = _mm256_subs_epi16(x2[4], x2[6]);
2358   x3[5] = _mm256_adds_epi16(x2[5], x2[7]);
2359   x3[7] = _mm256_subs_epi16(x2[5], x2[7]);
2360 
2361   // stage 4
2362   __m256i x4[8];
2363   x4[0] = x3[0];
2364   x4[1] = x3[1];
2365   x4[2] = x3[2];
2366   x4[3] = x3[3];
2367   btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x3[4], &x3[5], __rounding,
2368                   cos_bit);
2369   x4[4] = x3[4];
2370   x4[5] = x3[5];
2371   btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x3[6], &x3[7], __rounding,
2372                   cos_bit);
2373   x4[6] = x3[6];
2374   x4[7] = x3[7];
2375 
2376   // stage 5
2377   __m256i x5[8];
2378   x5[0] = _mm256_adds_epi16(x4[0], x4[4]);
2379   x5[4] = _mm256_subs_epi16(x4[0], x4[4]);
2380   x5[1] = _mm256_adds_epi16(x4[1], x4[5]);
2381   x5[5] = _mm256_subs_epi16(x4[1], x4[5]);
2382   x5[2] = _mm256_adds_epi16(x4[2], x4[6]);
2383   x5[6] = _mm256_subs_epi16(x4[2], x4[6]);
2384   x5[3] = _mm256_adds_epi16(x4[3], x4[7]);
2385   x5[7] = _mm256_subs_epi16(x4[3], x4[7]);
2386 
2387   // stage 6
2388   __m256i x6[8];
2389   btf_16_w16_avx2(cospi_p04_p60, cospi_p60_m04, &x5[0], &x5[1], __rounding,
2390                   cos_bit);
2391   x6[0] = x5[0];
2392   x6[1] = x5[1];
2393   btf_16_w16_avx2(cospi_p20_p44, cospi_p44_m20, &x5[2], &x5[3], __rounding,
2394                   cos_bit);
2395   x6[2] = x5[2];
2396   x6[3] = x5[3];
2397   btf_16_w16_avx2(cospi_p36_p28, cospi_p28_m36, &x5[4], &x5[5], __rounding,
2398                   cos_bit);
2399   x6[4] = x5[4];
2400   x6[5] = x5[5];
2401   btf_16_w16_avx2(cospi_p52_p12, cospi_p12_m52, &x5[6], &x5[7], __rounding,
2402                   cos_bit);
2403   x6[6] = x5[6];
2404   x6[7] = x5[7];
2405 
2406   // stage 7
2407   output[0] = x6[1];
2408   output[1] = x6[6];
2409   output[2] = x6[3];
2410   output[3] = x6[4];
2411   output[4] = x6[5];
2412   output[5] = x6[2];
2413   output[6] = x6[7];
2414   output[7] = x6[0];
2415 }
2416 
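// fidentity8x8_new_avx2: length-8 identity transform, which scales by 2. The
// doubling is done with a saturating add of each row to itself; cos_bit is
// unused.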
2417 static inline void fidentity8x8_new_avx2(const __m256i *input, __m256i *output,
2418                                          int8_t cos_bit) {
2419   (void)cos_bit;
2420 
2421   output[0] = _mm256_adds_epi16(input[0], input[0]);
2422   output[1] = _mm256_adds_epi16(input[1], input[1]);
2423   output[2] = _mm256_adds_epi16(input[2], input[2]);
2424   output[3] = _mm256_adds_epi16(input[3], input[3]);
2425   output[4] = _mm256_adds_epi16(input[4], input[4]);
2426   output[5] = _mm256_adds_epi16(input[5], input[5]);
2427   output[6] = _mm256_adds_epi16(input[6], input[6]);
2428   output[7] = _mm256_adds_epi16(input[7], input[7]);
2429 }
2430 
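// fdct8x16_new_avx2: length-16 forward DCT on eight columns held in __m128i
// rows. Pairs of 128-bit rows are packed into the two lanes of a __m256i so
// every butterfly runs at full AVX2 width; each cospi_arr[] entry carries one
// weight pair per 128-bit lane, letting a single btf_16_avx2 call apply two
// different rotations, and the permutes/blends between stages realign
// operands whose butterflies would otherwise straddle the lane boundary.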
2431 static inline void fdct8x16_new_avx2(const __m128i *input, __m128i *output,
2432                                      int8_t cos_bit) {
2433   const int32_t *cospi = cospi_arr(cos_bit);
2434   const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));
2435   const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
2436   __m128i temp0, temp1, temp2, temp3;
2437   __m256i in0, in1;
2438   __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
2439   __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
2440   __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
2441   __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
2442   __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
2443   __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
2444   __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
2445   __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
2446   __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
2447   __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
2448   __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
2449   __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
2450   __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
2451   __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
2452   __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
2453   __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
2454   __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
2455   __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
2456 
2457   __m256i cospi_arr[12];
2458 
2459   cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m32_p32),
2460                                          cospi_m32_p32, 0x1);
2461   cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
2462                                          cospi_p32_p32, 0x1);
2463   cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
2464                                          cospi_p48_p16, 0x1);
2465   cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
2466                                          cospi_m16_p48, 0x1);
2467   cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m16_p48),
2468                                          cospi_m48_m16, 0x1);
2469   cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_p16),
2470                                          cospi_m16_p48, 0x1);
2471   cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_p08),
2472                                          cospi_p24_p40, 0x1);
2473   cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m08_p56),
2474                                          cospi_m40_p24, 0x1);
2475   cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p60_p04),
2476                                          cospi_p28_p36, 0x1);
2477   cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m04_p60),
2478                                          cospi_m36_p28, 0x1);
2479   cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p44_p20),
2480                                           cospi_p12_p52, 0x1);
2481   cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m20_p44),
2482                                           cospi_m52_p12, 0x1);
2483 
2484   __m256i x[8];
2485   x[0] =
2486       _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[1], 0x1);
2487   x[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[15]), input[14],
2488                                  0x1);
2489   x[2] =
2490       _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[3], 0x1);
2491   x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[13]), input[12],
2492                                  0x1);
2493   x[4] =
2494       _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[4], 0x1);
2495   x[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[11],
2496                                  0x1);
2497   x[6] =
2498       _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[6], 0x1);
2499   x[7] =
2500       _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[9], 0x1);
2501 
2502   // stage 1
2503   __m256i x1[8];
2504   x1[0] = _mm256_adds_epi16(x[0], x[1]);
2505   x1[7] = _mm256_subs_epi16(x[0], x[1]);
2506   x1[1] = _mm256_adds_epi16(x[2], x[3]);
2507   x1[6] = _mm256_subs_epi16(x[2], x[3]);
2508   x1[2] = _mm256_adds_epi16(x[4], x[5]);
2509   x1[5] = _mm256_subs_epi16(x[4], x[5]);
2510   x1[3] = _mm256_adds_epi16(x[6], x[7]);
2511   x1[4] = _mm256_subs_epi16(x[6], x[7]);
2512 
2513   // stage 2
2514   __m256i x2[8];
2515   x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
2516   x2[7] = _mm256_subs_epi16(x1[0], x1[3]);
2517   x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
2518   x2[6] = _mm256_subs_epi16(x1[1], x1[2]);
2519   x2[2] = x1[4];
2520   x2[3] = x1[7];
2521   btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &x1[5], &x1[6], &temp0, &temp1,
2522               &temp2, &temp3, &__rounding_256, &cos_bit);
2523   x2[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp0, 0x1);
2524   x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);
2525 
2526   // stage 3
2527   __m256i x3[8];
2528   x2[1] = _mm256_permute4x64_epi64(x2[1], 0x4e);
2529   x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
2530   x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
2531   x3[2] = _mm256_blend_epi32(x2[7], x2[6], 0xf0);
2532   btf_16_sse2(cospi_m32_p32, cospi_p32_p32, _mm256_castsi256_si128(x2[6]),
2533               _mm256_extractf128_si256(x2[7], 0x01), temp0, temp1);
2534   x3[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp1), temp0, 0x1);
2535   x3[3] = _mm256_adds_epi16(x2[2], x2[4]);
2536   x3[4] = _mm256_subs_epi16(x2[2], x2[4]);
2537   x3[5] = _mm256_adds_epi16(x2[3], x2[5]);
2538   x3[6] = _mm256_subs_epi16(x2[3], x2[5]);
2539 
2540   // stage 4
2541   __m256i x4[8];
2542   x4[0] = _mm256_blend_epi32(x3[0], x3[1], 0xf0);
2543   x4[1] = _mm256_permute2f128_si256(x3[0], x3[1], 0x21);
2544   btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &x4[0], &x4[1], &output[0],
2545               &output[8], &output[4], &output[12], &__rounding_256, &cos_bit);
2546   x4[2] = _mm256_adds_epi16(x3[2], x3[7]);
2547   x4[3] = _mm256_subs_epi16(x3[2], x3[7]);
2548   x4[4] = _mm256_permute2f128_si256(x3[3], x3[4], 0x20);
2549   x4[5] = _mm256_permute2f128_si256(x3[6], x3[5], 0x20);
2550   in0 = _mm256_permute2f128_si256(x3[3], x3[4], 0x31);
2551   in1 = _mm256_permute2f128_si256(x3[5], x3[6], 0x31);
2552   btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2,
2553               &temp3, &__rounding_256, &cos_bit);
2554 
2555   x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp2, 0x1);
2556   x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);
2557 
2558   // stage 5
2559   __m256i x5[4];
2560   in0 = _mm256_permute2f128_si256(x4[2], x4[3], 0x31);
2561   in1 = _mm256_permute2f128_si256(x4[2], x4[3], 0x20);
2562   btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &output[2], &output[14],
2563               &output[10], &output[6], &__rounding_256, &cos_bit);
2564   x5[0] = _mm256_adds_epi16(x4[4], x4[6]);
2565   x5[1] = _mm256_subs_epi16(x4[4], x4[6]);
2566   x5[2] = _mm256_adds_epi16(x4[5], x4[7]);
2567   x5[3] = _mm256_subs_epi16(x4[5], x4[7]);
2568 
2569   // stage 6
2570   in0 = _mm256_permute2f128_si256(x5[0], x5[1], 0x20);
2571   in1 = _mm256_permute2f128_si256(x5[2], x5[3], 0x31);
2572   btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &output[1], &output[15],
2573               &output[9], &output[7], &__rounding_256, &cos_bit);
2574   in0 = _mm256_permute2f128_si256(x5[1], x5[0], 0x31);
2575   in1 = _mm256_permute2f128_si256(x5[3], x5[2], 0x20);
2576   btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &output[5],
2577               &output[11], &output[13], &output[3], &__rounding_256, &cos_bit);
2578 }
2579 
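// fadst8x16_new_avx2: length-16 forward ADST on eight columns, using the same
// two-rows-per-__m256i packing and per-lane cospi_arr[] weights as
// fdct8x16_new_avx2. The final btf_16_avx2 calls write straight into
// output[] in the ADST coefficient order.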
2580 static inline void fadst8x16_new_avx2(const __m128i *input, __m128i *output,
2581                                       int8_t cos_bit) {
2582   const int32_t *cospi = cospi_arr(cos_bit);
2583   const __m256i __zero = _mm256_setzero_si256();
2584   const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));
2585   __m256i in0, in1;
2586   __m128i temp0, temp1, temp2, temp3;
2587 
2588   __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
2589   __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
2590   __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
2591   __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
2592   __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
2593   __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
2594   __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
2595   __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
2596   __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
2597   __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
2598   __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
2599   __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
2600   __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
2601   __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
2602   __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
2603   __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
2604   __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
2605   __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
2606   __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
2607   __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
2608   __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
2609   __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
2610   __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
2611   __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
2612   __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
2613   __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
2614   __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
2615 
2616   __m256i cospi_arr[20];
2617 
2618   cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
2619                                          cospi_p32_p32, 0x1);
2620   cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
2621                                          cospi_p32_m32, 0x1);
2622   cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
2623                                          cospi_p32_p32, 0x1);
2624   cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
2625                                          cospi_p32_m32, 0x1);
2626   cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
2627                                          cospi_m48_p16, 0x1);
2628   cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
2629                                          cospi_p16_p48, 0x1);
2630   cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
2631                                          cospi_m48_p16, 0x1);
2632   cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
2633                                          cospi_p16_p48, 0x1);
2634   cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
2635                                          cospi_p40_p24, 0x1);
2636   cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_m08),
2637                                          cospi_p24_m40, 0x1);
2638   cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m56_p08),
2639                                           cospi_m24_p40, 0x1);
2640   cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
2641                                           cospi_p40_p24, 0x1);
2642   cospi_arr[12] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p02_p62),
2643                                           cospi_p10_p54, 0x1);
2644   cospi_arr[13] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p62_m02),
2645                                           cospi_p54_m10, 0x1);
2646   cospi_arr[14] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p18_p46),
2647                                           cospi_p26_p38, 0x1);
2648   cospi_arr[15] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p46_m18),
2649                                           cospi_p38_m26, 0x1);
2650   cospi_arr[16] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p34_p30),
2651                                           cospi_p42_p22, 0x1);
2652   cospi_arr[17] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p30_m34),
2653                                           cospi_p22_m42, 0x1);
2654   cospi_arr[18] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p50_p14),
2655                                           cospi_p58_p06, 0x1);
2656   cospi_arr[19] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p14_m50),
2657                                           cospi_p06_m58, 0x1);
2658 
2659   __m256i x[8];
2660   x[0] =
2661       _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[4], 0x1);
2662   x[1] =
2663       _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[6], 0x1);
2664   x[2] =
2665       _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[12], 0x1);
2666   x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[14],
2667                                  0x1);
2668   x[4] =
2669       _mm256_insertf128_si256(_mm256_castsi128_si256(input[1]), input[9], 0x1);
2670   x[5] =
2671       _mm256_insertf128_si256(_mm256_castsi128_si256(input[3]), input[11], 0x1);
2672   x[6] =
2673       _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[13], 0x1);
2674   x[7] =
2675       _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[15], 0x1);
2676 
2677   // stage 1
2678   __m256i x1[8];
2679   x1[0] = x[0];
2680   x1[1] = _mm256_subs_epi16(__zero, x[7]);
2681   x1[2] = x[2];
2682   x1[3] = _mm256_subs_epi16(__zero, x[5]);
2683   x1[4] = _mm256_subs_epi16(__zero, x[4]);
2684   x1[5] = x[3];
2685   x1[6] = _mm256_subs_epi16(__zero, x[6]);
2686   x1[7] = x[1];
2687 
2688   // stage 2
2689   __m256i x2[8];
2690   x2[0] = _mm256_blend_epi32(x1[0], x1[1], 0xf0);
2691   x2[3] = _mm256_blend_epi32(x1[3], x1[2], 0xf0);
2692   x2[4] = _mm256_blend_epi32(x1[4], x1[5], 0xf0);
2693   x2[7] = _mm256_blend_epi32(x1[7], x1[6], 0xf0);
2694   in0 = _mm256_blend_epi32(x1[1], x1[0], 0xf0);
2695   in1 = _mm256_blend_epi32(x1[2], x1[3], 0xf0);
2696   btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &in0, &in1, &temp0, &temp1, &temp2,
2697               &temp3, &__rounding_256, &cos_bit);
2698   x2[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
2699   x2[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
2700   in0 = _mm256_permute2f128_si256(x1[7], x1[6], 0x21);
2701   in1 = _mm256_permute2f128_si256(x1[4], x1[5], 0x21);
2702   btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &in0, &in1, &temp0, &temp1, &temp2,
2703               &temp3, &__rounding_256, &cos_bit);
2704   x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
2705   x2[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
2706 
2707   // stage 3
2708   __m256i x3[8];
2709   x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
2710   x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
2711   x3[2] = _mm256_adds_epi16(x2[3], x2[2]);
2712   x3[3] = _mm256_subs_epi16(x2[3], x2[2]);
2713   x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
2714   x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
2715   x3[6] = _mm256_adds_epi16(x2[7], x2[6]);
2716   x3[7] = _mm256_subs_epi16(x2[7], x2[6]);
2717 
2718   // stage 4
2719   __m256i x4[8];
2720   x4[0] = x3[0];
2721   x4[1] = x3[1];
2722   x4[4] = x3[4];
2723   x4[5] = x3[5];
2724   in0 = _mm256_permute2f128_si256(x3[2], x3[3], 0x20);
2725   in1 = _mm256_permute2f128_si256(x3[2], x3[3], 0x31);
2726   btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2,
2727               &temp3, &__rounding_256, &cos_bit);
2728   x4[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
2729   x4[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
2730   in0 = _mm256_permute2f128_si256(x3[6], x3[7], 0x20);
2731   in1 = _mm256_permute2f128_si256(x3[6], x3[7], 0x31);
2732   btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &temp0, &temp1, &temp2,
2733               &temp3, &__rounding_256, &cos_bit);
2734   x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
2735   x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
2736 
2737   // stage 5
2738   __m256i x5[8];
2739   x5[0] = _mm256_adds_epi16(x4[0], x4[2]);
2740   x5[1] = _mm256_subs_epi16(x4[0], x4[2]);
2741   x5[2] = _mm256_adds_epi16(x4[1], x4[3]);
2742   x5[3] = _mm256_subs_epi16(x4[1], x4[3]);
2743   x5[4] = _mm256_adds_epi16(x4[4], x4[6]);
2744   x5[5] = _mm256_subs_epi16(x4[4], x4[6]);
2745   x5[6] = _mm256_adds_epi16(x4[5], x4[7]);
2746   x5[7] = _mm256_subs_epi16(x4[5], x4[7]);
2747 
2748   // stage 6
2749   __m256i x6[8];
2750   x6[0] = x5[0];
2751   x6[1] = x5[2];
2752   x6[2] = x5[1];
2753   x6[3] = x5[3];
2754   in0 = _mm256_permute2f128_si256(x5[4], x5[6], 0x20);
2755   in1 = _mm256_permute2f128_si256(x5[4], x5[6], 0x31);
2756   btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &temp0, &temp1, &temp2,
2757               &temp3, &__rounding_256, &cos_bit);
2758   x6[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
2759   x6[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
2760   in0 = _mm256_permute2f128_si256(x5[5], x5[7], 0x20);
2761   in1 = _mm256_permute2f128_si256(x5[5], x5[7], 0x31);
2762   btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &temp0, &temp1,
2763               &temp2, &temp3, &__rounding_256, &cos_bit);
2764   x6[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
2765   x6[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
2766 
2767   // stage 7
2768   __m256i x7[8];
2769   x7[0] = _mm256_adds_epi16(x6[0], x6[4]);
2770   x7[1] = _mm256_subs_epi16(x6[0], x6[4]);
2771   x7[2] = _mm256_adds_epi16(x6[1], x6[5]);
2772   x7[3] = _mm256_subs_epi16(x6[1], x6[5]);
2773   x7[4] = _mm256_adds_epi16(x6[2], x6[6]);
2774   x7[5] = _mm256_subs_epi16(x6[2], x6[6]);
2775   x7[6] = _mm256_adds_epi16(x6[3], x6[7]);
2776   x7[7] = _mm256_subs_epi16(x6[3], x6[7]);
2777 
2778   // stage 8
2779   in0 = _mm256_permute2f128_si256(x7[0], x7[2], 0x20);
2780   in1 = _mm256_permute2f128_si256(x7[0], x7[2], 0x31);
2781   btf_16_avx2(&cospi_arr[12], &cospi_arr[13], &in0, &in1, &output[15],
2782               &output[0], &output[13], &output[2], &__rounding_256, &cos_bit);
2783   in0 = _mm256_permute2f128_si256(x7[4], x7[6], 0x20);
2784   in1 = _mm256_permute2f128_si256(x7[4], x7[6], 0x31);
2785   btf_16_avx2(&cospi_arr[14], &cospi_arr[15], &in0, &in1, &output[11],
2786               &output[4], &output[9], &output[6], &__rounding_256, &cos_bit);
2787   in0 = _mm256_permute2f128_si256(x7[1], x7[3], 0x20);
2788   in1 = _mm256_permute2f128_si256(x7[1], x7[3], 0x31);
2789   btf_16_avx2(&cospi_arr[16], &cospi_arr[17], &in0, &in1, &output[7],
2790               &output[8], &output[5], &output[10], &__rounding_256, &cos_bit);
2791   in0 = _mm256_permute2f128_si256(x7[5], x7[7], 0x20);
2792   in1 = _mm256_permute2f128_si256(x7[5], x7[7], 0x31);
2793   btf_16_avx2(&cospi_arr[18], &cospi_arr[19], &in0, &in1, &output[3],
2794               &output[12], &output[1], &output[14], &__rounding_256, &cos_bit);
2795 }
2796 
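// fidentity8x16_new_avx2: length-16 identity transform, which scales by
// 2 * sqrt(2) in NewSqrt2Bits fixed point. Interleaving the input with ones
// lets scale_round_avx2() fold the scale and the rounding offset into one
// madd before shifting back down.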
2797 static inline void fidentity8x16_new_avx2(const __m128i *input, __m128i *output,
2798                                           int8_t cos_bit) {
2799   (void)cos_bit;
2800   const __m256i one = _mm256_set1_epi16(1);
2801   __m256i temp;
2802   for (int i = 0; i < 16; i += 2) {
2803     temp = _mm256_insertf128_si256(_mm256_castsi128_si256(input[i]),
2804                                    input[i + 1], 0x1);
2805     const __m256i a_lo = _mm256_unpacklo_epi16(temp, one);
2806     const __m256i a_hi = _mm256_unpackhi_epi16(temp, one);
2807     const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2);
2808     const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2);
2809     temp = _mm256_packs_epi32(b_lo, b_hi);
2810     output[i] = _mm256_castsi256_si128(temp);
2811     output[i + 1] = _mm256_extractf128_si256(temp, 0x1);
2812   }
2813 }
2814 
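// 1-D kernel lookup tables indexed by TX_TYPE. The FLIPADST entries reuse the
// plain ADST kernels; the up/down and left/right flips are applied by the
// load/flip helpers in the 2-D wrappers below.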
2815 static const transform_1d_avx2 row_txfm8x16_arr[TX_TYPES] = {
2816   fdct8x8_new_avx2,       // DCT_DCT
2817   fdct8x8_new_avx2,       // ADST_DCT
2818   fadst8x8_new_avx2,      // DCT_ADST
2819   fadst8x8_new_avx2,      // ADST_ADST
2820   fdct8x8_new_avx2,       // FLIPADST_DCT
2821   fadst8x8_new_avx2,      // DCT_FLIPADST
2822   fadst8x8_new_avx2,      // FLIPADST_FLIPADST
2823   fadst8x8_new_avx2,      // ADST_FLIPADST
2824   fadst8x8_new_avx2,      // FLIPADST_ADST
2825   fidentity8x8_new_avx2,  // IDTX
2826   fidentity8x8_new_avx2,  // V_DCT
2827   fdct8x8_new_avx2,       // H_DCT
2828   fidentity8x8_new_avx2,  // V_ADST
2829   fadst8x8_new_avx2,      // H_ADST
2830   fidentity8x8_new_avx2,  // V_FLIPADST
2831   fadst8x8_new_avx2       // H_FLIPADST
2832 };
2833 
2834 static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = {
2835   fdct8x16_new_avx2,       // DCT_DCT
2836   fadst8x16_new_avx2,      // ADST_DCT
2837   fdct8x16_new_avx2,       // DCT_ADST
2838   fadst8x16_new_avx2,      // ADST_ADST
2839   fadst8x16_new_avx2,      // FLIPADST_DCT
2840   fdct8x16_new_avx2,       // DCT_FLIPADST
2841   fadst8x16_new_avx2,      // FLIPADST_FLIPADST
2842   fadst8x16_new_avx2,      // ADST_FLIPADST
2843   fadst8x16_new_avx2,      // FLIPADST_ADST
2844   fidentity8x16_new_avx2,  // IDTX
2845   fdct8x16_new_avx2,       // V_DCT
2846   fidentity8x16_new_avx2,  // H_DCT
2847   fadst8x16_new_avx2,      // V_ADST
2848   fidentity8x16_new_avx2,  // H_ADST
2849   fadst8x16_new_avx2,      // V_FLIPADST
2850   fidentity8x16_new_avx2   // H_FLIPADST
2851 };
2852 
2853 static const transform_1d_avx2 col_txfm16x8_arr[TX_TYPES] = {
2854   fdct8x8_new_avx2,       // DCT_DCT
2855   fadst8x8_new_avx2,      // ADST_DCT
2856   fdct8x8_new_avx2,       // DCT_ADST
2857   fadst8x8_new_avx2,      // ADST_ADST
2858   fadst8x8_new_avx2,      // FLIPADST_DCT
2859   fdct8x8_new_avx2,       // DCT_FLIPADST
2860   fadst8x8_new_avx2,      // FLIPADST_FLIPADST
2861   fadst8x8_new_avx2,      // ADST_FLIPADST
2862   fadst8x8_new_avx2,      // FLIPADST_ADST
2863   fidentity8x8_new_avx2,  // IDTX
2864   fdct8x8_new_avx2,       // V_DCT
2865   fidentity8x8_new_avx2,  // H_DCT
2866   fadst8x8_new_avx2,      // V_ADST
2867   fidentity8x8_new_avx2,  // H_ADST
2868   fadst8x8_new_avx2,      // V_FLIPADST
2869   fidentity8x8_new_avx2,  // H_FLIPADST
2870 };
2871 
2872 static const transform_1d_sse2 row_txfm16x8_arr[TX_TYPES] = {
2873   fdct8x16_new_avx2,       // DCT_DCT
2874   fdct8x16_new_avx2,       // ADST_DCT
2875   fadst8x16_new_avx2,      // DCT_ADST
2876   fadst8x16_new_avx2,      // ADST_ADST
2877   fdct8x16_new_avx2,       // FLIPADST_DCT
2878   fadst8x16_new_avx2,      // DCT_FLIPADST
2879   fadst8x16_new_avx2,      // FLIPADST_FLIPADST
2880   fadst8x16_new_avx2,      // ADST_FLIPADST
2881   fadst8x16_new_avx2,      // FLIPADST_ADST
2882   fidentity8x16_new_avx2,  // IDTX
2883   fidentity8x16_new_avx2,  // V_DCT
2884   fdct8x16_new_avx2,       // H_DCT
2885   fidentity8x16_new_avx2,  // V_ADST
2886   fadst8x16_new_avx2,      // H_ADST
2887   fidentity8x16_new_avx2,  // V_FLIPADST
2888   fadst8x16_new_avx2       // H_FLIPADST
2889 };
2890 
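// 2-D 8x16 forward transform. The length-16 column pass runs on __m128i rows,
// the result is transposed one 8x8 block at a time, the two 8-row halves are
// packed into __m256i registers, and the length-8 row pass then handles all
// 16 rows at full AVX2 width before the rectangular-scaling store.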
2891 static void lowbd_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *output,
2892                                        int stride, TX_TYPE tx_type, int bd) {
2893   (void)bd;
2894   __m128i buf0[16], buf1[16];
2895   __m256i buf2[8];
2896   const int8_t *shift = av1_fwd_txfm_shift_ls[TX_8X16];
2897   const int txw_idx = get_txw_idx(TX_8X16);
2898   const int txh_idx = get_txh_idx(TX_8X16);
2899   const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2900   const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2901   const int width = 8;
2902   const int height = 16;
2903   const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
2904   const transform_1d_avx2 row_txfm = row_txfm8x16_arr[tx_type];
2905   int ud_flip, lr_flip;
2906 
2907   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2908   if (ud_flip) {
2909     load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
2910   } else {
2911     load_buffer_16bit_to_16bit(input, stride, buf0, height);
2912   }
2913   round_shift_16bit(buf0, height, shift[0]);
2914   col_txfm(buf0, buf0, cos_bit_col);
2915   round_shift_16bit(buf0, height, shift[1]);
2916   transpose_16bit_8x8(buf0, buf1);
2917   transpose_16bit_8x8(buf0 + 8, buf1 + 8);
2918 
2919   __m128i *bufl, *bufu;
2920   if (lr_flip) {
2921     bufl = buf0;
2922     bufu = buf0 + 8;
2923     flip_buf_sse2(buf1 + width * 0, bufl, width);
2924     flip_buf_sse2(buf1 + width * 1, bufu, width);
2925   } else {
2926     bufl = buf1 + width * 0;
2927     bufu = buf1 + width * 1;
2928   }
2929   pack_reg(bufl, bufu, buf2);
2930   row_txfm(buf2, buf2, cos_bit_row);
2931   round_shift_16bit_w16_avx2(buf2, width, shift[2]);
2932   store_rect_buffer_16bit_to_32bit_w16_avx2(buf2, output, height, width);
2933 }
2934 
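// 2-D 16x8 forward transform: the mirror of the 8x16 case. The two 8-column
// halves are packed first so the length-8 column pass runs at full AVX2
// width; after the 16x8 transpose the data is unpacked again and the
// length-16 row pass works on __m128i rows.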
2935 static void lowbd_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *output,
2936                                        int stride, TX_TYPE tx_type, int bd) {
2937   (void)bd;
2938   __m128i buf0[16], buf1[16];
2939   __m256i buf2[8];
2940   const int8_t *shift = av1_fwd_txfm_shift_ls[TX_16X8];
2941   const int txw_idx = get_txw_idx(TX_16X8);
2942   const int txh_idx = get_txh_idx(TX_16X8);
2943   const int cos_bit_col = av1_fwd_cos_bit_col[txw_idx][txh_idx];
2944   const int cos_bit_row = av1_fwd_cos_bit_row[txw_idx][txh_idx];
2945   const int width = 16;
2946   const int height = 8;
2947   const transform_1d_avx2 col_txfm = col_txfm16x8_arr[tx_type];
2948   const transform_1d_sse2 row_txfm = row_txfm16x8_arr[tx_type];
2949   __m128i *buf;
2950   int ud_flip, lr_flip;
2951 
2952   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
2953 
2954   if (ud_flip) {
2955     load_buffer_16bit_to_16bit_flip(input + 8 * 0, stride, buf0, height);
2956     load_buffer_16bit_to_16bit_flip(input + 8 * 1, stride, &buf0[8], height);
2957   } else {
2958     load_buffer_16bit_to_16bit(input + 8 * 0, stride, buf0, height);
2959     load_buffer_16bit_to_16bit(input + 8 * 1, stride, &buf0[8], height);
2960   }
2961   pack_reg(buf0, &buf0[8], buf2);
2962   round_shift_16bit_w16_avx2(buf2, height, shift[0]);
2963   col_txfm(buf2, buf2, cos_bit_col);
2964   round_shift_16bit_w16_avx2(buf2, height, shift[1]);
2965   transpose_16bit_16x8_avx2(buf2, buf2);
2966   extract_reg(buf2, buf1);
2967 
2968   if (lr_flip) {
2969     buf = buf0;
2970     flip_buf_sse2(buf1, buf, width);
2971   } else {
2972     buf = buf1;
2973   }
2974   row_txfm(buf, buf, cos_bit_row);
2975   round_shift_16bit(buf, width, shift[2]);
2976   store_rect_buffer_16bit_to_32bit_w8(buf, output, height, width);
2977 }
2978 
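// Per-TX_SIZE dispatch table; sizes without a dedicated AVX2 path fall back
// to the SSE2 implementations.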
2979 static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
2980   av1_lowbd_fwd_txfm2d_4x4_sse2,   // 4x4 transform
2981   av1_lowbd_fwd_txfm2d_8x8_avx2,   // 8x8 transform
2982   lowbd_fwd_txfm2d_16x16_avx2,     // 16x16 transform
2983   lowbd_fwd_txfm2d_32x32_avx2,     // 32x32 transform
2984   lowbd_fwd_txfm2d_64x64_avx2,     // 64x64 transform
2985   av1_lowbd_fwd_txfm2d_4x8_sse2,   // 4x8 transform
2986   av1_lowbd_fwd_txfm2d_8x4_sse2,   // 8x4 transform
2987   lowbd_fwd_txfm2d_8x16_avx2,      // 8x16 transform
2988   lowbd_fwd_txfm2d_16x8_avx2,      // 16x8 transform
2989   lowbd_fwd_txfm2d_16x32_avx2,     // 16x32 transform
2990   lowbd_fwd_txfm2d_32x16_avx2,     // 32x16 transform
2991   lowbd_fwd_txfm2d_32x64_avx2,     // 32x64 transform
2992   lowbd_fwd_txfm2d_64x32_avx2,     // 64x32 transform
2993   av1_lowbd_fwd_txfm2d_4x16_sse2,  // 4x16 transform
2994   av1_lowbd_fwd_txfm2d_16x4_sse2,  // 16x4 transform
2995   av1_lowbd_fwd_txfm2d_8x32_sse2,  // 8x32 transform
2996   av1_lowbd_fwd_txfm2d_32x8_sse2,  // 32x8 transform
2997   lowbd_fwd_txfm2d_16x64_avx2,     // 16x64 transform
2998   lowbd_fwd_txfm2d_64x16_avx2,     // 64x16 transform
2999 };
3000 
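// Entry point for the low-bitdepth forward transform. Lossless 4x4 blocks
// fall back to the generic C implementation; every other size and type
// dispatches through the table above.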
3001 void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff,
3002                              int diff_stride, TxfmParam *txfm_param) {
3003   FwdTxfm2dFunc fwd_txfm2d_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
3004   if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) {
3005     av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
3006   } else {
3007     fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
3008                     txfm_param->bd);
3009   }
3010 }
3011