/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>  // SSE2

#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_dsp/x86/fwd_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
#include "vpx_ports/mem.h"

static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
                                   int stride) {
  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  __m128i mask;

  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));

  in[0] = _mm_slli_epi16(in[0], 4);
  in[1] = _mm_slli_epi16(in[1], 4);
  in[2] = _mm_slli_epi16(in[2], 4);
  in[3] = _mm_slli_epi16(in[3], 4);

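  // Match the nonzero-input bias of the scalar reference (cf.
  // vpx_fdct4x4_c): after the << 4 pre-scale every lane is a multiple of
  // 16, so the compare against {0, 1, 1, ...} can only match a zero in
  // lane 0. Adding the resulting -1 mask and then {1, 0, 0, ...} nets out
  // to "add 1 to element (0, 0) only when it is nonzero".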
  mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
  in[0] = _mm_add_epi16(in[0], mask);
  in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
}

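// Final rounding of the 4x4 coefficients, matching the scalar reference's
// (x + 1) >> 2 down-shift after the two 1-D passes. store_output() (from
// bitdepth_conversion_sse2.h) widens the 16-bit lanes to tran_low_t when
// tran_low_t is 32 bits.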
static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
  const __m128i kOne = _mm_set1_epi16(1);
  __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
  __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
  __m128i out01 = _mm_add_epi16(in01, kOne);
  __m128i out23 = _mm_add_epi16(in23, kOne);
  out01 = _mm_srai_epi16(out01, 2);
  out23 = _mm_srai_epi16(out23, 2);
  store_output(&out01, (output + 0 * 8));
  store_output(&out23, (output + 1 * 8));
}

static INLINE void transpose_4x4(__m128i *res) {
  // Combine and transpose
  // 00 01 02 03 20 21 22 23
  // 10 11 12 13 30 31 32 33
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

  // 00 10 01 11 02 12 03 13
  // 20 30 21 31 22 32 23 33
  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);

  // 00 10 20 30 01 11 21 31
  // 02 12 22 32 03 13 23 33
  // only use the first 4 16-bit integers
  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}

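// 1-D 4-point DCT on rows, then a transpose so a second call completes the
// 2-D transform. Each butterfly interleaves two rows and uses
// _mm_madd_epi16 with a pair_set_epi16(c0, c1) constant, so every 32-bit
// lane computes a * c0 + b * c1; the product is rounded back with
//   out = (a * c0 + b * c1 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS,
// where DCT_CONST_ROUNDING == 1 << (DCT_CONST_BITS - 1).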
static void fdct4_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u[4], v[4];
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpacklo_epi16(in[3], in[2]);

  v[0] = _mm_add_epi16(u[0], u[1]);
  v[1] = _mm_sub_epi16(u[0], u[1]);

  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
  u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
  u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
  u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
  transpose_4x4(in);
}

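// 1-D 4-point ADST on rows (sinpi_*_9 constants), then a transpose.
// in7 = in[0] + in[1] lets sinpi_3_9 * (x0 + x1 - x3) be formed with one
// madd minus a separate sinpi_3_9 * in[3] product, and
// u[5] = (s4 << 2) - s4 == 3 * s4 supplies the extra x3 term folded into
// the last output without another multiply.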
static void fadst4_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
  const __m128i kZero = _mm_setzero_si128();
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];
  __m128i in7 = _mm_add_epi16(in[0], in[1]);

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpacklo_epi16(in[2], kZero);
  u[4] = _mm_unpacklo_epi16(in[3], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_sub_epi32(v[2], v[6]);
  u[2] = _mm_add_epi32(v[3], v[4]);
  u[3] = _mm_sub_epi32(u[2], u[0]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_sub_epi32(u[4], v[5]);
  u[6] = _mm_add_epi32(u[3], u[5]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[2]);
  in[1] = _mm_packs_epi32(u[1], u[3]);
  transpose_4x4(in);
}

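// 4x4 forward hybrid transform. Each 1-D helper above transforms rows and
// transposes in place, so calling two of them produces the full 2-D
// transform; tx_type picks DCT or ADST per dimension, and the plain
// DCT_DCT case is delegated to the vpx_dsp SSE2 version.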
void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
                     int tx_type) {
  __m128i in[4];

  switch (tx_type) {
    case DCT_DCT: vpx_fdct4x4_sse2(input, output, stride); break;
    case ADST_DCT:
      load_buffer_4x4(input, in, stride);
      fadst4_sse2(in);
      fdct4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    case DCT_ADST:
      load_buffer_4x4(input, in, stride);
      fdct4_sse2(in);
      fadst4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    default:
      assert(tx_type == ADST_ADST);
      load_buffer_4x4(input, in, stride);
      fadst4_sse2(in);
      fadst4_sse2(in);
      write_buffer_4x4(output, in);
      break;
  }
}

// load 8x8 array
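// Input values are pre-scaled by 4 (<< 2) to carry two extra bits of
// precision through the two 1-D passes; the right_shift_* helpers remove
// the excess scaling before the coefficients are stored.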
static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
                                   int stride) {
  in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
  in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
  in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
  in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
  in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
  in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
  in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
  in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));

  in[0] = _mm_slli_epi16(in[0], 2);
  in[1] = _mm_slli_epi16(in[1], 2);
  in[2] = _mm_slli_epi16(in[2], 2);
  in[3] = _mm_slli_epi16(in[3], 2);
  in[4] = _mm_slli_epi16(in[4], 2);
  in[5] = _mm_slli_epi16(in[5], 2);
  in[6] = _mm_slli_epi16(in[6], 2);
  in[7] = _mm_slli_epi16(in[7], 2);
}

// right shift and rounding
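// Rounded arithmetic right shift. The sign mask (-1 for negative lanes) is
// subtracted so the result matches the scalar reference rounding:
//   bit == 1:  res = (x + (x < 0)) >> 1
//   bit == 2:  res = (x + 1 + (x < 0)) >> 2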
static INLINE void right_shift_8x8(__m128i *res, const int bit) {
  __m128i sign0 = _mm_srai_epi16(res[0], 15);
  __m128i sign1 = _mm_srai_epi16(res[1], 15);
  __m128i sign2 = _mm_srai_epi16(res[2], 15);
  __m128i sign3 = _mm_srai_epi16(res[3], 15);
  __m128i sign4 = _mm_srai_epi16(res[4], 15);
  __m128i sign5 = _mm_srai_epi16(res[5], 15);
  __m128i sign6 = _mm_srai_epi16(res[6], 15);
  __m128i sign7 = _mm_srai_epi16(res[7], 15);

  if (bit == 2) {
    const __m128i const_rounding = _mm_set1_epi16(1);
    res[0] = _mm_add_epi16(res[0], const_rounding);
    res[1] = _mm_add_epi16(res[1], const_rounding);
    res[2] = _mm_add_epi16(res[2], const_rounding);
    res[3] = _mm_add_epi16(res[3], const_rounding);
    res[4] = _mm_add_epi16(res[4], const_rounding);
    res[5] = _mm_add_epi16(res[5], const_rounding);
    res[6] = _mm_add_epi16(res[6], const_rounding);
    res[7] = _mm_add_epi16(res[7], const_rounding);
  }

  res[0] = _mm_sub_epi16(res[0], sign0);
  res[1] = _mm_sub_epi16(res[1], sign1);
  res[2] = _mm_sub_epi16(res[2], sign2);
  res[3] = _mm_sub_epi16(res[3], sign3);
  res[4] = _mm_sub_epi16(res[4], sign4);
  res[5] = _mm_sub_epi16(res[5], sign5);
  res[6] = _mm_sub_epi16(res[6], sign6);
  res[7] = _mm_sub_epi16(res[7], sign7);

  if (bit == 1) {
    res[0] = _mm_srai_epi16(res[0], 1);
    res[1] = _mm_srai_epi16(res[1], 1);
    res[2] = _mm_srai_epi16(res[2], 1);
    res[3] = _mm_srai_epi16(res[3], 1);
    res[4] = _mm_srai_epi16(res[4], 1);
    res[5] = _mm_srai_epi16(res[5], 1);
    res[6] = _mm_srai_epi16(res[6], 1);
    res[7] = _mm_srai_epi16(res[7], 1);
  } else {
    res[0] = _mm_srai_epi16(res[0], 2);
    res[1] = _mm_srai_epi16(res[1], 2);
    res[2] = _mm_srai_epi16(res[2], 2);
    res[3] = _mm_srai_epi16(res[3], 2);
    res[4] = _mm_srai_epi16(res[4], 2);
    res[5] = _mm_srai_epi16(res[5], 2);
    res[6] = _mm_srai_epi16(res[6], 2);
    res[7] = _mm_srai_epi16(res[7], 2);
  }
}

// write 8x8 array
static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
                                    int stride) {
  store_output(&res[0], (output + 0 * stride));
  store_output(&res[1], (output + 1 * stride));
  store_output(&res[2], (output + 2 * stride));
  store_output(&res[3], (output + 3 * stride));
  store_output(&res[4], (output + 4 * stride));
  store_output(&res[5], (output + 5 * stride));
  store_output(&res[6], (output + 6 * stride));
  store_output(&res[7], (output + 7 * stride));
}

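// 1-D 8-point DCT on rows, then a transpose. Stage 1 splits the input into
// an even half (s0..s3, producing outputs 0/2/4/6) and an odd half
// (s4..s7, refined through stages 2-4 into outputs 1/3/5/7); every
// rotation is an interleave plus _mm_madd_epi16 followed by the usual
// DCT_CONST_ROUNDING add and DCT_CONST_BITS shift.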
static void fdct8_sse2(__m128i *in) {
  // constants
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;

  // stage 1
  s0 = _mm_add_epi16(in[0], in[7]);
  s1 = _mm_add_epi16(in[1], in[6]);
  s2 = _mm_add_epi16(in[2], in[5]);
  s3 = _mm_add_epi16(in[3], in[4]);
  s4 = _mm_sub_epi16(in[3], in[4]);
  s5 = _mm_sub_epi16(in[2], in[5]);
  s6 = _mm_sub_epi16(in[1], in[6]);
  s7 = _mm_sub_epi16(in[0], in[7]);

  u0 = _mm_add_epi16(s0, s3);
  u1 = _mm_add_epi16(s1, s2);
  u2 = _mm_sub_epi16(s1, s2);
  u3 = _mm_sub_epi16(s0, s3);
  // interleave and perform butterfly multiplication/addition
  v0 = _mm_unpacklo_epi16(u0, u1);
  v1 = _mm_unpackhi_epi16(u0, u1);
  v2 = _mm_unpacklo_epi16(u2, u3);
  v3 = _mm_unpackhi_epi16(u2, u3);

  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);

  // shift and rounding
  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u0, u1);
  in[2] = _mm_packs_epi32(u4, u5);
  in[4] = _mm_packs_epi32(u2, u3);
  in[6] = _mm_packs_epi32(u6, u7);

  // stage 2
  // interleave and perform butterfly multiplication/addition
  u0 = _mm_unpacklo_epi16(s6, s5);
  u1 = _mm_unpackhi_epi16(s6, s5);
  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);

  u0 = _mm_packs_epi32(v0, v1);
  u1 = _mm_packs_epi32(v2, v3);

  // stage 3
  s0 = _mm_add_epi16(s4, u0);
  s1 = _mm_sub_epi16(s4, u0);
  s2 = _mm_sub_epi16(s7, u1);
  s3 = _mm_add_epi16(s7, u1);

  // stage 4
  u0 = _mm_unpacklo_epi16(s0, s3);
  u1 = _mm_unpackhi_epi16(s0, s3);
  u2 = _mm_unpacklo_epi16(s1, s2);
  u3 = _mm_unpackhi_epi16(s1, s2);

  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  in[1] = _mm_packs_epi32(v0, v1);
  in[3] = _mm_packs_epi32(v4, v5);
  in[5] = _mm_packs_epi32(v2, v3);
  in[7] = _mm_packs_epi32(v6, v7);

  // transpose
  transpose_16bit_8x8(in, in);
}

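// 1-D 8-point ADST on rows, then a transpose. The initial reordering
// (in0 = in[7], in1 = in[0], ...) pairs the rows consumed together by the
// stage-1 butterflies, and the final subtractions from zero apply the
// alternating sign flips of the ADST output.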
static void fadst8_sse2(__m128i *in) {
  // Constants
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_setzero_si128();
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // properly aligned for butterfly input
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  // FIXME(jingning): do subtract using bit inversion?
  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);

  // transpose
  transpose_16bit_8x8(in, in);
}

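// 8x8 forward hybrid transform; same dispatch pattern as vp9_fht4x4_sse2().
// The non-DCT_DCT paths end with right_shift_8x8(in, 1), i.e. the scalar
// (x + (x < 0)) >> 1 down-scaling after the two 1-D passes.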
void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
                     int tx_type) {
  __m128i in[8];

  switch (tx_type) {
    case DCT_DCT: vpx_fdct8x8_sse2(input, output, stride); break;
    case ADST_DCT:
      load_buffer_8x8(input, in, stride);
      fadst8_sse2(in);
      fdct8_sse2(in);
      right_shift_8x8(in, 1);
      write_buffer_8x8(output, in, 8);
      break;
    case DCT_ADST:
      load_buffer_8x8(input, in, stride);
      fdct8_sse2(in);
      fadst8_sse2(in);
      right_shift_8x8(in, 1);
      write_buffer_8x8(output, in, 8);
      break;
    default:
      assert(tx_type == ADST_ADST);
      load_buffer_8x8(input, in, stride);
      fadst8_sse2(in);
      fadst8_sse2(in);
      right_shift_8x8(in, 1);
      write_buffer_8x8(output, in, 8);
      break;
  }
}

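// A 16x16 block is processed as two 8-column halves: in0 holds the left
// 8x16 half and in1 the right one, sixteen rows apiece.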
static INLINE void load_buffer_16x16(const int16_t *input, __m128i *in0,
                                     __m128i *in1, int stride) {
  // load first 8 columns
  load_buffer_8x8(input, in0, stride);
  load_buffer_8x8(input + 8 * stride, in0 + 8, stride);

  input += 8;
  // load second 8 columns
  load_buffer_8x8(input, in1, stride);
  load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
}

static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
                                      __m128i *in1, int stride) {
  // write first 8 columns
  write_buffer_8x8(output, in0, stride);
  write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
  // write second 8 columns
  output += 8;
  write_buffer_8x8(output, in1, stride);
  write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
}

static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
  // perform rounding operations
  right_shift_8x8(res0, 2);
  right_shift_8x8(res0 + 8, 2);
  right_shift_8x8(res1, 2);
  right_shift_8x8(res1 + 8, 2);
}

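// 1-D 16-point DCT over 8 columns of 16 rows. No transpose happens here;
// the 16x16 wrappers (not shown in this excerpt) transpose between the two
// passes. Stage 1 folds rows n and 15 - n into an even half that reuses
// the 8-point even/odd recursion (outputs 0/4/8/12 and 2/6/10/14); stages
// 2-6 resolve the s[0..7] differences into the odd outputs.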
static void fdct16_8col(__m128i *in) {
  // perform 16x16 1-D DCT for 8 columns
  __m128i i[8], s[8], p[8], t[8], u[16], v[16];
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  // stage 1
  i[0] = _mm_add_epi16(in[0], in[15]);
  i[1] = _mm_add_epi16(in[1], in[14]);
  i[2] = _mm_add_epi16(in[2], in[13]);
  i[3] = _mm_add_epi16(in[3], in[12]);
  i[4] = _mm_add_epi16(in[4], in[11]);
  i[5] = _mm_add_epi16(in[5], in[10]);
  i[6] = _mm_add_epi16(in[6], in[9]);
  i[7] = _mm_add_epi16(in[7], in[8]);

  s[0] = _mm_sub_epi16(in[7], in[8]);
  s[1] = _mm_sub_epi16(in[6], in[9]);
  s[2] = _mm_sub_epi16(in[5], in[10]);
  s[3] = _mm_sub_epi16(in[4], in[11]);
  s[4] = _mm_sub_epi16(in[3], in[12]);
  s[5] = _mm_sub_epi16(in[2], in[13]);
  s[6] = _mm_sub_epi16(in[1], in[14]);
  s[7] = _mm_sub_epi16(in[0], in[15]);

  p[0] = _mm_add_epi16(i[0], i[7]);
  p[1] = _mm_add_epi16(i[1], i[6]);
  p[2] = _mm_add_epi16(i[2], i[5]);
  p[3] = _mm_add_epi16(i[3], i[4]);
  p[4] = _mm_sub_epi16(i[3], i[4]);
  p[5] = _mm_sub_epi16(i[2], i[5]);
  p[6] = _mm_sub_epi16(i[1], i[6]);
  p[7] = _mm_sub_epi16(i[0], i[7]);

  u[0] = _mm_add_epi16(p[0], p[3]);
  u[1] = _mm_add_epi16(p[1], p[2]);
  u[2] = _mm_sub_epi16(p[1], p[2]);
  u[3] = _mm_sub_epi16(p[0], p[3]);

  v[0] = _mm_unpacklo_epi16(u[0], u[1]);
  v[1] = _mm_unpackhi_epi16(u[0], u[1]);
  v[2] = _mm_unpacklo_epi16(u[2], u[3]);
  v[3] = _mm_unpackhi_epi16(u[2], u[3]);

  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
  u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
  u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
  u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
  u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
  u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
  u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
  u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[4] = _mm_packs_epi32(u[4], u[5]);
  in[8] = _mm_packs_epi32(u[2], u[3]);
  in[12] = _mm_packs_epi32(u[6], u[7]);

  u[0] = _mm_unpacklo_epi16(p[5], p[6]);
  u[1] = _mm_unpackhi_epi16(p[5], p[6]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  u[0] = _mm_packs_epi32(v[0], v[1]);
  u[1] = _mm_packs_epi32(v[2], v[3]);

  t[0] = _mm_add_epi16(p[4], u[0]);
  t[1] = _mm_sub_epi16(p[4], u[0]);
  t[2] = _mm_sub_epi16(p[7], u[1]);
  t[3] = _mm_add_epi16(p[7], u[1]);

  u[0] = _mm_unpacklo_epi16(t[0], t[3]);
  u[1] = _mm_unpackhi_epi16(t[0], t[3]);
  u[2] = _mm_unpacklo_epi16(t[1], t[2]);
  u[3] = _mm_unpackhi_epi16(t[1], t[2]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
  v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
  v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
  v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
  v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  in[2] = _mm_packs_epi32(v[0], v[1]);
  in[6] = _mm_packs_epi32(v[4], v[5]);
  in[10] = _mm_packs_epi32(v[2], v[3]);
  in[14] = _mm_packs_epi32(v[6], v[7]);

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[2], s[5]);
  u[1] = _mm_unpackhi_epi16(s[2], s[5]);
  u[2] = _mm_unpacklo_epi16(s[3], s[4]);
  u[3] = _mm_unpackhi_epi16(s[3], s[4]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[2] = _mm_packs_epi32(v[0], v[1]);
  t[3] = _mm_packs_epi32(v[2], v[3]);
  t[4] = _mm_packs_epi32(v[4], v[5]);
  t[5] = _mm_packs_epi32(v[6], v[7]);

  // stage 3
  p[0] = _mm_add_epi16(s[0], t[3]);
  p[1] = _mm_add_epi16(s[1], t[2]);
  p[2] = _mm_sub_epi16(s[1], t[2]);
  p[3] = _mm_sub_epi16(s[0], t[3]);
  p[4] = _mm_sub_epi16(s[7], t[4]);
  p[5] = _mm_sub_epi16(s[6], t[5]);
  p[6] = _mm_add_epi16(s[6], t[5]);
  p[7] = _mm_add_epi16(s[7], t[4]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(p[1], p[6]);
  u[1] = _mm_unpackhi_epi16(p[1], p[6]);
  u[2] = _mm_unpacklo_epi16(p[2], p[5]);
  u[3] = _mm_unpackhi_epi16(p[2], p[5]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
  v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
  v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
  v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
  v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[1] = _mm_packs_epi32(v[0], v[1]);
  t[2] = _mm_packs_epi32(v[2], v[3]);
  t[5] = _mm_packs_epi32(v[4], v[5]);
  t[6] = _mm_packs_epi32(v[6], v[7]);

  // stage 5
  s[0] = _mm_add_epi16(p[0], t[1]);
  s[1] = _mm_sub_epi16(p[0], t[1]);
  s[2] = _mm_add_epi16(p[3], t[2]);
  s[3] = _mm_sub_epi16(p[3], t[2]);
  s[4] = _mm_sub_epi16(p[4], t[5]);
  s[5] = _mm_add_epi16(p[4], t[5]);
  s[6] = _mm_sub_epi16(p[7], t[6]);
  s[7] = _mm_add_epi16(p[7], t[6]);

  // stage 6
  u[0] = _mm_unpacklo_epi16(s[0], s[7]);
  u[1] = _mm_unpackhi_epi16(s[0], s[7]);
  u[2] = _mm_unpacklo_epi16(s[1], s[6]);
  u[3] = _mm_unpackhi_epi16(s[1], s[6]);
  u[4] = _mm_unpacklo_epi16(s[2], s[5]);
  u[5] = _mm_unpackhi_epi16(s[2], s[5]);
  u[6] = _mm_unpacklo_epi16(s[3], s[4]);
  u[7] = _mm_unpackhi_epi16(s[3], s[4]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
  v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
  v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
  v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
  v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
  v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
  v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
  v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
  v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
  v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
  v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
  v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
  v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  in[1] = _mm_packs_epi32(v[0], v[1]);
  in[9] = _mm_packs_epi32(v[2], v[3]);
  in[5] = _mm_packs_epi32(v[4], v[5]);
  in[13] = _mm_packs_epi32(v[6], v[7]);
  in[3] = _mm_packs_epi32(v[8], v[9]);
  in[11] = _mm_packs_epi32(v[10], v[11]);
  in[7] = _mm_packs_epi32(v[12], v[13]);
  in[15] = _mm_packs_epi32(v[14], v[15]);
}

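// 1-D 16-point ADST over 8 columns; like fdct16_8col(), the transpose is
// left to the caller. Stage 1 pairs mirrored rows (in[15] with in[0],
// in[13] with in[2], ...) through the odd cospi_n_64 rotations, and later
// stages apply the 8- and 4-point rotation stages of the ADST lattice.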
fadst16_8col(__m128i * in)1026 static void fadst16_8col(__m128i *in) {
1027   // perform 16x16 1-D ADST for 8 columns
1028   __m128i s[16], x[16], u[32], v[32];
1029   const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
1030   const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
1031   const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
1032   const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
1033   const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
1034   const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
1035   const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
1036   const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
1037   const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
1038   const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
1039   const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
1040   const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
1041   const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
1042   const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
1043   const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
1044   const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
1045   const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1046   const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1047   const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1048   const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1049   const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
1050   const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
1051   const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1052   const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1053   const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
1054   const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
1055   const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
1056   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1057   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1058   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1059   const __m128i kZero = _mm_setzero_si128();
1060 
1061   u[0] = _mm_unpacklo_epi16(in[15], in[0]);
1062   u[1] = _mm_unpackhi_epi16(in[15], in[0]);
1063   u[2] = _mm_unpacklo_epi16(in[13], in[2]);
1064   u[3] = _mm_unpackhi_epi16(in[13], in[2]);
1065   u[4] = _mm_unpacklo_epi16(in[11], in[4]);
1066   u[5] = _mm_unpackhi_epi16(in[11], in[4]);
1067   u[6] = _mm_unpacklo_epi16(in[9], in[6]);
1068   u[7] = _mm_unpackhi_epi16(in[9], in[6]);
1069   u[8] = _mm_unpacklo_epi16(in[7], in[8]);
1070   u[9] = _mm_unpackhi_epi16(in[7], in[8]);
1071   u[10] = _mm_unpacklo_epi16(in[5], in[10]);
1072   u[11] = _mm_unpackhi_epi16(in[5], in[10]);
1073   u[12] = _mm_unpacklo_epi16(in[3], in[12]);
1074   u[13] = _mm_unpackhi_epi16(in[3], in[12]);
1075   u[14] = _mm_unpacklo_epi16(in[1], in[14]);
1076   u[15] = _mm_unpackhi_epi16(in[1], in[14]);
1077 
1078   v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
1079   v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
1080   v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
1081   v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
1082   v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
1083   v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
1084   v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
1085   v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
1086   v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
1087   v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
1088   v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
1089   v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
1090   v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
1091   v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
1092   v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
1093   v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
1094   v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
1095   v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
1096   v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
1097   v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
1098   v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
1099   v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
1100   v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
1101   v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
1102   v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
1103   v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
1104   v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
1105   v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
1106   v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
1107   v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
1108   v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
1109   v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
1110 
1111   u[0] = _mm_add_epi32(v[0], v[16]);
1112   u[1] = _mm_add_epi32(v[1], v[17]);
1113   u[2] = _mm_add_epi32(v[2], v[18]);
1114   u[3] = _mm_add_epi32(v[3], v[19]);
1115   u[4] = _mm_add_epi32(v[4], v[20]);
1116   u[5] = _mm_add_epi32(v[5], v[21]);
1117   u[6] = _mm_add_epi32(v[6], v[22]);
1118   u[7] = _mm_add_epi32(v[7], v[23]);
1119   u[8] = _mm_add_epi32(v[8], v[24]);
1120   u[9] = _mm_add_epi32(v[9], v[25]);
1121   u[10] = _mm_add_epi32(v[10], v[26]);
1122   u[11] = _mm_add_epi32(v[11], v[27]);
1123   u[12] = _mm_add_epi32(v[12], v[28]);
1124   u[13] = _mm_add_epi32(v[13], v[29]);
1125   u[14] = _mm_add_epi32(v[14], v[30]);
1126   u[15] = _mm_add_epi32(v[15], v[31]);
1127   u[16] = _mm_sub_epi32(v[0], v[16]);
1128   u[17] = _mm_sub_epi32(v[1], v[17]);
1129   u[18] = _mm_sub_epi32(v[2], v[18]);
1130   u[19] = _mm_sub_epi32(v[3], v[19]);
1131   u[20] = _mm_sub_epi32(v[4], v[20]);
1132   u[21] = _mm_sub_epi32(v[5], v[21]);
1133   u[22] = _mm_sub_epi32(v[6], v[22]);
1134   u[23] = _mm_sub_epi32(v[7], v[23]);
1135   u[24] = _mm_sub_epi32(v[8], v[24]);
1136   u[25] = _mm_sub_epi32(v[9], v[25]);
1137   u[26] = _mm_sub_epi32(v[10], v[26]);
1138   u[27] = _mm_sub_epi32(v[11], v[27]);
1139   u[28] = _mm_sub_epi32(v[12], v[28]);
1140   u[29] = _mm_sub_epi32(v[13], v[29]);
1141   u[30] = _mm_sub_epi32(v[14], v[30]);
1142   u[31] = _mm_sub_epi32(v[15], v[31]);
1143 
1144   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1145   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1146   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1147   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1148   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1149   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1150   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1151   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1152   v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1153   v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1154   v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1155   v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1156   v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1157   v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1158   v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1159   v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1160   v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
1161   v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
1162   v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
1163   v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
1164   v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
1165   v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
1166   v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
1167   v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
1168   v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
1169   v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
1170   v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
1171   v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
1172   v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
1173   v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
1174   v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
1175   v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
1176 
1177   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1178   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1179   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1180   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1181   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1182   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1183   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1184   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1185   u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1186   u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1187   u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1188   u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1189   u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1190   u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1191   u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1192   u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1193   u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
1194   u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
1195   u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
1196   u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
1197   u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
1198   u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
1199   u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
1200   u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
1201   u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
1202   u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
1203   u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
1204   u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
1205   u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
1206   u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
1207   u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
1208   u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
1209 
1210   s[0] = _mm_packs_epi32(u[0], u[1]);
1211   s[1] = _mm_packs_epi32(u[2], u[3]);
1212   s[2] = _mm_packs_epi32(u[4], u[5]);
1213   s[3] = _mm_packs_epi32(u[6], u[7]);
1214   s[4] = _mm_packs_epi32(u[8], u[9]);
1215   s[5] = _mm_packs_epi32(u[10], u[11]);
1216   s[6] = _mm_packs_epi32(u[12], u[13]);
1217   s[7] = _mm_packs_epi32(u[14], u[15]);
1218   s[8] = _mm_packs_epi32(u[16], u[17]);
1219   s[9] = _mm_packs_epi32(u[18], u[19]);
1220   s[10] = _mm_packs_epi32(u[20], u[21]);
1221   s[11] = _mm_packs_epi32(u[22], u[23]);
1222   s[12] = _mm_packs_epi32(u[24], u[25]);
1223   s[13] = _mm_packs_epi32(u[26], u[27]);
1224   s[14] = _mm_packs_epi32(u[28], u[29]);
1225   s[15] = _mm_packs_epi32(u[30], u[31]);
1226 
1227   // stage 2
1228   u[0] = _mm_unpacklo_epi16(s[8], s[9]);
1229   u[1] = _mm_unpackhi_epi16(s[8], s[9]);
1230   u[2] = _mm_unpacklo_epi16(s[10], s[11]);
1231   u[3] = _mm_unpackhi_epi16(s[10], s[11]);
1232   u[4] = _mm_unpacklo_epi16(s[12], s[13]);
1233   u[5] = _mm_unpackhi_epi16(s[12], s[13]);
1234   u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1235   u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1236 
1237   v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1238   v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1239   v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1240   v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1241   v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1242   v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1243   v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1244   v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1245   v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
1246   v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
1247   v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
1248   v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
1249   v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
1250   v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
1251   v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
1252   v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
1253 
1254   u[0] = _mm_add_epi32(v[0], v[8]);
1255   u[1] = _mm_add_epi32(v[1], v[9]);
1256   u[2] = _mm_add_epi32(v[2], v[10]);
1257   u[3] = _mm_add_epi32(v[3], v[11]);
1258   u[4] = _mm_add_epi32(v[4], v[12]);
1259   u[5] = _mm_add_epi32(v[5], v[13]);
1260   u[6] = _mm_add_epi32(v[6], v[14]);
1261   u[7] = _mm_add_epi32(v[7], v[15]);
1262   u[8] = _mm_sub_epi32(v[0], v[8]);
1263   u[9] = _mm_sub_epi32(v[1], v[9]);
1264   u[10] = _mm_sub_epi32(v[2], v[10]);
1265   u[11] = _mm_sub_epi32(v[3], v[11]);
1266   u[12] = _mm_sub_epi32(v[4], v[12]);
1267   u[13] = _mm_sub_epi32(v[5], v[13]);
1268   u[14] = _mm_sub_epi32(v[6], v[14]);
1269   u[15] = _mm_sub_epi32(v[7], v[15]);
1270 
1271   v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1272   v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1273   v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1274   v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1275   v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1276   v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1277   v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1278   v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1279   v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1280   v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1281   v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1282   v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1283   v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1284   v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1285   v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1286   v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1287 
1288   u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1289   u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1290   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1291   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1292   u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1293   u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1294   u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1295   u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1296   u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1297   u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1298   u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1299   u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1300   u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1301   u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1302   u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1303   u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1304 
1305   x[0] = _mm_add_epi16(s[0], s[4]);
1306   x[1] = _mm_add_epi16(s[1], s[5]);
1307   x[2] = _mm_add_epi16(s[2], s[6]);
1308   x[3] = _mm_add_epi16(s[3], s[7]);
1309   x[4] = _mm_sub_epi16(s[0], s[4]);
1310   x[5] = _mm_sub_epi16(s[1], s[5]);
1311   x[6] = _mm_sub_epi16(s[2], s[6]);
1312   x[7] = _mm_sub_epi16(s[3], s[7]);
1313   x[8] = _mm_packs_epi32(u[0], u[1]);
1314   x[9] = _mm_packs_epi32(u[2], u[3]);
1315   x[10] = _mm_packs_epi32(u[4], u[5]);
1316   x[11] = _mm_packs_epi32(u[6], u[7]);
1317   x[12] = _mm_packs_epi32(u[8], u[9]);
1318   x[13] = _mm_packs_epi32(u[10], u[11]);
1319   x[14] = _mm_packs_epi32(u[12], u[13]);
1320   x[15] = _mm_packs_epi32(u[14], u[15]);
1321 
1322   // stage 3
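  // Same rotation pattern with the cospi(8)/cospi(24) pair, applied to x[4..7]
  // and x[12..15]; x[0..3] and x[8..11] only need the adds/subtracts below.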
1323   u[0] = _mm_unpacklo_epi16(x[4], x[5]);
1324   u[1] = _mm_unpackhi_epi16(x[4], x[5]);
1325   u[2] = _mm_unpacklo_epi16(x[6], x[7]);
1326   u[3] = _mm_unpackhi_epi16(x[6], x[7]);
1327   u[4] = _mm_unpacklo_epi16(x[12], x[13]);
1328   u[5] = _mm_unpackhi_epi16(x[12], x[13]);
1329   u[6] = _mm_unpacklo_epi16(x[14], x[15]);
1330   u[7] = _mm_unpackhi_epi16(x[14], x[15]);
1331 
1332   v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
1333   v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
1334   v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
1335   v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
1336   v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
1337   v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
1338   v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
1339   v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
1340   v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
1341   v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
1342   v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
1343   v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
1344   v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
1345   v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
1346   v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
1347   v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
1348 
1349   u[0] = _mm_add_epi32(v[0], v[4]);
1350   u[1] = _mm_add_epi32(v[1], v[5]);
1351   u[2] = _mm_add_epi32(v[2], v[6]);
1352   u[3] = _mm_add_epi32(v[3], v[7]);
1353   u[4] = _mm_sub_epi32(v[0], v[4]);
1354   u[5] = _mm_sub_epi32(v[1], v[5]);
1355   u[6] = _mm_sub_epi32(v[2], v[6]);
1356   u[7] = _mm_sub_epi32(v[3], v[7]);
1357   u[8] = _mm_add_epi32(v[8], v[12]);
1358   u[9] = _mm_add_epi32(v[9], v[13]);
1359   u[10] = _mm_add_epi32(v[10], v[14]);
1360   u[11] = _mm_add_epi32(v[11], v[15]);
1361   u[12] = _mm_sub_epi32(v[8], v[12]);
1362   u[13] = _mm_sub_epi32(v[9], v[13]);
1363   u[14] = _mm_sub_epi32(v[10], v[14]);
1364   u[15] = _mm_sub_epi32(v[11], v[15]);
1365 
1366   u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1367   u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1368   u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1369   u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1370   u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1371   u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1372   u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1373   u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1374   u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1375   u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1376   u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1377   u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1378   u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1379   u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1380   u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1381   u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1382 
1383   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1384   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1385   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1386   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1387   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1388   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1389   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1390   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1391   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1392   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1393   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1394   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1395   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1396   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1397   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1398   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1399 
1400   s[0] = _mm_add_epi16(x[0], x[2]);
1401   s[1] = _mm_add_epi16(x[1], x[3]);
1402   s[2] = _mm_sub_epi16(x[0], x[2]);
1403   s[3] = _mm_sub_epi16(x[1], x[3]);
1404   s[4] = _mm_packs_epi32(v[0], v[1]);
1405   s[5] = _mm_packs_epi32(v[2], v[3]);
1406   s[6] = _mm_packs_epi32(v[4], v[5]);
1407   s[7] = _mm_packs_epi32(v[6], v[7]);
1408   s[8] = _mm_add_epi16(x[8], x[10]);
1409   s[9] = _mm_add_epi16(x[9], x[11]);
1410   s[10] = _mm_sub_epi16(x[8], x[10]);
1411   s[11] = _mm_sub_epi16(x[9], x[11]);
1412   s[12] = _mm_packs_epi32(v[8], v[9]);
1413   s[13] = _mm_packs_epi32(v[10], v[11]);
1414   s[14] = _mm_packs_epi32(v[12], v[13]);
1415   s[15] = _mm_packs_epi32(v[14], v[15]);
1416 
1417   // stage 4
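  // Final cospi(16) (45-degree) butterflies on the four remaining pairs; the
  // other eight terms are already in output form up to sign.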
1418   u[0] = _mm_unpacklo_epi16(s[2], s[3]);
1419   u[1] = _mm_unpackhi_epi16(s[2], s[3]);
1420   u[2] = _mm_unpacklo_epi16(s[6], s[7]);
1421   u[3] = _mm_unpackhi_epi16(s[6], s[7]);
1422   u[4] = _mm_unpacklo_epi16(s[10], s[11]);
1423   u[5] = _mm_unpackhi_epi16(s[10], s[11]);
1424   u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1425   u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1426 
1427   v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
1428   v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
1429   v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
1430   v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
1431   v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
1432   v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
1433   v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
1434   v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
1435   v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
1436   v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
1437   v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
1438   v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
1439   v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
1440   v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
1441   v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
1442   v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
1443 
1444   u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
1445   u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
1446   u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
1447   u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
1448   u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
1449   u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
1450   u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
1451   u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
1452   u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
1453   u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
1454   u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
1455   u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
1456   u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
1457   u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
1458   u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
1459   u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
1460 
1461   v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
1462   v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
1463   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
1464   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
1465   v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
1466   v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
1467   v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
1468   v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
1469   v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
1470   v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
1471   v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
1472   v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
1473   v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
1474   v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
1475   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
1476   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
1477 
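  // Assemble the outputs in fadst16 order; SSE2 has no 16-bit negate, so the
  // sign-flipped terms are computed as kZero - x.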
1478   in[0] = s[0];
1479   in[1] = _mm_sub_epi16(kZero, s[8]);
1480   in[2] = s[12];
1481   in[3] = _mm_sub_epi16(kZero, s[4]);
1482   in[4] = _mm_packs_epi32(v[4], v[5]);
1483   in[5] = _mm_packs_epi32(v[12], v[13]);
1484   in[6] = _mm_packs_epi32(v[8], v[9]);
1485   in[7] = _mm_packs_epi32(v[0], v[1]);
1486   in[8] = _mm_packs_epi32(v[2], v[3]);
1487   in[9] = _mm_packs_epi32(v[10], v[11]);
1488   in[10] = _mm_packs_epi32(v[14], v[15]);
1489   in[11] = _mm_packs_epi32(v[6], v[7]);
1490   in[12] = s[5];
1491   in[13] = _mm_sub_epi16(kZero, s[13]);
1492   in[14] = s[9];
1493   in[15] = _mm_sub_epi16(kZero, s[1]);
1494 }
1495 
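// Full 16x16 1-D transforms: run the 8-column kernel on both halves, then
// transpose 16x16 so the next pass again operates down the columns.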
1496 static void fdct16_sse2(__m128i *in0, __m128i *in1) {
1497   fdct16_8col(in0);
1498   fdct16_8col(in1);
1499   transpose_16bit_16x16(in0, in1);
1500 }
1501 
1502 static void fadst16_sse2(__m128i *in0, __m128i *in1) {
1503   fadst16_8col(in0);
1504   fadst16_8col(in1);
1505   transpose_16bit_16x16(in0, in1);
1506 }
1507 
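// 2-D 16x16 hybrid transform.  Pure DCT_DCT is delegated to the standalone
// vpx_fdct16x16_sse2; the ADST variants run one 1-D pass per dimension with an
// intermediate right shift (right_shift_16x16) between the two passes.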
1508 void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
1509                        int tx_type) {
1510   __m128i in0[16], in1[16];
1511 
1512   switch (tx_type) {
1513     case DCT_DCT: vpx_fdct16x16_sse2(input, output, stride); break;
1514     case ADST_DCT:
1515       load_buffer_16x16(input, in0, in1, stride);
1516       fadst16_sse2(in0, in1);
1517       right_shift_16x16(in0, in1);
1518       fdct16_sse2(in0, in1);
1519       write_buffer_16x16(output, in0, in1, 16);
1520       break;
1521     case DCT_ADST:
1522       load_buffer_16x16(input, in0, in1, stride);
1523       fdct16_sse2(in0, in1);
1524       right_shift_16x16(in0, in1);
1525       fadst16_sse2(in0, in1);
1526       write_buffer_16x16(output, in0, in1, 16);
1527       break;
1528     default:
1529       assert(tx_type == ADST_ADST);
1530       load_buffer_16x16(input, in0, in1, stride);
1531       fadst16_sse2(in0, in1);
1532       right_shift_16x16(in0, in1);
1533       fadst16_sse2(in0, in1);
1534       write_buffer_16x16(output, in0, in1, 16);
1535       break;
1536   }
1537 }
1538