/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>  // SSE2

#include "./vp9_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_dsp/x86/fwd_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
#include "vpx_ports/mem.h"

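// Loads a 4x4 block of int16 residuals into the low 64 bits of in[0..3] and
// pre-scales by 16 (<< 4) for extra precision across the two 1-D passes.
// The bias step then adds 1 to the first element of in[0] only when the DC
// input is nonzero: after << 4 every lane is a multiple of 16, so lanes 1..7
// never equal 1 and lane 0 equals 0 exactly when the DC input was 0. This
// mirrors the scalar reference (vpx_fdct4x4_c applies the same +1 to a
// nonzero DC input) so the final (x + 1) >> 2 descale rounds identically.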
static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
                                   int stride) {
  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  __m128i mask;

  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));

  in[0] = _mm_slli_epi16(in[0], 4);
  in[1] = _mm_slli_epi16(in[1], 4);
  in[2] = _mm_slli_epi16(in[2], 4);
  in[3] = _mm_slli_epi16(in[3], 4);

  mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
  in[0] = _mm_add_epi16(in[0], mask);
  in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
}

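// Packs the four result rows into two registers and stores them after a
// rounding descale, (x + 1) >> 2, which removes part of the headroom added
// by the << 4 pre-scale in load_buffer_4x4().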
static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
  const __m128i kOne = _mm_set1_epi16(1);
  __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
  __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
  __m128i out01 = _mm_add_epi16(in01, kOne);
  __m128i out23 = _mm_add_epi16(in23, kOne);
  out01 = _mm_srai_epi16(out01, 2);
  out23 = _mm_srai_epi16(out23, 2);
  store_output(&out01, (output + 0 * 8));
  store_output(&out23, (output + 1 * 8));
}

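// Transposes a 4x4 tile held in the low 64 bits of res[0..3]. After the two
// unpack stages each register carries two transposed rows, so res[1] and
// res[3] are peeled back out of the high halves of res[0] and res[2].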
static INLINE void transpose_4x4(__m128i *res) {
  // Combine and transpose
  // 00 01 02 03 20 21 22 23
  // 10 11 12 13 30 31 32 33
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

  // 00 10 01 11 02 12 03 13
  // 20 30 21 31 22 32 23 33
  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);

  // 00 10 20 30 01 11 21 31
  // 02 12 22 32 03 13 23 33
  // only use the first 4 16-bit integers
  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}

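// One 4-point 1-D forward DCT pass over the 4x4 tile, ending in a transpose
// so that a second call completes the 2-D transform. pair_set_epi16()
// interleaves two cosine constants so _mm_madd_epi16 computes
// a * c0 + b * c1 per 32-bit lane in one instruction; each product is then
// rounded with DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1), i.e. half an
// LSB) and shifted down by DCT_CONST_BITS (14 in vpx_dsp/txfm_common.h).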
static void fdct4_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u[4], v[4];
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpacklo_epi16(in[3], in[2]);

  v[0] = _mm_add_epi16(u[0], u[1]);
  v[1] = _mm_sub_epi16(u[0], u[1]);

  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
  u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
  u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
  u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
  transpose_4x4(in);
}

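// One 4-point 1-D forward ADST pass, also ending in a transpose. The VP9
// 4-point ADST is built from sin(k * pi / 9)-derived constants
// (sinpi_1_9..sinpi_4_9). in7 pre-computes the x0 + x1 part of the scalar
// reference's s7 = x0 + x1 - x3 term (the -x3 part arrives via v[6]), and
// 3 * s4 is formed cheaply below as (s4 << 2) - s4.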
static void fadst4_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
  const __m128i kZero = _mm_setzero_si128();
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];
  __m128i in7 = _mm_add_epi16(in[0], in[1]);

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpacklo_epi16(in[2], kZero);
  u[4] = _mm_unpacklo_epi16(in[3], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_sub_epi32(v[2], v[6]);
  u[2] = _mm_add_epi32(v[3], v[4]);
  u[3] = _mm_sub_epi32(u[2], u[0]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_sub_epi32(u[4], v[5]);
  u[6] = _mm_add_epi32(u[3], u[5]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[2]);
  in[1] = _mm_packs_epi32(u[1], u[3]);
  transpose_4x4(in);
}

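// 4x4 forward hybrid transform: tx_type selects DCT or ADST independently
// for the two directions. Because each 1-D helper ends in a transpose, two
// back-to-back calls yield the full 2-D transform; the pure DCT_DCT case is
// delegated to the existing vpx_fdct4x4_sse2().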
void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
                     int tx_type) {
  __m128i in[4];

  switch (tx_type) {
    case DCT_DCT: vpx_fdct4x4_sse2(input, output, stride); break;
    case ADST_DCT:
      load_buffer_4x4(input, in, stride);
      fadst4_sse2(in);
      fdct4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    case DCT_ADST:
      load_buffer_4x4(input, in, stride);
      fdct4_sse2(in);
      fadst4_sse2(in);
      write_buffer_4x4(output, in);
      break;
    default:
      assert(tx_type == ADST_ADST);
      load_buffer_4x4(input, in, stride);
      fadst4_sse2(in);
      fadst4_sse2(in);
      write_buffer_4x4(output, in);
      break;
  }
}

// load 8x8 array
static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
                                   int stride) {
  in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
  in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
  in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
  in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
  in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
  in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
  in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
  in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));

  in[0] = _mm_slli_epi16(in[0], 2);
  in[1] = _mm_slli_epi16(in[1], 2);
  in[2] = _mm_slli_epi16(in[2], 2);
  in[3] = _mm_slli_epi16(in[3], 2);
  in[4] = _mm_slli_epi16(in[4], 2);
  in[5] = _mm_slli_epi16(in[5], 2);
  in[6] = _mm_slli_epi16(in[6], 2);
  in[7] = _mm_slli_epi16(in[7], 2);
}

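// The scalar reference descales intermediate results as
//   bit == 1: (x + (x < 0)) >> 1
//   bit == 2: (x + 1 + (x < 0)) >> 2
// The arithmetic-shift "sign" registers below are -1 for negative lanes and
// 0 otherwise, so subtracting them supplies the (x < 0) term.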
// right shift and rounding
static INLINE void right_shift_8x8(__m128i *res, const int bit) {
  __m128i sign0 = _mm_srai_epi16(res[0], 15);
  __m128i sign1 = _mm_srai_epi16(res[1], 15);
  __m128i sign2 = _mm_srai_epi16(res[2], 15);
  __m128i sign3 = _mm_srai_epi16(res[3], 15);
  __m128i sign4 = _mm_srai_epi16(res[4], 15);
  __m128i sign5 = _mm_srai_epi16(res[5], 15);
  __m128i sign6 = _mm_srai_epi16(res[6], 15);
  __m128i sign7 = _mm_srai_epi16(res[7], 15);

  if (bit == 2) {
    const __m128i const_rounding = _mm_set1_epi16(1);
    res[0] = _mm_add_epi16(res[0], const_rounding);
    res[1] = _mm_add_epi16(res[1], const_rounding);
    res[2] = _mm_add_epi16(res[2], const_rounding);
    res[3] = _mm_add_epi16(res[3], const_rounding);
    res[4] = _mm_add_epi16(res[4], const_rounding);
    res[5] = _mm_add_epi16(res[5], const_rounding);
    res[6] = _mm_add_epi16(res[6], const_rounding);
    res[7] = _mm_add_epi16(res[7], const_rounding);
  }

  res[0] = _mm_sub_epi16(res[0], sign0);
  res[1] = _mm_sub_epi16(res[1], sign1);
  res[2] = _mm_sub_epi16(res[2], sign2);
  res[3] = _mm_sub_epi16(res[3], sign3);
  res[4] = _mm_sub_epi16(res[4], sign4);
  res[5] = _mm_sub_epi16(res[5], sign5);
  res[6] = _mm_sub_epi16(res[6], sign6);
  res[7] = _mm_sub_epi16(res[7], sign7);

  if (bit == 1) {
    res[0] = _mm_srai_epi16(res[0], 1);
    res[1] = _mm_srai_epi16(res[1], 1);
    res[2] = _mm_srai_epi16(res[2], 1);
    res[3] = _mm_srai_epi16(res[3], 1);
    res[4] = _mm_srai_epi16(res[4], 1);
    res[5] = _mm_srai_epi16(res[5], 1);
    res[6] = _mm_srai_epi16(res[6], 1);
    res[7] = _mm_srai_epi16(res[7], 1);
  } else {
    res[0] = _mm_srai_epi16(res[0], 2);
    res[1] = _mm_srai_epi16(res[1], 2);
    res[2] = _mm_srai_epi16(res[2], 2);
    res[3] = _mm_srai_epi16(res[3], 2);
    res[4] = _mm_srai_epi16(res[4], 2);
    res[5] = _mm_srai_epi16(res[5], 2);
    res[6] = _mm_srai_epi16(res[6], 2);
    res[7] = _mm_srai_epi16(res[7], 2);
  }
}

// write 8x8 array
static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
                                    int stride) {
  store_output(&res[0], (output + 0 * stride));
  store_output(&res[1], (output + 1 * stride));
  store_output(&res[2], (output + 2 * stride));
  store_output(&res[3], (output + 3 * stride));
  store_output(&res[4], (output + 4 * stride));
  store_output(&res[5], (output + 5 * stride));
  store_output(&res[6], (output + 6 * stride));
  store_output(&res[7], (output + 7 * stride));
}

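// 8-point 1-D forward DCT over all eight rows, using the standard four-stage
// butterfly decomposition: stage 1 forms even/odd sums and differences, the
// even half is reduced with cospi_16 and cospi_24/cospi_8 rotations, and the
// odd half passes through two more butterfly stages before the
// cospi_28/cospi_4 and cospi_12/cospi_20 rotations. Ends with a transpose so
// two calls give the 2-D transform.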
static void fdct8_sse2(__m128i *in) {
  // constants
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;

  // stage 1
  s0 = _mm_add_epi16(in[0], in[7]);
  s1 = _mm_add_epi16(in[1], in[6]);
  s2 = _mm_add_epi16(in[2], in[5]);
  s3 = _mm_add_epi16(in[3], in[4]);
  s4 = _mm_sub_epi16(in[3], in[4]);
  s5 = _mm_sub_epi16(in[2], in[5]);
  s6 = _mm_sub_epi16(in[1], in[6]);
  s7 = _mm_sub_epi16(in[0], in[7]);

  u0 = _mm_add_epi16(s0, s3);
  u1 = _mm_add_epi16(s1, s2);
  u2 = _mm_sub_epi16(s1, s2);
  u3 = _mm_sub_epi16(s0, s3);
  // interleave and perform butterfly multiplication/addition
  v0 = _mm_unpacklo_epi16(u0, u1);
  v1 = _mm_unpackhi_epi16(u0, u1);
  v2 = _mm_unpacklo_epi16(u2, u3);
  v3 = _mm_unpackhi_epi16(u2, u3);

  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);

  // shift and rounding
  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u0, u1);
  in[2] = _mm_packs_epi32(u4, u5);
  in[4] = _mm_packs_epi32(u2, u3);
  in[6] = _mm_packs_epi32(u6, u7);

  // stage 2
  // interleave and perform butterfly multiplication/addition
  u0 = _mm_unpacklo_epi16(s6, s5);
  u1 = _mm_unpackhi_epi16(s6, s5);
  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);

  u0 = _mm_packs_epi32(v0, v1);
  u1 = _mm_packs_epi32(v2, v3);

  // stage 3
  s0 = _mm_add_epi16(s4, u0);
  s1 = _mm_sub_epi16(s4, u0);
  s2 = _mm_sub_epi16(s7, u1);
  s3 = _mm_add_epi16(s7, u1);

  // stage 4
  u0 = _mm_unpacklo_epi16(s0, s3);
  u1 = _mm_unpackhi_epi16(s0, s3);
  u2 = _mm_unpacklo_epi16(s1, s2);
  u3 = _mm_unpackhi_epi16(s1, s2);

  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  in[1] = _mm_packs_epi32(v0, v1);
  in[3] = _mm_packs_epi32(v4, v5);
  in[5] = _mm_packs_epi32(v2, v3);
  in[7] = _mm_packs_epi32(v6, v7);

  // transpose
  transpose_16bit_8x8(in, in);
}

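// 8-point 1-D forward ADST. The rows are first reordered into the order the
// ADST flow graph expects, then three stages of _mm_madd_epi16 rotations
// (widened to 32 bits and re-packed to 16 bits after each rounded shift)
// produce the outputs, four of which are negated at the end. Ends with a
// transpose, like fdct8_sse2().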
static void fadst8_sse2(__m128i *in) {
  // Constants
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_setzero_si128();
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // properly aligned for butterfly input
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  // FIXME(jingning): do subtract using bit inversion?
  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);

  // transpose
  transpose_16bit_8x8(in, in);
}

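// 8x8 forward hybrid transform. As in the 4x4 case, two transposing 1-D
// passes build the 2-D transform; right_shift_8x8(in, 1) then applies the
// scalar reference's final (x + (x < 0)) >> 1 descale before the
// coefficients are stored with stride 8.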
void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
                     int tx_type) {
  __m128i in[8];

  switch (tx_type) {
    case DCT_DCT: vpx_fdct8x8_sse2(input, output, stride); break;
    case ADST_DCT:
      load_buffer_8x8(input, in, stride);
      fadst8_sse2(in);
      fdct8_sse2(in);
      right_shift_8x8(in, 1);
      write_buffer_8x8(output, in, 8);
      break;
    case DCT_ADST:
      load_buffer_8x8(input, in, stride);
      fdct8_sse2(in);
      fadst8_sse2(in);
      right_shift_8x8(in, 1);
      write_buffer_8x8(output, in, 8);
      break;
    default:
      assert(tx_type == ADST_ADST);
      load_buffer_8x8(input, in, stride);
      fadst8_sse2(in);
      fadst8_sse2(in);
      right_shift_8x8(in, 1);
      write_buffer_8x8(output, in, 8);
      break;
  }
}

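// The 16x16 transforms keep the block as two groups of eight columns: in0[]
// holds the left 8x16 half and in1[] the right half, sixteen rows each, so
// the 8-lane SSE2 helpers above can be reused on either half.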
static INLINE void load_buffer_16x16(const int16_t *input, __m128i *in0,
                                     __m128i *in1, int stride) {
  // load first 8 columns
  load_buffer_8x8(input, in0, stride);
  load_buffer_8x8(input + 8 * stride, in0 + 8, stride);

  input += 8;
  // load second 8 columns
  load_buffer_8x8(input, in1, stride);
  load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
}

static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
                                      __m128i *in1, int stride) {
  // write first 8 columns
  write_buffer_8x8(output, in0, stride);
  write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
  // write second 8 columns
  output += 8;
  write_buffer_8x8(output, in1, stride);
  write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
}

static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
  // perform rounding operations
  right_shift_8x8(res0, 2);
  right_shift_8x8(res0 + 8, 2);
  right_shift_8x8(res1, 2);
  right_shift_8x8(res1 + 8, 2);
}

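// 16-point 1-D forward DCT for eight columns at a time (in[0..15] each hold
// eight 16-bit lanes). Stage 1 splits the input into an 8-point even half,
// reduced much like fdct8_sse2(), and an 8-point odd half that needs the
// additional cospi_30/cospi_2 .. cospi_6/cospi_26 rotations of stages 2-6.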
static void fdct16_8col(__m128i *in) {
  // perform 16x16 1-D DCT for 8 columns
  __m128i i[8], s[8], p[8], t[8], u[16], v[16];
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  // stage 1
  i[0] = _mm_add_epi16(in[0], in[15]);
  i[1] = _mm_add_epi16(in[1], in[14]);
  i[2] = _mm_add_epi16(in[2], in[13]);
  i[3] = _mm_add_epi16(in[3], in[12]);
  i[4] = _mm_add_epi16(in[4], in[11]);
  i[5] = _mm_add_epi16(in[5], in[10]);
  i[6] = _mm_add_epi16(in[6], in[9]);
  i[7] = _mm_add_epi16(in[7], in[8]);

  s[0] = _mm_sub_epi16(in[7], in[8]);
  s[1] = _mm_sub_epi16(in[6], in[9]);
  s[2] = _mm_sub_epi16(in[5], in[10]);
  s[3] = _mm_sub_epi16(in[4], in[11]);
  s[4] = _mm_sub_epi16(in[3], in[12]);
  s[5] = _mm_sub_epi16(in[2], in[13]);
  s[6] = _mm_sub_epi16(in[1], in[14]);
  s[7] = _mm_sub_epi16(in[0], in[15]);

  p[0] = _mm_add_epi16(i[0], i[7]);
  p[1] = _mm_add_epi16(i[1], i[6]);
  p[2] = _mm_add_epi16(i[2], i[5]);
  p[3] = _mm_add_epi16(i[3], i[4]);
  p[4] = _mm_sub_epi16(i[3], i[4]);
  p[5] = _mm_sub_epi16(i[2], i[5]);
  p[6] = _mm_sub_epi16(i[1], i[6]);
  p[7] = _mm_sub_epi16(i[0], i[7]);

  u[0] = _mm_add_epi16(p[0], p[3]);
  u[1] = _mm_add_epi16(p[1], p[2]);
  u[2] = _mm_sub_epi16(p[1], p[2]);
  u[3] = _mm_sub_epi16(p[0], p[3]);

  v[0] = _mm_unpacklo_epi16(u[0], u[1]);
  v[1] = _mm_unpackhi_epi16(u[0], u[1]);
  v[2] = _mm_unpacklo_epi16(u[2], u[3]);
  v[3] = _mm_unpackhi_epi16(u[2], u[3]);

  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
  u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
  u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
  u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
  u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
  u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
  u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
  u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[4] = _mm_packs_epi32(u[4], u[5]);
  in[8] = _mm_packs_epi32(u[2], u[3]);
  in[12] = _mm_packs_epi32(u[6], u[7]);

  u[0] = _mm_unpacklo_epi16(p[5], p[6]);
  u[1] = _mm_unpackhi_epi16(p[5], p[6]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  u[0] = _mm_packs_epi32(v[0], v[1]);
  u[1] = _mm_packs_epi32(v[2], v[3]);

  t[0] = _mm_add_epi16(p[4], u[0]);
  t[1] = _mm_sub_epi16(p[4], u[0]);
  t[2] = _mm_sub_epi16(p[7], u[1]);
  t[3] = _mm_add_epi16(p[7], u[1]);

  u[0] = _mm_unpacklo_epi16(t[0], t[3]);
  u[1] = _mm_unpackhi_epi16(t[0], t[3]);
  u[2] = _mm_unpacklo_epi16(t[1], t[2]);
  u[3] = _mm_unpackhi_epi16(t[1], t[2]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
  v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
  v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
  v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
  v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  in[2] = _mm_packs_epi32(v[0], v[1]);
  in[6] = _mm_packs_epi32(v[4], v[5]);
  in[10] = _mm_packs_epi32(v[2], v[3]);
  in[14] = _mm_packs_epi32(v[6], v[7]);

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[2], s[5]);
  u[1] = _mm_unpackhi_epi16(s[2], s[5]);
  u[2] = _mm_unpacklo_epi16(s[3], s[4]);
  u[3] = _mm_unpackhi_epi16(s[3], s[4]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[2] = _mm_packs_epi32(v[0], v[1]);
  t[3] = _mm_packs_epi32(v[2], v[3]);
  t[4] = _mm_packs_epi32(v[4], v[5]);
  t[5] = _mm_packs_epi32(v[6], v[7]);

  // stage 3
  p[0] = _mm_add_epi16(s[0], t[3]);
  p[1] = _mm_add_epi16(s[1], t[2]);
  p[2] = _mm_sub_epi16(s[1], t[2]);
  p[3] = _mm_sub_epi16(s[0], t[3]);
  p[4] = _mm_sub_epi16(s[7], t[4]);
  p[5] = _mm_sub_epi16(s[6], t[5]);
  p[6] = _mm_add_epi16(s[6], t[5]);
  p[7] = _mm_add_epi16(s[7], t[4]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(p[1], p[6]);
  u[1] = _mm_unpackhi_epi16(p[1], p[6]);
  u[2] = _mm_unpacklo_epi16(p[2], p[5]);
  u[3] = _mm_unpackhi_epi16(p[2], p[5]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
  v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
  v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
  v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
  v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[1] = _mm_packs_epi32(v[0], v[1]);
  t[2] = _mm_packs_epi32(v[2], v[3]);
  t[5] = _mm_packs_epi32(v[4], v[5]);
  t[6] = _mm_packs_epi32(v[6], v[7]);

  // stage 5
  s[0] = _mm_add_epi16(p[0], t[1]);
  s[1] = _mm_sub_epi16(p[0], t[1]);
  s[2] = _mm_add_epi16(p[3], t[2]);
  s[3] = _mm_sub_epi16(p[3], t[2]);
  s[4] = _mm_sub_epi16(p[4], t[5]);
  s[5] = _mm_add_epi16(p[4], t[5]);
  s[6] = _mm_sub_epi16(p[7], t[6]);
  s[7] = _mm_add_epi16(p[7], t[6]);

  // stage 6
  u[0] = _mm_unpacklo_epi16(s[0], s[7]);
  u[1] = _mm_unpackhi_epi16(s[0], s[7]);
  u[2] = _mm_unpacklo_epi16(s[1], s[6]);
  u[3] = _mm_unpackhi_epi16(s[1], s[6]);
  u[4] = _mm_unpacklo_epi16(s[2], s[5]);
  u[5] = _mm_unpackhi_epi16(s[2], s[5]);
  u[6] = _mm_unpacklo_epi16(s[3], s[4]);
  u[7] = _mm_unpackhi_epi16(s[3], s[4]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
  v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
  v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
  v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
  v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
  v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
  v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
  v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
  v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
  v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
  v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
  v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
  v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  in[1] = _mm_packs_epi32(v[0], v[1]);
  in[9] = _mm_packs_epi32(v[2], v[3]);
  in[5] = _mm_packs_epi32(v[4], v[5]);
  in[13] = _mm_packs_epi32(v[6], v[7]);
  in[3] = _mm_packs_epi32(v[8], v[9]);
  in[11] = _mm_packs_epi32(v[10], v[11]);
  in[7] = _mm_packs_epi32(v[12], v[13]);
  in[15] = _mm_packs_epi32(v[14], v[15]);
}

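// 16-point 1-D forward ADST for eight columns at a time. Stage 1 applies
// eight _mm_madd_epi16 rotations through the odd cospi_1/cospi_31 ..
// cospi_29/cospi_3 pairs, with the 32-bit butterflies rounded back down to
// 16 bits; the later stages reuse the cospi_4/cospi_28, cospi_12/cospi_20,
// cospi_8/cospi_24, and cospi_16 rotations.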
fadst16_8col(__m128i * in)1026 static void fadst16_8col(__m128i *in) {
1027 // perform 16x16 1-D ADST for 8 columns
1028 __m128i s[16], x[16], u[32], v[32];
1029 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
1030 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
1031 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
1032 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
1033 const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
1034 const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
1035 const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
1036 const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
1037 const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
1038 const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
1039 const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
1040 const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
1041 const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
1042 const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
1043 const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
1044 const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
1045 const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
1046 const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
1047 const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
1048 const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
1049 const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
1050 const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
1051 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
1052 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
1053 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
1054 const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
1055 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
1056 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
1057 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
1058 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
1059 const __m128i kZero = _mm_setzero_si128();
1060
1061 u[0] = _mm_unpacklo_epi16(in[15], in[0]);
1062 u[1] = _mm_unpackhi_epi16(in[15], in[0]);
1063 u[2] = _mm_unpacklo_epi16(in[13], in[2]);
1064 u[3] = _mm_unpackhi_epi16(in[13], in[2]);
1065 u[4] = _mm_unpacklo_epi16(in[11], in[4]);
1066 u[5] = _mm_unpackhi_epi16(in[11], in[4]);
1067 u[6] = _mm_unpacklo_epi16(in[9], in[6]);
1068 u[7] = _mm_unpackhi_epi16(in[9], in[6]);
1069 u[8] = _mm_unpacklo_epi16(in[7], in[8]);
1070 u[9] = _mm_unpackhi_epi16(in[7], in[8]);
1071 u[10] = _mm_unpacklo_epi16(in[5], in[10]);
1072 u[11] = _mm_unpackhi_epi16(in[5], in[10]);
1073 u[12] = _mm_unpacklo_epi16(in[3], in[12]);
1074 u[13] = _mm_unpackhi_epi16(in[3], in[12]);
1075 u[14] = _mm_unpacklo_epi16(in[1], in[14]);
1076 u[15] = _mm_unpackhi_epi16(in[1], in[14]);
1077
1078 v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
1079 v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
1080 v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
1081 v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
1082 v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
1083 v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
1084 v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
1085 v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
1086 v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
1087 v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
1088 v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
1089 v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
1090 v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
1091 v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
1092 v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
1093 v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
1094 v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
1095 v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
1096 v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
1097 v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
1098 v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
1099 v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
1100 v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
1101 v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
1102 v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
1103 v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
1104 v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
1105 v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
1106 v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
1107 v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
1108 v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
1109 v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
1110
1111 u[0] = _mm_add_epi32(v[0], v[16]);
1112 u[1] = _mm_add_epi32(v[1], v[17]);
1113 u[2] = _mm_add_epi32(v[2], v[18]);
1114 u[3] = _mm_add_epi32(v[3], v[19]);
1115 u[4] = _mm_add_epi32(v[4], v[20]);
1116 u[5] = _mm_add_epi32(v[5], v[21]);
1117 u[6] = _mm_add_epi32(v[6], v[22]);
1118 u[7] = _mm_add_epi32(v[7], v[23]);
1119 u[8] = _mm_add_epi32(v[8], v[24]);
1120 u[9] = _mm_add_epi32(v[9], v[25]);
1121 u[10] = _mm_add_epi32(v[10], v[26]);
1122 u[11] = _mm_add_epi32(v[11], v[27]);
1123 u[12] = _mm_add_epi32(v[12], v[28]);
1124 u[13] = _mm_add_epi32(v[13], v[29]);
1125 u[14] = _mm_add_epi32(v[14], v[30]);
1126 u[15] = _mm_add_epi32(v[15], v[31]);
1127 u[16] = _mm_sub_epi32(v[0], v[16]);
1128 u[17] = _mm_sub_epi32(v[1], v[17]);
1129 u[18] = _mm_sub_epi32(v[2], v[18]);
1130 u[19] = _mm_sub_epi32(v[3], v[19]);
1131 u[20] = _mm_sub_epi32(v[4], v[20]);
1132 u[21] = _mm_sub_epi32(v[5], v[21]);
1133 u[22] = _mm_sub_epi32(v[6], v[22]);
1134 u[23] = _mm_sub_epi32(v[7], v[23]);
1135 u[24] = _mm_sub_epi32(v[8], v[24]);
1136 u[25] = _mm_sub_epi32(v[9], v[25]);
1137 u[26] = _mm_sub_epi32(v[10], v[26]);
1138 u[27] = _mm_sub_epi32(v[11], v[27]);
1139 u[28] = _mm_sub_epi32(v[12], v[28]);
1140 u[29] = _mm_sub_epi32(v[13], v[29]);
1141 u[30] = _mm_sub_epi32(v[14], v[30]);
1142 u[31] = _mm_sub_epi32(v[15], v[31]);
1143
1144 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1145 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1146 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1147 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1148 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1149 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1150 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1151 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1152 v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1153 v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1154 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1155 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1156 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1157 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1158 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1159 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1160 v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
1161 v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
1162 v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
1163 v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
1164 v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
1165 v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
1166 v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
1167 v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
1168 v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
1169 v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
1170 v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
1171 v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
1172 v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
1173 v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
1174 v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
1175 v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
1176
1177 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1178 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1179 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1180 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1181 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1182 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1183 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1184 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1185 u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1186 u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1187 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1188 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1189 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1190 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1191 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1192 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1193 u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
1194 u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
1195 u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
1196 u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
1197 u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
1198 u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
1199 u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
1200 u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
1201 u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
1202 u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
1203 u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
1204 u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
1205 u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
1206 u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
1207 u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
1208 u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
1209
1210 s[0] = _mm_packs_epi32(u[0], u[1]);
1211 s[1] = _mm_packs_epi32(u[2], u[3]);
1212 s[2] = _mm_packs_epi32(u[4], u[5]);
1213 s[3] = _mm_packs_epi32(u[6], u[7]);
1214 s[4] = _mm_packs_epi32(u[8], u[9]);
1215 s[5] = _mm_packs_epi32(u[10], u[11]);
1216 s[6] = _mm_packs_epi32(u[12], u[13]);
1217 s[7] = _mm_packs_epi32(u[14], u[15]);
1218 s[8] = _mm_packs_epi32(u[16], u[17]);
1219 s[9] = _mm_packs_epi32(u[18], u[19]);
1220 s[10] = _mm_packs_epi32(u[20], u[21]);
1221 s[11] = _mm_packs_epi32(u[22], u[23]);
1222 s[12] = _mm_packs_epi32(u[24], u[25]);
1223 s[13] = _mm_packs_epi32(u[26], u[27]);
1224 s[14] = _mm_packs_epi32(u[28], u[29]);
1225 s[15] = _mm_packs_epi32(u[30], u[31]);
1226
1227 // stage 2
1228 u[0] = _mm_unpacklo_epi16(s[8], s[9]);
1229 u[1] = _mm_unpackhi_epi16(s[8], s[9]);
1230 u[2] = _mm_unpacklo_epi16(s[10], s[11]);
1231 u[3] = _mm_unpackhi_epi16(s[10], s[11]);
1232 u[4] = _mm_unpacklo_epi16(s[12], s[13]);
1233 u[5] = _mm_unpackhi_epi16(s[12], s[13]);
1234 u[6] = _mm_unpacklo_epi16(s[14], s[15]);
1235 u[7] = _mm_unpackhi_epi16(s[14], s[15]);
1236
1237 v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
1238 v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
1239 v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
1240 v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
1241 v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
1242 v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
1243 v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
1244 v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
1245 v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
1246 v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
1247 v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
1248 v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
1249 v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
1250 v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
1251 v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
1252 v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
1253
1254 u[0] = _mm_add_epi32(v[0], v[8]);
1255 u[1] = _mm_add_epi32(v[1], v[9]);
1256 u[2] = _mm_add_epi32(v[2], v[10]);
1257 u[3] = _mm_add_epi32(v[3], v[11]);
1258 u[4] = _mm_add_epi32(v[4], v[12]);
1259 u[5] = _mm_add_epi32(v[5], v[13]);
1260 u[6] = _mm_add_epi32(v[6], v[14]);
1261 u[7] = _mm_add_epi32(v[7], v[15]);
1262 u[8] = _mm_sub_epi32(v[0], v[8]);
1263 u[9] = _mm_sub_epi32(v[1], v[9]);
1264 u[10] = _mm_sub_epi32(v[2], v[10]);
1265 u[11] = _mm_sub_epi32(v[3], v[11]);
1266 u[12] = _mm_sub_epi32(v[4], v[12]);
1267 u[13] = _mm_sub_epi32(v[5], v[13]);
1268 u[14] = _mm_sub_epi32(v[6], v[14]);
1269 u[15] = _mm_sub_epi32(v[7], v[15]);
1270
1271 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
1272 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
1273 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
1274 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
1275 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
1276 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
1277 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
1278 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
1279 v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
1280 v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
1281 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
1282 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
1283 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
1284 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
1285 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
1286 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
1287
1288 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
1289 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
1290 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
1291 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
1292 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
1293 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
1294 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
1295 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
1296 u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
1297 u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
1298 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
1299 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
1300 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
1301 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
1302 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
1303 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
1304
1305 x[0] = _mm_add_epi16(s[0], s[4]);
1306 x[1] = _mm_add_epi16(s[1], s[5]);
1307 x[2] = _mm_add_epi16(s[2], s[6]);
1308 x[3] = _mm_add_epi16(s[3], s[7]);
1309 x[4] = _mm_sub_epi16(s[0], s[4]);
1310 x[5] = _mm_sub_epi16(s[1], s[5]);
1311 x[6] = _mm_sub_epi16(s[2], s[6]);
1312 x[7] = _mm_sub_epi16(s[3], s[7]);
1313 x[8] = _mm_packs_epi32(u[0], u[1]);
1314 x[9] = _mm_packs_epi32(u[2], u[3]);
1315 x[10] = _mm_packs_epi32(u[4], u[5]);
1316 x[11] = _mm_packs_epi32(u[6], u[7]);
1317 x[12] = _mm_packs_epi32(u[8], u[9]);
1318 x[13] = _mm_packs_epi32(u[10], u[11]);
1319 x[14] = _mm_packs_epi32(u[12], u[13]);
1320 x[15] = _mm_packs_epi32(u[14], u[15]);
1321
1322 // stage 3
  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  u[7] = _mm_unpackhi_epi16(x[14], x[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);

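  // Combine the rotated results, then round and shift exactly as in the
  // previous stage.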
  u[0] = _mm_add_epi32(v[0], v[4]);
  u[1] = _mm_add_epi32(v[1], v[5]);
  u[2] = _mm_add_epi32(v[2], v[6]);
  u[3] = _mm_add_epi32(v[3], v[7]);
  u[4] = _mm_sub_epi32(v[0], v[4]);
  u[5] = _mm_sub_epi32(v[1], v[5]);
  u[6] = _mm_sub_epi32(v[2], v[6]);
  u[7] = _mm_sub_epi32(v[3], v[7]);
  u[8] = _mm_add_epi32(v[8], v[12]);
  u[9] = _mm_add_epi32(v[9], v[13]);
  u[10] = _mm_add_epi32(v[10], v[14]);
  u[11] = _mm_add_epi32(v[11], v[15]);
  u[12] = _mm_sub_epi32(v[8], v[12]);
  u[13] = _mm_sub_epi32(v[9], v[13]);
  u[14] = _mm_sub_epi32(v[10], v[14]);
  u[15] = _mm_sub_epi32(v[11], v[15]);

  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_add_epi16(x[0], x[2]);
  s[1] = _mm_add_epi16(x[1], x[3]);
  s[2] = _mm_sub_epi16(x[0], x[2]);
  s[3] = _mm_sub_epi16(x[1], x[3]);
  s[4] = _mm_packs_epi32(v[0], v[1]);
  s[5] = _mm_packs_epi32(v[2], v[3]);
  s[6] = _mm_packs_epi32(v[4], v[5]);
  s[7] = _mm_packs_epi32(v[6], v[7]);
  s[8] = _mm_add_epi16(x[8], x[10]);
  s[9] = _mm_add_epi16(x[9], x[11]);
  s[10] = _mm_sub_epi16(x[8], x[10]);
  s[11] = _mm_sub_epi16(x[9], x[11]);
  s[12] = _mm_packs_epi32(v[8], v[9]);
  s[13] = _mm_packs_epi32(v[10], v[11]);
  s[14] = _mm_packs_epi32(v[12], v[13]);
  s[15] = _mm_packs_epi32(v[14], v[15]);

  // stage 4
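  // Final rotations by +/-cospi_16 (a scaled 45-degree butterfly) applied
  // to the pairs (s2, s3), (s6, s7), (s10, s11), (s14, s15).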
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

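  // Permute into the ADST output order; subtracting from kZero negates the
  // lanes, giving the sign flips on outputs 1, 3, 13 and 15.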
  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[8]);
  in[2] = s[12];
  in[3] = _mm_sub_epi16(kZero, s[4]);
  in[4] = _mm_packs_epi32(v[4], v[5]);
  in[5] = _mm_packs_epi32(v[12], v[13]);
  in[6] = _mm_packs_epi32(v[8], v[9]);
  in[7] = _mm_packs_epi32(v[0], v[1]);
  in[8] = _mm_packs_epi32(v[2], v[3]);
  in[9] = _mm_packs_epi32(v[10], v[11]);
  in[10] = _mm_packs_epi32(v[14], v[15]);
  in[11] = _mm_packs_epi32(v[6], v[7]);
  in[12] = s[5];
  in[13] = _mm_sub_epi16(kZero, s[13]);
  in[14] = s[9];
  in[15] = _mm_sub_epi16(kZero, s[1]);
}

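// 1-D 16-point DCT over all 16 columns: each 8-column half is transformed
// and the result transposed, so a second call operates on the rows.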
static void fdct16_sse2(__m128i *in0, __m128i *in1) {
  fdct16_8col(in0);
  fdct16_8col(in1);
  transpose_16bit_16x16(in0, in1);
}

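// Same structure for the 16-point ADST: two 8-column passes plus a
// 16x16 transpose.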
static void fadst16_sse2(__m128i *in0, __m128i *in1) {
  fadst16_8col(in0);
  fadst16_8col(in1);
  transpose_16bit_16x16(in0, in1);
}

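// 16x16 forward hybrid transform. Pure DCT_DCT is delegated to
// vpx_fdct16x16_sse2(); the mixed types apply the first 1-D transform,
// scale the intermediate down with right_shift_16x16() to keep it in
// 16-bit range, then apply the second. Since fdct16_sse2() and
// fadst16_sse2() each end in a transpose, the two calls cover columns
// and then rows.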
void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
                       int tx_type) {
  __m128i in0[16], in1[16];

  switch (tx_type) {
    case DCT_DCT: vpx_fdct16x16_sse2(input, output, stride); break;
    case ADST_DCT:
      load_buffer_16x16(input, in0, in1, stride);
      fadst16_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fdct16_sse2(in0, in1);
      write_buffer_16x16(output, in0, in1, 16);
      break;
    case DCT_ADST:
      load_buffer_16x16(input, in0, in1, stride);
      fdct16_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fadst16_sse2(in0, in1);
      write_buffer_16x16(output, in0, in1, 16);
      break;
    default:
      assert(tx_type == ADST_ADST);
      load_buffer_16x16(input, in0, in1, stride);
      fadst16_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fadst16_sse2(in0, in1);
      write_buffer_16x16(output, in0, in1, 16);
      break;
  }
}