/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#include <assert.h>
#include <smmintrin.h> /* SSE4.1 */

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "av1/common/av1_inv_txfm1d_cfg.h"
#include "av1/common/idct.h"
#include "av1/common/x86/av1_inv_txfm_ssse3.h"
#include "av1/common/x86/av1_txfm_sse2.h"
#include "av1/common/x86/av1_txfm_sse4.h"
#include "av1/common/x86/highbd_txfm_utility_sse4.h"

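// Clamp each 16-bit lane of |u| to [0, (1 << bd) - 1]. Lanes above the max
// are masked out and replaced by the max, then a signed compare against
// zero clears negative lanes; this avoids needing an unsigned 16-bit min.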
static inline __m128i highbd_clamp_epi16(__m128i u, int bd) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
  __m128i clamped, mask;

  mask = _mm_cmpgt_epi16(u, max);
  clamped = _mm_andnot_si128(mask, u);
  mask = _mm_and_si128(mask, max);
  clamped = _mm_or_si128(mask, clamped);
  mask = _mm_cmpgt_epi16(clamped, zero);
  clamped = _mm_and_si128(clamped, mask);

  return clamped;
}

static inline void round_shift_4x4(__m128i *in, int shift) {
  if (shift != 0) {
    __m128i rnding = _mm_set1_epi32(1 << (shift - 1));
    in[0] = _mm_add_epi32(in[0], rnding);
    in[1] = _mm_add_epi32(in[1], rnding);
    in[2] = _mm_add_epi32(in[2], rnding);
    in[3] = _mm_add_epi32(in[3], rnding);

    in[0] = _mm_srai_epi32(in[0], shift);
    in[1] = _mm_srai_epi32(in[1], shift);
    in[2] = _mm_srai_epi32(in[2], shift);
    in[3] = _mm_srai_epi32(in[3], shift);
  }
}
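// Per-lane scalar equivalent of the rounding shift above, for shift > 0:
//   out = (in + (1 << (shift - 1))) >> shift.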

static void round_shift_8x8(__m128i *in, int shift) {
  round_shift_4x4(&in[0], shift);
  round_shift_4x4(&in[4], shift);
  round_shift_4x4(&in[8], shift);
  round_shift_4x4(&in[12], shift);
}

static void highbd_clamp_epi32_sse4_1(__m128i *in, __m128i *out,
                                      const __m128i *clamp_lo,
                                      const __m128i *clamp_hi, int size) {
  __m128i a0, a1;
  for (int i = 0; i < size; i += 4) {
    a0 = _mm_max_epi32(in[i], *clamp_lo);
    out[i] = _mm_min_epi32(a0, *clamp_hi);

    a1 = _mm_max_epi32(in[i + 1], *clamp_lo);
    out[i + 1] = _mm_min_epi32(a1, *clamp_hi);

    a0 = _mm_max_epi32(in[i + 2], *clamp_lo);
    out[i + 2] = _mm_min_epi32(a0, *clamp_hi);

    a1 = _mm_max_epi32(in[i + 3], *clamp_lo);
    out[i + 3] = _mm_min_epi32(a1, *clamp_hi);
  }
}

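// The recon helpers below widen the 16-bit prediction to 32 bits, add the
// inverse-transform residue, and clamp the sum to the pixel range
// [0, (1 << bd) - 1].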
static inline __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred,
                                                  __m128i res0, __m128i res1,
                                                  const int bd) {
  __m128i x0 = _mm_cvtepi16_epi32(pred);
  __m128i x1 = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8));
  __m128i min_clip_val = _mm_setzero_si128();
  __m128i max_clip_val = _mm_set1_epi32((1 << bd) - 1);
  x0 = _mm_add_epi32(res0, x0);
  x1 = _mm_add_epi32(res1, x1);
  x0 = _mm_max_epi32(x0, min_clip_val);
  x0 = _mm_min_epi32(x0, max_clip_val);
  x1 = _mm_max_epi32(x1, min_clip_val);
  x1 = _mm_min_epi32(x1, max_clip_val);
  x0 = _mm_packus_epi32(x0, x1);
  return x0;
}

static inline __m128i highbd_get_recon_4xn_sse4_1(const __m128i pred,
                                                  __m128i res0, const int bd) {
  __m128i x0 = _mm_cvtepi16_epi32(pred);

  x0 = _mm_add_epi32(res0, x0);
  x0 = _mm_packus_epi32(x0, x0);
  x0 = highbd_clamp_epi16(x0, bd);
  return x0;
}

static inline void highbd_write_buffer_4xn_sse4_1(__m128i *in, uint16_t *output,
                                                  int stride, int flipud,
                                                  int height, const int bd) {
  int j = flipud ? (height - 1) : 0;
  const int step = flipud ? -1 : 1;
  for (int i = 0; i < height; ++i, j += step) {
    __m128i v = _mm_loadl_epi64((__m128i const *)(output + i * stride));
    __m128i u = highbd_get_recon_4xn_sse4_1(v, in[j], bd);

    _mm_storel_epi64((__m128i *)(output + i * stride), u);
  }
}

static inline void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output,
                                                  int stride, int flipud,
                                                  int height, const int bd) {
  int j = flipud ? (height - 1) : 0;
  const int step = flipud ? -1 : 1;
  for (int i = 0; i < height; ++i, j += step) {
    __m128i v = _mm_loadu_si128((__m128i const *)(output + i * stride));
    __m128i u = highbd_get_recon_8x8_sse4_1(v, in[j], in[j + height], bd);

    _mm_storeu_si128((__m128i *)(output + i * stride), u);
  }
}

static inline void load_buffer_32bit_input(const int32_t *in, int stride,
                                           __m128i *out, int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = _mm_loadu_si128((const __m128i *)(in + i * stride));
  }
}

static inline void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
  in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
  in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
  in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
  in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
}

void av1_highbd_iwht4x4_16_add_sse4_1(const tran_low_t *input, uint8_t *dest8,
                                      int stride, int bd) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
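  /* One 1-D butterfly pass in scalar form (the loop below applies it to
     columns, transposes, then applies it to rows):
       a1 += c1; d1 -= b1; e1 = (a1 - d1) >> 1;
       b1 = e1 - b1; c1 = e1 - c1; a1 -= b1; d1 += c1; */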
  __m128i op[4];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  load_buffer_4x4(input, op);

  // Shift beforehand.
  op[0] = _mm_srai_epi32(op[0], UNIT_QUANT_SHIFT);
  op[1] = _mm_srai_epi32(op[1], UNIT_QUANT_SHIFT);
  op[2] = _mm_srai_epi32(op[2], UNIT_QUANT_SHIFT);
  op[3] = _mm_srai_epi32(op[3], UNIT_QUANT_SHIFT);

  for (int i = 0; i < 2; ++i) {
    __m128i a1 = op[0];
    __m128i c1 = op[1];
    __m128i d1 = op[2];
    __m128i b1 = op[3];
    a1 = _mm_add_epi32(a1, c1);          // a1 += c1
    d1 = _mm_sub_epi32(d1, b1);          // d1 -= b1
    __m128i e1 = _mm_sub_epi32(a1, d1);  // e1 = (a1 - d1) >> 1
    e1 = _mm_srai_epi32(e1, 1);
    b1 = _mm_sub_epi32(e1, b1);  // b1 = e1 - b1
    c1 = _mm_sub_epi32(e1, c1);  // c1 = e1 - c1
    a1 = _mm_sub_epi32(a1, b1);  // a1 -= b1
    d1 = _mm_add_epi32(d1, c1);  // d1 += c1

    op[0] = a1;
    op[1] = b1;
    op[2] = c1;
    op[3] = d1;
    if (i == 0) {
      transpose_32bit_4x4(op, op);
    }
  }

  // Convert to int16_t. The C code checks that we are in range.
  op[0] = _mm_packs_epi32(op[0], op[1]);
  op[1] = _mm_packs_epi32(op[2], op[3]);

  // Load uint16_t.
  __m128i dst[2];
  __m128i tmp[4];
  tmp[0] = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
  tmp[1] = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
  dst[0] = _mm_unpacklo_epi64(tmp[0], tmp[1]);
  tmp[2] = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
  tmp[3] = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
  dst[1] = _mm_unpacklo_epi64(tmp[2], tmp[3]);

  // Add to the previous results.
  dst[0] = _mm_add_epi16(dst[0], op[0]);
  dst[1] = _mm_add_epi16(dst[1], op[1]);

  // Clamp.
  dst[0] = highbd_clamp_epi16(dst[0], bd);
  dst[1] = highbd_clamp_epi16(dst[1], bd);

  // Store.
  _mm_storel_epi64((__m128i *)(dest + 0 * stride), dst[0]);
  dst[0] = _mm_srli_si128(dst[0], 8);
  _mm_storel_epi64((__m128i *)(dest + 1 * stride), dst[0]);
  _mm_storel_epi64((__m128i *)(dest + 2 * stride), dst[1]);
  dst[1] = _mm_srli_si128(dst[1], 8);
  _mm_storel_epi64((__m128i *)(dest + 3 * stride), dst[1]);
}

static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0,
                          __m128i *out1, const __m128i *clamp_lo,
                          const __m128i *clamp_hi) {
  __m128i a0 = _mm_add_epi32(in0, in1);
  __m128i a1 = _mm_sub_epi32(in0, in1);

  a0 = _mm_max_epi32(a0, *clamp_lo);
  a0 = _mm_min_epi32(a0, *clamp_hi);
  a1 = _mm_max_epi32(a1, *clamp_lo);
  a1 = _mm_min_epi32(a1, *clamp_hi);

  *out0 = a0;
  *out1 = a1;
}
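// addsub_sse4_1 above is the basic transform butterfly:
//   *out0 = clamp(in0 + in1), *out1 = clamp(in0 - in1).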

static void shift_and_clamp_sse4_1(__m128i *in0, __m128i *in1,
                                   const __m128i *clamp_lo,
                                   const __m128i *clamp_hi, int shift) {
  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
  __m128i in0_w_offset = _mm_add_epi32(*in0, offset);
  __m128i in1_w_offset = _mm_add_epi32(*in1, offset);

  in0_w_offset = _mm_sra_epi32(in0_w_offset, _mm_cvtsi32_si128(shift));
  in1_w_offset = _mm_sra_epi32(in1_w_offset, _mm_cvtsi32_si128(shift));

  in0_w_offset = _mm_max_epi32(in0_w_offset, *clamp_lo);
  in0_w_offset = _mm_min_epi32(in0_w_offset, *clamp_hi);
  in1_w_offset = _mm_max_epi32(in1_w_offset, *clamp_lo);
  in1_w_offset = _mm_min_epi32(in1_w_offset, *clamp_hi);

  *in0 = in0_w_offset;
  *in1 = in1_w_offset;
}

static inline void idct32_stage4_sse4_1(
    __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56,
    const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40,
    const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24,
    const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);
  bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
  bf1[17] = temp1;

  temp2 = half_btf_sse4_1(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
  bf1[29] =
      half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
  bf1[18] = temp2;

  temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
  bf1[26] =
      half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
  bf1[21] = temp1;

  temp2 =
      half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
  bf1[25] =
      half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
  bf1[22] = temp2;
}
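// half_btf_sse4_1 (from highbd_txfm_utility_sse4.h) computes the
// half-butterfly (*w0 * *n0 + *w1 * *n1 + *rounding) >> bit used by the
// idct32 stage helpers.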

static inline void idct32_stage5_sse4_1(
    __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48,
    const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo,
    const __m128i *clamp_hi, const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);
  bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
  bf1[9] = temp1;

  temp2 =
      half_btf_sse4_1(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
  bf1[13] =
      half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
  bf1[10] = temp2;

  addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
}

static inline void idct32_stage6_sse4_1(
    __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32,
    const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
    const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
    const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
  bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
  bf1[5] = temp1;

  addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);

  temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
  bf1[29] =
      half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
  bf1[18] = temp1;
  temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
  bf1[28] =
      half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
  bf1[19] = temp2;
  temp1 =
      half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
  bf1[27] =
      half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
  bf1[20] = temp1;
  temp2 =
      half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
  bf1[26] =
      half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
  bf1[21] = temp2;
}

static inline void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32,
                                        const __m128i *cospi32,
                                        const __m128i *clamp_lo,
                                        const __m128i *clamp_hi,
                                        const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  addsub_sse4_1(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);

  temp1 = half_btf_sse4_1(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
  bf1[13] =
      half_btf_sse4_1(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
  bf1[10] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
  bf1[12] =
      half_btf_sse4_1(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
  bf1[11] = temp2;

  addsub_sse4_1(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
}

static inline void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32,
                                        const __m128i *cospi32,
                                        const __m128i *clamp_lo,
                                        const __m128i *clamp_hi,
                                        const __m128i *rounding, int bit) {
  __m128i temp1, temp2;
  addsub_sse4_1(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);

  temp1 = half_btf_sse4_1(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);
  bf1[27] =
      half_btf_sse4_1(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
  bf1[20] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);
  bf1[26] =
      half_btf_sse4_1(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
  bf1[21] = temp2;
  temp1 = half_btf_sse4_1(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);
  bf1[25] =
      half_btf_sse4_1(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
  bf1[22] = temp1;
  temp2 = half_btf_sse4_1(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);
  bf1[24] =
      half_btf_sse4_1(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
  bf1[23] = temp2;
}

static inline void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out,
                                        const int do_cols, const int bd,
                                        const int out_shift,
                                        const __m128i *clamp_lo,
                                        const __m128i *clamp_hi) {
  addsub_sse4_1(bf1[0], bf1[31], out + 0, out + 31, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[1], bf1[30], out + 1, out + 30, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[2], bf1[29], out + 2, out + 29, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[3], bf1[28], out + 3, out + 28, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[4], bf1[27], out + 4, out + 27, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[5], bf1[26], out + 5, out + 26, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[6], bf1[25], out + 6, out + 25, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[7], bf1[24], out + 7, out + 24, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[8], bf1[23], out + 8, out + 23, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[9], bf1[22], out + 9, out + 22, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[10], bf1[21], out + 10, out + 21, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[11], bf1[20], out + 11, out + 20, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[12], bf1[19], out + 12, out + 19, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[13], bf1[18], out + 13, out + 18, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[14], bf1[17], out + 14, out + 17, clamp_lo, clamp_hi);
  addsub_sse4_1(bf1[15], bf1[16], out + 15, out + 16, clamp_lo, clamp_hi);

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
    for (int i = 0; i < 32; i += 8) {
      round_shift_4x4(out + i, out_shift);
      round_shift_4x4(out + i + 4, out_shift);
    }
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
  }
}

static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
                             __m128i *out0, __m128i *out1,
                             const __m128i *clamp_lo, const __m128i *clamp_hi,
                             int shift) {
  __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
  __m128i a0 = _mm_add_epi32(offset, in0);
  __m128i a1 = _mm_sub_epi32(offset, in1);

  a0 = _mm_sra_epi32(a0, _mm_cvtsi32_si128(shift));
  a1 = _mm_sra_epi32(a1, _mm_cvtsi32_si128(shift));

  a0 = _mm_max_epi32(a0, *clamp_lo);
  a0 = _mm_min_epi32(a0, *clamp_hi);
  a1 = _mm_max_epi32(a1, *clamp_lo);
  a1 = _mm_min_epi32(a1, *clamp_hi);

  *out0 = a0;
  *out1 = a1;
}
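// neg_shift_sse4_1 above applies a rounded shift to in0 and to -in1:
//   *out0 = clamp((in0 + offset) >> shift),
//   *out1 = clamp((offset - in1) >> shift).
// The iadst outputs at odd positions are negated, which is folded in here.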

static void idct4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                           int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3, x, y;

  // Stage 0
  // Stage 1
  // Stage 2
  u0 = in[0];
  u1 = in[1];
  u2 = in[2];
  u3 = in[3];

  x = _mm_mullo_epi32(u0, cospi32);
  y = _mm_mullo_epi32(u2, cospi32);
  v0 = _mm_add_epi32(x, y);
  v0 = _mm_add_epi32(v0, rnding);
  v0 = _mm_srai_epi32(v0, bit);

  v1 = _mm_sub_epi32(x, y);
  v1 = _mm_add_epi32(v1, rnding);
  v1 = _mm_srai_epi32(v1, bit);

  x = _mm_mullo_epi32(u1, cospi48);
  y = _mm_mullo_epi32(u3, cospim16);
  v2 = _mm_add_epi32(x, y);
  v2 = _mm_add_epi32(v2, rnding);
  v2 = _mm_srai_epi32(v2, bit);

  x = _mm_mullo_epi32(u1, cospi16);
  y = _mm_mullo_epi32(u3, cospi48);
  v3 = _mm_add_epi32(x, y);
  v3 = _mm_add_epi32(v3, rnding);
  v3 = _mm_srai_epi32(v3, bit);

  // Stage 3
  addsub_sse4_1(v0, v3, out + 0, out + 3, &clamp_lo, &clamp_hi);
  addsub_sse4_1(v1, v2, out + 1, out + 2, &clamp_lo, &clamp_hi);

  if (!do_cols) {
    log_range = AOMMAX(16, bd + 6);
    clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);

    shift_and_clamp_sse4_1(out + 0, out + 3, &clamp_lo, &clamp_hi, out_shift);
    shift_and_clamp_sse4_1(out + 1, out + 2, &clamp_lo, &clamp_hi, out_shift);
  }
}
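// Note on clamp ranges: the row pass (do_cols == 0) keeps bd + 8 bits of
// intermediate headroom and the column pass bd + 6, never dropping below
// 16 bits; the same pattern recurs in the transforms below.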

static void iadst4x4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                            int bd, int out_shift) {
  const int32_t *sinpi = sinpi_arr(bit);
  const __m128i zero = _mm_setzero_si128();
  __m128i rnding = _mm_set1_epi32(1 << (bit + 4 - 1));
  rnding = _mm_unpacklo_epi32(rnding, zero);
  const __m128i mul = _mm_set1_epi32(1 << 4);
  const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
  const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
  const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
  const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
  __m128i t;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i x0, x1, x2, x3;
  __m128i u0, u1, u2, u3;
  __m128i u0_low, u1_low, u2_low, u3_low;
  __m128i u0_high, u1_high, u2_high, u3_high;

  x0 = in[0];
  x1 = in[1];
  x2 = in[2];
  x3 = in[3];

  s0 = _mm_mullo_epi32(x0, sinpi1);
  s1 = _mm_mullo_epi32(x0, sinpi2);
  s2 = _mm_mullo_epi32(x1, sinpi3);
  s3 = _mm_mullo_epi32(x2, sinpi4);
  s4 = _mm_mullo_epi32(x2, sinpi1);
  s5 = _mm_mullo_epi32(x3, sinpi2);
  s6 = _mm_mullo_epi32(x3, sinpi4);
  t = _mm_sub_epi32(x0, x2);
  s7 = _mm_add_epi32(t, x3);

  t = _mm_add_epi32(s0, s3);
  s0 = _mm_add_epi32(t, s5);
  t = _mm_sub_epi32(s1, s4);
  s1 = _mm_sub_epi32(t, s6);
  s3 = s2;
  s2 = _mm_mullo_epi32(s7, sinpi3);

  u0 = _mm_add_epi32(s0, s3);
  u1 = _mm_add_epi32(s1, s3);
  u2 = s2;
  t = _mm_add_epi32(s0, s1);
  u3 = _mm_sub_epi32(t, s3);

  // u0
  u0_low = _mm_mul_epi32(u0, mul);
  u0_low = _mm_add_epi64(u0_low, rnding);

  u0 = _mm_srli_si128(u0, 4);
  u0_high = _mm_mul_epi32(u0, mul);
  u0_high = _mm_add_epi64(u0_high, rnding);

  u0_low = _mm_srli_si128(u0_low, 2);
  u0_high = _mm_srli_si128(u0_high, 2);

  u0 = _mm_unpacklo_epi32(u0_low, u0_high);
  u0_high = _mm_unpackhi_epi32(u0_low, u0_high);
  u0 = _mm_unpacklo_epi64(u0, u0_high);

  // u1
  u1_low = _mm_mul_epi32(u1, mul);
  u1_low = _mm_add_epi64(u1_low, rnding);

  u1 = _mm_srli_si128(u1, 4);
  u1_high = _mm_mul_epi32(u1, mul);
  u1_high = _mm_add_epi64(u1_high, rnding);

  u1_low = _mm_srli_si128(u1_low, 2);
  u1_high = _mm_srli_si128(u1_high, 2);

  u1 = _mm_unpacklo_epi32(u1_low, u1_high);
  u1_high = _mm_unpackhi_epi32(u1_low, u1_high);
  u1 = _mm_unpacklo_epi64(u1, u1_high);

  // u2
  u2_low = _mm_mul_epi32(u2, mul);
  u2_low = _mm_add_epi64(u2_low, rnding);

  u2 = _mm_srli_si128(u2, 4);
  u2_high = _mm_mul_epi32(u2, mul);
  u2_high = _mm_add_epi64(u2_high, rnding);

  u2_low = _mm_srli_si128(u2_low, 2);
  u2_high = _mm_srli_si128(u2_high, 2);

  u2 = _mm_unpacklo_epi32(u2_low, u2_high);
  u2_high = _mm_unpackhi_epi32(u2_low, u2_high);
  u2 = _mm_unpacklo_epi64(u2, u2_high);

  // u3
  u3_low = _mm_mul_epi32(u3, mul);
  u3_low = _mm_add_epi64(u3_low, rnding);

  u3 = _mm_srli_si128(u3, 4);
  u3_high = _mm_mul_epi32(u3, mul);
  u3_high = _mm_add_epi64(u3_high, rnding);

  u3_low = _mm_srli_si128(u3_low, 2);
  u3_high = _mm_srli_si128(u3_high, 2);

  u3 = _mm_unpacklo_epi32(u3_low, u3_high);
  u3_high = _mm_unpackhi_epi32(u3_low, u3_high);
  u3 = _mm_unpacklo_epi64(u3, u3_high);

  out[0] = u0;
  out[1] = u1;
  out[2] = u2;
  out[3] = u3;

  if (!do_cols) {
    const int log_range = AOMMAX(16, bd + 6);
    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
    round_shift_4x4(out, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
  }
}
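// iadst4x4 above forms its outputs in 64-bit lanes: with bit == 12
// (INV_COS_BIT), out = (s * 16 + (1 << 15)) >> 16 == round(s / 2^bit).
// The _mm_mul_epi32 / _mm_srli_si128 sequence keeps the scaled sinpi sums
// from overflowing 32 bits.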

static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
                             int fliplr, int flipud, int shift, int bd) {
  const __m128i zero = _mm_setzero_si128();
  __m128i u0, u1, u2, u3;
  __m128i v0, v1, v2, v3;

  round_shift_4x4(in, shift);

  v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride));
  v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride));
  v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
  v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));

  v0 = _mm_unpacklo_epi16(v0, zero);
  v1 = _mm_unpacklo_epi16(v1, zero);
  v2 = _mm_unpacklo_epi16(v2, zero);
  v3 = _mm_unpacklo_epi16(v3, zero);

  if (fliplr) {
    in[0] = _mm_shuffle_epi32(in[0], 0x1B);
    in[1] = _mm_shuffle_epi32(in[1], 0x1B);
    in[2] = _mm_shuffle_epi32(in[2], 0x1B);
    in[3] = _mm_shuffle_epi32(in[3], 0x1B);
  }

  if (flipud) {
    u0 = _mm_add_epi32(in[3], v0);
    u1 = _mm_add_epi32(in[2], v1);
    u2 = _mm_add_epi32(in[1], v2);
    u3 = _mm_add_epi32(in[0], v3);
  } else {
    u0 = _mm_add_epi32(in[0], v0);
    u1 = _mm_add_epi32(in[1], v1);
    u2 = _mm_add_epi32(in[2], v2);
    u3 = _mm_add_epi32(in[3], v3);
  }

  v0 = _mm_packus_epi32(u0, u1);
  v2 = _mm_packus_epi32(u2, u3);

  u0 = highbd_clamp_epi16(v0, bd);
  u2 = highbd_clamp_epi16(v2, bd);

  v0 = _mm_unpacklo_epi64(u0, u0);
  v1 = _mm_unpackhi_epi64(u0, u0);
  v2 = _mm_unpacklo_epi64(u2, u2);
  v3 = _mm_unpackhi_epi64(u2, u2);

  _mm_storel_epi64((__m128i *)(output + 0 * stride), v0);
  _mm_storel_epi64((__m128i *)(output + 1 * stride), v1);
  _mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
  _mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
}

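// iidentity4 scales each coefficient by NewSqrt2 / 2^NewSqrt2Bits
// (approximately sqrt(2)), using the same 64-bit multiply-and-repack
// pattern as iadst4x4 above.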
static void iidentity4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                              int bd, int out_shift) {
  (void)bit;
  __m128i zero = _mm_setzero_si128();
  __m128i fact = _mm_set1_epi32(NewSqrt2);
  __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
  __m128i a0_low, a1_low;
  __m128i a0_high, a1_high;

  offset = _mm_unpacklo_epi32(offset, zero);

  for (int i = 0; i < 4; i++) {
    a0_low = _mm_mul_epi32(in[i], fact);
    a0_low = _mm_add_epi32(a0_low, offset);
    a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits);

    a0_high = _mm_srli_si128(in[i], 4);
    a0_high = _mm_mul_epi32(a0_high, fact);
    a0_high = _mm_add_epi32(a0_high, offset);
    a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits);

    a1_low = _mm_unpacklo_epi32(a0_low, a0_high);
    a1_high = _mm_unpackhi_epi32(a0_low, a0_high);
    out[i] = _mm_unpacklo_epi64(a1_low, a1_high);
  }

  if (!do_cols) {
    const int log_range = AOMMAX(16, bd + 6);
    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
    round_shift_4x4(out, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
  }
}
void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *input, uint16_t *output,
                                   int stride, TX_TYPE tx_type, int bd) {
  __m128i in[4];
  const int8_t *shift = av1_inv_txfm_shift_ls[TX_4X4];

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_4x4(input, in);
      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_DCT:
      load_buffer_4x4(input, in);
      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case DCT_ADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_ADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case FLIPADST_DCT:
      load_buffer_4x4(input, in);
      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
      break;
    case DCT_FLIPADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 1, 1, -shift[1], bd);
      break;
    case ADST_FLIPADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
      break;
    case FLIPADST_ADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
      break;
    case IDTX:
      load_buffer_4x4(input, in);
      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case V_DCT:
      load_buffer_4x4(input, in);
      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      idct4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case H_DCT:
      load_buffer_4x4(input, in);
      idct4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case V_ADST:
      load_buffer_4x4(input, in);
      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case H_ADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
      break;
    case V_FLIPADST:
      load_buffer_4x4(input, in);
      iidentity4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
      break;
    case H_FLIPADST:
      load_buffer_4x4(input, in);
      iadst4x4_sse4_1(in, in, INV_COS_BIT, 0, bd, 0);
      transpose_32bit_4x4(in, in);
      iidentity4_sse4_1(in, in, INV_COS_BIT, 1, bd, 0);
      write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
      break;
    default: assert(0);
  }
}
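// Every branch above runs the same 2-D pipeline: row transform
// (do_cols == 0), 4x4 transpose, column transform (do_cols == 1), then
// write_buffer_4x4 adds the residue to the prediction with a final
// rounding shift of -shift[1].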

// 8x8
static void load_buffer_8x8(const int32_t *coeff, __m128i *in) {
  in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
  in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
  in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
  in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
  in[4] = _mm_load_si128((const __m128i *)(coeff + 16));
  in[5] = _mm_load_si128((const __m128i *)(coeff + 20));
  in[6] = _mm_load_si128((const __m128i *)(coeff + 24));
  in[7] = _mm_load_si128((const __m128i *)(coeff + 28));
  in[8] = _mm_load_si128((const __m128i *)(coeff + 32));
  in[9] = _mm_load_si128((const __m128i *)(coeff + 36));
  in[10] = _mm_load_si128((const __m128i *)(coeff + 40));
  in[11] = _mm_load_si128((const __m128i *)(coeff + 44));
  in[12] = _mm_load_si128((const __m128i *)(coeff + 48));
  in[13] = _mm_load_si128((const __m128i *)(coeff + 52));
  in[14] = _mm_load_si128((const __m128i *)(coeff + 56));
  in[15] = _mm_load_si128((const __m128i *)(coeff + 60));
}

static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                           int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i x, y;
  int col;

  // Note:
  //  Even columns: 0, 2, ..., 14
  //  Odd columns: 1, 3, ..., 15
  //  One even column plus one odd column constructs one row (8 coeffs);
  //  in total we have 8 rows (8x8).
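  // Concretely: coefficient row r lives in in[2 * r] (columns 0-3) and
  // in[2 * r + 1] (columns 4-7); the loop below handles one half per
  // iteration.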
  for (col = 0; col < 2; ++col) {
    // stage 0
    // stage 1
    // stage 2
    u0 = in[0 * 2 + col];
    u1 = in[4 * 2 + col];
    u2 = in[2 * 2 + col];
    u3 = in[6 * 2 + col];

    x = _mm_mullo_epi32(in[1 * 2 + col], cospi56);
    y = _mm_mullo_epi32(in[7 * 2 + col], cospim8);
    u4 = _mm_add_epi32(x, y);
    u4 = _mm_add_epi32(u4, rnding);
    u4 = _mm_srai_epi32(u4, bit);

    x = _mm_mullo_epi32(in[1 * 2 + col], cospi8);
    y = _mm_mullo_epi32(in[7 * 2 + col], cospi56);
    u7 = _mm_add_epi32(x, y);
    u7 = _mm_add_epi32(u7, rnding);
    u7 = _mm_srai_epi32(u7, bit);

    x = _mm_mullo_epi32(in[5 * 2 + col], cospi24);
    y = _mm_mullo_epi32(in[3 * 2 + col], cospim40);
    u5 = _mm_add_epi32(x, y);
    u5 = _mm_add_epi32(u5, rnding);
    u5 = _mm_srai_epi32(u5, bit);

    x = _mm_mullo_epi32(in[5 * 2 + col], cospi40);
    y = _mm_mullo_epi32(in[3 * 2 + col], cospi24);
    u6 = _mm_add_epi32(x, y);
    u6 = _mm_add_epi32(u6, rnding);
    u6 = _mm_srai_epi32(u6, bit);

    // stage 3
    x = _mm_mullo_epi32(u0, cospi32);
    y = _mm_mullo_epi32(u1, cospi32);
    v0 = _mm_add_epi32(x, y);
    v0 = _mm_add_epi32(v0, rnding);
    v0 = _mm_srai_epi32(v0, bit);

    v1 = _mm_sub_epi32(x, y);
    v1 = _mm_add_epi32(v1, rnding);
    v1 = _mm_srai_epi32(v1, bit);

    x = _mm_mullo_epi32(u2, cospi48);
    y = _mm_mullo_epi32(u3, cospim16);
    v2 = _mm_add_epi32(x, y);
    v2 = _mm_add_epi32(v2, rnding);
    v2 = _mm_srai_epi32(v2, bit);

    x = _mm_mullo_epi32(u2, cospi16);
    y = _mm_mullo_epi32(u3, cospi48);
    v3 = _mm_add_epi32(x, y);
    v3 = _mm_add_epi32(v3, rnding);
    v3 = _mm_srai_epi32(v3, bit);

    addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
    addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);

    // stage 4
    addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
    addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
    u4 = v4;
    u7 = v7;

    x = _mm_mullo_epi32(v5, cospi32);
    y = _mm_mullo_epi32(v6, cospi32);
    u6 = _mm_add_epi32(y, x);
    u6 = _mm_add_epi32(u6, rnding);
    u6 = _mm_srai_epi32(u6, bit);

    u5 = _mm_sub_epi32(y, x);
    u5 = _mm_add_epi32(u5, rnding);
    u5 = _mm_srai_epi32(u5, bit);

    // stage 5
    addsub_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col, &clamp_lo,
                  &clamp_hi);
    addsub_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col, &clamp_lo,
                  &clamp_hi);
    addsub_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col, &clamp_lo,
                  &clamp_hi);
    addsub_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col, &clamp_lo,
                  &clamp_hi);
  }

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
    round_shift_8x8(out, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
  }
}

static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                            int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i kZero = _mm_setzero_si128();
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u[8], v[8], x;

  // Even 8 points: 0, 2, ..., 14
  // stage 0
  // stage 1
  // stage 2
  // (1)
  u[0] = _mm_mullo_epi32(in[14], cospi4);
  x = _mm_mullo_epi32(in[0], cospi60);
  u[0] = _mm_add_epi32(u[0], x);
  u[0] = _mm_add_epi32(u[0], rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  u[1] = _mm_mullo_epi32(in[14], cospi60);
  x = _mm_mullo_epi32(in[0], cospi4);
  u[1] = _mm_sub_epi32(u[1], x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // (2)
  u[2] = _mm_mullo_epi32(in[10], cospi20);
  x = _mm_mullo_epi32(in[4], cospi44);
  u[2] = _mm_add_epi32(u[2], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_mullo_epi32(in[10], cospi44);
  x = _mm_mullo_epi32(in[4], cospi20);
  u[3] = _mm_sub_epi32(u[3], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  // (3)
  u[4] = _mm_mullo_epi32(in[6], cospi36);
  x = _mm_mullo_epi32(in[8], cospi28);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(in[6], cospi28);
  x = _mm_mullo_epi32(in[8], cospi36);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  // (4)
  u[6] = _mm_mullo_epi32(in[2], cospi52);
  x = _mm_mullo_epi32(in[12], cospi12);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(in[2], cospi12);
  x = _mm_mullo_epi32(in[12], cospi52);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 3
  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);

  // stage 4
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];

  u[4] = _mm_mullo_epi32(v[4], cospi16);
  x = _mm_mullo_epi32(v[5], cospi48);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(v[4], cospi48);
  x = _mm_mullo_epi32(v[5], cospi16);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  u[6] = _mm_mullo_epi32(v[6], cospim48);
  x = _mm_mullo_epi32(v[7], cospi16);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(v[6], cospi16);
  x = _mm_mullo_epi32(v[7], cospim48);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 5
  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);

  // stage 6
  u[0] = v[0];
  u[1] = v[1];
  u[4] = v[4];
  u[5] = v[5];

  v[0] = _mm_mullo_epi32(v[2], cospi32);
  x = _mm_mullo_epi32(v[3], cospi32);
  u[2] = _mm_add_epi32(v[0], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(v[0], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  v[0] = _mm_mullo_epi32(v[6], cospi32);
  x = _mm_mullo_epi32(v[7], cospi32);
  u[6] = _mm_add_epi32(v[0], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(v[0], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[0] = u[0];
    out[2] = _mm_sub_epi32(kZero, u[4]);
    out[4] = u[6];
    out[6] = _mm_sub_epi32(kZero, u[2]);
    out[8] = u[3];
    out[10] = _mm_sub_epi32(kZero, u[7]);
    out[12] = u[5];
    out[14] = _mm_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[4], out + 0, out + 2, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[6], u[2], out + 4, out + 6, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[3], u[7], out + 8, out + 10, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[5], u[1], out + 12, out + 14, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
  }

  // Odd 8 points: 1, 3, ..., 15
  // stage 0
  // stage 1
  // stage 2
  // (1)
  u[0] = _mm_mullo_epi32(in[15], cospi4);
  x = _mm_mullo_epi32(in[1], cospi60);
  u[0] = _mm_add_epi32(u[0], x);
  u[0] = _mm_add_epi32(u[0], rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  u[1] = _mm_mullo_epi32(in[15], cospi60);
  x = _mm_mullo_epi32(in[1], cospi4);
  u[1] = _mm_sub_epi32(u[1], x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // (2)
  u[2] = _mm_mullo_epi32(in[11], cospi20);
  x = _mm_mullo_epi32(in[5], cospi44);
  u[2] = _mm_add_epi32(u[2], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_mullo_epi32(in[11], cospi44);
  x = _mm_mullo_epi32(in[5], cospi20);
  u[3] = _mm_sub_epi32(u[3], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  // (3)
  u[4] = _mm_mullo_epi32(in[7], cospi36);
  x = _mm_mullo_epi32(in[9], cospi28);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(in[7], cospi28);
  x = _mm_mullo_epi32(in[9], cospi36);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  // (4)
  u[6] = _mm_mullo_epi32(in[3], cospi52);
  x = _mm_mullo_epi32(in[13], cospi12);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(in[3], cospi12);
  x = _mm_mullo_epi32(in[13], cospi52);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 3
  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);

  // stage 4
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];

  u[4] = _mm_mullo_epi32(v[4], cospi16);
  x = _mm_mullo_epi32(v[5], cospi48);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(v[4], cospi48);
  x = _mm_mullo_epi32(v[5], cospi16);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  u[6] = _mm_mullo_epi32(v[6], cospim48);
  x = _mm_mullo_epi32(v[7], cospi16);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(v[6], cospi16);
  x = _mm_mullo_epi32(v[7], cospim48);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 5
  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);

  // stage 6
  u[0] = v[0];
  u[1] = v[1];
  u[4] = v[4];
  u[5] = v[5];

  v[0] = _mm_mullo_epi32(v[2], cospi32);
  x = _mm_mullo_epi32(v[3], cospi32);
  u[2] = _mm_add_epi32(v[0], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(v[0], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  v[0] = _mm_mullo_epi32(v[6], cospi32);
  x = _mm_mullo_epi32(v[7], cospi32);
  u[6] = _mm_add_epi32(v[0], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(v[0], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[1] = u[0];
    out[3] = _mm_sub_epi32(kZero, u[4]);
    out[5] = u[6];
    out[7] = _mm_sub_epi32(kZero, u[2]);
    out[9] = u[3];
    out[11] = _mm_sub_epi32(kZero, u[7]);
    out[13] = u[5];
    out[15] = _mm_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[4], out + 1, out + 3, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[6], u[2], out + 5, out + 7, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[3], u[7], out + 9, out + 11, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[5], u[1], out + 13, out + 15, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
  }
}

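// iidentity8 doubles each coefficient: the 8-point identity transform
// scales by 2, so no multiply is needed.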
static void iidentity8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                              int bd, int out_shift) {
  (void)bit;
  out[0] = _mm_add_epi32(in[0], in[0]);
  out[1] = _mm_add_epi32(in[1], in[1]);
  out[2] = _mm_add_epi32(in[2], in[2]);
  out[3] = _mm_add_epi32(in[3], in[3]);
  out[4] = _mm_add_epi32(in[4], in[4]);
  out[5] = _mm_add_epi32(in[5], in[5]);
  out[6] = _mm_add_epi32(in[6], in[6]);
  out[7] = _mm_add_epi32(in[7], in[7]);

  if (!do_cols) {
    const int log_range = AOMMAX(16, bd + 6);
    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
    round_shift_4x4(out, out_shift);
    round_shift_4x4(out + 4, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 8);
  }
}

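// Reconstructs eight pixels: widens the 16-bit prediction to 32 bits, adds
// the residue (reversing the four dwords of each residue half with shuffle
// pattern 0x1B and swapping halves when fliplr is set), then packs with
// unsigned saturation and clamps to [0, (1 << bd) - 1].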
static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi,
                             int fliplr, int bd) {
  __m128i x0, x1;
  const __m128i zero = _mm_setzero_si128();

  x0 = _mm_unpacklo_epi16(pred, zero);
  x1 = _mm_unpackhi_epi16(pred, zero);

  if (fliplr) {
    res_lo = _mm_shuffle_epi32(res_lo, 0x1B);
    res_hi = _mm_shuffle_epi32(res_hi, 0x1B);
    x0 = _mm_add_epi32(res_hi, x0);
    x1 = _mm_add_epi32(res_lo, x1);
  } else {
    x0 = _mm_add_epi32(res_lo, x0);
    x1 = _mm_add_epi32(res_hi, x1);
  }

  x0 = _mm_packus_epi32(x0, x1);
  return highbd_clamp_epi16(x0, bd);
}

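// Applies the final rounding shift to the 8x8 residue, adds it to the
// prediction already resident in `output` (honoring the flipud/fliplr
// flags), and stores the clamped reconstruction. The aligned loads/stores
// require 16-byte-aligned rows.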
static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
                             int fliplr, int flipud, int shift, int bd) {
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;

  round_shift_8x8(in, shift);

  v0 = _mm_load_si128((__m128i const *)(output + 0 * stride));
  v1 = _mm_load_si128((__m128i const *)(output + 1 * stride));
  v2 = _mm_load_si128((__m128i const *)(output + 2 * stride));
  v3 = _mm_load_si128((__m128i const *)(output + 3 * stride));
  v4 = _mm_load_si128((__m128i const *)(output + 4 * stride));
  v5 = _mm_load_si128((__m128i const *)(output + 5 * stride));
  v6 = _mm_load_si128((__m128i const *)(output + 6 * stride));
  v7 = _mm_load_si128((__m128i const *)(output + 7 * stride));

  if (flipud) {
    u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd);
    u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd);
    u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd);
    u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd);
    u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd);
    u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd);
    u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd);
    u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd);
  } else {
    u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd);
    u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd);
    u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd);
    u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd);
    u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd);
    u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd);
    u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd);
    u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd);
  }

  _mm_store_si128((__m128i *)(output + 0 * stride), u0);
  _mm_store_si128((__m128i *)(output + 1 * stride), u1);
  _mm_store_si128((__m128i *)(output + 2 * stride), u2);
  _mm_store_si128((__m128i *)(output + 3 * stride), u3);
  _mm_store_si128((__m128i *)(output + 4 * stride), u4);
  _mm_store_si128((__m128i *)(output + 5 * stride), u5);
  _mm_store_si128((__m128i *)(output + 6 * stride), u6);
  _mm_store_si128((__m128i *)(output + 7 * stride), u7);
}

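// 2D 8x8 inverse transform: row pass (do_cols = 0, shifted by -shift[0]),
// transpose, column pass (do_cols = 1), then write_buffer_8x8 adds the
// result to the prediction with the final -shift[1]. FLIPADST variants
// reuse the iadst kernel and defer the flip to write_buffer_8x8.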
void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *input, uint16_t *output,
                                   int stride, TX_TYPE tx_type, int bd) {
  __m128i in[16], out[16];
  const int8_t *shift = av1_inv_txfm_shift_ls[TX_8X8];

  switch (tx_type) {
    case DCT_DCT:
      load_buffer_8x8(input, in);
      idct8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
      transpose_8x8(out, in);
      idct8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
      write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
      break;
    case DCT_ADST:
      load_buffer_8x8(input, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
      transpose_8x8(out, in);
      idct8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
      write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_DCT:
      load_buffer_8x8(input, in);
      idct8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
      transpose_8x8(out, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
      write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
      break;
    case ADST_ADST:
      load_buffer_8x8(input, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
      transpose_8x8(out, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
      write_buffer_8x8(out, output, stride, 0, 0, -shift[1], bd);
      break;
    case FLIPADST_DCT:
      load_buffer_8x8(input, in);
      idct8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
      transpose_8x8(out, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
      write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd);
      break;
    case DCT_FLIPADST:
      load_buffer_8x8(input, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
      transpose_8x8(out, in);
      idct8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
      write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd);
      break;
    case ADST_FLIPADST:
      load_buffer_8x8(input, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
      transpose_8x8(out, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
      write_buffer_8x8(out, output, stride, 1, 0, -shift[1], bd);
      break;
    case FLIPADST_FLIPADST:
      load_buffer_8x8(input, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
      transpose_8x8(out, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
      write_buffer_8x8(out, output, stride, 1, 1, -shift[1], bd);
      break;
    case FLIPADST_ADST:
      load_buffer_8x8(input, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 0, bd, -shift[0]);
      transpose_8x8(out, in);
      iadst8x8_sse4_1(in, out, INV_COS_BIT, 1, bd, 0);
      write_buffer_8x8(out, output, stride, 0, 1, -shift[1], bd);
      break;
    default: assert(0);
  }
}

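// DC-only fast path: with just in[0] non-zero, the whole idct8 network
// collapses to one multiply by cospi32 whose rounded result is broadcast
// to all eight outputs.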
static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                                int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i x;

  // stage 0
  // stage 1
  // stage 2
  // stage 3
  x = _mm_mullo_epi32(in[0], cospi32);
  x = _mm_add_epi32(x, rnding);
  x = _mm_srai_epi32(x, bit);

  // stage 4
  // stage 5
  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
    x = _mm_add_epi32(x, offset);
    x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
  }

  x = _mm_max_epi32(x, clamp_lo);
  x = _mm_min_epi32(x, clamp_hi);
  out[0] = x;
  out[1] = x;
  out[2] = x;
  out[3] = x;
  out[4] = x;
  out[5] = x;
  out[6] = x;
  out[7] = x;
}

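// Full 8-point inverse DCT on 32-bit coefficient columns. Every rotation
// uses the fixed-point idiom (a * c0 + b * c1 + (1 << (bit - 1))) >> bit,
// and butterfly sums are clamped to max(16, bd + (do_cols ? 6 : 8)) bits.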
static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                               int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i x, y;

  // stage 0
  // stage 1
  // stage 2
  u0 = in[0];
  u1 = in[4];
  u2 = in[2];
  u3 = in[6];

  x = _mm_mullo_epi32(in[1], cospi56);
  y = _mm_mullo_epi32(in[7], cospim8);
  u4 = _mm_add_epi32(x, y);
  u4 = _mm_add_epi32(u4, rnding);
  u4 = _mm_srai_epi32(u4, bit);

  x = _mm_mullo_epi32(in[1], cospi8);
  y = _mm_mullo_epi32(in[7], cospi56);
  u7 = _mm_add_epi32(x, y);
  u7 = _mm_add_epi32(u7, rnding);
  u7 = _mm_srai_epi32(u7, bit);

  x = _mm_mullo_epi32(in[5], cospi24);
  y = _mm_mullo_epi32(in[3], cospim40);
  u5 = _mm_add_epi32(x, y);
  u5 = _mm_add_epi32(u5, rnding);
  u5 = _mm_srai_epi32(u5, bit);

  x = _mm_mullo_epi32(in[5], cospi40);
  y = _mm_mullo_epi32(in[3], cospi24);
  u6 = _mm_add_epi32(x, y);
  u6 = _mm_add_epi32(u6, rnding);
  u6 = _mm_srai_epi32(u6, bit);

  // stage 3
  x = _mm_mullo_epi32(u0, cospi32);
  y = _mm_mullo_epi32(u1, cospi32);
  v0 = _mm_add_epi32(x, y);
  v0 = _mm_add_epi32(v0, rnding);
  v0 = _mm_srai_epi32(v0, bit);

  v1 = _mm_sub_epi32(x, y);
  v1 = _mm_add_epi32(v1, rnding);
  v1 = _mm_srai_epi32(v1, bit);

  x = _mm_mullo_epi32(u2, cospi48);
  y = _mm_mullo_epi32(u3, cospim16);
  v2 = _mm_add_epi32(x, y);
  v2 = _mm_add_epi32(v2, rnding);
  v2 = _mm_srai_epi32(v2, bit);

  x = _mm_mullo_epi32(u2, cospi16);
  y = _mm_mullo_epi32(u3, cospi48);
  v3 = _mm_add_epi32(x, y);
  v3 = _mm_add_epi32(v3, rnding);
  v3 = _mm_srai_epi32(v3, bit);

  addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);

  // stage 4
  addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
  addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
  u4 = v4;
  u7 = v7;

  x = _mm_mullo_epi32(v5, cospi32);
  y = _mm_mullo_epi32(v6, cospi32);
  u6 = _mm_add_epi32(y, x);
  u6 = _mm_add_epi32(u6, rnding);
  u6 = _mm_srai_epi32(u6, bit);

  u5 = _mm_sub_epi32(y, x);
  u5 = _mm_add_epi32(u5, rnding);
  u5 = _mm_srai_epi32(u5, bit);

  // stage 5
  addsub_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo, &clamp_hi);

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    round_shift_4x4(out, out_shift);
    round_shift_4x4(out + 4, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 8);
  }
}

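// iadst8 fast path for a DC-only input: only the in[0] * cospi4/cospi60
// pair of stage 2 survives, so the later stages reduce to two rotations
// plus the sign-alternating output permutation of the full iadst8.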
static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                 int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i kZero = _mm_setzero_si128();
  __m128i u[8], x;

  // stage 0
  // stage 1
  // stage 2

  x = _mm_mullo_epi32(in[0], cospi60);
  u[0] = _mm_add_epi32(x, rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  x = _mm_mullo_epi32(in[0], cospi4);
  u[1] = _mm_sub_epi32(kZero, x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // stage 3
  // stage 4
  __m128i temp1, temp2;
  temp1 = _mm_mullo_epi32(u[0], cospi16);
  x = _mm_mullo_epi32(u[1], cospi48);
  temp1 = _mm_add_epi32(temp1, x);
  temp1 = _mm_add_epi32(temp1, rnding);
  temp1 = _mm_srai_epi32(temp1, bit);
  u[4] = temp1;

  temp2 = _mm_mullo_epi32(u[0], cospi48);
  x = _mm_mullo_epi32(u[1], cospi16);
  u[5] = _mm_sub_epi32(temp2, x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  // stage 5
  // stage 6
  temp1 = _mm_mullo_epi32(u[0], cospi32);
  x = _mm_mullo_epi32(u[1], cospi32);
  u[2] = _mm_add_epi32(temp1, x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(temp1, x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  temp1 = _mm_mullo_epi32(u[4], cospi32);
  x = _mm_mullo_epi32(u[5], cospi32);
  u[6] = _mm_add_epi32(temp1, x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(temp1, x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[0] = u[0];
    out[1] = _mm_sub_epi32(kZero, u[4]);
    out[2] = u[6];
    out[3] = _mm_sub_epi32(kZero, u[2]);
    out[4] = u[3];
    out[5] = _mm_sub_epi32(kZero, u[7]);
    out[6] = u[5];
    out[7] = _mm_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
  }
}

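// Full 8-point inverse ADST: stage 2 forms the cospi cross products,
// stages 3 and 5 are add/sub butterflies, stages 4 and 6 rotate by
// cospi16/cospi48 and cospi32, and stage 7 emits the outputs in
// sign-alternating order (via neg_shift_sse4_1 in the row pass).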
static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                                int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i kZero = _mm_setzero_si128();
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u[8], v[8], x;

  // stage 0
  // stage 1
  // stage 2

  u[0] = _mm_mullo_epi32(in[7], cospi4);
  x = _mm_mullo_epi32(in[0], cospi60);
  u[0] = _mm_add_epi32(u[0], x);
  u[0] = _mm_add_epi32(u[0], rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  u[1] = _mm_mullo_epi32(in[7], cospi60);
  x = _mm_mullo_epi32(in[0], cospi4);
  u[1] = _mm_sub_epi32(u[1], x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  // (2)
  u[2] = _mm_mullo_epi32(in[5], cospi20);
  x = _mm_mullo_epi32(in[2], cospi44);
  u[2] = _mm_add_epi32(u[2], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_mullo_epi32(in[5], cospi44);
  x = _mm_mullo_epi32(in[2], cospi20);
  u[3] = _mm_sub_epi32(u[3], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  // (3)
  u[4] = _mm_mullo_epi32(in[3], cospi36);
  x = _mm_mullo_epi32(in[4], cospi28);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(in[3], cospi28);
  x = _mm_mullo_epi32(in[4], cospi36);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  // (4)
  u[6] = _mm_mullo_epi32(in[1], cospi52);
  x = _mm_mullo_epi32(in[6], cospi12);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(in[1], cospi12);
  x = _mm_mullo_epi32(in[6], cospi52);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 3
  addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);

  // stage 4
  u[0] = v[0];
  u[1] = v[1];
  u[2] = v[2];
  u[3] = v[3];

  u[4] = _mm_mullo_epi32(v[4], cospi16);
  x = _mm_mullo_epi32(v[5], cospi48);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  u[5] = _mm_mullo_epi32(v[4], cospi48);
  x = _mm_mullo_epi32(v[5], cospi16);
  u[5] = _mm_sub_epi32(u[5], x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  u[6] = _mm_mullo_epi32(v[6], cospim48);
  x = _mm_mullo_epi32(v[7], cospi16);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_mullo_epi32(v[6], cospi16);
  x = _mm_mullo_epi32(v[7], cospim48);
  u[7] = _mm_sub_epi32(u[7], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 5
  addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);

  // stage 6
  u[0] = v[0];
  u[1] = v[1];
  u[4] = v[4];
  u[5] = v[5];

  v[0] = _mm_mullo_epi32(v[2], cospi32);
  x = _mm_mullo_epi32(v[3], cospi32);
  u[2] = _mm_add_epi32(v[0], x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(v[0], x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  v[0] = _mm_mullo_epi32(v[6], cospi32);
  x = _mm_mullo_epi32(v[7], cospi32);
  u[6] = _mm_add_epi32(v[0], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(v[0], x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  // stage 7
  if (do_cols) {
    out[0] = u[0];
    out[1] = _mm_sub_epi32(kZero, u[4]);
    out[2] = u[6];
    out[3] = _mm_sub_epi32(kZero, u[2]);
    out[4] = u[3];
    out[5] = _mm_sub_epi32(kZero, u[7]);
    out[6] = u[5];
    out[7] = _mm_sub_epi32(kZero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
  }
}

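// DC-only fast path of the 16-point inverse DCT: the lone coefficient is
// scaled by cospi32, optionally round-shifted, and broadcast to all
// sixteen outputs.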
static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  // stage 0
  // stage 1
  // stage 2
  // stage 3
  // stage 4
  in[0] = _mm_mullo_epi32(in[0], cospi32);
  in[0] = _mm_add_epi32(in[0], rnding);
  in[0] = _mm_srai_epi32(in[0], bit);

  // stage 5
  // stage 6
  // stage 7
  if (!do_cols) {
    log_range = AOMMAX(16, bd + 6);
    clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
    clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
    if (out_shift != 0) {
      __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
      in[0] = _mm_add_epi32(in[0], offset);
      in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
    }
  }

  in[0] = _mm_max_epi32(in[0], clamp_lo);
  in[0] = _mm_min_epi32(in[0], clamp_hi);
  out[0] = in[0];
  out[1] = in[0];
  out[2] = in[0];
  out[3] = in[0];
  out[4] = in[0];
  out[5] = in[0];
  out[6] = in[0];
  out[7] = in[0];
  out[8] = in[0];
  out[9] = in[0];
  out[10] = in[0];
  out[11] = in[0];
  out[12] = in[0];
  out[13] = in[0];
  out[14] = in[0];
  out[15] = in[0];
}

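// idct16 specialized for inputs whose last eight coefficient columns are
// zero: each stage-2/3 butterfly then has a known-zero partner, so it
// collapses to a single multiply handled by half_btf_0_sse4_1.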
static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
                                  int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i u[16], x, y;
  // stage 0
  // stage 1
  u[0] = in[0];
  u[2] = in[4];
  u[4] = in[2];
  u[6] = in[6];
  u[8] = in[1];
  u[10] = in[5];
  u[12] = in[3];
  u[14] = in[7];

  // stage 2
  u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
  u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);

  u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
  u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);

  u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
  u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);

  u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
  u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);

  // stage 3
  u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
  u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
  u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit);
  u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit);

  addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);

  // stage 4
  x = _mm_mullo_epi32(u[0], cospi32);
  u[0] = _mm_add_epi32(x, rnding);
  u[0] = _mm_srai_epi32(u[0], bit);
  u[1] = u[0];

  u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
  u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);

  addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);

  x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
  u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
  u[9] = x;
  y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
  u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
  u[10] = y;

  // stage 5
  addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);

  x = _mm_mullo_epi32(u[5], cospi32);
  y = _mm_mullo_epi32(u[6], cospi32);
  u[5] = _mm_sub_epi32(y, x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  u[6] = _mm_add_epi32(y, x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);

  // stage 6
  addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);

  x = _mm_mullo_epi32(u[10], cospi32);
  y = _mm_mullo_epi32(u[13], cospi32);
  u[10] = _mm_sub_epi32(y, x);
  u[10] = _mm_add_epi32(u[10], rnding);
  u[10] = _mm_srai_epi32(u[10], bit);

  u[13] = _mm_add_epi32(x, y);
  u[13] = _mm_add_epi32(u[13], rnding);
  u[13] = _mm_srai_epi32(u[13], bit);

  x = _mm_mullo_epi32(u[11], cospi32);
  y = _mm_mullo_epi32(u[12], cospi32);
  u[11] = _mm_sub_epi32(y, x);
  u[11] = _mm_add_epi32(u[11], rnding);
  u[11] = _mm_srai_epi32(u[11], bit);

  u[12] = _mm_add_epi32(x, y);
  u[12] = _mm_add_epi32(u[12], rnding);
  u[12] = _mm_srai_epi32(u[12], bit);
  // stage 7
  addsub_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo, &clamp_hi);

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
    round_shift_8x8(out, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
  }
}

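// iadst16 fast path for a DC-only input: stage 2 leaves only the
// in[0] * cospi2/cospi62 pair, which the remaining stages propagate
// through the cospi8/56, cospi16/48, and cospi32 rotations.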
static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
                                   int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const __m128i zero = _mm_setzero_si128();
  __m128i v[16], x, y, temp1, temp2;
  // stage 0
  // stage 1
  // stage 2
  x = _mm_mullo_epi32(in[0], cospi62);
  v[0] = _mm_add_epi32(x, rnding);
  v[0] = _mm_srai_epi32(v[0], bit);

  x = _mm_mullo_epi32(in[0], cospi2);
  v[1] = _mm_sub_epi32(zero, x);
  v[1] = _mm_add_epi32(v[1], rnding);
  v[1] = _mm_srai_epi32(v[1], bit);

  // stage 3
  v[8] = v[0];
  v[9] = v[1];

  // stage 4
  temp1 = _mm_mullo_epi32(v[8], cospi8);
  x = _mm_mullo_epi32(v[9], cospi56);
  temp1 = _mm_add_epi32(temp1, x);
  temp1 = _mm_add_epi32(temp1, rnding);
  temp1 = _mm_srai_epi32(temp1, bit);

  temp2 = _mm_mullo_epi32(v[8], cospi56);
  x = _mm_mullo_epi32(v[9], cospi8);
  temp2 = _mm_sub_epi32(temp2, x);
  temp2 = _mm_add_epi32(temp2, rnding);
  temp2 = _mm_srai_epi32(temp2, bit);
  v[8] = temp1;
  v[9] = temp2;

  // stage 5
  v[4] = v[0];
  v[5] = v[1];
  v[12] = v[8];
  v[13] = v[9];

  // stage 6
  temp1 = _mm_mullo_epi32(v[4], cospi16);
  x = _mm_mullo_epi32(v[5], cospi48);
  temp1 = _mm_add_epi32(temp1, x);
  temp1 = _mm_add_epi32(temp1, rnding);
  temp1 = _mm_srai_epi32(temp1, bit);

  temp2 = _mm_mullo_epi32(v[4], cospi48);
  x = _mm_mullo_epi32(v[5], cospi16);
  temp2 = _mm_sub_epi32(temp2, x);
  temp2 = _mm_add_epi32(temp2, rnding);
  temp2 = _mm_srai_epi32(temp2, bit);
  v[4] = temp1;
  v[5] = temp2;

  temp1 = _mm_mullo_epi32(v[12], cospi16);
  x = _mm_mullo_epi32(v[13], cospi48);
  temp1 = _mm_add_epi32(temp1, x);
  temp1 = _mm_add_epi32(temp1, rnding);
  temp1 = _mm_srai_epi32(temp1, bit);

  temp2 = _mm_mullo_epi32(v[12], cospi48);
  x = _mm_mullo_epi32(v[13], cospi16);
  temp2 = _mm_sub_epi32(temp2, x);
  temp2 = _mm_add_epi32(temp2, rnding);
  temp2 = _mm_srai_epi32(temp2, bit);
  v[12] = temp1;
  v[13] = temp2;

  // stage 7
  v[2] = v[0];
  v[3] = v[1];
  v[6] = v[4];
  v[7] = v[5];
  v[10] = v[8];
  v[11] = v[9];
  v[14] = v[12];
  v[15] = v[13];

  // stage 8
  y = _mm_mullo_epi32(v[2], cospi32);
  x = _mm_mullo_epi32(v[3], cospi32);
  v[2] = _mm_add_epi32(y, x);
  v[2] = _mm_add_epi32(v[2], rnding);
  v[2] = _mm_srai_epi32(v[2], bit);

  v[3] = _mm_sub_epi32(y, x);
  v[3] = _mm_add_epi32(v[3], rnding);
  v[3] = _mm_srai_epi32(v[3], bit);

  y = _mm_mullo_epi32(v[6], cospi32);
  x = _mm_mullo_epi32(v[7], cospi32);
  v[6] = _mm_add_epi32(y, x);
  v[6] = _mm_add_epi32(v[6], rnding);
  v[6] = _mm_srai_epi32(v[6], bit);

  v[7] = _mm_sub_epi32(y, x);
  v[7] = _mm_add_epi32(v[7], rnding);
  v[7] = _mm_srai_epi32(v[7], bit);

  y = _mm_mullo_epi32(v[10], cospi32);
  x = _mm_mullo_epi32(v[11], cospi32);
  v[10] = _mm_add_epi32(y, x);
  v[10] = _mm_add_epi32(v[10], rnding);
  v[10] = _mm_srai_epi32(v[10], bit);

  v[11] = _mm_sub_epi32(y, x);
  v[11] = _mm_add_epi32(v[11], rnding);
  v[11] = _mm_srai_epi32(v[11], bit);

  y = _mm_mullo_epi32(v[14], cospi32);
  x = _mm_mullo_epi32(v[15], cospi32);
  v[14] = _mm_add_epi32(y, x);
  v[14] = _mm_add_epi32(v[14], rnding);
  v[14] = _mm_srai_epi32(v[14], bit);

  v[15] = _mm_sub_epi32(y, x);
  v[15] = _mm_add_epi32(v[15], rnding);
  v[15] = _mm_srai_epi32(v[15], bit);

  // stage 9
  if (do_cols) {
    out[0] = v[0];
    out[1] = _mm_sub_epi32(zero, v[8]);
    out[2] = v[12];
    out[3] = _mm_sub_epi32(zero, v[4]);
    out[4] = v[6];
    out[5] = _mm_sub_epi32(zero, v[14]);
    out[6] = v[10];
    out[7] = _mm_sub_epi32(zero, v[2]);
    out[8] = v[3];
    out[9] = _mm_sub_epi32(zero, v[11]);
    out[10] = v[15];
    out[11] = _mm_sub_epi32(zero, v[7]);
    out[12] = v[5];
    out[13] = _mm_sub_epi32(zero, v[13]);
    out[14] = v[9];
    out[15] = _mm_sub_epi32(zero, v[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
  }
}

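// iadst16 for inputs with only the first eight non-zero coefficient
// columns: the second term of every stage-2 product is zero and is
// dropped; stages 3 onward match the full iadst16 further below.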
static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
                                   int do_cols, int bd, int out_shift) {
  const int32_t *cospi = cospi_arr(bit);
  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
  __m128i zero = _mm_setzero_si128();
  __m128i u[16], x, y;

  // stage 0
  // stage 1
  // stage 2
  x = _mm_mullo_epi32(in[0], cospi62);
  u[0] = _mm_add_epi32(x, rnding);
  u[0] = _mm_srai_epi32(u[0], bit);

  x = _mm_mullo_epi32(in[0], cospi2);
  u[1] = _mm_sub_epi32(zero, x);
  u[1] = _mm_add_epi32(u[1], rnding);
  u[1] = _mm_srai_epi32(u[1], bit);

  x = _mm_mullo_epi32(in[2], cospi54);
  u[2] = _mm_add_epi32(x, rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  x = _mm_mullo_epi32(in[2], cospi10);
  u[3] = _mm_sub_epi32(zero, x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);

  x = _mm_mullo_epi32(in[4], cospi46);
  u[4] = _mm_add_epi32(x, rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  x = _mm_mullo_epi32(in[4], cospi18);
  u[5] = _mm_sub_epi32(zero, x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  x = _mm_mullo_epi32(in[6], cospi38);
  u[6] = _mm_add_epi32(x, rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  x = _mm_mullo_epi32(in[6], cospi26);
  u[7] = _mm_sub_epi32(zero, x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  u[8] = _mm_mullo_epi32(in[7], cospi34);
  u[8] = _mm_add_epi32(u[8], rnding);
  u[8] = _mm_srai_epi32(u[8], bit);

  u[9] = _mm_mullo_epi32(in[7], cospi30);
  u[9] = _mm_add_epi32(u[9], rnding);
  u[9] = _mm_srai_epi32(u[9], bit);

  u[10] = _mm_mullo_epi32(in[5], cospi42);
  u[10] = _mm_add_epi32(u[10], rnding);
  u[10] = _mm_srai_epi32(u[10], bit);

  u[11] = _mm_mullo_epi32(in[5], cospi22);
  u[11] = _mm_add_epi32(u[11], rnding);
  u[11] = _mm_srai_epi32(u[11], bit);

  u[12] = _mm_mullo_epi32(in[3], cospi50);
  u[12] = _mm_add_epi32(u[12], rnding);
  u[12] = _mm_srai_epi32(u[12], bit);

  u[13] = _mm_mullo_epi32(in[3], cospi14);
  u[13] = _mm_add_epi32(u[13], rnding);
  u[13] = _mm_srai_epi32(u[13], bit);

  u[14] = _mm_mullo_epi32(in[1], cospi58);
  u[14] = _mm_add_epi32(u[14], rnding);
  u[14] = _mm_srai_epi32(u[14], bit);

  u[15] = _mm_mullo_epi32(in[1], cospi6);
  u[15] = _mm_add_epi32(u[15], rnding);
  u[15] = _mm_srai_epi32(u[15], bit);

  // stage 3
  addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);

  // stage 4
  y = _mm_mullo_epi32(u[8], cospi56);
  x = _mm_mullo_epi32(u[9], cospi56);
  u[8] = _mm_mullo_epi32(u[8], cospi8);
  u[8] = _mm_add_epi32(u[8], x);
  u[8] = _mm_add_epi32(u[8], rnding);
  u[8] = _mm_srai_epi32(u[8], bit);

  x = _mm_mullo_epi32(u[9], cospi8);
  u[9] = _mm_sub_epi32(y, x);
  u[9] = _mm_add_epi32(u[9], rnding);
  u[9] = _mm_srai_epi32(u[9], bit);

  x = _mm_mullo_epi32(u[11], cospi24);
  y = _mm_mullo_epi32(u[10], cospi24);
  u[10] = _mm_mullo_epi32(u[10], cospi40);
  u[10] = _mm_add_epi32(u[10], x);
  u[10] = _mm_add_epi32(u[10], rnding);
  u[10] = _mm_srai_epi32(u[10], bit);

  x = _mm_mullo_epi32(u[11], cospi40);
  u[11] = _mm_sub_epi32(y, x);
  u[11] = _mm_add_epi32(u[11], rnding);
  u[11] = _mm_srai_epi32(u[11], bit);

  x = _mm_mullo_epi32(u[13], cospi8);
  y = _mm_mullo_epi32(u[12], cospi8);
  u[12] = _mm_mullo_epi32(u[12], cospim56);
  u[12] = _mm_add_epi32(u[12], x);
  u[12] = _mm_add_epi32(u[12], rnding);
  u[12] = _mm_srai_epi32(u[12], bit);

  x = _mm_mullo_epi32(u[13], cospim56);
  u[13] = _mm_sub_epi32(y, x);
  u[13] = _mm_add_epi32(u[13], rnding);
  u[13] = _mm_srai_epi32(u[13], bit);

  x = _mm_mullo_epi32(u[15], cospi40);
  y = _mm_mullo_epi32(u[14], cospi40);
  u[14] = _mm_mullo_epi32(u[14], cospim24);
  u[14] = _mm_add_epi32(u[14], x);
  u[14] = _mm_add_epi32(u[14], rnding);
  u[14] = _mm_srai_epi32(u[14], bit);

  x = _mm_mullo_epi32(u[15], cospim24);
  u[15] = _mm_sub_epi32(y, x);
  u[15] = _mm_add_epi32(u[15], rnding);
  u[15] = _mm_srai_epi32(u[15], bit);

  // stage 5
  addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);

  // stage 6
  x = _mm_mullo_epi32(u[5], cospi48);
  y = _mm_mullo_epi32(u[4], cospi48);
  u[4] = _mm_mullo_epi32(u[4], cospi16);
  u[4] = _mm_add_epi32(u[4], x);
  u[4] = _mm_add_epi32(u[4], rnding);
  u[4] = _mm_srai_epi32(u[4], bit);

  x = _mm_mullo_epi32(u[5], cospi16);
  u[5] = _mm_sub_epi32(y, x);
  u[5] = _mm_add_epi32(u[5], rnding);
  u[5] = _mm_srai_epi32(u[5], bit);

  x = _mm_mullo_epi32(u[7], cospi16);
  y = _mm_mullo_epi32(u[6], cospi16);
  u[6] = _mm_mullo_epi32(u[6], cospim48);
  u[6] = _mm_add_epi32(u[6], x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  x = _mm_mullo_epi32(u[7], cospim48);
  u[7] = _mm_sub_epi32(y, x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  x = _mm_mullo_epi32(u[13], cospi48);
  y = _mm_mullo_epi32(u[12], cospi48);
  u[12] = _mm_mullo_epi32(u[12], cospi16);
  u[12] = _mm_add_epi32(u[12], x);
  u[12] = _mm_add_epi32(u[12], rnding);
  u[12] = _mm_srai_epi32(u[12], bit);

  x = _mm_mullo_epi32(u[13], cospi16);
  u[13] = _mm_sub_epi32(y, x);
  u[13] = _mm_add_epi32(u[13], rnding);
  u[13] = _mm_srai_epi32(u[13], bit);

  x = _mm_mullo_epi32(u[15], cospi16);
  y = _mm_mullo_epi32(u[14], cospi16);
  u[14] = _mm_mullo_epi32(u[14], cospim48);
  u[14] = _mm_add_epi32(u[14], x);
  u[14] = _mm_add_epi32(u[14], rnding);
  u[14] = _mm_srai_epi32(u[14], bit);

  x = _mm_mullo_epi32(u[15], cospim48);
  u[15] = _mm_sub_epi32(y, x);
  u[15] = _mm_add_epi32(u[15], rnding);
  u[15] = _mm_srai_epi32(u[15], bit);

  // stage 7
  addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
  addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);

  // stage 8
  y = _mm_mullo_epi32(u[2], cospi32);
  x = _mm_mullo_epi32(u[3], cospi32);
  u[2] = _mm_add_epi32(y, x);
  u[2] = _mm_add_epi32(u[2], rnding);
  u[2] = _mm_srai_epi32(u[2], bit);

  u[3] = _mm_sub_epi32(y, x);
  u[3] = _mm_add_epi32(u[3], rnding);
  u[3] = _mm_srai_epi32(u[3], bit);
  y = _mm_mullo_epi32(u[6], cospi32);
  x = _mm_mullo_epi32(u[7], cospi32);
  u[6] = _mm_add_epi32(y, x);
  u[6] = _mm_add_epi32(u[6], rnding);
  u[6] = _mm_srai_epi32(u[6], bit);

  u[7] = _mm_sub_epi32(y, x);
  u[7] = _mm_add_epi32(u[7], rnding);
  u[7] = _mm_srai_epi32(u[7], bit);

  y = _mm_mullo_epi32(u[10], cospi32);
  x = _mm_mullo_epi32(u[11], cospi32);
  u[10] = _mm_add_epi32(y, x);
  u[10] = _mm_add_epi32(u[10], rnding);
  u[10] = _mm_srai_epi32(u[10], bit);

  u[11] = _mm_sub_epi32(y, x);
  u[11] = _mm_add_epi32(u[11], rnding);
  u[11] = _mm_srai_epi32(u[11], bit);

  y = _mm_mullo_epi32(u[14], cospi32);
  x = _mm_mullo_epi32(u[15], cospi32);
  u[14] = _mm_add_epi32(y, x);
  u[14] = _mm_add_epi32(u[14], rnding);
  u[14] = _mm_srai_epi32(u[14], bit);

  u[15] = _mm_sub_epi32(y, x);
  u[15] = _mm_add_epi32(u[15], rnding);
  u[15] = _mm_srai_epi32(u[15], bit);

  // stage 9
  if (do_cols) {
    out[0] = u[0];
    out[1] = _mm_sub_epi32(zero, u[8]);
    out[2] = u[12];
    out[3] = _mm_sub_epi32(zero, u[4]);
    out[4] = u[6];
    out[5] = _mm_sub_epi32(zero, u[14]);
    out[6] = u[10];
    out[7] = _mm_sub_epi32(zero, u[2]);
    out[8] = u[3];
    out[9] = _mm_sub_epi32(zero, u[11]);
    out[10] = u[15];
    out[11] = _mm_sub_epi32(zero, u[7]);
    out[12] = u[5];
    out[13] = _mm_sub_epi32(zero, u[13]);
    out[14] = u[9];
    out[15] = _mm_sub_epi32(zero, u[1]);
  } else {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);

    neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
                     out_shift);
    neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
    neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
                     &clamp_hi_out, out_shift);
  }
}

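// Full 16-point inverse DCT. Coefficients are consumed in bit-reversed
// order (in[0], in[8], in[4], in[12], ...), run through the stage 2-7
// butterfly network, and the final stage writes the symmetric
// sums/differences directly into `out`.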
idct16x16_sse4_1(__m128i * in,__m128i * out,int bit,int do_cols,int bd,int out_shift)2566 static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
2567                              int bd, int out_shift) {
2568   const int32_t *cospi = cospi_arr(bit);
2569   const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
2570   const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
2571   const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
2572   const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
2573   const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
2574   const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
2575   const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
2576   const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
2577   const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
2578   const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
2579   const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
2580   const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
2581   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
2582   const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
2583   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
2584   const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
2585   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
2586   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
2587   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
2588   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
2589   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
2590   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
2591   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
2592   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
2593   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2594   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
2595   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
2596   __m128i u[16], v[16], x, y;
2597 
2598   {
2599     // stage 0
2600     // stage 1
2601     u[0] = in[0];
2602     u[1] = in[8];
2603     u[2] = in[4];
2604     u[3] = in[12];
2605     u[4] = in[2];
2606     u[5] = in[10];
2607     u[6] = in[6];
2608     u[7] = in[14];
2609     u[8] = in[1];
2610     u[9] = in[9];
2611     u[10] = in[5];
2612     u[11] = in[13];
2613     u[12] = in[3];
2614     u[13] = in[11];
2615     u[14] = in[7];
2616     u[15] = in[15];
2617 
2618     // stage 2
2619     v[0] = u[0];
2620     v[1] = u[1];
2621     v[2] = u[2];
2622     v[3] = u[3];
2623     v[4] = u[4];
2624     v[5] = u[5];
2625     v[6] = u[6];
2626     v[7] = u[7];
2627 
2628     v[8] = half_btf_sse4_1(&cospi60, &u[8], &cospim4, &u[15], &rnding, bit);
2629     v[9] = half_btf_sse4_1(&cospi28, &u[9], &cospim36, &u[14], &rnding, bit);
2630     v[10] = half_btf_sse4_1(&cospi44, &u[10], &cospim20, &u[13], &rnding, bit);
2631     v[11] = half_btf_sse4_1(&cospi12, &u[11], &cospim52, &u[12], &rnding, bit);
2632     v[12] = half_btf_sse4_1(&cospi52, &u[11], &cospi12, &u[12], &rnding, bit);
2633     v[13] = half_btf_sse4_1(&cospi20, &u[10], &cospi44, &u[13], &rnding, bit);
2634     v[14] = half_btf_sse4_1(&cospi36, &u[9], &cospi28, &u[14], &rnding, bit);
2635     v[15] = half_btf_sse4_1(&cospi4, &u[8], &cospi60, &u[15], &rnding, bit);
2636 
2637     // stage 3
2638     u[0] = v[0];
2639     u[1] = v[1];
2640     u[2] = v[2];
2641     u[3] = v[3];
2642     u[4] = half_btf_sse4_1(&cospi56, &v[4], &cospim8, &v[7], &rnding, bit);
2643     u[5] = half_btf_sse4_1(&cospi24, &v[5], &cospim40, &v[6], &rnding, bit);
2644     u[6] = half_btf_sse4_1(&cospi40, &v[5], &cospi24, &v[6], &rnding, bit);
2645     u[7] = half_btf_sse4_1(&cospi8, &v[4], &cospi56, &v[7], &rnding, bit);
2646     addsub_sse4_1(v[8], v[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
2647     addsub_sse4_1(v[11], v[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
2648     addsub_sse4_1(v[12], v[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
2649     addsub_sse4_1(v[15], v[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
2650 
2651     // stage 4
2652     x = _mm_mullo_epi32(u[0], cospi32);
2653     y = _mm_mullo_epi32(u[1], cospi32);
2654     v[0] = _mm_add_epi32(x, y);
2655     v[0] = _mm_add_epi32(v[0], rnding);
2656     v[0] = _mm_srai_epi32(v[0], bit);
2657 
2658     v[1] = _mm_sub_epi32(x, y);
2659     v[1] = _mm_add_epi32(v[1], rnding);
2660     v[1] = _mm_srai_epi32(v[1], bit);
2661 
2662     v[2] = half_btf_sse4_1(&cospi48, &u[2], &cospim16, &u[3], &rnding, bit);
2663     v[3] = half_btf_sse4_1(&cospi16, &u[2], &cospi48, &u[3], &rnding, bit);
2664     addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
2665     addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
2666     v[8] = u[8];
2667     v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
2668     v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
2669     v[11] = u[11];
2670     v[12] = u[12];
2671     v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
2672     v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
2673     v[15] = u[15];
2674 
2675     // stage 5
2676     addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
2677     addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
2678     u[4] = v[4];
2679 
2680     x = _mm_mullo_epi32(v[5], cospi32);
2681     y = _mm_mullo_epi32(v[6], cospi32);
2682     u[5] = _mm_sub_epi32(y, x);
2683     u[5] = _mm_add_epi32(u[5], rnding);
2684     u[5] = _mm_srai_epi32(u[5], bit);
2685 
2686     u[6] = _mm_add_epi32(y, x);
2687     u[6] = _mm_add_epi32(u[6], rnding);
2688     u[6] = _mm_srai_epi32(u[6], bit);
2689 
2690     u[7] = v[7];
2691     addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
2692     addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
2693     addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
2694     addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
2695 
2696     // stage 6
2697     addsub_sse4_1(u[0], u[7], &v[0], &v[7], &clamp_lo, &clamp_hi);
2698     addsub_sse4_1(u[1], u[6], &v[1], &v[6], &clamp_lo, &clamp_hi);
2699     addsub_sse4_1(u[2], u[5], &v[2], &v[5], &clamp_lo, &clamp_hi);
2700     addsub_sse4_1(u[3], u[4], &v[3], &v[4], &clamp_lo, &clamp_hi);
2701     v[8] = u[8];
2702     v[9] = u[9];
2703 
2704     x = _mm_mullo_epi32(u[10], cospi32);
2705     y = _mm_mullo_epi32(u[13], cospi32);
2706     v[10] = _mm_sub_epi32(y, x);
2707     v[10] = _mm_add_epi32(v[10], rnding);
2708     v[10] = _mm_srai_epi32(v[10], bit);
2709 
2710     v[13] = _mm_add_epi32(x, y);
2711     v[13] = _mm_add_epi32(v[13], rnding);
2712     v[13] = _mm_srai_epi32(v[13], bit);
2713 
2714     x = _mm_mullo_epi32(u[11], cospi32);
2715     y = _mm_mullo_epi32(u[12], cospi32);
2716     v[11] = _mm_sub_epi32(y, x);
2717     v[11] = _mm_add_epi32(v[11], rnding);
2718     v[11] = _mm_srai_epi32(v[11], bit);
2719 
2720     v[12] = _mm_add_epi32(x, y);
2721     v[12] = _mm_add_epi32(v[12], rnding);
2722     v[12] = _mm_srai_epi32(v[12], bit);
2723 
2724     v[14] = u[14];
2725     v[15] = u[15];
2726 
2727     // stage 7
2728     addsub_sse4_1(v[0], v[15], out + 0, out + 15, &clamp_lo, &clamp_hi);
2729     addsub_sse4_1(v[1], v[14], out + 1, out + 14, &clamp_lo, &clamp_hi);
2730     addsub_sse4_1(v[2], v[13], out + 2, out + 13, &clamp_lo, &clamp_hi);
2731     addsub_sse4_1(v[3], v[12], out + 3, out + 12, &clamp_lo, &clamp_hi);
2732     addsub_sse4_1(v[4], v[11], out + 4, out + 11, &clamp_lo, &clamp_hi);
2733     addsub_sse4_1(v[5], v[10], out + 5, out + 10, &clamp_lo, &clamp_hi);
2734     addsub_sse4_1(v[6], v[9], out + 6, out + 9, &clamp_lo, &clamp_hi);
2735     addsub_sse4_1(v[7], v[8], out + 7, out + 8, &clamp_lo, &clamp_hi);
2736 
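    // Row pass only (!do_cols): round by out_shift and clamp to the
    // bd + 6 bit (at least 16 bit) intermediate range expected by the
    // subsequent column pass.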
2737     if (!do_cols) {
2738       const int log_range_out = AOMMAX(16, bd + 6);
2739       const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
2740       const __m128i clamp_hi_out =
2741           _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
2742       round_shift_8x8(out, out_shift);
2743       highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 16);
2744     }
2745   }
2746 }
2747 
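// 16-point inverse ADST on four columns at a time. Every rotation below is
// a pair of multiplies followed by a rounded shift, and addsub_sse4_1()
// returns the clamped sum and difference of its two inputs. As a reference,
// a scalar model of the butterfly used throughout this file (illustrative
// sketch only; half_btf_model is not an AV1 helper):
static inline int32_t half_btf_model(int32_t w0, int32_t x0, int32_t w1,
                                     int32_t x1, int bit) {
  // Widen to 64 bits, apply the rounding offset, then shift back down.
  const int64_t t = (int64_t)w0 * x0 + (int64_t)w1 * x1;
  return (int32_t)((t + ((int64_t)1 << (bit - 1))) >> bit);
}
// The SSE4.1 code below keeps the products in 32 bits (_mm_mullo_epi32),
// relying on the per-stage clamps to keep intermediates in range.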
2748 static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
2749                               int bd, int out_shift) {
2750   const int32_t *cospi = cospi_arr(bit);
2751   const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
2752   const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
2753   const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
2754   const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
2755   const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
2756   const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
2757   const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
2758   const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
2759   const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
2760   const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
2761   const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
2762   const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
2763   const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
2764   const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
2765   const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
2766   const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
2767   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
2768   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
2769   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
2770   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
2771   const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
2772   const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
2773   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
2774   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
2775   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
2776   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
2777   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
2778   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
2779   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
2780   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
2781   const __m128i zero = _mm_setzero_si128();
2782   __m128i u[16], v[16], x, y;
2783   // Calculate columns 0, 1, 2, 3.
2784   // stage 0
2785   // stage 1
2786   // stage 2
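  // Stage 2 pairs in[15 - 2k] with in[2k] for k = 0..7:
  //   v[2k]     = round_shift(cospi[8k + 2] * in[15 - 2k]
  //                           + cospi[62 - 8k] * in[2k], bit)
  //   v[2k + 1] = round_shift(cospi[62 - 8k] * in[15 - 2k]
  //                           - cospi[8k + 2] * in[2k], bit)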
2787   v[0] = _mm_mullo_epi32(in[15], cospi2);
2788   x = _mm_mullo_epi32(in[0], cospi62);
2789   v[0] = _mm_add_epi32(v[0], x);
2790   v[0] = _mm_add_epi32(v[0], rnding);
2791   v[0] = _mm_srai_epi32(v[0], bit);
2792 
2793   v[1] = _mm_mullo_epi32(in[15], cospi62);
2794   x = _mm_mullo_epi32(in[0], cospi2);
2795   v[1] = _mm_sub_epi32(v[1], x);
2796   v[1] = _mm_add_epi32(v[1], rnding);
2797   v[1] = _mm_srai_epi32(v[1], bit);
2798 
2799   v[2] = _mm_mullo_epi32(in[13], cospi10);
2800   x = _mm_mullo_epi32(in[2], cospi54);
2801   v[2] = _mm_add_epi32(v[2], x);
2802   v[2] = _mm_add_epi32(v[2], rnding);
2803   v[2] = _mm_srai_epi32(v[2], bit);
2804 
2805   v[3] = _mm_mullo_epi32(in[13], cospi54);
2806   x = _mm_mullo_epi32(in[2], cospi10);
2807   v[3] = _mm_sub_epi32(v[3], x);
2808   v[3] = _mm_add_epi32(v[3], rnding);
2809   v[3] = _mm_srai_epi32(v[3], bit);
2810 
2811   v[4] = _mm_mullo_epi32(in[11], cospi18);
2812   x = _mm_mullo_epi32(in[4], cospi46);
2813   v[4] = _mm_add_epi32(v[4], x);
2814   v[4] = _mm_add_epi32(v[4], rnding);
2815   v[4] = _mm_srai_epi32(v[4], bit);
2816 
2817   v[5] = _mm_mullo_epi32(in[11], cospi46);
2818   x = _mm_mullo_epi32(in[4], cospi18);
2819   v[5] = _mm_sub_epi32(v[5], x);
2820   v[5] = _mm_add_epi32(v[5], rnding);
2821   v[5] = _mm_srai_epi32(v[5], bit);
2822 
2823   v[6] = _mm_mullo_epi32(in[9], cospi26);
2824   x = _mm_mullo_epi32(in[6], cospi38);
2825   v[6] = _mm_add_epi32(v[6], x);
2826   v[6] = _mm_add_epi32(v[6], rnding);
2827   v[6] = _mm_srai_epi32(v[6], bit);
2828 
2829   v[7] = _mm_mullo_epi32(in[9], cospi38);
2830   x = _mm_mullo_epi32(in[6], cospi26);
2831   v[7] = _mm_sub_epi32(v[7], x);
2832   v[7] = _mm_add_epi32(v[7], rnding);
2833   v[7] = _mm_srai_epi32(v[7], bit);
2834 
2835   v[8] = _mm_mullo_epi32(in[7], cospi34);
2836   x = _mm_mullo_epi32(in[8], cospi30);
2837   v[8] = _mm_add_epi32(v[8], x);
2838   v[8] = _mm_add_epi32(v[8], rnding);
2839   v[8] = _mm_srai_epi32(v[8], bit);
2840 
2841   v[9] = _mm_mullo_epi32(in[7], cospi30);
2842   x = _mm_mullo_epi32(in[8], cospi34);
2843   v[9] = _mm_sub_epi32(v[9], x);
2844   v[9] = _mm_add_epi32(v[9], rnding);
2845   v[9] = _mm_srai_epi32(v[9], bit);
2846 
2847   v[10] = _mm_mullo_epi32(in[5], cospi42);
2848   x = _mm_mullo_epi32(in[10], cospi22);
2849   v[10] = _mm_add_epi32(v[10], x);
2850   v[10] = _mm_add_epi32(v[10], rnding);
2851   v[10] = _mm_srai_epi32(v[10], bit);
2852 
2853   v[11] = _mm_mullo_epi32(in[5], cospi22);
2854   x = _mm_mullo_epi32(in[10], cospi42);
2855   v[11] = _mm_sub_epi32(v[11], x);
2856   v[11] = _mm_add_epi32(v[11], rnding);
2857   v[11] = _mm_srai_epi32(v[11], bit);
2858 
2859   v[12] = _mm_mullo_epi32(in[3], cospi50);
2860   x = _mm_mullo_epi32(in[12], cospi14);
2861   v[12] = _mm_add_epi32(v[12], x);
2862   v[12] = _mm_add_epi32(v[12], rnding);
2863   v[12] = _mm_srai_epi32(v[12], bit);
2864 
2865   v[13] = _mm_mullo_epi32(in[3], cospi14);
2866   x = _mm_mullo_epi32(in[12], cospi50);
2867   v[13] = _mm_sub_epi32(v[13], x);
2868   v[13] = _mm_add_epi32(v[13], rnding);
2869   v[13] = _mm_srai_epi32(v[13], bit);
2870 
2871   v[14] = _mm_mullo_epi32(in[1], cospi58);
2872   x = _mm_mullo_epi32(in[14], cospi6);
2873   v[14] = _mm_add_epi32(v[14], x);
2874   v[14] = _mm_add_epi32(v[14], rnding);
2875   v[14] = _mm_srai_epi32(v[14], bit);
2876 
2877   v[15] = _mm_mullo_epi32(in[1], cospi6);
2878   x = _mm_mullo_epi32(in[14], cospi58);
2879   v[15] = _mm_sub_epi32(v[15], x);
2880   v[15] = _mm_add_epi32(v[15], rnding);
2881   v[15] = _mm_srai_epi32(v[15], bit);
2882 
2883   // stage 3
2884   addsub_sse4_1(v[0], v[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
2885   addsub_sse4_1(v[1], v[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
2886   addsub_sse4_1(v[2], v[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
2887   addsub_sse4_1(v[3], v[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
2888   addsub_sse4_1(v[4], v[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
2889   addsub_sse4_1(v[5], v[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
2890   addsub_sse4_1(v[6], v[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
2891   addsub_sse4_1(v[7], v[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
2892 
2893   // stage 4
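  // u[0..7] pass through; the pairs (u[8], u[9]) ... (u[14], u[15]) are
  // rotated with the weight pairs (cospi8, cospi56), (cospi40, cospi24),
  // (-cospi56, cospi8) and (-cospi24, cospi40).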
2894   v[0] = u[0];
2895   v[1] = u[1];
2896   v[2] = u[2];
2897   v[3] = u[3];
2898   v[4] = u[4];
2899   v[5] = u[5];
2900   v[6] = u[6];
2901   v[7] = u[7];
2902 
2903   v[8] = _mm_mullo_epi32(u[8], cospi8);
2904   x = _mm_mullo_epi32(u[9], cospi56);
2905   v[8] = _mm_add_epi32(v[8], x);
2906   v[8] = _mm_add_epi32(v[8], rnding);
2907   v[8] = _mm_srai_epi32(v[8], bit);
2908 
2909   v[9] = _mm_mullo_epi32(u[8], cospi56);
2910   x = _mm_mullo_epi32(u[9], cospi8);
2911   v[9] = _mm_sub_epi32(v[9], x);
2912   v[9] = _mm_add_epi32(v[9], rnding);
2913   v[9] = _mm_srai_epi32(v[9], bit);
2914 
2915   v[10] = _mm_mullo_epi32(u[10], cospi40);
2916   x = _mm_mullo_epi32(u[11], cospi24);
2917   v[10] = _mm_add_epi32(v[10], x);
2918   v[10] = _mm_add_epi32(v[10], rnding);
2919   v[10] = _mm_srai_epi32(v[10], bit);
2920 
2921   v[11] = _mm_mullo_epi32(u[10], cospi24);
2922   x = _mm_mullo_epi32(u[11], cospi40);
2923   v[11] = _mm_sub_epi32(v[11], x);
2924   v[11] = _mm_add_epi32(v[11], rnding);
2925   v[11] = _mm_srai_epi32(v[11], bit);
2926 
2927   v[12] = _mm_mullo_epi32(u[12], cospim56);
2928   x = _mm_mullo_epi32(u[13], cospi8);
2929   v[12] = _mm_add_epi32(v[12], x);
2930   v[12] = _mm_add_epi32(v[12], rnding);
2931   v[12] = _mm_srai_epi32(v[12], bit);
2932 
2933   v[13] = _mm_mullo_epi32(u[12], cospi8);
2934   x = _mm_mullo_epi32(u[13], cospim56);
2935   v[13] = _mm_sub_epi32(v[13], x);
2936   v[13] = _mm_add_epi32(v[13], rnding);
2937   v[13] = _mm_srai_epi32(v[13], bit);
2938 
2939   v[14] = _mm_mullo_epi32(u[14], cospim24);
2940   x = _mm_mullo_epi32(u[15], cospi40);
2941   v[14] = _mm_add_epi32(v[14], x);
2942   v[14] = _mm_add_epi32(v[14], rnding);
2943   v[14] = _mm_srai_epi32(v[14], bit);
2944 
2945   v[15] = _mm_mullo_epi32(u[14], cospi40);
2946   x = _mm_mullo_epi32(u[15], cospim24);
2947   v[15] = _mm_sub_epi32(v[15], x);
2948   v[15] = _mm_add_epi32(v[15], rnding);
2949   v[15] = _mm_srai_epi32(v[15], bit);
2950 
2951   // stage 5
2952   addsub_sse4_1(v[0], v[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
2953   addsub_sse4_1(v[1], v[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
2954   addsub_sse4_1(v[2], v[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
2955   addsub_sse4_1(v[3], v[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
2956   addsub_sse4_1(v[8], v[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
2957   addsub_sse4_1(v[9], v[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
2958   addsub_sse4_1(v[10], v[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
2959   addsub_sse4_1(v[11], v[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
2960 
2961   // stage 6
2962   v[0] = u[0];
2963   v[1] = u[1];
2964   v[2] = u[2];
2965   v[3] = u[3];
2966 
2967   v[4] = _mm_mullo_epi32(u[4], cospi16);
2968   x = _mm_mullo_epi32(u[5], cospi48);
2969   v[4] = _mm_add_epi32(v[4], x);
2970   v[4] = _mm_add_epi32(v[4], rnding);
2971   v[4] = _mm_srai_epi32(v[4], bit);
2972 
2973   v[5] = _mm_mullo_epi32(u[4], cospi48);
2974   x = _mm_mullo_epi32(u[5], cospi16);
2975   v[5] = _mm_sub_epi32(v[5], x);
2976   v[5] = _mm_add_epi32(v[5], rnding);
2977   v[5] = _mm_srai_epi32(v[5], bit);
2978 
2979   v[6] = _mm_mullo_epi32(u[6], cospim48);
2980   x = _mm_mullo_epi32(u[7], cospi16);
2981   v[6] = _mm_add_epi32(v[6], x);
2982   v[6] = _mm_add_epi32(v[6], rnding);
2983   v[6] = _mm_srai_epi32(v[6], bit);
2984 
2985   v[7] = _mm_mullo_epi32(u[6], cospi16);
2986   x = _mm_mullo_epi32(u[7], cospim48);
2987   v[7] = _mm_sub_epi32(v[7], x);
2988   v[7] = _mm_add_epi32(v[7], rnding);
2989   v[7] = _mm_srai_epi32(v[7], bit);
2990 
2991   v[8] = u[8];
2992   v[9] = u[9];
2993   v[10] = u[10];
2994   v[11] = u[11];
2995 
2996   v[12] = _mm_mullo_epi32(u[12], cospi16);
2997   x = _mm_mullo_epi32(u[13], cospi48);
2998   v[12] = _mm_add_epi32(v[12], x);
2999   v[12] = _mm_add_epi32(v[12], rnding);
3000   v[12] = _mm_srai_epi32(v[12], bit);
3001 
3002   v[13] = _mm_mullo_epi32(u[12], cospi48);
3003   x = _mm_mullo_epi32(u[13], cospi16);
3004   v[13] = _mm_sub_epi32(v[13], x);
3005   v[13] = _mm_add_epi32(v[13], rnding);
3006   v[13] = _mm_srai_epi32(v[13], bit);
3007 
3008   v[14] = _mm_mullo_epi32(u[14], cospim48);
3009   x = _mm_mullo_epi32(u[15], cospi16);
3010   v[14] = _mm_add_epi32(v[14], x);
3011   v[14] = _mm_add_epi32(v[14], rnding);
3012   v[14] = _mm_srai_epi32(v[14], bit);
3013 
3014   v[15] = _mm_mullo_epi32(u[14], cospi16);
3015   x = _mm_mullo_epi32(u[15], cospim48);
3016   v[15] = _mm_sub_epi32(v[15], x);
3017   v[15] = _mm_add_epi32(v[15], rnding);
3018   v[15] = _mm_srai_epi32(v[15], bit);
3019 
3020   // stage 7
3021   addsub_sse4_1(v[0], v[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
3022   addsub_sse4_1(v[1], v[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
3023   addsub_sse4_1(v[4], v[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
3024   addsub_sse4_1(v[5], v[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
3025   addsub_sse4_1(v[8], v[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
3026   addsub_sse4_1(v[9], v[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
3027   addsub_sse4_1(v[12], v[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
3028   addsub_sse4_1(v[13], v[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
3029 
3030   // stage 8
3031   v[0] = u[0];
3032   v[1] = u[1];
3033 
3034   y = _mm_mullo_epi32(u[2], cospi32);
3035   x = _mm_mullo_epi32(u[3], cospi32);
3036   v[2] = _mm_add_epi32(y, x);
3037   v[2] = _mm_add_epi32(v[2], rnding);
3038   v[2] = _mm_srai_epi32(v[2], bit);
3039 
3040   v[3] = _mm_sub_epi32(y, x);
3041   v[3] = _mm_add_epi32(v[3], rnding);
3042   v[3] = _mm_srai_epi32(v[3], bit);
3043 
3044   v[4] = u[4];
3045   v[5] = u[5];
3046 
3047   y = _mm_mullo_epi32(u[6], cospi32);
3048   x = _mm_mullo_epi32(u[7], cospi32);
3049   v[6] = _mm_add_epi32(y, x);
3050   v[6] = _mm_add_epi32(v[6], rnding);
3051   v[6] = _mm_srai_epi32(v[6], bit);
3052 
3053   v[7] = _mm_sub_epi32(y, x);
3054   v[7] = _mm_add_epi32(v[7], rnding);
3055   v[7] = _mm_srai_epi32(v[7], bit);
3056 
3057   v[8] = u[8];
3058   v[9] = u[9];
3059 
3060   y = _mm_mullo_epi32(u[10], cospi32);
3061   x = _mm_mullo_epi32(u[11], cospi32);
3062   v[10] = _mm_add_epi32(y, x);
3063   v[10] = _mm_add_epi32(v[10], rnding);
3064   v[10] = _mm_srai_epi32(v[10], bit);
3065 
3066   v[11] = _mm_sub_epi32(y, x);
3067   v[11] = _mm_add_epi32(v[11], rnding);
3068   v[11] = _mm_srai_epi32(v[11], bit);
3069 
3070   v[12] = u[12];
3071   v[13] = u[13];
3072 
3073   y = _mm_mullo_epi32(u[14], cospi32);
3074   x = _mm_mullo_epi32(u[15], cospi32);
3075   v[14] = _mm_add_epi32(y, x);
3076   v[14] = _mm_add_epi32(v[14], rnding);
3077   v[14] = _mm_srai_epi32(v[14], bit);
3078 
3079   v[15] = _mm_sub_epi32(y, x);
3080   v[15] = _mm_add_epi32(v[15], rnding);
3081   v[15] = _mm_srai_epi32(v[15], bit);
3082 
3083   // stage 9
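  // The ADST outputs are permuted with alternating signs: even output
  // slots copy a butterfly result, odd slots negate one. In the column
  // pass the negation is an explicit subtraction from zero; in the row
  // pass it is folded into the final rounding by neg_shift_sse4_1(),
  // which per 32-bit lane returns clamp((in0 + offset) >> shift) and
  // clamp((offset - in1) >> shift) with offset = (1 << shift) >> 1.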
3084   if (do_cols) {
3085     out[0] = v[0];
3086     out[1] = _mm_sub_epi32(zero, v[8]);
3087     out[2] = v[12];
3088     out[3] = _mm_sub_epi32(zero, v[4]);
3089     out[4] = v[6];
3090     out[5] = _mm_sub_epi32(zero, v[14]);
3091     out[6] = v[10];
3092     out[7] = _mm_sub_epi32(zero, v[2]);
3093     out[8] = v[3];
3094     out[9] = _mm_sub_epi32(zero, v[11]);
3095     out[10] = v[15];
3096     out[11] = _mm_sub_epi32(zero, v[7]);
3097     out[12] = v[5];
3098     out[13] = _mm_sub_epi32(zero, v[13]);
3099     out[14] = v[9];
3100     out[15] = _mm_sub_epi32(zero, v[1]);
3101   } else {
3102     const int log_range_out = AOMMAX(16, bd + 6);
3103     const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
3104     const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
3105 
3106     neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
3107                      out_shift);
3108     neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
3109                      &clamp_hi_out, out_shift);
3110     neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
3111                      &clamp_hi_out, out_shift);
3112     neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
3113                      &clamp_hi_out, out_shift);
3114     neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
3115                      &clamp_hi_out, out_shift);
3116     neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
3117                      &clamp_hi_out, out_shift);
3118     neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
3119                      &clamp_hi_out, out_shift);
3120     neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
3121                      &clamp_hi_out, out_shift);
3122   }
3123 }
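// 16-point identity transform: every coefficient is scaled by 2 * sqrt(2)
// in Q(NewSqrt2Bits) fixed point. _mm_mul_epi32 multiplies only the even
// 32-bit lanes into 64-bit products, so each vector is handled in two
// halves (the second shifted down by 4 bytes) and the results are
// re-interleaved. Scalar equivalent per coefficient:
//   out = (int32_t)(((int64_t)in * 2 * NewSqrt2 +
//                    (1 << (NewSqrt2Bits - 1))) >> NewSqrt2Bits);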
3124 static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
3125                                int bd, int out_shift) {
3126   (void)bit;
3127   __m128i fact = _mm_set1_epi32(2 * NewSqrt2);
3128   __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
3129   __m128i a0_low, a0_high, a1_low, a1_high;
3130   __m128i zero = _mm_setzero_si128();
3131   offset = _mm_unpacklo_epi32(offset, zero);
3132 
3133   for (int i = 0; i < 16; i++) {
3134     a0_low = _mm_mul_epi32(in[i], fact);
3135     a0_low = _mm_add_epi32(a0_low, offset);
3136     a0_low = _mm_srli_epi64(a0_low, NewSqrt2Bits);
3137 
3138     a0_high = _mm_srli_si128(in[i], 4);
3139     a0_high = _mm_mul_epi32(a0_high, fact);
3140     a0_high = _mm_add_epi32(a0_high, offset);
3141     a0_high = _mm_srli_epi64(a0_high, NewSqrt2Bits);
3142 
3143     a1_low = _mm_unpacklo_epi32(a0_low, a0_high);
3144     a1_high = _mm_unpackhi_epi32(a0_low, a0_high);
3145     out[i] = _mm_unpacklo_epi64(a1_low, a1_high);
3146   }
3147 
3148   if (!do_cols) {
3149     const int log_range = AOMMAX(16, bd + 6);
3150     const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3151     const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3152     round_shift_8x8(out, out_shift);
3153     highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 16);
3154   }
3155 }
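// Helpers for stages 8-11 of the 64-point IDCT, shared by all of the
// idct64x64 variants below. The XOR index arithmetic pairs butterfly
// partners within a 16-element group: for i = 16..19, i ^ 7 walks 23..20
// (lower half) while i ^ 15 and i ^ 8 walk 31..28 and 24..27 (upper half).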
3156 static inline void idct64_stage8_sse4_1(
3157     __m128i *u, const __m128i *cospim32, const __m128i *cospi32,
3158     const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
3159     const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
3160     const __m128i *rnding, int bit) {
3161   int i;
3162   __m128i temp1, temp2, temp3, temp4;
3163   temp1 = half_btf_sse4_1(cospim32, &u[10], cospi32, &u[13], rnding, bit);
3164   u[13] = half_btf_sse4_1(cospi32, &u[10], cospi32, &u[13], rnding, bit);
3165   u[10] = temp1;
3166   temp2 = half_btf_sse4_1(cospim32, &u[11], cospi32, &u[12], rnding, bit);
3167   u[12] = half_btf_sse4_1(cospi32, &u[11], cospi32, &u[12], rnding, bit);
3168   u[11] = temp2;
3169 
3170   for (i = 16; i < 20; ++i) {
3171     addsub_sse4_1(u[i], u[i ^ 7], &u[i], &u[i ^ 7], clamp_lo, clamp_hi);
3172     addsub_sse4_1(u[i ^ 15], u[i ^ 8], &u[i ^ 15], &u[i ^ 8], clamp_lo,
3173                   clamp_hi);
3174   }
3175 
3176   temp1 = half_btf_sse4_1(cospim16, &u[36], cospi48, &u[59], rnding, bit);
3177   temp2 = half_btf_sse4_1(cospim16, &u[37], cospi48, &u[58], rnding, bit);
3178   temp3 = half_btf_sse4_1(cospim16, &u[38], cospi48, &u[57], rnding, bit);
3179   temp4 = half_btf_sse4_1(cospim16, &u[39], cospi48, &u[56], rnding, bit);
3180   u[56] = half_btf_sse4_1(cospi48, &u[39], cospi16, &u[56], rnding, bit);
3181   u[57] = half_btf_sse4_1(cospi48, &u[38], cospi16, &u[57], rnding, bit);
3182   u[58] = half_btf_sse4_1(cospi48, &u[37], cospi16, &u[58], rnding, bit);
3183   u[59] = half_btf_sse4_1(cospi48, &u[36], cospi16, &u[59], rnding, bit);
3184   u[36] = temp1;
3185   u[37] = temp2;
3186   u[38] = temp3;
3187   u[39] = temp4;
3188 
3189   temp1 = half_btf_sse4_1(cospim48, &u[40], cospim16, &u[55], rnding, bit);
3190   temp2 = half_btf_sse4_1(cospim48, &u[41], cospim16, &u[54], rnding, bit);
3191   temp3 = half_btf_sse4_1(cospim48, &u[42], cospim16, &u[53], rnding, bit);
3192   temp4 = half_btf_sse4_1(cospim48, &u[43], cospim16, &u[52], rnding, bit);
3193   u[52] = half_btf_sse4_1(cospim16, &u[43], cospi48, &u[52], rnding, bit);
3194   u[53] = half_btf_sse4_1(cospim16, &u[42], cospi48, &u[53], rnding, bit);
3195   u[54] = half_btf_sse4_1(cospim16, &u[41], cospi48, &u[54], rnding, bit);
3196   u[55] = half_btf_sse4_1(cospim16, &u[40], cospi48, &u[55], rnding, bit);
3197   u[40] = temp1;
3198   u[41] = temp2;
3199   u[42] = temp3;
3200   u[43] = temp4;
3201 }
3202 
3203 static inline void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32,
3204                                         const __m128i *cospi32,
3205                                         const __m128i *clamp_lo,
3206                                         const __m128i *clamp_hi,
3207                                         const __m128i *rnding, int bit) {
3208   int i;
3209   __m128i temp1, temp2, temp3, temp4;
3210   for (i = 0; i < 8; ++i) {
3211     addsub_sse4_1(u[i], u[15 - i], &u[i], &u[15 - i], clamp_lo, clamp_hi);
3212   }
3213 
3214   temp1 = half_btf_sse4_1(cospim32, &u[20], cospi32, &u[27], rnding, bit);
3215   temp2 = half_btf_sse4_1(cospim32, &u[21], cospi32, &u[26], rnding, bit);
3216   temp3 = half_btf_sse4_1(cospim32, &u[22], cospi32, &u[25], rnding, bit);
3217   temp4 = half_btf_sse4_1(cospim32, &u[23], cospi32, &u[24], rnding, bit);
3218   u[24] = half_btf_sse4_1(cospi32, &u[23], cospi32, &u[24], rnding, bit);
3219   u[25] = half_btf_sse4_1(cospi32, &u[22], cospi32, &u[25], rnding, bit);
3220   u[26] = half_btf_sse4_1(cospi32, &u[21], cospi32, &u[26], rnding, bit);
3221   u[27] = half_btf_sse4_1(cospi32, &u[20], cospi32, &u[27], rnding, bit);
3222   u[20] = temp1;
3223   u[21] = temp2;
3224   u[22] = temp3;
3225   u[23] = temp4;
3226   for (i = 32; i < 40; i++) {
3227     addsub_sse4_1(u[i], u[i ^ 15], &u[i], &u[i ^ 15], clamp_lo, clamp_hi);
3228   }
3229 
3230   for (i = 48; i < 56; i++) {
3231     addsub_sse4_1(u[i ^ 15], u[i], &u[i ^ 15], &u[i], clamp_lo, clamp_hi);
3232   }
3233 }
3234 
3235 static inline void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32,
3236                                          const __m128i *cospi32,
3237                                          const __m128i *clamp_lo,
3238                                          const __m128i *clamp_hi,
3239                                          const __m128i *rnding, int bit) {
3240   __m128i temp1, temp2, temp3, temp4;
3241   for (int i = 0; i < 16; i++) {
3242     addsub_sse4_1(u[i], u[31 - i], &u[i], &u[31 - i], clamp_lo, clamp_hi);
3243   }
3244 
3245   temp1 = half_btf_sse4_1(cospim32, &u[40], cospi32, &u[55], rnding, bit);
3246   temp2 = half_btf_sse4_1(cospim32, &u[41], cospi32, &u[54], rnding, bit);
3247   temp3 = half_btf_sse4_1(cospim32, &u[42], cospi32, &u[53], rnding, bit);
3248   temp4 = half_btf_sse4_1(cospim32, &u[43], cospi32, &u[52], rnding, bit);
3249   u[52] = half_btf_sse4_1(cospi32, &u[43], cospi32, &u[52], rnding, bit);
3250   u[53] = half_btf_sse4_1(cospi32, &u[42], cospi32, &u[53], rnding, bit);
3251   u[54] = half_btf_sse4_1(cospi32, &u[41], cospi32, &u[54], rnding, bit);
3252   u[55] = half_btf_sse4_1(cospi32, &u[40], cospi32, &u[55], rnding, bit);
3253   u[40] = temp1;
3254   u[41] = temp2;
3255   u[42] = temp3;
3256   u[43] = temp4;
3257 
3258   temp1 = half_btf_sse4_1(cospim32, &u[44], cospi32, &u[51], rnding, bit);
3259   temp2 = half_btf_sse4_1(cospim32, &u[45], cospi32, &u[50], rnding, bit);
3260   temp3 = half_btf_sse4_1(cospim32, &u[46], cospi32, &u[49], rnding, bit);
3261   temp4 = half_btf_sse4_1(cospim32, &u[47], cospi32, &u[48], rnding, bit);
3262   u[48] = half_btf_sse4_1(cospi32, &u[47], cospi32, &u[48], rnding, bit);
3263   u[49] = half_btf_sse4_1(cospi32, &u[46], cospi32, &u[49], rnding, bit);
3264   u[50] = half_btf_sse4_1(cospi32, &u[45], cospi32, &u[50], rnding, bit);
3265   u[51] = half_btf_sse4_1(cospi32, &u[44], cospi32, &u[51], rnding, bit);
3266   u[44] = temp1;
3267   u[45] = temp2;
3268   u[46] = temp3;
3269   u[47] = temp4;
3270 }
3271 
3272 static inline void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols,
3273                                          int bd, int out_shift,
3274                                          const __m128i *clamp_lo,
3275                                          const __m128i *clamp_hi) {
3276   for (int i = 0; i < 32; i++) {
3277     addsub_sse4_1(u[i], u[63 - i], out + i, out + 63 - i, clamp_lo, clamp_hi);
3278   }
3279 
3280   if (!do_cols) {
3281     const int log_range_out = AOMMAX(16, bd + 6);
3282     const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
3283     const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
3284 
3285     for (int i = 0; i < 64; i += 4) {
3286       round_shift_4x4(out + i, out_shift);
3287       highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out, &clamp_hi_out,
3288                                 4);
3289     }
3290   }
3291 }
3292 
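// 64x64 IDCT with only the DC coefficient nonzero: the whole 11-stage flow
// collapses to a single multiply, x = round_shift(in[0] * cospi32, bit),
// followed by the optional row-pass rounding and a broadcast of x to all
// 64 output vectors.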
3293 static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit,
3294                                   int do_cols, int bd, int out_shift) {
3295   const int32_t *cospi = cospi_arr(bit);
3296   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
3297   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3298   __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3299   __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3300 
3301   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
3302 
3303   {
3304     __m128i x;
3305 
3306     // stage 1
3307     // stage 2
3308     // stage 3
3309     // stage 4
3310     // stage 5
3311     // stage 6
3312     x = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit);
3313 
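    // stage 7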
3314     // stage 8
3315     // stage 9
3316     // stage 10
3317     // stage 11
3318     if (!do_cols) {
3319       const int log_range_out = AOMMAX(16, bd + 6);
3320       clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
3321       clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
3322       if (out_shift != 0) {
3323         __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
3324         x = _mm_add_epi32(x, offset);
3325         x = _mm_sra_epi32(x, _mm_cvtsi32_si128(out_shift));
3326       }
3327     }
3328     x = _mm_max_epi32(x, clamp_lo);
3329     x = _mm_min_epi32(x, clamp_hi);
3330     out[0] = x;
3331     out[1] = x;
3332     out[2] = x;
3333     out[3] = x;
3334     out[4] = x;
3335     out[5] = x;
3336     out[6] = x;
3337     out[7] = x;
3338     out[8] = x;
3339     out[9] = x;
3340     out[10] = x;
3341     out[11] = x;
3342     out[12] = x;
3343     out[13] = x;
3344     out[14] = x;
3345     out[15] = x;
3346     out[16] = x;
3347     out[17] = x;
3348     out[18] = x;
3349     out[19] = x;
3350     out[20] = x;
3351     out[21] = x;
3352     out[22] = x;
3353     out[23] = x;
3354     out[24] = x;
3355     out[25] = x;
3356     out[26] = x;
3357     out[27] = x;
3358     out[28] = x;
3359     out[29] = x;
3360     out[30] = x;
3361     out[31] = x;
3362     out[32] = x;
3363     out[33] = x;
3364     out[34] = x;
3365     out[35] = x;
3366     out[36] = x;
3367     out[37] = x;
3368     out[38] = x;
3369     out[39] = x;
3370     out[40] = x;
3371     out[41] = x;
3372     out[42] = x;
3373     out[43] = x;
3374     out[44] = x;
3375     out[45] = x;
3376     out[46] = x;
3377     out[47] = x;
3378     out[48] = x;
3379     out[49] = x;
3380     out[50] = x;
3381     out[51] = x;
3382     out[52] = x;
3383     out[53] = x;
3384     out[54] = x;
3385     out[55] = x;
3386     out[56] = x;
3387     out[57] = x;
3388     out[58] = x;
3389     out[59] = x;
3390     out[60] = x;
3391     out[61] = x;
3392     out[62] = x;
3393     out[63] = x;
3394   }
3395 }
3396 
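// 64x64 IDCT specialized for at most 8 nonzero coefficients: stage 1 seeds
// u[0], u[8], u[16], ..., u[56] from in[0..7], so each early-stage rotation
// collapses to a single half_btf_0_sse4_1() multiply (the other butterfly
// input is implicitly zero).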
3397 static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit,
3398                                   int do_cols, int bd, int out_shift) {
3399   int i, j;
3400   const int32_t *cospi = cospi_arr(bit);
3401   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
3402   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3403   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3404   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3405 
3406   const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
3407   const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
3408   const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
3409   const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
3410   const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
3411   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
3412   const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
3413   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
3414   const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
3415   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
3416   const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
3417   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
3418   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
3419   const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
3420   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
3421   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
3422   const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
3423   const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
3424   const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
3425   const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
3426   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
3427   const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
3428   const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
3429   const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
3430   const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
3431   const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
3432   const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
3433   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
3434   const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
3435   const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
3436   const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
3437   const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
3438   const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
3439   const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
3440   const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
3441   const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
3442   const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
3443   const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
3444 
3445   {
3446     __m128i u[64];
3447 
3448     // stage 1
3449     u[0] = in[0];
3450     u[8] = in[4];
3451     u[16] = in[2];
3452     u[24] = in[6];
3453     u[32] = in[1];
3454     u[40] = in[5];
3455     u[48] = in[3];
3456     u[56] = in[7];
3457 
3458     // stage 2
3459     u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
3460     u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
3461     u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
3462     u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
3463     u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
3464     u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
3465     u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
3466     u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
3467 
3468     // stage 3
3469     u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
3470     u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
3471     u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
3472     u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
3473     u[33] = u[32];
3474     u[38] = u[39];
3475     u[41] = u[40];
3476     u[46] = u[47];
3477     u[49] = u[48];
3478     u[54] = u[55];
3479     u[57] = u[56];
3480     u[62] = u[63];
3481 
3482     // stage 4
3483     __m128i temp1, temp2;
3484     u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
3485     u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
3486     u[17] = u[16];
3487     u[22] = u[23];
3488     u[25] = u[24];
3489     u[30] = u[31];
3490 
3491     temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3492     u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3493     u[33] = temp1;
3494 
3495     temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3496     u[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3497     u[57] = temp2;
3498 
3499     temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3500     u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3501     u[41] = temp1;
3502 
3503     temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3504     u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3505     u[46] = temp2;
3506 
3507     // stage 5
3508     u[9] = u[8];
3509     u[14] = u[15];
3510 
3511     temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
3512     u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
3513     u[17] = temp1;
3514 
3515     temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
3516     u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
3517     u[22] = temp2;
3518 
3519     u[35] = u[32];
3520     u[34] = u[33];
3521     u[36] = u[39];
3522     u[37] = u[38];
3523     u[43] = u[40];
3524     u[42] = u[41];
3525     u[44] = u[47];
3526     u[45] = u[46];
3527     u[51] = u[48];
3528     u[50] = u[49];
3529     u[52] = u[55];
3530     u[53] = u[54];
3531     u[59] = u[56];
3532     u[58] = u[57];
3533     u[60] = u[63];
3534     u[61] = u[62];
3535 
3536     // stage 6
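    // With the odd input implicitly zero on this reduced path, both
    // outputs of the stage 6 DC butterfly equal
    // round_shift(u[0] * cospi32, bit).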
3537     temp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
3538     u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
3539     u[0] = temp1;
3540 
3541     temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3542     u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3543     u[9] = temp2;
3544     u[19] = u[16];
3545     u[18] = u[17];
3546     u[20] = u[23];
3547     u[21] = u[22];
3548     u[27] = u[24];
3549     u[26] = u[25];
3550     u[28] = u[31];
3551     u[29] = u[30];
3552 
3553     temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
3554     u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
3555     u[34] = temp1;
3556     temp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
3557     u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
3558     u[35] = temp2;
3559     temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
3560     u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
3561     u[36] = temp1;
3562     temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
3563     u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
3564     u[37] = temp2;
3565     temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
3566     u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
3567     u[42] = temp1;
3568     temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
3569     u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
3570     u[43] = temp2;
3571     temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
3572     u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
3573     u[44] = temp1;
3574     temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
3575     u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
3576     u[45] = temp2;
3577 
3578     // stage 7
3579     u[3] = u[0];
3580     u[2] = u[1];
3581     u[11] = u[8];
3582     u[10] = u[9];
3583     u[12] = u[15];
3584     u[13] = u[14];
3585 
3586     temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
3587     u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
3588     u[18] = temp1;
3589     temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
3590     u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
3591     u[19] = temp2;
3592     temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
3593     u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
3594     u[20] = temp1;
3595     temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
3596     u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
3597     u[21] = temp2;
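    // Butterfly the two 16-element groups u[32..47] and u[48..63] with the
    // same XOR pairing used by idct64_stage8_sse4_1(): j ^ 7 mirrors j
    // within the lower half of a group, j ^ 15 and j ^ 8 within the upper
    // half.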
3598     for (i = 32; i < 64; i += 16) {
3599       for (j = i; j < i + 4; j++) {
3600         addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3601         addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3602                       &clamp_hi);
3603       }
3604     }
3605 
3606     // stage 8
3607     u[7] = u[0];
3608     u[6] = u[1];
3609     u[5] = u[2];
3610     u[4] = u[3];
3611 
3612     idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
3613                          &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
3614 
3615     // stage 9
3616     idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3617                          bit);
3618 
3619     // stage 10
3620     idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3621                           bit);
3622 
3623     // stage 11
3624     idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
3625   }
3626 }
3627 
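// As above, but for at most 16 nonzero coefficients: stage 1 seeds sixteen
// u[] slots from in[0..15], so twice as many stage 2/3 terms survive while
// the half_btf_0_sse4_1() shortcut still applies.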
3628 static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit,
3629                                    int do_cols, int bd, int out_shift) {
3630   int i, j;
3631   const int32_t *cospi = cospi_arr(bit);
3632   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
3633   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3634   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3635   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3636 
3637   const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
3638   const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
3639   const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
3640   const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
3641   const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
3642   const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
3643   const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
3644   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
3645   const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
3646   const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
3647   const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
3648   const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
3649   const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
3650   const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
3651   const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
3652   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
3653   const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
3654   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
3655   const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
3656   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
3657   const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
3658   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
3659   const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
3660   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
3661   const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
3662   const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
3663   const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
3664   const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
3665   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
3666   const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
3667   const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
3668   const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
3669   const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
3670 
3671   const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
3672   const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
3673   const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
3674   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
3675   const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
3676   const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
3677   const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
3678   const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
3679   const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
3680   const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
3681   const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
3682   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
3683   const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
3684   const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
3685   const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
3686   const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
3687   const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
3688   const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
3689   const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
3690   const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
3691   const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
3692 
3693   {
3694     __m128i u[64];
3695     __m128i tmp1, tmp2, tmp3, tmp4;
3696     // stage 1
3697     u[0] = in[0];
3698     u[32] = in[1];
3699     u[36] = in[9];
3700     u[40] = in[5];
3701     u[44] = in[13];
3702     u[48] = in[3];
3703     u[52] = in[11];
3704     u[56] = in[7];
3705     u[60] = in[15];
3706     u[16] = in[2];
3707     u[20] = in[10];
3708     u[24] = in[6];
3709     u[28] = in[14];
3710     u[4] = in[8];
3711     u[8] = in[4];
3712     u[12] = in[12];
3713 
3714     // stage 2
3715     u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
3716     u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
3717     u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
3718     u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
3719     u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
3720     u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
3721     u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
3722     u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
3723     u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
3724     u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
3725     u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
3726     u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
3727     u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
3728     u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
3729     u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
3730     u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
3731 
3732     // stage 3
3733     u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
3734     u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
3735     u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit);
3736     u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit);
3737     u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit);
3738     u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit);
3739     u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
3740     u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
3741     u[33] = u[32];
3742     u[34] = u[35];
3743     u[37] = u[36];
3744     u[38] = u[39];
3745     u[41] = u[40];
3746     u[42] = u[43];
3747     u[45] = u[44];
3748     u[46] = u[47];
3749     u[49] = u[48];
3750     u[50] = u[51];
3751     u[53] = u[52];
3752     u[54] = u[55];
3753     u[57] = u[56];
3754     u[58] = u[59];
3755     u[61] = u[60];
3756     u[62] = u[63];
3757 
3758     // stage 4
3759     u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
3760     u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
3761     u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
3762     u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
3763 
3764     u[17] = u[16];
3765     u[18] = u[19];
3766     u[21] = u[20];
3767     u[22] = u[23];
3768     u[25] = u[24];
3769     u[26] = u[27];
3770     u[29] = u[28];
3771     u[30] = u[31];
3772 
3773     tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
3774     tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
3775     tmp3 = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
3776     tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
3777     u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
3778     u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
3779     u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
3780     u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
3781     u[33] = tmp1;
3782     u[34] = tmp2;
3783     u[37] = tmp3;
3784     u[38] = tmp4;
3785 
3786     tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
3787     tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
3788     tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
3789     tmp4 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
3790     u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
3791     u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
3792     u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
3793     u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
3794     u[41] = tmp1;
3795     u[42] = tmp2;
3796     u[45] = tmp3;
3797     u[46] = tmp4;
3798 
3799     // stage 5
3800     u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
3801     u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
3802 
3803     u[9] = u[8];
3804     u[10] = u[11];
3805     u[13] = u[12];
3806     u[14] = u[15];
3807 
3808     tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
3809     tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
3810     tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
3811     tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
3812     u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
3813     u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
3814     u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
3815     u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
3816     u[17] = tmp1;
3817     u[18] = tmp2;
3818     u[21] = tmp3;
3819     u[22] = tmp4;
3820 
3821     for (i = 32; i < 64; i += 8) {
3822       addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3823                     &clamp_hi);
3824       addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3825                     &clamp_hi);
3826 
3827       addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3828                     &clamp_hi);
3829       addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3830                     &clamp_hi);
3831     }
3832 
3833     // stage 6
3834     tmp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
3835     u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
3836     u[0] = tmp1;
3837     u[5] = u[4];
3838     u[6] = u[7];
3839 
3840     tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
3841     u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
3842     u[9] = tmp1;
3843     tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
3844     u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
3845     u[10] = tmp2;
3846 
3847     for (i = 16; i < 32; i += 8) {
3848       addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
3849                     &clamp_hi);
3850       addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
3851                     &clamp_hi);
3852 
3853       addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
3854                     &clamp_hi);
3855       addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
3856                     &clamp_hi);
3857     }
3858 
3859     tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
3860     tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
3861     tmp3 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
3862     tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
3863     u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
3864     u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
3865     u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
3866     u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
3867     u[34] = tmp1;
3868     u[35] = tmp2;
3869     u[36] = tmp3;
3870     u[37] = tmp4;
3871 
3872     tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
3873     tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
3874     tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
3875     tmp4 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
3876     u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
3877     u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
3878     u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
3879     u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
3880     u[42] = tmp1;
3881     u[43] = tmp2;
3882     u[44] = tmp3;
3883     u[45] = tmp4;
3884 
3885     // stage 7
3886     u[3] = u[0];
3887     u[2] = u[1];
3888     tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
3889     u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
3890     u[5] = tmp1;
3891     addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
3892     addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
3893     addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
3894     addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
3895 
3896     tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
3897     tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
3898     tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
3899     tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
3900     u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
3901     u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
3902     u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
3903     u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
3904     u[18] = tmp1;
3905     u[19] = tmp2;
3906     u[20] = tmp3;
3907     u[21] = tmp4;
3908 
3909     for (i = 32; i < 64; i += 16) {
3910       for (j = i; j < i + 4; j++) {
3911         addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
3912         addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
3913                       &clamp_hi);
3914       }
3915     }
3916 
3917     // stage 8
3918     for (i = 0; i < 4; ++i) {
3919       addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
3920     }
3921 
3922     idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
3923                          &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
3924 
3925     // stage 9
3926     idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3927                          bit);
3928 
3929     // stage 10
3930     idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
3931                           bit);
3932 
3933     // stage 11
3934     idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
3935   }
3936 }
3937 
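// Full 64-point IDCT over all 64 input coefficients. Unlike the reduced
// paths above, which update u[] in place, the full version uses both u[]
// and v[] as alternating scratch buffers from stage to stage.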
3938 static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
3939                              int bd, int out_shift) {
3940   int i, j;
3941   const int32_t *cospi = cospi_arr(bit);
3942   const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
3943   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
3944   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
3945   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
3946 
3947   const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
3948   const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
3949   const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
3950   const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
3951   const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
3952   const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
3953   const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
3954   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
3955   const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
3956   const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
3957   const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
3958   const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
3959   const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
3960   const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
3961   const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
3962   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
3963   const __m128i cospi17 = _mm_set1_epi32(cospi[17]);
3964   const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
3965   const __m128i cospi19 = _mm_set1_epi32(cospi[19]);
3966   const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
3967   const __m128i cospi21 = _mm_set1_epi32(cospi[21]);
3968   const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
3969   const __m128i cospi23 = _mm_set1_epi32(cospi[23]);
3970   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
3971   const __m128i cospi25 = _mm_set1_epi32(cospi[25]);
3972   const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
3973   const __m128i cospi27 = _mm_set1_epi32(cospi[27]);
3974   const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
3975   const __m128i cospi29 = _mm_set1_epi32(cospi[29]);
3976   const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
3977   const __m128i cospi31 = _mm_set1_epi32(cospi[31]);
3978   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
3979   const __m128i cospi35 = _mm_set1_epi32(cospi[35]);
3980   const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
3981   const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
3982   const __m128i cospi39 = _mm_set1_epi32(cospi[39]);
3983   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
3984   const __m128i cospi43 = _mm_set1_epi32(cospi[43]);
3985   const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
3986   const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
3987   const __m128i cospi47 = _mm_set1_epi32(cospi[47]);
3988   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
3989   const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
3990   const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
3991   const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
3992   const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
3993   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
3994   const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
3995   const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
3996   const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
3997   const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
3998 
3999   const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
4000   const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
4001   const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
4002   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
4003   const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
4004   const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
4005   const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
4006   const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
4007   const __m128i cospim33 = _mm_set1_epi32(-cospi[33]);
4008   const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
4009   const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
4010   const __m128i cospim37 = _mm_set1_epi32(-cospi[37]);
4011   const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
4012   const __m128i cospim41 = _mm_set1_epi32(-cospi[41]);
4013   const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
4014   const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
4015   const __m128i cospim45 = _mm_set1_epi32(-cospi[45]);
4016   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
4017   const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
4018   const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
4019   const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
4020   const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
4021   const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
4022   const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
4023   const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
4024   const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
4025   const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
4026 
4027   {
4028     __m128i u[64], v[64];
4029 
4030     // stage 1
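    // AV1 keeps only the first 32 of 64 coefficients, so stage 1 scatters
    // in[0]..in[31] into bit-reversed butterfly slots; the absent partners
    // are implicitly zero, which is why stages 2 and 3 can use the
    // single-input half_btf_0 form.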
4031     u[32] = in[1];
4032     u[34] = in[17];
4033     u[36] = in[9];
4034     u[38] = in[25];
4035     u[40] = in[5];
4036     u[42] = in[21];
4037     u[44] = in[13];
4038     u[46] = in[29];
4039     u[48] = in[3];
4040     u[50] = in[19];
4041     u[52] = in[11];
4042     u[54] = in[27];
4043     u[56] = in[7];
4044     u[58] = in[23];
4045     u[60] = in[15];
4046     u[62] = in[31];
4047 
4048     v[16] = in[2];
4049     v[18] = in[18];
4050     v[20] = in[10];
4051     v[22] = in[26];
4052     v[24] = in[6];
4053     v[26] = in[22];
4054     v[28] = in[14];
4055     v[30] = in[30];
4056 
4057     u[8] = in[4];
4058     u[10] = in[20];
4059     u[12] = in[12];
4060     u[14] = in[28];
4061 
4062     v[4] = in[8];
4063     v[6] = in[24];
4064 
4065     u[0] = in[0];
4066     u[2] = in[16];
4067 
4068     // stage 2
4069     v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
4070     v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit);
4071     v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit);
4072     v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
4073     v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
4074     v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit);
4075     v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit);
4076     v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
4077     v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
4078     v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit);
4079     v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit);
4080     v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
4081     v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
4082     v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit);
4083     v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit);
4084     v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
4085     v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
4086     v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit);
4087     v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit);
4088     v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
4089     v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
4090     v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit);
4091     v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit);
4092     v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
4093     v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
4094     v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit);
4095     v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit);
4096     v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
4097     v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
4098     v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit);
4099     v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit);
4100     v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
4101 
4102     // stage 3
4103     u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit);
4104     u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit);
4105     u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit);
4106     u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit);
4107     u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit);
4108     u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit);
4109     u[22] = half_btf_0_sse4_1(&cospi38, &v[22], &rnding, bit);
4110     u[23] = half_btf_0_sse4_1(&cospim58, &v[24], &rnding, bit);
4111     u[24] = half_btf_0_sse4_1(&cospi6, &v[24], &rnding, bit);
4112     u[25] = half_btf_0_sse4_1(&cospi26, &v[22], &rnding, bit);
4113     u[26] = half_btf_0_sse4_1(&cospi22, &v[26], &rnding, bit);
4114     u[27] = half_btf_0_sse4_1(&cospi10, &v[20], &rnding, bit);
4115     u[28] = half_btf_0_sse4_1(&cospi14, &v[28], &rnding, bit);
4116     u[29] = half_btf_0_sse4_1(&cospi18, &v[18], &rnding, bit);
4117     u[30] = half_btf_0_sse4_1(&cospi30, &v[30], &rnding, bit);
4118     u[31] = half_btf_0_sse4_1(&cospi2, &v[16], &rnding, bit);
4119 
4120     for (i = 32; i < 64; i += 4) {
4121       addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
4122                     &clamp_hi);
4123       addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
4124                     &clamp_hi);
4125     }
4126 
4127     // stage 4
4128     v[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
4129     v[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
4130     v[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
4131     v[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
4132     v[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
4133     v[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
4134     v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
4135     v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
4136 
4137     for (i = 16; i < 32; i += 4) {
4138       addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
4139                     &clamp_hi);
4140       addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
4141                     &clamp_hi);
4142     }
4143 
4144     for (i = 32; i < 64; i += 4) {
4145       v[i + 0] = u[i + 0];
4146       v[i + 3] = u[i + 3];
4147     }
4148 
4149     v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
4150     v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
4151     v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
4152     v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
4153     v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
4154     v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
4155     v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
4156     v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
4157     v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
4158     v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
4159     v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
4160     v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
4161     v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
4162     v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
4163     v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
4164     v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
4165 
4166     // stage 5
4167     u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit);
4168     u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit);
4169     u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit);
4170     u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit);
4171 
4172     for (i = 8; i < 16; i += 4) {
4173       addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
4174                     &clamp_hi);
4175       addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
4176                     &clamp_hi);
4177     }
4178 
4179     for (i = 16; i < 32; i += 4) {
4180       u[i + 0] = v[i + 0];
4181       u[i + 3] = v[i + 3];
4182     }
4183 
4184     u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
4185     u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
4186     u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
4187     u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
4188     u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
4189     u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
4190     u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
4191     u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
4192 
4193     for (i = 32; i < 64; i += 8) {
4194       addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
4195                     &clamp_hi);
4196       addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
4197                     &clamp_hi);
4198 
4199       addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
4200                     &clamp_hi);
4201       addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
4202                     &clamp_hi);
4203     }
4204 
4205     // stage 6
4206     v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
4207     v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
4208     v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
4209     v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
4210 
4211     addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
4212     addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
4213 
4214     for (i = 8; i < 16; i += 4) {
4215       v[i + 0] = u[i + 0];
4216       v[i + 3] = u[i + 3];
4217     }
4218 
4219     v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
4220     v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
4221     v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
4222     v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
4223 
4224     for (i = 16; i < 32; i += 8) {
4225       addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
4226                     &clamp_hi);
4227       addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
4228                     &clamp_hi);
4229 
4230       addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
4231                     &clamp_hi);
4232       addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
4233                     &clamp_hi);
4234     }
4235 
4236     for (i = 32; i < 64; i += 8) {
4237       v[i + 0] = u[i + 0];
4238       v[i + 1] = u[i + 1];
4239       v[i + 6] = u[i + 6];
4240       v[i + 7] = u[i + 7];
4241     }
4242 
4243     v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
4244     v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
4245     v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
4246     v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
4247     v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
4248     v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
4249     v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
4250     v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
4251     v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
4252     v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
4253     v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
4254     v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
4255     v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
4256     v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
4257     v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
4258     v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
4259 
4260     // stage 7
4261     addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
4262     addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
4263 
4264     u[4] = v[4];
4265     u[7] = v[7];
4266     u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
4267     u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
4268 
4269     addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
4270     addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
4271     addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
4272     addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
4273 
4274     for (i = 16; i < 32; i += 8) {
4275       u[i + 0] = v[i + 0];
4276       u[i + 1] = v[i + 1];
4277       u[i + 6] = v[i + 6];
4278       u[i + 7] = v[i + 7];
4279     }
4280 
4281     u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
4282     u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
4283     u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
4284     u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
4285     u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
4286     u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
4287     u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
4288     u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
4289 
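    // The XOR indexing pairs mirrored entries: j ^ 7 reflects j within its
    // group of eight (32<->39, 33<->38, ...) and j ^ 15 / j ^ 8 reflect
    // within the enclosing group of sixteen.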
4290     for (i = 32; i < 64; i += 16) {
4291       for (j = i; j < i + 4; j++) {
4292         addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
4293         addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
4294                       &clamp_hi);
4295       }
4296     }
4297 
4298     // stage 8
4299     for (i = 0; i < 4; ++i) {
4300       addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
4301     }
4302 
4303     v[8] = u[8];
4304     v[9] = u[9];
4305     v[14] = u[14];
4306     v[15] = u[15];
4307 
4308     v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
4309     v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
4310     v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
4311     v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
4312 
4313     for (i = 16; i < 20; ++i) {
4314       addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
4315       addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
4316                     &clamp_hi);
4317     }
4318 
4319     for (i = 32; i < 36; ++i) {
4320       v[i] = u[i];
4321       v[i + 12] = u[i + 12];
4322       v[i + 16] = u[i + 16];
4323       v[i + 28] = u[i + 28];
4324     }
4325 
4326     v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
4327     v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
4328     v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
4329     v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
4330     v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
4331     v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
4332     v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
4333     v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
4334     v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
4335     v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
4336     v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
4337     v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
4338     v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
4339     v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
4340     v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
4341     v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
4342 
4343     // stage 9
4344     for (i = 0; i < 8; ++i) {
4345       addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
4346     }
4347 
4348     for (i = 16; i < 20; ++i) {
4349       u[i] = v[i];
4350       u[i + 12] = v[i + 12];
4351     }
4352 
4353     u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
4354     u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
4355     u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
4356     u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
4357     u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
4358     u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
4359     u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
4360     u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
4361 
4362     for (i = 32; i < 40; i++) {
4363       addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
4364     }
4365 
4366     for (i = 48; i < 56; i++) {
4367       addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
4368     }
4369 
4370     // stage 10
4371     for (i = 0; i < 16; i++) {
4372       addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
4373     }
4374 
4375     for (i = 32; i < 40; i++) v[i] = u[i];
4376 
4377     v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
4378     v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
4379     v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
4380     v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
4381     v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
4382     v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
4383     v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
4384     v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
4385     v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
4386     v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
4387     v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
4388     v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
4389     v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
4390     v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
4391     v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
4392     v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
4393 
4394     for (i = 56; i < 64; i++) v[i] = u[i];
4395 
4396     // stage 11
4397     for (i = 0; i < 32; i++) {
4398       addsub_sse4_1(v[i], v[63 - i], &out[i], &out[63 - i], &clamp_lo,
4399                     &clamp_hi);
4400     }
4401 
4402     if (!do_cols) {
4403       const int log_range_out = AOMMAX(16, bd + 6);
4404       const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
4405       const __m128i clamp_hi_out =
4406           _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
4407       for (i = 0; i < 64; i += 4) {
4408         round_shift_4x4(out + i, out_shift);
4409         highbd_clamp_epi32_sse4_1(out + i, out + i, &clamp_lo_out,
4410                                   &clamp_hi_out, 4);
4411       }
4412     }
4413   }
4414 }
4415 
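// DC-only 32x32 path (only in[0] is nonzero): every butterfly collapses,
// leaving a single rotation by cospi32 plus the optional output
// round-shift, and the one result is broadcast to all 32 output registers.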
4416 static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit,
4417                                   int do_cols, int bd, int out_shift) {
4418   const int32_t *cospi = cospi_arr(bit);
4419   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
4420   const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
4421   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4422   __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
4423   __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
4424   __m128i bf1;
4425 
4426   // stage 0
4427   // stage 1
4428   bf1 = in[0];
4429 
4430   // stage 2
4431   // stage 3
4432   // stage 4
4433   // stage 5
4434   bf1 = half_btf_0_sse4_1(&cospi32, &bf1, &rounding, bit);
4435 
4436   // stage 6
4437   // stage 7
4438   // stage 8
4439   // stage 9
4440   if (do_cols) {
4441     bf1 = _mm_max_epi32(bf1, clamp_lo);
4442     bf1 = _mm_min_epi32(bf1, clamp_hi);
4443   } else {
4444     const int log_range_out = AOMMAX(16, bd + 6);
4445     clamp_lo = _mm_set1_epi32(-(1 << (log_range_out - 1)));
4446     clamp_hi = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
4447     if (out_shift != 0) {
4448       __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
4449       bf1 = _mm_add_epi32(bf1, offset);
4450       bf1 = _mm_sra_epi32(bf1, _mm_cvtsi32_si128(out_shift));
4451     }
4452   }
4453 
4454   bf1 = _mm_max_epi32(bf1, clamp_lo);
4455   bf1 = _mm_min_epi32(bf1, clamp_hi);
4456   out[0] = bf1;
4457   out[1] = bf1;
4458   out[2] = bf1;
4459   out[3] = bf1;
4460   out[4] = bf1;
4461   out[5] = bf1;
4462   out[6] = bf1;
4463   out[7] = bf1;
4464   out[8] = bf1;
4465   out[9] = bf1;
4466   out[10] = bf1;
4467   out[11] = bf1;
4468   out[12] = bf1;
4469   out[13] = bf1;
4470   out[14] = bf1;
4471   out[15] = bf1;
4472   out[16] = bf1;
4473   out[17] = bf1;
4474   out[18] = bf1;
4475   out[19] = bf1;
4476   out[20] = bf1;
4477   out[21] = bf1;
4478   out[22] = bf1;
4479   out[23] = bf1;
4480   out[24] = bf1;
4481   out[25] = bf1;
4482   out[26] = bf1;
4483   out[27] = bf1;
4484   out[28] = bf1;
4485   out[29] = bf1;
4486   out[30] = bf1;
4487   out[31] = bf1;
4488 }
4489 
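// Reduced 32x32 inverse DCT for blocks where only in[0]..in[7] can be
// nonzero: stages 1-4 use the single-input half_btf_0 form for the sparse
// inputs, then the shared idct32_stage4..9 helpers finish the pipeline.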
4490 static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit,
4491                                   int do_cols, int bd, int out_shift) {
4492   const int32_t *cospi = cospi_arr(bit);
4493   const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
4494   const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
4495   const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
4496   const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
4497   const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
4498   const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
4499   const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
4500   const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
4501   const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
4502   const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
4503   const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
4504   const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
4505   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
4506   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
4507   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
4508   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
4509   const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
4510   const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
4511   const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
4512   const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
4513   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
4514   const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
4515   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
4516   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
4517   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
4518   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
4519   const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
4520   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4521   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
4522   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
4523   __m128i bf1[32];
4524 
4525   // stage 0
4526   // stage 1
4527   bf1[0] = in[0];
4528   bf1[4] = in[4];
4529   bf1[8] = in[2];
4530   bf1[12] = in[6];
4531   bf1[16] = in[1];
4532   bf1[20] = in[5];
4533   bf1[24] = in[3];
4534   bf1[28] = in[7];
4535 
4536   // stage 2
4537   bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
4538   bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
4539   bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
4540   bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
4541   bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
4542   bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
4543   bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
4544   bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);
4545 
4546   // stage 3
4547   bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
4548   bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
4549 
4550   bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
4551   bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
4552   bf1[17] = bf1[16];
4553   bf1[18] = bf1[19];
4554   bf1[21] = bf1[20];
4555   bf1[22] = bf1[23];
4556   bf1[25] = bf1[24];
4557   bf1[26] = bf1[27];
4558   bf1[29] = bf1[28];
4559   bf1[30] = bf1[31];
4560 
4561   // stage 4
4562   bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
4563   bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
4564 
4565   bf1[9] = bf1[8];
4566   bf1[10] = bf1[11];
4567   bf1[13] = bf1[12];
4568   bf1[14] = bf1[15];
4569 
4570   idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
4571                        &cospi24, &cospi40, &cospim24, &rounding, bit);
4572 
4573   // stage 5
4574   bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
4575   bf1[1] = bf1[0];
4576   bf1[5] = bf1[4];
4577   bf1[6] = bf1[7];
4578 
4579   idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
4580                        &clamp_hi, &rounding, bit);
4581 
4582   // stage 6
4583   bf1[3] = bf1[0];
4584   bf1[2] = bf1[1];
4585 
4586   idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
4587                        &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
4588 
4589   // stage 7
4590   idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
4591                        &rounding, bit);
4592 
4593   // stage 8
4594   idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
4595                        &rounding, bit);
4596 
4597   // stage 9
4598   idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
4599 }
4600 
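// Reduced 32x32 inverse DCT assuming only in[0]..in[15] are populated; the
// zero partners again allow half_btf_0 in stages 2 and 3 before the flow
// rejoins the shared stage helpers.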
4601 static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit,
4602                                    int do_cols, int bd, int out_shift) {
4603   const int32_t *cospi = cospi_arr(bit);
4604   const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
4605   const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
4606   const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
4607   const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
4608   const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
4609   const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
4610   const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
4611   const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
4612   const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
4613   const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
4614   const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
4615   const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
4616   const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
4617   const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
4618   const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
4619   const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
4620   const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
4621   const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
4622   const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
4623   const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
4624   const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
4625   const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
4626   const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
4627   const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
4628   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
4629   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
4630   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
4631   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
4632   const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
4633   const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
4634   const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
4635   const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
4636   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
4637   const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
4638   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
4639   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
4640   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
4641   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
4642   const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
4643   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4644   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
4645   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
4646   __m128i bf1[32];
4647 
4648   // stage 0
4649   // stage 1
4650 
4651   bf1[0] = in[0];
4652   bf1[2] = in[8];
4653   bf1[4] = in[4];
4654   bf1[6] = in[12];
4655   bf1[8] = in[2];
4656   bf1[10] = in[10];
4657   bf1[12] = in[6];
4658   bf1[14] = in[14];
4659   bf1[16] = in[1];
4660   bf1[18] = in[9];
4661   bf1[20] = in[5];
4662   bf1[22] = in[13];
4663   bf1[24] = in[3];
4664   bf1[26] = in[11];
4665   bf1[28] = in[7];
4666   bf1[30] = in[15];
4667 
4668   // stage 2
4669   bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
4670   bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
4671   bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit);
4672   bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit);
4673   bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit);
4674   bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit);
4675   bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
4676   bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
4677   bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
4678   bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
4679   bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, bit);
4680   bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit);
4681   bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit);
4682   bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit);
4683   bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
4684   bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);
4685 
4686   // stage 3
4687   bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
4688   bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
4689   bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit);
4690   bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit);
4691   bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit);
4692   bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit);
4693   bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
4694   bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
4695 
4696   addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
4697   addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
4698   addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
4699   addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
4700   addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
4701   addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
4702   addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
4703   addsub_sse4_1(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
4704   // stage 4
4705   bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
4706   bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
4707   bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit);
4708   bf1[6] = half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit);
4709 
4710   addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
4711   addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
4712   addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
4713   addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
4714 
4715   idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
4716                        &cospi24, &cospi40, &cospim24, &rounding, bit);
4717 
4718   // stage 5
4719   bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
4720   bf1[1] = bf1[0];
4721   bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit);
4722   bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit);
4723 
4724   addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
4725   addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
4726 
4727   idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
4728                        &clamp_hi, &rounding, bit);
4729 
4730   // stage 6
4731   addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
4732   addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
4733 
4734   idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
4735                        &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
4736 
4737   // stage 7
4738   idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
4739                        &rounding, bit);
4740 
4741   // stage 8
4742   idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
4743                        &rounding, bit);
4744   // stage 9
4745   idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, &clamp_lo, &clamp_hi);
4746 }
4747 
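// General 32x32 inverse DCT over all 32 input rows. The nine stages
// ping-pong between bf1[] and bf0[] so each stage reads the previous
// stage's results without extra copies.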
4748 static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
4749                              int bd, int out_shift) {
4750   const int32_t *cospi = cospi_arr(bit);
4751   const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
4752   const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
4753   const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
4754   const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
4755   const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
4756   const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
4757   const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
4758   const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
4759   const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
4760   const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
4761   const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
4762   const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
4763   const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
4764   const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
4765   const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
4766   const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
4767   const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
4768   const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
4769   const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
4770   const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
4771   const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
4772   const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
4773   const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
4774   const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
4775   const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
4776   const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
4777   const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
4778   const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
4779   const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
4780   const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
4781   const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
4782   const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
4783   const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
4784   const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
4785   const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
4786   const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
4787   const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
4788   const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
4789   const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
4790   const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
4791   const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
4792   const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
4793   const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
4794   const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
4795   const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
4796   const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
4797   const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
4798   const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
4799   const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
4800   const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
4801   const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
4802   const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
4803   const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
4804   const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
4805   __m128i bf1[32], bf0[32];
4806 
4807   // stage 0
4808   // stage 1
4809   bf1[0] = in[0];
4810   bf1[1] = in[16];
4811   bf1[2] = in[8];
4812   bf1[3] = in[24];
4813   bf1[4] = in[4];
4814   bf1[5] = in[20];
4815   bf1[6] = in[12];
4816   bf1[7] = in[28];
4817   bf1[8] = in[2];
4818   bf1[9] = in[18];
4819   bf1[10] = in[10];
4820   bf1[11] = in[26];
4821   bf1[12] = in[6];
4822   bf1[13] = in[22];
4823   bf1[14] = in[14];
4824   bf1[15] = in[30];
4825   bf1[16] = in[1];
4826   bf1[17] = in[17];
4827   bf1[18] = in[9];
4828   bf1[19] = in[25];
4829   bf1[20] = in[5];
4830   bf1[21] = in[21];
4831   bf1[22] = in[13];
4832   bf1[23] = in[29];
4833   bf1[24] = in[3];
4834   bf1[25] = in[19];
4835   bf1[26] = in[11];
4836   bf1[27] = in[27];
4837   bf1[28] = in[7];
4838   bf1[29] = in[23];
4839   bf1[30] = in[15];
4840   bf1[31] = in[31];
4841 
4842   // stage 2
4843   bf0[0] = bf1[0];
4844   bf0[1] = bf1[1];
4845   bf0[2] = bf1[2];
4846   bf0[3] = bf1[3];
4847   bf0[4] = bf1[4];
4848   bf0[5] = bf1[5];
4849   bf0[6] = bf1[6];
4850   bf0[7] = bf1[7];
4851   bf0[8] = bf1[8];
4852   bf0[9] = bf1[9];
4853   bf0[10] = bf1[10];
4854   bf0[11] = bf1[11];
4855   bf0[12] = bf1[12];
4856   bf0[13] = bf1[13];
4857   bf0[14] = bf1[14];
4858   bf0[15] = bf1[15];
4859   bf0[16] =
4860       half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
4861   bf0[17] =
4862       half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
4863   bf0[18] =
4864       half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
4865   bf0[19] =
4866       half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
4867   bf0[20] =
4868       half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
4869   bf0[21] =
4870       half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
4871   bf0[22] =
4872       half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
4873   bf0[23] =
4874       half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
4875   bf0[24] =
4876       half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
4877   bf0[25] =
4878       half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
4879   bf0[26] =
4880       half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
4881   bf0[27] =
4882       half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
4883   bf0[28] =
4884       half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
4885   bf0[29] =
4886       half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
4887   bf0[30] =
4888       half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
4889   bf0[31] =
4890       half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
4891 
4892   // stage 3
4893   bf1[0] = bf0[0];
4894   bf1[1] = bf0[1];
4895   bf1[2] = bf0[2];
4896   bf1[3] = bf0[3];
4897   bf1[4] = bf0[4];
4898   bf1[5] = bf0[5];
4899   bf1[6] = bf0[6];
4900   bf1[7] = bf0[7];
4901   bf1[8] =
4902       half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
4903   bf1[9] =
4904       half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
4905   bf1[10] =
4906       half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
4907   bf1[11] =
4908       half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
4909   bf1[12] =
4910       half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
4911   bf1[13] =
4912       half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
4913   bf1[14] =
4914       half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
4915   bf1[15] =
4916       half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
4917 
4918   addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
4919   addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
4920   addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
4921   addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
4922   addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
4923   addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
4924   addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
4925   addsub_sse4_1(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
4926 
4927   // stage 4
4928   bf0[0] = bf1[0];
4929   bf0[1] = bf1[1];
4930   bf0[2] = bf1[2];
4931   bf0[3] = bf1[3];
4932   bf0[4] =
4933       half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
4934   bf0[5] =
4935       half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
4936   bf0[6] =
4937       half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
4938   bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
4939 
4940   addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
4941   addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
4942   addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
4943   addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
4944 
4945   bf0[16] = bf1[16];
4946   bf0[17] =
4947       half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
4948   bf0[18] =
4949       half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
4950   bf0[19] = bf1[19];
4951   bf0[20] = bf1[20];
4952   bf0[21] =
4953       half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
4954   bf0[22] =
4955       half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
4956   bf0[23] = bf1[23];
4957   bf0[24] = bf1[24];
4958   bf0[25] =
4959       half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
4960   bf0[26] =
4961       half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
4962   bf0[27] = bf1[27];
4963   bf0[28] = bf1[28];
4964   bf0[29] =
4965       half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
4966   bf0[30] =
4967       half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
4968   bf0[31] = bf1[31];
4969 
4970   // stage 5
4971   bf1[0] =
4972       half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
4973   bf1[1] =
4974       half_btf_sse4_1(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
4975   bf1[2] =
4976       half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
4977   bf1[3] =
4978       half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
4979   addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
4980   addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
4981   bf1[8] = bf0[8];
4982   bf1[9] =
4983       half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
4984   bf1[10] =
4985       half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
4986   bf1[11] = bf0[11];
4987   bf1[12] = bf0[12];
4988   bf1[13] =
4989       half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
4990   bf1[14] =
4991       half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
4992   bf1[15] = bf0[15];
4993   addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
4994   addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
4995   addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
4996   addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
4997   addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
4998   addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
4999   addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
5000   addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
5001 
5002   // stage 6
5003   addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
5004   addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
5005   bf0[4] = bf1[4];
5006   bf0[5] =
5007       half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
5008   bf0[6] =
5009       half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
5010   bf0[7] = bf1[7];
5011   addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
5012   addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
5013   addsub_sse4_1(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
5014   addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
5015   bf0[16] = bf1[16];
5016   bf0[17] = bf1[17];
5017   bf0[18] =
5018       half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
5019   bf0[19] =
5020       half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
5021   bf0[20] =
5022       half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
5023   bf0[21] =
5024       half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
5025   bf0[22] = bf1[22];
5026   bf0[23] = bf1[23];
5027   bf0[24] = bf1[24];
5028   bf0[25] = bf1[25];
5029   bf0[26] =
5030       half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
5031   bf0[27] =
5032       half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
5033   bf0[28] =
5034       half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
5035   bf0[29] =
5036       half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
5037   bf0[30] = bf1[30];
5038   bf0[31] = bf1[31];
5039 
5040   // stage 7
5041   addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
5042   addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
5043   addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
5044   addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
5045   bf1[8] = bf0[8];
5046   bf1[9] = bf0[9];
5047   bf1[10] =
5048       half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
5049   bf1[11] =
5050       half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
5051   bf1[12] =
5052       half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
5053   bf1[13] =
5054       half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
5055   bf1[14] = bf0[14];
5056   bf1[15] = bf0[15];
5057   addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
5058   addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
5059   addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
5060   addsub_sse4_1(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
5061   addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
5062   addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
5063   addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
5064   addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
5065 
5066   // stage 8
5067   addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
5068   addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
5069   addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
5070   addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
5071   addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
5072   addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
5073   addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
5074   addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
5075   bf0[16] = bf1[16];
5076   bf0[17] = bf1[17];
5077   bf0[18] = bf1[18];
5078   bf0[19] = bf1[19];
5079   bf0[20] =
5080       half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
5081   bf0[21] =
5082       half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
5083   bf0[22] =
5084       half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
5085   bf0[23] =
5086       half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
5087   bf0[24] =
5088       half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
5089   bf0[25] =
5090       half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
5091   bf0[26] =
5092       half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
5093   bf0[27] =
5094       half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
5095   bf0[28] = bf1[28];
5096   bf0[29] = bf1[29];
5097   bf0[30] = bf1[30];
5098   bf0[31] = bf1[31];
5099 
5100   // stage 9
5101   addsub_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo, &clamp_hi);
5102   addsub_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo, &clamp_hi);
5103   addsub_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo, &clamp_hi);
5104   addsub_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo, &clamp_hi);
5105   addsub_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo, &clamp_hi);
5106   addsub_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo, &clamp_hi);
5107   addsub_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo, &clamp_hi);
5108   addsub_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo, &clamp_hi);
5109   addsub_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo, &clamp_hi);
5110   addsub_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo, &clamp_hi);
5111   addsub_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo, &clamp_hi);
5112   addsub_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo, &clamp_hi);
5113   addsub_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo, &clamp_hi);
5114   addsub_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo, &clamp_hi);
5115   addsub_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo, &clamp_hi);
5116   addsub_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo, &clamp_hi);
5117 
5118   if (!do_cols) {
5119     const int log_range_out = AOMMAX(16, bd + 6);
5120     const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
5121     const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
5122     round_shift_8x8(out, out_shift);
5123     round_shift_8x8(out + 16, out_shift);
5124     highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
5125   }
5126 }
5127 
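// 8x8 dispatch: identity-based hybrids (IDTX and the H_*/V_* types) go
// through the generic universe path; all remaining types use the dedicated
// 8x8 kernel on the int32 coefficients.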
5128 static void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input,
5129                                                uint8_t *dest, int stride,
5130                                                const TxfmParam *txfm_param) {
5131   int bd = txfm_param->bd;
5132   const TX_TYPE tx_type = txfm_param->tx_type;
5133   const int32_t *src = cast_to_int32(input);
5134   switch (tx_type) {
5135     case IDTX:
5136     case H_DCT:
5137     case H_ADST:
5138     case H_FLIPADST:
5139     case V_DCT:
5140     case V_ADST:
5141     case V_FLIPADST:
5142       av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
5143                                                 txfm_param->tx_size,
5144                                                 txfm_param->eob, bd);
5145       break;
5146     default:
5147       av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
5148                                     tx_type, bd);
5149       break;
5150   }
5151 }
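// 4x4 dispatch: lossless blocks must be DCT_DCT and use the inverse
// Walsh-Hadamard transform; everything else takes the 4x4 SSE4.1 kernel.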
5152 static void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input,
5153                                                uint8_t *dest, int stride,
5154                                                const TxfmParam *txfm_param) {
5155   assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
5156   int eob = txfm_param->eob;
5157   int bd = txfm_param->bd;
5158   int lossless = txfm_param->lossless;
5159   const int32_t *src = cast_to_int32(input);
5160   const TX_TYPE tx_type = txfm_param->tx_type;
5161   if (lossless) {
5162     assert(tx_type == DCT_DCT);
5163     av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
5164     return;
5165   }
5166   av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
5167                                 bd);
5168 }
static void iidentity32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
                               int bd, int out_shift) {
  (void)bit;
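  // The 32-point identity transform scales every coefficient by 4 (<< 2).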
  for (int i = 0; i < 32; i += 16) {
    out[i] = _mm_slli_epi32(in[i], 2);
    out[i + 1] = _mm_slli_epi32(in[i + 1], 2);
    out[i + 2] = _mm_slli_epi32(in[i + 2], 2);
    out[i + 3] = _mm_slli_epi32(in[i + 3], 2);
    out[i + 4] = _mm_slli_epi32(in[i + 4], 2);
    out[i + 5] = _mm_slli_epi32(in[i + 5], 2);
    out[i + 6] = _mm_slli_epi32(in[i + 6], 2);
    out[i + 7] = _mm_slli_epi32(in[i + 7], 2);
    out[i + 8] = _mm_slli_epi32(in[i + 8], 2);
    out[i + 9] = _mm_slli_epi32(in[i + 9], 2);
    out[i + 10] = _mm_slli_epi32(in[i + 10], 2);
    out[i + 11] = _mm_slli_epi32(in[i + 11], 2);
    out[i + 12] = _mm_slli_epi32(in[i + 12], 2);
    out[i + 13] = _mm_slli_epi32(in[i + 13], 2);
    out[i + 14] = _mm_slli_epi32(in[i + 14], 2);
    out[i + 15] = _mm_slli_epi32(in[i + 15], 2);
  }

  if (!do_cols) {
    const int log_range_out = AOMMAX(16, bd + 6);
    const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
    const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
    round_shift_8x8(out, out_shift);
    round_shift_8x8(out + 16, out_shift);
    highbd_clamp_epi32_sse4_1(out, out, &clamp_lo_out, &clamp_hi_out, 32);
  }
}
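// 1D inverse transform kernels, indexed as [tx size idx][1D tx type][range
// idx]. The range idx selects a reduced kernel that only computes the
// rows/columns that can be non-zero for the given eob; NULL marks
// combinations that are never selected. E.g. entry [2][1][1] is
// iadst16x16_low8_sse4_1, a 16-point ADST that assumes at most 8 non-zero
// inputs.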
static const transform_1d_sse4_1
    highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          { idct4x4_sse4_1, NULL, NULL, NULL },
          { iadst4x4_sse4_1, NULL, NULL, NULL },
          { iidentity4_sse4_1, iidentity4_sse4_1, iidentity4_sse4_1, NULL },
      },
      { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL },
        { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL },
        { iidentity8_sse4_1, iidentity8_sse4_1, NULL, NULL } },
      {
          { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1,
            NULL },
          { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1,
            NULL },
          { iidentity16_sse4_1, NULL, iidentity16_sse4_1, NULL },
      },
      { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1,
          idct32x32_sse4_1 },
        { NULL, NULL, NULL, NULL },
        { iidentity32_sse4_1, NULL, NULL, NULL } },
      { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1,
          idct64x64_sse4_1 },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };
static void highbd_inv_txfm2d_add_h_identity_ssse41(const int32_t *input,
                                                    uint16_t *output,
                                                    int stride, TX_TYPE tx_type,
                                                    TX_SIZE tx_size, int eob,
                                                    const int bd) {
  __m128i buf1[64];
  int eobx, eoby;
  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w = AOMMIN(32, txfm_size_col);
  const int buf_size_w_div4 = buf_size_w >> 2;
  const int buf_size_h_div8 = (eoby + 8) >> 3;
  const int row_max = AOMMIN(32, txfm_size_row);
  const int input_stride = row_max;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

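  // 1st stage: identity row transform, 4 rows at a time; each 4x4 tile is
  // transposed into buf1 so the column pass works on contiguous vectors.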
  for (int i = 0; i < (buf_size_h_div8 << 1); ++i) {
    __m128i buf0[16];
    load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w);
    if (rect_type == 1 || rect_type == -1) {
      av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_w, 0,
                                           NewInvSqrt2);
    }
    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);

    __m128i *_buf1 = buf1 + i * 4;

    for (int j = 0; j < buf_size_w_div4; ++j) {
      __m128i *buf0_cur = buf0 + j * 4;
      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
      _buf1[j * txfm_size_row + 0] = buf0_cur[0];
      _buf1[j * txfm_size_row + 1] = buf0_cur[1];
      _buf1[j * txfm_size_row + 2] = buf0_cur[2];
      _buf1[j * txfm_size_row + 3] = buf0_cur[3];
    }
  }
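  // 2nd stage: column transform (the non-identity dimension), then
  // round-shift each column by shift[1].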
  for (int i = 0; i < buf_size_w_div4; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
             bd, 0);

    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                    buf1 + i * txfm_size_row, txfm_size_row,
                                    -shift[1]);
  }

  // write to buffer
  for (int i = 0; i < (txfm_size_col >> 3); i++) {
    highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i,
                                   stride, ud_flip, txfm_size_row, bd);
  }
}
static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input,
                                                    uint16_t *output,
                                                    int stride, TX_TYPE tx_type,
                                                    TX_SIZE tx_size, int eob,
                                                    const int bd) {
  __m128i buf1[64];
  int eobx, eoby;
  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div4 = AOMMIN(32, txfm_size_col) >> 2;
  const int row_max = AOMMIN(32, txfm_size_row);
  const int input_stride = row_max;
  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
  const int buf_size_nonzero_w = buf_size_nonzero_w_div8 << 3;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

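  // 1st stage: row transform over the non-zero columns only; lr_flip decides
  // whether the transposed 4x4 tiles are stored mirrored.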
  for (int i = 0; i < (row_max >> 2); ++i) {
    __m128i buf0[16];
    load_buffer_32bit_input(input + i * 4, input_stride, buf0,
                            buf_size_nonzero_w);
    if (rect_type == 1 || rect_type == -1) {
      av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_nonzero_w, 0,
                                           NewInvSqrt2);
    }
    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);

    __m128i *_buf1 = buf1 + i * 4;
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div4; ++j) {
        TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
                      buf0[4 * j],
                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0],
                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1],
                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2],
                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]);
      }
    } else {
      for (int j = 0; j < buf_size_w_div4; ++j) {
        TRANSPOSE_4X4(
            buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
            _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
            _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
      }
    }
  }
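  // 2nd stage: identity column transform, then round-shift by shift[1].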
  for (int i = 0; i < buf_size_w_div4; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
             bd, 0);

    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                    buf1 + i * txfm_size_row, txfm_size_row,
                                    -shift[1]);
  }

  // write to buffer
  {
    for (int i = 0; i < (txfm_size_col >> 3); i++) {
      highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
                                     output + 8 * i, stride, ud_flip,
                                     txfm_size_row, bd);
    }
  }
}
static void highbd_inv_txfm2d_add_idtx_ssse41(const int32_t *input,
                                              uint16_t *output, int stride,
                                              TX_TYPE tx_type, TX_SIZE tx_size,
                                              int eob, const int bd) {
  (void)eob;
  __m128i buf1[64 * 4];
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int row_max = AOMMIN(32, txfm_size_row);
  const int input_stride = row_max;
  const int buf_size_w = AOMMIN(32, txfm_size_col);
  const int buf_size_w_div4 = buf_size_w >> 2;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];

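  // 1st stage: identity row transform on up to 32 rows, 4 at a time.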
  for (int i = 0; i < (row_max >> 2); ++i) {
    __m128i buf0[32];
    load_buffer_32bit_input(input + i * 4, input_stride, buf0, buf_size_w);
    if (rect_type == 1 || rect_type == -1) {
      av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_w, 0,
                                           NewInvSqrt2);
    }
    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);

    __m128i *_buf1 = buf1 + i * 4;
    for (int j = 0; j < buf_size_w_div4; ++j) {
      __m128i *buf0_cur = buf0 + j * 4;
      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
      _buf1[j * txfm_size_row + 0] = buf0_cur[0];
      _buf1[j * txfm_size_row + 1] = buf0_cur[1];
      _buf1[j * txfm_size_row + 2] = buf0_cur[2];
      _buf1[j * txfm_size_row + 3] = buf0_cur[3];
    }
  }
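  // 2nd stage: identity column transform, then round-shift by shift[1].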
  for (int i = 0; i < buf_size_w_div4; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
             bd, 0);

    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                    buf1 + i * txfm_size_row, txfm_size_row,
                                    -shift[1]);
  }

  // write to buffer
  {
    for (int i = 0; i < (txfm_size_col >> 3); i++) {
      highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
                                     output + 8 * i, stride, 0, txfm_size_row,
                                     bd);
    }
  }
}
static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
                                                    uint16_t *output,
                                                    int stride, TX_TYPE tx_type,
                                                    TX_SIZE tx_size, int eob,
                                                    const int bd) {
  __m128i buf1[64 * 16];
  int eobx, eoby;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div4 = txfm_size_col >> 2;
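  // Extent of the non-zero coefficient region implied by the eob, rounded up
  // to multiples of 8.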
  const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int input_stride = AOMMIN(32, txfm_size_row);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);

  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {
    __m128i buf0[64];
    load_buffer_32bit_input(input + i * 4, input_stride, buf0,
                            buf_size_nonzero_w);
    if (rect_type == 1 || rect_type == -1) {
      av1_round_shift_rect_array_32_sse4_1(buf0, buf0, buf_size_nonzero_w, 0,
                                           NewInvSqrt2);
    }
    row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);

    __m128i *_buf1 = buf1 + i * 4;
    if (lr_flip) {
      for (int j = 0; j < buf_size_w_div4; ++j) {
        TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
                      buf0[4 * j],
                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 0],
                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 1],
                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 2],
                      _buf1[txfm_size_row * (buf_size_w_div4 - 1 - j) + 3]);
      }
    } else {
      for (int j = 0; j < buf_size_w_div4; ++j) {
        TRANSPOSE_4X4(
            buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
            _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
            _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
      }
    }
  }
  // 2nd stage: column transform
  for (int i = 0; i < buf_size_w_div4; i++) {
    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, INV_COS_BIT, 1,
             bd, 0);

    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
                                    buf1 + i * txfm_size_row, txfm_size_row,
                                    -shift[1]);
  }

  // write to buffer
  {
    for (int i = 0; i < (txfm_size_col >> 3); i++) {
      highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
                                     output + 8 * i, stride, ud_flip,
                                     txfm_size_row, bd);
    }
  }
}

static void highbd_inv_txfm2d_add_4x8_sse41(const int32_t *input,
                                            uint16_t *output, int stride,
                                            TX_TYPE tx_type, TX_SIZE tx_size,
                                            int eob, const int bd) {
  (void)eob;
  __m128i buf1[8];
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][1];
  const int input_stride = AOMMIN(32, txfm_size_row);

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  __m128i buf0[8];
  load_buffer_32bit_input(input, input_stride, buf0, txfm_size_col);
  load_buffer_32bit_input(input + 4, input_stride, buf0 + 4, txfm_size_col);
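  // 4x8 is rectangular (2:1 aspect ratio), so the coefficients are pre-scaled
  // by 1/sqrt(2) (NewInvSqrt2) to keep the 2D transform gain consistent.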
  av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_row, 0,
                                       NewInvSqrt2);
  row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);
  row_txfm(buf0 + 4, buf0 + 4, INV_COS_BIT, 0, bd, -shift[0]);

  if (lr_flip) {
    TRANSPOSE_4X4(buf0[3], buf0[2], buf0[1], buf0[0], buf1[0], buf1[1], buf1[2],
                  buf1[3]);

    TRANSPOSE_4X4(buf0[7], buf0[6], buf0[5], buf0[4], buf1[4], buf1[5], buf1[6],
                  buf1[7]);
  } else {
    TRANSPOSE_4X4(buf0[0], buf0[1], buf0[2], buf0[3], buf1[0], buf1[1], buf1[2],
                  buf1[3]);

    TRANSPOSE_4X4(buf0[4], buf0[5], buf0[6], buf0[7], buf1[4], buf1[5], buf1[6],
                  buf1[7]);
  }

  // 2nd stage: column transform
  col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);

  av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);

  // write to buffer
  highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row,
                                 bd);
}

static void highbd_inv_txfm2d_add_8x4_sse41(const int32_t *input,
                                            uint16_t *output, int stride,
                                            TX_TYPE tx_type, TX_SIZE tx_size,
                                            int eob, const int bd) {
  (void)eob;
  __m128i buf1[8];
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][1];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  __m128i buf0[8];
  const int32_t *input_row = input;
  load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);

  av1_round_shift_rect_array_32_sse4_1(buf0, buf0, txfm_size_col, 0,
                                       NewInvSqrt2);
  row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);

  __m128i *buf1_ptr;
  if (lr_flip) {
    flip_buf_sse2(buf0, buf1, txfm_size_col);
    buf1_ptr = buf1;
  } else {
    buf1_ptr = buf0;
  }

  // 2nd stage: column transform
  for (int i = 0; i < 2; i++) {
    __m128i *buf1_cur = buf1_ptr + i * txfm_size_row;
    transpose_32bit_4x4(buf1_cur, buf1_cur);
    col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0);
  }
  av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);
  // write to buffer
  highbd_write_buffer_8xn_sse4_1(buf1_ptr, output, stride, ud_flip,
                                 txfm_size_row, bd);
}

static void highbd_inv_txfm2d_add_4x16_sse4_1(const int32_t *input,
                                              uint16_t *output, int stride,
                                              TX_TYPE tx_type, TX_SIZE tx_size,
                                              int eob, const int bd) {
  (void)eob;
  __m128i buf1[16];
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_h_div8 = txfm_size_row >> 2;
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][2];
  const int input_stride = AOMMIN(32, txfm_size_row);

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  __m128i buf0[16];
  for (int i = 0; i < (txfm_size_row >> 2); i++) {
    const int32_t *input_row = input + i * 4;
    __m128i *buf0_cur = buf0 + i * 4;
    load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_col);
    row_txfm(buf0_cur, buf0_cur, INV_COS_BIT, 0, bd, -shift[0]);
  }

  if (lr_flip) {
    for (int j = 0; j < buf_size_h_div8; ++j) {
      TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
                    buf0[4 * j], buf1[4 * j], buf1[4 * j + 1], buf1[4 * j + 2],
                    buf1[4 * j + 3]);
    }
  } else {
    for (int j = 0; j < buf_size_h_div8; ++j) {
      TRANSPOSE_4X4(buf0[4 * j], buf0[4 * j + 1], buf0[4 * j + 2],
                    buf0[4 * j + 3], buf1[4 * j], buf1[4 * j + 1],
                    buf1[4 * j + 2], buf1[4 * j + 3]);
    }
  }

  // 2nd stage: column transform
  col_txfm(buf1, buf1, INV_COS_BIT, 1, bd, 0);

  av1_round_shift_array_32_sse4_1(buf1, buf1, txfm_size_row, -shift[1]);

  // write to buffer
  highbd_write_buffer_4xn_sse4_1(buf1, output, stride, ud_flip, txfm_size_row,
                                 bd);
}

static void highbd_inv_txfm2d_add_16x4_sse4_1(const int32_t *input,
                                              uint16_t *output, int stride,
                                              TX_TYPE tx_type, TX_SIZE tx_size,
                                              int eob, const int bd) {
  (void)eob;
  __m128i buf1[16];
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_size_w_div8 = txfm_size_col >> 2;
  const transform_1d_sse4_1 row_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][2];
  const transform_1d_sse4_1 col_txfm =
      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // 1st stage: row transform
  __m128i buf0[16];
  const int32_t *input_row = input;
  load_buffer_32bit_input(input_row, 4, buf0, txfm_size_col);

  row_txfm(buf0, buf0, INV_COS_BIT, 0, bd, -shift[0]);

  __m128i *buf1_ptr;
  if (lr_flip) {
    flip_buf_sse2(buf0, buf1, txfm_size_col);
    buf1_ptr = buf1;
  } else {
    buf1_ptr = buf0;
  }

  // 2nd stage: column transform
  for (int i = 0; i < buf_size_w_div8; i++) {
    __m128i *buf1_cur = buf1_ptr + i * txfm_size_row;
    transpose_32bit_4x4(buf1_cur, buf1_cur);
    col_txfm(buf1_cur, buf1_cur, INV_COS_BIT, 1, bd, 0);
  }
  av1_round_shift_array_32_sse4_1(buf1_ptr, buf1_ptr, txfm_size_col, -shift[1]);

  // write to buffer
  for (int i = 0; i < (txfm_size_col >> 3); i++) {
    highbd_write_buffer_8xn_sse4_1(buf1_ptr + i * txfm_size_row * 2,
                                   output + 8 * i, stride, ud_flip,
                                   txfm_size_row, bd);
  }
}

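// Dispatch by transform class: the nine fully 2D types take the generic
// path; V_DCT/V_ADST/V_FLIPADST transform columns and use identity rows
// (h_identity path), H_* is the mirror case (v_identity path), and IDTX is
// identity in both dimensions.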
void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
                                               uint8_t *output, int stride,
                                               TX_TYPE tx_type, TX_SIZE tx_size,
                                               int eob, const int bd) {
  switch (tx_type) {
    case DCT_DCT:
    case ADST_DCT:
    case DCT_ADST:
    case ADST_ADST:
    case FLIPADST_DCT:
    case DCT_FLIPADST:
    case FLIPADST_FLIPADST:
    case ADST_FLIPADST:
    case FLIPADST_ADST:
      highbd_inv_txfm2d_add_no_identity_sse41(
          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
          bd);
      break;
    case V_DCT:
    case V_ADST:
    case V_FLIPADST:
      highbd_inv_txfm2d_add_h_identity_ssse41(
          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
          bd);
      break;
    case H_DCT:
    case H_ADST:
    case H_FLIPADST:
      highbd_inv_txfm2d_add_v_identity_ssse41(
          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
          bd);
      break;
    case IDTX:
      highbd_inv_txfm2d_add_idtx_ssse41(input, CONVERT_TO_SHORTPTR(output),
                                        stride, tx_type, tx_size, eob, bd);
      break;
    default: assert(0); break;
  }
}

static void av1_highbd_inv_txfm_add_4x8_sse4_1(const tran_low_t *input,
                                               uint8_t *dest, int stride,
                                               const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const TX_SIZE tx_size = txfm_param->tx_size;
  int eob = txfm_param->eob;
  highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
                                  tx_type, tx_size, eob, bd);
}

static void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input,
                                               uint8_t *dest, int stride,
                                               const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const TX_SIZE tx_size = txfm_param->tx_size;
  int eob = txfm_param->eob;
  highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
                                  tx_type, tx_size, eob, bd);
}

static void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input,
                                                uint8_t *dest, int stride,
                                                const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const TX_SIZE tx_size = txfm_param->tx_size;
  int eob = txfm_param->eob;
  highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
                                    tx_type, tx_size, eob, bd);
}

static void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input,
                                                uint8_t *dest, int stride,
                                                const TxfmParam *txfm_param) {
  int bd = txfm_param->bd;
  const TX_TYPE tx_type = txfm_param->tx_type;
  const TX_SIZE tx_size = txfm_param->tx_size;
  int eob = txfm_param->eob;
  highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
                                    tx_type, tx_size, eob, bd);
}

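// Top-level SSE4.1 entry point: small transform sizes take the dedicated
// paths above, all other sizes go through the universe dispatcher.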
void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest,
                                    int stride, const TxfmParam *txfm_param) {
  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
  const TX_SIZE tx_size = txfm_param->tx_size;
  switch (tx_size) {
    case TX_8X8:
      av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_4X8:
      av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_8X4:
      av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_4X4:
      av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_16X4:
      av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param);
      break;
    case TX_4X16:
      av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param);
      break;
    default:
      av1_highbd_inv_txfm2d_add_universe_sse4_1(
          input, dest, stride, txfm_param->tx_type, tx_size, txfm_param->eob,
          txfm_param->bd);
      break;
  }
}