xref: /aosp_15_r20/external/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1 /*
2  *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <smmintrin.h>  // SSE4.1
12 
13 #include "./vpx_dsp_rtcd.h"
14 #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
15 #include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
16 #include "vpx_dsp/x86/inv_txfm_sse2.h"
17 #include "vpx_dsp/x86/inv_txfm_ssse3.h"
18 #include "vpx_dsp/x86/transpose_sse2.h"
19 #include "vpx_dsp/x86/txfm_common_sse2.h"
20 
// Stages 4-6 of the 32-point IDCT for output coefficients 8-15 of one
// 4x32 block. step1 holds the stage-3 values (only indices 8-15 are read);
// the stage-6 results are written to out[8..15]. Shared by the 1024-, 135-
// and 34-coefficient variants below.
static INLINE void highbd_idct32_4x32_quarter_2_stage_4_to_6(
    __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) {
  __m128i step2[32];

  // stage 4: rotate the (9,14) and (10,13) pairs; 8, 11, 12, 15 pass through.
  step2[8] = step1[8];
  step2[15] = step1[15];
  highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
                          &step2[9], &step2[14]);
  highbd_butterfly_sse4_1(step1[13], step1[10], -cospi_8_64, cospi_24_64,
                          &step2[10], &step2[13]);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5: add/sub butterflies within each half of the 8..15 range.
  step1[8] = _mm_add_epi32(step2[8], step2[11]);
  step1[9] = _mm_add_epi32(step2[9], step2[10]);
  step1[10] = _mm_sub_epi32(step2[9], step2[10]);
  step1[11] = _mm_sub_epi32(step2[8], step2[11]);
  step1[12] = _mm_sub_epi32(step2[15], step2[12]);
  step1[13] = _mm_sub_epi32(step2[14], step2[13]);
  step1[14] = _mm_add_epi32(step2[14], step2[13]);
  step1[15] = _mm_add_epi32(step2[15], step2[12]);

  // stage 6: cospi_16_64 rotations on the middle pairs; ends pass through.
  out[8] = step1[8];
  out[9] = step1[9];
  highbd_butterfly_sse4_1(step1[13], step1[10], cospi_16_64, cospi_16_64,
                          &out[10], &out[13]);
  highbd_butterfly_sse4_1(step1[12], step1[11], cospi_16_64, cospi_16_64,
                          &out[11], &out[12]);
  out[14] = step1[14];
  out[15] = step1[15];
}
55 
// Stages 4-7 of the 32-point IDCT for output coefficients 16-31 of one
// 4x32 block. step1 holds the stage-3 values (only indices 16-31 are read);
// stage-7 results are written to out[16..31]. Shared by the 1024-, 135- and
// 34-coefficient variants below.
static INLINE void highbd_idct32_4x32_quarter_3_4_stage_4_to_7(
    __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) {
  __m128i step2[32];

  // stage 4: add/sub butterflies within each group of four.
  step2[16] = _mm_add_epi32(step1[16], step1[19]);
  step2[17] = _mm_add_epi32(step1[17], step1[18]);
  step2[18] = _mm_sub_epi32(step1[17], step1[18]);
  step2[19] = _mm_sub_epi32(step1[16], step1[19]);
  step2[20] = _mm_sub_epi32(step1[23], step1[20]);
  step2[21] = _mm_sub_epi32(step1[22], step1[21]);
  step2[22] = _mm_add_epi32(step1[22], step1[21]);
  step2[23] = _mm_add_epi32(step1[23], step1[20]);

  step2[24] = _mm_add_epi32(step1[24], step1[27]);
  step2[25] = _mm_add_epi32(step1[25], step1[26]);
  step2[26] = _mm_sub_epi32(step1[25], step1[26]);
  step2[27] = _mm_sub_epi32(step1[24], step1[27]);
  step2[28] = _mm_sub_epi32(step1[31], step1[28]);
  step2[29] = _mm_sub_epi32(step1[30], step1[29]);
  step2[30] = _mm_add_epi32(step1[29], step1[30]);
  step2[31] = _mm_add_epi32(step1[28], step1[31]);

  // stage 5: cospi_24/8 rotations on four pairs; the rest pass through.
  step1[16] = step2[16];
  step1[17] = step2[17];
  highbd_butterfly_sse4_1(step2[29], step2[18], cospi_24_64, cospi_8_64,
                          &step1[18], &step1[29]);
  highbd_butterfly_sse4_1(step2[28], step2[19], cospi_24_64, cospi_8_64,
                          &step1[19], &step1[28]);
  highbd_butterfly_sse4_1(step2[27], step2[20], -cospi_8_64, cospi_24_64,
                          &step1[20], &step1[27]);
  highbd_butterfly_sse4_1(step2[26], step2[21], -cospi_8_64, cospi_24_64,
                          &step1[21], &step1[26]);
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6: add/sub butterflies within each group of eight.
  step2[16] = _mm_add_epi32(step1[16], step1[23]);
  step2[17] = _mm_add_epi32(step1[17], step1[22]);
  step2[18] = _mm_add_epi32(step1[18], step1[21]);
  step2[19] = _mm_add_epi32(step1[19], step1[20]);
  step2[20] = _mm_sub_epi32(step1[19], step1[20]);
  step2[21] = _mm_sub_epi32(step1[18], step1[21]);
  step2[22] = _mm_sub_epi32(step1[17], step1[22]);
  step2[23] = _mm_sub_epi32(step1[16], step1[23]);

  step2[24] = _mm_sub_epi32(step1[31], step1[24]);
  step2[25] = _mm_sub_epi32(step1[30], step1[25]);
  step2[26] = _mm_sub_epi32(step1[29], step1[26]);
  step2[27] = _mm_sub_epi32(step1[28], step1[27]);
  step2[28] = _mm_add_epi32(step1[27], step1[28]);
  step2[29] = _mm_add_epi32(step1[26], step1[29]);
  step2[30] = _mm_add_epi32(step1[25], step1[30]);
  step2[31] = _mm_add_epi32(step1[24], step1[31]);

  // stage 7: cospi_16_64 rotations on the middle pairs; ends pass through.
  out[16] = step2[16];
  out[17] = step2[17];
  out[18] = step2[18];
  out[19] = step2[19];
  highbd_butterfly_sse4_1(step2[27], step2[20], cospi_16_64, cospi_16_64,
                          &out[20], &out[27]);
  highbd_butterfly_sse4_1(step2[26], step2[21], cospi_16_64, cospi_16_64,
                          &out[21], &out[26]);
  highbd_butterfly_sse4_1(step2[25], step2[22], cospi_16_64, cospi_16_64,
                          &out[22], &out[25]);
  highbd_butterfly_sse4_1(step2[24], step2[23], cospi_16_64, cospi_16_64,
                          &out[23], &out[24]);
  out[28] = step2[28];
  out[29] = step2[29];
  out[30] = step2[30];
  out[31] = step2[31];
}
134 
135 // Group the coefficient calculation into smaller functions to prevent stack
136 // spillover in 32x32 idct optimizations:
137 // quarter_1: 0-7
138 // quarter_2: 8-15
139 // quarter_3_4: 16-23, 24-31
140 
141 // For each 4x32 block __m128i in[32],
142 // Input with index, 0, 4, 8, 12, 16, 20, 24, 28
143 // output pixels: 0-7 in __m128i out[32]
// Quarter 1 (full 1024-coefficient path) of the 32-point IDCT for one 4x32
// block: reads in[0], in[4], ..., in[28] and writes output pixels 0-7 to
// out[0..7].
static INLINE void highbd_idct32_1024_4x32_quarter_1(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
  __m128i step1[8], step2[8];

  // stage 3: cospi rotations of the (4,28) and (20,12) input pairs.
  highbd_butterfly_sse4_1(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4],
                          &step1[7]);
  highbd_butterfly_sse4_1(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5],
                          &step1[6]);

  // stage 4: rotations of (0,16) and (8,24), add/sub on the stage-3 results.
  highbd_butterfly_sse4_1(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1],
                          &step2[0]);
  highbd_butterfly_sse4_1(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2],
                          &step2[3]);
  step2[4] = _mm_add_epi32(step1[4], step1[5]);
  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
  step2[7] = _mm_add_epi32(step1[7], step1[6]);

  // stage 5: add/sub on 0-3, cospi_16_64 rotation of the (6,5) pair.
  step1[0] = _mm_add_epi32(step2[0], step2[3]);
  step1[1] = _mm_add_epi32(step2[1], step2[2]);
  step1[2] = _mm_sub_epi32(step2[1], step2[2]);
  step1[3] = _mm_sub_epi32(step2[0], step2[3]);
  step1[4] = step2[4];
  highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64,
                          &step1[5], &step1[6]);
  step1[7] = step2[7];

  // stage 6: final add/sub butterfly producing out[0..7].
  out[0] = _mm_add_epi32(step1[0], step1[7]);
  out[1] = _mm_add_epi32(step1[1], step1[6]);
  out[2] = _mm_add_epi32(step1[2], step1[5]);
  out[3] = _mm_add_epi32(step1[3], step1[4]);
  out[4] = _mm_sub_epi32(step1[3], step1[4]);
  out[5] = _mm_sub_epi32(step1[2], step1[5]);
  out[6] = _mm_sub_epi32(step1[1], step1[6]);
  out[7] = _mm_sub_epi32(step1[0], step1[7]);
}
184 
185 // For each 4x32 block __m128i in[32],
186 // Input with index, 2, 6, 10, 14, 18, 22, 26, 30
187 // output pixels: 8-15 in __m128i out[32]
// Quarter 2 (full 1024-coefficient path) of the 32-point IDCT for one 4x32
// block: reads in[2], in[6], ..., in[30] and writes output pixels 8-15 to
// out[8..15] (stages 4-6 delegated to the shared helper above).
static INLINE void highbd_idct32_1024_4x32_quarter_2(
    const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) {
  __m128i step1[32], step2[32];

  // stage 2: cospi rotations of the four even-odd input pairs.
  highbd_butterfly_sse4_1(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8],
                          &step2[15]);
  highbd_butterfly_sse4_1(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9],
                          &step2[14]);
  highbd_butterfly_sse4_1(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10],
                          &step2[13]);
  highbd_butterfly_sse4_1(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11],
                          &step2[12]);

  // stage 3: add/sub butterflies on adjacent pairs.
  step1[8] = _mm_add_epi32(step2[8], step2[9]);
  step1[9] = _mm_sub_epi32(step2[8], step2[9]);
  step1[14] = _mm_sub_epi32(step2[15], step2[14]);
  step1[15] = _mm_add_epi32(step2[15], step2[14]);
  step1[10] = _mm_sub_epi32(step2[11], step2[10]);
  step1[11] = _mm_add_epi32(step2[11], step2[10]);
  step1[12] = _mm_add_epi32(step2[12], step2[13]);
  step1[13] = _mm_sub_epi32(step2[12], step2[13]);

  highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
}
214 
highbd_idct32_1024_4x32_quarter_1_2(const __m128i * const in,__m128i * const out)215 static INLINE void highbd_idct32_1024_4x32_quarter_1_2(
216     const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
217   __m128i temp[16];
218   highbd_idct32_1024_4x32_quarter_1(in, temp);
219   highbd_idct32_1024_4x32_quarter_2(in, temp);
220   // stage 7
221   highbd_add_sub_butterfly(temp, out, 16);
222 }
223 
224 // For each 4x32 block __m128i in[32],
225 // Input with odd index,
226 // 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
227 // output pixels: 16-23, 24-31 in __m128i out[32]
// Quarters 3 and 4 (full 1024-coefficient path) of the 32-point IDCT for one
// 4x32 block: reads all odd-indexed inputs (1, 3, ..., 31) and writes output
// pixels 16-31 to out[16..31] (stages 4-7 delegated to the shared helper).
static INLINE void highbd_idct32_1024_4x32_quarter_3_4(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
  __m128i step1[32], step2[32];

  // stage 1: cospi rotations of the eight odd-indexed input pairs.
  highbd_butterfly_sse4_1(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16],
                          &step1[31]);
  highbd_butterfly_sse4_1(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17],
                          &step1[30]);
  highbd_butterfly_sse4_1(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18],
                          &step1[29]);
  highbd_butterfly_sse4_1(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19],
                          &step1[28]);

  highbd_butterfly_sse4_1(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20],
                          &step1[27]);
  highbd_butterfly_sse4_1(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21],
                          &step1[26]);

  highbd_butterfly_sse4_1(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22],
                          &step1[25]);
  highbd_butterfly_sse4_1(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23],
                          &step1[24]);

  // stage 2: add/sub butterflies on adjacent pairs.
  step2[16] = _mm_add_epi32(step1[16], step1[17]);
  step2[17] = _mm_sub_epi32(step1[16], step1[17]);
  step2[18] = _mm_sub_epi32(step1[19], step1[18]);
  step2[19] = _mm_add_epi32(step1[19], step1[18]);
  step2[20] = _mm_add_epi32(step1[20], step1[21]);
  step2[21] = _mm_sub_epi32(step1[20], step1[21]);
  step2[22] = _mm_sub_epi32(step1[23], step1[22]);
  step2[23] = _mm_add_epi32(step1[23], step1[22]);

  step2[24] = _mm_add_epi32(step1[24], step1[25]);
  step2[25] = _mm_sub_epi32(step1[24], step1[25]);
  step2[26] = _mm_sub_epi32(step1[27], step1[26]);
  step2[27] = _mm_add_epi32(step1[27], step1[26]);
  step2[28] = _mm_add_epi32(step1[28], step1[29]);
  step2[29] = _mm_sub_epi32(step1[28], step1[29]);
  step2[30] = _mm_sub_epi32(step1[31], step1[30]);
  step2[31] = _mm_add_epi32(step1[31], step1[30]);

  // stage 3: cospi_28/4 and cospi_12/20 rotations; the rest pass through.
  step1[16] = step2[16];
  step1[31] = step2[31];
  highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64,
                          &step1[17], &step1[30]);
  highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64,
                          &step1[18], &step1[29]);
  step1[19] = step2[19];
  step1[20] = step2[20];
  highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64,
                          &step1[21], &step1[26]);
  highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64,
                          &step1[22], &step1[25]);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
}
291 
// In-place 32-point inverse transform of one 4x32 block (full 1024
// coefficients): pixels 0-15 and 16-31 are computed into a scratch buffer,
// then combined by the final add/sub butterfly back into io[0..31].
static void highbd_idct32_1024_4x32(__m128i *const io /*io[32]*/) {
  __m128i scratch[32];

  highbd_idct32_1024_4x32_quarter_1_2(io, scratch);
  highbd_idct32_1024_4x32_quarter_3_4(io, scratch);
  highbd_add_sub_butterfly(scratch, io, 32);  // final stage
}
300 
// Full (1024-coefficient) high-bitdepth 32x32 inverse transform; the result
// is combined into dest via the highbd_write_buffer_* helpers.
// For bd == 8 the coefficients fit a 16-bit path: inputs are packed to 16
// bits and the ssse3 8x32 kernel is used. Higher bit depths keep 32-bit
// precision and use the 4x32 kernel above.
void vpx_highbd_idct32x32_1024_add_sse4_1(const tran_low_t *input,
                                          uint16_t *dest, int stride, int bd) {
  int i, j;

  if (bd == 8) {
    __m128i col[4][32], io[32];

    // rows: transform four batches of 8 rows, keeping results per batch.
    for (i = 0; i < 4; i++) {
      highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &io[0]);
      highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &io[8]);
      highbd_load_pack_transpose_32bit_8x8(&input[16], 32, &io[16]);
      highbd_load_pack_transpose_32bit_8x8(&input[24], 32, &io[24]);
      idct32_1024_8x32(io, col[i]);
      input += 32 << 3;  // advance 8 rows of 32 coefficients
    }

    // columns: transform 8 columns at a time and write out.
    for (i = 0; i < 32; i += 8) {
      // Transpose 32x8 block to 8x32 block
      transpose_16bit_8x8(col[0] + i, io);
      transpose_16bit_8x8(col[1] + i, io + 8);
      transpose_16bit_8x8(col[2] + i, io + 16);
      transpose_16bit_8x8(col[3] + i, io + 24);
      idct32_1024_8x32(io, io);
      for (j = 0; j < 32; ++j) {
        highbd_write_buffer_8(dest + j * stride, io[j], bd);
      }
      dest += 8;
    }
  } else {
    __m128i all[8][32], out[32], *in;

    // rows: 32-bit path, four rows per batch, eight batches.
    for (i = 0; i < 8; i++) {
      in = all[i];
      highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
      highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
      highbd_load_transpose_32bit_8x4(&input[16], 32, &in[16]);
      highbd_load_transpose_32bit_8x4(&input[24], 32, &in[24]);
      highbd_idct32_1024_4x32(in);
      input += 4 * 32;
    }

    // columns: gather one 4-wide column strip from all row batches,
    // transform it, and write out.
    for (i = 0; i < 32; i += 4) {
      transpose_32bit_4x4(all[0] + i, out + 0);
      transpose_32bit_4x4(all[1] + i, out + 4);
      transpose_32bit_4x4(all[2] + i, out + 8);
      transpose_32bit_4x4(all[3] + i, out + 12);
      transpose_32bit_4x4(all[4] + i, out + 16);
      transpose_32bit_4x4(all[5] + i, out + 20);
      transpose_32bit_4x4(all[6] + i, out + 24);
      transpose_32bit_4x4(all[7] + i, out + 28);
      highbd_idct32_1024_4x32(out);

      for (j = 0; j < 32; ++j) {
        highbd_write_buffer_4(dest + j * stride, out[j], bd);
      }
      dest += 4;
    }
  }
}
362 
363 // -----------------------------------------------------------------------------
364 
365 // For each 4x32 block __m128i in[32],
366 // Input with index, 0, 4, 8, 12
367 // output pixels: 0-7 in __m128i out[32]
// Quarter 1 of the 135-coefficient (partial) path: only in[0], in[4], in[8]
// and in[12] are non-zero, so stage 3/4 rotations use the single-input
// partial butterfly. Writes output pixels 0-7 to out[0..7].
static INLINE void highbd_idct32_135_4x32_quarter_1(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
  __m128i step1[8], step2[8];

  // stage 3: partial rotations of in[4] and in[12] (zero partners dropped).
  highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4],
                                  &step1[7]);
  highbd_partial_butterfly_sse4_1(in[12], -cospi_20_64, cospi_12_64, &step1[5],
                                  &step1[6]);

  // stage 4: partial rotations of in[0] and in[8], add/sub on stage 3.
  highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1],
                                  &step2[0]);
  highbd_partial_butterfly_sse4_1(in[8], cospi_24_64, cospi_8_64, &step2[2],
                                  &step2[3]);
  step2[4] = _mm_add_epi32(step1[4], step1[5]);
  step2[5] = _mm_sub_epi32(step1[4], step1[5]);
  step2[6] = _mm_sub_epi32(step1[7], step1[6]);
  step2[7] = _mm_add_epi32(step1[7], step1[6]);

  // stage 5: add/sub on 0-3, cospi_16_64 rotation of the (6,5) pair.
  step1[0] = _mm_add_epi32(step2[0], step2[3]);
  step1[1] = _mm_add_epi32(step2[1], step2[2]);
  step1[2] = _mm_sub_epi32(step2[1], step2[2]);
  step1[3] = _mm_sub_epi32(step2[0], step2[3]);
  step1[4] = step2[4];
  highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64,
                          &step1[5], &step1[6]);
  step1[7] = step2[7];

  // stage 6: final add/sub butterfly producing out[0..7].
  out[0] = _mm_add_epi32(step1[0], step1[7]);
  out[1] = _mm_add_epi32(step1[1], step1[6]);
  out[2] = _mm_add_epi32(step1[2], step1[5]);
  out[3] = _mm_add_epi32(step1[3], step1[4]);
  out[4] = _mm_sub_epi32(step1[3], step1[4]);
  out[5] = _mm_sub_epi32(step1[2], step1[5]);
  out[6] = _mm_sub_epi32(step1[1], step1[6]);
  out[7] = _mm_sub_epi32(step1[0], step1[7]);
}
408 
409 // For each 4x32 block __m128i in[32],
410 // Input with index, 2, 6, 10, 14
411 // output pixels: 8-15 in __m128i out[32]
// Quarter 2 of the 135-coefficient (partial) path: only in[2], in[6], in[10]
// and in[14] are non-zero. Writes output pixels 8-15 to out[8..15]
// (stages 4-6 delegated to the shared helper above).
static INLINE void highbd_idct32_135_4x32_quarter_2(
    const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) {
  __m128i step1[32], step2[32];

  // stage 2: partial rotations of the four non-zero inputs.
  highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8],
                                  &step2[15]);
  highbd_partial_butterfly_sse4_1(in[14], -cospi_18_64, cospi_14_64, &step2[9],
                                  &step2[14]);
  highbd_partial_butterfly_sse4_1(in[10], cospi_22_64, cospi_10_64, &step2[10],
                                  &step2[13]);
  highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11],
                                  &step2[12]);

  // stage 3: add/sub butterflies on adjacent pairs.
  step1[8] = _mm_add_epi32(step2[8], step2[9]);
  step1[9] = _mm_sub_epi32(step2[8], step2[9]);
  step1[14] = _mm_sub_epi32(step2[15], step2[14]);
  step1[15] = _mm_add_epi32(step2[15], step2[14]);
  step1[10] = _mm_sub_epi32(step2[11], step2[10]);
  step1[11] = _mm_add_epi32(step2[11], step2[10]);
  step1[12] = _mm_add_epi32(step2[12], step2[13]);
  step1[13] = _mm_sub_epi32(step2[12], step2[13]);

  highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
}
438 
highbd_idct32_135_4x32_quarter_1_2(const __m128i * const in,__m128i * const out)439 static INLINE void highbd_idct32_135_4x32_quarter_1_2(
440     const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
441   __m128i temp[16];
442   highbd_idct32_135_4x32_quarter_1(in, temp);
443   highbd_idct32_135_4x32_quarter_2(in, temp);
444   // stage 7
445   highbd_add_sub_butterfly(temp, out, 16);
446 }
447 
448 // For each 4x32 block __m128i in[32],
449 // Input with odd index,
450 // 1, 3, 5, 7, 9, 11, 13, 15
451 // output pixels: 16-23, 24-31 in __m128i out[32]
// Quarters 3 and 4 of the 135-coefficient (partial) path: only the odd
// inputs 1, 3, ..., 15 are non-zero, so stage 1 uses partial butterflies.
// Writes output pixels 16-31 to out[16..31] (stages 4-7 delegated to the
// shared helper above).
static INLINE void highbd_idct32_135_4x32_quarter_3_4(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
  __m128i step1[32], step2[32];

  // stage 1: partial rotations of the eight non-zero odd inputs.
  highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16],
                                  &step1[31]);
  highbd_partial_butterfly_sse4_1(in[15], -cospi_17_64, cospi_15_64, &step1[17],
                                  &step1[30]);
  highbd_partial_butterfly_sse4_1(in[9], cospi_23_64, cospi_9_64, &step1[18],
                                  &step1[29]);
  highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19],
                                  &step1[28]);

  highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20],
                                  &step1[27]);
  highbd_partial_butterfly_sse4_1(in[11], -cospi_21_64, cospi_11_64, &step1[21],
                                  &step1[26]);

  highbd_partial_butterfly_sse4_1(in[13], cospi_19_64, cospi_13_64, &step1[22],
                                  &step1[25]);
  highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23],
                                  &step1[24]);

  // stage 2: add/sub butterflies on adjacent pairs.
  step2[16] = _mm_add_epi32(step1[16], step1[17]);
  step2[17] = _mm_sub_epi32(step1[16], step1[17]);
  step2[18] = _mm_sub_epi32(step1[19], step1[18]);
  step2[19] = _mm_add_epi32(step1[19], step1[18]);
  step2[20] = _mm_add_epi32(step1[20], step1[21]);
  step2[21] = _mm_sub_epi32(step1[20], step1[21]);
  step2[22] = _mm_sub_epi32(step1[23], step1[22]);
  step2[23] = _mm_add_epi32(step1[23], step1[22]);

  step2[24] = _mm_add_epi32(step1[24], step1[25]);
  step2[25] = _mm_sub_epi32(step1[24], step1[25]);
  step2[26] = _mm_sub_epi32(step1[27], step1[26]);
  step2[27] = _mm_add_epi32(step1[27], step1[26]);
  step2[28] = _mm_add_epi32(step1[28], step1[29]);
  step2[29] = _mm_sub_epi32(step1[28], step1[29]);
  step2[30] = _mm_sub_epi32(step1[31], step1[30]);
  step2[31] = _mm_add_epi32(step1[31], step1[30]);

  // stage 3: cospi_28/4 and cospi_12/20 rotations; the rest pass through.
  step1[16] = step2[16];
  step1[31] = step2[31];
  highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64,
                          &step1[17], &step1[30]);
  highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64,
                          &step1[18], &step1[29]);
  step1[19] = step2[19];
  step1[20] = step2[20];
  highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64,
                          &step1[21], &step1[26]);
  highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64,
                          &step1[22], &step1[25]);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
}
515 
// In-place 32-point inverse transform of one 4x32 block for the
// 135-coefficient path: pixels 0-15 and 16-31 are computed into a scratch
// buffer, then combined by the final add/sub butterfly back into io[0..31].
static void highbd_idct32_135_4x32(__m128i *const io /*io[32]*/) {
  __m128i scratch[32];

  highbd_idct32_135_4x32_quarter_1_2(io, scratch);
  highbd_idct32_135_4x32_quarter_3_4(io, scratch);
  highbd_add_sub_butterfly(scratch, io, 32);  // final stage
}
524 
// High-bitdepth 32x32 inverse transform for the case where only the first
// 135 coefficients (top-left 16x16 region) are non-zero; the result is
// combined into dest via the highbd_write_buffer_* helpers.
// For bd == 8 a packed 16-bit ssse3 kernel is used; otherwise the 32-bit
// 4x32 kernel above. Only the non-zero rows are loaded/transformed.
void vpx_highbd_idct32x32_135_add_sse4_1(const tran_low_t *input,
                                         uint16_t *dest, int stride, int bd) {
  int i, j;

  if (bd == 8) {
    __m128i col[2][32], in[32], out[32];

    // rows: only the first 16 rows can be non-zero -> two batches of 8.
    for (i = 0; i < 2; i++) {
      highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]);
      highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &in[8]);
      idct32_135_8x32_ssse3(in, col[i]);
      input += 32 << 3;  // advance 8 rows of 32 coefficients
    }

    // columns: transform 8 columns at a time and write out.
    for (i = 0; i < 32; i += 8) {
      transpose_16bit_8x8(col[0] + i, in);
      transpose_16bit_8x8(col[1] + i, in + 8);
      idct32_135_8x32_ssse3(in, out);
      for (j = 0; j < 32; ++j) {
        highbd_write_buffer_8(dest + j * stride, out[j], bd);
      }
      dest += 8;
    }
  } else {
    __m128i all[8][32], out[32], *in;

    // rows: 32-bit path; four batches of 4 rows cover the non-zero region.
    // (Only the first 16 columns are loaded per batch.)
    for (i = 0; i < 4; i++) {
      in = all[i];
      highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
      highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
      highbd_idct32_135_4x32(in);
      input += 4 * 32;
    }

    // columns: gather a 4-wide column strip from the four row batches,
    // transform it, and write out.
    for (i = 0; i < 32; i += 4) {
      transpose_32bit_4x4(all[0] + i, out + 0);
      transpose_32bit_4x4(all[1] + i, out + 4);
      transpose_32bit_4x4(all[2] + i, out + 8);
      transpose_32bit_4x4(all[3] + i, out + 12);
      highbd_idct32_135_4x32(out);

      for (j = 0; j < 32; ++j) {
        highbd_write_buffer_4(dest + j * stride, out[j], bd);
      }
      dest += 4;
    }
  }
}
575 
576 // -----------------------------------------------------------------------------
577 
578 // For each 4x32 block __m128i in[32],
579 // Input with index, 0, 4
580 // output pixels: 0-7 in __m128i out[32]
// Quarter 1 of the 34-coefficient (partial) path: only in[0] and in[4] are
// non-zero, so several butterfly stages degenerate to plain copies. Writes
// output pixels 0-7 to out[0..7].
static INLINE void highbd_idct32_34_4x32_quarter_1(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
  __m128i step1[8], step2[8];

  // stage 3: partial rotation of in[4] (in[28] is zero).
  highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4],
                                  &step1[7]);

  // stage 4: partial rotation of in[0]; 4-7 collapse to copies because the
  // stage-3 partners (step1[5], step1[6]) come from zero inputs.
  highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1],
                                  &step2[0]);
  step2[4] = step1[4];
  step2[5] = step1[4];
  step2[6] = step1[7];
  step2[7] = step1[7];

  // stage 5: 0-3 collapse to copies; cospi_16_64 rotation of the (6,5) pair.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[1];
  step1[3] = step2[0];
  step1[4] = step2[4];
  highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64,
                          &step1[5], &step1[6]);
  step1[7] = step2[7];

  // stage 6: final add/sub butterfly producing out[0..7].
  out[0] = _mm_add_epi32(step1[0], step1[7]);
  out[1] = _mm_add_epi32(step1[1], step1[6]);
  out[2] = _mm_add_epi32(step1[2], step1[5]);
  out[3] = _mm_add_epi32(step1[3], step1[4]);
  out[4] = _mm_sub_epi32(step1[3], step1[4]);
  out[5] = _mm_sub_epi32(step1[2], step1[5]);
  out[6] = _mm_sub_epi32(step1[1], step1[6]);
  out[7] = _mm_sub_epi32(step1[0], step1[7]);
}
617 
618 // For each 4x32 block __m128i in[32],
619 // Input with index, 2, 6
620 // output pixels: 8-15 in __m128i out[32]
// Quarter 2 of the 34-coefficient (partial) path: only in[2] and in[6] are
// non-zero, so stage 3 collapses to copies. Writes output pixels 8-15 to
// out[8..15] (stages 4-6 delegated to the shared helper above).
static INLINE void highbd_idct32_34_4x32_quarter_2(const __m128i *in /*in[32]*/,
                                                   __m128i *out /*out[16]*/) {
  __m128i step1[32], step2[32];

  // stage 2: partial rotations of the two non-zero inputs.
  highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8],
                                  &step2[15]);
  highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11],
                                  &step2[12]);

  // stage 3: add/sub with zero partners degenerates to duplication.
  step1[8] = step2[8];
  step1[9] = step2[8];
  step1[14] = step2[15];
  step1[15] = step2[15];
  step1[10] = step2[11];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[13] = step2[12];

  highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
}
643 
highbd_idct32_34_4x32_quarter_1_2(const __m128i * const in,__m128i * const out)644 static INLINE void highbd_idct32_34_4x32_quarter_1_2(
645     const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
646   __m128i temp[16];
647   highbd_idct32_34_4x32_quarter_1(in, temp);
648   highbd_idct32_34_4x32_quarter_2(in, temp);
649   // stage 7
650   highbd_add_sub_butterfly(temp, out, 16);
651 }
652 
653 // For each 4x32 block __m128i in[32],
654 // Input with odd index,
655 // 1, 3, 5, 7
656 // output pixels: 16-23, 24-31 in __m128i out[32]
// Quarters 3 and 4 of the 34-coefficient (partial) path: only in[1], in[3],
// in[5] and in[7] are non-zero, so stage 1 uses partial butterflies and
// stage 2 collapses to duplication. Writes output pixels 16-31 to
// out[16..31] (stages 4-7 delegated to the shared helper above).
static INLINE void highbd_idct32_34_4x32_quarter_3_4(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
  __m128i step1[32], step2[32];

  // stage 1: partial rotations of the four non-zero odd inputs.
  highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16],
                                  &step1[31]);
  highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19],
                                  &step1[28]);

  highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20],
                                  &step1[27]);
  highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23],
                                  &step1[24]);

  // stage 2: add/sub with zero partners degenerates to duplication.
  step2[16] = step1[16];
  step2[17] = step1[16];
  step2[18] = step1[19];
  step2[19] = step1[19];
  step2[20] = step1[20];
  step2[21] = step1[20];
  step2[22] = step1[23];
  step2[23] = step1[23];

  step2[24] = step1[24];
  step2[25] = step1[24];
  step2[26] = step1[27];
  step2[27] = step1[27];
  step2[28] = step1[28];
  step2[29] = step1[28];
  step2[30] = step1[31];
  step2[31] = step1[31];

  // stage 3: cospi_28/4 and cospi_12/20 rotations; the rest pass through.
  step1[16] = step2[16];
  step1[31] = step2[31];
  highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64,
                          &step1[17], &step1[30]);
  highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64,
                          &step1[18], &step1[29]);
  step1[19] = step2[19];
  step1[20] = step2[20];
  highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64,
                          &step1[21], &step1[26]);
  highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64,
                          &step1[22], &step1[25]);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out);
}
711 
// In-place 32-point inverse transform of one 4x32 block for the
// 34-coefficient path: pixels 0-15 and 16-31 are computed into a scratch
// buffer, then combined by the final add/sub butterfly back into io[0..31].
static void highbd_idct32_34_4x32(__m128i *const io /*io[32]*/) {
  __m128i scratch[32];

  highbd_idct32_34_4x32_quarter_1_2(io, scratch);
  highbd_idct32_34_4x32_quarter_3_4(io, scratch);
  highbd_add_sub_butterfly(scratch, io, 32);  // final stage
}
720 
// High-bitdepth 32x32 inverse transform for the case where only the first
// 34 coefficients (top-left 8x8 region) are non-zero; the result is
// combined into dest via the highbd_write_buffer_* helpers.
// For bd == 8 a packed 16-bit ssse3 kernel is used; otherwise the 32-bit
// 4x32 kernel above. Only the non-zero rows are loaded/transformed.
void vpx_highbd_idct32x32_34_add_sse4_1(const tran_low_t *input, uint16_t *dest,
                                        int stride, int bd) {
  int i, j;

  if (bd == 8) {
    __m128i col[32], in[32], out[32];

    // rows: only the first 8 rows can be non-zero -> a single 8-row batch.
    highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]);
    idct32_34_8x32_ssse3(in, col);

    // columns: transform 8 columns at a time and write out.
    for (i = 0; i < 32; i += 8) {
      transpose_16bit_8x8(col + i, in);
      idct32_34_8x32_ssse3(in, out);
      for (j = 0; j < 32; ++j) {
        highbd_write_buffer_8(dest + j * stride, out[j], bd);
      }
      dest += 8;
    }
  } else {
    __m128i all[8][32], out[32], *in;

    // rows: 32-bit path; four batches of 4 rows cover the non-zero region.
    // NOTE(review): loads 16 columns per batch even though only the first 8
    // can be non-zero — presumably to reuse the 135-path layout; confirm.
    for (i = 0; i < 4; i++) {
      in = all[i];
      highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]);
      highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]);
      highbd_idct32_34_4x32(in);
      input += 4 * 32;
    }

    // columns: gather a 4-wide column strip from the four row batches,
    // transform it, and write out.
    for (i = 0; i < 32; i += 4) {
      transpose_32bit_4x4(all[0] + i, out + 0);
      transpose_32bit_4x4(all[1] + i, out + 4);
      transpose_32bit_4x4(all[2] + i, out + 8);
      transpose_32bit_4x4(all[3] + i, out + 12);
      highbd_idct32_34_4x32(out);

      for (j = 0; j < 32; ++j) {
        highbd_write_buffer_4(dest + j * stride, out[j], bd);
      }
      dest += 4;
    }
  }
}
766