1 /*
2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <smmintrin.h> // SSE4.1
12
13 #include "./vpx_dsp_rtcd.h"
14 #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
15 #include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
16 #include "vpx_dsp/x86/inv_txfm_sse2.h"
17 #include "vpx_dsp/x86/transpose_sse2.h"
18 #include "vpx_dsp/x86/txfm_common_sse2.h"
19
highbd_idct16_4col_stage5(const __m128i * const in,__m128i * const out)20 static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
21 __m128i *const out) {
22 // stage 5
23 out[0] = _mm_add_epi32(in[0], in[3]);
24 out[1] = _mm_add_epi32(in[1], in[2]);
25 out[2] = _mm_sub_epi32(in[1], in[2]);
26 out[3] = _mm_sub_epi32(in[0], in[3]);
27 highbd_butterfly_cospi16_sse4_1(in[6], in[5], &out[6], &out[5]);
28 out[8] = _mm_add_epi32(in[8], in[11]);
29 out[9] = _mm_add_epi32(in[9], in[10]);
30 out[10] = _mm_sub_epi32(in[9], in[10]);
31 out[11] = _mm_sub_epi32(in[8], in[11]);
32 out[12] = _mm_sub_epi32(in[15], in[12]);
33 out[13] = _mm_sub_epi32(in[14], in[13]);
34 out[14] = _mm_add_epi32(in[14], in[13]);
35 out[15] = _mm_add_epi32(in[15], in[12]);
36 }
37
highbd_idct16_4col_stage6(const __m128i * const in,__m128i * const out)38 static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
39 __m128i *const out) {
40 out[0] = _mm_add_epi32(in[0], in[7]);
41 out[1] = _mm_add_epi32(in[1], in[6]);
42 out[2] = _mm_add_epi32(in[2], in[5]);
43 out[3] = _mm_add_epi32(in[3], in[4]);
44 out[4] = _mm_sub_epi32(in[3], in[4]);
45 out[5] = _mm_sub_epi32(in[2], in[5]);
46 out[6] = _mm_sub_epi32(in[1], in[6]);
47 out[7] = _mm_sub_epi32(in[0], in[7]);
48 out[8] = in[8];
49 out[9] = in[9];
50 highbd_butterfly_cospi16_sse4_1(in[13], in[10], &out[13], &out[10]);
51 highbd_butterfly_cospi16_sse4_1(in[12], in[11], &out[12], &out[11]);
52 out[14] = in[14];
53 out[15] = in[15];
54 }
55
vpx_highbd_idct16_4col_sse4_1(__m128i * const io)56 void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/) {
57 __m128i step1[16], step2[16];
58
59 // stage 2
60 highbd_butterfly_sse4_1(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8],
61 &step2[15]);
62 highbd_butterfly_sse4_1(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9],
63 &step2[14]);
64 highbd_butterfly_sse4_1(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10],
65 &step2[13]);
66 highbd_butterfly_sse4_1(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11],
67 &step2[12]);
68
69 // stage 3
70 highbd_butterfly_sse4_1(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4],
71 &step1[7]);
72 highbd_butterfly_sse4_1(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5],
73 &step1[6]);
74 step1[8] = _mm_add_epi32(step2[8], step2[9]);
75 step1[9] = _mm_sub_epi32(step2[8], step2[9]);
76 step1[10] = _mm_sub_epi32(step2[11], step2[10]);
77 step1[11] = _mm_add_epi32(step2[11], step2[10]);
78 step1[12] = _mm_add_epi32(step2[12], step2[13]);
79 step1[13] = _mm_sub_epi32(step2[12], step2[13]);
80 step1[14] = _mm_sub_epi32(step2[15], step2[14]);
81 step1[15] = _mm_add_epi32(step2[15], step2[14]);
82
83 // stage 4
84 highbd_butterfly_cospi16_sse4_1(io[0], io[8], &step2[0], &step2[1]);
85 highbd_butterfly_sse4_1(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2],
86 &step2[3]);
87 highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
88 &step2[9], &step2[14]);
89 highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
90 &step2[13], &step2[10]);
91 step2[5] = _mm_sub_epi32(step1[4], step1[5]);
92 step1[4] = _mm_add_epi32(step1[4], step1[5]);
93 step2[6] = _mm_sub_epi32(step1[7], step1[6]);
94 step1[7] = _mm_add_epi32(step1[7], step1[6]);
95 step2[8] = step1[8];
96 step2[11] = step1[11];
97 step2[12] = step1[12];
98 step2[15] = step1[15];
99
100 highbd_idct16_4col_stage5(step2, step1);
101 highbd_idct16_4col_stage6(step1, step2);
102 highbd_idct16_4col_stage7(step2, io);
103 }
104
highbd_idct16x16_38_4col(__m128i * const io)105 static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
106 __m128i step1[16], step2[16];
107 __m128i temp1[2];
108
109 // stage 2
110 highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8],
111 &step2[15]);
112 highbd_partial_butterfly_sse4_1(io[7], -cospi_18_64, cospi_14_64, &step2[9],
113 &step2[14]);
114 highbd_partial_butterfly_sse4_1(io[5], cospi_22_64, cospi_10_64, &step2[10],
115 &step2[13]);
116 highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11],
117 &step2[12]);
118
119 // stage 3
120 highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4],
121 &step1[7]);
122 highbd_partial_butterfly_sse4_1(io[6], -cospi_20_64, cospi_12_64, &step1[5],
123 &step1[6]);
124 step1[8] = _mm_add_epi32(step2[8], step2[9]);
125 step1[9] = _mm_sub_epi32(step2[8], step2[9]);
126 step1[10] = _mm_sub_epi32(step2[11], step2[10]);
127 step1[11] = _mm_add_epi32(step2[11], step2[10]);
128 step1[12] = _mm_add_epi32(step2[12], step2[13]);
129 step1[13] = _mm_sub_epi32(step2[12], step2[13]);
130 step1[14] = _mm_sub_epi32(step2[15], step2[14]);
131 step1[15] = _mm_add_epi32(step2[15], step2[14]);
132
133 // stage 4
134 extend_64bit(io[0], temp1);
135 step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64);
136 step2[1] = step2[0];
137 highbd_partial_butterfly_sse4_1(io[4], cospi_24_64, cospi_8_64, &step2[2],
138 &step2[3]);
139 highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
140 &step2[9], &step2[14]);
141 highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
142 &step2[13], &step2[10]);
143 step2[5] = _mm_sub_epi32(step1[4], step1[5]);
144 step1[4] = _mm_add_epi32(step1[4], step1[5]);
145 step2[6] = _mm_sub_epi32(step1[7], step1[6]);
146 step1[7] = _mm_add_epi32(step1[7], step1[6]);
147 step2[8] = step1[8];
148 step2[11] = step1[11];
149 step2[12] = step1[12];
150 step2[15] = step1[15];
151
152 highbd_idct16_4col_stage5(step2, step1);
153 highbd_idct16_4col_stage6(step1, step2);
154 highbd_idct16_4col_stage7(step2, io);
155 }
156
highbd_idct16x16_10_4col(__m128i * const io)157 static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) {
158 __m128i step1[16], step2[16];
159 __m128i temp[2];
160
161 // stage 2
162 highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8],
163 &step2[15]);
164 highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11],
165 &step2[12]);
166
167 // stage 3
168 highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4],
169 &step1[7]);
170 step1[8] = step2[8];
171 step1[9] = step2[8];
172 step1[10] = step2[11];
173 step1[11] = step2[11];
174 step1[12] = step2[12];
175 step1[13] = step2[12];
176 step1[14] = step2[15];
177 step1[15] = step2[15];
178
179 // stage 4
180 extend_64bit(io[0], temp);
181 step2[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
182 step2[1] = step2[0];
183 step2[2] = _mm_setzero_si128();
184 step2[3] = _mm_setzero_si128();
185 highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
186 &step2[9], &step2[14]);
187 highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
188 &step2[13], &step2[10]);
189 step2[5] = step1[4];
190 step2[6] = step1[7];
191 step2[8] = step1[8];
192 step2[11] = step1[11];
193 step2[12] = step1[12];
194 step2[15] = step1[15];
195
196 highbd_idct16_4col_stage5(step2, step1);
197 highbd_idct16_4col_stage6(step1, step2);
198 highbd_idct16_4col_stage7(step2, io);
199 }
200
// 16x16 inverse transform using every coefficient of the block, with the
// result handed to the highbd_write_buffer_* helpers together with bd.
//
// input:  16x16 coefficients, read with a row pitch of 16.
// dest:   destination pixels, addressed as dest[row * stride + column].
// stride: distance between destination rows, in uint16_t elements.
// bd:     bit depth; 8 selects the idct16_8col() path, any other value the
//         32-bit 4-column path.
void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input,
                                         uint16_t *dest, int stride, int bd) {
  __m128i out[16];
  int pass, col, row;

  if (bd == 8) {
    __m128i left[16], right[16];

    // First pass: each iteration consumes 8 input rows (128 coefficients),
    // loaded as two 8x8 tiles at column offsets 0 and 8, and transforms them
    // in place. Iteration 0 fills left[], iteration 1 fills right[].
    for (pass = 0; pass < 2; ++pass) {
      __m128i *const cur = (pass == 0) ? left : right;
      const tran_low_t *const src = input + pass * 128;

      highbd_load_pack_transpose_32bit_8x8(&src[0], 16, &cur[0]);
      highbd_load_pack_transpose_32bit_8x8(&src[8], 16, &cur[8]);
      idct16_8col(cur, cur);
    }

    // Second pass: 8 destination columns per iteration.
    for (col = 0; col < 16; col += 8) {
      uint16_t *const dst = dest + col;

      transpose_16bit_8x8(left + col, out);
      transpose_16bit_8x8(right + col, out + 8);
      idct16_8col(out, out);

      for (row = 0; row < 16; ++row) {
        highbd_write_buffer_8(dst + row * stride, out[row], bd);
      }
    }
  } else {
    __m128i quarter[4][16];

    // First pass: each iteration consumes 4 input rows (64 coefficients),
    // loaded as two 8x4 tiles at column offsets 0 and 8.
    for (pass = 0; pass < 4; ++pass) {
      __m128i *const cur = quarter[pass];
      const tran_low_t *const src = input + pass * 4 * 16;

      highbd_load_transpose_32bit_8x4(&src[0], 16, &cur[0]);
      highbd_load_transpose_32bit_8x4(&src[8], 16, &cur[8]);
      vpx_highbd_idct16_4col_sse4_1(cur);
    }

    // Second pass: 4 destination columns per iteration, gathering one 4x4
    // tile from each first-pass buffer.
    for (col = 0; col < 16; col += 4) {
      uint16_t *const dst = dest + col;

      transpose_32bit_4x4(quarter[0] + col, out + 0);
      transpose_32bit_4x4(quarter[1] + col, out + 4);
      transpose_32bit_4x4(quarter[2] + col, out + 8);
      transpose_32bit_4x4(quarter[3] + col, out + 12);
      vpx_highbd_idct16_4col_sse4_1(out);

      for (row = 0; row < 16; ++row) {
        highbd_write_buffer_4(dst + row * stride, out[row], bd);
      }
    }
  }
}
255
// 16x16 inverse transform that only reads the top-left 8x8 coefficients of
// input, with the result handed to the highbd_write_buffer_* helpers
// together with bd.
//
// input:  coefficients, read with a row pitch of 16; only rows 0..7 and
//         columns 0..7 are loaded.
// dest:   destination pixels, addressed as dest[row * stride + column].
// stride: distance between destination rows, in uint16_t elements.
// bd:     bit depth; 8 selects the idct16_8col() path, any other value the
//         32-bit 4-column path.
void vpx_highbd_idct16x16_38_add_sse4_1(const tran_low_t *input, uint16_t *dest,
                                        int stride, int bd) {
  __m128i out[16];
  int pass, col, row;

  if (bd == 8) {
    __m128i coef[16], pass1[16];

    // First pass: one 8x8 tile goes into coef[0..7]; coef[8..15] are zeroed
    // before the 16-point transform.
    highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &coef[0]);
    for (pass = 8; pass < 16; pass++) {
      coef[pass] = _mm_setzero_si128();
    }
    idct16_8col(coef, pass1);

    // Second pass: 8 destination columns per iteration. Only coef[0..7] are
    // refreshed by the transpose call.
    for (col = 0; col < 16; col += 8) {
      uint16_t *const dst = dest + col;

      transpose_16bit_8x8(pass1 + col, coef);
      idct16_8col(coef, out);

      for (row = 0; row < 16; ++row) {
        highbd_write_buffer_8(dst + row * stride, out[row], bd);
      }
    }
  } else {
    __m128i half[2][16];

    // First pass: each iteration consumes 4 input rows (64 coefficients),
    // of which a single 8x4 tile is loaded.
    for (pass = 0; pass < 2; ++pass) {
      __m128i *const cur = half[pass];

      highbd_load_transpose_32bit_8x4(input + pass * 4 * 16, 16, cur);
      highbd_idct16x16_38_4col(cur);
    }

    // Second pass: 4 destination columns per iteration; the two 4x4 tiles
    // fill out[0..7], which are the only inputs the reduced transform reads.
    for (col = 0; col < 16; col += 4) {
      uint16_t *const dst = dest + col;

      transpose_32bit_4x4(half[0] + col, out + 0);
      transpose_32bit_4x4(half[1] + col, out + 4);
      highbd_idct16x16_38_4col(out);

      for (row = 0; row < 16; ++row) {
        highbd_write_buffer_4(dst + row * stride, out[row], bd);
      }
    }
  }
}
303
vpx_highbd_idct16x16_10_add_sse4_1(const tran_low_t * input,uint16_t * dest,int stride,int bd)304 void vpx_highbd_idct16x16_10_add_sse4_1(const tran_low_t *input, uint16_t *dest,
305 int stride, int bd) {
306 int i;
307 __m128i out[16];
308
309 if (bd == 8) {
310 __m128i in[16], l[16];
311
312 in[0] = load_pack_8_32bit(input + 0 * 16);
313 in[1] = load_pack_8_32bit(input + 1 * 16);
314 in[2] = load_pack_8_32bit(input + 2 * 16);
315 in[3] = load_pack_8_32bit(input + 3 * 16);
316
317 idct16x16_10_pass1(in, l);
318
319 for (i = 0; i < 16; i += 8) {
320 int j;
321 idct16x16_10_pass2(l + i, in);
322
323 for (j = 0; j < 16; ++j) {
324 highbd_write_buffer_8(dest + j * stride, in[j], bd);
325 }
326 dest += 8;
327 }
328 } else {
329 __m128i all[2][16], *in;
330
331 for (i = 0; i < 2; i++) {
332 in = all[i];
333 highbd_load_transpose_32bit_4x4(input, 16, in);
334 highbd_idct16x16_10_4col(in);
335 input += 4 * 16;
336 }
337
338 for (i = 0; i < 16; i += 4) {
339 int j;
340 transpose_32bit_4x4(&all[0][i], out);
341 highbd_idct16x16_10_4col(out);
342
343 for (j = 0; j < 16; ++j) {
344 highbd_write_buffer_4(dest + j * stride, out[j], bd);
345 }
346 dest += 4;
347 }
348 }
349 }
350