xref: /aosp_15_r20/external/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
11 #include <smmintrin.h>  // SSE4.1
12 
13 #include "./vpx_dsp_rtcd.h"
14 #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
15 #include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
16 #include "vpx_dsp/x86/inv_txfm_sse2.h"
17 #include "vpx_dsp/x86/transpose_sse2.h"
18 #include "vpx_dsp/x86/txfm_common_sse2.h"
19 
highbd_idct16_4col_stage5(const __m128i * const in,__m128i * const out)20 static INLINE void highbd_idct16_4col_stage5(const __m128i *const in,
21                                              __m128i *const out) {
22   // stage 5
23   out[0] = _mm_add_epi32(in[0], in[3]);
24   out[1] = _mm_add_epi32(in[1], in[2]);
25   out[2] = _mm_sub_epi32(in[1], in[2]);
26   out[3] = _mm_sub_epi32(in[0], in[3]);
27   highbd_butterfly_cospi16_sse4_1(in[6], in[5], &out[6], &out[5]);
28   out[8] = _mm_add_epi32(in[8], in[11]);
29   out[9] = _mm_add_epi32(in[9], in[10]);
30   out[10] = _mm_sub_epi32(in[9], in[10]);
31   out[11] = _mm_sub_epi32(in[8], in[11]);
32   out[12] = _mm_sub_epi32(in[15], in[12]);
33   out[13] = _mm_sub_epi32(in[14], in[13]);
34   out[14] = _mm_add_epi32(in[14], in[13]);
35   out[15] = _mm_add_epi32(in[15], in[12]);
36 }
37 
highbd_idct16_4col_stage6(const __m128i * const in,__m128i * const out)38 static INLINE void highbd_idct16_4col_stage6(const __m128i *const in,
39                                              __m128i *const out) {
40   out[0] = _mm_add_epi32(in[0], in[7]);
41   out[1] = _mm_add_epi32(in[1], in[6]);
42   out[2] = _mm_add_epi32(in[2], in[5]);
43   out[3] = _mm_add_epi32(in[3], in[4]);
44   out[4] = _mm_sub_epi32(in[3], in[4]);
45   out[5] = _mm_sub_epi32(in[2], in[5]);
46   out[6] = _mm_sub_epi32(in[1], in[6]);
47   out[7] = _mm_sub_epi32(in[0], in[7]);
48   out[8] = in[8];
49   out[9] = in[9];
50   highbd_butterfly_cospi16_sse4_1(in[13], in[10], &out[13], &out[10]);
51   highbd_butterfly_cospi16_sse4_1(in[12], in[11], &out[12], &out[11]);
52   out[14] = in[14];
53   out[15] = in[15];
54 }
55 
vpx_highbd_idct16_4col_sse4_1(__m128i * const io)56 void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/) {
57   __m128i step1[16], step2[16];
58 
59   // stage 2
60   highbd_butterfly_sse4_1(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8],
61                           &step2[15]);
62   highbd_butterfly_sse4_1(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9],
63                           &step2[14]);
64   highbd_butterfly_sse4_1(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10],
65                           &step2[13]);
66   highbd_butterfly_sse4_1(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11],
67                           &step2[12]);
68 
69   // stage 3
70   highbd_butterfly_sse4_1(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4],
71                           &step1[7]);
72   highbd_butterfly_sse4_1(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5],
73                           &step1[6]);
74   step1[8] = _mm_add_epi32(step2[8], step2[9]);
75   step1[9] = _mm_sub_epi32(step2[8], step2[9]);
76   step1[10] = _mm_sub_epi32(step2[11], step2[10]);
77   step1[11] = _mm_add_epi32(step2[11], step2[10]);
78   step1[12] = _mm_add_epi32(step2[12], step2[13]);
79   step1[13] = _mm_sub_epi32(step2[12], step2[13]);
80   step1[14] = _mm_sub_epi32(step2[15], step2[14]);
81   step1[15] = _mm_add_epi32(step2[15], step2[14]);
82 
83   // stage 4
84   highbd_butterfly_cospi16_sse4_1(io[0], io[8], &step2[0], &step2[1]);
85   highbd_butterfly_sse4_1(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2],
86                           &step2[3]);
87   highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
88                           &step2[9], &step2[14]);
89   highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
90                           &step2[13], &step2[10]);
91   step2[5] = _mm_sub_epi32(step1[4], step1[5]);
92   step1[4] = _mm_add_epi32(step1[4], step1[5]);
93   step2[6] = _mm_sub_epi32(step1[7], step1[6]);
94   step1[7] = _mm_add_epi32(step1[7], step1[6]);
95   step2[8] = step1[8];
96   step2[11] = step1[11];
97   step2[12] = step1[12];
98   step2[15] = step1[15];
99 
100   highbd_idct16_4col_stage5(step2, step1);
101   highbd_idct16_4col_stage6(step1, step2);
102   highbd_idct16_4col_stage7(step2, io);
103 }
104 
highbd_idct16x16_38_4col(__m128i * const io)105 static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) {
106   __m128i step1[16], step2[16];
107   __m128i temp1[2];
108 
109   // stage 2
110   highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8],
111                                   &step2[15]);
112   highbd_partial_butterfly_sse4_1(io[7], -cospi_18_64, cospi_14_64, &step2[9],
113                                   &step2[14]);
114   highbd_partial_butterfly_sse4_1(io[5], cospi_22_64, cospi_10_64, &step2[10],
115                                   &step2[13]);
116   highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11],
117                                   &step2[12]);
118 
119   // stage 3
120   highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4],
121                                   &step1[7]);
122   highbd_partial_butterfly_sse4_1(io[6], -cospi_20_64, cospi_12_64, &step1[5],
123                                   &step1[6]);
124   step1[8] = _mm_add_epi32(step2[8], step2[9]);
125   step1[9] = _mm_sub_epi32(step2[8], step2[9]);
126   step1[10] = _mm_sub_epi32(step2[11], step2[10]);
127   step1[11] = _mm_add_epi32(step2[11], step2[10]);
128   step1[12] = _mm_add_epi32(step2[12], step2[13]);
129   step1[13] = _mm_sub_epi32(step2[12], step2[13]);
130   step1[14] = _mm_sub_epi32(step2[15], step2[14]);
131   step1[15] = _mm_add_epi32(step2[15], step2[14]);
132 
133   // stage 4
134   extend_64bit(io[0], temp1);
135   step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64);
136   step2[1] = step2[0];
137   highbd_partial_butterfly_sse4_1(io[4], cospi_24_64, cospi_8_64, &step2[2],
138                                   &step2[3]);
139   highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
140                           &step2[9], &step2[14]);
141   highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
142                           &step2[13], &step2[10]);
143   step2[5] = _mm_sub_epi32(step1[4], step1[5]);
144   step1[4] = _mm_add_epi32(step1[4], step1[5]);
145   step2[6] = _mm_sub_epi32(step1[7], step1[6]);
146   step1[7] = _mm_add_epi32(step1[7], step1[6]);
147   step2[8] = step1[8];
148   step2[11] = step1[11];
149   step2[12] = step1[12];
150   step2[15] = step1[15];
151 
152   highbd_idct16_4col_stage5(step2, step1);
153   highbd_idct16_4col_stage6(step1, step2);
154   highbd_idct16_4col_stage7(step2, io);
155 }
156 
highbd_idct16x16_10_4col(__m128i * const io)157 static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) {
158   __m128i step1[16], step2[16];
159   __m128i temp[2];
160 
161   // stage 2
162   highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8],
163                                   &step2[15]);
164   highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11],
165                                   &step2[12]);
166 
167   // stage 3
168   highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4],
169                                   &step1[7]);
170   step1[8] = step2[8];
171   step1[9] = step2[8];
172   step1[10] = step2[11];
173   step1[11] = step2[11];
174   step1[12] = step2[12];
175   step1[13] = step2[12];
176   step1[14] = step2[15];
177   step1[15] = step2[15];
178 
179   // stage 4
180   extend_64bit(io[0], temp);
181   step2[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64);
182   step2[1] = step2[0];
183   step2[2] = _mm_setzero_si128();
184   step2[3] = _mm_setzero_si128();
185   highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64,
186                           &step2[9], &step2[14]);
187   highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64,
188                           &step2[13], &step2[10]);
189   step2[5] = step1[4];
190   step2[6] = step1[7];
191   step2[8] = step1[8];
192   step2[11] = step1[11];
193   step2[12] = step1[12];
194   step2[15] = step1[15];
195 
196   highbd_idct16_4col_stage5(step2, step1);
197   highbd_idct16_4col_stage6(step1, step2);
198   highbd_idct16_4col_stage7(step2, io);
199 }
200 
// 16x16 inverse DCT that reads the complete coefficient block and passes the
// result, one row vector at a time, to the highbd_write_buffer_* helpers.
//
//   input   16x16 coefficients, 16 values per row.
//   dest    top-left element of the 16x16 destination area.
//   stride  distance between destination rows, in uint16_t elements.
//   bd      bit depth. 8 selects the path built on idct16_8col(); any other
//           value selects the 32-bit path built on
//           vpx_highbd_idct16_4col_sse4_1().
void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input,
                                         uint16_t *dest, int stride, int bd) {
  __m128i out[16];
  int k;

  if (bd == 8) {
    // First pass: two groups of eight input rows, each loaded as two 8x8
    // tiles (columns 0..7 and 8..15) and transformed in place.
    __m128i half[2][16];

    for (k = 0; k < 2; ++k) {
      const tran_low_t *const src = input + k * (8 * 16);
      __m128i *const vec = half[k];
      highbd_load_pack_transpose_32bit_8x8(&src[0], 16, &vec[0]);
      highbd_load_pack_transpose_32bit_8x8(&src[8], 16, &vec[8]);
      idct16_8col(vec, vec);
    }

    // Second pass: eight destination columns per iteration.
    for (k = 0; k < 16; k += 8) {
      uint16_t *const dst = dest + k;
      int j;
      transpose_16bit_8x8(half[0] + k, out);
      transpose_16bit_8x8(half[1] + k, out + 8);
      idct16_8col(out, out);

      for (j = 0; j < 16; ++j) {
        highbd_write_buffer_8(dst + j * stride, out[j], bd);
      }
    }
  } else {
    // First pass: four groups of four input rows, each loaded as two 8x4
    // tiles (columns 0..7 and 8..15) and transformed in place.
    __m128i quarter[4][16];

    for (k = 0; k < 4; ++k) {
      const tran_low_t *const src = input + k * (4 * 16);
      __m128i *const vec = quarter[k];
      highbd_load_transpose_32bit_8x4(&src[0], 16, &vec[0]);
      highbd_load_transpose_32bit_8x4(&src[8], 16, &vec[8]);
      vpx_highbd_idct16_4col_sse4_1(vec);
    }

    // Second pass: four destination columns per iteration.
    for (k = 0; k < 16; k += 4) {
      uint16_t *const dst = dest + k;
      int j;
      for (j = 0; j < 4; ++j) {
        transpose_32bit_4x4(quarter[j] + k, out + 4 * j);
      }
      vpx_highbd_idct16_4col_sse4_1(out);

      for (j = 0; j < 16; ++j) {
        highbd_write_buffer_4(dst + j * stride, out[j], bd);
      }
    }
  }
}
255 
// 16x16 inverse DCT that reads only the top-left 8x8 coefficients of |input|
// and passes the result, one row vector at a time, to the
// highbd_write_buffer_* helpers.
//
//   input   16x16 coefficient block, 16 values per row; only rows 0..7,
//           columns 0..7 are loaded.
//   dest    top-left element of the 16x16 destination area.
//   stride  distance between destination rows, in uint16_t elements.
//   bd      bit depth. 8 selects the path built on idct16_8col(); any other
//           value selects the 32-bit path built on highbd_idct16x16_38_4col().
void vpx_highbd_idct16x16_38_add_sse4_1(const tran_low_t *input, uint16_t *dest,
                                        int stride, int bd) {
  __m128i out[16];
  int col;

  if (bd == 8) {
    __m128i in[16], pass1[16];
    int k;

    // First pass: one 8x8 tile goes into in[0..7]; in[8..15] is cleared
    // explicitly before the 16-point kernel runs.
    highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
    for (k = 8; k < 16; ++k) {
      in[k] = _mm_setzero_si128();
    }
    idct16_8col(in, pass1);

    // Second pass: eight destination columns per iteration. Only a single
    // 8x8 transpose into |in| is issued each time; in[8..15] is not stored to
    // again by this function.
    for (col = 0; col < 16; col += 8) {
      uint16_t *const dst = dest + col;
      int j;
      transpose_16bit_8x8(pass1 + col, in);
      idct16_8col(in, out);

      for (j = 0; j < 16; ++j) {
        highbd_write_buffer_8(dst + j * stride, out[j], bd);
      }
    }
  } else {
    __m128i pass1[2][16];
    int k;

    // First pass: two groups of four input rows, one 8x4 tile each.
    for (k = 0; k < 2; ++k) {
      highbd_load_transpose_32bit_8x4(input + k * (4 * 16), 16, pass1[k]);
      highbd_idct16x16_38_4col(pass1[k]);
    }

    // Second pass: four destination columns per iteration. Only out[0..7] is
    // filled before the call; highbd_idct16x16_38_4col() reads no other
    // input elements.
    for (col = 0; col < 16; col += 4) {
      uint16_t *const dst = dest + col;
      int j;
      transpose_32bit_4x4(pass1[0] + col, out + 0);
      transpose_32bit_4x4(pass1[1] + col, out + 4);
      highbd_idct16x16_38_4col(out);

      for (j = 0; j < 16; ++j) {
        highbd_write_buffer_4(dst + j * stride, out[j], bd);
      }
    }
  }
}
303 
vpx_highbd_idct16x16_10_add_sse4_1(const tran_low_t * input,uint16_t * dest,int stride,int bd)304 void vpx_highbd_idct16x16_10_add_sse4_1(const tran_low_t *input, uint16_t *dest,
305                                         int stride, int bd) {
306   int i;
307   __m128i out[16];
308 
309   if (bd == 8) {
310     __m128i in[16], l[16];
311 
312     in[0] = load_pack_8_32bit(input + 0 * 16);
313     in[1] = load_pack_8_32bit(input + 1 * 16);
314     in[2] = load_pack_8_32bit(input + 2 * 16);
315     in[3] = load_pack_8_32bit(input + 3 * 16);
316 
317     idct16x16_10_pass1(in, l);
318 
319     for (i = 0; i < 16; i += 8) {
320       int j;
321       idct16x16_10_pass2(l + i, in);
322 
323       for (j = 0; j < 16; ++j) {
324         highbd_write_buffer_8(dest + j * stride, in[j], bd);
325       }
326       dest += 8;
327     }
328   } else {
329     __m128i all[2][16], *in;
330 
331     for (i = 0; i < 2; i++) {
332       in = all[i];
333       highbd_load_transpose_32bit_4x4(input, 16, in);
334       highbd_idct16x16_10_4col(in);
335       input += 4 * 16;
336     }
337 
338     for (i = 0; i < 16; i += 4) {
339       int j;
340       transpose_32bit_4x4(&all[0][i], out);
341       highbd_idct16x16_10_4col(out);
342 
343       for (j = 0; j < 16; ++j) {
344         highbd_write_buffer_4(dest + j * stride, out[j], bd);
345       }
346       dest += 4;
347     }
348   }
349 }
350