// xref: /aosp_15_r20/external/libaom/av1/common/x86/av1_inv_txfm_ssse3.h (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_

#include <assert.h>
#include <emmintrin.h>  // SSE2
#include <tmmintrin.h>  // SSSE3

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/x86/transpose_sse2.h"

#ifdef __cplusplus
extern "C" {
#endif

// Multiplies every 16-bit lane of `in` by two scalar weights.
//
// _mm_mulhrs_epi16(a, b) evaluates (a * b + (1 << 14)) >> 15 per lane, so
// with each weight pre-multiplied by 8 the results are
//   out0 = (in * w0 + (1 << 11)) >> 12
//   out1 = (in * w1 + (1 << 11)) >> 12
// (w0) * 8 and (w1) * 8 are narrowed to 16 bits by _mm_set1_epi16, so they
// must fit in int16_t.
//
// `in` is copied before either output is written, so out0 or out1 may name
// the same variable as `in`. The arguments are parenthesized so compound
// expressions such as `a + b` are scaled as a whole.
#define btf_16_ssse3(w0, w1, in, out0, out1)      \
  do {                                            \
    const __m128i _w0 = _mm_set1_epi16((w0) * 8); \
    const __m128i _w1 = _mm_set1_epi16((w1) * 8); \
    const __m128i _in = (in);                     \
    out0 = _mm_mulhrs_epi16(_in, _w0);            \
    out1 = _mm_mulhrs_epi16(_in, _w1);            \
  } while (0)

// In-place butterfly on 16-bit lanes with signed saturation:
//   in0 <- sat(in0 + in1),  in1 <- sat(in0 - in1)
// Both results are computed from copies of the original inputs, so the
// update of in0 does not affect the value written to in1.
#define btf_16_adds_subs_sse2(in0, in1) \
  do {                                  \
    const __m128i _in0 = (in0);         \
    const __m128i _in1 = (in1);         \
    in0 = _mm_adds_epi16(_in0, _in1);   \
    in1 = _mm_subs_epi16(_in0, _in1);   \
  } while (0)

// In-place butterfly on 16-bit lanes with signed saturation:
//   in1 <- sat(in0 - in1),  in0 <- sat(in0 + in1)
// Both results are computed from copies of the original inputs.
// NOTE(review): because of those copies this produces the same values as
// btf_16_adds_subs_sse2; only the order of the two stores differs.
#define btf_16_subs_adds_sse2(in0, in1) \
  do {                                  \
    const __m128i _in0 = (in0);         \
    const __m128i _in1 = (in1);         \
    in1 = _mm_subs_epi16(_in0, _in1);   \
    in0 = _mm_adds_epi16(_in0, _in1);   \
  } while (0)

// Butterfly on 16-bit lanes with signed saturation, written to separate
// outputs:
//   out0 <- sat(in0 + in1),  out1 <- sat(in0 - in1)
// The inputs are copied first, so out0/out1 may name the same variables as
// in0/in1.
#define btf_16_adds_subs_out_sse2(out0, out1, in0, in1) \
  do {                                                  \
    const __m128i _in0 = (in0);                         \
    const __m128i _in1 = (in1);                         \
    out0 = _mm_adds_epi16(_in0, _in1);                  \
    out1 = _mm_subs_epi16(_in0, _in1);                  \
  } while (0)

// Shifts every 16-bit lane of in[0..size-1] in place.
//
//   bit < 0 : rounding right shift by -bit. _mm_mulhrs_epi16 computes
//             (x * scale + (1 << 14)) >> 15; with scale = 1 << (15 + bit)
//             that equals (x + (1 << (-bit - 1))) >> -bit.
//   bit == 0: the vectors are left untouched.
//   bit > 0 : plain left shift by `bit` (no saturation).
//
// NOTE(review): the negative branch needs 15 + bit >= 0, i.e. bit >= -15;
// nothing here enforces that range - confirm callers respect it.
static inline void round_shift_16bit_ssse3(__m128i *in, int size, int bit) {
  if (bit < 0) {
    const __m128i scale = _mm_set1_epi16(1 << (15 + bit));
    for (int i = 0; i < size; ++i) {
      in[i] = _mm_mulhrs_epi16(in[i], scale);
    }
  } else if (bit > 0) {
    for (int i = 0; i < size; ++i) {
      in[i] = _mm_slli_epi16(in[i], bit);
    }
  }
}

// 1D itx types
// IFLIPADST_1D is an alias of IADST_1D (both have value 1), so the distinct
// values are IDCT_1D == 0, IADST_1D == 1, IIDENTITY_1D == 2, and
// ITX_TYPES_1D == 3 is the count of distinct types.
// UENUM1BYTE is defined outside this header; it supplies the ITX_TYPE_1D
// type name used by the tables below.
enum {
  IDCT_1D,
  IADST_1D,
  IFLIPADST_1D = IADST_1D,
  IIDENTITY_1D,
  ITX_TYPES_1D,
} UENUM1BYTE(ITX_TYPE_1D);

82 static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
83   IDCT_1D,      IADST_1D,     IDCT_1D,      IADST_1D,
84   IFLIPADST_1D, IDCT_1D,      IFLIPADST_1D, IADST_1D,
85   IFLIPADST_1D, IIDENTITY_1D, IDCT_1D,      IIDENTITY_1D,
86   IADST_1D,     IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
87 };
88 
89 static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
90   IDCT_1D,      IDCT_1D,      IADST_1D,     IADST_1D,
91   IDCT_1D,      IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
92   IADST_1D,     IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
93   IIDENTITY_1D, IADST_1D,     IIDENTITY_1D, IFLIPADST_1D,
94 };
95 
96 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
97   0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
98 };
99 
100 DECLARE_ALIGNED(16, static const int16_t,
101                 av1_eob_to_eobxy_16x16_default[16]) = {
102   0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
103   0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
104 };
105 
106 DECLARE_ALIGNED(16, static const int16_t,
107                 av1_eob_to_eobxy_32x32_default[32]) = {
108   0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
109   0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
110   0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
111   0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
112 };
113 
114 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
115   0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
116   0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
117 };
118 
119 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
120   0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
121 };
122 
123 DECLARE_ALIGNED(16, static const int16_t,
124                 av1_eob_to_eobxy_16x32_default[32]) = {
125   0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
126   0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
127   0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
128   0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
129 };
130 
131 DECLARE_ALIGNED(16, static const int16_t,
132                 av1_eob_to_eobxy_32x16_default[16]) = {
133   0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
134   0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
135 };
136 
137 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
138   0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
139   0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
140   0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
141   0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
142 };
143 
144 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
145   0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
146 };
147 
148 DECLARE_ALIGNED(16, static const int16_t *,
149                 av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
150   NULL,
151   av1_eob_to_eobxy_8x8_default,
152   av1_eob_to_eobxy_16x16_default,
153   av1_eob_to_eobxy_32x32_default,
154   av1_eob_to_eobxy_32x32_default,
155   NULL,
156   NULL,
157   av1_eob_to_eobxy_8x16_default,
158   av1_eob_to_eobxy_16x8_default,
159   av1_eob_to_eobxy_16x32_default,
160   av1_eob_to_eobxy_32x16_default,
161   av1_eob_to_eobxy_32x32_default,
162   av1_eob_to_eobxy_32x32_default,
163   NULL,
164   NULL,
165   av1_eob_to_eobxy_8x32_default,
166   av1_eob_to_eobxy_32x8_default,
167   av1_eob_to_eobxy_16x32_default,
168   av1_eob_to_eobxy_32x16_default,
169 };
170 
// Buckets an index in [0, 31] into four classes:
//   0 -> 0,  1..7 -> 1,  8..15 -> 2,  16..31 -> 3
// The class boundaries line up with the 0 / 7 / 15 / 31 values of eob_fill.
// NOTE(review): not referenced inside this header; presumably indexed by an
// eobx/eoby value to pick a reduced-input 1D kernel - confirm at call sites.
static const int lowbd_txfm_all_1d_zeros_idx[32] = {
  0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
};

176 // Transform block width in log2 for eob (size of 64 map to 32)
177 static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
178   2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
179 };
180 
// Looks up the column/row extents for a block of size `tx_size` whose
// end-of-block position is `eob`, writing them to *eobx and *eoby.
//
// eob == 1 yields (0, 0). Otherwise the row index
// (eob - 1) >> tx_size_wide_log2_eob[tx_size] selects a packed entry from
// av1_eob_to_eobxy_default[tx_size]; its low byte is stored to *eobx and its
// high byte to *eoby.
//
// Preconditions: tx_size must have a non-NULL table in
// av1_eob_to_eobxy_default (checked by assert) unless eob == 1.
// NOTE(review): eob is presumably >= 1 here; eob <= 0 would index before the
// start of the table - confirm callers never pass it.
static inline void get_eobx_eoby_scan_default(int *eobx, int *eoby,
                                              TX_SIZE tx_size, int eob) {
  if (eob == 1) {
    *eobx = 0;
    *eoby = 0;
    return;
  }

  const int16_t *const eobxy_tab = av1_eob_to_eobxy_default[tx_size];
  // Some tx_size slots carry no table; fail loudly in debug builds rather
  // than dereferencing NULL below.
  assert(eobxy_tab != NULL);
  const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
  const int eob_row = (eob - 1) >> tx_w_log2;
  const int eobxy = eobxy_tab[eob_row];
  *eobx = eobxy & 0xFF;
  *eoby = eobxy >> 8;
}

// Rounds an index in [0, 31] up to the last index of its 8/16/32 bucket:
//   0 -> 0,  1..7 -> 7,  8..15 -> 15,  16..31 -> 31
// Read by get_eobx_eoby_scan_h_identity() and
// get_eobx_eoby_scan_v_identity(). Declared const: it is a pure lookup table
// and a writable copy per translation unit serves no purpose.
static const int eob_fill[32] = {
  0,  7,  7,  7,  7,  7,  7,  7,  15, 15, 15, 15, 15, 15, 15, 15,
  31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
};

// Derives *eobx and *eoby from `eob` for the horizontal-identity scan.
//
// With e = eob - 1 and W = min(32, tx_size_wide[tx_size]):
//   *eobx = W - 1                 if e >= W - 1, else eob_fill[e]
//   *eoby = eob_fill[e / W]
// tx_size_wide and AOMMIN are defined outside this header.
// NOTE(review): eob is presumably >= 1; eob == 0 would read eob_fill[-1] -
// confirm callers never pass it.
static inline void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
                                                 TX_SIZE tx_size, int eob) {
  eob -= 1;
  const int txfm_size_col = tx_size_wide[tx_size];
  const int eobx_max = AOMMIN(32, txfm_size_col) - 1;
  *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob];
  const int temp_eoby = eob / (eobx_max + 1);
  // eob_fill has 32 entries.
  assert(temp_eoby < 32);
  *eoby = eob_fill[temp_eoby];
}

// Derives *eobx and *eoby from `eob` for the vertical-identity scan.
//
// With e = eob - 1 and H = min(32, tx_size_high[tx_size]):
//   *eobx = eob_fill[e / H]
//   *eoby = H - 1                 if e >= H - 1, else eob_fill[e]
// tx_size_high and AOMMIN are defined outside this header.
// NOTE(review): eob is presumably >= 1; eob == 0 would read eob_fill[-1] -
// confirm callers never pass it.
static inline void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
                                                 TX_SIZE tx_size, int eob) {
  eob -= 1;
  const int txfm_size_row = tx_size_high[tx_size];
  const int eoby_max = AOMMIN(32, txfm_size_row) - 1;
  const int temp_eobx = eob / (eoby_max + 1);
  // eob_fill has 32 entries; same bound check as the h_identity variant.
  assert(temp_eobx < 32);
  *eobx = eob_fill[temp_eobx];
  *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
}

221 typedef void (*transform_1d_ssse3)(const __m128i *input, __m128i *output);
222 
223 void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
224                                     int stride, TX_TYPE tx_type,
225                                     TX_SIZE tx_size, int eob);
226 
227 void av1_lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, uint8_t *output,
228                                          int stride, TX_SIZE tx_size);
229 
230 void av1_lowbd_inv_txfm2d_add_h_identity_ssse3(const int32_t *input,
231                                                uint8_t *output, int stride,
232                                                TX_TYPE tx_type, TX_SIZE tx_size,
233                                                int eob);
234 void av1_lowbd_inv_txfm2d_add_v_identity_ssse3(const int32_t *input,
235                                                uint8_t *output, int stride,
236                                                TX_TYPE tx_type, TX_SIZE tx_size,
237                                                int eob);
238 
239 void av1_iadst8_low1_ssse3(const __m128i *input, __m128i *output);
240 
241 void av1_idct8_low1_ssse3(const __m128i *input, __m128i *output);
242 
#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_