1 /*
2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11 #ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
12 #define AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
13
#include <assert.h>
#include <emmintrin.h>  // SSE2
#include <tmmintrin.h>  // SSSE3

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/x86/transpose_sse2.h"
22
23 #ifdef __cplusplus
24 extern "C" {
25 #endif
26
// Multiplies every 16-bit lane of `in` by two scalar weights:
//   out0 = _mm_mulhrs_epi16(in, w0 * 8)
//   out1 = _mm_mulhrs_epi16(in, w1 * 8)
// Each weight is multiplied by 8 and broadcast with _mm_set1_epi16 first.
// `in` is copied to a temporary before out0/out1 are written, so an output
// may name the same variable as `in`.
// w0, w1 and in are parenthesized so that expression arguments (for example
// `a + b`) are scaled as a whole rather than being split by operator
// precedence.
#define btf_16_ssse3(w0, w1, in, out0, out1) \
  do { \
    const __m128i _w0 = _mm_set1_epi16((w0) * 8); \
    const __m128i _w1 = _mm_set1_epi16((w1) * 8); \
    const __m128i _in = (in); \
    (out0) = _mm_mulhrs_epi16(_in, _w0); \
    (out1) = _mm_mulhrs_epi16(_in, _w1); \
  } while (0)
35
// In-place butterfly on two vectors of 16-bit lanes using saturating
// arithmetic: in0 receives in0 + in1 and in1 receives in0 - in1. Both results
// are computed from the values the operands held on entry.
#define btf_16_adds_subs_sse2(in0, in1) \
  do { \
    const __m128i btf_a_ = in0; \
    const __m128i btf_b_ = in1; \
    const __m128i btf_sum_ = _mm_adds_epi16(btf_a_, btf_b_); \
    const __m128i btf_diff_ = _mm_subs_epi16(btf_a_, btf_b_); \
    in0 = btf_sum_; \
    in1 = btf_diff_; \
  } while (0)
43
// Same in-place saturating butterfly as btf_16_adds_subs_sse2 (in0 receives
// in0 + in1, in1 receives in0 - in1, both from the entry values), except that
// the difference is stored to in1 before the sum is stored to in0.
#define btf_16_subs_adds_sse2(in0, in1) \
  do { \
    const __m128i btf_a_ = in0; \
    const __m128i btf_b_ = in1; \
    const __m128i btf_diff_ = _mm_subs_epi16(btf_a_, btf_b_); \
    const __m128i btf_sum_ = _mm_adds_epi16(btf_a_, btf_b_); \
    in1 = btf_diff_; \
    in0 = btf_sum_; \
  } while (0)
51
// Out-of-place saturating butterfly on two vectors of 16-bit lanes:
// out0 receives in0 + in1 and out1 receives in0 - in1. The inputs are copied
// to temporaries first, so an output may name the same variable as an input.
#define btf_16_adds_subs_out_sse2(out0, out1, in0, in1) \
  do { \
    const __m128i btf_a_ = in0; \
    const __m128i btf_b_ = in1; \
    const __m128i btf_sum_ = _mm_adds_epi16(btf_a_, btf_b_); \
    const __m128i btf_diff_ = _mm_subs_epi16(btf_a_, btf_b_); \
    out0 = btf_sum_; \
    out1 = btf_diff_; \
  } while (0)
59
// Shifts every 16-bit lane of the `size` vectors in `in`, in place.
//   bit == 0: the vectors are left untouched.
//   bit  > 0: each lane is shifted left by `bit` (_mm_slli_epi16).
//   bit  < 0: each lane is multiplied by 1 << (15 + bit) with
//             _mm_mulhrs_epi16, i.e. a rounding right shift by -bit.
static inline void round_shift_16bit_ssse3(__m128i *in, int size, int bit) {
  if (bit == 0) return;

  if (bit > 0) {
    for (int k = 0; k < size; k++) {
      in[k] = _mm_slli_epi16(in[k], bit);
    }
    return;
  }

  // Negative shift: the scale is loop-invariant, so broadcast it once.
  const __m128i rounding_scale = _mm_set1_epi16(1 << (15 + bit));
  for (int k = 0; k < size; k++) {
    in[k] = _mm_mulhrs_epi16(in[k], rounding_scale);
  }
}
72
// 1D inverse-transform kinds.
// IFLIPADST_1D is an alias of IADST_1D (same value), so ITX_TYPES_1D counts
// three distinct kinds.
enum {
  IDCT_1D = 0,
  IADST_1D = 1,
  IFLIPADST_1D = IADST_1D,
  IIDENTITY_1D = 2,
  ITX_TYPES_1D = 3,
} UENUM1BYTE(ITX_TYPE_1D);
81
// Lookup table with one ITX_TYPE_1D entry per TX_TYPES value.
// NOTE(review): presumably indexed by TX_TYPE and giving the 1D kind used in
// the vertical direction — confirm against the TX_TYPE enum and the callers.
static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
  IDCT_1D,      IADST_1D,     IDCT_1D,      IADST_1D,
  IFLIPADST_1D, IDCT_1D,      IFLIPADST_1D, IADST_1D,
  IFLIPADST_1D, IIDENTITY_1D, IDCT_1D,      IIDENTITY_1D,
  IADST_1D,     IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
};
88
// Lookup table with one ITX_TYPE_1D entry per TX_TYPES value.
// NOTE(review): presumably indexed by TX_TYPE and giving the 1D kind used in
// the horizontal direction — confirm against the TX_TYPE enum and the callers.
static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
  IDCT_1D,      IDCT_1D,      IADST_1D,     IADST_1D,
  IDCT_1D,      IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
  IADST_1D,     IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
  IIDENTITY_1D, IADST_1D,     IIDENTITY_1D, IFLIPADST_1D,
};
95
// Packed eob lookup table (8 entries), reached through
// av1_eob_to_eobxy_default. get_eobx_eoby_scan_default reads the low byte of
// an entry as eobx and the high byte as eoby. Every entry here is 0x0707.
DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
};
99
// Packed eob lookup table (16 entries), reached through
// av1_eob_to_eobxy_default. get_eobx_eoby_scan_default reads the low byte of
// an entry as eobx and the high byte as eoby. The first two entries are
// 0x0707; all later entries are 0x0f0f.
DECLARE_ALIGNED(16, static const int16_t,
                av1_eob_to_eobxy_16x16_default[16]) = {
  0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
  0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
};
105
// Packed eob lookup table (32 entries), reached through
// av1_eob_to_eobxy_default (several slots of that table point here).
// get_eobx_eoby_scan_default reads the low byte of an entry as eobx and the
// high byte as eoby. Entry 0 is 0x0707, entries 1-3 are 0x0f0f, and the rest
// are 0x1f1f.
DECLARE_ALIGNED(16, static const int16_t,
                av1_eob_to_eobxy_32x32_default[32]) = {
  0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
  0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
};
113
// Packed eob lookup table (16 entries), reached through
// av1_eob_to_eobxy_default. get_eobx_eoby_scan_default reads the low byte of
// an entry as eobx and the high byte as eoby. The low byte is always 0x07;
// the high byte is 0x07 for the first five entries and 0x0f afterwards.
DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
  0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
};
118
// Packed eob lookup table (8 entries), reached through
// av1_eob_to_eobxy_default. get_eobx_eoby_scan_default reads the low byte of
// an entry as eobx and the high byte as eoby. The high byte is always 0x07;
// the low byte is 0x07 for the first two entries and 0x0f afterwards.
DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
  0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
};
122
// Packed eob lookup table (32 entries), reached through
// av1_eob_to_eobxy_default. get_eobx_eoby_scan_default reads the low byte of
// an entry as eobx and the high byte as eoby. Entries 0-1 are 0x0707,
// entries 2-8 are 0x0f0f, and the rest are 0x1f0f.
DECLARE_ALIGNED(16, static const int16_t,
                av1_eob_to_eobxy_16x32_default[32]) = {
  0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
  0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
  0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
  0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
};
130
// Packed eob lookup table (16 entries), reached through
// av1_eob_to_eobxy_default. get_eobx_eoby_scan_default reads the low byte of
// an entry as eobx and the high byte as eoby. Entry 0 is 0x0707, entries 1-3
// are 0x0f0f, and the rest are 0x0f1f.
DECLARE_ALIGNED(16, static const int16_t,
                av1_eob_to_eobxy_32x16_default[16]) = {
  0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
  0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
};
136
// Packed eob lookup table (32 entries), reached through
// av1_eob_to_eobxy_default. get_eobx_eoby_scan_default reads the low byte of
// an entry as eobx and the high byte as eoby. The low byte is always 0x07;
// the high byte is 0x07 for entries 0-4, 0x0f for entries 5-12, and 0x1f
// afterwards.
DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
  0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
  0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
  0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
  0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
};
143
// Packed eob lookup table (8 entries), reached through
// av1_eob_to_eobxy_default. get_eobx_eoby_scan_default reads the low byte of
// an entry as eobx and the high byte as eoby. The high byte is always 0x07;
// the low byte is 0x07 for entry 0, 0x0f for entries 1-2, and 0x1f afterwards.
DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
  0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
};
147
// Per-transform-size selector of the packed eob lookup table, indexed by
// tx_size in get_eobx_eoby_scan_default. Slots holding NULL have no table, so
// that function must not be reached for them with eob != 1 (it would
// dereference NULL).
// NOTE(review): the size labels on the entries below assume the usual TX_SIZE
// enum order — confirm against the enum definition.
DECLARE_ALIGNED(16, static const int16_t *,
                av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
  NULL,                            // presumably TX_4X4
  av1_eob_to_eobxy_8x8_default,    // presumably TX_8X8
  av1_eob_to_eobxy_16x16_default,  // presumably TX_16X16
  av1_eob_to_eobxy_32x32_default,  // presumably TX_32X32
  av1_eob_to_eobxy_32x32_default,  // presumably TX_64X64
  NULL,                            // presumably TX_4X8
  NULL,                            // presumably TX_8X4
  av1_eob_to_eobxy_8x16_default,   // presumably TX_8X16
  av1_eob_to_eobxy_16x8_default,   // presumably TX_16X8
  av1_eob_to_eobxy_16x32_default,  // presumably TX_16X32
  av1_eob_to_eobxy_32x16_default,  // presumably TX_32X16
  av1_eob_to_eobxy_32x32_default,  // presumably TX_32X64
  av1_eob_to_eobxy_32x32_default,  // presumably TX_64X32
  NULL,                            // presumably TX_4X16
  NULL,                            // presumably TX_16X4
  av1_eob_to_eobxy_8x32_default,   // presumably TX_8X32
  av1_eob_to_eobxy_32x8_default,   // presumably TX_32X8
  av1_eob_to_eobxy_16x32_default,  // presumably TX_16X64
  av1_eob_to_eobxy_32x16_default,  // presumably TX_64X16
};
170
// Maps an index in [0, 31] to a bucket number:
//   0 -> 0, 1..7 -> 1, 8..15 -> 2, 16..31 -> 3.
// NOTE(review): presumably indexed by an eobx/eoby value to choose among
// reduced-size 1D kernels — confirm against the users of this table.
static const int lowbd_txfm_all_1d_zeros_idx[32] = {
  0, 1, 1, 1, 1, 1, 1, 1,  // indices  0..7
  2, 2, 2, 2, 2, 2, 2, 2,  // indices  8..15
  3, 3, 3, 3, 3, 3, 3, 3,  // indices 16..23
  3, 3, 3, 3, 3, 3, 3, 3,  // indices 24..31
};
175
// Transform block width in log2 as used for eob lookups, one entry per
// TX_SIZES_ALL value. Sizes whose width is 64 map to 32, so the largest value
// stored is 5. get_eobx_eoby_scan_default uses it as the shift that turns
// (eob - 1) into a table row index.
static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
  2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
};
180
// Derives the (eobx, eoby) pair for `eob` from the packed per-size tables.
//
//   eobx, eoby: outputs; receive the low and high byte of the table entry.
//   tx_size:    selects the table (av1_eob_to_eobxy_default) and the row
//               shift (tx_size_wide_log2_eob).
//   eob:        eob == 1 short-circuits to (0, 0) without any table lookup.
//
// For any other eob the table slot for tx_size must be non-NULL; several
// slots of av1_eob_to_eobxy_default are NULL, and reaching this point with
// one of them would dereference a null pointer. The assert makes that
// precondition explicit in debug builds.
static inline void get_eobx_eoby_scan_default(int *eobx, int *eoby,
                                              TX_SIZE tx_size, int eob) {
  if (eob == 1) {
    *eobx = 0;
    *eoby = 0;
    return;
  }

  const int16_t *const eobxy_tab = av1_eob_to_eobxy_default[tx_size];
  assert(eobxy_tab != NULL);

  const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
  // Row of the table that holds the entry for the last coefficient position.
  const int eob_row = (eob - 1) >> tx_w_log2;
  const int eobxy = eobxy_tab[eob_row];
  *eobx = eobxy & 0xFF;
  *eoby = eobxy >> 8;
}
195
// Rounds an index in [0, 31] up to a fill boundary:
//   0 -> 0, 1..7 -> 7, 8..15 -> 15, 16..31 -> 31.
// Read by get_eobx_eoby_scan_h_identity and get_eobx_eoby_scan_v_identity.
// Declared const: it is a read-only lookup table, and because it is defined
// in a header every including translation unit gets its own copy.
static const int eob_fill[32] = {
  0,  7,  7,  7,  7,  7,  7,  7,  15, 15, 15, 15, 15, 15, 15, 15,
  31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
};
200
// Computes (eobx, eoby) from `eob` using the block width of tx_size capped
// at 32.
//   eobx: width - 1 when (eob - 1) reaches that value, otherwise
//         eob_fill[eob - 1].
//   eoby: eob_fill[(eob - 1) / width]; the quotient is asserted to be < 32.
// eob is expected to be at least 1, since eob - 1 is used as a table index.
static inline void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
                                                 TX_SIZE tx_size, int eob) {
  const int last_pos = eob - 1;
  const int capped_cols = AOMMIN(32, tx_size_wide[tx_size]);
  const int eobx_max = capped_cols - 1;
  const int row_idx = last_pos / capped_cols;

  if (last_pos >= eobx_max) {
    *eobx = eobx_max;
  } else {
    *eobx = eob_fill[last_pos];
  }

  assert(row_idx < 32);
  *eoby = eob_fill[row_idx];
}
211
// Computes (eobx, eoby) from `eob` using the block height of tx_size capped
// at 32.
//   eobx: eob_fill[(eob - 1) / height]; the quotient is asserted to be < 32,
//         matching the check in get_eobx_eoby_scan_h_identity.
//   eoby: height - 1 when (eob - 1) reaches that value, otherwise
//         eob_fill[eob - 1].
// eob is expected to be at least 1, since eob - 1 is used as a table index.
static inline void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
                                                 TX_SIZE tx_size, int eob) {
  eob -= 1;
  const int txfm_size_row = tx_size_high[tx_size];
  const int eoby_max = AOMMIN(32, txfm_size_row) - 1;
  const int temp_eobx = eob / (eoby_max + 1);
  // eob_fill has 32 entries; guard the quotient before indexing with it.
  assert(temp_eobx < 32);
  *eobx = eob_fill[temp_eobx];
  *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
}
220
// Function-pointer type for a 1D transform kernel: reads vectors from `input`
// (not modified through this pointer) and writes vectors to `output`.
// NOTE(review): the number of vectors each kernel reads and writes is not
// visible here — confirm against the individual kernels.
typedef void (*transform_1d_ssse3)(const __m128i *input, __m128i *output);
222
// Declaration only; the definition lives outside this header.
// NOTE(review): presumably applies the low-bitdepth 2D inverse transform of
// kind tx_type / size tx_size to the coefficients in `input` (eob giving the
// end-of-block position) and adds the result into the 8-bit `output` buffer
// with row pitch `stride` — confirm against the implementation.
void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
                                    int stride, TX_TYPE tx_type,
                                    TX_SIZE tx_size, int eob);
226
// Declaration only; the definition lives outside this header. Unlike the
// other 2D entry points declared here it takes neither a tx_type nor an eob.
// NOTE(review): presumably the identity-in-both-directions variant of the
// low-bitdepth inverse transform-and-add — confirm against the
// implementation.
void av1_lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input, uint8_t *output,
                                         int stride, TX_SIZE tx_size);
229
// Declaration only; the definition lives outside this header.
// NOTE(review): presumably the low-bitdepth inverse transform-and-add for
// transform types whose horizontal 1D transform is the identity — confirm
// against the implementation.
void av1_lowbd_inv_txfm2d_add_h_identity_ssse3(const int32_t *input,
                                               uint8_t *output, int stride,
                                               TX_TYPE tx_type, TX_SIZE tx_size,
                                               int eob);
// Declaration only; the definition lives outside this header.
// NOTE(review): presumably the low-bitdepth inverse transform-and-add for
// transform types whose vertical 1D transform is the identity — confirm
// against the implementation.
void av1_lowbd_inv_txfm2d_add_v_identity_ssse3(const int32_t *input,
                                               uint8_t *output, int stride,
                                               TX_TYPE tx_type, TX_SIZE tx_size,
                                               int eob);
238
// Declaration only; its signature matches the transform_1d_ssse3 kernel type.
// NOTE(review): presumably the 8-point inverse ADST specialised for inputs
// where only the first coefficient is non-zero — confirm against the
// implementation.
void av1_iadst8_low1_ssse3(const __m128i *input, __m128i *output);
240
// Declaration only; its signature matches the transform_1d_ssse3 kernel type.
// NOTE(review): presumably the 8-point inverse DCT specialised for inputs
// where only the first coefficient is non-zero — confirm against the
// implementation.
void av1_idct8_low1_ssse3(const __m128i *input, __m128i *output);
242
243 #ifdef __cplusplus
244 } // extern "C"
245 #endif
246
247 #endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
248