1 // Copyright 2020 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/dsp/cdef.h"
16 #include "src/utils/cpu.h"
17
18 #if LIBGAV1_TARGETING_SSE4_1
19
20 #include <emmintrin.h>
21 #include <tmmintrin.h>
22
23 #include <algorithm>
24 #include <cassert>
25 #include <cstddef>
26 #include <cstdint>
27 #include <cstdlib>
28
29 #include "src/dsp/constants.h"
30 #include "src/dsp/dsp.h"
31 #include "src/dsp/x86/common_sse4.h"
32 #include "src/dsp/x86/transpose_sse4.h"
33 #include "src/utils/common.h"
34 #include "src/utils/constants.h"
35
36 namespace libgav1 {
37 namespace dsp {
38 namespace low_bitdepth {
39 namespace {
40
41 #include "src/dsp/cdef.inc"
42
// Multipliers used when calculating the odd |cost[x]| values. The table is
// split into the two 4-element vectors that CostOdd() multiplies by: the first
// holds division table elements 1 3 5 7 and the second repeats element 7.
alignas(16) constexpr uint32_t kCdefDivisionTableOddPadded[8] = {
    // Elements 1, 3, 5, 7.
    420, 210, 140, 105,
    // Element 7, four times.
    105, 105, 105, 105};
47
48 // ----------------------------------------------------------------------------
49 // Refer to CdefDirection_C().
50 //
51 // int32_t partial[8][15] = {};
52 // for (int i = 0; i < 8; ++i) {
53 // for (int j = 0; j < 8; ++j) {
54 // const int x = 1;
55 // partial[0][i + j] += x;
56 // partial[1][i + j / 2] += x;
57 // partial[2][i] += x;
58 // partial[3][3 + i - j / 2] += x;
59 // partial[4][7 + i - j] += x;
60 // partial[5][3 - i / 2 + j] += x;
61 // partial[6][j] += x;
62 // partial[7][i / 2 + j] += x;
63 // }
64 // }
65 //
66 // Using the code above, generate the position count for partial[8][15].
67 //
68 // partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
69 // partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
70 // partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
71 // partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
72 // partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
73 // partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
74 // partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
75 // partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
76 //
77 // The SIMD code shifts the input horizontally, then adds vertically to get the
78 // correct partial value for the given position.
79 // ----------------------------------------------------------------------------
80
81 // ----------------------------------------------------------------------------
82 // partial[0][i + j] += x;
83 //
84 // 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
85 // 00 10 11 12 13 14 15 16 17 00 00 00 00 00 00
86 // 00 00 20 21 22 23 24 25 26 27 00 00 00 00 00
87 // 00 00 00 30 31 32 33 34 35 36 37 00 00 00 00
88 // 00 00 00 00 40 41 42 43 44 45 46 47 00 00 00
89 // 00 00 00 00 00 50 51 52 53 54 55 56 57 00 00
90 // 00 00 00 00 00 00 60 61 62 63 64 65 66 67 00
91 // 00 00 00 00 00 00 00 70 71 72 73 74 75 76 77
92 //
93 // partial[4] is the same except the source is reversed.
AddPartial_D0_D4(__m128i * v_src_16,__m128i * partial_lo,__m128i * partial_hi)94 LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(__m128i* v_src_16,
95 __m128i* partial_lo,
96 __m128i* partial_hi) {
97 // 00 01 02 03 04 05 06 07
98 *partial_lo = v_src_16[0];
99 // 00 00 00 00 00 00 00 00
100 *partial_hi = _mm_setzero_si128();
101
102 // 00 10 11 12 13 14 15 16
103 *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[1], 2));
104 // 17 00 00 00 00 00 00 00
105 *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[1], 14));
106
107 // 00 00 20 21 22 23 24 25
108 *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[2], 4));
109 // 26 27 00 00 00 00 00 00
110 *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[2], 12));
111
112 // 00 00 00 30 31 32 33 34
113 *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[3], 6));
114 // 35 36 37 00 00 00 00 00
115 *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[3], 10));
116
117 // 00 00 00 00 40 41 42 43
118 *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[4], 8));
119 // 44 45 46 47 00 00 00 00
120 *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[4], 8));
121
122 // 00 00 00 00 00 50 51 52
123 *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[5], 10));
124 // 53 54 55 56 57 00 00 00
125 *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[5], 6));
126
127 // 00 00 00 00 00 00 60 61
128 *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[6], 12));
129 // 62 63 64 65 66 67 00 00
130 *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[6], 4));
131
132 // 00 00 00 00 00 00 00 70
133 *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[7], 14));
134 // 71 72 73 74 75 76 77 00
135 *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[7], 2));
136 }
137
138 // ----------------------------------------------------------------------------
139 // partial[1][i + j / 2] += x;
140 //
141 // A0 = src[0] + src[1], A1 = src[2] + src[3], ...
142 //
143 // A0 A1 A2 A3 00 00 00 00 00 00 00 00 00 00 00
144 // 00 B0 B1 B2 B3 00 00 00 00 00 00 00 00 00 00
145 // 00 00 C0 C1 C2 C3 00 00 00 00 00 00 00 00 00
146 // 00 00 00 D0 D1 D2 D3 00 00 00 00 00 00 00 00
147 // 00 00 00 00 E0 E1 E2 E3 00 00 00 00 00 00 00
148 // 00 00 00 00 00 F0 F1 F2 F3 00 00 00 00 00 00
149 // 00 00 00 00 00 00 G0 G1 G2 G3 00 00 00 00 00
150 // 00 00 00 00 00 00 00 H0 H1 H2 H3 00 00 00 00
151 //
152 // partial[3] is the same except the source is reversed.
AddPartial_D1_D3(__m128i * v_src_16,__m128i * partial_lo,__m128i * partial_hi)153 LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(__m128i* v_src_16,
154 __m128i* partial_lo,
155 __m128i* partial_hi) {
156 __m128i v_d1_temp[8];
157 const __m128i v_zero = _mm_setzero_si128();
158
159 for (int i = 0; i < 8; ++i) {
160 v_d1_temp[i] = _mm_hadd_epi16(v_src_16[i], v_zero);
161 }
162
163 *partial_lo = *partial_hi = v_zero;
164 // A0 A1 A2 A3 00 00 00 00
165 *partial_lo = _mm_add_epi16(*partial_lo, v_d1_temp[0]);
166
167 // 00 B0 B1 B2 B3 00 00 00
168 *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[1], 2));
169
170 // 00 00 C0 C1 C2 C3 00 00
171 *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[2], 4));
172 // 00 00 00 D0 D1 D2 D3 00
173 *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[3], 6));
174 // 00 00 00 00 E0 E1 E2 E3
175 *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[4], 8));
176
177 // 00 00 00 00 00 F0 F1 F2
178 *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[5], 10));
179 // F3 00 00 00 00 00 00 00
180 *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[5], 6));
181
182 // 00 00 00 00 00 00 G0 G1
183 *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[6], 12));
184 // G2 G3 00 00 00 00 00 00
185 *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[6], 4));
186
187 // 00 00 00 00 00 00 00 H0
188 *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[7], 14));
189 // H1 H2 H3 00 00 00 00 00
190 *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[7], 2));
191 }
192
193 // ----------------------------------------------------------------------------
194 // partial[7][i / 2 + j] += x;
195 //
196 // 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
197 // 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00
198 // 00 20 21 22 23 24 25 26 27 00 00 00 00 00 00
199 // 00 30 31 32 33 34 35 36 37 00 00 00 00 00 00
200 // 00 00 40 41 42 43 44 45 46 47 00 00 00 00 00
201 // 00 00 50 51 52 53 54 55 56 57 00 00 00 00 00
202 // 00 00 00 60 61 62 63 64 65 66 67 00 00 00 00
203 // 00 00 00 70 71 72 73 74 75 76 77 00 00 00 00
204 //
205 // partial[5] is the same except the source is reversed.
AddPartial_D5_D7(__m128i * v_src,__m128i * partial_lo,__m128i * partial_hi)206 LIBGAV1_ALWAYS_INLINE void AddPartial_D5_D7(__m128i* v_src, __m128i* partial_lo,
207 __m128i* partial_hi) {
208 __m128i v_pair_add[4];
209 // Add vertical source pairs.
210 v_pair_add[0] = _mm_add_epi16(v_src[0], v_src[1]);
211 v_pair_add[1] = _mm_add_epi16(v_src[2], v_src[3]);
212 v_pair_add[2] = _mm_add_epi16(v_src[4], v_src[5]);
213 v_pair_add[3] = _mm_add_epi16(v_src[6], v_src[7]);
214
215 // 00 01 02 03 04 05 06 07
216 // 10 11 12 13 14 15 16 17
217 *partial_lo = v_pair_add[0];
218 // 00 00 00 00 00 00 00 00
219 // 00 00 00 00 00 00 00 00
220 *partial_hi = _mm_setzero_si128();
221
222 // 00 20 21 22 23 24 25 26
223 // 00 30 31 32 33 34 35 36
224 *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[1], 2));
225 // 27 00 00 00 00 00 00 00
226 // 37 00 00 00 00 00 00 00
227 *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[1], 14));
228
229 // 00 00 40 41 42 43 44 45
230 // 00 00 50 51 52 53 54 55
231 *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[2], 4));
232 // 46 47 00 00 00 00 00 00
233 // 56 57 00 00 00 00 00 00
234 *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[2], 12));
235
236 // 00 00 00 60 61 62 63 64
237 // 00 00 00 70 71 72 73 74
238 *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[3], 6));
239 // 65 66 67 00 00 00 00 00
240 // 75 76 77 00 00 00 00 00
241 *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[3], 10));
242 }
243
AddPartial(const uint8_t * LIBGAV1_RESTRICT src,ptrdiff_t stride,__m128i * partial_lo,__m128i * partial_hi)244 LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* LIBGAV1_RESTRICT src,
245 ptrdiff_t stride, __m128i* partial_lo,
246 __m128i* partial_hi) {
247 // 8x8 input
248 // 00 01 02 03 04 05 06 07
249 // 10 11 12 13 14 15 16 17
250 // 20 21 22 23 24 25 26 27
251 // 30 31 32 33 34 35 36 37
252 // 40 41 42 43 44 45 46 47
253 // 50 51 52 53 54 55 56 57
254 // 60 61 62 63 64 65 66 67
255 // 70 71 72 73 74 75 76 77
256 __m128i v_src[8];
257 for (auto& i : v_src) {
258 i = LoadLo8(src);
259 src += stride;
260 }
261
262 const __m128i v_zero = _mm_setzero_si128();
263 // partial for direction 2
264 // --------------------------------------------------------------------------
265 // partial[2][i] += x;
266 // 00 10 20 30 40 50 60 70 00 00 00 00 00 00 00 00
267 // 01 11 21 33 41 51 61 71 00 00 00 00 00 00 00 00
268 // 02 12 22 33 42 52 62 72 00 00 00 00 00 00 00 00
269 // 03 13 23 33 43 53 63 73 00 00 00 00 00 00 00 00
270 // 04 14 24 34 44 54 64 74 00 00 00 00 00 00 00 00
271 // 05 15 25 35 45 55 65 75 00 00 00 00 00 00 00 00
272 // 06 16 26 36 46 56 66 76 00 00 00 00 00 00 00 00
273 // 07 17 27 37 47 57 67 77 00 00 00 00 00 00 00 00
274 const __m128i v_src_4_0 = _mm_unpacklo_epi64(v_src[0], v_src[4]);
275 const __m128i v_src_5_1 = _mm_unpacklo_epi64(v_src[1], v_src[5]);
276 const __m128i v_src_6_2 = _mm_unpacklo_epi64(v_src[2], v_src[6]);
277 const __m128i v_src_7_3 = _mm_unpacklo_epi64(v_src[3], v_src[7]);
278 const __m128i v_hsum_4_0 = _mm_sad_epu8(v_src_4_0, v_zero);
279 const __m128i v_hsum_5_1 = _mm_sad_epu8(v_src_5_1, v_zero);
280 const __m128i v_hsum_6_2 = _mm_sad_epu8(v_src_6_2, v_zero);
281 const __m128i v_hsum_7_3 = _mm_sad_epu8(v_src_7_3, v_zero);
282 const __m128i v_hsum_1_0 = _mm_unpacklo_epi16(v_hsum_4_0, v_hsum_5_1);
283 const __m128i v_hsum_3_2 = _mm_unpacklo_epi16(v_hsum_6_2, v_hsum_7_3);
284 const __m128i v_hsum_5_4 = _mm_unpackhi_epi16(v_hsum_4_0, v_hsum_5_1);
285 const __m128i v_hsum_7_6 = _mm_unpackhi_epi16(v_hsum_6_2, v_hsum_7_3);
286 partial_lo[2] =
287 _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_hsum_1_0, v_hsum_3_2),
288 _mm_unpacklo_epi32(v_hsum_5_4, v_hsum_7_6));
289
290 __m128i v_src_16[8];
291 for (int i = 0; i < 8; ++i) {
292 v_src_16[i] = _mm_cvtepu8_epi16(v_src[i]);
293 }
294
295 // partial for direction 6
296 // --------------------------------------------------------------------------
297 // partial[6][j] += x;
298 // 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 00
299 // 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00 00
300 // 20 21 22 23 24 25 26 27 00 00 00 00 00 00 00 00
301 // 30 31 32 33 34 35 36 37 00 00 00 00 00 00 00 00
302 // 40 41 42 43 44 45 46 47 00 00 00 00 00 00 00 00
303 // 50 51 52 53 54 55 56 57 00 00 00 00 00 00 00 00
304 // 60 61 62 63 64 65 66 67 00 00 00 00 00 00 00 00
305 // 70 71 72 73 74 75 76 77 00 00 00 00 00 00 00 00
306 partial_lo[6] = v_src_16[0];
307 for (int i = 1; i < 8; ++i) {
308 partial_lo[6] = _mm_add_epi16(partial_lo[6], v_src_16[i]);
309 }
310
311 // partial for direction 0
312 AddPartial_D0_D4(v_src_16, &partial_lo[0], &partial_hi[0]);
313
314 // partial for direction 1
315 AddPartial_D1_D3(v_src_16, &partial_lo[1], &partial_hi[1]);
316
317 // partial for direction 7
318 AddPartial_D5_D7(v_src_16, &partial_lo[7], &partial_hi[7]);
319
320 __m128i v_src_reverse[8];
321 const __m128i reverser =
322 _mm_set_epi32(0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
323 for (int i = 0; i < 8; ++i) {
324 v_src_reverse[i] = _mm_shuffle_epi8(v_src_16[i], reverser);
325 }
326
327 // partial for direction 4
328 AddPartial_D0_D4(v_src_reverse, &partial_lo[4], &partial_hi[4]);
329
330 // partial for direction 3
331 AddPartial_D1_D3(v_src_reverse, &partial_lo[3], &partial_hi[3]);
332
333 // partial for direction 5
334 AddPartial_D5_D7(v_src_reverse, &partial_lo[5], &partial_hi[5]);
335 }
336
// Returns the sum of the four 32-bit lanes of |a|.
inline uint32_t SumVector_S32(__m128i a) {
  // a0+a1 a2+a3 a0+a1 a2+a3
  const __m128i pair_sums = _mm_hadd_epi32(a, a);
  // Lane 0 becomes (a0+a1) + (a2+a3).
  const __m128i total = _mm_add_epi32(pair_sums, _mm_srli_si128(pair_sums, 4));
  return _mm_cvtsi128_si32(total);
}
342
// |cost[0]| and |cost[4]| square each input element, add it to the square of
// the corresponding element from the other end of the vector, and multiply the
// sum by the matching |kCdefDivisionTable[]| element:
346 // cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
347 // kCdefDivisionTable[i + 1];
348 // cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8];
Cost0Or4(const __m128i a,const __m128i b,const __m128i division_table[2])349 inline uint32_t Cost0Or4(const __m128i a, const __m128i b,
350 const __m128i division_table[2]) {
351 // Reverse and clear upper 2 bytes.
352 const __m128i reverser = _mm_set_epi32(static_cast<int>(0x80800100),
353 0x03020504, 0x07060908, 0x0b0a0d0c);
354 // 14 13 12 11 10 09 08 ZZ
355 const __m128i b_reversed = _mm_shuffle_epi8(b, reverser);
356 // 00 14 01 13 02 12 03 11
357 const __m128i ab_lo = _mm_unpacklo_epi16(a, b_reversed);
358 // 04 10 05 09 06 08 07 ZZ
359 const __m128i ab_hi = _mm_unpackhi_epi16(a, b_reversed);
360
361 // Square(partial[0][i]) + Square(partial[0][14 - i])
362 const __m128i square_lo = _mm_madd_epi16(ab_lo, ab_lo);
363 const __m128i square_hi = _mm_madd_epi16(ab_hi, ab_hi);
364
365 const __m128i c = _mm_mullo_epi32(square_lo, division_table[0]);
366 const __m128i d = _mm_mullo_epi32(square_hi, division_table[1]);
367 return SumVector_S32(_mm_add_epi32(c, d));
368 }
369
CostOdd(const __m128i a,const __m128i b,const __m128i division_table[2])370 inline uint32_t CostOdd(const __m128i a, const __m128i b,
371 const __m128i division_table[2]) {
372 // Reverse and clear upper 10 bytes.
373 const __m128i reverser =
374 _mm_set_epi32(static_cast<int>(0x80808080), static_cast<int>(0x80808080),
375 static_cast<int>(0x80800100), 0x03020504);
376 // 10 09 08 ZZ ZZ ZZ ZZ ZZ
377 const __m128i b_reversed = _mm_shuffle_epi8(b, reverser);
378 // 00 10 01 09 02 08 03 ZZ
379 const __m128i ab_lo = _mm_unpacklo_epi16(a, b_reversed);
380 // 04 ZZ 05 ZZ 06 ZZ 07 ZZ
381 const __m128i ab_hi = _mm_unpackhi_epi16(a, b_reversed);
382
383 // Square(partial[0][i]) + Square(partial[0][10 - i])
384 const __m128i square_lo = _mm_madd_epi16(ab_lo, ab_lo);
385 const __m128i square_hi = _mm_madd_epi16(ab_hi, ab_hi);
386
387 const __m128i c = _mm_mullo_epi32(square_lo, division_table[0]);
388 const __m128i d = _mm_mullo_epi32(square_hi, division_table[1]);
389 return SumVector_S32(_mm_add_epi32(c, d));
390 }
391
392 // Sum of squared elements.
SquareSum_S16(const __m128i a)393 inline uint32_t SquareSum_S16(const __m128i a) {
394 const __m128i square = _mm_madd_epi16(a, a);
395 return SumVector_S32(square);
396 }
397
CdefDirection_SSE4_1(const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride,uint8_t * LIBGAV1_RESTRICT const direction,int * LIBGAV1_RESTRICT const variance)398 void CdefDirection_SSE4_1(const void* LIBGAV1_RESTRICT const source,
399 ptrdiff_t stride,
400 uint8_t* LIBGAV1_RESTRICT const direction,
401 int* LIBGAV1_RESTRICT const variance) {
402 assert(direction != nullptr);
403 assert(variance != nullptr);
404 const auto* src = static_cast<const uint8_t*>(source);
405 uint32_t cost[8];
406 __m128i partial_lo[8], partial_hi[8];
407
408 AddPartial(src, stride, partial_lo, partial_hi);
409
410 cost[2] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[2]);
411 cost[6] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[6]);
412
413 const __m128i division_table[2] = {LoadUnaligned16(kCdefDivisionTable),
414 LoadUnaligned16(kCdefDivisionTable + 4)};
415
416 cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table);
417 cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table);
418
419 const __m128i division_table_odd[2] = {
420 LoadAligned16(kCdefDivisionTableOddPadded),
421 LoadAligned16(kCdefDivisionTableOddPadded + 4)};
422
423 cost[1] = CostOdd(partial_lo[1], partial_hi[1], division_table_odd);
424 cost[3] = CostOdd(partial_lo[3], partial_hi[3], division_table_odd);
425 cost[5] = CostOdd(partial_lo[5], partial_hi[5], division_table_odd);
426 cost[7] = CostOdd(partial_lo[7], partial_hi[7], division_table_odd);
427
428 uint32_t best_cost = 0;
429 *direction = 0;
430 for (int i = 0; i < 8; ++i) {
431 if (cost[i] > best_cost) {
432 best_cost = cost[i];
433 *direction = i;
434 }
435 }
436 *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
437 }
438
439 // -------------------------------------------------------------------------
440 // CdefFilter
441
442 // Load 4 vectors based on the given |direction|.
LoadDirection(const uint16_t * LIBGAV1_RESTRICT const src,const ptrdiff_t stride,__m128i * output,const int direction)443 inline void LoadDirection(const uint16_t* LIBGAV1_RESTRICT const src,
444 const ptrdiff_t stride, __m128i* output,
445 const int direction) {
446 // Each |direction| describes a different set of source values. Expand this
447 // set by negating each set. For |direction| == 0 this gives a diagonal line
448 // from top right to bottom left. The first value is y, the second x. Negative
449 // y values move up.
450 // a b c d
451 // {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
452 // c
453 // a
454 // 0
455 // b
456 // d
457 const int y_0 = kCdefDirections[direction][0][0];
458 const int x_0 = kCdefDirections[direction][0][1];
459 const int y_1 = kCdefDirections[direction][1][0];
460 const int x_1 = kCdefDirections[direction][1][1];
461 output[0] = LoadUnaligned16(src - y_0 * stride - x_0);
462 output[1] = LoadUnaligned16(src + y_0 * stride + x_0);
463 output[2] = LoadUnaligned16(src - y_1 * stride - x_1);
464 output[3] = LoadUnaligned16(src + y_1 * stride + x_1);
465 }
466
467 // Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
468 // do 2 rows at a time.
LoadDirection4(const uint16_t * LIBGAV1_RESTRICT const src,const ptrdiff_t stride,__m128i * output,const int direction)469 void LoadDirection4(const uint16_t* LIBGAV1_RESTRICT const src,
470 const ptrdiff_t stride, __m128i* output,
471 const int direction) {
472 const int y_0 = kCdefDirections[direction][0][0];
473 const int x_0 = kCdefDirections[direction][0][1];
474 const int y_1 = kCdefDirections[direction][1][0];
475 const int x_1 = kCdefDirections[direction][1][1];
476 output[0] = LoadHi8(LoadLo8(src - y_0 * stride - x_0),
477 src - y_0 * stride + stride - x_0);
478 output[1] = LoadHi8(LoadLo8(src + y_0 * stride + x_0),
479 src + y_0 * stride + stride + x_0);
480 output[2] = LoadHi8(LoadLo8(src - y_1 * stride - x_1),
481 src - y_1 * stride + stride - x_1);
482 output[3] = LoadHi8(LoadLo8(src + y_1 * stride + x_1),
483 src + y_1 * stride + stride + x_1);
484 }
485
Constrain(const __m128i & pixel,const __m128i & reference,const __m128i & damping,const __m128i & threshold)486 inline __m128i Constrain(const __m128i& pixel, const __m128i& reference,
487 const __m128i& damping, const __m128i& threshold) {
488 const __m128i diff = _mm_sub_epi16(pixel, reference);
489 const __m128i abs_diff = _mm_abs_epi16(diff);
490 // sign(diff) * Clip3(threshold - (std::abs(diff) >> damping),
491 // 0, std::abs(diff))
492 const __m128i shifted_diff = _mm_srl_epi16(abs_diff, damping);
493 // For bitdepth == 8, the threshold range is [0, 15] and the damping range is
494 // [3, 6]. If pixel == kCdefLargeValue(0x4000), shifted_diff will always be
495 // larger than threshold. Subtract using saturation will return 0 when pixel
496 // == kCdefLargeValue.
497 static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
498 const __m128i thresh_minus_shifted_diff =
499 _mm_subs_epu16(threshold, shifted_diff);
500 const __m128i clamp_abs_diff =
501 _mm_min_epi16(thresh_minus_shifted_diff, abs_diff);
502 // Restore the sign.
503 return _mm_sign_epi16(clamp_abs_diff, diff);
504 }
505
ApplyConstrainAndTap(const __m128i & pixel,const __m128i & val,const __m128i & tap,const __m128i & damping,const __m128i & threshold)506 inline __m128i ApplyConstrainAndTap(const __m128i& pixel, const __m128i& val,
507 const __m128i& tap, const __m128i& damping,
508 const __m128i& threshold) {
509 const __m128i constrained = Constrain(val, pixel, damping, threshold);
510 return _mm_mullo_epi16(constrained, tap);
511 }
512
513 template <int width, bool enable_primary = true, bool enable_secondary = true>
CdefFilter_SSE4_1(const uint16_t * LIBGAV1_RESTRICT src,const ptrdiff_t src_stride,const int height,const int primary_strength,const int secondary_strength,const int damping,const int direction,void * LIBGAV1_RESTRICT dest,const ptrdiff_t dst_stride)514 void CdefFilter_SSE4_1(const uint16_t* LIBGAV1_RESTRICT src,
515 const ptrdiff_t src_stride, const int height,
516 const int primary_strength, const int secondary_strength,
517 const int damping, const int direction,
518 void* LIBGAV1_RESTRICT dest,
519 const ptrdiff_t dst_stride) {
520 static_assert(width == 8 || width == 4, "Invalid CDEF width.");
521 static_assert(enable_primary || enable_secondary, "");
522 constexpr bool clipping_required = enable_primary && enable_secondary;
523 auto* dst = static_cast<uint8_t*>(dest);
524 __m128i primary_damping_shift, secondary_damping_shift;
525
526 // FloorLog2() requires input to be > 0.
527 // 8-bit damping range: Y: [3, 6], UV: [2, 5].
528 if (enable_primary) {
529 // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary
530 // for UV filtering.
531 primary_damping_shift =
532 _mm_cvtsi32_si128(std::max(0, damping - FloorLog2(primary_strength)));
533 }
534 if (enable_secondary) {
535 // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
536 // necessary.
537 assert(damping - FloorLog2(secondary_strength) >= 0);
538 secondary_damping_shift =
539 _mm_cvtsi32_si128(damping - FloorLog2(secondary_strength));
540 }
541
542 const __m128i primary_tap_0 =
543 _mm_set1_epi16(kCdefPrimaryTaps[primary_strength & 1][0]);
544 const __m128i primary_tap_1 =
545 _mm_set1_epi16(kCdefPrimaryTaps[primary_strength & 1][1]);
546 const __m128i secondary_tap_0 = _mm_set1_epi16(kCdefSecondaryTap0);
547 const __m128i secondary_tap_1 = _mm_set1_epi16(kCdefSecondaryTap1);
548 const __m128i cdef_large_value_mask =
549 _mm_set1_epi16(static_cast<int16_t>(~kCdefLargeValue));
550 const __m128i primary_threshold = _mm_set1_epi16(primary_strength);
551 const __m128i secondary_threshold = _mm_set1_epi16(secondary_strength);
552
553 int y = height;
554 do {
555 __m128i pixel;
556 if (width == 8) {
557 pixel = LoadUnaligned16(src);
558 } else {
559 pixel = LoadHi8(LoadLo8(src), src + src_stride);
560 }
561
562 __m128i min = pixel;
563 __m128i max = pixel;
564 __m128i sum;
565
566 if (enable_primary) {
567 // Primary |direction|.
568 __m128i primary_val[4];
569 if (width == 8) {
570 LoadDirection(src, src_stride, primary_val, direction);
571 } else {
572 LoadDirection4(src, src_stride, primary_val, direction);
573 }
574
575 if (clipping_required) {
576 min = _mm_min_epu16(min, primary_val[0]);
577 min = _mm_min_epu16(min, primary_val[1]);
578 min = _mm_min_epu16(min, primary_val[2]);
579 min = _mm_min_epu16(min, primary_val[3]);
580
581 // The source is 16 bits, however, we only really care about the lower
582 // 8 bits. The upper 8 bits contain the "large" flag. After the final
583 // primary max has been calculated, zero out the upper 8 bits. Use this
584 // to find the "16 bit" max.
585 const __m128i max_p01 = _mm_max_epu8(primary_val[0], primary_val[1]);
586 const __m128i max_p23 = _mm_max_epu8(primary_val[2], primary_val[3]);
587 const __m128i max_p = _mm_max_epu8(max_p01, max_p23);
588 max = _mm_max_epu16(max, _mm_and_si128(max_p, cdef_large_value_mask));
589 }
590
591 sum = ApplyConstrainAndTap(pixel, primary_val[0], primary_tap_0,
592 primary_damping_shift, primary_threshold);
593 sum = _mm_add_epi16(
594 sum, ApplyConstrainAndTap(pixel, primary_val[1], primary_tap_0,
595 primary_damping_shift, primary_threshold));
596 sum = _mm_add_epi16(
597 sum, ApplyConstrainAndTap(pixel, primary_val[2], primary_tap_1,
598 primary_damping_shift, primary_threshold));
599 sum = _mm_add_epi16(
600 sum, ApplyConstrainAndTap(pixel, primary_val[3], primary_tap_1,
601 primary_damping_shift, primary_threshold));
602 } else {
603 sum = _mm_setzero_si128();
604 }
605
606 if (enable_secondary) {
607 // Secondary |direction| values (+/- 2). Clamp |direction|.
608 __m128i secondary_val[8];
609 if (width == 8) {
610 LoadDirection(src, src_stride, secondary_val, direction + 2);
611 LoadDirection(src, src_stride, secondary_val + 4, direction - 2);
612 } else {
613 LoadDirection4(src, src_stride, secondary_val, direction + 2);
614 LoadDirection4(src, src_stride, secondary_val + 4, direction - 2);
615 }
616
617 if (clipping_required) {
618 min = _mm_min_epu16(min, secondary_val[0]);
619 min = _mm_min_epu16(min, secondary_val[1]);
620 min = _mm_min_epu16(min, secondary_val[2]);
621 min = _mm_min_epu16(min, secondary_val[3]);
622 min = _mm_min_epu16(min, secondary_val[4]);
623 min = _mm_min_epu16(min, secondary_val[5]);
624 min = _mm_min_epu16(min, secondary_val[6]);
625 min = _mm_min_epu16(min, secondary_val[7]);
626
627 const __m128i max_s01 =
628 _mm_max_epu8(secondary_val[0], secondary_val[1]);
629 const __m128i max_s23 =
630 _mm_max_epu8(secondary_val[2], secondary_val[3]);
631 const __m128i max_s45 =
632 _mm_max_epu8(secondary_val[4], secondary_val[5]);
633 const __m128i max_s67 =
634 _mm_max_epu8(secondary_val[6], secondary_val[7]);
635 const __m128i max_s = _mm_max_epu8(_mm_max_epu8(max_s01, max_s23),
636 _mm_max_epu8(max_s45, max_s67));
637 max = _mm_max_epu16(max, _mm_and_si128(max_s, cdef_large_value_mask));
638 }
639
640 sum = _mm_add_epi16(
641 sum,
642 ApplyConstrainAndTap(pixel, secondary_val[0], secondary_tap_0,
643 secondary_damping_shift, secondary_threshold));
644 sum = _mm_add_epi16(
645 sum,
646 ApplyConstrainAndTap(pixel, secondary_val[1], secondary_tap_0,
647 secondary_damping_shift, secondary_threshold));
648 sum = _mm_add_epi16(
649 sum,
650 ApplyConstrainAndTap(pixel, secondary_val[2], secondary_tap_1,
651 secondary_damping_shift, secondary_threshold));
652 sum = _mm_add_epi16(
653 sum,
654 ApplyConstrainAndTap(pixel, secondary_val[3], secondary_tap_1,
655 secondary_damping_shift, secondary_threshold));
656 sum = _mm_add_epi16(
657 sum,
658 ApplyConstrainAndTap(pixel, secondary_val[4], secondary_tap_0,
659 secondary_damping_shift, secondary_threshold));
660 sum = _mm_add_epi16(
661 sum,
662 ApplyConstrainAndTap(pixel, secondary_val[5], secondary_tap_0,
663 secondary_damping_shift, secondary_threshold));
664 sum = _mm_add_epi16(
665 sum,
666 ApplyConstrainAndTap(pixel, secondary_val[6], secondary_tap_1,
667 secondary_damping_shift, secondary_threshold));
668 sum = _mm_add_epi16(
669 sum,
670 ApplyConstrainAndTap(pixel, secondary_val[7], secondary_tap_1,
671 secondary_damping_shift, secondary_threshold));
672 }
673 // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max))
674 const __m128i sum_lt_0 = _mm_srai_epi16(sum, 15);
675 // 8 + sum
676 sum = _mm_add_epi16(sum, _mm_set1_epi16(8));
677 // (... - (sum < 0)) >> 4
678 sum = _mm_add_epi16(sum, sum_lt_0);
679 sum = _mm_srai_epi16(sum, 4);
680 // pixel + ...
681 sum = _mm_add_epi16(sum, pixel);
682 if (clipping_required) {
683 // Clip3
684 sum = _mm_min_epi16(sum, max);
685 sum = _mm_max_epi16(sum, min);
686 }
687
688 const __m128i result = _mm_packus_epi16(sum, sum);
689 if (width == 8) {
690 src += src_stride;
691 StoreLo8(dst, result);
692 dst += dst_stride;
693 --y;
694 } else {
695 src += src_stride << 1;
696 Store4(dst, result);
697 dst += dst_stride;
698 Store4(dst, _mm_srli_si128(result, 4));
699 dst += dst_stride;
700 y -= 2;
701 }
702 } while (y != 0);
703 }
704
Init8bpp()705 void Init8bpp() {
706 Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
707 assert(dsp != nullptr);
708 dsp->cdef_direction = CdefDirection_SSE4_1;
709 dsp->cdef_filters[0][0] = CdefFilter_SSE4_1<4>;
710 dsp->cdef_filters[0][1] =
711 CdefFilter_SSE4_1<4, /*enable_primary=*/true, /*enable_secondary=*/false>;
712 dsp->cdef_filters[0][2] = CdefFilter_SSE4_1<4, /*enable_primary=*/false>;
713 dsp->cdef_filters[1][0] = CdefFilter_SSE4_1<8>;
714 dsp->cdef_filters[1][1] =
715 CdefFilter_SSE4_1<8, /*enable_primary=*/true, /*enable_secondary=*/false>;
716 dsp->cdef_filters[1][2] = CdefFilter_SSE4_1<8, /*enable_primary=*/false>;
717 }
718
719 } // namespace
720 } // namespace low_bitdepth
721
CdefInit_SSE4_1()722 void CdefInit_SSE4_1() { low_bitdepth::Init8bpp(); }
723
724 } // namespace dsp
725 } // namespace libgav1
726 #else // !LIBGAV1_TARGETING_SSE4_1
727 namespace libgav1 {
728 namespace dsp {
729
// Built when LIBGAV1_TARGETING_SSE4_1 is not set: there is nothing to install.
void CdefInit_SSE4_1() {
  // Intentionally empty.
}
731
732 } // namespace dsp
733 } // namespace libgav1
734 #endif // LIBGAV1_TARGETING_SSE4_1
735