xref: /aosp_15_r20/external/libgav1/src/dsp/x86/cdef_sse4.cc (revision 095378508e87ed692bf8dfeb34008b65b3735891)
1*09537850SAkhilesh Sanikop // Copyright 2020 The libgav1 Authors
2*09537850SAkhilesh Sanikop //
3*09537850SAkhilesh Sanikop // Licensed under the Apache License, Version 2.0 (the "License");
4*09537850SAkhilesh Sanikop // you may not use this file except in compliance with the License.
5*09537850SAkhilesh Sanikop // You may obtain a copy of the License at
6*09537850SAkhilesh Sanikop //
7*09537850SAkhilesh Sanikop //      http://www.apache.org/licenses/LICENSE-2.0
8*09537850SAkhilesh Sanikop //
9*09537850SAkhilesh Sanikop // Unless required by applicable law or agreed to in writing, software
10*09537850SAkhilesh Sanikop // distributed under the License is distributed on an "AS IS" BASIS,
11*09537850SAkhilesh Sanikop // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*09537850SAkhilesh Sanikop // See the License for the specific language governing permissions and
13*09537850SAkhilesh Sanikop // limitations under the License.
14*09537850SAkhilesh Sanikop 
15*09537850SAkhilesh Sanikop #include "src/dsp/cdef.h"
16*09537850SAkhilesh Sanikop #include "src/utils/cpu.h"
17*09537850SAkhilesh Sanikop 
18*09537850SAkhilesh Sanikop #if LIBGAV1_TARGETING_SSE4_1
19*09537850SAkhilesh Sanikop 
20*09537850SAkhilesh Sanikop #include <emmintrin.h>
21*09537850SAkhilesh Sanikop #include <tmmintrin.h>
22*09537850SAkhilesh Sanikop 
23*09537850SAkhilesh Sanikop #include <algorithm>
24*09537850SAkhilesh Sanikop #include <cassert>
25*09537850SAkhilesh Sanikop #include <cstddef>
26*09537850SAkhilesh Sanikop #include <cstdint>
27*09537850SAkhilesh Sanikop #include <cstdlib>
28*09537850SAkhilesh Sanikop 
29*09537850SAkhilesh Sanikop #include "src/dsp/constants.h"
30*09537850SAkhilesh Sanikop #include "src/dsp/dsp.h"
31*09537850SAkhilesh Sanikop #include "src/dsp/x86/common_sse4.h"
32*09537850SAkhilesh Sanikop #include "src/dsp/x86/transpose_sse4.h"
33*09537850SAkhilesh Sanikop #include "src/utils/common.h"
34*09537850SAkhilesh Sanikop #include "src/utils/constants.h"
35*09537850SAkhilesh Sanikop 
36*09537850SAkhilesh Sanikop namespace libgav1 {
37*09537850SAkhilesh Sanikop namespace dsp {
38*09537850SAkhilesh Sanikop namespace low_bitdepth {
39*09537850SAkhilesh Sanikop namespace {
40*09537850SAkhilesh Sanikop 
41*09537850SAkhilesh Sanikop #include "src/dsp/cdef.inc"
42*09537850SAkhilesh Sanikop 
// Per-lane multipliers used when calculating the odd |cost[x]| values, padded
// to two full 128-bit vectors so they can be fetched with aligned loads.
// Holds elements 1 3 5 7 7 7 7 7 (presumably of |kCdefDivisionTable|, which is
// defined in cdef.inc -- confirm there).
alignas(16) constexpr uint32_t kCdefDivisionTableOddPadded[8] = {
    420,  // element 1
    210,  // element 3
    140,  // element 5
    105,  // element 7
    105,  // element 7, repeated as padding from here on
    105,
    105,
    105};
47*09537850SAkhilesh Sanikop 
48*09537850SAkhilesh Sanikop // ----------------------------------------------------------------------------
49*09537850SAkhilesh Sanikop // Refer to CdefDirection_C().
50*09537850SAkhilesh Sanikop //
51*09537850SAkhilesh Sanikop // int32_t partial[8][15] = {};
52*09537850SAkhilesh Sanikop // for (int i = 0; i < 8; ++i) {
53*09537850SAkhilesh Sanikop //   for (int j = 0; j < 8; ++j) {
54*09537850SAkhilesh Sanikop //     const int x = 1;
55*09537850SAkhilesh Sanikop //     partial[0][i + j] += x;
56*09537850SAkhilesh Sanikop //     partial[1][i + j / 2] += x;
57*09537850SAkhilesh Sanikop //     partial[2][i] += x;
58*09537850SAkhilesh Sanikop //     partial[3][3 + i - j / 2] += x;
59*09537850SAkhilesh Sanikop //     partial[4][7 + i - j] += x;
60*09537850SAkhilesh Sanikop //     partial[5][3 - i / 2 + j] += x;
61*09537850SAkhilesh Sanikop //     partial[6][j] += x;
62*09537850SAkhilesh Sanikop //     partial[7][i / 2 + j] += x;
63*09537850SAkhilesh Sanikop //   }
64*09537850SAkhilesh Sanikop // }
65*09537850SAkhilesh Sanikop //
66*09537850SAkhilesh Sanikop // Using the code above, generate the position count for partial[8][15].
67*09537850SAkhilesh Sanikop //
68*09537850SAkhilesh Sanikop // partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
69*09537850SAkhilesh Sanikop // partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
70*09537850SAkhilesh Sanikop // partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
71*09537850SAkhilesh Sanikop // partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
72*09537850SAkhilesh Sanikop // partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
73*09537850SAkhilesh Sanikop // partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
74*09537850SAkhilesh Sanikop // partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
75*09537850SAkhilesh Sanikop // partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
76*09537850SAkhilesh Sanikop //
77*09537850SAkhilesh Sanikop // The SIMD code shifts the input horizontally, then adds vertically to get the
78*09537850SAkhilesh Sanikop // correct partial value for the given position.
79*09537850SAkhilesh Sanikop // ----------------------------------------------------------------------------
80*09537850SAkhilesh Sanikop 
81*09537850SAkhilesh Sanikop // ----------------------------------------------------------------------------
82*09537850SAkhilesh Sanikop // partial[0][i + j] += x;
83*09537850SAkhilesh Sanikop //
84*09537850SAkhilesh Sanikop // 00 01 02 03 04 05 06 07  00 00 00 00 00 00 00
85*09537850SAkhilesh Sanikop // 00 10 11 12 13 14 15 16  17 00 00 00 00 00 00
86*09537850SAkhilesh Sanikop // 00 00 20 21 22 23 24 25  26 27 00 00 00 00 00
87*09537850SAkhilesh Sanikop // 00 00 00 30 31 32 33 34  35 36 37 00 00 00 00
88*09537850SAkhilesh Sanikop // 00 00 00 00 40 41 42 43  44 45 46 47 00 00 00
89*09537850SAkhilesh Sanikop // 00 00 00 00 00 50 51 52  53 54 55 56 57 00 00
90*09537850SAkhilesh Sanikop // 00 00 00 00 00 00 60 61  62 63 64 65 66 67 00
91*09537850SAkhilesh Sanikop // 00 00 00 00 00 00 00 70  71 72 73 74 75 76 77
92*09537850SAkhilesh Sanikop //
93*09537850SAkhilesh Sanikop // partial[4] is the same except the source is reversed.
AddPartial_D0_D4(__m128i * v_src_16,__m128i * partial_lo,__m128i * partial_hi)94*09537850SAkhilesh Sanikop LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(__m128i* v_src_16,
95*09537850SAkhilesh Sanikop                                             __m128i* partial_lo,
96*09537850SAkhilesh Sanikop                                             __m128i* partial_hi) {
97*09537850SAkhilesh Sanikop   // 00 01 02 03 04 05 06 07
98*09537850SAkhilesh Sanikop   *partial_lo = v_src_16[0];
99*09537850SAkhilesh Sanikop   // 00 00 00 00 00 00 00 00
100*09537850SAkhilesh Sanikop   *partial_hi = _mm_setzero_si128();
101*09537850SAkhilesh Sanikop 
102*09537850SAkhilesh Sanikop   // 00 10 11 12 13 14 15 16
103*09537850SAkhilesh Sanikop   *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[1], 2));
104*09537850SAkhilesh Sanikop   // 17 00 00 00 00 00 00 00
105*09537850SAkhilesh Sanikop   *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[1], 14));
106*09537850SAkhilesh Sanikop 
107*09537850SAkhilesh Sanikop   // 00 00 20 21 22 23 24 25
108*09537850SAkhilesh Sanikop   *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[2], 4));
109*09537850SAkhilesh Sanikop   // 26 27 00 00 00 00 00 00
110*09537850SAkhilesh Sanikop   *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[2], 12));
111*09537850SAkhilesh Sanikop 
112*09537850SAkhilesh Sanikop   // 00 00 00 30 31 32 33 34
113*09537850SAkhilesh Sanikop   *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[3], 6));
114*09537850SAkhilesh Sanikop   // 35 36 37 00 00 00 00 00
115*09537850SAkhilesh Sanikop   *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[3], 10));
116*09537850SAkhilesh Sanikop 
117*09537850SAkhilesh Sanikop   // 00 00 00 00 40 41 42 43
118*09537850SAkhilesh Sanikop   *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[4], 8));
119*09537850SAkhilesh Sanikop   // 44 45 46 47 00 00 00 00
120*09537850SAkhilesh Sanikop   *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[4], 8));
121*09537850SAkhilesh Sanikop 
122*09537850SAkhilesh Sanikop   // 00 00 00 00 00 50 51 52
123*09537850SAkhilesh Sanikop   *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[5], 10));
124*09537850SAkhilesh Sanikop   // 53 54 55 56 57 00 00 00
125*09537850SAkhilesh Sanikop   *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[5], 6));
126*09537850SAkhilesh Sanikop 
127*09537850SAkhilesh Sanikop   // 00 00 00 00 00 00 60 61
128*09537850SAkhilesh Sanikop   *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[6], 12));
129*09537850SAkhilesh Sanikop   // 62 63 64 65 66 67 00 00
130*09537850SAkhilesh Sanikop   *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[6], 4));
131*09537850SAkhilesh Sanikop 
132*09537850SAkhilesh Sanikop   // 00 00 00 00 00 00 00 70
133*09537850SAkhilesh Sanikop   *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_src_16[7], 14));
134*09537850SAkhilesh Sanikop   // 71 72 73 74 75 76 77 00
135*09537850SAkhilesh Sanikop   *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_src_16[7], 2));
136*09537850SAkhilesh Sanikop }
137*09537850SAkhilesh Sanikop 
138*09537850SAkhilesh Sanikop // ----------------------------------------------------------------------------
139*09537850SAkhilesh Sanikop // partial[1][i + j / 2] += x;
140*09537850SAkhilesh Sanikop //
141*09537850SAkhilesh Sanikop // A0 = src[0] + src[1], A1 = src[2] + src[3], ...
142*09537850SAkhilesh Sanikop //
143*09537850SAkhilesh Sanikop // A0 A1 A2 A3 00 00 00 00  00 00 00 00 00 00 00
144*09537850SAkhilesh Sanikop // 00 B0 B1 B2 B3 00 00 00  00 00 00 00 00 00 00
145*09537850SAkhilesh Sanikop // 00 00 C0 C1 C2 C3 00 00  00 00 00 00 00 00 00
146*09537850SAkhilesh Sanikop // 00 00 00 D0 D1 D2 D3 00  00 00 00 00 00 00 00
147*09537850SAkhilesh Sanikop // 00 00 00 00 E0 E1 E2 E3  00 00 00 00 00 00 00
148*09537850SAkhilesh Sanikop // 00 00 00 00 00 F0 F1 F2  F3 00 00 00 00 00 00
149*09537850SAkhilesh Sanikop // 00 00 00 00 00 00 G0 G1  G2 G3 00 00 00 00 00
150*09537850SAkhilesh Sanikop // 00 00 00 00 00 00 00 H0  H1 H2 H3 00 00 00 00
151*09537850SAkhilesh Sanikop //
152*09537850SAkhilesh Sanikop // partial[3] is the same except the source is reversed.
AddPartial_D1_D3(__m128i * v_src_16,__m128i * partial_lo,__m128i * partial_hi)153*09537850SAkhilesh Sanikop LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(__m128i* v_src_16,
154*09537850SAkhilesh Sanikop                                             __m128i* partial_lo,
155*09537850SAkhilesh Sanikop                                             __m128i* partial_hi) {
156*09537850SAkhilesh Sanikop   __m128i v_d1_temp[8];
157*09537850SAkhilesh Sanikop   const __m128i v_zero = _mm_setzero_si128();
158*09537850SAkhilesh Sanikop 
159*09537850SAkhilesh Sanikop   for (int i = 0; i < 8; ++i) {
160*09537850SAkhilesh Sanikop     v_d1_temp[i] = _mm_hadd_epi16(v_src_16[i], v_zero);
161*09537850SAkhilesh Sanikop   }
162*09537850SAkhilesh Sanikop 
163*09537850SAkhilesh Sanikop   *partial_lo = *partial_hi = v_zero;
164*09537850SAkhilesh Sanikop   // A0 A1 A2 A3 00 00 00 00
165*09537850SAkhilesh Sanikop   *partial_lo = _mm_add_epi16(*partial_lo, v_d1_temp[0]);
166*09537850SAkhilesh Sanikop 
167*09537850SAkhilesh Sanikop   // 00 B0 B1 B2 B3 00 00 00
168*09537850SAkhilesh Sanikop   *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[1], 2));
169*09537850SAkhilesh Sanikop 
170*09537850SAkhilesh Sanikop   // 00 00 C0 C1 C2 C3 00 00
171*09537850SAkhilesh Sanikop   *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[2], 4));
172*09537850SAkhilesh Sanikop   // 00 00 00 D0 D1 D2 D3 00
173*09537850SAkhilesh Sanikop   *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[3], 6));
174*09537850SAkhilesh Sanikop   // 00 00 00 00 E0 E1 E2 E3
175*09537850SAkhilesh Sanikop   *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[4], 8));
176*09537850SAkhilesh Sanikop 
177*09537850SAkhilesh Sanikop   // 00 00 00 00 00 F0 F1 F2
178*09537850SAkhilesh Sanikop   *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[5], 10));
179*09537850SAkhilesh Sanikop   // F3 00 00 00 00 00 00 00
180*09537850SAkhilesh Sanikop   *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[5], 6));
181*09537850SAkhilesh Sanikop 
182*09537850SAkhilesh Sanikop   // 00 00 00 00 00 00 G0 G1
183*09537850SAkhilesh Sanikop   *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[6], 12));
184*09537850SAkhilesh Sanikop   // G2 G3 00 00 00 00 00 00
185*09537850SAkhilesh Sanikop   *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[6], 4));
186*09537850SAkhilesh Sanikop 
187*09537850SAkhilesh Sanikop   // 00 00 00 00 00 00 00 H0
188*09537850SAkhilesh Sanikop   *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_d1_temp[7], 14));
189*09537850SAkhilesh Sanikop   // H1 H2 H3 00 00 00 00 00
190*09537850SAkhilesh Sanikop   *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_d1_temp[7], 2));
191*09537850SAkhilesh Sanikop }
192*09537850SAkhilesh Sanikop 
193*09537850SAkhilesh Sanikop // ----------------------------------------------------------------------------
194*09537850SAkhilesh Sanikop // partial[7][i / 2 + j] += x;
195*09537850SAkhilesh Sanikop //
196*09537850SAkhilesh Sanikop // 00 01 02 03 04 05 06 07  00 00 00 00 00 00 00
197*09537850SAkhilesh Sanikop // 10 11 12 13 14 15 16 17  00 00 00 00 00 00 00
198*09537850SAkhilesh Sanikop // 00 20 21 22 23 24 25 26  27 00 00 00 00 00 00
199*09537850SAkhilesh Sanikop // 00 30 31 32 33 34 35 36  37 00 00 00 00 00 00
200*09537850SAkhilesh Sanikop // 00 00 40 41 42 43 44 45  46 47 00 00 00 00 00
201*09537850SAkhilesh Sanikop // 00 00 50 51 52 53 54 55  56 57 00 00 00 00 00
202*09537850SAkhilesh Sanikop // 00 00 00 60 61 62 63 64  65 66 67 00 00 00 00
203*09537850SAkhilesh Sanikop // 00 00 00 70 71 72 73 74  75 76 77 00 00 00 00
204*09537850SAkhilesh Sanikop //
205*09537850SAkhilesh Sanikop // partial[5] is the same except the source is reversed.
AddPartial_D5_D7(__m128i * v_src,__m128i * partial_lo,__m128i * partial_hi)206*09537850SAkhilesh Sanikop LIBGAV1_ALWAYS_INLINE void AddPartial_D5_D7(__m128i* v_src, __m128i* partial_lo,
207*09537850SAkhilesh Sanikop                                             __m128i* partial_hi) {
208*09537850SAkhilesh Sanikop   __m128i v_pair_add[4];
209*09537850SAkhilesh Sanikop   // Add vertical source pairs.
210*09537850SAkhilesh Sanikop   v_pair_add[0] = _mm_add_epi16(v_src[0], v_src[1]);
211*09537850SAkhilesh Sanikop   v_pair_add[1] = _mm_add_epi16(v_src[2], v_src[3]);
212*09537850SAkhilesh Sanikop   v_pair_add[2] = _mm_add_epi16(v_src[4], v_src[5]);
213*09537850SAkhilesh Sanikop   v_pair_add[3] = _mm_add_epi16(v_src[6], v_src[7]);
214*09537850SAkhilesh Sanikop 
215*09537850SAkhilesh Sanikop   // 00 01 02 03 04 05 06 07
216*09537850SAkhilesh Sanikop   // 10 11 12 13 14 15 16 17
217*09537850SAkhilesh Sanikop   *partial_lo = v_pair_add[0];
218*09537850SAkhilesh Sanikop   // 00 00 00 00 00 00 00 00
219*09537850SAkhilesh Sanikop   // 00 00 00 00 00 00 00 00
220*09537850SAkhilesh Sanikop   *partial_hi = _mm_setzero_si128();
221*09537850SAkhilesh Sanikop 
222*09537850SAkhilesh Sanikop   // 00 20 21 22 23 24 25 26
223*09537850SAkhilesh Sanikop   // 00 30 31 32 33 34 35 36
224*09537850SAkhilesh Sanikop   *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[1], 2));
225*09537850SAkhilesh Sanikop   // 27 00 00 00 00 00 00 00
226*09537850SAkhilesh Sanikop   // 37 00 00 00 00 00 00 00
227*09537850SAkhilesh Sanikop   *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[1], 14));
228*09537850SAkhilesh Sanikop 
229*09537850SAkhilesh Sanikop   // 00 00 40 41 42 43 44 45
230*09537850SAkhilesh Sanikop   // 00 00 50 51 52 53 54 55
231*09537850SAkhilesh Sanikop   *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[2], 4));
232*09537850SAkhilesh Sanikop   // 46 47 00 00 00 00 00 00
233*09537850SAkhilesh Sanikop   // 56 57 00 00 00 00 00 00
234*09537850SAkhilesh Sanikop   *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[2], 12));
235*09537850SAkhilesh Sanikop 
236*09537850SAkhilesh Sanikop   // 00 00 00 60 61 62 63 64
237*09537850SAkhilesh Sanikop   // 00 00 00 70 71 72 73 74
238*09537850SAkhilesh Sanikop   *partial_lo = _mm_add_epi16(*partial_lo, _mm_slli_si128(v_pair_add[3], 6));
239*09537850SAkhilesh Sanikop   // 65 66 67 00 00 00 00 00
240*09537850SAkhilesh Sanikop   // 75 76 77 00 00 00 00 00
241*09537850SAkhilesh Sanikop   *partial_hi = _mm_add_epi16(*partial_hi, _mm_srli_si128(v_pair_add[3], 10));
242*09537850SAkhilesh Sanikop }
243*09537850SAkhilesh Sanikop 
AddPartial(const uint8_t * LIBGAV1_RESTRICT src,ptrdiff_t stride,__m128i * partial_lo,__m128i * partial_hi)244*09537850SAkhilesh Sanikop LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* LIBGAV1_RESTRICT src,
245*09537850SAkhilesh Sanikop                                       ptrdiff_t stride, __m128i* partial_lo,
246*09537850SAkhilesh Sanikop                                       __m128i* partial_hi) {
247*09537850SAkhilesh Sanikop   // 8x8 input
248*09537850SAkhilesh Sanikop   // 00 01 02 03 04 05 06 07
249*09537850SAkhilesh Sanikop   // 10 11 12 13 14 15 16 17
250*09537850SAkhilesh Sanikop   // 20 21 22 23 24 25 26 27
251*09537850SAkhilesh Sanikop   // 30 31 32 33 34 35 36 37
252*09537850SAkhilesh Sanikop   // 40 41 42 43 44 45 46 47
253*09537850SAkhilesh Sanikop   // 50 51 52 53 54 55 56 57
254*09537850SAkhilesh Sanikop   // 60 61 62 63 64 65 66 67
255*09537850SAkhilesh Sanikop   // 70 71 72 73 74 75 76 77
256*09537850SAkhilesh Sanikop   __m128i v_src[8];
257*09537850SAkhilesh Sanikop   for (auto& i : v_src) {
258*09537850SAkhilesh Sanikop     i = LoadLo8(src);
259*09537850SAkhilesh Sanikop     src += stride;
260*09537850SAkhilesh Sanikop   }
261*09537850SAkhilesh Sanikop 
262*09537850SAkhilesh Sanikop   const __m128i v_zero = _mm_setzero_si128();
263*09537850SAkhilesh Sanikop   // partial for direction 2
264*09537850SAkhilesh Sanikop   // --------------------------------------------------------------------------
265*09537850SAkhilesh Sanikop   // partial[2][i] += x;
266*09537850SAkhilesh Sanikop   // 00 10 20 30 40 50 60 70  00 00 00 00 00 00 00 00
267*09537850SAkhilesh Sanikop   // 01 11 21 33 41 51 61 71  00 00 00 00 00 00 00 00
268*09537850SAkhilesh Sanikop   // 02 12 22 33 42 52 62 72  00 00 00 00 00 00 00 00
269*09537850SAkhilesh Sanikop   // 03 13 23 33 43 53 63 73  00 00 00 00 00 00 00 00
270*09537850SAkhilesh Sanikop   // 04 14 24 34 44 54 64 74  00 00 00 00 00 00 00 00
271*09537850SAkhilesh Sanikop   // 05 15 25 35 45 55 65 75  00 00 00 00 00 00 00 00
272*09537850SAkhilesh Sanikop   // 06 16 26 36 46 56 66 76  00 00 00 00 00 00 00 00
273*09537850SAkhilesh Sanikop   // 07 17 27 37 47 57 67 77  00 00 00 00 00 00 00 00
274*09537850SAkhilesh Sanikop   const __m128i v_src_4_0 = _mm_unpacklo_epi64(v_src[0], v_src[4]);
275*09537850SAkhilesh Sanikop   const __m128i v_src_5_1 = _mm_unpacklo_epi64(v_src[1], v_src[5]);
276*09537850SAkhilesh Sanikop   const __m128i v_src_6_2 = _mm_unpacklo_epi64(v_src[2], v_src[6]);
277*09537850SAkhilesh Sanikop   const __m128i v_src_7_3 = _mm_unpacklo_epi64(v_src[3], v_src[7]);
278*09537850SAkhilesh Sanikop   const __m128i v_hsum_4_0 = _mm_sad_epu8(v_src_4_0, v_zero);
279*09537850SAkhilesh Sanikop   const __m128i v_hsum_5_1 = _mm_sad_epu8(v_src_5_1, v_zero);
280*09537850SAkhilesh Sanikop   const __m128i v_hsum_6_2 = _mm_sad_epu8(v_src_6_2, v_zero);
281*09537850SAkhilesh Sanikop   const __m128i v_hsum_7_3 = _mm_sad_epu8(v_src_7_3, v_zero);
282*09537850SAkhilesh Sanikop   const __m128i v_hsum_1_0 = _mm_unpacklo_epi16(v_hsum_4_0, v_hsum_5_1);
283*09537850SAkhilesh Sanikop   const __m128i v_hsum_3_2 = _mm_unpacklo_epi16(v_hsum_6_2, v_hsum_7_3);
284*09537850SAkhilesh Sanikop   const __m128i v_hsum_5_4 = _mm_unpackhi_epi16(v_hsum_4_0, v_hsum_5_1);
285*09537850SAkhilesh Sanikop   const __m128i v_hsum_7_6 = _mm_unpackhi_epi16(v_hsum_6_2, v_hsum_7_3);
286*09537850SAkhilesh Sanikop   partial_lo[2] =
287*09537850SAkhilesh Sanikop       _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_hsum_1_0, v_hsum_3_2),
288*09537850SAkhilesh Sanikop                          _mm_unpacklo_epi32(v_hsum_5_4, v_hsum_7_6));
289*09537850SAkhilesh Sanikop 
290*09537850SAkhilesh Sanikop   __m128i v_src_16[8];
291*09537850SAkhilesh Sanikop   for (int i = 0; i < 8; ++i) {
292*09537850SAkhilesh Sanikop     v_src_16[i] = _mm_cvtepu8_epi16(v_src[i]);
293*09537850SAkhilesh Sanikop   }
294*09537850SAkhilesh Sanikop 
295*09537850SAkhilesh Sanikop   // partial for direction 6
296*09537850SAkhilesh Sanikop   // --------------------------------------------------------------------------
297*09537850SAkhilesh Sanikop   // partial[6][j] += x;
298*09537850SAkhilesh Sanikop   // 00 01 02 03 04 05 06 07  00 00 00 00 00 00 00 00
299*09537850SAkhilesh Sanikop   // 10 11 12 13 14 15 16 17  00 00 00 00 00 00 00 00
300*09537850SAkhilesh Sanikop   // 20 21 22 23 24 25 26 27  00 00 00 00 00 00 00 00
301*09537850SAkhilesh Sanikop   // 30 31 32 33 34 35 36 37  00 00 00 00 00 00 00 00
302*09537850SAkhilesh Sanikop   // 40 41 42 43 44 45 46 47  00 00 00 00 00 00 00 00
303*09537850SAkhilesh Sanikop   // 50 51 52 53 54 55 56 57  00 00 00 00 00 00 00 00
304*09537850SAkhilesh Sanikop   // 60 61 62 63 64 65 66 67  00 00 00 00 00 00 00 00
305*09537850SAkhilesh Sanikop   // 70 71 72 73 74 75 76 77  00 00 00 00 00 00 00 00
306*09537850SAkhilesh Sanikop   partial_lo[6] = v_src_16[0];
307*09537850SAkhilesh Sanikop   for (int i = 1; i < 8; ++i) {
308*09537850SAkhilesh Sanikop     partial_lo[6] = _mm_add_epi16(partial_lo[6], v_src_16[i]);
309*09537850SAkhilesh Sanikop   }
310*09537850SAkhilesh Sanikop 
311*09537850SAkhilesh Sanikop   // partial for direction 0
312*09537850SAkhilesh Sanikop   AddPartial_D0_D4(v_src_16, &partial_lo[0], &partial_hi[0]);
313*09537850SAkhilesh Sanikop 
314*09537850SAkhilesh Sanikop   // partial for direction 1
315*09537850SAkhilesh Sanikop   AddPartial_D1_D3(v_src_16, &partial_lo[1], &partial_hi[1]);
316*09537850SAkhilesh Sanikop 
317*09537850SAkhilesh Sanikop   // partial for direction 7
318*09537850SAkhilesh Sanikop   AddPartial_D5_D7(v_src_16, &partial_lo[7], &partial_hi[7]);
319*09537850SAkhilesh Sanikop 
320*09537850SAkhilesh Sanikop   __m128i v_src_reverse[8];
321*09537850SAkhilesh Sanikop   const __m128i reverser =
322*09537850SAkhilesh Sanikop       _mm_set_epi32(0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e);
323*09537850SAkhilesh Sanikop   for (int i = 0; i < 8; ++i) {
324*09537850SAkhilesh Sanikop     v_src_reverse[i] = _mm_shuffle_epi8(v_src_16[i], reverser);
325*09537850SAkhilesh Sanikop   }
326*09537850SAkhilesh Sanikop 
327*09537850SAkhilesh Sanikop   // partial for direction 4
328*09537850SAkhilesh Sanikop   AddPartial_D0_D4(v_src_reverse, &partial_lo[4], &partial_hi[4]);
329*09537850SAkhilesh Sanikop 
330*09537850SAkhilesh Sanikop   // partial for direction 3
331*09537850SAkhilesh Sanikop   AddPartial_D1_D3(v_src_reverse, &partial_lo[3], &partial_hi[3]);
332*09537850SAkhilesh Sanikop 
333*09537850SAkhilesh Sanikop   // partial for direction 5
334*09537850SAkhilesh Sanikop   AddPartial_D5_D7(v_src_reverse, &partial_lo[5], &partial_hi[5]);
335*09537850SAkhilesh Sanikop }
336*09537850SAkhilesh Sanikop 
// Returns the sum of the four 32-bit lanes of |a| (the additions wrap at 32
// bits).
inline uint32_t SumVector_S32(__m128i a) {
  // { a0 + a1, a2 + a3, a0 + a1, a2 + a3 }
  const __m128i pair_sums = _mm_hadd_epi32(a, a);
  // Lane 0 becomes (a0 + a1) + (a2 + a3); the other lanes are not used.
  const __m128i total =
      _mm_add_epi32(pair_sums, _mm_srli_si128(pair_sums, 4));
  return static_cast<uint32_t>(_mm_cvtsi128_si32(total));
}
342*09537850SAkhilesh Sanikop 
343*09537850SAkhilesh Sanikop // |cost[0]| and |cost[4]| square the input and sum with the corresponding
344*09537850SAkhilesh Sanikop // element from the other end of the vector:
345*09537850SAkhilesh Sanikop // |kCdefDivisionTable[]| element:
346*09537850SAkhilesh Sanikop // cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
347*09537850SAkhilesh Sanikop //             kCdefDivisionTable[i + 1];
348*09537850SAkhilesh Sanikop // cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8];
Cost0Or4(const __m128i a,const __m128i b,const __m128i division_table[2])349*09537850SAkhilesh Sanikop inline uint32_t Cost0Or4(const __m128i a, const __m128i b,
350*09537850SAkhilesh Sanikop                          const __m128i division_table[2]) {
351*09537850SAkhilesh Sanikop   // Reverse and clear upper 2 bytes.
352*09537850SAkhilesh Sanikop   const __m128i reverser = _mm_set_epi32(static_cast<int>(0x80800100),
353*09537850SAkhilesh Sanikop                                          0x03020504, 0x07060908, 0x0b0a0d0c);
354*09537850SAkhilesh Sanikop   // 14 13 12 11 10 09 08 ZZ
355*09537850SAkhilesh Sanikop   const __m128i b_reversed = _mm_shuffle_epi8(b, reverser);
356*09537850SAkhilesh Sanikop   // 00 14 01 13 02 12 03 11
357*09537850SAkhilesh Sanikop   const __m128i ab_lo = _mm_unpacklo_epi16(a, b_reversed);
358*09537850SAkhilesh Sanikop   // 04 10 05 09 06 08 07 ZZ
359*09537850SAkhilesh Sanikop   const __m128i ab_hi = _mm_unpackhi_epi16(a, b_reversed);
360*09537850SAkhilesh Sanikop 
361*09537850SAkhilesh Sanikop   // Square(partial[0][i]) + Square(partial[0][14 - i])
362*09537850SAkhilesh Sanikop   const __m128i square_lo = _mm_madd_epi16(ab_lo, ab_lo);
363*09537850SAkhilesh Sanikop   const __m128i square_hi = _mm_madd_epi16(ab_hi, ab_hi);
364*09537850SAkhilesh Sanikop 
365*09537850SAkhilesh Sanikop   const __m128i c = _mm_mullo_epi32(square_lo, division_table[0]);
366*09537850SAkhilesh Sanikop   const __m128i d = _mm_mullo_epi32(square_hi, division_table[1]);
367*09537850SAkhilesh Sanikop   return SumVector_S32(_mm_add_epi32(c, d));
368*09537850SAkhilesh Sanikop }
369*09537850SAkhilesh Sanikop 
CostOdd(const __m128i a,const __m128i b,const __m128i division_table[2])370*09537850SAkhilesh Sanikop inline uint32_t CostOdd(const __m128i a, const __m128i b,
371*09537850SAkhilesh Sanikop                         const __m128i division_table[2]) {
372*09537850SAkhilesh Sanikop   // Reverse and clear upper 10 bytes.
373*09537850SAkhilesh Sanikop   const __m128i reverser =
374*09537850SAkhilesh Sanikop       _mm_set_epi32(static_cast<int>(0x80808080), static_cast<int>(0x80808080),
375*09537850SAkhilesh Sanikop                     static_cast<int>(0x80800100), 0x03020504);
376*09537850SAkhilesh Sanikop   // 10 09 08 ZZ ZZ ZZ ZZ ZZ
377*09537850SAkhilesh Sanikop   const __m128i b_reversed = _mm_shuffle_epi8(b, reverser);
378*09537850SAkhilesh Sanikop   // 00 10 01 09 02 08 03 ZZ
379*09537850SAkhilesh Sanikop   const __m128i ab_lo = _mm_unpacklo_epi16(a, b_reversed);
380*09537850SAkhilesh Sanikop   // 04 ZZ 05 ZZ 06 ZZ 07 ZZ
381*09537850SAkhilesh Sanikop   const __m128i ab_hi = _mm_unpackhi_epi16(a, b_reversed);
382*09537850SAkhilesh Sanikop 
383*09537850SAkhilesh Sanikop   // Square(partial[0][i]) + Square(partial[0][10 - i])
384*09537850SAkhilesh Sanikop   const __m128i square_lo = _mm_madd_epi16(ab_lo, ab_lo);
385*09537850SAkhilesh Sanikop   const __m128i square_hi = _mm_madd_epi16(ab_hi, ab_hi);
386*09537850SAkhilesh Sanikop 
387*09537850SAkhilesh Sanikop   const __m128i c = _mm_mullo_epi32(square_lo, division_table[0]);
388*09537850SAkhilesh Sanikop   const __m128i d = _mm_mullo_epi32(square_hi, division_table[1]);
389*09537850SAkhilesh Sanikop   return SumVector_S32(_mm_add_epi32(c, d));
390*09537850SAkhilesh Sanikop }
391*09537850SAkhilesh Sanikop 
392*09537850SAkhilesh Sanikop // Sum of squared elements.
SquareSum_S16(const __m128i a)393*09537850SAkhilesh Sanikop inline uint32_t SquareSum_S16(const __m128i a) {
394*09537850SAkhilesh Sanikop   const __m128i square = _mm_madd_epi16(a, a);
395*09537850SAkhilesh Sanikop   return SumVector_S32(square);
396*09537850SAkhilesh Sanikop }
397*09537850SAkhilesh Sanikop 
CdefDirection_SSE4_1(const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride,uint8_t * LIBGAV1_RESTRICT const direction,int * LIBGAV1_RESTRICT const variance)398*09537850SAkhilesh Sanikop void CdefDirection_SSE4_1(const void* LIBGAV1_RESTRICT const source,
399*09537850SAkhilesh Sanikop                           ptrdiff_t stride,
400*09537850SAkhilesh Sanikop                           uint8_t* LIBGAV1_RESTRICT const direction,
401*09537850SAkhilesh Sanikop                           int* LIBGAV1_RESTRICT const variance) {
402*09537850SAkhilesh Sanikop   assert(direction != nullptr);
403*09537850SAkhilesh Sanikop   assert(variance != nullptr);
404*09537850SAkhilesh Sanikop   const auto* src = static_cast<const uint8_t*>(source);
405*09537850SAkhilesh Sanikop   uint32_t cost[8];
406*09537850SAkhilesh Sanikop   __m128i partial_lo[8], partial_hi[8];
407*09537850SAkhilesh Sanikop 
408*09537850SAkhilesh Sanikop   AddPartial(src, stride, partial_lo, partial_hi);
409*09537850SAkhilesh Sanikop 
410*09537850SAkhilesh Sanikop   cost[2] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[2]);
411*09537850SAkhilesh Sanikop   cost[6] = kCdefDivisionTable[7] * SquareSum_S16(partial_lo[6]);
412*09537850SAkhilesh Sanikop 
413*09537850SAkhilesh Sanikop   const __m128i division_table[2] = {LoadUnaligned16(kCdefDivisionTable),
414*09537850SAkhilesh Sanikop                                      LoadUnaligned16(kCdefDivisionTable + 4)};
415*09537850SAkhilesh Sanikop 
416*09537850SAkhilesh Sanikop   cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table);
417*09537850SAkhilesh Sanikop   cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table);
418*09537850SAkhilesh Sanikop 
419*09537850SAkhilesh Sanikop   const __m128i division_table_odd[2] = {
420*09537850SAkhilesh Sanikop       LoadAligned16(kCdefDivisionTableOddPadded),
421*09537850SAkhilesh Sanikop       LoadAligned16(kCdefDivisionTableOddPadded + 4)};
422*09537850SAkhilesh Sanikop 
423*09537850SAkhilesh Sanikop   cost[1] = CostOdd(partial_lo[1], partial_hi[1], division_table_odd);
424*09537850SAkhilesh Sanikop   cost[3] = CostOdd(partial_lo[3], partial_hi[3], division_table_odd);
425*09537850SAkhilesh Sanikop   cost[5] = CostOdd(partial_lo[5], partial_hi[5], division_table_odd);
426*09537850SAkhilesh Sanikop   cost[7] = CostOdd(partial_lo[7], partial_hi[7], division_table_odd);
427*09537850SAkhilesh Sanikop 
428*09537850SAkhilesh Sanikop   uint32_t best_cost = 0;
429*09537850SAkhilesh Sanikop   *direction = 0;
430*09537850SAkhilesh Sanikop   for (int i = 0; i < 8; ++i) {
431*09537850SAkhilesh Sanikop     if (cost[i] > best_cost) {
432*09537850SAkhilesh Sanikop       best_cost = cost[i];
433*09537850SAkhilesh Sanikop       *direction = i;
434*09537850SAkhilesh Sanikop     }
435*09537850SAkhilesh Sanikop   }
436*09537850SAkhilesh Sanikop   *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
437*09537850SAkhilesh Sanikop }
438*09537850SAkhilesh Sanikop 
439*09537850SAkhilesh Sanikop // -------------------------------------------------------------------------
440*09537850SAkhilesh Sanikop // CdefFilter
441*09537850SAkhilesh Sanikop 
442*09537850SAkhilesh Sanikop // Load 4 vectors based on the given |direction|.
LoadDirection(const uint16_t * LIBGAV1_RESTRICT const src,const ptrdiff_t stride,__m128i * output,const int direction)443*09537850SAkhilesh Sanikop inline void LoadDirection(const uint16_t* LIBGAV1_RESTRICT const src,
444*09537850SAkhilesh Sanikop                           const ptrdiff_t stride, __m128i* output,
445*09537850SAkhilesh Sanikop                           const int direction) {
446*09537850SAkhilesh Sanikop   // Each |direction| describes a different set of source values. Expand this
447*09537850SAkhilesh Sanikop   // set by negating each set. For |direction| == 0 this gives a diagonal line
448*09537850SAkhilesh Sanikop   // from top right to bottom left. The first value is y, the second x. Negative
449*09537850SAkhilesh Sanikop   // y values move up.
450*09537850SAkhilesh Sanikop   //    a       b         c       d
451*09537850SAkhilesh Sanikop   // {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
452*09537850SAkhilesh Sanikop   //         c
453*09537850SAkhilesh Sanikop   //       a
454*09537850SAkhilesh Sanikop   //     0
455*09537850SAkhilesh Sanikop   //   b
456*09537850SAkhilesh Sanikop   // d
457*09537850SAkhilesh Sanikop   const int y_0 = kCdefDirections[direction][0][0];
458*09537850SAkhilesh Sanikop   const int x_0 = kCdefDirections[direction][0][1];
459*09537850SAkhilesh Sanikop   const int y_1 = kCdefDirections[direction][1][0];
460*09537850SAkhilesh Sanikop   const int x_1 = kCdefDirections[direction][1][1];
461*09537850SAkhilesh Sanikop   output[0] = LoadUnaligned16(src - y_0 * stride - x_0);
462*09537850SAkhilesh Sanikop   output[1] = LoadUnaligned16(src + y_0 * stride + x_0);
463*09537850SAkhilesh Sanikop   output[2] = LoadUnaligned16(src - y_1 * stride - x_1);
464*09537850SAkhilesh Sanikop   output[3] = LoadUnaligned16(src + y_1 * stride + x_1);
465*09537850SAkhilesh Sanikop }
466*09537850SAkhilesh Sanikop 
467*09537850SAkhilesh Sanikop // Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
468*09537850SAkhilesh Sanikop // do 2 rows at a time.
LoadDirection4(const uint16_t * LIBGAV1_RESTRICT const src,const ptrdiff_t stride,__m128i * output,const int direction)469*09537850SAkhilesh Sanikop void LoadDirection4(const uint16_t* LIBGAV1_RESTRICT const src,
470*09537850SAkhilesh Sanikop                     const ptrdiff_t stride, __m128i* output,
471*09537850SAkhilesh Sanikop                     const int direction) {
472*09537850SAkhilesh Sanikop   const int y_0 = kCdefDirections[direction][0][0];
473*09537850SAkhilesh Sanikop   const int x_0 = kCdefDirections[direction][0][1];
474*09537850SAkhilesh Sanikop   const int y_1 = kCdefDirections[direction][1][0];
475*09537850SAkhilesh Sanikop   const int x_1 = kCdefDirections[direction][1][1];
476*09537850SAkhilesh Sanikop   output[0] = LoadHi8(LoadLo8(src - y_0 * stride - x_0),
477*09537850SAkhilesh Sanikop                       src - y_0 * stride + stride - x_0);
478*09537850SAkhilesh Sanikop   output[1] = LoadHi8(LoadLo8(src + y_0 * stride + x_0),
479*09537850SAkhilesh Sanikop                       src + y_0 * stride + stride + x_0);
480*09537850SAkhilesh Sanikop   output[2] = LoadHi8(LoadLo8(src - y_1 * stride - x_1),
481*09537850SAkhilesh Sanikop                       src - y_1 * stride + stride - x_1);
482*09537850SAkhilesh Sanikop   output[3] = LoadHi8(LoadLo8(src + y_1 * stride + x_1),
483*09537850SAkhilesh Sanikop                       src + y_1 * stride + stride + x_1);
484*09537850SAkhilesh Sanikop }
485*09537850SAkhilesh Sanikop 
Constrain(const __m128i & pixel,const __m128i & reference,const __m128i & damping,const __m128i & threshold)486*09537850SAkhilesh Sanikop inline __m128i Constrain(const __m128i& pixel, const __m128i& reference,
487*09537850SAkhilesh Sanikop                          const __m128i& damping, const __m128i& threshold) {
488*09537850SAkhilesh Sanikop   const __m128i diff = _mm_sub_epi16(pixel, reference);
489*09537850SAkhilesh Sanikop   const __m128i abs_diff = _mm_abs_epi16(diff);
490*09537850SAkhilesh Sanikop   // sign(diff) * Clip3(threshold - (std::abs(diff) >> damping),
491*09537850SAkhilesh Sanikop   //                    0, std::abs(diff))
492*09537850SAkhilesh Sanikop   const __m128i shifted_diff = _mm_srl_epi16(abs_diff, damping);
493*09537850SAkhilesh Sanikop   // For bitdepth == 8, the threshold range is [0, 15] and the damping range is
494*09537850SAkhilesh Sanikop   // [3, 6]. If pixel == kCdefLargeValue(0x4000), shifted_diff will always be
495*09537850SAkhilesh Sanikop   // larger than threshold. Subtract using saturation will return 0 when pixel
496*09537850SAkhilesh Sanikop   // == kCdefLargeValue.
497*09537850SAkhilesh Sanikop   static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
498*09537850SAkhilesh Sanikop   const __m128i thresh_minus_shifted_diff =
499*09537850SAkhilesh Sanikop       _mm_subs_epu16(threshold, shifted_diff);
500*09537850SAkhilesh Sanikop   const __m128i clamp_abs_diff =
501*09537850SAkhilesh Sanikop       _mm_min_epi16(thresh_minus_shifted_diff, abs_diff);
502*09537850SAkhilesh Sanikop   // Restore the sign.
503*09537850SAkhilesh Sanikop   return _mm_sign_epi16(clamp_abs_diff, diff);
504*09537850SAkhilesh Sanikop }
505*09537850SAkhilesh Sanikop 
ApplyConstrainAndTap(const __m128i & pixel,const __m128i & val,const __m128i & tap,const __m128i & damping,const __m128i & threshold)506*09537850SAkhilesh Sanikop inline __m128i ApplyConstrainAndTap(const __m128i& pixel, const __m128i& val,
507*09537850SAkhilesh Sanikop                                     const __m128i& tap, const __m128i& damping,
508*09537850SAkhilesh Sanikop                                     const __m128i& threshold) {
509*09537850SAkhilesh Sanikop   const __m128i constrained = Constrain(val, pixel, damping, threshold);
510*09537850SAkhilesh Sanikop   return _mm_mullo_epi16(constrained, tap);
511*09537850SAkhilesh Sanikop }
512*09537850SAkhilesh Sanikop 
513*09537850SAkhilesh Sanikop template <int width, bool enable_primary = true, bool enable_secondary = true>
CdefFilter_SSE4_1(const uint16_t * LIBGAV1_RESTRICT src,const ptrdiff_t src_stride,const int height,const int primary_strength,const int secondary_strength,const int damping,const int direction,void * LIBGAV1_RESTRICT dest,const ptrdiff_t dst_stride)514*09537850SAkhilesh Sanikop void CdefFilter_SSE4_1(const uint16_t* LIBGAV1_RESTRICT src,
515*09537850SAkhilesh Sanikop                        const ptrdiff_t src_stride, const int height,
516*09537850SAkhilesh Sanikop                        const int primary_strength, const int secondary_strength,
517*09537850SAkhilesh Sanikop                        const int damping, const int direction,
518*09537850SAkhilesh Sanikop                        void* LIBGAV1_RESTRICT dest,
519*09537850SAkhilesh Sanikop                        const ptrdiff_t dst_stride) {
520*09537850SAkhilesh Sanikop   static_assert(width == 8 || width == 4, "Invalid CDEF width.");
521*09537850SAkhilesh Sanikop   static_assert(enable_primary || enable_secondary, "");
522*09537850SAkhilesh Sanikop   constexpr bool clipping_required = enable_primary && enable_secondary;
523*09537850SAkhilesh Sanikop   auto* dst = static_cast<uint8_t*>(dest);
524*09537850SAkhilesh Sanikop   __m128i primary_damping_shift, secondary_damping_shift;
525*09537850SAkhilesh Sanikop 
526*09537850SAkhilesh Sanikop   // FloorLog2() requires input to be > 0.
527*09537850SAkhilesh Sanikop   // 8-bit damping range: Y: [3, 6], UV: [2, 5].
528*09537850SAkhilesh Sanikop   if (enable_primary) {
529*09537850SAkhilesh Sanikop     // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary
530*09537850SAkhilesh Sanikop     // for UV filtering.
531*09537850SAkhilesh Sanikop     primary_damping_shift =
532*09537850SAkhilesh Sanikop         _mm_cvtsi32_si128(std::max(0, damping - FloorLog2(primary_strength)));
533*09537850SAkhilesh Sanikop   }
534*09537850SAkhilesh Sanikop   if (enable_secondary) {
535*09537850SAkhilesh Sanikop     // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
536*09537850SAkhilesh Sanikop     // necessary.
537*09537850SAkhilesh Sanikop     assert(damping - FloorLog2(secondary_strength) >= 0);
538*09537850SAkhilesh Sanikop     secondary_damping_shift =
539*09537850SAkhilesh Sanikop         _mm_cvtsi32_si128(damping - FloorLog2(secondary_strength));
540*09537850SAkhilesh Sanikop   }
541*09537850SAkhilesh Sanikop 
542*09537850SAkhilesh Sanikop   const __m128i primary_tap_0 =
543*09537850SAkhilesh Sanikop       _mm_set1_epi16(kCdefPrimaryTaps[primary_strength & 1][0]);
544*09537850SAkhilesh Sanikop   const __m128i primary_tap_1 =
545*09537850SAkhilesh Sanikop       _mm_set1_epi16(kCdefPrimaryTaps[primary_strength & 1][1]);
546*09537850SAkhilesh Sanikop   const __m128i secondary_tap_0 = _mm_set1_epi16(kCdefSecondaryTap0);
547*09537850SAkhilesh Sanikop   const __m128i secondary_tap_1 = _mm_set1_epi16(kCdefSecondaryTap1);
548*09537850SAkhilesh Sanikop   const __m128i cdef_large_value_mask =
549*09537850SAkhilesh Sanikop       _mm_set1_epi16(static_cast<int16_t>(~kCdefLargeValue));
550*09537850SAkhilesh Sanikop   const __m128i primary_threshold = _mm_set1_epi16(primary_strength);
551*09537850SAkhilesh Sanikop   const __m128i secondary_threshold = _mm_set1_epi16(secondary_strength);
552*09537850SAkhilesh Sanikop 
553*09537850SAkhilesh Sanikop   int y = height;
554*09537850SAkhilesh Sanikop   do {
555*09537850SAkhilesh Sanikop     __m128i pixel;
556*09537850SAkhilesh Sanikop     if (width == 8) {
557*09537850SAkhilesh Sanikop       pixel = LoadUnaligned16(src);
558*09537850SAkhilesh Sanikop     } else {
559*09537850SAkhilesh Sanikop       pixel = LoadHi8(LoadLo8(src), src + src_stride);
560*09537850SAkhilesh Sanikop     }
561*09537850SAkhilesh Sanikop 
562*09537850SAkhilesh Sanikop     __m128i min = pixel;
563*09537850SAkhilesh Sanikop     __m128i max = pixel;
564*09537850SAkhilesh Sanikop     __m128i sum;
565*09537850SAkhilesh Sanikop 
566*09537850SAkhilesh Sanikop     if (enable_primary) {
567*09537850SAkhilesh Sanikop       // Primary |direction|.
568*09537850SAkhilesh Sanikop       __m128i primary_val[4];
569*09537850SAkhilesh Sanikop       if (width == 8) {
570*09537850SAkhilesh Sanikop         LoadDirection(src, src_stride, primary_val, direction);
571*09537850SAkhilesh Sanikop       } else {
572*09537850SAkhilesh Sanikop         LoadDirection4(src, src_stride, primary_val, direction);
573*09537850SAkhilesh Sanikop       }
574*09537850SAkhilesh Sanikop 
575*09537850SAkhilesh Sanikop       if (clipping_required) {
576*09537850SAkhilesh Sanikop         min = _mm_min_epu16(min, primary_val[0]);
577*09537850SAkhilesh Sanikop         min = _mm_min_epu16(min, primary_val[1]);
578*09537850SAkhilesh Sanikop         min = _mm_min_epu16(min, primary_val[2]);
579*09537850SAkhilesh Sanikop         min = _mm_min_epu16(min, primary_val[3]);
580*09537850SAkhilesh Sanikop 
581*09537850SAkhilesh Sanikop         // The source is 16 bits, however, we only really care about the lower
582*09537850SAkhilesh Sanikop         // 8 bits.  The upper 8 bits contain the "large" flag.  After the final
583*09537850SAkhilesh Sanikop         // primary max has been calculated, zero out the upper 8 bits.  Use this
584*09537850SAkhilesh Sanikop         // to find the "16 bit" max.
585*09537850SAkhilesh Sanikop         const __m128i max_p01 = _mm_max_epu8(primary_val[0], primary_val[1]);
586*09537850SAkhilesh Sanikop         const __m128i max_p23 = _mm_max_epu8(primary_val[2], primary_val[3]);
587*09537850SAkhilesh Sanikop         const __m128i max_p = _mm_max_epu8(max_p01, max_p23);
588*09537850SAkhilesh Sanikop         max = _mm_max_epu16(max, _mm_and_si128(max_p, cdef_large_value_mask));
589*09537850SAkhilesh Sanikop       }
590*09537850SAkhilesh Sanikop 
591*09537850SAkhilesh Sanikop       sum = ApplyConstrainAndTap(pixel, primary_val[0], primary_tap_0,
592*09537850SAkhilesh Sanikop                                  primary_damping_shift, primary_threshold);
593*09537850SAkhilesh Sanikop       sum = _mm_add_epi16(
594*09537850SAkhilesh Sanikop           sum, ApplyConstrainAndTap(pixel, primary_val[1], primary_tap_0,
595*09537850SAkhilesh Sanikop                                     primary_damping_shift, primary_threshold));
596*09537850SAkhilesh Sanikop       sum = _mm_add_epi16(
597*09537850SAkhilesh Sanikop           sum, ApplyConstrainAndTap(pixel, primary_val[2], primary_tap_1,
598*09537850SAkhilesh Sanikop                                     primary_damping_shift, primary_threshold));
599*09537850SAkhilesh Sanikop       sum = _mm_add_epi16(
600*09537850SAkhilesh Sanikop           sum, ApplyConstrainAndTap(pixel, primary_val[3], primary_tap_1,
601*09537850SAkhilesh Sanikop                                     primary_damping_shift, primary_threshold));
602*09537850SAkhilesh Sanikop     } else {
603*09537850SAkhilesh Sanikop       sum = _mm_setzero_si128();
604*09537850SAkhilesh Sanikop     }
605*09537850SAkhilesh Sanikop 
606*09537850SAkhilesh Sanikop     if (enable_secondary) {
607*09537850SAkhilesh Sanikop       // Secondary |direction| values (+/- 2). Clamp |direction|.
608*09537850SAkhilesh Sanikop       __m128i secondary_val[8];
609*09537850SAkhilesh Sanikop       if (width == 8) {
610*09537850SAkhilesh Sanikop         LoadDirection(src, src_stride, secondary_val, direction + 2);
611*09537850SAkhilesh Sanikop         LoadDirection(src, src_stride, secondary_val + 4, direction - 2);
612*09537850SAkhilesh Sanikop       } else {
613*09537850SAkhilesh Sanikop         LoadDirection4(src, src_stride, secondary_val, direction + 2);
614*09537850SAkhilesh Sanikop         LoadDirection4(src, src_stride, secondary_val + 4, direction - 2);
615*09537850SAkhilesh Sanikop       }
616*09537850SAkhilesh Sanikop 
617*09537850SAkhilesh Sanikop       if (clipping_required) {
618*09537850SAkhilesh Sanikop         min = _mm_min_epu16(min, secondary_val[0]);
619*09537850SAkhilesh Sanikop         min = _mm_min_epu16(min, secondary_val[1]);
620*09537850SAkhilesh Sanikop         min = _mm_min_epu16(min, secondary_val[2]);
621*09537850SAkhilesh Sanikop         min = _mm_min_epu16(min, secondary_val[3]);
622*09537850SAkhilesh Sanikop         min = _mm_min_epu16(min, secondary_val[4]);
623*09537850SAkhilesh Sanikop         min = _mm_min_epu16(min, secondary_val[5]);
624*09537850SAkhilesh Sanikop         min = _mm_min_epu16(min, secondary_val[6]);
625*09537850SAkhilesh Sanikop         min = _mm_min_epu16(min, secondary_val[7]);
626*09537850SAkhilesh Sanikop 
627*09537850SAkhilesh Sanikop         const __m128i max_s01 =
628*09537850SAkhilesh Sanikop             _mm_max_epu8(secondary_val[0], secondary_val[1]);
629*09537850SAkhilesh Sanikop         const __m128i max_s23 =
630*09537850SAkhilesh Sanikop             _mm_max_epu8(secondary_val[2], secondary_val[3]);
631*09537850SAkhilesh Sanikop         const __m128i max_s45 =
632*09537850SAkhilesh Sanikop             _mm_max_epu8(secondary_val[4], secondary_val[5]);
633*09537850SAkhilesh Sanikop         const __m128i max_s67 =
634*09537850SAkhilesh Sanikop             _mm_max_epu8(secondary_val[6], secondary_val[7]);
635*09537850SAkhilesh Sanikop         const __m128i max_s = _mm_max_epu8(_mm_max_epu8(max_s01, max_s23),
636*09537850SAkhilesh Sanikop                                            _mm_max_epu8(max_s45, max_s67));
637*09537850SAkhilesh Sanikop         max = _mm_max_epu16(max, _mm_and_si128(max_s, cdef_large_value_mask));
638*09537850SAkhilesh Sanikop       }
639*09537850SAkhilesh Sanikop 
640*09537850SAkhilesh Sanikop       sum = _mm_add_epi16(
641*09537850SAkhilesh Sanikop           sum,
642*09537850SAkhilesh Sanikop           ApplyConstrainAndTap(pixel, secondary_val[0], secondary_tap_0,
643*09537850SAkhilesh Sanikop                                secondary_damping_shift, secondary_threshold));
644*09537850SAkhilesh Sanikop       sum = _mm_add_epi16(
645*09537850SAkhilesh Sanikop           sum,
646*09537850SAkhilesh Sanikop           ApplyConstrainAndTap(pixel, secondary_val[1], secondary_tap_0,
647*09537850SAkhilesh Sanikop                                secondary_damping_shift, secondary_threshold));
648*09537850SAkhilesh Sanikop       sum = _mm_add_epi16(
649*09537850SAkhilesh Sanikop           sum,
650*09537850SAkhilesh Sanikop           ApplyConstrainAndTap(pixel, secondary_val[2], secondary_tap_1,
651*09537850SAkhilesh Sanikop                                secondary_damping_shift, secondary_threshold));
652*09537850SAkhilesh Sanikop       sum = _mm_add_epi16(
653*09537850SAkhilesh Sanikop           sum,
654*09537850SAkhilesh Sanikop           ApplyConstrainAndTap(pixel, secondary_val[3], secondary_tap_1,
655*09537850SAkhilesh Sanikop                                secondary_damping_shift, secondary_threshold));
656*09537850SAkhilesh Sanikop       sum = _mm_add_epi16(
657*09537850SAkhilesh Sanikop           sum,
658*09537850SAkhilesh Sanikop           ApplyConstrainAndTap(pixel, secondary_val[4], secondary_tap_0,
659*09537850SAkhilesh Sanikop                                secondary_damping_shift, secondary_threshold));
660*09537850SAkhilesh Sanikop       sum = _mm_add_epi16(
661*09537850SAkhilesh Sanikop           sum,
662*09537850SAkhilesh Sanikop           ApplyConstrainAndTap(pixel, secondary_val[5], secondary_tap_0,
663*09537850SAkhilesh Sanikop                                secondary_damping_shift, secondary_threshold));
664*09537850SAkhilesh Sanikop       sum = _mm_add_epi16(
665*09537850SAkhilesh Sanikop           sum,
666*09537850SAkhilesh Sanikop           ApplyConstrainAndTap(pixel, secondary_val[6], secondary_tap_1,
667*09537850SAkhilesh Sanikop                                secondary_damping_shift, secondary_threshold));
668*09537850SAkhilesh Sanikop       sum = _mm_add_epi16(
669*09537850SAkhilesh Sanikop           sum,
670*09537850SAkhilesh Sanikop           ApplyConstrainAndTap(pixel, secondary_val[7], secondary_tap_1,
671*09537850SAkhilesh Sanikop                                secondary_damping_shift, secondary_threshold));
672*09537850SAkhilesh Sanikop     }
673*09537850SAkhilesh Sanikop     // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max))
674*09537850SAkhilesh Sanikop     const __m128i sum_lt_0 = _mm_srai_epi16(sum, 15);
675*09537850SAkhilesh Sanikop     // 8 + sum
676*09537850SAkhilesh Sanikop     sum = _mm_add_epi16(sum, _mm_set1_epi16(8));
677*09537850SAkhilesh Sanikop     // (... - (sum < 0)) >> 4
678*09537850SAkhilesh Sanikop     sum = _mm_add_epi16(sum, sum_lt_0);
679*09537850SAkhilesh Sanikop     sum = _mm_srai_epi16(sum, 4);
680*09537850SAkhilesh Sanikop     // pixel + ...
681*09537850SAkhilesh Sanikop     sum = _mm_add_epi16(sum, pixel);
682*09537850SAkhilesh Sanikop     if (clipping_required) {
683*09537850SAkhilesh Sanikop       // Clip3
684*09537850SAkhilesh Sanikop       sum = _mm_min_epi16(sum, max);
685*09537850SAkhilesh Sanikop       sum = _mm_max_epi16(sum, min);
686*09537850SAkhilesh Sanikop     }
687*09537850SAkhilesh Sanikop 
688*09537850SAkhilesh Sanikop     const __m128i result = _mm_packus_epi16(sum, sum);
689*09537850SAkhilesh Sanikop     if (width == 8) {
690*09537850SAkhilesh Sanikop       src += src_stride;
691*09537850SAkhilesh Sanikop       StoreLo8(dst, result);
692*09537850SAkhilesh Sanikop       dst += dst_stride;
693*09537850SAkhilesh Sanikop       --y;
694*09537850SAkhilesh Sanikop     } else {
695*09537850SAkhilesh Sanikop       src += src_stride << 1;
696*09537850SAkhilesh Sanikop       Store4(dst, result);
697*09537850SAkhilesh Sanikop       dst += dst_stride;
698*09537850SAkhilesh Sanikop       Store4(dst, _mm_srli_si128(result, 4));
699*09537850SAkhilesh Sanikop       dst += dst_stride;
700*09537850SAkhilesh Sanikop       y -= 2;
701*09537850SAkhilesh Sanikop     }
702*09537850SAkhilesh Sanikop   } while (y != 0);
703*09537850SAkhilesh Sanikop }
704*09537850SAkhilesh Sanikop 
Init8bpp()705*09537850SAkhilesh Sanikop void Init8bpp() {
706*09537850SAkhilesh Sanikop   Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
707*09537850SAkhilesh Sanikop   assert(dsp != nullptr);
708*09537850SAkhilesh Sanikop   dsp->cdef_direction = CdefDirection_SSE4_1;
709*09537850SAkhilesh Sanikop   dsp->cdef_filters[0][0] = CdefFilter_SSE4_1<4>;
710*09537850SAkhilesh Sanikop   dsp->cdef_filters[0][1] =
711*09537850SAkhilesh Sanikop       CdefFilter_SSE4_1<4, /*enable_primary=*/true, /*enable_secondary=*/false>;
712*09537850SAkhilesh Sanikop   dsp->cdef_filters[0][2] = CdefFilter_SSE4_1<4, /*enable_primary=*/false>;
713*09537850SAkhilesh Sanikop   dsp->cdef_filters[1][0] = CdefFilter_SSE4_1<8>;
714*09537850SAkhilesh Sanikop   dsp->cdef_filters[1][1] =
715*09537850SAkhilesh Sanikop       CdefFilter_SSE4_1<8, /*enable_primary=*/true, /*enable_secondary=*/false>;
716*09537850SAkhilesh Sanikop   dsp->cdef_filters[1][2] = CdefFilter_SSE4_1<8, /*enable_primary=*/false>;
717*09537850SAkhilesh Sanikop }
718*09537850SAkhilesh Sanikop 
719*09537850SAkhilesh Sanikop }  // namespace
720*09537850SAkhilesh Sanikop }  // namespace low_bitdepth
721*09537850SAkhilesh Sanikop 
CdefInit_SSE4_1()722*09537850SAkhilesh Sanikop void CdefInit_SSE4_1() { low_bitdepth::Init8bpp(); }
723*09537850SAkhilesh Sanikop 
724*09537850SAkhilesh Sanikop }  // namespace dsp
725*09537850SAkhilesh Sanikop }  // namespace libgav1
726*09537850SAkhilesh Sanikop #else   // !LIBGAV1_TARGETING_SSE4_1
727*09537850SAkhilesh Sanikop namespace libgav1 {
728*09537850SAkhilesh Sanikop namespace dsp {
729*09537850SAkhilesh Sanikop 
// Stub used when the build does not target SSE4.1: nothing is registered.
void CdefInit_SSE4_1() {
}
731*09537850SAkhilesh Sanikop 
732*09537850SAkhilesh Sanikop }  // namespace dsp
733*09537850SAkhilesh Sanikop }  // namespace libgav1
734*09537850SAkhilesh Sanikop #endif  // LIBGAV1_TARGETING_SSE4_1
735