1*09537850SAkhilesh Sanikop // Copyright 2021 The libgav1 Authors
2*09537850SAkhilesh Sanikop //
3*09537850SAkhilesh Sanikop // Licensed under the Apache License, Version 2.0 (the "License");
4*09537850SAkhilesh Sanikop // you may not use this file except in compliance with the License.
5*09537850SAkhilesh Sanikop // You may obtain a copy of the License at
6*09537850SAkhilesh Sanikop //
7*09537850SAkhilesh Sanikop // http://www.apache.org/licenses/LICENSE-2.0
8*09537850SAkhilesh Sanikop //
9*09537850SAkhilesh Sanikop // Unless required by applicable law or agreed to in writing, software
10*09537850SAkhilesh Sanikop // distributed under the License is distributed on an "AS IS" BASIS,
11*09537850SAkhilesh Sanikop // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*09537850SAkhilesh Sanikop // See the License for the specific language governing permissions and
13*09537850SAkhilesh Sanikop // limitations under the License.
14*09537850SAkhilesh Sanikop
15*09537850SAkhilesh Sanikop #include "src/dsp/cdef.h"
16*09537850SAkhilesh Sanikop #include "src/utils/cpu.h"
17*09537850SAkhilesh Sanikop
18*09537850SAkhilesh Sanikop #if LIBGAV1_TARGETING_AVX2
19*09537850SAkhilesh Sanikop #include <immintrin.h>
20*09537850SAkhilesh Sanikop
21*09537850SAkhilesh Sanikop #include <algorithm>
22*09537850SAkhilesh Sanikop #include <cassert>
23*09537850SAkhilesh Sanikop #include <cstddef>
24*09537850SAkhilesh Sanikop #include <cstdint>
25*09537850SAkhilesh Sanikop #include <cstdlib>
26*09537850SAkhilesh Sanikop
27*09537850SAkhilesh Sanikop #include "src/dsp/constants.h"
28*09537850SAkhilesh Sanikop #include "src/dsp/dsp.h"
29*09537850SAkhilesh Sanikop #include "src/dsp/x86/common_avx2.h"
30*09537850SAkhilesh Sanikop #include "src/utils/common.h"
31*09537850SAkhilesh Sanikop #include "src/utils/constants.h"
32*09537850SAkhilesh Sanikop
33*09537850SAkhilesh Sanikop namespace libgav1 {
34*09537850SAkhilesh Sanikop namespace dsp {
35*09537850SAkhilesh Sanikop namespace low_bitdepth {
36*09537850SAkhilesh Sanikop namespace {
37*09537850SAkhilesh Sanikop
38*09537850SAkhilesh Sanikop #include "src/dsp/cdef.inc"
39*09537850SAkhilesh Sanikop
// Used when calculating odd |cost[x]| values.
// Holds elements 1 3 5 7 7 7 7 7
// Row 0 repeats elements {1, 3, 5, 7} of kCdefDivisionTable once per 128-bit
// lane so a pair of odd directions can be costed at once; row 1 pads the
// remaining positions with the element-7 value (105), the weight applied to
// the middle elements of an odd partial.
alignas(32) constexpr uint32_t kCdefDivisionTableOddPairsPadded[] = {
    420, 210, 140, 105, 420, 210, 140, 105,
    105, 105, 105, 105, 105, 105, 105, 105};
45*09537850SAkhilesh Sanikop
46*09537850SAkhilesh Sanikop // ----------------------------------------------------------------------------
47*09537850SAkhilesh Sanikop // Refer to CdefDirection_C().
48*09537850SAkhilesh Sanikop //
49*09537850SAkhilesh Sanikop // int32_t partial[8][15] = {};
50*09537850SAkhilesh Sanikop // for (int i = 0; i < 8; ++i) {
51*09537850SAkhilesh Sanikop // for (int j = 0; j < 8; ++j) {
52*09537850SAkhilesh Sanikop // const int x = 1;
53*09537850SAkhilesh Sanikop // partial[0][i + j] += x;
54*09537850SAkhilesh Sanikop // partial[1][i + j / 2] += x;
55*09537850SAkhilesh Sanikop // partial[2][i] += x;
56*09537850SAkhilesh Sanikop // partial[3][3 + i - j / 2] += x;
57*09537850SAkhilesh Sanikop // partial[4][7 + i - j] += x;
58*09537850SAkhilesh Sanikop // partial[5][3 - i / 2 + j] += x;
59*09537850SAkhilesh Sanikop // partial[6][j] += x;
60*09537850SAkhilesh Sanikop // partial[7][i / 2 + j] += x;
61*09537850SAkhilesh Sanikop // }
62*09537850SAkhilesh Sanikop // }
63*09537850SAkhilesh Sanikop //
64*09537850SAkhilesh Sanikop // Using the code above, generate the position count for partial[8][15].
65*09537850SAkhilesh Sanikop //
66*09537850SAkhilesh Sanikop // partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
67*09537850SAkhilesh Sanikop // partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
68*09537850SAkhilesh Sanikop // partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
69*09537850SAkhilesh Sanikop // partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
70*09537850SAkhilesh Sanikop // partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
71*09537850SAkhilesh Sanikop // partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
72*09537850SAkhilesh Sanikop // partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
73*09537850SAkhilesh Sanikop // partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
74*09537850SAkhilesh Sanikop //
75*09537850SAkhilesh Sanikop // The SIMD code shifts the input horizontally, then adds vertically to get the
76*09537850SAkhilesh Sanikop // correct partial value for the given position.
77*09537850SAkhilesh Sanikop // ----------------------------------------------------------------------------
78*09537850SAkhilesh Sanikop
79*09537850SAkhilesh Sanikop // ----------------------------------------------------------------------------
80*09537850SAkhilesh Sanikop // partial[0][i + j] += x;
81*09537850SAkhilesh Sanikop //
82*09537850SAkhilesh Sanikop // 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
83*09537850SAkhilesh Sanikop // 00 10 11 12 13 14 15 16 17 00 00 00 00 00 00
84*09537850SAkhilesh Sanikop // 00 00 20 21 22 23 24 25 26 27 00 00 00 00 00
85*09537850SAkhilesh Sanikop // 00 00 00 30 31 32 33 34 35 36 37 00 00 00 00
86*09537850SAkhilesh Sanikop // 00 00 00 00 40 41 42 43 44 45 46 47 00 00 00
87*09537850SAkhilesh Sanikop // 00 00 00 00 00 50 51 52 53 54 55 56 57 00 00
88*09537850SAkhilesh Sanikop // 00 00 00 00 00 00 60 61 62 63 64 65 66 67 00
89*09537850SAkhilesh Sanikop // 00 00 00 00 00 00 00 70 71 72 73 74 75 76 77
90*09537850SAkhilesh Sanikop //
91*09537850SAkhilesh Sanikop // partial[4] is the same except the source is reversed.
// Computes the direction 0 partial sums (see the diagram above): row i of the
// 16-bit source is shifted left by i elements (2 * i bytes) so that position
// i + j lines up in the same column, then the shifted rows are summed into
// |*partial_lo|. Elements pushed past position 7 are recovered with the
// matching right shift and summed into |*partial_hi| (positions 8-14).
// Because the upper lane of |v_src_16| holds the row-reversed source (see
// AddPartial()), the same instruction sequence simultaneously produces the
// direction 4 partials in the upper lane.
LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(__m256i* v_src_16,
                                            __m256i* partial_lo,
                                            __m256i* partial_hi) {
  // 00 01 02 03 04 05 06 07
  *partial_lo = v_src_16[0];
  // 00 00 00 00 00 00 00 00
  *partial_hi = _mm256_setzero_si256();

  // 00 10 11 12 13 14 15 16
  *partial_lo =
      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[1], 2));
  // 17 00 00 00 00 00 00 00
  *partial_hi =
      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[1], 14));

  // 00 00 20 21 22 23 24 25
  *partial_lo =
      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[2], 4));
  // 26 27 00 00 00 00 00 00
  *partial_hi =
      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[2], 12));

  // 00 00 00 30 31 32 33 34
  *partial_lo =
      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[3], 6));
  // 35 36 37 00 00 00 00 00
  *partial_hi =
      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[3], 10));

  // 00 00 00 00 40 41 42 43
  *partial_lo =
      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[4], 8));
  // 44 45 46 47 00 00 00 00
  *partial_hi =
      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[4], 8));

  // 00 00 00 00 00 50 51 52
  *partial_lo =
      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[5], 10));
  // 53 54 55 56 57 00 00 00
  *partial_hi =
      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[5], 6));

  // 00 00 00 00 00 00 60 61
  *partial_lo =
      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[6], 12));
  // 62 63 64 65 66 67 00 00
  *partial_hi =
      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[6], 4));

  // 00 00 00 00 00 00 00 70
  *partial_lo =
      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_src_16[7], 14));
  // 71 72 73 74 75 76 77 00
  *partial_hi =
      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_src_16[7], 2));
}
149*09537850SAkhilesh Sanikop
150*09537850SAkhilesh Sanikop // ----------------------------------------------------------------------------
151*09537850SAkhilesh Sanikop // partial[1][i + j / 2] += x;
152*09537850SAkhilesh Sanikop //
153*09537850SAkhilesh Sanikop // A0 = src[0] + src[1], A1 = src[2] + src[3], ...
154*09537850SAkhilesh Sanikop //
155*09537850SAkhilesh Sanikop // A0 A1 A2 A3 00 00 00 00 00 00 00 00 00 00 00
156*09537850SAkhilesh Sanikop // 00 B0 B1 B2 B3 00 00 00 00 00 00 00 00 00 00
157*09537850SAkhilesh Sanikop // 00 00 C0 C1 C2 C3 00 00 00 00 00 00 00 00 00
158*09537850SAkhilesh Sanikop // 00 00 00 D0 D1 D2 D3 00 00 00 00 00 00 00 00
159*09537850SAkhilesh Sanikop // 00 00 00 00 E0 E1 E2 E3 00 00 00 00 00 00 00
160*09537850SAkhilesh Sanikop // 00 00 00 00 00 F0 F1 F2 F3 00 00 00 00 00 00
161*09537850SAkhilesh Sanikop // 00 00 00 00 00 00 G0 G1 G2 G3 00 00 00 00 00
162*09537850SAkhilesh Sanikop // 00 00 00 00 00 00 00 H0 H1 H2 H3 00 00 00 00
163*09537850SAkhilesh Sanikop //
164*09537850SAkhilesh Sanikop // partial[3] is the same except the source is reversed.
// Computes the direction 1 partial sums (see the diagram above). The j / 2
// term is handled by first adding horizontal pairs within each row; the
// pair-summed rows are then shifted by one element per row and accumulated,
// with overflow past position 7 collected in |*partial_hi|, as in
// AddPartial_D0_D4(). With the row-reversed source in the upper lane the
// same sequence produces the direction 3 partials.
LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(__m256i* v_src_16,
                                            __m256i* partial_lo,
                                            __m256i* partial_hi) {
  __m256i v_d1_temp[8];
  const __m256i v_zero = _mm256_setzero_si256();

  // Pairwise-add neighboring columns of each row; hadd against zero leaves
  // the four pair sums in elements 0-3 of each lane and zeros in 4-7.
  for (int i = 0; i < 8; ++i) {
    v_d1_temp[i] = _mm256_hadd_epi16(v_src_16[i], v_zero);
  }

  *partial_lo = *partial_hi = v_zero;
  // A0 A1 A2 A3 00 00 00 00
  *partial_lo = _mm256_add_epi16(*partial_lo, v_d1_temp[0]);

  // 00 B0 B1 B2 B3 00 00 00
  *partial_lo =
      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[1], 2));

  // 00 00 C0 C1 C2 C3 00 00
  *partial_lo =
      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[2], 4));
  // 00 00 00 D0 D1 D2 D3 00
  *partial_lo =
      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[3], 6));
  // 00 00 00 00 E0 E1 E2 E3
  *partial_lo =
      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[4], 8));

  // 00 00 00 00 00 F0 F1 F2
  *partial_lo =
      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[5], 10));
  // F3 00 00 00 00 00 00 00
  *partial_hi =
      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[5], 6));

  // 00 00 00 00 00 00 G0 G1
  *partial_lo =
      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[6], 12));
  // G2 G3 00 00 00 00 00 00
  *partial_hi =
      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[6], 4));

  // 00 00 00 00 00 00 00 H0
  *partial_lo =
      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_d1_temp[7], 14));
  // H1 H2 H3 00 00 00 00 00
  *partial_hi =
      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_d1_temp[7], 2));
}
214*09537850SAkhilesh Sanikop
215*09537850SAkhilesh Sanikop // ----------------------------------------------------------------------------
216*09537850SAkhilesh Sanikop // partial[7][i / 2 + j] += x;
217*09537850SAkhilesh Sanikop //
218*09537850SAkhilesh Sanikop // 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
219*09537850SAkhilesh Sanikop // 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00
220*09537850SAkhilesh Sanikop // 00 20 21 22 23 24 25 26 27 00 00 00 00 00 00
221*09537850SAkhilesh Sanikop // 00 30 31 32 33 34 35 36 37 00 00 00 00 00 00
222*09537850SAkhilesh Sanikop // 00 00 40 41 42 43 44 45 46 47 00 00 00 00 00
223*09537850SAkhilesh Sanikop // 00 00 50 51 52 53 54 55 56 57 00 00 00 00 00
224*09537850SAkhilesh Sanikop // 00 00 00 60 61 62 63 64 65 66 67 00 00 00 00
225*09537850SAkhilesh Sanikop // 00 00 00 70 71 72 73 74 75 76 77 00 00 00 00
226*09537850SAkhilesh Sanikop //
227*09537850SAkhilesh Sanikop // partial[5] is the same except the source is reversed.
// Computes the direction 7 partial sums (see the diagram above). The i / 2
// term is handled by first adding vertically adjacent row pairs; each pair
// sum is then shifted by one element per pair and accumulated, with
// overflow past position 7 collected in |*partial_hi|. With the
// row-reversed source in the upper lane the same sequence produces the
// direction 5 partials.
LIBGAV1_ALWAYS_INLINE void AddPartial_D7_D5(__m256i* v_src, __m256i* partial_lo,
                                            __m256i* partial_hi) {
  __m256i v_pair_add[4];
  // Add vertical source pairs.
  v_pair_add[0] = _mm256_add_epi16(v_src[0], v_src[1]);
  v_pair_add[1] = _mm256_add_epi16(v_src[2], v_src[3]);
  v_pair_add[2] = _mm256_add_epi16(v_src[4], v_src[5]);
  v_pair_add[3] = _mm256_add_epi16(v_src[6], v_src[7]);

  // 00 01 02 03 04 05 06 07
  // 10 11 12 13 14 15 16 17
  *partial_lo = v_pair_add[0];
  // 00 00 00 00 00 00 00 00
  // 00 00 00 00 00 00 00 00
  *partial_hi = _mm256_setzero_si256();

  // 00 20 21 22 23 24 25 26
  // 00 30 31 32 33 34 35 36
  *partial_lo =
      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[1], 2));
  // 27 00 00 00 00 00 00 00
  // 37 00 00 00 00 00 00 00
  *partial_hi =
      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[1], 14));

  // 00 00 40 41 42 43 44 45
  // 00 00 50 51 52 53 54 55
  *partial_lo =
      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[2], 4));
  // 46 47 00 00 00 00 00 00
  // 56 57 00 00 00 00 00 00
  *partial_hi =
      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[2], 12));

  // 00 00 00 60 61 62 63 64
  // 00 00 00 70 71 72 73 74
  *partial_lo =
      _mm256_add_epi16(*partial_lo, _mm256_slli_si256(v_pair_add[3], 6));
  // 65 66 67 00 00 00 00 00
  // 75 76 77 00 00 00 00 00
  *partial_hi =
      _mm256_add_epi16(*partial_hi, _mm256_srli_si256(v_pair_add[3], 10));
}
271*09537850SAkhilesh Sanikop
// Loads an 8x8 block of 8-bit pixels from |src| and fills |partial| with the
// eight directional partial sums. Each output vector packs two related
// partials, one per 128-bit lane (see the layout comment in
// CdefDirection_AVX2()): the lower lane is computed from the rows as loaded
// and the upper lane from the horizontally reversed rows.
LIBGAV1_ALWAYS_INLINE void AddPartial(const uint8_t* LIBGAV1_RESTRICT src,
                                      ptrdiff_t stride, __m256i* partial) {
  // 8x8 input
  // 00 01 02 03 04 05 06 07
  // 10 11 12 13 14 15 16 17
  // 20 21 22 23 24 25 26 27
  // 30 31 32 33 34 35 36 37
  // 40 41 42 43 44 45 46 47
  // 50 51 52 53 54 55 56 57
  // 60 61 62 63 64 65 66 67
  // 70 71 72 73 74 75 76 77
  __m256i v_src[8];
  for (auto& i : v_src) {
    i = _mm256_castsi128_si256(LoadLo8(src));
    // Dup lower lane.
    i = _mm256_permute2x128_si256(i, i, 0x0);
    src += stride;
  }

  const __m256i v_zero = _mm256_setzero_si256();
  // partial for direction 2
  // --------------------------------------------------------------------------
  // partial[2][i] += x;
  // 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
  // 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
  // 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
  // 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
  // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx
  // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx
  // 06 16 26 36 46 56 66 76 xx xx xx xx xx xx xx xx
  // 07 17 27 37 47 57 67 77 xx xx xx xx xx xx xx xx
  // SAD against zero horizontally sums each 8-byte row; the unpacks then
  // gather the eight 16-bit row sums into a single vector.
  const __m256i v_src_4_0 = _mm256_unpacklo_epi64(v_src[0], v_src[4]);
  const __m256i v_src_5_1 = _mm256_unpacklo_epi64(v_src[1], v_src[5]);
  const __m256i v_src_6_2 = _mm256_unpacklo_epi64(v_src[2], v_src[6]);
  const __m256i v_src_7_3 = _mm256_unpacklo_epi64(v_src[3], v_src[7]);
  const __m256i v_hsum_4_0 = _mm256_sad_epu8(v_src_4_0, v_zero);
  const __m256i v_hsum_5_1 = _mm256_sad_epu8(v_src_5_1, v_zero);
  const __m256i v_hsum_6_2 = _mm256_sad_epu8(v_src_6_2, v_zero);
  const __m256i v_hsum_7_3 = _mm256_sad_epu8(v_src_7_3, v_zero);
  const __m256i v_hsum_1_0 = _mm256_unpacklo_epi16(v_hsum_4_0, v_hsum_5_1);
  const __m256i v_hsum_3_2 = _mm256_unpacklo_epi16(v_hsum_6_2, v_hsum_7_3);
  const __m256i v_hsum_5_4 = _mm256_unpackhi_epi16(v_hsum_4_0, v_hsum_5_1);
  const __m256i v_hsum_7_6 = _mm256_unpackhi_epi16(v_hsum_6_2, v_hsum_7_3);
  partial[2] =
      _mm256_unpacklo_epi64(_mm256_unpacklo_epi32(v_hsum_1_0, v_hsum_3_2),
                            _mm256_unpacklo_epi32(v_hsum_5_4, v_hsum_7_6));

  // Shuffle control that widens bytes 0-7 to zero-extended 16-bit values,
  // in source order in the lower lane and reversed in the upper lane
  // (0x80 byte indices insert the zero high bytes).
  const __m256i extend_reverse = SetrM128i(
      _mm_set_epi32(static_cast<int>(0x80078006), static_cast<int>(0x80058004),
                    static_cast<int>(0x80038002), static_cast<int>(0x80018000)),
      _mm_set_epi32(static_cast<int>(0x80008001), static_cast<int>(0x80028003),
                    static_cast<int>(0x80048005),
                    static_cast<int>(0x80068007)));

  for (auto& i : v_src) {
    // Zero extend unsigned 8 to 16. The upper lane is reversed.
    i = _mm256_shuffle_epi8(i, extend_reverse);
  }

  // partial for direction 6
  // --------------------------------------------------------------------------
  // partial[6][j] += x;
  // 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
  // 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
  // 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
  // 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
  // 40 41 42 43 44 45 46 47 xx xx xx xx xx xx xx xx
  // 50 51 52 53 54 55 56 57 xx xx xx xx xx xx xx xx
  // 60 61 62 63 64 65 66 67 xx xx xx xx xx xx xx xx
  // 70 71 72 73 74 75 76 77 xx xx xx xx xx xx xx xx
  // Direction 6 is a plain vertical sum of the rows.
  partial[6] = v_src[0];
  for (int i = 1; i < 8; ++i) {
    partial[6] = _mm256_add_epi16(partial[6], v_src[i]);
  }

  AddPartial_D0_D4(v_src, &partial[0], &partial[4]);
  AddPartial_D1_D3(v_src, &partial[1], &partial[3]);
  AddPartial_D7_D5(v_src, &partial[7], &partial[5]);
}
351*09537850SAkhilesh Sanikop
// Reduces the four 32-bit elements of each 128-bit lane of |a| to a single
// sum; element 0 of each lane holds that lane's total on return.
inline __m256i SumVectorPair_S32(const __m256i a) {
  // Per lane: (a0+a1, a2+a3, a0+a1, a2+a3).
  const __m256i pair_sums = _mm256_hadd_epi32(a, a);
  // Add the two pair sums together: element 0 becomes a0+a1+a2+a3.
  return _mm256_add_epi32(pair_sums, _mm256_srli_si256(pair_sums, 4));
}
357*09537850SAkhilesh Sanikop
358*09537850SAkhilesh Sanikop // |cost[0]| and |cost[4]| square the input and sum with the corresponding
359*09537850SAkhilesh Sanikop // element from the other end of the vector:
360*09537850SAkhilesh Sanikop // |kCdefDivisionTable[]| element:
361*09537850SAkhilesh Sanikop // cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
362*09537850SAkhilesh Sanikop // kCdefDivisionTable[i + 1];
363*09537850SAkhilesh Sanikop // cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8];
// Computes |cost[0]| and |cost[4]| together: the lower lanes of |partial_0|
// and |partial_4| hold the direction 0 partials (low/high halves) and the
// upper lanes hold the direction 4 partials (see AddPartial()).
inline void Cost0Or4_Pair(uint32_t* cost, const __m256i partial_0,
                          const __m256i partial_4,
                          const __m256i division_table) {
  // Broadcast each 128-bit half of |division_table| to both lanes so the
  // two directions are weighted identically.
  const __m256i division_table_0 =
      _mm256_permute2x128_si256(division_table, division_table, 0x0);
  const __m256i division_table_1 =
      _mm256_permute2x128_si256(division_table, division_table, 0x11);

  // partial_lo
  const __m256i a = partial_0;
  // partial_hi
  const __m256i b = partial_4;

  // Reverse and clear upper 2 bytes.
  const __m256i reverser = _mm256_broadcastsi128_si256(_mm_set_epi32(
      static_cast<int>(0x80800100), 0x03020504, 0x07060908, 0x0b0a0d0c));

  // 14 13 12 11 10 09 08 ZZ
  const __m256i b_reversed = _mm256_shuffle_epi8(b, reverser);
  // 00 14 01 13 02 12 03 11
  const __m256i ab_lo = _mm256_unpacklo_epi16(a, b_reversed);
  // 04 10 05 09 06 08 07 ZZ
  const __m256i ab_hi = _mm256_unpackhi_epi16(a, b_reversed);

  // Square(partial[0][i]) + Square(partial[0][14 - i])
  // madd squares each element and adds adjacent products, pairing each
  // position with its mirror from the far end of the partial.
  const __m256i square_lo = _mm256_madd_epi16(ab_lo, ab_lo);
  const __m256i square_hi = _mm256_madd_epi16(ab_hi, ab_hi);

  const __m256i c = _mm256_mullo_epi32(square_lo, division_table_0);
  const __m256i d = _mm256_mullo_epi32(square_hi, division_table_1);
  const __m256i e = SumVectorPair_S32(_mm256_add_epi32(c, d));
  // Copy upper 32bit sum to lower lane.
  const __m128i sums =
      _mm256_castsi256_si128(_mm256_permute4x64_epi64(e, 0x08));
  cost[0] = _mm_cvtsi128_si32(sums);
  cost[4] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
}
401*09537850SAkhilesh Sanikop
// Computes |cost[index_a]| (lower lane) and |cost[index_b]| (upper lane) for
// a pair of odd directions. Odd partials have 11 valid elements, so only
// elements 8-10 of |partial_b| are mirrored against |partial_a|; the rest
// are cleared by the shuffle. |division_table| holds the padded odd-pair
// weights (kCdefDivisionTableOddPairsPadded).
template <int index_a, int index_b>
inline void CostOdd_Pair(uint32_t* cost, const __m256i partial_a,
                         const __m256i partial_b,
                         const __m256i division_table[2]) {
  // partial_lo
  const __m256i a = partial_a;
  // partial_hi
  const __m256i b = partial_b;

  // Reverse and clear upper 10 bytes.
  const __m256i reverser = _mm256_broadcastsi128_si256(
      _mm_set_epi32(static_cast<int>(0x80808080), static_cast<int>(0x80808080),
                    static_cast<int>(0x80800100), 0x03020504));

  // 10 09 08 ZZ ZZ ZZ ZZ ZZ
  const __m256i b_reversed = _mm256_shuffle_epi8(b, reverser);
  // 00 10 01 09 02 08 03 ZZ
  const __m256i ab_lo = _mm256_unpacklo_epi16(a, b_reversed);
  // 04 ZZ 05 ZZ 06 ZZ 07 ZZ
  const __m256i ab_hi = _mm256_unpackhi_epi16(a, b_reversed);

  // Square(partial[0][i]) + Square(partial[0][14 - i])
  // madd squares each element and adds adjacent products; the zeroed
  // positions contribute nothing.
  const __m256i square_lo = _mm256_madd_epi16(ab_lo, ab_lo);
  const __m256i square_hi = _mm256_madd_epi16(ab_hi, ab_hi);

  const __m256i c = _mm256_mullo_epi32(square_lo, division_table[0]);
  const __m256i d = _mm256_mullo_epi32(square_hi, division_table[1]);
  const __m256i e = SumVectorPair_S32(_mm256_add_epi32(c, d));
  // Copy upper 32bit sum to lower lane.
  const __m128i sums =
      _mm256_castsi256_si128(_mm256_permute4x64_epi64(e, 0x08));
  cost[index_a] = _mm_cvtsi128_si32(sums);
  cost[index_b] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
}
436*09537850SAkhilesh Sanikop
// Computes |cost[2]| and |cost[6]| together. All eight partial sums of these
// directions share one weight (the caller passes kCdefDivisionTable[7]
// broadcast to every element), so the two valid lower lanes can simply be
// combined and reduced.
inline void Cost2And6_Pair(uint32_t* cost, const __m256i partial_a,
                           const __m256i partial_b,
                           const __m256i division_table) {
  // The upper lane is a "don't care", so only use the lower lane for
  // calculating cost. |a| = [partial_a.lo, partial_b.lo].
  const __m256i a = _mm256_permute2x128_si256(partial_a, partial_b, 0x20);

  const __m256i square_a = _mm256_madd_epi16(a, a);
  const __m256i b = _mm256_mullo_epi32(square_a, division_table);
  const __m256i c = SumVectorPair_S32(b);
  // Copy upper 32bit sum to lower lane.
  const __m128i sums =
      _mm256_castsi256_si128(_mm256_permute4x64_epi64(c, 0x08));
  cost[2] = _mm_cvtsi128_si32(sums);
  cost[6] = _mm_cvtsi128_si32(_mm_srli_si128(sums, 8));
}
453*09537850SAkhilesh Sanikop
CdefDirection_AVX2(const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride,uint8_t * LIBGAV1_RESTRICT const direction,int * LIBGAV1_RESTRICT const variance)454*09537850SAkhilesh Sanikop void CdefDirection_AVX2(const void* LIBGAV1_RESTRICT const source,
455*09537850SAkhilesh Sanikop ptrdiff_t stride,
456*09537850SAkhilesh Sanikop uint8_t* LIBGAV1_RESTRICT const direction,
457*09537850SAkhilesh Sanikop int* LIBGAV1_RESTRICT const variance) {
458*09537850SAkhilesh Sanikop assert(direction != nullptr);
459*09537850SAkhilesh Sanikop assert(variance != nullptr);
460*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(source);
461*09537850SAkhilesh Sanikop uint32_t cost[8];
462*09537850SAkhilesh Sanikop
463*09537850SAkhilesh Sanikop // partial[0] = add partial 0,4 low
464*09537850SAkhilesh Sanikop // partial[1] = add partial 1,3 low
465*09537850SAkhilesh Sanikop // partial[2] = add partial 2 low
466*09537850SAkhilesh Sanikop // partial[3] = add partial 1,3 high
467*09537850SAkhilesh Sanikop // partial[4] = add partial 0,4 high
468*09537850SAkhilesh Sanikop // partial[5] = add partial 7,5 high
469*09537850SAkhilesh Sanikop // partial[6] = add partial 6 low
470*09537850SAkhilesh Sanikop // partial[7] = add partial 7,5 low
471*09537850SAkhilesh Sanikop __m256i partial[8];
472*09537850SAkhilesh Sanikop
473*09537850SAkhilesh Sanikop AddPartial(src, stride, partial);
474*09537850SAkhilesh Sanikop
475*09537850SAkhilesh Sanikop const __m256i division_table = LoadUnaligned32(kCdefDivisionTable);
476*09537850SAkhilesh Sanikop const __m256i division_table_7 =
477*09537850SAkhilesh Sanikop _mm256_broadcastd_epi32(_mm_cvtsi32_si128(kCdefDivisionTable[7]));
478*09537850SAkhilesh Sanikop
479*09537850SAkhilesh Sanikop Cost2And6_Pair(cost, partial[2], partial[6], division_table_7);
480*09537850SAkhilesh Sanikop
481*09537850SAkhilesh Sanikop Cost0Or4_Pair(cost, partial[0], partial[4], division_table);
482*09537850SAkhilesh Sanikop
483*09537850SAkhilesh Sanikop const __m256i division_table_odd[2] = {
484*09537850SAkhilesh Sanikop LoadUnaligned32(kCdefDivisionTableOddPairsPadded),
485*09537850SAkhilesh Sanikop LoadUnaligned32(kCdefDivisionTableOddPairsPadded + 8)};
486*09537850SAkhilesh Sanikop
487*09537850SAkhilesh Sanikop CostOdd_Pair<1, 3>(cost, partial[1], partial[3], division_table_odd);
488*09537850SAkhilesh Sanikop CostOdd_Pair<7, 5>(cost, partial[7], partial[5], division_table_odd);
489*09537850SAkhilesh Sanikop
490*09537850SAkhilesh Sanikop uint32_t best_cost = 0;
491*09537850SAkhilesh Sanikop *direction = 0;
492*09537850SAkhilesh Sanikop for (int i = 0; i < 8; ++i) {
493*09537850SAkhilesh Sanikop if (cost[i] > best_cost) {
494*09537850SAkhilesh Sanikop best_cost = cost[i];
495*09537850SAkhilesh Sanikop *direction = i;
496*09537850SAkhilesh Sanikop }
497*09537850SAkhilesh Sanikop }
498*09537850SAkhilesh Sanikop *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
499*09537850SAkhilesh Sanikop }
500*09537850SAkhilesh Sanikop
501*09537850SAkhilesh Sanikop // -------------------------------------------------------------------------
502*09537850SAkhilesh Sanikop // CdefFilter
503*09537850SAkhilesh Sanikop
504*09537850SAkhilesh Sanikop // Load 4 vectors based on the given |direction|.
LoadDirection(const uint16_t * LIBGAV1_RESTRICT const src,const ptrdiff_t stride,__m128i * output,const int direction)505*09537850SAkhilesh Sanikop inline void LoadDirection(const uint16_t* LIBGAV1_RESTRICT const src,
506*09537850SAkhilesh Sanikop const ptrdiff_t stride, __m128i* output,
507*09537850SAkhilesh Sanikop const int direction) {
508*09537850SAkhilesh Sanikop // Each |direction| describes a different set of source values. Expand this
509*09537850SAkhilesh Sanikop // set by negating each set. For |direction| == 0 this gives a diagonal line
510*09537850SAkhilesh Sanikop // from top right to bottom left. The first value is y, the second x. Negative
511*09537850SAkhilesh Sanikop // y values move up.
512*09537850SAkhilesh Sanikop // a b c d
513*09537850SAkhilesh Sanikop // {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
514*09537850SAkhilesh Sanikop // c
515*09537850SAkhilesh Sanikop // a
516*09537850SAkhilesh Sanikop // 0
517*09537850SAkhilesh Sanikop // b
518*09537850SAkhilesh Sanikop // d
519*09537850SAkhilesh Sanikop const int y_0 = kCdefDirections[direction][0][0];
520*09537850SAkhilesh Sanikop const int x_0 = kCdefDirections[direction][0][1];
521*09537850SAkhilesh Sanikop const int y_1 = kCdefDirections[direction][1][0];
522*09537850SAkhilesh Sanikop const int x_1 = kCdefDirections[direction][1][1];
523*09537850SAkhilesh Sanikop output[0] = LoadUnaligned16(src - y_0 * stride - x_0);
524*09537850SAkhilesh Sanikop output[1] = LoadUnaligned16(src + y_0 * stride + x_0);
525*09537850SAkhilesh Sanikop output[2] = LoadUnaligned16(src - y_1 * stride - x_1);
526*09537850SAkhilesh Sanikop output[3] = LoadUnaligned16(src + y_1 * stride + x_1);
527*09537850SAkhilesh Sanikop }
528*09537850SAkhilesh Sanikop
529*09537850SAkhilesh Sanikop // Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
530*09537850SAkhilesh Sanikop // do 2 rows at a time.
LoadDirection4(const uint16_t * LIBGAV1_RESTRICT const src,const ptrdiff_t stride,__m128i * output,const int direction)531*09537850SAkhilesh Sanikop void LoadDirection4(const uint16_t* LIBGAV1_RESTRICT const src,
532*09537850SAkhilesh Sanikop const ptrdiff_t stride, __m128i* output,
533*09537850SAkhilesh Sanikop const int direction) {
534*09537850SAkhilesh Sanikop const int y_0 = kCdefDirections[direction][0][0];
535*09537850SAkhilesh Sanikop const int x_0 = kCdefDirections[direction][0][1];
536*09537850SAkhilesh Sanikop const int y_1 = kCdefDirections[direction][1][0];
537*09537850SAkhilesh Sanikop const int x_1 = kCdefDirections[direction][1][1];
538*09537850SAkhilesh Sanikop output[0] = LoadHi8(LoadLo8(src - y_0 * stride - x_0),
539*09537850SAkhilesh Sanikop src - y_0 * stride + stride - x_0);
540*09537850SAkhilesh Sanikop output[1] = LoadHi8(LoadLo8(src + y_0 * stride + x_0),
541*09537850SAkhilesh Sanikop src + y_0 * stride + stride + x_0);
542*09537850SAkhilesh Sanikop output[2] = LoadHi8(LoadLo8(src - y_1 * stride - x_1),
543*09537850SAkhilesh Sanikop src - y_1 * stride + stride - x_1);
544*09537850SAkhilesh Sanikop output[3] = LoadHi8(LoadLo8(src + y_1 * stride + x_1),
545*09537850SAkhilesh Sanikop src + y_1 * stride + stride + x_1);
546*09537850SAkhilesh Sanikop }
547*09537850SAkhilesh Sanikop
Constrain(const __m256i & pixel,const __m256i & reference,const __m128i & damping,const __m256i & threshold)548*09537850SAkhilesh Sanikop inline __m256i Constrain(const __m256i& pixel, const __m256i& reference,
549*09537850SAkhilesh Sanikop const __m128i& damping, const __m256i& threshold) {
550*09537850SAkhilesh Sanikop const __m256i diff = _mm256_sub_epi16(pixel, reference);
551*09537850SAkhilesh Sanikop const __m256i abs_diff = _mm256_abs_epi16(diff);
552*09537850SAkhilesh Sanikop // sign(diff) * Clip3(threshold - (std::abs(diff) >> damping),
553*09537850SAkhilesh Sanikop // 0, std::abs(diff))
554*09537850SAkhilesh Sanikop const __m256i shifted_diff = _mm256_srl_epi16(abs_diff, damping);
555*09537850SAkhilesh Sanikop // For bitdepth == 8, the threshold range is [0, 15] and the damping range is
556*09537850SAkhilesh Sanikop // [3, 6]. If pixel == kCdefLargeValue(0x4000), shifted_diff will always be
557*09537850SAkhilesh Sanikop // larger than threshold. Subtract using saturation will return 0 when pixel
558*09537850SAkhilesh Sanikop // == kCdefLargeValue.
559*09537850SAkhilesh Sanikop static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");
560*09537850SAkhilesh Sanikop const __m256i thresh_minus_shifted_diff =
561*09537850SAkhilesh Sanikop _mm256_subs_epu16(threshold, shifted_diff);
562*09537850SAkhilesh Sanikop const __m256i clamp_abs_diff =
563*09537850SAkhilesh Sanikop _mm256_min_epi16(thresh_minus_shifted_diff, abs_diff);
564*09537850SAkhilesh Sanikop // Restore the sign.
565*09537850SAkhilesh Sanikop return _mm256_sign_epi16(clamp_abs_diff, diff);
566*09537850SAkhilesh Sanikop }
567*09537850SAkhilesh Sanikop
ApplyConstrainAndTap(const __m256i & pixel,const __m256i & val,const __m256i & tap,const __m128i & damping,const __m256i & threshold)568*09537850SAkhilesh Sanikop inline __m256i ApplyConstrainAndTap(const __m256i& pixel, const __m256i& val,
569*09537850SAkhilesh Sanikop const __m256i& tap, const __m128i& damping,
570*09537850SAkhilesh Sanikop const __m256i& threshold) {
571*09537850SAkhilesh Sanikop const __m256i constrained = Constrain(val, pixel, damping, threshold);
572*09537850SAkhilesh Sanikop return _mm256_mullo_epi16(constrained, tap);
573*09537850SAkhilesh Sanikop }
574*09537850SAkhilesh Sanikop
// Applies the CDEF filter to a |width| x |height| block.
// |src| points to 16-bit source pixels; values equal to kCdefLargeValue
// (presumably padding/unavailable pixels — see Constrain()) are neutralized
// by the constrain/clip logic below. Filtered 8-bit output is written to
// |dest|. Template parameters select the block width (4 or 8) and which of
// the primary/secondary filter stages run.
template <int width, bool enable_primary = true, bool enable_secondary = true>
void CdefFilter_AVX2(const uint16_t* LIBGAV1_RESTRICT src,
                     const ptrdiff_t src_stride, const int height,
                     const int primary_strength, const int secondary_strength,
                     const int damping, const int direction,
                     void* LIBGAV1_RESTRICT dest, const ptrdiff_t dst_stride) {
  static_assert(width == 8 || width == 4, "Invalid CDEF width.");
  static_assert(enable_primary || enable_secondary, "");
  // The min/max clamp of the filtered result is only needed when both stages
  // contribute to the sum.
  constexpr bool clipping_required = enable_primary && enable_secondary;
  auto* dst = static_cast<uint8_t*>(dest);
  __m128i primary_damping_shift, secondary_damping_shift;

  // FloorLog2() requires input to be > 0.
  // 8-bit damping range: Y: [3, 6], UV: [2, 5].
  if (enable_primary) {
    // primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is necessary
    // for UV filtering.
    primary_damping_shift =
        _mm_cvtsi32_si128(std::max(0, damping - FloorLog2(primary_strength)));
  }
  if (enable_secondary) {
    // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
    // necessary.
    assert(damping - FloorLog2(secondary_strength) >= 0);
    secondary_damping_shift =
        _mm_cvtsi32_si128(damping - FloorLog2(secondary_strength));
  }
  // Broadcast taps and thresholds to all 16-bit lanes; the primary tap set is
  // selected by the low bit of |primary_strength|.
  const __m256i primary_tap_0 = _mm256_broadcastw_epi16(
      _mm_cvtsi32_si128(kCdefPrimaryTaps[primary_strength & 1][0]));
  const __m256i primary_tap_1 = _mm256_broadcastw_epi16(
      _mm_cvtsi32_si128(kCdefPrimaryTaps[primary_strength & 1][1]));
  const __m256i secondary_tap_0 =
      _mm256_broadcastw_epi16(_mm_cvtsi32_si128(kCdefSecondaryTap0));
  const __m256i secondary_tap_1 =
      _mm256_broadcastw_epi16(_mm_cvtsi32_si128(kCdefSecondaryTap1));
  // Mask used to strip the "large" flag from the upper byte of each pixel
  // when computing the clipping max.
  const __m256i cdef_large_value_mask = _mm256_broadcastw_epi16(
      _mm_cvtsi32_si128(static_cast<int16_t>(~kCdefLargeValue)));
  const __m256i primary_threshold =
      _mm256_broadcastw_epi16(_mm_cvtsi32_si128(primary_strength));
  const __m256i secondary_threshold =
      _mm256_broadcastw_epi16(_mm_cvtsi32_si128(secondary_strength));

  int y = height;
  do {
    // width == 8 handles one row per iteration; width == 4 packs two rows
    // into one 128-bit register (low/high halves).
    __m128i pixel_128;
    if (width == 8) {
      pixel_128 = LoadUnaligned16(src);
    } else {
      pixel_128 = LoadHi8(LoadLo8(src), src + src_stride);
    }

    // Duplicate the row(s) into both 128-bit lanes so each 256-bit operation
    // processes a pair of direction taps against the same pixels.
    __m256i pixel = SetrM128i(pixel_128, pixel_128);

    __m256i min = pixel;
    __m256i max = pixel;
    __m256i sum_pair;

    if (enable_primary) {
      // Primary |direction|.
      __m128i primary_val_128[4];
      if (width == 8) {
        LoadDirection(src, src_stride, primary_val_128, direction);
      } else {
        LoadDirection4(src, src_stride, primary_val_128, direction);
      }

      // Pair up the four directional loads, two per 256-bit register.
      __m256i primary_val[2];
      primary_val[0] = SetrM128i(primary_val_128[0], primary_val_128[1]);
      primary_val[1] = SetrM128i(primary_val_128[2], primary_val_128[3]);

      if (clipping_required) {
        min = _mm256_min_epu16(min, primary_val[0]);
        min = _mm256_min_epu16(min, primary_val[1]);

        // The source is 16 bits, however, we only really care about the lower
        // 8 bits.  The upper 8 bits contain the "large" flag.  After the final
        // primary max has been calculated, zero out the upper 8 bits.  Use this
        // to find the "16 bit" max.
        const __m256i max_p01 = _mm256_max_epu8(primary_val[0], primary_val[1]);
        max = _mm256_max_epu16(
            max, _mm256_and_si256(max_p01, cdef_large_value_mask));
      }

      // Accumulate constrained, tap-weighted primary contributions.
      sum_pair = ApplyConstrainAndTap(pixel, primary_val[0], primary_tap_0,
                                      primary_damping_shift, primary_threshold);
      sum_pair = _mm256_add_epi16(
          sum_pair,
          ApplyConstrainAndTap(pixel, primary_val[1], primary_tap_1,
                               primary_damping_shift, primary_threshold));
    } else {
      sum_pair = _mm256_setzero_si256();
    }

    if (enable_secondary) {
      // Secondary |direction| values (+/- 2). Clamp |direction|.
      __m128i secondary_val_128[8];
      if (width == 8) {
        LoadDirection(src, src_stride, secondary_val_128, direction + 2);
        LoadDirection(src, src_stride, secondary_val_128 + 4, direction - 2);
      } else {
        LoadDirection4(src, src_stride, secondary_val_128, direction + 2);
        LoadDirection4(src, src_stride, secondary_val_128 + 4, direction - 2);
      }

      // Pair up the eight directional loads, two per 256-bit register.
      __m256i secondary_val[4];
      secondary_val[0] = SetrM128i(secondary_val_128[0], secondary_val_128[1]);
      secondary_val[1] = SetrM128i(secondary_val_128[2], secondary_val_128[3]);
      secondary_val[2] = SetrM128i(secondary_val_128[4], secondary_val_128[5]);
      secondary_val[3] = SetrM128i(secondary_val_128[6], secondary_val_128[7]);

      if (clipping_required) {
        min = _mm256_min_epu16(min, secondary_val[0]);
        min = _mm256_min_epu16(min, secondary_val[1]);
        min = _mm256_min_epu16(min, secondary_val[2]);
        min = _mm256_min_epu16(min, secondary_val[3]);

        // As with the primary stage, strip the "large" flag byte before
        // folding into the 16-bit max.
        const __m256i max_s01 =
            _mm256_max_epu8(secondary_val[0], secondary_val[1]);
        const __m256i max_s23 =
            _mm256_max_epu8(secondary_val[2], secondary_val[3]);
        const __m256i max_s = _mm256_max_epu8(max_s01, max_s23);
        max = _mm256_max_epu8(max,
                              _mm256_and_si256(max_s, cdef_large_value_mask));
      }

      // Accumulate constrained, tap-weighted secondary contributions.
      sum_pair = _mm256_add_epi16(
          sum_pair,
          ApplyConstrainAndTap(pixel, secondary_val[0], secondary_tap_0,
                               secondary_damping_shift, secondary_threshold));
      sum_pair = _mm256_add_epi16(
          sum_pair,
          ApplyConstrainAndTap(pixel, secondary_val[1], secondary_tap_1,
                               secondary_damping_shift, secondary_threshold));
      sum_pair = _mm256_add_epi16(
          sum_pair,
          ApplyConstrainAndTap(pixel, secondary_val[2], secondary_tap_0,
                               secondary_damping_shift, secondary_threshold));
      sum_pair = _mm256_add_epi16(
          sum_pair,
          ApplyConstrainAndTap(pixel, secondary_val[3], secondary_tap_1,
                               secondary_damping_shift, secondary_threshold));
    }

    // Reduce the two 128-bit lanes (the paired taps) into one sum.
    __m128i sum = _mm_add_epi16(_mm256_castsi256_si128(sum_pair),
                                _mm256_extracti128_si256(sum_pair, 1));

    // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max))
    const __m128i sum_lt_0 = _mm_srai_epi16(sum, 15);
    // 8 + sum
    sum = _mm_add_epi16(sum, _mm_set1_epi16(8));
    // (... - (sum < 0)) >> 4
    sum = _mm_add_epi16(sum, sum_lt_0);
    sum = _mm_srai_epi16(sum, 4);
    // pixel + ...
    sum = _mm_add_epi16(sum, _mm256_castsi256_si128(pixel));
    if (clipping_required) {
      // Reduce the per-lane min/max across the two 128-bit lanes.
      const __m128i min_128 = _mm_min_epu16(_mm256_castsi256_si128(min),
                                            _mm256_extracti128_si256(min, 1));

      const __m128i max_128 = _mm_max_epu16(_mm256_castsi256_si128(max),
                                            _mm256_extracti128_si256(max, 1));
      // Clip3
      sum = _mm_min_epi16(sum, max_128);
      sum = _mm_max_epi16(sum, min_128);
    }

    // Saturate to 8 bits and store; width == 4 wrote two rows this iteration.
    const __m128i result = _mm_packus_epi16(sum, sum);
    if (width == 8) {
      src += src_stride;
      StoreLo8(dst, result);
      dst += dst_stride;
      --y;
    } else {
      src += src_stride << 1;
      Store4(dst, result);
      dst += dst_stride;
      Store4(dst, _mm_srli_si128(result, 4));
      dst += dst_stride;
      y -= 2;
    }
  } while (y != 0);
}
757*09537850SAkhilesh Sanikop
Init8bpp()758*09537850SAkhilesh Sanikop void Init8bpp() {
759*09537850SAkhilesh Sanikop Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
760*09537850SAkhilesh Sanikop assert(dsp != nullptr);
761*09537850SAkhilesh Sanikop dsp->cdef_direction = CdefDirection_AVX2;
762*09537850SAkhilesh Sanikop
763*09537850SAkhilesh Sanikop dsp->cdef_filters[0][0] = CdefFilter_AVX2<4>;
764*09537850SAkhilesh Sanikop dsp->cdef_filters[0][1] =
765*09537850SAkhilesh Sanikop CdefFilter_AVX2<4, /*enable_primary=*/true, /*enable_secondary=*/false>;
766*09537850SAkhilesh Sanikop dsp->cdef_filters[0][2] = CdefFilter_AVX2<4, /*enable_primary=*/false>;
767*09537850SAkhilesh Sanikop dsp->cdef_filters[1][0] = CdefFilter_AVX2<8>;
768*09537850SAkhilesh Sanikop dsp->cdef_filters[1][1] =
769*09537850SAkhilesh Sanikop CdefFilter_AVX2<8, /*enable_primary=*/true, /*enable_secondary=*/false>;
770*09537850SAkhilesh Sanikop dsp->cdef_filters[1][2] = CdefFilter_AVX2<8, /*enable_primary=*/false>;
771*09537850SAkhilesh Sanikop }
772*09537850SAkhilesh Sanikop
773*09537850SAkhilesh Sanikop } // namespace
774*09537850SAkhilesh Sanikop } // namespace low_bitdepth
775*09537850SAkhilesh Sanikop
CdefInit_AVX2()776*09537850SAkhilesh Sanikop void CdefInit_AVX2() { low_bitdepth::Init8bpp(); }
777*09537850SAkhilesh Sanikop
778*09537850SAkhilesh Sanikop } // namespace dsp
779*09537850SAkhilesh Sanikop } // namespace libgav1
780*09537850SAkhilesh Sanikop #else // !LIBGAV1_TARGETING_AVX2
781*09537850SAkhilesh Sanikop namespace libgav1 {
782*09537850SAkhilesh Sanikop namespace dsp {
783*09537850SAkhilesh Sanikop
// No-op stub used when the build does not target AVX2.
void CdefInit_AVX2() {}
785*09537850SAkhilesh Sanikop
786*09537850SAkhilesh Sanikop } // namespace dsp
787*09537850SAkhilesh Sanikop } // namespace libgav1
788*09537850SAkhilesh Sanikop #endif // LIBGAV1_TARGETING_AVX2
789