1*09537850SAkhilesh Sanikop // Copyright 2019 The libgav1 Authors
2*09537850SAkhilesh Sanikop //
3*09537850SAkhilesh Sanikop // Licensed under the Apache License, Version 2.0 (the "License");
4*09537850SAkhilesh Sanikop // you may not use this file except in compliance with the License.
5*09537850SAkhilesh Sanikop // You may obtain a copy of the License at
6*09537850SAkhilesh Sanikop //
7*09537850SAkhilesh Sanikop // http://www.apache.org/licenses/LICENSE-2.0
8*09537850SAkhilesh Sanikop //
9*09537850SAkhilesh Sanikop // Unless required by applicable law or agreed to in writing, software
10*09537850SAkhilesh Sanikop // distributed under the License is distributed on an "AS IS" BASIS,
11*09537850SAkhilesh Sanikop // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*09537850SAkhilesh Sanikop // See the License for the specific language governing permissions and
13*09537850SAkhilesh Sanikop // limitations under the License.
14*09537850SAkhilesh Sanikop
15*09537850SAkhilesh Sanikop #include "src/dsp/cdef.h"
16*09537850SAkhilesh Sanikop #include "src/utils/cpu.h"
17*09537850SAkhilesh Sanikop
18*09537850SAkhilesh Sanikop #if LIBGAV1_ENABLE_NEON
19*09537850SAkhilesh Sanikop
20*09537850SAkhilesh Sanikop #include <arm_neon.h>
21*09537850SAkhilesh Sanikop
22*09537850SAkhilesh Sanikop #include <algorithm>
23*09537850SAkhilesh Sanikop #include <cassert>
24*09537850SAkhilesh Sanikop #include <cstddef>
25*09537850SAkhilesh Sanikop #include <cstdint>
26*09537850SAkhilesh Sanikop #include <cstdlib>
27*09537850SAkhilesh Sanikop
28*09537850SAkhilesh Sanikop #include "src/dsp/arm/common_neon.h"
29*09537850SAkhilesh Sanikop #include "src/dsp/constants.h"
30*09537850SAkhilesh Sanikop #include "src/dsp/dsp.h"
31*09537850SAkhilesh Sanikop #include "src/utils/common.h"
32*09537850SAkhilesh Sanikop #include "src/utils/constants.h"
33*09537850SAkhilesh Sanikop
34*09537850SAkhilesh Sanikop namespace libgav1 {
35*09537850SAkhilesh Sanikop namespace dsp {
36*09537850SAkhilesh Sanikop namespace {
37*09537850SAkhilesh Sanikop
38*09537850SAkhilesh Sanikop #include "src/dsp/cdef.inc"
39*09537850SAkhilesh Sanikop
40*09537850SAkhilesh Sanikop // ----------------------------------------------------------------------------
41*09537850SAkhilesh Sanikop // Refer to CdefDirection_C().
42*09537850SAkhilesh Sanikop //
43*09537850SAkhilesh Sanikop // int32_t partial[8][15] = {};
44*09537850SAkhilesh Sanikop // for (int i = 0; i < 8; ++i) {
45*09537850SAkhilesh Sanikop // for (int j = 0; j < 8; ++j) {
46*09537850SAkhilesh Sanikop // const int x = 1;
47*09537850SAkhilesh Sanikop // partial[0][i + j] += x;
48*09537850SAkhilesh Sanikop // partial[1][i + j / 2] += x;
49*09537850SAkhilesh Sanikop // partial[2][i] += x;
50*09537850SAkhilesh Sanikop // partial[3][3 + i - j / 2] += x;
51*09537850SAkhilesh Sanikop // partial[4][7 + i - j] += x;
52*09537850SAkhilesh Sanikop // partial[5][3 - i / 2 + j] += x;
53*09537850SAkhilesh Sanikop // partial[6][j] += x;
54*09537850SAkhilesh Sanikop // partial[7][i / 2 + j] += x;
55*09537850SAkhilesh Sanikop // }
56*09537850SAkhilesh Sanikop // }
57*09537850SAkhilesh Sanikop //
58*09537850SAkhilesh Sanikop // Using the code above, generate the position count for partial[8][15].
59*09537850SAkhilesh Sanikop //
60*09537850SAkhilesh Sanikop // partial[0]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
61*09537850SAkhilesh Sanikop // partial[1]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
62*09537850SAkhilesh Sanikop // partial[2]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
63*09537850SAkhilesh Sanikop // partial[3]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
64*09537850SAkhilesh Sanikop // partial[4]: 1 2 3 4 5 6 7 8 7 6 5 4 3 2 1
65*09537850SAkhilesh Sanikop // partial[5]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
66*09537850SAkhilesh Sanikop // partial[6]: 8 8 8 8 8 8 8 8 0 0 0 0 0 0 0
67*09537850SAkhilesh Sanikop // partial[7]: 2 4 6 8 8 8 8 8 6 4 2 0 0 0 0
68*09537850SAkhilesh Sanikop //
69*09537850SAkhilesh Sanikop // The SIMD code shifts the input horizontally, then adds vertically to get the
70*09537850SAkhilesh Sanikop // correct partial value for the given position.
71*09537850SAkhilesh Sanikop // ----------------------------------------------------------------------------
72*09537850SAkhilesh Sanikop
73*09537850SAkhilesh Sanikop // ----------------------------------------------------------------------------
74*09537850SAkhilesh Sanikop // partial[0][i + j] += x;
75*09537850SAkhilesh Sanikop //
76*09537850SAkhilesh Sanikop // 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
77*09537850SAkhilesh Sanikop // 00 10 11 12 13 14 15 16 17 00 00 00 00 00 00
78*09537850SAkhilesh Sanikop // 00 00 20 21 22 23 24 25 26 27 00 00 00 00 00
79*09537850SAkhilesh Sanikop // 00 00 00 30 31 32 33 34 35 36 37 00 00 00 00
80*09537850SAkhilesh Sanikop // 00 00 00 00 40 41 42 43 44 45 46 47 00 00 00
81*09537850SAkhilesh Sanikop // 00 00 00 00 00 50 51 52 53 54 55 56 57 00 00
82*09537850SAkhilesh Sanikop // 00 00 00 00 00 00 60 61 62 63 64 65 66 67 00
83*09537850SAkhilesh Sanikop // 00 00 00 00 00 00 00 70 71 72 73 74 75 76 77
84*09537850SAkhilesh Sanikop //
85*09537850SAkhilesh Sanikop // partial[4] is the same except the source is reversed.
AddPartial_D0_D4(uint8x8_t * v_src,uint16x8_t * partial_lo,uint16x8_t * partial_hi)86*09537850SAkhilesh Sanikop LIBGAV1_ALWAYS_INLINE void AddPartial_D0_D4(uint8x8_t* v_src,
87*09537850SAkhilesh Sanikop uint16x8_t* partial_lo,
88*09537850SAkhilesh Sanikop uint16x8_t* partial_hi) {
89*09537850SAkhilesh Sanikop const uint8x8_t v_zero = vdup_n_u8(0);
90*09537850SAkhilesh Sanikop // 00 01 02 03 04 05 06 07
91*09537850SAkhilesh Sanikop // 00 10 11 12 13 14 15 16
92*09537850SAkhilesh Sanikop *partial_lo = vaddl_u8(v_src[0], vext_u8(v_zero, v_src[1], 7));
93*09537850SAkhilesh Sanikop
94*09537850SAkhilesh Sanikop // 00 00 20 21 22 23 24 25
95*09537850SAkhilesh Sanikop *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[2], 6));
96*09537850SAkhilesh Sanikop // 17 00 00 00 00 00 00 00
97*09537850SAkhilesh Sanikop // 26 27 00 00 00 00 00 00
98*09537850SAkhilesh Sanikop *partial_hi =
99*09537850SAkhilesh Sanikop vaddl_u8(vext_u8(v_src[1], v_zero, 7), vext_u8(v_src[2], v_zero, 6));
100*09537850SAkhilesh Sanikop
101*09537850SAkhilesh Sanikop // 00 00 00 30 31 32 33 34
102*09537850SAkhilesh Sanikop *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[3], 5));
103*09537850SAkhilesh Sanikop // 35 36 37 00 00 00 00 00
104*09537850SAkhilesh Sanikop *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[3], v_zero, 5));
105*09537850SAkhilesh Sanikop
106*09537850SAkhilesh Sanikop // 00 00 00 00 40 41 42 43
107*09537850SAkhilesh Sanikop *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[4], 4));
108*09537850SAkhilesh Sanikop // 44 45 46 47 00 00 00 00
109*09537850SAkhilesh Sanikop *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[4], v_zero, 4));
110*09537850SAkhilesh Sanikop
111*09537850SAkhilesh Sanikop // 00 00 00 00 00 50 51 52
112*09537850SAkhilesh Sanikop *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[5], 3));
113*09537850SAkhilesh Sanikop // 53 54 55 56 57 00 00 00
114*09537850SAkhilesh Sanikop *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[5], v_zero, 3));
115*09537850SAkhilesh Sanikop
116*09537850SAkhilesh Sanikop // 00 00 00 00 00 00 60 61
117*09537850SAkhilesh Sanikop *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[6], 2));
118*09537850SAkhilesh Sanikop // 62 63 64 65 66 67 00 00
119*09537850SAkhilesh Sanikop *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[6], v_zero, 2));
120*09537850SAkhilesh Sanikop
121*09537850SAkhilesh Sanikop // 00 00 00 00 00 00 00 70
122*09537850SAkhilesh Sanikop *partial_lo = vaddw_u8(*partial_lo, vext_u8(v_zero, v_src[7], 1));
123*09537850SAkhilesh Sanikop // 71 72 73 74 75 76 77 00
124*09537850SAkhilesh Sanikop *partial_hi = vaddw_u8(*partial_hi, vext_u8(v_src[7], v_zero, 1));
125*09537850SAkhilesh Sanikop }
126*09537850SAkhilesh Sanikop
127*09537850SAkhilesh Sanikop // ----------------------------------------------------------------------------
128*09537850SAkhilesh Sanikop // partial[1][i + j / 2] += x;
129*09537850SAkhilesh Sanikop //
130*09537850SAkhilesh Sanikop // A0 = src[0] + src[1], A1 = src[2] + src[3], ...
131*09537850SAkhilesh Sanikop //
132*09537850SAkhilesh Sanikop // A0 A1 A2 A3 00 00 00 00 00 00 00 00 00 00 00
133*09537850SAkhilesh Sanikop // 00 B0 B1 B2 B3 00 00 00 00 00 00 00 00 00 00
134*09537850SAkhilesh Sanikop // 00 00 C0 C1 C2 C3 00 00 00 00 00 00 00 00 00
135*09537850SAkhilesh Sanikop // 00 00 00 D0 D1 D2 D3 00 00 00 00 00 00 00 00
136*09537850SAkhilesh Sanikop // 00 00 00 00 E0 E1 E2 E3 00 00 00 00 00 00 00
137*09537850SAkhilesh Sanikop // 00 00 00 00 00 F0 F1 F2 F3 00 00 00 00 00 00
138*09537850SAkhilesh Sanikop // 00 00 00 00 00 00 G0 G1 G2 G3 00 00 00 00 00
139*09537850SAkhilesh Sanikop // 00 00 00 00 00 00 00 H0 H1 H2 H3 00 00 00 00
140*09537850SAkhilesh Sanikop //
141*09537850SAkhilesh Sanikop // partial[3] is the same except the source is reversed.
AddPartial_D1_D3(uint8x8_t * v_src,uint16x8_t * partial_lo,uint16x8_t * partial_hi)142*09537850SAkhilesh Sanikop LIBGAV1_ALWAYS_INLINE void AddPartial_D1_D3(uint8x8_t* v_src,
143*09537850SAkhilesh Sanikop uint16x8_t* partial_lo,
144*09537850SAkhilesh Sanikop uint16x8_t* partial_hi) {
145*09537850SAkhilesh Sanikop uint8x16_t v_d1_temp[8];
146*09537850SAkhilesh Sanikop const uint8x8_t v_zero = vdup_n_u8(0);
147*09537850SAkhilesh Sanikop const uint8x16_t v_zero_16 = vdupq_n_u8(0);
148*09537850SAkhilesh Sanikop
149*09537850SAkhilesh Sanikop for (int i = 0; i < 8; ++i) {
150*09537850SAkhilesh Sanikop v_d1_temp[i] = vcombine_u8(v_src[i], v_zero);
151*09537850SAkhilesh Sanikop }
152*09537850SAkhilesh Sanikop
153*09537850SAkhilesh Sanikop *partial_lo = *partial_hi = vdupq_n_u16(0);
154*09537850SAkhilesh Sanikop // A0 A1 A2 A3 00 00 00 00
155*09537850SAkhilesh Sanikop *partial_lo = vpadalq_u8(*partial_lo, v_d1_temp[0]);
156*09537850SAkhilesh Sanikop
157*09537850SAkhilesh Sanikop // 00 B0 B1 B2 B3 00 00 00
158*09537850SAkhilesh Sanikop *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[1], 14));
159*09537850SAkhilesh Sanikop
160*09537850SAkhilesh Sanikop // 00 00 C0 C1 C2 C3 00 00
161*09537850SAkhilesh Sanikop *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[2], 12));
162*09537850SAkhilesh Sanikop // 00 00 00 D0 D1 D2 D3 00
163*09537850SAkhilesh Sanikop *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[3], 10));
164*09537850SAkhilesh Sanikop // 00 00 00 00 E0 E1 E2 E3
165*09537850SAkhilesh Sanikop *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[4], 8));
166*09537850SAkhilesh Sanikop
167*09537850SAkhilesh Sanikop // 00 00 00 00 00 F0 F1 F2
168*09537850SAkhilesh Sanikop *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[5], 6));
169*09537850SAkhilesh Sanikop // F3 00 00 00 00 00 00 00
170*09537850SAkhilesh Sanikop *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[5], v_zero_16, 6));
171*09537850SAkhilesh Sanikop
172*09537850SAkhilesh Sanikop // 00 00 00 00 00 00 G0 G1
173*09537850SAkhilesh Sanikop *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[6], 4));
174*09537850SAkhilesh Sanikop // G2 G3 00 00 00 00 00 00
175*09537850SAkhilesh Sanikop *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[6], v_zero_16, 4));
176*09537850SAkhilesh Sanikop
177*09537850SAkhilesh Sanikop // 00 00 00 00 00 00 00 H0
178*09537850SAkhilesh Sanikop *partial_lo = vpadalq_u8(*partial_lo, vextq_u8(v_zero_16, v_d1_temp[7], 2));
179*09537850SAkhilesh Sanikop // H1 H2 H3 00 00 00 00 00
180*09537850SAkhilesh Sanikop *partial_hi = vpadalq_u8(*partial_hi, vextq_u8(v_d1_temp[7], v_zero_16, 2));
181*09537850SAkhilesh Sanikop }
182*09537850SAkhilesh Sanikop
183*09537850SAkhilesh Sanikop // ----------------------------------------------------------------------------
184*09537850SAkhilesh Sanikop // partial[7][i / 2 + j] += x;
185*09537850SAkhilesh Sanikop //
186*09537850SAkhilesh Sanikop // 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00
187*09537850SAkhilesh Sanikop // 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00
188*09537850SAkhilesh Sanikop // 00 20 21 22 23 24 25 26 27 00 00 00 00 00 00
189*09537850SAkhilesh Sanikop // 00 30 31 32 33 34 35 36 37 00 00 00 00 00 00
190*09537850SAkhilesh Sanikop // 00 00 40 41 42 43 44 45 46 47 00 00 00 00 00
191*09537850SAkhilesh Sanikop // 00 00 50 51 52 53 54 55 56 57 00 00 00 00 00
192*09537850SAkhilesh Sanikop // 00 00 00 60 61 62 63 64 65 66 67 00 00 00 00
193*09537850SAkhilesh Sanikop // 00 00 00 70 71 72 73 74 75 76 77 00 00 00 00
194*09537850SAkhilesh Sanikop //
195*09537850SAkhilesh Sanikop // partial[5] is the same except the source is reversed.
AddPartial_D5_D7(uint8x8_t * v_src,uint16x8_t * partial_lo,uint16x8_t * partial_hi)196*09537850SAkhilesh Sanikop LIBGAV1_ALWAYS_INLINE void AddPartial_D5_D7(uint8x8_t* v_src,
197*09537850SAkhilesh Sanikop uint16x8_t* partial_lo,
198*09537850SAkhilesh Sanikop uint16x8_t* partial_hi) {
199*09537850SAkhilesh Sanikop const uint16x8_t v_zero = vdupq_n_u16(0);
200*09537850SAkhilesh Sanikop uint16x8_t v_pair_add[4];
201*09537850SAkhilesh Sanikop // Add vertical source pairs.
202*09537850SAkhilesh Sanikop v_pair_add[0] = vaddl_u8(v_src[0], v_src[1]);
203*09537850SAkhilesh Sanikop v_pair_add[1] = vaddl_u8(v_src[2], v_src[3]);
204*09537850SAkhilesh Sanikop v_pair_add[2] = vaddl_u8(v_src[4], v_src[5]);
205*09537850SAkhilesh Sanikop v_pair_add[3] = vaddl_u8(v_src[6], v_src[7]);
206*09537850SAkhilesh Sanikop
207*09537850SAkhilesh Sanikop // 00 01 02 03 04 05 06 07
208*09537850SAkhilesh Sanikop // 10 11 12 13 14 15 16 17
209*09537850SAkhilesh Sanikop *partial_lo = v_pair_add[0];
210*09537850SAkhilesh Sanikop // 00 00 00 00 00 00 00 00
211*09537850SAkhilesh Sanikop // 00 00 00 00 00 00 00 00
212*09537850SAkhilesh Sanikop *partial_hi = vdupq_n_u16(0);
213*09537850SAkhilesh Sanikop
214*09537850SAkhilesh Sanikop // 00 20 21 22 23 24 25 26
215*09537850SAkhilesh Sanikop // 00 30 31 32 33 34 35 36
216*09537850SAkhilesh Sanikop *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[1], 7));
217*09537850SAkhilesh Sanikop // 27 00 00 00 00 00 00 00
218*09537850SAkhilesh Sanikop // 37 00 00 00 00 00 00 00
219*09537850SAkhilesh Sanikop *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[1], v_zero, 7));
220*09537850SAkhilesh Sanikop
221*09537850SAkhilesh Sanikop // 00 00 40 41 42 43 44 45
222*09537850SAkhilesh Sanikop // 00 00 50 51 52 53 54 55
223*09537850SAkhilesh Sanikop *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[2], 6));
224*09537850SAkhilesh Sanikop // 46 47 00 00 00 00 00 00
225*09537850SAkhilesh Sanikop // 56 57 00 00 00 00 00 00
226*09537850SAkhilesh Sanikop *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[2], v_zero, 6));
227*09537850SAkhilesh Sanikop
228*09537850SAkhilesh Sanikop // 00 00 00 60 61 62 63 64
229*09537850SAkhilesh Sanikop // 00 00 00 70 71 72 73 74
230*09537850SAkhilesh Sanikop *partial_lo = vaddq_u16(*partial_lo, vextq_u16(v_zero, v_pair_add[3], 5));
231*09537850SAkhilesh Sanikop // 65 66 67 00 00 00 00 00
232*09537850SAkhilesh Sanikop // 75 76 77 00 00 00 00 00
233*09537850SAkhilesh Sanikop *partial_hi = vaddq_u16(*partial_hi, vextq_u16(v_pair_add[3], v_zero, 5));
234*09537850SAkhilesh Sanikop }
235*09537850SAkhilesh Sanikop
236*09537850SAkhilesh Sanikop template <int bitdepth>
AddPartial(const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride,uint16x8_t * partial_lo,uint16x8_t * partial_hi)237*09537850SAkhilesh Sanikop LIBGAV1_ALWAYS_INLINE void AddPartial(const void* LIBGAV1_RESTRICT const source,
238*09537850SAkhilesh Sanikop ptrdiff_t stride, uint16x8_t* partial_lo,
239*09537850SAkhilesh Sanikop uint16x8_t* partial_hi) {
240*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(source);
241*09537850SAkhilesh Sanikop
242*09537850SAkhilesh Sanikop // 8x8 input
243*09537850SAkhilesh Sanikop // 00 01 02 03 04 05 06 07
244*09537850SAkhilesh Sanikop // 10 11 12 13 14 15 16 17
245*09537850SAkhilesh Sanikop // 20 21 22 23 24 25 26 27
246*09537850SAkhilesh Sanikop // 30 31 32 33 34 35 36 37
247*09537850SAkhilesh Sanikop // 40 41 42 43 44 45 46 47
248*09537850SAkhilesh Sanikop // 50 51 52 53 54 55 56 57
249*09537850SAkhilesh Sanikop // 60 61 62 63 64 65 66 67
250*09537850SAkhilesh Sanikop // 70 71 72 73 74 75 76 77
251*09537850SAkhilesh Sanikop uint8x8_t v_src[8];
252*09537850SAkhilesh Sanikop if (bitdepth == kBitdepth8) {
253*09537850SAkhilesh Sanikop for (auto& v : v_src) {
254*09537850SAkhilesh Sanikop v = vld1_u8(src);
255*09537850SAkhilesh Sanikop src += stride;
256*09537850SAkhilesh Sanikop }
257*09537850SAkhilesh Sanikop } else {
258*09537850SAkhilesh Sanikop // bitdepth - 8
259*09537850SAkhilesh Sanikop constexpr int src_shift = (bitdepth == kBitdepth10) ? 2 : 4;
260*09537850SAkhilesh Sanikop for (auto& v : v_src) {
261*09537850SAkhilesh Sanikop v = vshrn_n_u16(vld1q_u16(reinterpret_cast<const uint16_t*>(src)),
262*09537850SAkhilesh Sanikop src_shift);
263*09537850SAkhilesh Sanikop src += stride;
264*09537850SAkhilesh Sanikop }
265*09537850SAkhilesh Sanikop }
266*09537850SAkhilesh Sanikop // partial for direction 2
267*09537850SAkhilesh Sanikop // --------------------------------------------------------------------------
268*09537850SAkhilesh Sanikop // partial[2][i] += x;
269*09537850SAkhilesh Sanikop // 00 10 20 30 40 50 60 70 00 00 00 00 00 00 00 00
270*09537850SAkhilesh Sanikop // 01 11 21 33 41 51 61 71 00 00 00 00 00 00 00 00
271*09537850SAkhilesh Sanikop // 02 12 22 33 42 52 62 72 00 00 00 00 00 00 00 00
272*09537850SAkhilesh Sanikop // 03 13 23 33 43 53 63 73 00 00 00 00 00 00 00 00
273*09537850SAkhilesh Sanikop // 04 14 24 34 44 54 64 74 00 00 00 00 00 00 00 00
274*09537850SAkhilesh Sanikop // 05 15 25 35 45 55 65 75 00 00 00 00 00 00 00 00
275*09537850SAkhilesh Sanikop // 06 16 26 36 46 56 66 76 00 00 00 00 00 00 00 00
276*09537850SAkhilesh Sanikop // 07 17 27 37 47 57 67 77 00 00 00 00 00 00 00 00
277*09537850SAkhilesh Sanikop partial_lo[2] = vsetq_lane_u16(SumVector(v_src[0]), vdupq_n_u16(0), 0);
278*09537850SAkhilesh Sanikop partial_lo[2] = vsetq_lane_u16(SumVector(v_src[1]), partial_lo[2], 1);
279*09537850SAkhilesh Sanikop partial_lo[2] = vsetq_lane_u16(SumVector(v_src[2]), partial_lo[2], 2);
280*09537850SAkhilesh Sanikop partial_lo[2] = vsetq_lane_u16(SumVector(v_src[3]), partial_lo[2], 3);
281*09537850SAkhilesh Sanikop partial_lo[2] = vsetq_lane_u16(SumVector(v_src[4]), partial_lo[2], 4);
282*09537850SAkhilesh Sanikop partial_lo[2] = vsetq_lane_u16(SumVector(v_src[5]), partial_lo[2], 5);
283*09537850SAkhilesh Sanikop partial_lo[2] = vsetq_lane_u16(SumVector(v_src[6]), partial_lo[2], 6);
284*09537850SAkhilesh Sanikop partial_lo[2] = vsetq_lane_u16(SumVector(v_src[7]), partial_lo[2], 7);
285*09537850SAkhilesh Sanikop
286*09537850SAkhilesh Sanikop // partial for direction 6
287*09537850SAkhilesh Sanikop // --------------------------------------------------------------------------
288*09537850SAkhilesh Sanikop // partial[6][j] += x;
289*09537850SAkhilesh Sanikop // 00 01 02 03 04 05 06 07 00 00 00 00 00 00 00 00
290*09537850SAkhilesh Sanikop // 10 11 12 13 14 15 16 17 00 00 00 00 00 00 00 00
291*09537850SAkhilesh Sanikop // 20 21 22 23 24 25 26 27 00 00 00 00 00 00 00 00
292*09537850SAkhilesh Sanikop // 30 31 32 33 34 35 36 37 00 00 00 00 00 00 00 00
293*09537850SAkhilesh Sanikop // 40 41 42 43 44 45 46 47 00 00 00 00 00 00 00 00
294*09537850SAkhilesh Sanikop // 50 51 52 53 54 55 56 57 00 00 00 00 00 00 00 00
295*09537850SAkhilesh Sanikop // 60 61 62 63 64 65 66 67 00 00 00 00 00 00 00 00
296*09537850SAkhilesh Sanikop // 70 71 72 73 74 75 76 77 00 00 00 00 00 00 00 00
297*09537850SAkhilesh Sanikop partial_lo[6] = vaddl_u8(v_src[0], v_src[1]);
298*09537850SAkhilesh Sanikop for (int i = 2; i < 8; ++i) {
299*09537850SAkhilesh Sanikop partial_lo[6] = vaddw_u8(partial_lo[6], v_src[i]);
300*09537850SAkhilesh Sanikop }
301*09537850SAkhilesh Sanikop
302*09537850SAkhilesh Sanikop // partial for direction 0
303*09537850SAkhilesh Sanikop AddPartial_D0_D4(v_src, &partial_lo[0], &partial_hi[0]);
304*09537850SAkhilesh Sanikop
305*09537850SAkhilesh Sanikop // partial for direction 1
306*09537850SAkhilesh Sanikop AddPartial_D1_D3(v_src, &partial_lo[1], &partial_hi[1]);
307*09537850SAkhilesh Sanikop
308*09537850SAkhilesh Sanikop // partial for direction 7
309*09537850SAkhilesh Sanikop AddPartial_D5_D7(v_src, &partial_lo[7], &partial_hi[7]);
310*09537850SAkhilesh Sanikop
311*09537850SAkhilesh Sanikop uint8x8_t v_src_reverse[8];
312*09537850SAkhilesh Sanikop for (int i = 0; i < 8; ++i) {
313*09537850SAkhilesh Sanikop v_src_reverse[i] = vrev64_u8(v_src[i]);
314*09537850SAkhilesh Sanikop }
315*09537850SAkhilesh Sanikop
316*09537850SAkhilesh Sanikop // partial for direction 4
317*09537850SAkhilesh Sanikop AddPartial_D0_D4(v_src_reverse, &partial_lo[4], &partial_hi[4]);
318*09537850SAkhilesh Sanikop
319*09537850SAkhilesh Sanikop // partial for direction 3
320*09537850SAkhilesh Sanikop AddPartial_D1_D3(v_src_reverse, &partial_lo[3], &partial_hi[3]);
321*09537850SAkhilesh Sanikop
322*09537850SAkhilesh Sanikop // partial for direction 5
323*09537850SAkhilesh Sanikop AddPartial_D5_D7(v_src_reverse, &partial_lo[5], &partial_hi[5]);
324*09537850SAkhilesh Sanikop }
325*09537850SAkhilesh Sanikop
Square(uint16x4_t a)326*09537850SAkhilesh Sanikop uint32x4_t Square(uint16x4_t a) { return vmull_u16(a, a); }
327*09537850SAkhilesh Sanikop
SquareAccumulate(uint32x4_t a,uint16x4_t b)328*09537850SAkhilesh Sanikop uint32x4_t SquareAccumulate(uint32x4_t a, uint16x4_t b) {
329*09537850SAkhilesh Sanikop return vmlal_u16(a, b, b);
330*09537850SAkhilesh Sanikop }
331*09537850SAkhilesh Sanikop
332*09537850SAkhilesh Sanikop // |cost[0]| and |cost[4]| square the input and sum with the corresponding
333*09537850SAkhilesh Sanikop // element from the other end of the vector:
334*09537850SAkhilesh Sanikop // |kCdefDivisionTable[]| element:
335*09537850SAkhilesh Sanikop // cost[0] += (Square(partial[0][i]) + Square(partial[0][14 - i])) *
336*09537850SAkhilesh Sanikop // kCdefDivisionTable[i + 1];
337*09537850SAkhilesh Sanikop // cost[0] += Square(partial[0][7]) * kCdefDivisionTable[8];
338*09537850SAkhilesh Sanikop // Because everything is being summed into a single value the distributive
339*09537850SAkhilesh Sanikop // property allows us to mirror the division table and accumulate once.
Cost0Or4(const uint16x8_t a,const uint16x8_t b,const uint32x4_t division_table[4])340*09537850SAkhilesh Sanikop uint32_t Cost0Or4(const uint16x8_t a, const uint16x8_t b,
341*09537850SAkhilesh Sanikop const uint32x4_t division_table[4]) {
342*09537850SAkhilesh Sanikop uint32x4_t c = vmulq_u32(Square(vget_low_u16(a)), division_table[0]);
343*09537850SAkhilesh Sanikop c = vmlaq_u32(c, Square(vget_high_u16(a)), division_table[1]);
344*09537850SAkhilesh Sanikop c = vmlaq_u32(c, Square(vget_low_u16(b)), division_table[2]);
345*09537850SAkhilesh Sanikop c = vmlaq_u32(c, Square(vget_high_u16(b)), division_table[3]);
346*09537850SAkhilesh Sanikop return SumVector(c);
347*09537850SAkhilesh Sanikop }
348*09537850SAkhilesh Sanikop
349*09537850SAkhilesh Sanikop // |cost[2]| and |cost[6]| square the input and accumulate:
350*09537850SAkhilesh Sanikop // cost[2] += Square(partial[2][i])
SquareAccumulate(const uint16x8_t a)351*09537850SAkhilesh Sanikop uint32_t SquareAccumulate(const uint16x8_t a) {
352*09537850SAkhilesh Sanikop uint32x4_t c = Square(vget_low_u16(a));
353*09537850SAkhilesh Sanikop c = SquareAccumulate(c, vget_high_u16(a));
354*09537850SAkhilesh Sanikop c = vmulq_n_u32(c, kCdefDivisionTable[7]);
355*09537850SAkhilesh Sanikop return SumVector(c);
356*09537850SAkhilesh Sanikop }
357*09537850SAkhilesh Sanikop
CostOdd(const uint16x8_t a,const uint16x8_t b,const uint32x4_t mask,const uint32x4_t division_table[2])358*09537850SAkhilesh Sanikop uint32_t CostOdd(const uint16x8_t a, const uint16x8_t b, const uint32x4_t mask,
359*09537850SAkhilesh Sanikop const uint32x4_t division_table[2]) {
360*09537850SAkhilesh Sanikop // Remove elements 0-2.
361*09537850SAkhilesh Sanikop uint32x4_t c = vandq_u32(mask, Square(vget_low_u16(a)));
362*09537850SAkhilesh Sanikop c = vaddq_u32(c, Square(vget_high_u16(a)));
363*09537850SAkhilesh Sanikop c = vmulq_n_u32(c, kCdefDivisionTable[7]);
364*09537850SAkhilesh Sanikop
365*09537850SAkhilesh Sanikop c = vmlaq_u32(c, Square(vget_low_u16(a)), division_table[0]);
366*09537850SAkhilesh Sanikop c = vmlaq_u32(c, Square(vget_low_u16(b)), division_table[1]);
367*09537850SAkhilesh Sanikop return SumVector(c);
368*09537850SAkhilesh Sanikop }
369*09537850SAkhilesh Sanikop
370*09537850SAkhilesh Sanikop template <int bitdepth>
CdefDirection_NEON(const void * LIBGAV1_RESTRICT const source,ptrdiff_t stride,uint8_t * LIBGAV1_RESTRICT const direction,int * LIBGAV1_RESTRICT const variance)371*09537850SAkhilesh Sanikop void CdefDirection_NEON(const void* LIBGAV1_RESTRICT const source,
372*09537850SAkhilesh Sanikop ptrdiff_t stride,
373*09537850SAkhilesh Sanikop uint8_t* LIBGAV1_RESTRICT const direction,
374*09537850SAkhilesh Sanikop int* LIBGAV1_RESTRICT const variance) {
375*09537850SAkhilesh Sanikop assert(direction != nullptr);
376*09537850SAkhilesh Sanikop assert(variance != nullptr);
377*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(source);
378*09537850SAkhilesh Sanikop
379*09537850SAkhilesh Sanikop uint32_t cost[8];
380*09537850SAkhilesh Sanikop uint16x8_t partial_lo[8], partial_hi[8];
381*09537850SAkhilesh Sanikop
382*09537850SAkhilesh Sanikop AddPartial<bitdepth>(src, stride, partial_lo, partial_hi);
383*09537850SAkhilesh Sanikop
384*09537850SAkhilesh Sanikop cost[2] = SquareAccumulate(partial_lo[2]);
385*09537850SAkhilesh Sanikop cost[6] = SquareAccumulate(partial_lo[6]);
386*09537850SAkhilesh Sanikop
387*09537850SAkhilesh Sanikop const uint32x4_t division_table[4] = {
388*09537850SAkhilesh Sanikop vld1q_u32(kCdefDivisionTable), vld1q_u32(kCdefDivisionTable + 4),
389*09537850SAkhilesh Sanikop vld1q_u32(kCdefDivisionTable + 8), vld1q_u32(kCdefDivisionTable + 12)};
390*09537850SAkhilesh Sanikop
391*09537850SAkhilesh Sanikop cost[0] = Cost0Or4(partial_lo[0], partial_hi[0], division_table);
392*09537850SAkhilesh Sanikop cost[4] = Cost0Or4(partial_lo[4], partial_hi[4], division_table);
393*09537850SAkhilesh Sanikop
394*09537850SAkhilesh Sanikop const uint32x4_t division_table_odd[2] = {
395*09537850SAkhilesh Sanikop vld1q_u32(kCdefDivisionTableOdd), vld1q_u32(kCdefDivisionTableOdd + 4)};
396*09537850SAkhilesh Sanikop
397*09537850SAkhilesh Sanikop const uint32x4_t element_3_mask = {0, 0, 0, static_cast<uint32_t>(-1)};
398*09537850SAkhilesh Sanikop
399*09537850SAkhilesh Sanikop cost[1] =
400*09537850SAkhilesh Sanikop CostOdd(partial_lo[1], partial_hi[1], element_3_mask, division_table_odd);
401*09537850SAkhilesh Sanikop cost[3] =
402*09537850SAkhilesh Sanikop CostOdd(partial_lo[3], partial_hi[3], element_3_mask, division_table_odd);
403*09537850SAkhilesh Sanikop cost[5] =
404*09537850SAkhilesh Sanikop CostOdd(partial_lo[5], partial_hi[5], element_3_mask, division_table_odd);
405*09537850SAkhilesh Sanikop cost[7] =
406*09537850SAkhilesh Sanikop CostOdd(partial_lo[7], partial_hi[7], element_3_mask, division_table_odd);
407*09537850SAkhilesh Sanikop
408*09537850SAkhilesh Sanikop uint32_t best_cost = 0;
409*09537850SAkhilesh Sanikop *direction = 0;
410*09537850SAkhilesh Sanikop for (int i = 0; i < 8; ++i) {
411*09537850SAkhilesh Sanikop if (cost[i] > best_cost) {
412*09537850SAkhilesh Sanikop best_cost = cost[i];
413*09537850SAkhilesh Sanikop *direction = i;
414*09537850SAkhilesh Sanikop }
415*09537850SAkhilesh Sanikop }
416*09537850SAkhilesh Sanikop *variance = (best_cost - cost[(*direction + 4) & 7]) >> 10;
417*09537850SAkhilesh Sanikop }
418*09537850SAkhilesh Sanikop
419*09537850SAkhilesh Sanikop // -------------------------------------------------------------------------
420*09537850SAkhilesh Sanikop // CdefFilter
421*09537850SAkhilesh Sanikop
422*09537850SAkhilesh Sanikop // Load 4 vectors based on the given |direction|.
LoadDirection(const uint16_t * LIBGAV1_RESTRICT const src,const ptrdiff_t stride,uint16x8_t * output,const int direction)423*09537850SAkhilesh Sanikop void LoadDirection(const uint16_t* LIBGAV1_RESTRICT const src,
424*09537850SAkhilesh Sanikop const ptrdiff_t stride, uint16x8_t* output,
425*09537850SAkhilesh Sanikop const int direction) {
426*09537850SAkhilesh Sanikop // Each |direction| describes a different set of source values. Expand this
427*09537850SAkhilesh Sanikop // set by negating each set. For |direction| == 0 this gives a diagonal line
428*09537850SAkhilesh Sanikop // from top right to bottom left. The first value is y, the second x. Negative
429*09537850SAkhilesh Sanikop // y values move up.
430*09537850SAkhilesh Sanikop // a b c d
431*09537850SAkhilesh Sanikop // {-1, 1}, {1, -1}, {-2, 2}, {2, -2}
432*09537850SAkhilesh Sanikop // c
433*09537850SAkhilesh Sanikop // a
434*09537850SAkhilesh Sanikop // 0
435*09537850SAkhilesh Sanikop // b
436*09537850SAkhilesh Sanikop // d
437*09537850SAkhilesh Sanikop const int y_0 = kCdefDirections[direction][0][0];
438*09537850SAkhilesh Sanikop const int x_0 = kCdefDirections[direction][0][1];
439*09537850SAkhilesh Sanikop const int y_1 = kCdefDirections[direction][1][0];
440*09537850SAkhilesh Sanikop const int x_1 = kCdefDirections[direction][1][1];
441*09537850SAkhilesh Sanikop output[0] = vld1q_u16(src + y_0 * stride + x_0);
442*09537850SAkhilesh Sanikop output[1] = vld1q_u16(src - y_0 * stride - x_0);
443*09537850SAkhilesh Sanikop output[2] = vld1q_u16(src + y_1 * stride + x_1);
444*09537850SAkhilesh Sanikop output[3] = vld1q_u16(src - y_1 * stride - x_1);
445*09537850SAkhilesh Sanikop }
446*09537850SAkhilesh Sanikop
447*09537850SAkhilesh Sanikop // Load 4 vectors based on the given |direction|. Use when |block_width| == 4 to
448*09537850SAkhilesh Sanikop // do 2 rows at a time.
LoadDirection4(const uint16_t * LIBGAV1_RESTRICT const src,const ptrdiff_t stride,uint16x8_t * output,const int direction)449*09537850SAkhilesh Sanikop void LoadDirection4(const uint16_t* LIBGAV1_RESTRICT const src,
450*09537850SAkhilesh Sanikop const ptrdiff_t stride, uint16x8_t* output,
451*09537850SAkhilesh Sanikop const int direction) {
452*09537850SAkhilesh Sanikop const int y_0 = kCdefDirections[direction][0][0];
453*09537850SAkhilesh Sanikop const int x_0 = kCdefDirections[direction][0][1];
454*09537850SAkhilesh Sanikop const int y_1 = kCdefDirections[direction][1][0];
455*09537850SAkhilesh Sanikop const int x_1 = kCdefDirections[direction][1][1];
456*09537850SAkhilesh Sanikop output[0] = vcombine_u16(vld1_u16(src + y_0 * stride + x_0),
457*09537850SAkhilesh Sanikop vld1_u16(src + y_0 * stride + stride + x_0));
458*09537850SAkhilesh Sanikop output[1] = vcombine_u16(vld1_u16(src - y_0 * stride - x_0),
459*09537850SAkhilesh Sanikop vld1_u16(src - y_0 * stride + stride - x_0));
460*09537850SAkhilesh Sanikop output[2] = vcombine_u16(vld1_u16(src + y_1 * stride + x_1),
461*09537850SAkhilesh Sanikop vld1_u16(src + y_1 * stride + stride + x_1));
462*09537850SAkhilesh Sanikop output[3] = vcombine_u16(vld1_u16(src - y_1 * stride - x_1),
463*09537850SAkhilesh Sanikop vld1_u16(src - y_1 * stride + stride - x_1));
464*09537850SAkhilesh Sanikop }
465*09537850SAkhilesh Sanikop
// Returns, per lane, the signed and constrained difference between |pixel| and
// |reference|:
//   magnitude = min(|pixel - reference|,
//                   max(0, threshold - shift(|pixel - reference|, damping)))
// with the sign of (pixel - reference) reapplied.
//
// |damping| is used as the per-lane shift count of vshlq_u16(), which shifts
// left for positive counts and right for negative ones.
// NOTE(review): callers presumably pass negated shift amounts so that the
// difference is shifted right - confirm at the call sites.
int16x8_t Constrain(const uint16x8_t pixel, const uint16x8_t reference,
                    const uint16x8_t threshold, const int16x8_t damping) {
  // For bitdepth == 8 the threshold range is [0, 15] and the damping range is
  // [3, 6]. When pixel == kCdefLargeValue (0x4000) the shifted difference is
  // always above the threshold, so the saturating subtraction below produces
  // 0 and such pixels contribute nothing.
  static_assert(kCdefLargeValue == 0x4000, "Invalid kCdefLargeValue");

  const uint16x8_t magnitude = vabdq_u16(pixel, reference);
  const uint16x8_t limit =
      vqsubq_u16(threshold, vshlq_u16(magnitude, damping));
  const uint16x8_t constrained = vminq_u16(limit, magnitude);

  // All ones (-1) in lanes where pixel - reference is negative, 0 elsewhere.
  const uint16x8_t negative = vcgtq_u16(reference, pixel);

  // (x ^ m) - m negates x where m is -1 and leaves it unchanged where m is 0.
  return vreinterpretq_s16_u16(
      vsubq_u16(veorq_u16(constrained, negative), negative));
}
486*09537850SAkhilesh Sanikop
487*09537850SAkhilesh Sanikop template <typename Pixel>
GetMaxPrimary(uint16x8_t * primary_val,uint16x8_t max,uint16x8_t cdef_large_value_mask)488*09537850SAkhilesh Sanikop uint16x8_t GetMaxPrimary(uint16x8_t* primary_val, uint16x8_t max,
489*09537850SAkhilesh Sanikop uint16x8_t cdef_large_value_mask) {
490*09537850SAkhilesh Sanikop if (sizeof(Pixel) == 1) {
491*09537850SAkhilesh Sanikop // The source is 16 bits, however, we only really care about the lower
492*09537850SAkhilesh Sanikop // 8 bits. The upper 8 bits contain the "large" flag. After the final
493*09537850SAkhilesh Sanikop // primary max has been calculated, zero out the upper 8 bits. Use this
494*09537850SAkhilesh Sanikop // to find the "16 bit" max.
495*09537850SAkhilesh Sanikop const uint8x16_t max_p01 = vmaxq_u8(vreinterpretq_u8_u16(primary_val[0]),
496*09537850SAkhilesh Sanikop vreinterpretq_u8_u16(primary_val[1]));
497*09537850SAkhilesh Sanikop const uint8x16_t max_p23 = vmaxq_u8(vreinterpretq_u8_u16(primary_val[2]),
498*09537850SAkhilesh Sanikop vreinterpretq_u8_u16(primary_val[3]));
499*09537850SAkhilesh Sanikop const uint16x8_t max_p = vreinterpretq_u16_u8(vmaxq_u8(max_p01, max_p23));
500*09537850SAkhilesh Sanikop max = vmaxq_u16(max, vandq_u16(max_p, cdef_large_value_mask));
501*09537850SAkhilesh Sanikop } else {
502*09537850SAkhilesh Sanikop // Convert kCdefLargeValue to 0 before calculating max.
503*09537850SAkhilesh Sanikop max = vmaxq_u16(max, vandq_u16(primary_val[0], cdef_large_value_mask));
504*09537850SAkhilesh Sanikop max = vmaxq_u16(max, vandq_u16(primary_val[1], cdef_large_value_mask));
505*09537850SAkhilesh Sanikop max = vmaxq_u16(max, vandq_u16(primary_val[2], cdef_large_value_mask));
506*09537850SAkhilesh Sanikop max = vmaxq_u16(max, vandq_u16(primary_val[3], cdef_large_value_mask));
507*09537850SAkhilesh Sanikop }
508*09537850SAkhilesh Sanikop return max;
509*09537850SAkhilesh Sanikop }
510*09537850SAkhilesh Sanikop
511*09537850SAkhilesh Sanikop template <typename Pixel>
GetMaxSecondary(uint16x8_t * secondary_val,uint16x8_t max,uint16x8_t cdef_large_value_mask)512*09537850SAkhilesh Sanikop uint16x8_t GetMaxSecondary(uint16x8_t* secondary_val, uint16x8_t max,
513*09537850SAkhilesh Sanikop uint16x8_t cdef_large_value_mask) {
514*09537850SAkhilesh Sanikop if (sizeof(Pixel) == 1) {
515*09537850SAkhilesh Sanikop const uint8x16_t max_s01 = vmaxq_u8(vreinterpretq_u8_u16(secondary_val[0]),
516*09537850SAkhilesh Sanikop vreinterpretq_u8_u16(secondary_val[1]));
517*09537850SAkhilesh Sanikop const uint8x16_t max_s23 = vmaxq_u8(vreinterpretq_u8_u16(secondary_val[2]),
518*09537850SAkhilesh Sanikop vreinterpretq_u8_u16(secondary_val[3]));
519*09537850SAkhilesh Sanikop const uint8x16_t max_s45 = vmaxq_u8(vreinterpretq_u8_u16(secondary_val[4]),
520*09537850SAkhilesh Sanikop vreinterpretq_u8_u16(secondary_val[5]));
521*09537850SAkhilesh Sanikop const uint8x16_t max_s67 = vmaxq_u8(vreinterpretq_u8_u16(secondary_val[6]),
522*09537850SAkhilesh Sanikop vreinterpretq_u8_u16(secondary_val[7]));
523*09537850SAkhilesh Sanikop const uint16x8_t max_s = vreinterpretq_u16_u8(
524*09537850SAkhilesh Sanikop vmaxq_u8(vmaxq_u8(max_s01, max_s23), vmaxq_u8(max_s45, max_s67)));
525*09537850SAkhilesh Sanikop max = vmaxq_u16(max, vandq_u16(max_s, cdef_large_value_mask));
526*09537850SAkhilesh Sanikop } else {
527*09537850SAkhilesh Sanikop max = vmaxq_u16(max, vandq_u16(secondary_val[0], cdef_large_value_mask));
528*09537850SAkhilesh Sanikop max = vmaxq_u16(max, vandq_u16(secondary_val[1], cdef_large_value_mask));
529*09537850SAkhilesh Sanikop max = vmaxq_u16(max, vandq_u16(secondary_val[2], cdef_large_value_mask));
530*09537850SAkhilesh Sanikop max = vmaxq_u16(max, vandq_u16(secondary_val[3], cdef_large_value_mask));
531*09537850SAkhilesh Sanikop max = vmaxq_u16(max, vandq_u16(secondary_val[4], cdef_large_value_mask));
532*09537850SAkhilesh Sanikop max = vmaxq_u16(max, vandq_u16(secondary_val[5], cdef_large_value_mask));
533*09537850SAkhilesh Sanikop max = vmaxq_u16(max, vandq_u16(secondary_val[6], cdef_large_value_mask));
534*09537850SAkhilesh Sanikop max = vmaxq_u16(max, vandq_u16(secondary_val[7], cdef_large_value_mask));
535*09537850SAkhilesh Sanikop }
536*09537850SAkhilesh Sanikop return max;
537*09537850SAkhilesh Sanikop }
538*09537850SAkhilesh Sanikop
539*09537850SAkhilesh Sanikop template <typename Pixel, int width>
StorePixels(void * dest,ptrdiff_t dst_stride,int16x8_t result)540*09537850SAkhilesh Sanikop void StorePixels(void* dest, ptrdiff_t dst_stride, int16x8_t result) {
541*09537850SAkhilesh Sanikop auto* const dst8 = static_cast<uint8_t*>(dest);
542*09537850SAkhilesh Sanikop if (sizeof(Pixel) == 1) {
543*09537850SAkhilesh Sanikop const uint8x8_t dst_pixel = vqmovun_s16(result);
544*09537850SAkhilesh Sanikop if (width == 8) {
545*09537850SAkhilesh Sanikop vst1_u8(dst8, dst_pixel);
546*09537850SAkhilesh Sanikop } else {
547*09537850SAkhilesh Sanikop StoreLo4(dst8, dst_pixel);
548*09537850SAkhilesh Sanikop StoreHi4(dst8 + dst_stride, dst_pixel);
549*09537850SAkhilesh Sanikop }
550*09537850SAkhilesh Sanikop } else {
551*09537850SAkhilesh Sanikop const uint16x8_t dst_pixel = vreinterpretq_u16_s16(result);
552*09537850SAkhilesh Sanikop auto* const dst16 = reinterpret_cast<uint16_t*>(dst8);
553*09537850SAkhilesh Sanikop if (width == 8) {
554*09537850SAkhilesh Sanikop vst1q_u16(dst16, dst_pixel);
555*09537850SAkhilesh Sanikop } else {
556*09537850SAkhilesh Sanikop auto* const dst16_next_row =
557*09537850SAkhilesh Sanikop reinterpret_cast<uint16_t*>(dst8 + dst_stride);
558*09537850SAkhilesh Sanikop vst1_u16(dst16, vget_low_u16(dst_pixel));
559*09537850SAkhilesh Sanikop vst1_u16(dst16_next_row, vget_high_u16(dst_pixel));
560*09537850SAkhilesh Sanikop }
561*09537850SAkhilesh Sanikop }
562*09537850SAkhilesh Sanikop }
563*09537850SAkhilesh Sanikop
564*09537850SAkhilesh Sanikop template <int width, typename Pixel, bool enable_primary = true,
565*09537850SAkhilesh Sanikop bool enable_secondary = true>
CdefFilter_NEON(const uint16_t * LIBGAV1_RESTRICT src,const ptrdiff_t src_stride,const int height,const int primary_strength,const int secondary_strength,const int damping,const int direction,void * LIBGAV1_RESTRICT dest,const ptrdiff_t dst_stride)566*09537850SAkhilesh Sanikop void CdefFilter_NEON(const uint16_t* LIBGAV1_RESTRICT src,
567*09537850SAkhilesh Sanikop const ptrdiff_t src_stride, const int height,
568*09537850SAkhilesh Sanikop const int primary_strength, const int secondary_strength,
569*09537850SAkhilesh Sanikop const int damping, const int direction,
570*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT dest, const ptrdiff_t dst_stride) {
571*09537850SAkhilesh Sanikop static_assert(width == 8 || width == 4, "");
572*09537850SAkhilesh Sanikop static_assert(enable_primary || enable_secondary, "");
573*09537850SAkhilesh Sanikop constexpr bool clipping_required = enable_primary && enable_secondary;
574*09537850SAkhilesh Sanikop auto* dst = static_cast<uint8_t*>(dest);
575*09537850SAkhilesh Sanikop const uint16x8_t cdef_large_value_mask =
576*09537850SAkhilesh Sanikop vdupq_n_u16(static_cast<uint16_t>(~kCdefLargeValue));
577*09537850SAkhilesh Sanikop const uint16x8_t primary_threshold = vdupq_n_u16(primary_strength);
578*09537850SAkhilesh Sanikop const uint16x8_t secondary_threshold = vdupq_n_u16(secondary_strength);
579*09537850SAkhilesh Sanikop
580*09537850SAkhilesh Sanikop int16x8_t primary_damping_shift, secondary_damping_shift;
581*09537850SAkhilesh Sanikop
582*09537850SAkhilesh Sanikop // FloorLog2() requires input to be > 0.
583*09537850SAkhilesh Sanikop // 8-bit damping range: Y: [3, 6], UV: [2, 5].
584*09537850SAkhilesh Sanikop // 10-bit damping range: Y: [3, 6 + 2], UV: [2, 5 + 2].
585*09537850SAkhilesh Sanikop if (enable_primary) {
586*09537850SAkhilesh Sanikop // 8-bit primary_strength: [0, 15] -> FloorLog2: [0, 3] so a clamp is
587*09537850SAkhilesh Sanikop // necessary for UV filtering.
588*09537850SAkhilesh Sanikop // 10-bit primary_strength: [0, 15 << 2].
589*09537850SAkhilesh Sanikop primary_damping_shift =
590*09537850SAkhilesh Sanikop vdupq_n_s16(-std::max(0, damping - FloorLog2(primary_strength)));
591*09537850SAkhilesh Sanikop }
592*09537850SAkhilesh Sanikop
593*09537850SAkhilesh Sanikop if (enable_secondary) {
594*09537850SAkhilesh Sanikop if (sizeof(Pixel) == 1) {
595*09537850SAkhilesh Sanikop // secondary_strength: [0, 4] -> FloorLog2: [0, 2] so no clamp to 0 is
596*09537850SAkhilesh Sanikop // necessary.
597*09537850SAkhilesh Sanikop assert(damping - FloorLog2(secondary_strength) >= 0);
598*09537850SAkhilesh Sanikop secondary_damping_shift =
599*09537850SAkhilesh Sanikop vdupq_n_s16(-(damping - FloorLog2(secondary_strength)));
600*09537850SAkhilesh Sanikop } else {
601*09537850SAkhilesh Sanikop // secondary_strength: [0, 4 << 2]
602*09537850SAkhilesh Sanikop secondary_damping_shift =
603*09537850SAkhilesh Sanikop vdupq_n_s16(-std::max(0, damping - FloorLog2(secondary_strength)));
604*09537850SAkhilesh Sanikop }
605*09537850SAkhilesh Sanikop }
606*09537850SAkhilesh Sanikop
607*09537850SAkhilesh Sanikop constexpr int coeff_shift = (sizeof(Pixel) == 1) ? 0 : kBitdepth10 - 8;
608*09537850SAkhilesh Sanikop const int primary_tap_0 =
609*09537850SAkhilesh Sanikop kCdefPrimaryTaps[(primary_strength >> coeff_shift) & 1][0];
610*09537850SAkhilesh Sanikop const int primary_tap_1 =
611*09537850SAkhilesh Sanikop kCdefPrimaryTaps[(primary_strength >> coeff_shift) & 1][1];
612*09537850SAkhilesh Sanikop
613*09537850SAkhilesh Sanikop int y = height;
614*09537850SAkhilesh Sanikop do {
615*09537850SAkhilesh Sanikop uint16x8_t pixel;
616*09537850SAkhilesh Sanikop if (width == 8) {
617*09537850SAkhilesh Sanikop pixel = vld1q_u16(src);
618*09537850SAkhilesh Sanikop } else {
619*09537850SAkhilesh Sanikop pixel = vcombine_u16(vld1_u16(src), vld1_u16(src + src_stride));
620*09537850SAkhilesh Sanikop }
621*09537850SAkhilesh Sanikop
622*09537850SAkhilesh Sanikop uint16x8_t min = pixel;
623*09537850SAkhilesh Sanikop uint16x8_t max = pixel;
624*09537850SAkhilesh Sanikop int16x8_t sum;
625*09537850SAkhilesh Sanikop
626*09537850SAkhilesh Sanikop if (enable_primary) {
627*09537850SAkhilesh Sanikop // Primary |direction|.
628*09537850SAkhilesh Sanikop uint16x8_t primary_val[4];
629*09537850SAkhilesh Sanikop if (width == 8) {
630*09537850SAkhilesh Sanikop LoadDirection(src, src_stride, primary_val, direction);
631*09537850SAkhilesh Sanikop } else {
632*09537850SAkhilesh Sanikop LoadDirection4(src, src_stride, primary_val, direction);
633*09537850SAkhilesh Sanikop }
634*09537850SAkhilesh Sanikop
635*09537850SAkhilesh Sanikop if (clipping_required) {
636*09537850SAkhilesh Sanikop min = vminq_u16(min, primary_val[0]);
637*09537850SAkhilesh Sanikop min = vminq_u16(min, primary_val[1]);
638*09537850SAkhilesh Sanikop min = vminq_u16(min, primary_val[2]);
639*09537850SAkhilesh Sanikop min = vminq_u16(min, primary_val[3]);
640*09537850SAkhilesh Sanikop
641*09537850SAkhilesh Sanikop max = GetMaxPrimary<Pixel>(primary_val, max, cdef_large_value_mask);
642*09537850SAkhilesh Sanikop }
643*09537850SAkhilesh Sanikop
644*09537850SAkhilesh Sanikop sum = Constrain(primary_val[0], pixel, primary_threshold,
645*09537850SAkhilesh Sanikop primary_damping_shift);
646*09537850SAkhilesh Sanikop sum = vmulq_n_s16(sum, primary_tap_0);
647*09537850SAkhilesh Sanikop sum = vmlaq_n_s16(sum,
648*09537850SAkhilesh Sanikop Constrain(primary_val[1], pixel, primary_threshold,
649*09537850SAkhilesh Sanikop primary_damping_shift),
650*09537850SAkhilesh Sanikop primary_tap_0);
651*09537850SAkhilesh Sanikop sum = vmlaq_n_s16(sum,
652*09537850SAkhilesh Sanikop Constrain(primary_val[2], pixel, primary_threshold,
653*09537850SAkhilesh Sanikop primary_damping_shift),
654*09537850SAkhilesh Sanikop primary_tap_1);
655*09537850SAkhilesh Sanikop sum = vmlaq_n_s16(sum,
656*09537850SAkhilesh Sanikop Constrain(primary_val[3], pixel, primary_threshold,
657*09537850SAkhilesh Sanikop primary_damping_shift),
658*09537850SAkhilesh Sanikop primary_tap_1);
659*09537850SAkhilesh Sanikop } else {
660*09537850SAkhilesh Sanikop sum = vdupq_n_s16(0);
661*09537850SAkhilesh Sanikop }
662*09537850SAkhilesh Sanikop
663*09537850SAkhilesh Sanikop if (enable_secondary) {
664*09537850SAkhilesh Sanikop // Secondary |direction| values (+/- 2). Clamp |direction|.
665*09537850SAkhilesh Sanikop uint16x8_t secondary_val[8];
666*09537850SAkhilesh Sanikop if (width == 8) {
667*09537850SAkhilesh Sanikop LoadDirection(src, src_stride, secondary_val, direction + 2);
668*09537850SAkhilesh Sanikop LoadDirection(src, src_stride, secondary_val + 4, direction - 2);
669*09537850SAkhilesh Sanikop } else {
670*09537850SAkhilesh Sanikop LoadDirection4(src, src_stride, secondary_val, direction + 2);
671*09537850SAkhilesh Sanikop LoadDirection4(src, src_stride, secondary_val + 4, direction - 2);
672*09537850SAkhilesh Sanikop }
673*09537850SAkhilesh Sanikop
674*09537850SAkhilesh Sanikop if (clipping_required) {
675*09537850SAkhilesh Sanikop min = vminq_u16(min, secondary_val[0]);
676*09537850SAkhilesh Sanikop min = vminq_u16(min, secondary_val[1]);
677*09537850SAkhilesh Sanikop min = vminq_u16(min, secondary_val[2]);
678*09537850SAkhilesh Sanikop min = vminq_u16(min, secondary_val[3]);
679*09537850SAkhilesh Sanikop min = vminq_u16(min, secondary_val[4]);
680*09537850SAkhilesh Sanikop min = vminq_u16(min, secondary_val[5]);
681*09537850SAkhilesh Sanikop min = vminq_u16(min, secondary_val[6]);
682*09537850SAkhilesh Sanikop min = vminq_u16(min, secondary_val[7]);
683*09537850SAkhilesh Sanikop
684*09537850SAkhilesh Sanikop max = GetMaxSecondary<Pixel>(secondary_val, max, cdef_large_value_mask);
685*09537850SAkhilesh Sanikop }
686*09537850SAkhilesh Sanikop
687*09537850SAkhilesh Sanikop sum = vmlaq_n_s16(sum,
688*09537850SAkhilesh Sanikop Constrain(secondary_val[0], pixel, secondary_threshold,
689*09537850SAkhilesh Sanikop secondary_damping_shift),
690*09537850SAkhilesh Sanikop kCdefSecondaryTap0);
691*09537850SAkhilesh Sanikop sum = vmlaq_n_s16(sum,
692*09537850SAkhilesh Sanikop Constrain(secondary_val[1], pixel, secondary_threshold,
693*09537850SAkhilesh Sanikop secondary_damping_shift),
694*09537850SAkhilesh Sanikop kCdefSecondaryTap0);
695*09537850SAkhilesh Sanikop sum = vmlaq_n_s16(sum,
696*09537850SAkhilesh Sanikop Constrain(secondary_val[2], pixel, secondary_threshold,
697*09537850SAkhilesh Sanikop secondary_damping_shift),
698*09537850SAkhilesh Sanikop kCdefSecondaryTap1);
699*09537850SAkhilesh Sanikop sum = vmlaq_n_s16(sum,
700*09537850SAkhilesh Sanikop Constrain(secondary_val[3], pixel, secondary_threshold,
701*09537850SAkhilesh Sanikop secondary_damping_shift),
702*09537850SAkhilesh Sanikop kCdefSecondaryTap1);
703*09537850SAkhilesh Sanikop sum = vmlaq_n_s16(sum,
704*09537850SAkhilesh Sanikop Constrain(secondary_val[4], pixel, secondary_threshold,
705*09537850SAkhilesh Sanikop secondary_damping_shift),
706*09537850SAkhilesh Sanikop kCdefSecondaryTap0);
707*09537850SAkhilesh Sanikop sum = vmlaq_n_s16(sum,
708*09537850SAkhilesh Sanikop Constrain(secondary_val[5], pixel, secondary_threshold,
709*09537850SAkhilesh Sanikop secondary_damping_shift),
710*09537850SAkhilesh Sanikop kCdefSecondaryTap0);
711*09537850SAkhilesh Sanikop sum = vmlaq_n_s16(sum,
712*09537850SAkhilesh Sanikop Constrain(secondary_val[6], pixel, secondary_threshold,
713*09537850SAkhilesh Sanikop secondary_damping_shift),
714*09537850SAkhilesh Sanikop kCdefSecondaryTap1);
715*09537850SAkhilesh Sanikop sum = vmlaq_n_s16(sum,
716*09537850SAkhilesh Sanikop Constrain(secondary_val[7], pixel, secondary_threshold,
717*09537850SAkhilesh Sanikop secondary_damping_shift),
718*09537850SAkhilesh Sanikop kCdefSecondaryTap1);
719*09537850SAkhilesh Sanikop }
720*09537850SAkhilesh Sanikop // Clip3(pixel + ((8 + sum - (sum < 0)) >> 4), min, max))
721*09537850SAkhilesh Sanikop const int16x8_t sum_lt_0 = vshrq_n_s16(sum, 15);
722*09537850SAkhilesh Sanikop sum = vaddq_s16(sum, sum_lt_0);
723*09537850SAkhilesh Sanikop int16x8_t result = vrsraq_n_s16(vreinterpretq_s16_u16(pixel), sum, 4);
724*09537850SAkhilesh Sanikop if (clipping_required) {
725*09537850SAkhilesh Sanikop result = vminq_s16(result, vreinterpretq_s16_u16(max));
726*09537850SAkhilesh Sanikop result = vmaxq_s16(result, vreinterpretq_s16_u16(min));
727*09537850SAkhilesh Sanikop }
728*09537850SAkhilesh Sanikop
729*09537850SAkhilesh Sanikop StorePixels<Pixel, width>(dst, dst_stride, result);
730*09537850SAkhilesh Sanikop
731*09537850SAkhilesh Sanikop src += (width == 8) ? src_stride : src_stride << 1;
732*09537850SAkhilesh Sanikop dst += (width == 8) ? dst_stride : dst_stride << 1;
733*09537850SAkhilesh Sanikop y -= (width == 8) ? 1 : 2;
734*09537850SAkhilesh Sanikop } while (y != 0);
735*09537850SAkhilesh Sanikop }
736*09537850SAkhilesh Sanikop
737*09537850SAkhilesh Sanikop } // namespace
738*09537850SAkhilesh Sanikop
739*09537850SAkhilesh Sanikop namespace low_bitdepth {
740*09537850SAkhilesh Sanikop namespace {
741*09537850SAkhilesh Sanikop
Init8bpp()742*09537850SAkhilesh Sanikop void Init8bpp() {
743*09537850SAkhilesh Sanikop Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
744*09537850SAkhilesh Sanikop assert(dsp != nullptr);
745*09537850SAkhilesh Sanikop dsp->cdef_direction = CdefDirection_NEON<kBitdepth8>;
746*09537850SAkhilesh Sanikop dsp->cdef_filters[0][0] = CdefFilter_NEON<4, uint8_t>;
747*09537850SAkhilesh Sanikop dsp->cdef_filters[0][1] = CdefFilter_NEON<4, uint8_t, /*enable_primary=*/true,
748*09537850SAkhilesh Sanikop /*enable_secondary=*/false>;
749*09537850SAkhilesh Sanikop dsp->cdef_filters[0][2] =
750*09537850SAkhilesh Sanikop CdefFilter_NEON<4, uint8_t, /*enable_primary=*/false>;
751*09537850SAkhilesh Sanikop dsp->cdef_filters[1][0] = CdefFilter_NEON<8, uint8_t>;
752*09537850SAkhilesh Sanikop dsp->cdef_filters[1][1] = CdefFilter_NEON<8, uint8_t, /*enable_primary=*/true,
753*09537850SAkhilesh Sanikop /*enable_secondary=*/false>;
754*09537850SAkhilesh Sanikop dsp->cdef_filters[1][2] =
755*09537850SAkhilesh Sanikop CdefFilter_NEON<8, uint8_t, /*enable_primary=*/false>;
756*09537850SAkhilesh Sanikop }
757*09537850SAkhilesh Sanikop
758*09537850SAkhilesh Sanikop } // namespace
759*09537850SAkhilesh Sanikop } // namespace low_bitdepth
760*09537850SAkhilesh Sanikop
761*09537850SAkhilesh Sanikop #if LIBGAV1_MAX_BITDEPTH >= 10
762*09537850SAkhilesh Sanikop namespace high_bitdepth {
763*09537850SAkhilesh Sanikop namespace {
764*09537850SAkhilesh Sanikop
Init10bpp()765*09537850SAkhilesh Sanikop void Init10bpp() {
766*09537850SAkhilesh Sanikop Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
767*09537850SAkhilesh Sanikop assert(dsp != nullptr);
768*09537850SAkhilesh Sanikop dsp->cdef_direction = CdefDirection_NEON<kBitdepth10>;
769*09537850SAkhilesh Sanikop dsp->cdef_filters[0][0] = CdefFilter_NEON<4, uint16_t>;
770*09537850SAkhilesh Sanikop dsp->cdef_filters[0][1] =
771*09537850SAkhilesh Sanikop CdefFilter_NEON<4, uint16_t, /*enable_primary=*/true,
772*09537850SAkhilesh Sanikop /*enable_secondary=*/false>;
773*09537850SAkhilesh Sanikop dsp->cdef_filters[0][2] =
774*09537850SAkhilesh Sanikop CdefFilter_NEON<4, uint16_t, /*enable_primary=*/false>;
775*09537850SAkhilesh Sanikop dsp->cdef_filters[1][0] = CdefFilter_NEON<8, uint16_t>;
776*09537850SAkhilesh Sanikop dsp->cdef_filters[1][1] =
777*09537850SAkhilesh Sanikop CdefFilter_NEON<8, uint16_t, /*enable_primary=*/true,
778*09537850SAkhilesh Sanikop /*enable_secondary=*/false>;
779*09537850SAkhilesh Sanikop dsp->cdef_filters[1][2] =
780*09537850SAkhilesh Sanikop CdefFilter_NEON<8, uint16_t, /*enable_primary=*/false>;
781*09537850SAkhilesh Sanikop }
782*09537850SAkhilesh Sanikop
783*09537850SAkhilesh Sanikop } // namespace
784*09537850SAkhilesh Sanikop } // namespace high_bitdepth
785*09537850SAkhilesh Sanikop #endif // LIBGAV1_MAX_BITDEPTH >= 10
786*09537850SAkhilesh Sanikop
CdefInit_NEON()787*09537850SAkhilesh Sanikop void CdefInit_NEON() {
788*09537850SAkhilesh Sanikop low_bitdepth::Init8bpp();
789*09537850SAkhilesh Sanikop #if LIBGAV1_MAX_BITDEPTH >= 10
790*09537850SAkhilesh Sanikop high_bitdepth::Init10bpp();
791*09537850SAkhilesh Sanikop #endif
792*09537850SAkhilesh Sanikop }
793*09537850SAkhilesh Sanikop
794*09537850SAkhilesh Sanikop } // namespace dsp
795*09537850SAkhilesh Sanikop } // namespace libgav1
796*09537850SAkhilesh Sanikop #else // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {

// NEON support is compiled out: keep the symbol so callers still link, but
// register nothing.
void CdefInit_NEON() {}

}  // namespace dsp
}  // namespace libgav1
804*09537850SAkhilesh Sanikop #endif // LIBGAV1_ENABLE_NEON
805