xref: /aosp_15_r20/external/libgav1/src/dsp/loop_restoration.cc (revision 095378508e87ed692bf8dfeb34008b65b3735891)
// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14*09537850SAkhilesh Sanikop 
15*09537850SAkhilesh Sanikop #include "src/dsp/loop_restoration.h"
16*09537850SAkhilesh Sanikop 
17*09537850SAkhilesh Sanikop #include <algorithm>
18*09537850SAkhilesh Sanikop #include <cassert>
19*09537850SAkhilesh Sanikop #include <cstddef>
20*09537850SAkhilesh Sanikop #include <cstdint>
21*09537850SAkhilesh Sanikop #include <cstring>
22*09537850SAkhilesh Sanikop 
23*09537850SAkhilesh Sanikop #include "src/dsp/common.h"
24*09537850SAkhilesh Sanikop #include "src/dsp/dsp.h"
25*09537850SAkhilesh Sanikop #include "src/utils/common.h"
26*09537850SAkhilesh Sanikop #include "src/utils/constants.h"
27*09537850SAkhilesh Sanikop 
28*09537850SAkhilesh Sanikop namespace libgav1 {
29*09537850SAkhilesh Sanikop namespace dsp {
30*09537850SAkhilesh Sanikop 
// Lookup table for |ma|, indexed by z in [0, 255].
// Section 7.17.3.
// a2: range [1, 256].
// if (z >= 255)
//   a2 = 256;
// else if (z == 0)
//   a2 = 1;
// else
//   a2 = ((z << kSgrProjSgrBits) + (z >> 1)) / (z + 1);
// ma = 256 - a2;
// Hence entry 0 is 255, entry 255 is 0, and the entries never increase with z.
// (The tabulated values agree with the formula evaluated with a shift of 8.)
alignas(16) const uint8_t kSgrMaLookup[256] = {
    255, 128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16, 15, 14,
    13,  13,  12, 12, 11, 11, 10, 10, 9,  9,  9,  9,  8,  8,  8,  8,  7,  7,
    7,   7,   7,  6,  6,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  5,  5,  5,
    5,   5,   4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
    4,   3,   3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
    3,   3,   3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  2,  2,  2,  2,
    2,   2,   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
    2,   2,   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
    2,   2,   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
    2,   2,   2,  2,  2,  2,  2,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    1,   1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    1,   1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    1,   1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    1,   1,   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    1,   1,   1,  0};
56*09537850SAkhilesh Sanikop 
57*09537850SAkhilesh Sanikop namespace {
58*09537850SAkhilesh Sanikop 
59*09537850SAkhilesh Sanikop template <int bitdepth, typename Pixel>
WienerHorizontal(const Pixel * source,const ptrdiff_t source_stride,const int width,const int height,const int16_t * const filter,const int number_zero_coefficients,int16_t ** wiener_buffer)60*09537850SAkhilesh Sanikop inline void WienerHorizontal(const Pixel* source, const ptrdiff_t source_stride,
61*09537850SAkhilesh Sanikop                              const int width, const int height,
62*09537850SAkhilesh Sanikop                              const int16_t* const filter,
63*09537850SAkhilesh Sanikop                              const int number_zero_coefficients,
64*09537850SAkhilesh Sanikop                              int16_t** wiener_buffer) {
65*09537850SAkhilesh Sanikop   constexpr int kCenterTap = kWienerFilterTaps / 2;
66*09537850SAkhilesh Sanikop   constexpr int kRoundBitsHorizontal = (bitdepth == 12)
67*09537850SAkhilesh Sanikop                                            ? kInterRoundBitsHorizontal12bpp
68*09537850SAkhilesh Sanikop                                            : kInterRoundBitsHorizontal;
69*09537850SAkhilesh Sanikop   constexpr int offset =
70*09537850SAkhilesh Sanikop       1 << (bitdepth + kWienerFilterBits - kRoundBitsHorizontal - 1);
71*09537850SAkhilesh Sanikop   constexpr int limit = (offset << 2) - 1;
72*09537850SAkhilesh Sanikop   for (int y = 0; y < height; ++y) {
73*09537850SAkhilesh Sanikop     int x = 0;
74*09537850SAkhilesh Sanikop     do {
75*09537850SAkhilesh Sanikop       // sum fits into 16 bits only when bitdepth = 8.
76*09537850SAkhilesh Sanikop       int sum = 0;
77*09537850SAkhilesh Sanikop       for (int k = number_zero_coefficients; k < kCenterTap; ++k) {
78*09537850SAkhilesh Sanikop         sum +=
79*09537850SAkhilesh Sanikop             filter[k] * (source[x + k] + source[x + kWienerFilterTaps - 1 - k]);
80*09537850SAkhilesh Sanikop       }
81*09537850SAkhilesh Sanikop       sum += filter[kCenterTap] * source[x + kCenterTap];
82*09537850SAkhilesh Sanikop       const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsHorizontal);
83*09537850SAkhilesh Sanikop       (*wiener_buffer)[x] = Clip3(rounded_sum, -offset, limit - offset);
84*09537850SAkhilesh Sanikop     } while (++x != width);
85*09537850SAkhilesh Sanikop     source += source_stride;
86*09537850SAkhilesh Sanikop     *wiener_buffer += width;
87*09537850SAkhilesh Sanikop   }
88*09537850SAkhilesh Sanikop }
89*09537850SAkhilesh Sanikop 
90*09537850SAkhilesh Sanikop template <int bitdepth, typename Pixel>
WienerVertical(const int16_t * wiener_buffer,const int width,const int height,const int16_t * const filter,const int number_zero_coefficients,void * const dest,const ptrdiff_t dest_stride)91*09537850SAkhilesh Sanikop inline void WienerVertical(const int16_t* wiener_buffer, const int width,
92*09537850SAkhilesh Sanikop                            const int height, const int16_t* const filter,
93*09537850SAkhilesh Sanikop                            const int number_zero_coefficients, void* const dest,
94*09537850SAkhilesh Sanikop                            const ptrdiff_t dest_stride) {
95*09537850SAkhilesh Sanikop   constexpr int kCenterTap = kWienerFilterTaps / 2;
96*09537850SAkhilesh Sanikop   constexpr int kRoundBitsVertical =
97*09537850SAkhilesh Sanikop       (bitdepth == 12) ? kInterRoundBitsVertical12bpp : kInterRoundBitsVertical;
98*09537850SAkhilesh Sanikop   auto* dst = static_cast<Pixel*>(dest);
99*09537850SAkhilesh Sanikop   int y = height;
100*09537850SAkhilesh Sanikop   do {
101*09537850SAkhilesh Sanikop     int x = 0;
102*09537850SAkhilesh Sanikop     do {
103*09537850SAkhilesh Sanikop       // sum needs 32 bits.
104*09537850SAkhilesh Sanikop       int sum = 0;
105*09537850SAkhilesh Sanikop       for (int k = number_zero_coefficients; k < kCenterTap; ++k) {
106*09537850SAkhilesh Sanikop         sum += filter[k] *
107*09537850SAkhilesh Sanikop                (wiener_buffer[k * width + x] +
108*09537850SAkhilesh Sanikop                 wiener_buffer[(kWienerFilterTaps - 1 - k) * width + x]);
109*09537850SAkhilesh Sanikop       }
110*09537850SAkhilesh Sanikop       sum += filter[kCenterTap] * wiener_buffer[kCenterTap * width + x];
111*09537850SAkhilesh Sanikop       const int rounded_sum = RightShiftWithRounding(sum, kRoundBitsVertical);
112*09537850SAkhilesh Sanikop       dst[x] = static_cast<Pixel>(Clip3(rounded_sum, 0, (1 << bitdepth) - 1));
113*09537850SAkhilesh Sanikop     } while (++x != width);
114*09537850SAkhilesh Sanikop     wiener_buffer += width;
115*09537850SAkhilesh Sanikop     dst += dest_stride;
116*09537850SAkhilesh Sanikop   } while (--y != 0);
117*09537850SAkhilesh Sanikop }
118*09537850SAkhilesh Sanikop 
119*09537850SAkhilesh Sanikop // Note: bit range for wiener filter.
120*09537850SAkhilesh Sanikop // Wiener filter process first applies horizontal filtering to input pixels,
121*09537850SAkhilesh Sanikop // followed by rounding with predefined bits (dependent on bitdepth).
122*09537850SAkhilesh Sanikop // Then vertical filtering is applied, followed by rounding (dependent on
123*09537850SAkhilesh Sanikop // bitdepth).
124*09537850SAkhilesh Sanikop // The process is the same as convolution:
125*09537850SAkhilesh Sanikop // <input> --> <horizontal filter> --> <rounding 0> --> <vertical filter>
126*09537850SAkhilesh Sanikop // --> <rounding 1>
127*09537850SAkhilesh Sanikop // By design:
128*09537850SAkhilesh Sanikop // (a). horizontal/vertical filtering adds 7 bits to input.
129*09537850SAkhilesh Sanikop // (b). The output of first rounding fits into 16 bits.
130*09537850SAkhilesh Sanikop // (c). The output of second rounding fits into 16 bits.
131*09537850SAkhilesh Sanikop // If input bitdepth > 8, the accumulator of the horizontal filter is larger
132*09537850SAkhilesh Sanikop // than 16 bit and smaller than 32 bits.
133*09537850SAkhilesh Sanikop // The accumulator of the vertical filter is larger than 16 bits and smaller
134*09537850SAkhilesh Sanikop // than 32 bits.
135*09537850SAkhilesh Sanikop // Note: range of wiener filter coefficients.
136*09537850SAkhilesh Sanikop // Wiener filter coefficients are symmetric, and their sum is 1 (128).
137*09537850SAkhilesh Sanikop // The range of each coefficient:
138*09537850SAkhilesh Sanikop // filter[0] = filter[6], 4 bits, min = -5, max = 10.
139*09537850SAkhilesh Sanikop // filter[1] = filter[5], 5 bits, min = -23, max = 8.
140*09537850SAkhilesh Sanikop // filter[2] = filter[4], 6 bits, min = -17, max = 46.
141*09537850SAkhilesh Sanikop // filter[3] = 128 - 2 * (filter[0] + filter[1] + filter[2]).
142*09537850SAkhilesh Sanikop // The difference from libaom is that in libaom:
143*09537850SAkhilesh Sanikop // filter[3] = 0 - 2 * (filter[0] + filter[1] + filter[2]).
144*09537850SAkhilesh Sanikop // Thus in libaom's computation, an offset of 128 is needed for filter[3].
145*09537850SAkhilesh Sanikop template <int bitdepth, typename Pixel>
WienerFilter_C(const RestorationUnitInfo & LIBGAV1_RESTRICT restoration_info,const void * LIBGAV1_RESTRICT const source,const ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_border,const ptrdiff_t top_border_stride,const void * LIBGAV1_RESTRICT const bottom_border,const ptrdiff_t bottom_border_stride,const int width,const int height,RestorationBuffer * LIBGAV1_RESTRICT const restoration_buffer,void * LIBGAV1_RESTRICT const dest)146*09537850SAkhilesh Sanikop void WienerFilter_C(
147*09537850SAkhilesh Sanikop     const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
148*09537850SAkhilesh Sanikop     const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
149*09537850SAkhilesh Sanikop     const void* LIBGAV1_RESTRICT const top_border,
150*09537850SAkhilesh Sanikop     const ptrdiff_t top_border_stride,
151*09537850SAkhilesh Sanikop     const void* LIBGAV1_RESTRICT const bottom_border,
152*09537850SAkhilesh Sanikop     const ptrdiff_t bottom_border_stride, const int width, const int height,
153*09537850SAkhilesh Sanikop     RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
154*09537850SAkhilesh Sanikop     void* LIBGAV1_RESTRICT const dest) {
155*09537850SAkhilesh Sanikop   constexpr int kCenterTap = kWienerFilterTaps / 2;
156*09537850SAkhilesh Sanikop   const int16_t* const number_leading_zero_coefficients =
157*09537850SAkhilesh Sanikop       restoration_info.wiener_info.number_leading_zero_coefficients;
158*09537850SAkhilesh Sanikop   const int number_rows_to_skip = std::max(
159*09537850SAkhilesh Sanikop       static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
160*09537850SAkhilesh Sanikop       1);
161*09537850SAkhilesh Sanikop   int16_t* const wiener_buffer_org = restoration_buffer->wiener_buffer;
162*09537850SAkhilesh Sanikop 
163*09537850SAkhilesh Sanikop   // horizontal filtering.
164*09537850SAkhilesh Sanikop   const int height_horizontal =
165*09537850SAkhilesh Sanikop       height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
166*09537850SAkhilesh Sanikop   const int height_extra = (height_horizontal - height) >> 1;
167*09537850SAkhilesh Sanikop   assert(height_extra <= 2);
168*09537850SAkhilesh Sanikop   const int16_t* const filter_horizontal =
169*09537850SAkhilesh Sanikop       restoration_info.wiener_info.filter[WienerInfo::kHorizontal];
170*09537850SAkhilesh Sanikop   const auto* src = static_cast<const Pixel*>(source) - kCenterTap;
171*09537850SAkhilesh Sanikop   const auto* top = static_cast<const Pixel*>(top_border) - kCenterTap;
172*09537850SAkhilesh Sanikop   const auto* bottom = static_cast<const Pixel*>(bottom_border) - kCenterTap;
173*09537850SAkhilesh Sanikop   auto* wiener_buffer = wiener_buffer_org + number_rows_to_skip * width;
174*09537850SAkhilesh Sanikop 
175*09537850SAkhilesh Sanikop   if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
176*09537850SAkhilesh Sanikop     WienerHorizontal<bitdepth, Pixel>(
177*09537850SAkhilesh Sanikop         top + (2 - height_extra) * top_border_stride, top_border_stride, width,
178*09537850SAkhilesh Sanikop         height_extra, filter_horizontal, 0, &wiener_buffer);
179*09537850SAkhilesh Sanikop     WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
180*09537850SAkhilesh Sanikop                                       filter_horizontal, 0, &wiener_buffer);
181*09537850SAkhilesh Sanikop     WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
182*09537850SAkhilesh Sanikop                                       height_extra, filter_horizontal, 0,
183*09537850SAkhilesh Sanikop                                       &wiener_buffer);
184*09537850SAkhilesh Sanikop   } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
185*09537850SAkhilesh Sanikop     WienerHorizontal<bitdepth, Pixel>(
186*09537850SAkhilesh Sanikop         top + (2 - height_extra) * top_border_stride, top_border_stride, width,
187*09537850SAkhilesh Sanikop         height_extra, filter_horizontal, 1, &wiener_buffer);
188*09537850SAkhilesh Sanikop     WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
189*09537850SAkhilesh Sanikop                                       filter_horizontal, 1, &wiener_buffer);
190*09537850SAkhilesh Sanikop     WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
191*09537850SAkhilesh Sanikop                                       height_extra, filter_horizontal, 1,
192*09537850SAkhilesh Sanikop                                       &wiener_buffer);
193*09537850SAkhilesh Sanikop   } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
194*09537850SAkhilesh Sanikop     WienerHorizontal<bitdepth, Pixel>(
195*09537850SAkhilesh Sanikop         top + (2 - height_extra) * top_border_stride, top_border_stride, width,
196*09537850SAkhilesh Sanikop         height_extra, filter_horizontal, 2, &wiener_buffer);
197*09537850SAkhilesh Sanikop     WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
198*09537850SAkhilesh Sanikop                                       filter_horizontal, 2, &wiener_buffer);
199*09537850SAkhilesh Sanikop     WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
200*09537850SAkhilesh Sanikop                                       height_extra, filter_horizontal, 2,
201*09537850SAkhilesh Sanikop                                       &wiener_buffer);
202*09537850SAkhilesh Sanikop   } else {
203*09537850SAkhilesh Sanikop     assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
204*09537850SAkhilesh Sanikop     WienerHorizontal<bitdepth, Pixel>(
205*09537850SAkhilesh Sanikop         top + (2 - height_extra) * top_border_stride, top_border_stride, width,
206*09537850SAkhilesh Sanikop         height_extra, filter_horizontal, 3, &wiener_buffer);
207*09537850SAkhilesh Sanikop     WienerHorizontal<bitdepth, Pixel>(src, stride, width, height,
208*09537850SAkhilesh Sanikop                                       filter_horizontal, 3, &wiener_buffer);
209*09537850SAkhilesh Sanikop     WienerHorizontal<bitdepth, Pixel>(bottom, bottom_border_stride, width,
210*09537850SAkhilesh Sanikop                                       height_extra, filter_horizontal, 3,
211*09537850SAkhilesh Sanikop                                       &wiener_buffer);
212*09537850SAkhilesh Sanikop   }
213*09537850SAkhilesh Sanikop 
214*09537850SAkhilesh Sanikop   // vertical filtering.
215*09537850SAkhilesh Sanikop   const int16_t* const filter_vertical =
216*09537850SAkhilesh Sanikop       restoration_info.wiener_info.filter[WienerInfo::kVertical];
217*09537850SAkhilesh Sanikop   if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
218*09537850SAkhilesh Sanikop     // Because the top row of |source| is a duplicate of the second row, and the
219*09537850SAkhilesh Sanikop     // bottom row of |source| is a duplicate of its above row, we can duplicate
220*09537850SAkhilesh Sanikop     // the top and bottom row of |wiener_buffer| accordingly.
221*09537850SAkhilesh Sanikop     memcpy(wiener_buffer, wiener_buffer - width,
222*09537850SAkhilesh Sanikop            sizeof(*wiener_buffer) * width);
223*09537850SAkhilesh Sanikop     memcpy(wiener_buffer_org, wiener_buffer_org + width,
224*09537850SAkhilesh Sanikop            sizeof(*wiener_buffer) * width);
225*09537850SAkhilesh Sanikop     WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
226*09537850SAkhilesh Sanikop                                     filter_vertical, 0, dest, stride);
227*09537850SAkhilesh Sanikop   } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
228*09537850SAkhilesh Sanikop     WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
229*09537850SAkhilesh Sanikop                                     filter_vertical, 1, dest, stride);
230*09537850SAkhilesh Sanikop   } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
231*09537850SAkhilesh Sanikop     WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
232*09537850SAkhilesh Sanikop                                     filter_vertical, 2, dest, stride);
233*09537850SAkhilesh Sanikop   } else {
234*09537850SAkhilesh Sanikop     assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
235*09537850SAkhilesh Sanikop     WienerVertical<bitdepth, Pixel>(wiener_buffer_org, width, height,
236*09537850SAkhilesh Sanikop                                     filter_vertical, 3, dest, stride);
237*09537850SAkhilesh Sanikop   }
238*09537850SAkhilesh Sanikop }
239*09537850SAkhilesh Sanikop 
//------------------------------------------------------------------------------
// SGR
242*09537850SAkhilesh Sanikop 
243*09537850SAkhilesh Sanikop // When |height| is 1, |src_stride| could be set to an arbitrary value.
244*09537850SAkhilesh Sanikop template <typename Pixel, int size>
BoxSum(const Pixel * src,const ptrdiff_t src_stride,const int height,const int width,uint16_t * const * sums,uint32_t * const * square_sums)245*09537850SAkhilesh Sanikop LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
246*09537850SAkhilesh Sanikop                                   const int height, const int width,
247*09537850SAkhilesh Sanikop                                   uint16_t* const* sums,
248*09537850SAkhilesh Sanikop                                   uint32_t* const* square_sums) {
249*09537850SAkhilesh Sanikop   int y = height;
250*09537850SAkhilesh Sanikop   do {
251*09537850SAkhilesh Sanikop     uint32_t sum = 0;
252*09537850SAkhilesh Sanikop     uint32_t square_sum = 0;
253*09537850SAkhilesh Sanikop     for (int dx = 0; dx < size; ++dx) {
254*09537850SAkhilesh Sanikop       const Pixel source = src[dx];
255*09537850SAkhilesh Sanikop       sum += source;
256*09537850SAkhilesh Sanikop       square_sum += source * source;
257*09537850SAkhilesh Sanikop     }
258*09537850SAkhilesh Sanikop     (*sums)[0] = sum;
259*09537850SAkhilesh Sanikop     (*square_sums)[0] = square_sum;
260*09537850SAkhilesh Sanikop     int x = 1;
261*09537850SAkhilesh Sanikop     do {
262*09537850SAkhilesh Sanikop       const Pixel source0 = src[x - 1];
263*09537850SAkhilesh Sanikop       const Pixel source1 = src[x - 1 + size];
264*09537850SAkhilesh Sanikop       sum -= source0;
265*09537850SAkhilesh Sanikop       sum += source1;
266*09537850SAkhilesh Sanikop       square_sum -= source0 * source0;
267*09537850SAkhilesh Sanikop       square_sum += source1 * source1;
268*09537850SAkhilesh Sanikop       (*sums)[x] = sum;
269*09537850SAkhilesh Sanikop       (*square_sums)[x] = square_sum;
270*09537850SAkhilesh Sanikop     } while (++x != width);
271*09537850SAkhilesh Sanikop     src += src_stride;
272*09537850SAkhilesh Sanikop     ++sums;
273*09537850SAkhilesh Sanikop     ++square_sums;
274*09537850SAkhilesh Sanikop   } while (--y != 0);
275*09537850SAkhilesh Sanikop }
276*09537850SAkhilesh Sanikop 
277*09537850SAkhilesh Sanikop // When |height| is 1, |src_stride| could be set to an arbitrary value.
278*09537850SAkhilesh Sanikop template <typename Pixel>
BoxSum(const Pixel * src,const ptrdiff_t src_stride,const int height,const int width,uint16_t * const * sum3,uint16_t * const * sum5,uint32_t * const * square_sum3,uint32_t * const * square_sum5)279*09537850SAkhilesh Sanikop LIBGAV1_ALWAYS_INLINE void BoxSum(const Pixel* src, const ptrdiff_t src_stride,
280*09537850SAkhilesh Sanikop                                   const int height, const int width,
281*09537850SAkhilesh Sanikop                                   uint16_t* const* sum3, uint16_t* const* sum5,
282*09537850SAkhilesh Sanikop                                   uint32_t* const* square_sum3,
283*09537850SAkhilesh Sanikop                                   uint32_t* const* square_sum5) {
284*09537850SAkhilesh Sanikop   int y = height;
285*09537850SAkhilesh Sanikop   do {
286*09537850SAkhilesh Sanikop     uint32_t sum = 0;
287*09537850SAkhilesh Sanikop     uint32_t square_sum = 0;
288*09537850SAkhilesh Sanikop     for (int dx = 0; dx < 4; ++dx) {
289*09537850SAkhilesh Sanikop       const Pixel source = src[dx];
290*09537850SAkhilesh Sanikop       sum += source;
291*09537850SAkhilesh Sanikop       square_sum += source * source;
292*09537850SAkhilesh Sanikop     }
293*09537850SAkhilesh Sanikop     int x = 0;
294*09537850SAkhilesh Sanikop     do {
295*09537850SAkhilesh Sanikop       const Pixel source0 = src[x];
296*09537850SAkhilesh Sanikop       const Pixel source1 = src[x + 4];
297*09537850SAkhilesh Sanikop       sum -= source0;
298*09537850SAkhilesh Sanikop       square_sum -= source0 * source0;
299*09537850SAkhilesh Sanikop       (*sum3)[x] = sum;
300*09537850SAkhilesh Sanikop       (*square_sum3)[x] = square_sum;
301*09537850SAkhilesh Sanikop       sum += source1;
302*09537850SAkhilesh Sanikop       square_sum += source1 * source1;
303*09537850SAkhilesh Sanikop       (*sum5)[x] = sum + source0;
304*09537850SAkhilesh Sanikop       (*square_sum5)[x] = square_sum + source0 * source0;
305*09537850SAkhilesh Sanikop     } while (++x != width);
306*09537850SAkhilesh Sanikop     src += src_stride;
307*09537850SAkhilesh Sanikop     ++sum3;
308*09537850SAkhilesh Sanikop     ++sum5;
309*09537850SAkhilesh Sanikop     ++square_sum3;
310*09537850SAkhilesh Sanikop     ++square_sum5;
311*09537850SAkhilesh Sanikop   } while (--y != 0);
312*09537850SAkhilesh Sanikop }
313*09537850SAkhilesh Sanikop 
314*09537850SAkhilesh Sanikop template <int bitdepth, int n>
CalculateIntermediate(const uint32_t s,uint32_t a,const uint32_t b,uint8_t * const ma_ptr,uint32_t * const b_ptr)315*09537850SAkhilesh Sanikop inline void CalculateIntermediate(const uint32_t s, uint32_t a,
316*09537850SAkhilesh Sanikop                                   const uint32_t b, uint8_t* const ma_ptr,
317*09537850SAkhilesh Sanikop                                   uint32_t* const b_ptr) {
318*09537850SAkhilesh Sanikop   // a: before shift, max is 25 * (2^(bitdepth) - 1) * (2^(bitdepth) - 1).
319*09537850SAkhilesh Sanikop   // since max bitdepth = 12, max < 2^31.
320*09537850SAkhilesh Sanikop   // after shift, a < 2^16 * n < 2^22 regardless of bitdepth
321*09537850SAkhilesh Sanikop   a = RightShiftWithRounding(a, (bitdepth - 8) << 1);
322*09537850SAkhilesh Sanikop   // b: max is 25 * (2^(bitdepth) - 1). If bitdepth = 12, max < 2^19.
323*09537850SAkhilesh Sanikop   // d < 2^8 * n < 2^14 regardless of bitdepth
324*09537850SAkhilesh Sanikop   const uint32_t d = RightShiftWithRounding(b, bitdepth - 8);
325*09537850SAkhilesh Sanikop   // p: Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
326*09537850SAkhilesh Sanikop   // and p itself satisfies p < 2^14 * n^2 < 2^26.
327*09537850SAkhilesh Sanikop   // This bound on p is due to:
328*09537850SAkhilesh Sanikop   // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
329*09537850SAkhilesh Sanikop   // Note: Sometimes, in high bitdepth, we can end up with a*n < b*b.
330*09537850SAkhilesh Sanikop   // This is an artifact of rounding, and can only happen if all pixels
331*09537850SAkhilesh Sanikop   // are (almost) identical, so in this case we saturate to p=0.
332*09537850SAkhilesh Sanikop   const uint32_t p = (a * n < d * d) ? 0 : a * n - d * d;
333*09537850SAkhilesh Sanikop   // p * s < (2^14 * n^2) * round(2^20 / (n^2 * scale)) < 2^34 / scale <
334*09537850SAkhilesh Sanikop   // 2^32 as long as scale >= 4. So p * s fits into a uint32_t, and z < 2^12
335*09537850SAkhilesh Sanikop   // (this holds even after accounting for the rounding in s)
336*09537850SAkhilesh Sanikop   const uint32_t z = RightShiftWithRounding(p * s, kSgrProjScaleBits);
337*09537850SAkhilesh Sanikop   // ma: range [0, 255].
338*09537850SAkhilesh Sanikop   const uint32_t ma = kSgrMaLookup[std::min(z, 255u)];
339*09537850SAkhilesh Sanikop   const uint32_t one_over_n = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n;
340*09537850SAkhilesh Sanikop   // ma < 2^8, b < 2^(bitdepth) * n,
341*09537850SAkhilesh Sanikop   // one_over_n = round(2^12 / n)
342*09537850SAkhilesh Sanikop   // => the product here is < 2^(20 + bitdepth) <= 2^32,
343*09537850SAkhilesh Sanikop   // and b is set to a value < 2^(8 + bitdepth).
344*09537850SAkhilesh Sanikop   // This holds even with the rounding in one_over_n and in the overall result,
345*09537850SAkhilesh Sanikop   // as long as ma is strictly less than 2^8.
346*09537850SAkhilesh Sanikop   const uint32_t b2 = ma * b * one_over_n;
347*09537850SAkhilesh Sanikop   *ma_ptr = ma;
348*09537850SAkhilesh Sanikop   *b_ptr = RightShiftWithRounding(b2, kSgrProjReciprocalBits);
349*09537850SAkhilesh Sanikop }
350*09537850SAkhilesh Sanikop 
// Returns the weighted sum of three consecutive values with weights 3, 4, 3:
// 3 * src[0] + 4 * src[1] + 3 * src[2].
template <typename T>
inline uint32_t Sum343(const T* const src) {
  return 3 * (src[0] + src[2]) + 4 * src[1];
}
355*09537850SAkhilesh Sanikop 
// Returns the weighted sum of three consecutive values with weights 4, 4, 4:
// 4 * (src[0] + src[1] + src[2]).
template <typename T>
inline uint32_t Sum444(const T* const src) {
  return 4 * (src[0] + src[1] + src[2]);
}
360*09537850SAkhilesh Sanikop 
// Returns the weighted sum of three consecutive values with weights 5, 6, 5:
// 5 * src[0] + 6 * src[1] + 5 * src[2].
template <typename T>
inline uint32_t Sum565(const T* const src) {
  return 5 * (src[0] + src[2]) + 6 * src[1];
}
365*09537850SAkhilesh Sanikop 
366*09537850SAkhilesh Sanikop template <int bitdepth>
BoxFilterPreProcess5(const uint16_t * const sum5[5],const uint32_t * const square_sum5[5],const int width,const uint32_t s,SgrBuffer * const sgr_buffer,uint16_t * const ma565,uint32_t * const b565)367*09537850SAkhilesh Sanikop LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
368*09537850SAkhilesh Sanikop     const uint16_t* const sum5[5], const uint32_t* const square_sum5[5],
369*09537850SAkhilesh Sanikop     const int width, const uint32_t s, SgrBuffer* const sgr_buffer,
370*09537850SAkhilesh Sanikop     uint16_t* const ma565, uint32_t* const b565) {
371*09537850SAkhilesh Sanikop   int x = 0;
372*09537850SAkhilesh Sanikop   do {
373*09537850SAkhilesh Sanikop     uint32_t a = 0;
374*09537850SAkhilesh Sanikop     uint32_t b = 0;
375*09537850SAkhilesh Sanikop     for (int dy = 0; dy < 5; ++dy) {
376*09537850SAkhilesh Sanikop       a += square_sum5[dy][x];
377*09537850SAkhilesh Sanikop       b += sum5[dy][x];
378*09537850SAkhilesh Sanikop     }
379*09537850SAkhilesh Sanikop     CalculateIntermediate<bitdepth, 25>(s, a, b, sgr_buffer->ma + x,
380*09537850SAkhilesh Sanikop                                         sgr_buffer->b + x);
381*09537850SAkhilesh Sanikop   } while (++x != width + 2);
382*09537850SAkhilesh Sanikop   x = 0;
383*09537850SAkhilesh Sanikop   do {
384*09537850SAkhilesh Sanikop     ma565[x] = Sum565(sgr_buffer->ma + x);
385*09537850SAkhilesh Sanikop     b565[x] = Sum565(sgr_buffer->b + x);
386*09537850SAkhilesh Sanikop   } while (++x != width);
387*09537850SAkhilesh Sanikop }
388*09537850SAkhilesh Sanikop 
389*09537850SAkhilesh Sanikop template <int bitdepth>
BoxFilterPreProcess3(const uint16_t * const sum3[3],const uint32_t * const square_sum3[3],const int width,const uint32_t s,const bool calculate444,SgrBuffer * const sgr_buffer,uint16_t * const ma343,uint32_t * const b343,uint16_t * const ma444,uint32_t * const b444)390*09537850SAkhilesh Sanikop LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
391*09537850SAkhilesh Sanikop     const uint16_t* const sum3[3], const uint32_t* const square_sum3[3],
392*09537850SAkhilesh Sanikop     const int width, const uint32_t s, const bool calculate444,
393*09537850SAkhilesh Sanikop     SgrBuffer* const sgr_buffer, uint16_t* const ma343, uint32_t* const b343,
394*09537850SAkhilesh Sanikop     uint16_t* const ma444, uint32_t* const b444) {
395*09537850SAkhilesh Sanikop   int x = 0;
396*09537850SAkhilesh Sanikop   do {
397*09537850SAkhilesh Sanikop     uint32_t a = 0;
398*09537850SAkhilesh Sanikop     uint32_t b = 0;
399*09537850SAkhilesh Sanikop     for (int dy = 0; dy < 3; ++dy) {
400*09537850SAkhilesh Sanikop       a += square_sum3[dy][x];
401*09537850SAkhilesh Sanikop       b += sum3[dy][x];
402*09537850SAkhilesh Sanikop     }
403*09537850SAkhilesh Sanikop     CalculateIntermediate<bitdepth, 9>(s, a, b, sgr_buffer->ma + x,
404*09537850SAkhilesh Sanikop                                        sgr_buffer->b + x);
405*09537850SAkhilesh Sanikop   } while (++x != width + 2);
406*09537850SAkhilesh Sanikop   x = 0;
407*09537850SAkhilesh Sanikop   do {
408*09537850SAkhilesh Sanikop     ma343[x] = Sum343(sgr_buffer->ma + x);
409*09537850SAkhilesh Sanikop     b343[x] = Sum343(sgr_buffer->b + x);
410*09537850SAkhilesh Sanikop   } while (++x != width);
411*09537850SAkhilesh Sanikop   if (calculate444) {
412*09537850SAkhilesh Sanikop     x = 0;
413*09537850SAkhilesh Sanikop     do {
414*09537850SAkhilesh Sanikop       ma444[x] = Sum444(sgr_buffer->ma + x);
415*09537850SAkhilesh Sanikop       b444[x] = Sum444(sgr_buffer->b + x);
416*09537850SAkhilesh Sanikop     } while (++x != width);
417*09537850SAkhilesh Sanikop   }
418*09537850SAkhilesh Sanikop }
419*09537850SAkhilesh Sanikop 
420*09537850SAkhilesh Sanikop template <typename Pixel>
CalculateFilteredOutput(const Pixel src,const uint32_t ma,const uint32_t b,const int shift)421*09537850SAkhilesh Sanikop inline int CalculateFilteredOutput(const Pixel src, const uint32_t ma,
422*09537850SAkhilesh Sanikop                                    const uint32_t b, const int shift) {
423*09537850SAkhilesh Sanikop   const int32_t v = b - ma * src;
424*09537850SAkhilesh Sanikop   return RightShiftWithRounding(v,
425*09537850SAkhilesh Sanikop                                 kSgrProjSgrBits + shift - kSgrProjRestoreBits);
426*09537850SAkhilesh Sanikop }
427*09537850SAkhilesh Sanikop 
428*09537850SAkhilesh Sanikop template <typename Pixel>
BoxFilterPass1Kernel(const Pixel src0,const Pixel src1,const uint16_t * const ma565[2],const uint32_t * const b565[2],const ptrdiff_t x,int p[2])429*09537850SAkhilesh Sanikop inline void BoxFilterPass1Kernel(const Pixel src0, const Pixel src1,
430*09537850SAkhilesh Sanikop                                  const uint16_t* const ma565[2],
431*09537850SAkhilesh Sanikop                                  const uint32_t* const b565[2],
432*09537850SAkhilesh Sanikop                                  const ptrdiff_t x, int p[2]) {
433*09537850SAkhilesh Sanikop   p[0] = CalculateFilteredOutput<Pixel>(src0, ma565[0][x] + ma565[1][x],
434*09537850SAkhilesh Sanikop                                         b565[0][x] + b565[1][x], 5);
435*09537850SAkhilesh Sanikop   p[1] = CalculateFilteredOutput<Pixel>(src1, ma565[1][x], b565[1][x], 4);
436*09537850SAkhilesh Sanikop }
437*09537850SAkhilesh Sanikop 
438*09537850SAkhilesh Sanikop template <typename Pixel>
BoxFilterPass2Kernel(const Pixel src,const uint16_t * const ma343[3],const uint16_t * const ma444,const uint32_t * const b343[3],const uint32_t * const b444,const ptrdiff_t x)439*09537850SAkhilesh Sanikop inline int BoxFilterPass2Kernel(const Pixel src, const uint16_t* const ma343[3],
440*09537850SAkhilesh Sanikop                                 const uint16_t* const ma444,
441*09537850SAkhilesh Sanikop                                 const uint32_t* const b343[3],
442*09537850SAkhilesh Sanikop                                 const uint32_t* const b444, const ptrdiff_t x) {
443*09537850SAkhilesh Sanikop   const uint32_t ma = ma343[0][x] + ma444[x] + ma343[2][x];
444*09537850SAkhilesh Sanikop   const uint32_t b = b343[0][x] + b444[x] + b343[2][x];
445*09537850SAkhilesh Sanikop   return CalculateFilteredOutput<Pixel>(src, ma, b, 5);
446*09537850SAkhilesh Sanikop }
447*09537850SAkhilesh Sanikop 
448*09537850SAkhilesh Sanikop template <int bitdepth, typename Pixel>
SelfGuidedFinal(const int src,const int v)449*09537850SAkhilesh Sanikop inline Pixel SelfGuidedFinal(const int src, const int v) {
450*09537850SAkhilesh Sanikop   // if radius_pass_0 == 0 and radius_pass_1 == 0, the range of v is:
451*09537850SAkhilesh Sanikop   // bits(u) + bits(w0/w1/w2) + 2 = bitdepth + 13.
452*09537850SAkhilesh Sanikop   // Then, range of s is bitdepth + 2. This is a rough estimation, taking the
453*09537850SAkhilesh Sanikop   // maximum value of each element.
454*09537850SAkhilesh Sanikop   const int s = src + RightShiftWithRounding(
455*09537850SAkhilesh Sanikop                           v, kSgrProjRestoreBits + kSgrProjPrecisionBits);
456*09537850SAkhilesh Sanikop   return static_cast<Pixel>(Clip3(s, 0, (1 << bitdepth) - 1));
457*09537850SAkhilesh Sanikop }
458*09537850SAkhilesh Sanikop 
459*09537850SAkhilesh Sanikop template <int bitdepth, typename Pixel>
SelfGuidedDoubleMultiplier(const int src,const int filter0,const int filter1,const int16_t w0,const int16_t w2)460*09537850SAkhilesh Sanikop inline Pixel SelfGuidedDoubleMultiplier(const int src, const int filter0,
461*09537850SAkhilesh Sanikop                                         const int filter1, const int16_t w0,
462*09537850SAkhilesh Sanikop                                         const int16_t w2) {
463*09537850SAkhilesh Sanikop   const int v = w0 * filter0 + w2 * filter1;
464*09537850SAkhilesh Sanikop   return SelfGuidedFinal<bitdepth, Pixel>(src, v);
465*09537850SAkhilesh Sanikop }
466*09537850SAkhilesh Sanikop 
467*09537850SAkhilesh Sanikop template <int bitdepth, typename Pixel>
SelfGuidedSingleMultiplier(const int src,const int filter,const int16_t w0)468*09537850SAkhilesh Sanikop inline Pixel SelfGuidedSingleMultiplier(const int src, const int filter,
469*09537850SAkhilesh Sanikop                                         const int16_t w0) {
470*09537850SAkhilesh Sanikop   const int v = w0 * filter;
471*09537850SAkhilesh Sanikop   return SelfGuidedFinal<bitdepth, Pixel>(src, v);
472*09537850SAkhilesh Sanikop }
473*09537850SAkhilesh Sanikop 
474*09537850SAkhilesh Sanikop template <int bitdepth, typename Pixel>
BoxFilterPass1(const Pixel * const src,const ptrdiff_t stride,uint16_t * const sum5[5],uint32_t * const square_sum5[5],const int width,const uint32_t scale,const int16_t w0,SgrBuffer * const sgr_buffer,uint16_t * const ma565[2],uint32_t * const b565[2],Pixel * dst)475*09537850SAkhilesh Sanikop inline void BoxFilterPass1(const Pixel* const src, const ptrdiff_t stride,
476*09537850SAkhilesh Sanikop                            uint16_t* const sum5[5],
477*09537850SAkhilesh Sanikop                            uint32_t* const square_sum5[5], const int width,
478*09537850SAkhilesh Sanikop                            const uint32_t scale, const int16_t w0,
479*09537850SAkhilesh Sanikop                            SgrBuffer* const sgr_buffer,
480*09537850SAkhilesh Sanikop                            uint16_t* const ma565[2], uint32_t* const b565[2],
481*09537850SAkhilesh Sanikop                            Pixel* dst) {
482*09537850SAkhilesh Sanikop   BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
483*09537850SAkhilesh Sanikop                                  ma565[1], b565[1]);
484*09537850SAkhilesh Sanikop   int x = 0;
485*09537850SAkhilesh Sanikop   do {
486*09537850SAkhilesh Sanikop     int p[2];
487*09537850SAkhilesh Sanikop     BoxFilterPass1Kernel<Pixel>(src[x], src[stride + x], ma565, b565, x, p);
488*09537850SAkhilesh Sanikop     dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p[0], w0);
489*09537850SAkhilesh Sanikop     dst[stride + x] =
490*09537850SAkhilesh Sanikop         SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[stride + x], p[1], w0);
491*09537850SAkhilesh Sanikop   } while (++x != width);
492*09537850SAkhilesh Sanikop }
493*09537850SAkhilesh Sanikop 
494*09537850SAkhilesh Sanikop template <int bitdepth, typename Pixel>
BoxFilterPass2(const Pixel * const src,const Pixel * const src0,const int width,const uint16_t scale,const int16_t w0,uint16_t * const sum3[4],uint32_t * const square_sum3[4],SgrBuffer * const sgr_buffer,uint16_t * const ma343[4],uint16_t * const ma444[3],uint32_t * const b343[4],uint32_t * const b444[3],Pixel * dst)495*09537850SAkhilesh Sanikop inline void BoxFilterPass2(const Pixel* const src, const Pixel* const src0,
496*09537850SAkhilesh Sanikop                            const int width, const uint16_t scale,
497*09537850SAkhilesh Sanikop                            const int16_t w0, uint16_t* const sum3[4],
498*09537850SAkhilesh Sanikop                            uint32_t* const square_sum3[4],
499*09537850SAkhilesh Sanikop                            SgrBuffer* const sgr_buffer,
500*09537850SAkhilesh Sanikop                            uint16_t* const ma343[4], uint16_t* const ma444[3],
501*09537850SAkhilesh Sanikop                            uint32_t* const b343[4], uint32_t* const b444[3],
502*09537850SAkhilesh Sanikop                            Pixel* dst) {
503*09537850SAkhilesh Sanikop   BoxSum<Pixel, 3>(src0, 0, 1, width + 2, sum3 + 2, square_sum3 + 2);
504*09537850SAkhilesh Sanikop   BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, true,
505*09537850SAkhilesh Sanikop                                  sgr_buffer, ma343[2], b343[2], ma444[1],
506*09537850SAkhilesh Sanikop                                  b444[1]);
507*09537850SAkhilesh Sanikop   int x = 0;
508*09537850SAkhilesh Sanikop   do {
509*09537850SAkhilesh Sanikop     const int p =
510*09537850SAkhilesh Sanikop         BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
511*09537850SAkhilesh Sanikop     dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0);
512*09537850SAkhilesh Sanikop   } while (++x != width);
513*09537850SAkhilesh Sanikop }
514*09537850SAkhilesh Sanikop 
515*09537850SAkhilesh Sanikop template <int bitdepth, typename Pixel>
BoxFilter(const Pixel * const src,const ptrdiff_t stride,uint16_t * const sum3[4],uint16_t * const sum5[5],uint32_t * const square_sum3[4],uint32_t * const square_sum5[5],const int width,const uint16_t scales[2],const int16_t w0,const int16_t w2,SgrBuffer * const sgr_buffer,uint16_t * const ma343[4],uint16_t * const ma444[3],uint16_t * const ma565[2],uint32_t * const b343[4],uint32_t * const b444[3],uint32_t * const b565[2],Pixel * dst)516*09537850SAkhilesh Sanikop inline void BoxFilter(const Pixel* const src, const ptrdiff_t stride,
517*09537850SAkhilesh Sanikop                       uint16_t* const sum3[4], uint16_t* const sum5[5],
518*09537850SAkhilesh Sanikop                       uint32_t* const square_sum3[4],
519*09537850SAkhilesh Sanikop                       uint32_t* const square_sum5[5], const int width,
520*09537850SAkhilesh Sanikop                       const uint16_t scales[2], const int16_t w0,
521*09537850SAkhilesh Sanikop                       const int16_t w2, SgrBuffer* const sgr_buffer,
522*09537850SAkhilesh Sanikop                       uint16_t* const ma343[4], uint16_t* const ma444[3],
523*09537850SAkhilesh Sanikop                       uint16_t* const ma565[2], uint32_t* const b343[4],
524*09537850SAkhilesh Sanikop                       uint32_t* const b444[3], uint32_t* const b565[2],
525*09537850SAkhilesh Sanikop                       Pixel* dst) {
526*09537850SAkhilesh Sanikop   BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
527*09537850SAkhilesh Sanikop                                  sgr_buffer, ma565[1], b565[1]);
528*09537850SAkhilesh Sanikop   BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], true,
529*09537850SAkhilesh Sanikop                                  sgr_buffer, ma343[2], b343[2], ma444[1],
530*09537850SAkhilesh Sanikop                                  b444[1]);
531*09537850SAkhilesh Sanikop   BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1],
532*09537850SAkhilesh Sanikop                                  true, sgr_buffer, ma343[3], b343[3], ma444[2],
533*09537850SAkhilesh Sanikop                                  b444[2]);
534*09537850SAkhilesh Sanikop   int x = 0;
535*09537850SAkhilesh Sanikop   do {
536*09537850SAkhilesh Sanikop     int p[2][2];
537*09537850SAkhilesh Sanikop     BoxFilterPass1Kernel<Pixel>(src[x], src[stride + x], ma565, b565, x, p[0]);
538*09537850SAkhilesh Sanikop     p[1][0] =
539*09537850SAkhilesh Sanikop         BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343, b444[0], x);
540*09537850SAkhilesh Sanikop     p[1][1] = BoxFilterPass2Kernel<Pixel>(src[stride + x], ma343 + 1, ma444[1],
541*09537850SAkhilesh Sanikop                                           b343 + 1, b444[1], x);
542*09537850SAkhilesh Sanikop     dst[x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p[0][0],
543*09537850SAkhilesh Sanikop                                                          p[1][0], w0, w2);
544*09537850SAkhilesh Sanikop     dst[stride + x] = SelfGuidedDoubleMultiplier<bitdepth, Pixel>(
545*09537850SAkhilesh Sanikop         src[stride + x], p[0][1], p[1][1], w0, w2);
546*09537850SAkhilesh Sanikop   } while (++x != width);
547*09537850SAkhilesh Sanikop }
548*09537850SAkhilesh Sanikop 
549*09537850SAkhilesh Sanikop template <int bitdepth, typename Pixel>
BoxFilterProcess(const RestorationUnitInfo & restoration_info,const Pixel * src,const ptrdiff_t stride,const Pixel * const top_border,const ptrdiff_t top_border_stride,const Pixel * bottom_border,const ptrdiff_t bottom_border_stride,const int width,const int height,SgrBuffer * const sgr_buffer,Pixel * dst)550*09537850SAkhilesh Sanikop inline void BoxFilterProcess(const RestorationUnitInfo& restoration_info,
551*09537850SAkhilesh Sanikop                              const Pixel* src, const ptrdiff_t stride,
552*09537850SAkhilesh Sanikop                              const Pixel* const top_border,
553*09537850SAkhilesh Sanikop                              const ptrdiff_t top_border_stride,
554*09537850SAkhilesh Sanikop                              const Pixel* bottom_border,
555*09537850SAkhilesh Sanikop                              const ptrdiff_t bottom_border_stride,
556*09537850SAkhilesh Sanikop                              const int width, const int height,
557*09537850SAkhilesh Sanikop                              SgrBuffer* const sgr_buffer, Pixel* dst) {
558*09537850SAkhilesh Sanikop   const auto temp_stride = Align<ptrdiff_t>(width, 8);
559*09537850SAkhilesh Sanikop   const ptrdiff_t sum_stride = temp_stride + 8;
560*09537850SAkhilesh Sanikop   const int sgr_proj_index = restoration_info.sgr_proj_info.index;
561*09537850SAkhilesh Sanikop   const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index];  // < 2^12.
562*09537850SAkhilesh Sanikop   const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
563*09537850SAkhilesh Sanikop   const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
564*09537850SAkhilesh Sanikop   const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
565*09537850SAkhilesh Sanikop   uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
566*09537850SAkhilesh Sanikop   uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
567*09537850SAkhilesh Sanikop   sum3[0] = sgr_buffer->sum3;
568*09537850SAkhilesh Sanikop   square_sum3[0] = sgr_buffer->square_sum3;
569*09537850SAkhilesh Sanikop   ma343[0] = sgr_buffer->ma343;
570*09537850SAkhilesh Sanikop   b343[0] = sgr_buffer->b343;
571*09537850SAkhilesh Sanikop   for (int i = 1; i <= 3; ++i) {
572*09537850SAkhilesh Sanikop     sum3[i] = sum3[i - 1] + sum_stride;
573*09537850SAkhilesh Sanikop     square_sum3[i] = square_sum3[i - 1] + sum_stride;
574*09537850SAkhilesh Sanikop     ma343[i] = ma343[i - 1] + temp_stride;
575*09537850SAkhilesh Sanikop     b343[i] = b343[i - 1] + temp_stride;
576*09537850SAkhilesh Sanikop   }
577*09537850SAkhilesh Sanikop   sum5[0] = sgr_buffer->sum5;
578*09537850SAkhilesh Sanikop   square_sum5[0] = sgr_buffer->square_sum5;
579*09537850SAkhilesh Sanikop   for (int i = 1; i <= 4; ++i) {
580*09537850SAkhilesh Sanikop     sum5[i] = sum5[i - 1] + sum_stride;
581*09537850SAkhilesh Sanikop     square_sum5[i] = square_sum5[i - 1] + sum_stride;
582*09537850SAkhilesh Sanikop   }
583*09537850SAkhilesh Sanikop   ma444[0] = sgr_buffer->ma444;
584*09537850SAkhilesh Sanikop   b444[0] = sgr_buffer->b444;
585*09537850SAkhilesh Sanikop   for (int i = 1; i <= 2; ++i) {
586*09537850SAkhilesh Sanikop     ma444[i] = ma444[i - 1] + temp_stride;
587*09537850SAkhilesh Sanikop     b444[i] = b444[i - 1] + temp_stride;
588*09537850SAkhilesh Sanikop   }
589*09537850SAkhilesh Sanikop   ma565[0] = sgr_buffer->ma565;
590*09537850SAkhilesh Sanikop   ma565[1] = ma565[0] + temp_stride;
591*09537850SAkhilesh Sanikop   b565[0] = sgr_buffer->b565;
592*09537850SAkhilesh Sanikop   b565[1] = b565[0] + temp_stride;
593*09537850SAkhilesh Sanikop   assert(scales[0] != 0);
594*09537850SAkhilesh Sanikop   assert(scales[1] != 0);
595*09537850SAkhilesh Sanikop   BoxSum<Pixel>(top_border, top_border_stride, 2, width + 2, sum3, sum5 + 1,
596*09537850SAkhilesh Sanikop                 square_sum3, square_sum5 + 1);
597*09537850SAkhilesh Sanikop   sum5[0] = sum5[1];
598*09537850SAkhilesh Sanikop   square_sum5[0] = square_sum5[1];
599*09537850SAkhilesh Sanikop   BoxSum<Pixel>(src, stride, 1, width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2,
600*09537850SAkhilesh Sanikop                 square_sum5 + 3);
601*09537850SAkhilesh Sanikop   const Pixel* const s = (height > 1) ? src + stride : bottom_border;
602*09537850SAkhilesh Sanikop   BoxSum<Pixel>(s, 0, 1, width + 2, sum3 + 3, sum5 + 4, square_sum3 + 3,
603*09537850SAkhilesh Sanikop                 square_sum5 + 4);
604*09537850SAkhilesh Sanikop   BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
605*09537850SAkhilesh Sanikop                                  sgr_buffer, ma565[0], b565[0]);
606*09537850SAkhilesh Sanikop   BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], false,
607*09537850SAkhilesh Sanikop                                  sgr_buffer, ma343[0], b343[0], nullptr,
608*09537850SAkhilesh Sanikop                                  nullptr);
609*09537850SAkhilesh Sanikop   BoxFilterPreProcess3<bitdepth>(sum3 + 1, square_sum3 + 1, width, scales[1],
610*09537850SAkhilesh Sanikop                                  true, sgr_buffer, ma343[1], b343[1], ma444[0],
611*09537850SAkhilesh Sanikop                                  b444[0]);
612*09537850SAkhilesh Sanikop   sum5[0] = sgr_buffer->sum5;
613*09537850SAkhilesh Sanikop   square_sum5[0] = sgr_buffer->square_sum5;
614*09537850SAkhilesh Sanikop 
615*09537850SAkhilesh Sanikop   for (int y = (height >> 1) - 1; y > 0; --y) {
616*09537850SAkhilesh Sanikop     Circulate4PointersBy2<uint16_t>(sum3);
617*09537850SAkhilesh Sanikop     Circulate4PointersBy2<uint32_t>(square_sum3);
618*09537850SAkhilesh Sanikop     Circulate5PointersBy2<uint16_t>(sum5);
619*09537850SAkhilesh Sanikop     Circulate5PointersBy2<uint32_t>(square_sum5);
620*09537850SAkhilesh Sanikop     BoxSum<Pixel>(src + 2 * stride, stride, 2, width + 2, sum3 + 2, sum5 + 3,
621*09537850SAkhilesh Sanikop                   square_sum3 + 2, square_sum5 + 3);
622*09537850SAkhilesh Sanikop     BoxFilter<bitdepth, Pixel>(src + 3, stride, sum3, sum5, square_sum3,
623*09537850SAkhilesh Sanikop                                square_sum5, width, scales, w0, w2, sgr_buffer,
624*09537850SAkhilesh Sanikop                                ma343, ma444, ma565, b343, b444, b565, dst);
625*09537850SAkhilesh Sanikop     src += 2 * stride;
626*09537850SAkhilesh Sanikop     dst += 2 * stride;
627*09537850SAkhilesh Sanikop     Circulate4PointersBy2<uint16_t>(ma343);
628*09537850SAkhilesh Sanikop     Circulate4PointersBy2<uint32_t>(b343);
629*09537850SAkhilesh Sanikop     std::swap(ma444[0], ma444[2]);
630*09537850SAkhilesh Sanikop     std::swap(b444[0], b444[2]);
631*09537850SAkhilesh Sanikop     std::swap(ma565[0], ma565[1]);
632*09537850SAkhilesh Sanikop     std::swap(b565[0], b565[1]);
633*09537850SAkhilesh Sanikop   }
634*09537850SAkhilesh Sanikop 
635*09537850SAkhilesh Sanikop   Circulate4PointersBy2<uint16_t>(sum3);
636*09537850SAkhilesh Sanikop   Circulate4PointersBy2<uint32_t>(square_sum3);
637*09537850SAkhilesh Sanikop   Circulate5PointersBy2<uint16_t>(sum5);
638*09537850SAkhilesh Sanikop   Circulate5PointersBy2<uint32_t>(square_sum5);
639*09537850SAkhilesh Sanikop   if ((height & 1) == 0 || height > 1) {
640*09537850SAkhilesh Sanikop     const Pixel* sr;
641*09537850SAkhilesh Sanikop     ptrdiff_t s_stride;
642*09537850SAkhilesh Sanikop     if ((height & 1) == 0) {
643*09537850SAkhilesh Sanikop       sr = bottom_border;
644*09537850SAkhilesh Sanikop       s_stride = bottom_border_stride;
645*09537850SAkhilesh Sanikop     } else {
646*09537850SAkhilesh Sanikop       sr = src + 2 * stride;
647*09537850SAkhilesh Sanikop       s_stride = bottom_border - (src + 2 * stride);
648*09537850SAkhilesh Sanikop     }
649*09537850SAkhilesh Sanikop     BoxSum<Pixel>(sr, s_stride, 2, width + 2, sum3 + 2, sum5 + 3,
650*09537850SAkhilesh Sanikop                   square_sum3 + 2, square_sum5 + 3);
651*09537850SAkhilesh Sanikop     BoxFilter<bitdepth, Pixel>(src + 3, stride, sum3, sum5, square_sum3,
652*09537850SAkhilesh Sanikop                                square_sum5, width, scales, w0, w2, sgr_buffer,
653*09537850SAkhilesh Sanikop                                ma343, ma444, ma565, b343, b444, b565, dst);
654*09537850SAkhilesh Sanikop   }
655*09537850SAkhilesh Sanikop   if ((height & 1) != 0) {
656*09537850SAkhilesh Sanikop     src += 3;
657*09537850SAkhilesh Sanikop     if (height > 1) {
658*09537850SAkhilesh Sanikop       src += 2 * stride;
659*09537850SAkhilesh Sanikop       dst += 2 * stride;
660*09537850SAkhilesh Sanikop       Circulate4PointersBy2<uint16_t>(sum3);
661*09537850SAkhilesh Sanikop       Circulate4PointersBy2<uint32_t>(square_sum3);
662*09537850SAkhilesh Sanikop       Circulate5PointersBy2<uint16_t>(sum5);
663*09537850SAkhilesh Sanikop       Circulate5PointersBy2<uint32_t>(square_sum5);
664*09537850SAkhilesh Sanikop       Circulate4PointersBy2<uint16_t>(ma343);
665*09537850SAkhilesh Sanikop       Circulate4PointersBy2<uint32_t>(b343);
666*09537850SAkhilesh Sanikop       std::swap(ma444[0], ma444[2]);
667*09537850SAkhilesh Sanikop       std::swap(b444[0], b444[2]);
668*09537850SAkhilesh Sanikop       std::swap(ma565[0], ma565[1]);
669*09537850SAkhilesh Sanikop       std::swap(b565[0], b565[1]);
670*09537850SAkhilesh Sanikop     }
671*09537850SAkhilesh Sanikop     BoxSum<Pixel>(bottom_border + bottom_border_stride, bottom_border_stride, 1,
672*09537850SAkhilesh Sanikop                   width + 2, sum3 + 2, sum5 + 3, square_sum3 + 2,
673*09537850SAkhilesh Sanikop                   square_sum5 + 3);
674*09537850SAkhilesh Sanikop     sum5[4] = sum5[3];
675*09537850SAkhilesh Sanikop     square_sum5[4] = square_sum5[3];
676*09537850SAkhilesh Sanikop     BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scales[0],
677*09537850SAkhilesh Sanikop                                    sgr_buffer, ma565[1], b565[1]);
678*09537850SAkhilesh Sanikop     BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scales[1], false,
679*09537850SAkhilesh Sanikop                                    sgr_buffer, ma343[2], b343[2], nullptr,
680*09537850SAkhilesh Sanikop                                    nullptr);
681*09537850SAkhilesh Sanikop     int x = 0;
682*09537850SAkhilesh Sanikop     do {
683*09537850SAkhilesh Sanikop       const int p0 = CalculateFilteredOutput<Pixel>(
684*09537850SAkhilesh Sanikop           src[x], ma565[0][x] + ma565[1][x], b565[0][x] + b565[1][x], 5);
685*09537850SAkhilesh Sanikop       const int p1 = BoxFilterPass2Kernel<Pixel>(src[x], ma343, ma444[0], b343,
686*09537850SAkhilesh Sanikop                                                  b444[0], x);
687*09537850SAkhilesh Sanikop       dst[x] =
688*09537850SAkhilesh Sanikop           SelfGuidedDoubleMultiplier<bitdepth, Pixel>(src[x], p0, p1, w0, w2);
689*09537850SAkhilesh Sanikop     } while (++x != width);
690*09537850SAkhilesh Sanikop   }
691*09537850SAkhilesh Sanikop }
692*09537850SAkhilesh Sanikop 
693*09537850SAkhilesh Sanikop template <int bitdepth, typename Pixel>
BoxFilterProcessPass1(const RestorationUnitInfo & restoration_info,const Pixel * src,const ptrdiff_t stride,const Pixel * const top_border,const ptrdiff_t top_border_stride,const Pixel * bottom_border,const ptrdiff_t bottom_border_stride,const int width,const int height,SgrBuffer * const sgr_buffer,Pixel * dst)694*09537850SAkhilesh Sanikop inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
695*09537850SAkhilesh Sanikop                                   const Pixel* src, const ptrdiff_t stride,
696*09537850SAkhilesh Sanikop                                   const Pixel* const top_border,
697*09537850SAkhilesh Sanikop                                   const ptrdiff_t top_border_stride,
698*09537850SAkhilesh Sanikop                                   const Pixel* bottom_border,
699*09537850SAkhilesh Sanikop                                   const ptrdiff_t bottom_border_stride,
700*09537850SAkhilesh Sanikop                                   const int width, const int height,
701*09537850SAkhilesh Sanikop                                   SgrBuffer* const sgr_buffer, Pixel* dst) {
702*09537850SAkhilesh Sanikop   const auto temp_stride = Align<ptrdiff_t>(width, 8);
703*09537850SAkhilesh Sanikop   const ptrdiff_t sum_stride = temp_stride + 8;
704*09537850SAkhilesh Sanikop   const int sgr_proj_index = restoration_info.sgr_proj_info.index;
705*09537850SAkhilesh Sanikop   const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
706*09537850SAkhilesh Sanikop   const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
707*09537850SAkhilesh Sanikop   uint16_t *sum5[5], *ma565[2];
708*09537850SAkhilesh Sanikop   uint32_t *square_sum5[5], *b565[2];
709*09537850SAkhilesh Sanikop   sum5[0] = sgr_buffer->sum5;
710*09537850SAkhilesh Sanikop   square_sum5[0] = sgr_buffer->square_sum5;
711*09537850SAkhilesh Sanikop   for (int i = 1; i <= 4; ++i) {
712*09537850SAkhilesh Sanikop     sum5[i] = sum5[i - 1] + sum_stride;
713*09537850SAkhilesh Sanikop     square_sum5[i] = square_sum5[i - 1] + sum_stride;
714*09537850SAkhilesh Sanikop   }
715*09537850SAkhilesh Sanikop   ma565[0] = sgr_buffer->ma565;
716*09537850SAkhilesh Sanikop   ma565[1] = ma565[0] + temp_stride;
717*09537850SAkhilesh Sanikop   b565[0] = sgr_buffer->b565;
718*09537850SAkhilesh Sanikop   b565[1] = b565[0] + temp_stride;
719*09537850SAkhilesh Sanikop   assert(scale != 0);
720*09537850SAkhilesh Sanikop   BoxSum<Pixel, 5>(top_border, top_border_stride, 2, width + 2, sum5 + 1,
721*09537850SAkhilesh Sanikop                    square_sum5 + 1);
722*09537850SAkhilesh Sanikop   sum5[0] = sum5[1];
723*09537850SAkhilesh Sanikop   square_sum5[0] = square_sum5[1];
724*09537850SAkhilesh Sanikop   BoxSum<Pixel, 5>(src, stride, 1, width + 2, sum5 + 3, square_sum5 + 3);
725*09537850SAkhilesh Sanikop   const Pixel* const s = (height > 1) ? src + stride : bottom_border;
726*09537850SAkhilesh Sanikop   BoxSum<Pixel, 5>(s, 0, 1, width + 2, sum5 + 4, square_sum5 + 4);
727*09537850SAkhilesh Sanikop   BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
728*09537850SAkhilesh Sanikop                                  ma565[0], b565[0]);
729*09537850SAkhilesh Sanikop   sum5[0] = sgr_buffer->sum5;
730*09537850SAkhilesh Sanikop   square_sum5[0] = sgr_buffer->square_sum5;
731*09537850SAkhilesh Sanikop 
732*09537850SAkhilesh Sanikop   for (int y = (height >> 1) - 1; y > 0; --y) {
733*09537850SAkhilesh Sanikop     Circulate5PointersBy2<uint16_t>(sum5);
734*09537850SAkhilesh Sanikop     Circulate5PointersBy2<uint32_t>(square_sum5);
735*09537850SAkhilesh Sanikop     BoxSum<Pixel, 5>(src + 2 * stride, stride, 2, width + 2, sum5 + 3,
736*09537850SAkhilesh Sanikop                      square_sum5 + 3);
737*09537850SAkhilesh Sanikop     BoxFilterPass1<bitdepth, Pixel>(src + 3, stride, sum5, square_sum5, width,
738*09537850SAkhilesh Sanikop                                     scale, w0, sgr_buffer, ma565, b565, dst);
739*09537850SAkhilesh Sanikop     src += 2 * stride;
740*09537850SAkhilesh Sanikop     dst += 2 * stride;
741*09537850SAkhilesh Sanikop     std::swap(ma565[0], ma565[1]);
742*09537850SAkhilesh Sanikop     std::swap(b565[0], b565[1]);
743*09537850SAkhilesh Sanikop   }
744*09537850SAkhilesh Sanikop 
745*09537850SAkhilesh Sanikop   Circulate5PointersBy2<uint16_t>(sum5);
746*09537850SAkhilesh Sanikop   Circulate5PointersBy2<uint32_t>(square_sum5);
747*09537850SAkhilesh Sanikop   if ((height & 1) == 0 || height > 1) {
748*09537850SAkhilesh Sanikop     const Pixel* sr;
749*09537850SAkhilesh Sanikop     ptrdiff_t s_stride;
750*09537850SAkhilesh Sanikop     if ((height & 1) == 0) {
751*09537850SAkhilesh Sanikop       sr = bottom_border;
752*09537850SAkhilesh Sanikop       s_stride = bottom_border_stride;
753*09537850SAkhilesh Sanikop     } else {
754*09537850SAkhilesh Sanikop       sr = src + 2 * stride;
755*09537850SAkhilesh Sanikop       s_stride = bottom_border - (src + 2 * stride);
756*09537850SAkhilesh Sanikop     }
757*09537850SAkhilesh Sanikop     BoxSum<Pixel, 5>(sr, s_stride, 2, width + 2, sum5 + 3, square_sum5 + 3);
758*09537850SAkhilesh Sanikop     BoxFilterPass1<bitdepth, Pixel>(src + 3, stride, sum5, square_sum5, width,
759*09537850SAkhilesh Sanikop                                     scale, w0, sgr_buffer, ma565, b565, dst);
760*09537850SAkhilesh Sanikop   }
761*09537850SAkhilesh Sanikop   if ((height & 1) != 0) {
762*09537850SAkhilesh Sanikop     src += 3;
763*09537850SAkhilesh Sanikop     if (height > 1) {
764*09537850SAkhilesh Sanikop       src += 2 * stride;
765*09537850SAkhilesh Sanikop       dst += 2 * stride;
766*09537850SAkhilesh Sanikop       std::swap(ma565[0], ma565[1]);
767*09537850SAkhilesh Sanikop       std::swap(b565[0], b565[1]);
768*09537850SAkhilesh Sanikop       Circulate5PointersBy2<uint16_t>(sum5);
769*09537850SAkhilesh Sanikop       Circulate5PointersBy2<uint32_t>(square_sum5);
770*09537850SAkhilesh Sanikop     }
771*09537850SAkhilesh Sanikop     BoxSum<Pixel, 5>(bottom_border + bottom_border_stride, bottom_border_stride,
772*09537850SAkhilesh Sanikop                      1, width + 2, sum5 + 3, square_sum5 + 3);
773*09537850SAkhilesh Sanikop     sum5[4] = sum5[3];
774*09537850SAkhilesh Sanikop     square_sum5[4] = square_sum5[3];
775*09537850SAkhilesh Sanikop     BoxFilterPreProcess5<bitdepth>(sum5, square_sum5, width, scale, sgr_buffer,
776*09537850SAkhilesh Sanikop                                    ma565[1], b565[1]);
777*09537850SAkhilesh Sanikop     int x = 0;
778*09537850SAkhilesh Sanikop     do {
779*09537850SAkhilesh Sanikop       const int p = CalculateFilteredOutput<Pixel>(
780*09537850SAkhilesh Sanikop           src[x], ma565[0][x] + ma565[1][x], b565[0][x] + b565[1][x], 5);
781*09537850SAkhilesh Sanikop       dst[x] = SelfGuidedSingleMultiplier<bitdepth, Pixel>(src[x], p, w0);
782*09537850SAkhilesh Sanikop     } while (++x != width);
783*09537850SAkhilesh Sanikop   }
784*09537850SAkhilesh Sanikop }
785*09537850SAkhilesh Sanikop 
786*09537850SAkhilesh Sanikop template <int bitdepth, typename Pixel>
BoxFilterProcessPass2(const RestorationUnitInfo & restoration_info,const Pixel * src,const ptrdiff_t stride,const Pixel * const top_border,const ptrdiff_t top_border_stride,const Pixel * bottom_border,const ptrdiff_t bottom_border_stride,const int width,const int height,SgrBuffer * const sgr_buffer,Pixel * dst)787*09537850SAkhilesh Sanikop inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
788*09537850SAkhilesh Sanikop                                   const Pixel* src, const ptrdiff_t stride,
789*09537850SAkhilesh Sanikop                                   const Pixel* const top_border,
790*09537850SAkhilesh Sanikop                                   const ptrdiff_t top_border_stride,
791*09537850SAkhilesh Sanikop                                   const Pixel* bottom_border,
792*09537850SAkhilesh Sanikop                                   const ptrdiff_t bottom_border_stride,
793*09537850SAkhilesh Sanikop                                   const int width, const int height,
794*09537850SAkhilesh Sanikop                                   SgrBuffer* const sgr_buffer, Pixel* dst) {
795*09537850SAkhilesh Sanikop   assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
796*09537850SAkhilesh Sanikop   const auto temp_stride = Align<ptrdiff_t>(width, 8);
797*09537850SAkhilesh Sanikop   const ptrdiff_t sum_stride = temp_stride + 8;
798*09537850SAkhilesh Sanikop   const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
799*09537850SAkhilesh Sanikop   const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
800*09537850SAkhilesh Sanikop   const int sgr_proj_index = restoration_info.sgr_proj_info.index;
801*09537850SAkhilesh Sanikop   const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
802*09537850SAkhilesh Sanikop   uint16_t *sum3[3], *ma343[3], *ma444[2];
803*09537850SAkhilesh Sanikop   uint32_t *square_sum3[3], *b343[3], *b444[2];
804*09537850SAkhilesh Sanikop   sum3[0] = sgr_buffer->sum3;
805*09537850SAkhilesh Sanikop   square_sum3[0] = sgr_buffer->square_sum3;
806*09537850SAkhilesh Sanikop   ma343[0] = sgr_buffer->ma343;
807*09537850SAkhilesh Sanikop   b343[0] = sgr_buffer->b343;
808*09537850SAkhilesh Sanikop   for (int i = 1; i <= 2; ++i) {
809*09537850SAkhilesh Sanikop     sum3[i] = sum3[i - 1] + sum_stride;
810*09537850SAkhilesh Sanikop     square_sum3[i] = square_sum3[i - 1] + sum_stride;
811*09537850SAkhilesh Sanikop     ma343[i] = ma343[i - 1] + temp_stride;
812*09537850SAkhilesh Sanikop     b343[i] = b343[i - 1] + temp_stride;
813*09537850SAkhilesh Sanikop   }
814*09537850SAkhilesh Sanikop   ma444[0] = sgr_buffer->ma444;
815*09537850SAkhilesh Sanikop   ma444[1] = ma444[0] + temp_stride;
816*09537850SAkhilesh Sanikop   b444[0] = sgr_buffer->b444;
817*09537850SAkhilesh Sanikop   b444[1] = b444[0] + temp_stride;
818*09537850SAkhilesh Sanikop   assert(scale != 0);
819*09537850SAkhilesh Sanikop   BoxSum<Pixel, 3>(top_border, top_border_stride, 2, width + 2, sum3,
820*09537850SAkhilesh Sanikop                    square_sum3);
821*09537850SAkhilesh Sanikop   BoxSum<Pixel, 3>(src, stride, 1, width + 2, sum3 + 2, square_sum3 + 2);
822*09537850SAkhilesh Sanikop   BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, false,
823*09537850SAkhilesh Sanikop                                  sgr_buffer, ma343[0], b343[0], nullptr,
824*09537850SAkhilesh Sanikop                                  nullptr);
825*09537850SAkhilesh Sanikop   Circulate3PointersBy1<uint16_t>(sum3);
826*09537850SAkhilesh Sanikop   Circulate3PointersBy1<uint32_t>(square_sum3);
827*09537850SAkhilesh Sanikop   const Pixel* s;
828*09537850SAkhilesh Sanikop   if (height > 1) {
829*09537850SAkhilesh Sanikop     s = src + stride;
830*09537850SAkhilesh Sanikop   } else {
831*09537850SAkhilesh Sanikop     s = bottom_border;
832*09537850SAkhilesh Sanikop     bottom_border += bottom_border_stride;
833*09537850SAkhilesh Sanikop   }
834*09537850SAkhilesh Sanikop   BoxSum<Pixel, 3>(s, 0, 1, width + 2, sum3 + 2, square_sum3 + 2);
835*09537850SAkhilesh Sanikop   BoxFilterPreProcess3<bitdepth>(sum3, square_sum3, width, scale, true,
836*09537850SAkhilesh Sanikop                                  sgr_buffer, ma343[1], b343[1], ma444[0],
837*09537850SAkhilesh Sanikop                                  b444[0]);
838*09537850SAkhilesh Sanikop 
839*09537850SAkhilesh Sanikop   for (int y = height - 2; y > 0; --y) {
840*09537850SAkhilesh Sanikop     Circulate3PointersBy1<uint16_t>(sum3);
841*09537850SAkhilesh Sanikop     Circulate3PointersBy1<uint32_t>(square_sum3);
842*09537850SAkhilesh Sanikop     BoxFilterPass2<bitdepth, Pixel>(src + 2, src + 2 * stride, width, scale, w0,
843*09537850SAkhilesh Sanikop                                     sum3, square_sum3, sgr_buffer, ma343, ma444,
844*09537850SAkhilesh Sanikop                                     b343, b444, dst);
845*09537850SAkhilesh Sanikop     src += stride;
846*09537850SAkhilesh Sanikop     dst += stride;
847*09537850SAkhilesh Sanikop     Circulate3PointersBy1<uint16_t>(ma343);
848*09537850SAkhilesh Sanikop     Circulate3PointersBy1<uint32_t>(b343);
849*09537850SAkhilesh Sanikop     std::swap(ma444[0], ma444[1]);
850*09537850SAkhilesh Sanikop     std::swap(b444[0], b444[1]);
851*09537850SAkhilesh Sanikop   }
852*09537850SAkhilesh Sanikop 
853*09537850SAkhilesh Sanikop   src += 2;
854*09537850SAkhilesh Sanikop   int y = std::min(height, 2);
855*09537850SAkhilesh Sanikop   do {
856*09537850SAkhilesh Sanikop     Circulate3PointersBy1<uint16_t>(sum3);
857*09537850SAkhilesh Sanikop     Circulate3PointersBy1<uint32_t>(square_sum3);
858*09537850SAkhilesh Sanikop     BoxFilterPass2<bitdepth, Pixel>(src, bottom_border, width, scale, w0, sum3,
859*09537850SAkhilesh Sanikop                                     square_sum3, sgr_buffer, ma343, ma444, b343,
860*09537850SAkhilesh Sanikop                                     b444, dst);
861*09537850SAkhilesh Sanikop     src += stride;
862*09537850SAkhilesh Sanikop     dst += stride;
863*09537850SAkhilesh Sanikop     bottom_border += bottom_border_stride;
864*09537850SAkhilesh Sanikop     Circulate3PointersBy1<uint16_t>(ma343);
865*09537850SAkhilesh Sanikop     Circulate3PointersBy1<uint32_t>(b343);
866*09537850SAkhilesh Sanikop     std::swap(ma444[0], ma444[1]);
867*09537850SAkhilesh Sanikop     std::swap(b444[0], b444[1]);
868*09537850SAkhilesh Sanikop   } while (--y != 0);
869*09537850SAkhilesh Sanikop }
870*09537850SAkhilesh Sanikop 
871*09537850SAkhilesh Sanikop template <int bitdepth, typename Pixel>
SelfGuidedFilter_C(const RestorationUnitInfo & LIBGAV1_RESTRICT restoration_info,const void * LIBGAV1_RESTRICT const source,const ptrdiff_t stride,const void * LIBGAV1_RESTRICT const top_border,const ptrdiff_t top_border_stride,const void * LIBGAV1_RESTRICT const bottom_border,const ptrdiff_t bottom_border_stride,const int width,const int height,RestorationBuffer * LIBGAV1_RESTRICT const restoration_buffer,void * LIBGAV1_RESTRICT const dest)872*09537850SAkhilesh Sanikop void SelfGuidedFilter_C(
873*09537850SAkhilesh Sanikop     const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
874*09537850SAkhilesh Sanikop     const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
875*09537850SAkhilesh Sanikop     const void* LIBGAV1_RESTRICT const top_border,
876*09537850SAkhilesh Sanikop     const ptrdiff_t top_border_stride,
877*09537850SAkhilesh Sanikop     const void* LIBGAV1_RESTRICT const bottom_border,
878*09537850SAkhilesh Sanikop     const ptrdiff_t bottom_border_stride, const int width, const int height,
879*09537850SAkhilesh Sanikop     RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
880*09537850SAkhilesh Sanikop     void* LIBGAV1_RESTRICT const dest) {
881*09537850SAkhilesh Sanikop   const int index = restoration_info.sgr_proj_info.index;
882*09537850SAkhilesh Sanikop   const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
883*09537850SAkhilesh Sanikop   const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
884*09537850SAkhilesh Sanikop   const auto* src = static_cast<const Pixel*>(source);
885*09537850SAkhilesh Sanikop   const auto* top = static_cast<const Pixel*>(top_border);
886*09537850SAkhilesh Sanikop   const auto* bottom = static_cast<const Pixel*>(bottom_border);
887*09537850SAkhilesh Sanikop   auto* dst = static_cast<Pixel*>(dest);
888*09537850SAkhilesh Sanikop   SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
889*09537850SAkhilesh Sanikop   if (radius_pass_1 == 0) {
890*09537850SAkhilesh Sanikop     // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
891*09537850SAkhilesh Sanikop     // following assertion.
892*09537850SAkhilesh Sanikop     assert(radius_pass_0 != 0);
893*09537850SAkhilesh Sanikop     BoxFilterProcessPass1<bitdepth, Pixel>(
894*09537850SAkhilesh Sanikop         restoration_info, src - 3, stride, top - 3, top_border_stride,
895*09537850SAkhilesh Sanikop         bottom - 3, bottom_border_stride, width, height, sgr_buffer, dst);
896*09537850SAkhilesh Sanikop   } else if (radius_pass_0 == 0) {
897*09537850SAkhilesh Sanikop     BoxFilterProcessPass2<bitdepth, Pixel>(
898*09537850SAkhilesh Sanikop         restoration_info, src - 2, stride, top - 2, top_border_stride,
899*09537850SAkhilesh Sanikop         bottom - 2, bottom_border_stride, width, height, sgr_buffer, dst);
900*09537850SAkhilesh Sanikop   } else {
901*09537850SAkhilesh Sanikop     BoxFilterProcess<bitdepth, Pixel>(
902*09537850SAkhilesh Sanikop         restoration_info, src - 3, stride, top - 3, top_border_stride,
903*09537850SAkhilesh Sanikop         bottom - 3, bottom_border_stride, width, height, sgr_buffer, dst);
904*09537850SAkhilesh Sanikop   }
905*09537850SAkhilesh Sanikop }
906*09537850SAkhilesh Sanikop 
Init8bpp()907*09537850SAkhilesh Sanikop void Init8bpp() {
908*09537850SAkhilesh Sanikop   Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
909*09537850SAkhilesh Sanikop   assert(dsp != nullptr);
910*09537850SAkhilesh Sanikop #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
911*09537850SAkhilesh Sanikop   dsp->loop_restorations[0] = WienerFilter_C<8, uint8_t>;
912*09537850SAkhilesh Sanikop   dsp->loop_restorations[1] = SelfGuidedFilter_C<8, uint8_t>;
913*09537850SAkhilesh Sanikop #else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
914*09537850SAkhilesh Sanikop   static_cast<void>(dsp);
915*09537850SAkhilesh Sanikop #ifndef LIBGAV1_Dsp8bpp_WienerFilter
916*09537850SAkhilesh Sanikop   dsp->loop_restorations[0] = WienerFilter_C<8, uint8_t>;
917*09537850SAkhilesh Sanikop #endif
918*09537850SAkhilesh Sanikop #ifndef LIBGAV1_Dsp8bpp_SelfGuidedFilter
919*09537850SAkhilesh Sanikop   dsp->loop_restorations[1] = SelfGuidedFilter_C<8, uint8_t>;
920*09537850SAkhilesh Sanikop #endif
921*09537850SAkhilesh Sanikop #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
922*09537850SAkhilesh Sanikop }
923*09537850SAkhilesh Sanikop 
#if LIBGAV1_MAX_BITDEPTH >= 10
// Installs the C loop restoration functions into the writable 10bpp Dsp
// table. Slot 0 receives the Wiener filter and slot 1 the self-guided filter.
// When LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is set both slots are always written;
// otherwise a slot is written only if its LIBGAV1_Dsp10bpp_* macro is not
// defined.
void Init10bpp() {
  Dsp* const table = dsp_internal::GetWritableDspTable(10);
  assert(table != nullptr);
  // Keeps |table| "used" even when asserts and both assignments below are
  // compiled out.
  static_cast<void>(table);
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || !defined(LIBGAV1_Dsp10bpp_WienerFilter)
  table->loop_restorations[0] = WienerFilter_C<10, uint16_t>;
#endif
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
    !defined(LIBGAV1_Dsp10bpp_SelfGuidedFilter)
  table->loop_restorations[1] = SelfGuidedFilter_C<10, uint16_t>;
#endif
}
#endif  // LIBGAV1_MAX_BITDEPTH >= 10
942*09537850SAkhilesh Sanikop 
#if LIBGAV1_MAX_BITDEPTH == 12
// Installs the C loop restoration functions into the writable 12bpp Dsp
// table. Slot 0 receives the Wiener filter and slot 1 the self-guided filter.
// When LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is set both slots are always written;
// otherwise a slot is written only if its LIBGAV1_Dsp12bpp_* macro is not
// defined.
void Init12bpp() {
  Dsp* const table = dsp_internal::GetWritableDspTable(12);
  assert(table != nullptr);
  // Keeps |table| "used" even when asserts and both assignments below are
  // compiled out.
  static_cast<void>(table);
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || !defined(LIBGAV1_Dsp12bpp_WienerFilter)
  table->loop_restorations[0] = WienerFilter_C<12, uint16_t>;
#endif
#if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
    !defined(LIBGAV1_Dsp12bpp_SelfGuidedFilter)
  table->loop_restorations[1] = SelfGuidedFilter_C<12, uint16_t>;
#endif
}
#endif  // LIBGAV1_MAX_BITDEPTH == 12
961*09537850SAkhilesh Sanikop 
962*09537850SAkhilesh Sanikop }  // namespace
963*09537850SAkhilesh Sanikop 
LoopRestorationInit_C()964*09537850SAkhilesh Sanikop void LoopRestorationInit_C() {
965*09537850SAkhilesh Sanikop   Init8bpp();
966*09537850SAkhilesh Sanikop #if LIBGAV1_MAX_BITDEPTH >= 10
967*09537850SAkhilesh Sanikop   Init10bpp();
968*09537850SAkhilesh Sanikop #endif
969*09537850SAkhilesh Sanikop #if LIBGAV1_MAX_BITDEPTH == 12
970*09537850SAkhilesh Sanikop   Init12bpp();
971*09537850SAkhilesh Sanikop #endif
972*09537850SAkhilesh Sanikop }
973*09537850SAkhilesh Sanikop 
974*09537850SAkhilesh Sanikop }  // namespace dsp
975*09537850SAkhilesh Sanikop }  // namespace libgav1
976