1 /*
2 * Copyright © 2018, VideoLAN and dav1d authors
3 * Copyright © 2018, Two Orioles, LLC
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice, this
10 * list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 * this list of conditions and the following disclaimer in the documentation
14 * and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #include "config.h"
29
30 #include <stdlib.h>
31
32 #include "common/intops.h"
33
34 #include "src/looprestoration.h"
35 #include "src/tables.h"
36
// Row pitch of the padded scratch buffers: the maximum restoration width
// (256 * 1.5 = 384 pixels) plus 3 columns of padding on either side.
#define REST_UNIT_STRIDE (384 + 3 + 3)
39
40 // TODO Reuse p when no padding is needed (add and remove lpf pixels in p)
41 // TODO Chroma only requires 2 rows of padding.
42 static NOINLINE void
padding(pixel * dst,const pixel * p,const ptrdiff_t stride,const pixel (* left)[4],const pixel * lpf,int unit_w,const int stripe_h,const enum LrEdgeFlags edges)43 padding(pixel *dst, const pixel *p, const ptrdiff_t stride,
44 const pixel (*left)[4], const pixel *lpf, int unit_w,
45 const int stripe_h, const enum LrEdgeFlags edges)
46 {
47 const int have_left = !!(edges & LR_HAVE_LEFT);
48 const int have_right = !!(edges & LR_HAVE_RIGHT);
49
50 // Copy more pixels if we don't have to pad them
51 unit_w += 3 * have_left + 3 * have_right;
52 pixel *dst_l = dst + 3 * !have_left;
53 p -= 3 * have_left;
54 lpf -= 3 * have_left;
55
56 if (edges & LR_HAVE_TOP) {
57 // Copy previous loop filtered rows
58 const pixel *const above_1 = lpf;
59 const pixel *const above_2 = above_1 + PXSTRIDE(stride);
60 pixel_copy(dst_l, above_1, unit_w);
61 pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
62 pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
63 } else {
64 // Pad with first row
65 pixel_copy(dst_l, p, unit_w);
66 pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
67 pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
68 if (have_left) {
69 pixel_copy(dst_l, &left[0][1], 3);
70 pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
71 pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
72 }
73 }
74
75 pixel *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
76 if (edges & LR_HAVE_BOTTOM) {
77 // Copy next loop filtered rows
78 const pixel *const below_1 = lpf + 6 * PXSTRIDE(stride);
79 const pixel *const below_2 = below_1 + PXSTRIDE(stride);
80 pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
81 pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
82 pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
83 } else {
84 // Pad with last row
85 const pixel *const src = p + (stripe_h - 1) * PXSTRIDE(stride);
86 pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
87 pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
88 pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
89 if (have_left) {
90 pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
91 pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
92 pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
93 }
94 }
95
96 // Inner UNIT_WxSTRIPE_H
97 for (int j = 0; j < stripe_h; j++) {
98 pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
99 dst_tl += REST_UNIT_STRIDE;
100 p += PXSTRIDE(stride);
101 }
102
103 if (!have_right) {
104 pixel *pad = dst_l + unit_w;
105 pixel *row_last = &dst_l[unit_w - 1];
106 // Pad 3x(STRIPE_H+6) with last column
107 for (int j = 0; j < stripe_h + 6; j++) {
108 pixel_set(pad, *row_last, 3);
109 pad += REST_UNIT_STRIDE;
110 row_last += REST_UNIT_STRIDE;
111 }
112 }
113
114 if (!have_left) {
115 // Pad 3x(STRIPE_H+6) with first column
116 for (int j = 0; j < stripe_h + 6; j++) {
117 pixel_set(dst, *dst_l, 3);
118 dst += REST_UNIT_STRIDE;
119 dst_l += REST_UNIT_STRIDE;
120 }
121 } else {
122 dst += 3 * REST_UNIT_STRIDE;
123 for (int j = 0; j < stripe_h; j++) {
124 pixel_copy(dst, &left[j][1], 3);
125 dst += REST_UNIT_STRIDE;
126 }
127 }
128 }
129
// FIXME Could split into luma and chroma specific functions,
// (since first and last taps are always 0 for chroma)
132 // FIXME Could implement a version that requires less temporary memory
133 // (should be possible to implement with only 6 rows of temp storage)
/*
 * Apply a separable 7-tap filter to a w x h stripe at p, in place.
 *
 * The stripe is first copied into a padded scratch buffer by padding().
 * A horizontal pass with the taps in params->filter[0] then produces
 * h + 6 rows of intermediates, and a vertical pass with the taps in
 * params->filter[1] writes the final pixels back to p.
 */
static void wiener_c(pixel *p, const ptrdiff_t stride,
                     const pixel (*const left)[4],
                     const pixel *lpf, const int w, const int h,
                     const LooprestorationParams *const params,
                     const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    // Both scratch buffers are sized for a stripe of at most 64 rows plus
    // 3 rows of padding above and below (64 + 3 + 3 = 70).
    pixel tmp[70 * REST_UNIT_STRIDE];
    // Intermediates between the two passes need more than 8 bits
    uint16_t hor[70 * REST_UNIT_STRIDE];

    padding(tmp, p, stride, left, lpf, w, h, edges);

    const int16_t (*const filter)[8] = params->filter;
    const int bitdepth = bitdepth_from_max(bitdepth_max);

    // Horizontal pass over every padded row
    const int round_bits_h = bitdepth == 12 ? 5 : 3;
    const int rounding_off_h = 1 << (round_bits_h - 1);
    const int clip_limit = 1 << (bitdepth + 1 + 7 - round_bits_h);
    for (int j = 0; j < h + 6; j++) {
        const pixel *const in = &tmp[j * REST_UNIT_STRIDE];
        uint16_t *const out = &hor[j * REST_UNIT_STRIDE];

        for (int i = 0; i < w; i++) {
            // Constant bias added before the taps are accumulated
            int sum = 1 << (bitdepth + 6);
#if BITDEPTH == 8
            // 8-bit builds add 128x the centre pixel on top of the taps
            sum += in[i + 3] * 128;
#endif
            for (int k = 0; k < 7; k++)
                sum += in[i + k] * filter[0][k];

            out[i] = iclip((sum + rounding_off_h) >> round_bits_h,
                           0, clip_limit - 1);
        }
    }

    // Vertical pass, only for the h output rows
    const int round_bits_v = bitdepth == 12 ? 9 : 11;
    const int rounding_off_v = 1 << (round_bits_v - 1);
    const int round_offset = 1 << (bitdepth + (round_bits_v - 1));
    for (int j = 0; j < h; j++) {
        pixel *const out = p + j * PXSTRIDE(stride);

        for (int i = 0; i < w; i++) {
            const uint16_t *const col = &hor[j * REST_UNIT_STRIDE + i];
            int sum = -round_offset;

            for (int k = 0; k < 7; k++)
                sum += col[k * REST_UNIT_STRIDE] * filter[1][k];

            out[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v);
        }
    }
}
191
192 // Sum over a 3x3 area
193 // The dst and src pointers are positioned 3 pixels above and 3 pixels to the
194 // left of the top left corner. However, the self guided filter only needs 1
195 // pixel above and one pixel to the left. As for the pixels below and to the
196 // right they must be computed in the sums, but don't need to be stored.
197 //
198 // Example for a 4x4 block:
199 // x x x x x x x x x x
200 // x c c c c c c c c x
201 // x i s s s s s s i x
202 // x i s s s s s s i x
203 // x i s s s s s s i x
204 // x i s s s s s s i x
205 // x i s s s s s s i x
206 // x i s s s s s s i x
207 // x c c c c c c c c x
208 // x x x x x x x x x x
209 //
210 // s: Pixel summed and stored
211 // i: Pixel summed and stored (between loops)
212 // c: Pixel summed not stored
213 // x: Pixel not summed not stored
/*
 * 3x3 box sums of src (pixels into sum, squared pixels into sumsq).
 *
 * All three buffers use a row pitch of REST_UNIT_STRIDE; w and h are the
 * padded dimensions. A vertical 3-row pass writes rows 1 .. h-4 for
 * columns 1 .. w-2, then a horizontal 3-column pass runs in place over the
 * same rows and stores columns 2 .. w-3.
 */
static void boxsum3(int32_t *sumsq, coef *sum, const pixel *src,
                    const int w, const int h)
{
    // Source row 0 never contributes to a stored sum
    src += REST_UNIT_STRIDE;

    // Vertical pass; the outermost columns are never needed
    for (int x = 1; x < w - 1; x++) {
        const pixel *in = src + x;
        coef *out = sum + x;
        int32_t *out_sq = sumsq + x;
        int top = in[0], top_sq = top * top;
        int mid = in[REST_UNIT_STRIDE], mid_sq = mid * mid;

        // Slide a 3-row window downwards; rows outside 1 .. h-4 are not
        // consumed by the horizontal pass, so they are not produced
        for (int y = 2; y < h - 2; y++) {
            in += REST_UNIT_STRIDE;
            out += REST_UNIT_STRIDE;
            out_sq += REST_UNIT_STRIDE;

            const int bot = in[REST_UNIT_STRIDE];
            const int bot_sq = bot * bot;
            *out = top + mid + bot;
            *out_sq = top_sq + mid_sq + bot_sq;

            top = mid;
            top_sq = mid_sq;
            mid = bot;
            mid_sq = bot_sq;
        }
    }

    // Horizontal pass, in place, over the rows written above
    for (int y = 2; y < h - 2; y++) {
        coef *const row = sum + (y - 1) * REST_UNIT_STRIDE;
        int32_t *const row_sq = sumsq + (y - 1) * REST_UNIT_STRIDE;
        int lft = row[1], lft_sq = row_sq[1];
        int ctr = row[2], ctr_sq = row_sq[2];

        // Column 1 and the last two columns are read but never stored
        for (int x = 2; x < w - 2; x++) {
            const int rgt = row[x + 1], rgt_sq = row_sq[x + 1];
            row[x] = lft + ctr + rgt;
            row_sq[x] = lft_sq + ctr_sq + rgt_sq;

            lft = ctr;
            lft_sq = ctr_sq;
            ctr = rgt;
            ctr_sq = rgt_sq;
        }
    }
}
268
269 // Sum over a 5x5 area
270 // The dst and src pointers are positioned 3 pixels above and 3 pixels to the
271 // left of the top left corner. However, the self guided filter only needs 1
272 // pixel above and one pixel to the left. As for the pixels below and to the
273 // right they must be computed in the sums, but don't need to be stored.
274 //
275 // Example for a 4x4 block:
276 // c c c c c c c c c c
277 // c c c c c c c c c c
278 // i i s s s s s s i i
279 // i i s s s s s s i i
280 // i i s s s s s s i i
281 // i i s s s s s s i i
282 // i i s s s s s s i i
283 // i i s s s s s s i i
284 // c c c c c c c c c c
285 // c c c c c c c c c c
286 //
287 // s: Pixel summed and stored
288 // i: Pixel summed and stored (between loops)
289 // c: Pixel summed not stored
290 // x: Pixel not summed not stored
/*
 * 5x5 box sums of src (pixels into sum, squared pixels into sumsq).
 *
 * All three buffers use a row pitch of REST_UNIT_STRIDE; w and h are the
 * padded dimensions. A vertical 5-row pass writes rows 1 .. h-4 for every
 * column, then a horizontal 5-column pass runs in place over the same rows
 * and stores columns 2 .. w-3.
 */
static void boxsum5(int32_t *sumsq, coef *sum, const pixel *const src,
                    const int w, const int h)
{
    // Vertical pass
    for (int x = 0; x < w; x++) {
        const pixel *in = src + 3 * REST_UNIT_STRIDE + x;
        coef *out = sum + x;
        int32_t *out_sq = sumsq + x;

        // Prime the window with source rows 0 .. 3
        int r0 = in[-3 * REST_UNIT_STRIDE], r0_sq = r0 * r0;
        int r1 = in[-2 * REST_UNIT_STRIDE], r1_sq = r1 * r1;
        int r2 = in[-1 * REST_UNIT_STRIDE], r2_sq = r2 * r2;
        int r3 = in[0], r3_sq = r3 * r3;

        // Slide a 5-row window downwards; only the rows consumed by the
        // horizontal pass are produced
        for (int y = 2; y < h - 2; y++) {
            in += REST_UNIT_STRIDE;
            out += REST_UNIT_STRIDE;
            out_sq += REST_UNIT_STRIDE;

            const int r4 = *in, r4_sq = r4 * r4;
            *out = r0 + r1 + r2 + r3 + r4;
            *out_sq = r0_sq + r1_sq + r2_sq + r3_sq + r4_sq;

            r0 = r1;
            r0_sq = r1_sq;
            r1 = r2;
            r1_sq = r2_sq;
            r2 = r3;
            r2_sq = r3_sq;
            r3 = r4;
            r3_sq = r4_sq;
        }
    }

    // Horizontal pass, in place, over the rows written above
    for (int y = 2; y < h - 2; y++) {
        coef *const row = sum + (y - 1) * REST_UNIT_STRIDE;
        int32_t *const row_sq = sumsq + (y - 1) * REST_UNIT_STRIDE;

        // Prime the window with columns 0 .. 3
        int c0 = row[0], c0_sq = row_sq[0];
        int c1 = row[1], c1_sq = row_sq[1];
        int c2 = row[2], c2_sq = row_sq[2];
        int c3 = row[3], c3_sq = row_sq[3];

        for (int x = 2; x < w - 2; x++) {
            const int c4 = row[x + 2], c4_sq = row_sq[x + 2];
            row[x] = c0 + c1 + c2 + c3 + c4;
            row_sq[x] = c0_sq + c1_sq + c2_sq + c3_sq + c4_sq;

            c0 = c1;
            c0_sq = c1_sq;
            c1 = c2;
            c1_sq = c2_sq;
            c2 = c3;
            c2_sq = c3_sq;
            c3 = c4;
            c3_sq = c4_sq;
        }
    }
}
349
350 static NOINLINE void
selfguided_filter(coef * dst,const pixel * src,const ptrdiff_t src_stride,const int w,const int h,const int n,const unsigned s HIGHBD_DECL_SUFFIX)351 selfguided_filter(coef *dst, const pixel *src, const ptrdiff_t src_stride,
352 const int w, const int h, const int n, const unsigned s
353 HIGHBD_DECL_SUFFIX)
354 {
355 const unsigned sgr_one_by_x = n == 25 ? 164 : 455;
356
357 // Selfguided filter is applied to a maximum stripe height of 64 + 3 pixels
358 // of padding above and below
359 int32_t sumsq[68 /*(64 + 2 + 2)*/ * REST_UNIT_STRIDE];
360 int32_t *A = sumsq + 2 * REST_UNIT_STRIDE + 3;
361 // By inverting A and B after the boxsums, B can be of size coef instead
362 // of int32_t
363 coef sum[68 /*(64 + 2 + 2)*/ * REST_UNIT_STRIDE];
364 coef *B = sum + 2 * REST_UNIT_STRIDE + 3;
365
366 const int step = (n == 25) + 1;
367 if (n == 25)
368 boxsum5(sumsq, sum, src, w + 6, h + 6);
369 else
370 boxsum3(sumsq, sum, src, w + 6, h + 6);
371 const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
372
373 int32_t *AA = A - REST_UNIT_STRIDE;
374 coef *BB = B - REST_UNIT_STRIDE;
375 for (int j = -1; j < h + 1; j+= step) {
376 for (int i = -1; i < w + 1; i++) {
377 const int a =
378 (AA[i] + ((1 << (2 * bitdepth_min_8)) >> 1)) >> (2 * bitdepth_min_8);
379 const int b =
380 (BB[i] + ((1 << bitdepth_min_8) >> 1)) >> bitdepth_min_8;
381
382 const unsigned p = imax(a * n - b * b, 0);
383 const unsigned z = (p * s + (1 << 19)) >> 20;
384 const unsigned x = dav1d_sgr_x_by_x[umin(z, 255)];
385
386 // This is where we invert A and B, so that B is of size coef.
387 AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
388 BB[i] = x;
389 }
390 AA += step * REST_UNIT_STRIDE;
391 BB += step * REST_UNIT_STRIDE;
392 }
393
394 src += 3 * REST_UNIT_STRIDE + 3;
395 if (n == 25) {
396 int j = 0;
397 #define SIX_NEIGHBORS(P, i)\
398 ((P[i - REST_UNIT_STRIDE] + P[i + REST_UNIT_STRIDE]) * 6 + \
399 (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] + \
400 P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 5)
401 for (; j < h - 1; j+=2) {
402 for (int i = 0; i < w; i++) {
403 const int a = SIX_NEIGHBORS(B, i);
404 const int b = SIX_NEIGHBORS(A, i);
405 dst[i] = (b - a * src[i] + (1 << 8)) >> 9;
406 }
407 dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
408 src += REST_UNIT_STRIDE;
409 B += REST_UNIT_STRIDE;
410 A += REST_UNIT_STRIDE;
411 for (int i = 0; i < w; i++) {
412 const int a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
413 const int b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5;
414 dst[i] = (b - a * src[i] + (1 << 7)) >> 8;
415 }
416 dst += 384 /* Maximum restoration width is 384 (256 * 1.5) */;
417 src += REST_UNIT_STRIDE;
418 B += REST_UNIT_STRIDE;
419 A += REST_UNIT_STRIDE;
420 }
421 if (j + 1 == h) { // Last row, when number of rows is odd
422 for (int i = 0; i < w; i++) {
423 const int a = SIX_NEIGHBORS(B, i);
424 const int b = SIX_NEIGHBORS(A, i);
425 dst[i] = (b - a * src[i] + (1 << 8)) >> 9;
426 }
427 }
428 #undef SIX_NEIGHBORS
429 } else {
430 #define EIGHT_NEIGHBORS(P, i)\
431 ((P[i] + P[i - 1] + P[i + 1] + P[i - REST_UNIT_STRIDE] + P[i + REST_UNIT_STRIDE]) * 4 + \
432 (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] + \
433 P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 3)
434 for (int j = 0; j < h; j++) {
435 for (int i = 0; i < w; i++) {
436 const int a = EIGHT_NEIGHBORS(B, i);
437 const int b = EIGHT_NEIGHBORS(A, i);
438 dst[i] = (b - a * src[i] + (1 << 8)) >> 9;
439 }
440 dst += 384;
441 src += REST_UNIT_STRIDE;
442 B += REST_UNIT_STRIDE;
443 A += REST_UNIT_STRIDE;
444 }
445 }
446 #undef EIGHT_NEIGHBORS
447 }
448
/*
 * Self-guided restoration of a w x h stripe at p using only the 5x5 pass.
 *
 * The stripe is padded into a scratch buffer, filtered with n = 25 and
 * strength params->sgr.s0, and the filter output, weighted by
 * params->sgr.w0, is added to the pixels in place.
 */
static void sgr_5x5_c(pixel *p, const ptrdiff_t stride,
                      const pixel (*const left)[4], const pixel *lpf,
                      const int w, const int h,
                      const LooprestorationParams *const params,
                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    // Padded input: at most 64 rows plus 3 rows above and below
    pixel tmp[70 * REST_UNIT_STRIDE];
    // Filter output: at most 64 rows of 384 (256 * 1.5) coefficients
    coef dst[64 * 384];

    padding(tmp, p, stride, left, lpf, w, h, edges);
    selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 25,
                      params->sgr.s0 HIGHBD_TAIL_SUFFIX);

    const int w0 = params->sgr.w0;
    const coef *flt = dst;
    for (int j = 0; j < h; j++) {
        for (int i = 0; i < w; i++) {
            const int v = w0 * flt[i];
            p[i] = iclip_pixel(p[i] + ((v + (1 << 10)) >> 11));
        }
        flt += 384;
        p += PXSTRIDE(stride);
    }
}
476
/*
 * Self-guided restoration of a w x h stripe at p using only the 3x3 pass.
 *
 * The stripe is padded into a scratch buffer, filtered with n = 9 and
 * strength params->sgr.s1, and the filter output, weighted by
 * params->sgr.w1, is added to the pixels in place.
 */
static void sgr_3x3_c(pixel *p, const ptrdiff_t stride,
                      const pixel (*const left)[4], const pixel *lpf,
                      const int w, const int h,
                      const LooprestorationParams *const params,
                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    // Padded input: at most 64 rows plus 3 rows above and below
    pixel tmp[70 * REST_UNIT_STRIDE];
    // Filter output: at most 64 rows of 384 coefficients
    coef dst[64 * 384];

    padding(tmp, p, stride, left, lpf, w, h, edges);
    selfguided_filter(dst, tmp, REST_UNIT_STRIDE, w, h, 9,
                      params->sgr.s1 HIGHBD_TAIL_SUFFIX);

    const int w1 = params->sgr.w1;
    const coef *flt = dst;
    for (int j = 0; j < h; j++) {
        for (int i = 0; i < w; i++) {
            const int v = w1 * flt[i];
            p[i] = iclip_pixel(p[i] + ((v + (1 << 10)) >> 11));
        }
        flt += 384;
        p += PXSTRIDE(stride);
    }
}
499
/*
 * Self-guided restoration of a w x h stripe at p using both passes.
 *
 * The padded stripe is filtered twice, with n = 25 / params->sgr.s0 and
 * with n = 9 / params->sgr.s1. The two outputs, weighted by params->sgr.w0
 * and params->sgr.w1 respectively, are summed and added to the pixels in
 * place.
 */
static void sgr_mix_c(pixel *p, const ptrdiff_t stride,
                      const pixel (*const left)[4], const pixel *lpf,
                      const int w, const int h,
                      const LooprestorationParams *const params,
                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    // Padded input: at most 64 rows plus 3 rows above and below
    pixel tmp[70 * REST_UNIT_STRIDE];
    // One output buffer per pass: at most 64 rows of 384 coefficients
    coef dst0[64 * 384];
    coef dst1[64 * 384];

    padding(tmp, p, stride, left, lpf, w, h, edges);
    selfguided_filter(dst0, tmp, REST_UNIT_STRIDE, w, h, 25,
                      params->sgr.s0 HIGHBD_TAIL_SUFFIX);
    selfguided_filter(dst1, tmp, REST_UNIT_STRIDE, w, h, 9,
                      params->sgr.s1 HIGHBD_TAIL_SUFFIX);

    const int w0 = params->sgr.w0;
    const int w1 = params->sgr.w1;
    const coef *flt0 = dst0;
    const coef *flt1 = dst1;
    for (int j = 0; j < h; j++) {
        for (int i = 0; i < w; i++) {
            const int v = w0 * flt0[i] + w1 * flt1[i];
            p[i] = iclip_pixel(p[i] + ((v + (1 << 10)) >> 11));
        }
        flt0 += 384;
        flt1 += 384;
        p += PXSTRIDE(stride);
    }
}
526
527 #if HAVE_ASM
528 #if ARCH_AARCH64 || ARCH_ARM
529 #include "src/arm/looprestoration.h"
530 #elif ARCH_LOONGARCH64
531 #include "src/loongarch/looprestoration.h"
532 #elif ARCH_PPC64LE
533 #include "src/ppc/looprestoration.h"
534 #elif ARCH_X86
535 #include "src/x86/looprestoration.h"
536 #endif
537 #endif
538
bitfn(dav1d_loop_restoration_dsp_init)539 COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c,
540 const int bpc)
541 {
542 c->wiener[0] = c->wiener[1] = wiener_c;
543 c->sgr[0] = sgr_5x5_c;
544 c->sgr[1] = sgr_3x3_c;
545 c->sgr[2] = sgr_mix_c;
546
547 #if HAVE_ASM
548 #if ARCH_AARCH64 || ARCH_ARM
549 loop_restoration_dsp_init_arm(c, bpc);
550 #elif ARCH_LOONGARCH64
551 loop_restoration_dsp_init_loongarch(c, bpc);
552 #elif ARCH_PPC64LE
553 loop_restoration_dsp_init_ppc(c, bpc);
554 #elif ARCH_X86
555 loop_restoration_dsp_init_x86(c, bpc);
556 #endif
557 #endif
558 }
559