/*
 * Copyright © 2019, Luca Barbato
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/ppc/dav1d_types.h"
#include "src/ppc/cdef.h"

#if BITDEPTH == 8
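/* Vector version of the CDEF constrain() primitive:
 * sign(diff) * min(|diff|, max(0, threshold - (|diff| >> shift))).
 * A zero threshold short-circuits to an all-zero correction. */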
static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold,
                               const uint16_t shift)
{
    const i16x8 zero = vec_splat_s16(0);
    if (!threshold) return zero;
    const i16x8 abs_diff = vec_abs(diff);
    const b16x8 mask = vec_cmplt(diff, zero);
    const i16x8 thr = vec_splats(threshold);
    const i16x8 sub = vec_sub(thr, vec_sra(abs_diff, vec_splats(shift)));
    const i16x8 max = vec_max(zero, sub);
    const i16x8 min = vec_min(abs_diff, max);
    const i16x8 neg = vec_sub(zero, min);
    return vec_sel(min, neg, mask);
}

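/* Stage a 4-pixel-wide block plus a 2-pixel border into the tmp buffer
 * (row stride of 8 uint16_t), widening the 8-bit source to 16 bits.
 * Borders that are unavailable according to edges are filled with
 * INT16_MAX so the filter can recognise and skip them. */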
static inline void copy4xN(uint16_t *tmp,
                           const uint8_t *src, const ptrdiff_t src_stride,
                           const uint8_t (*left)[2], const uint8_t *const top,
                           const uint8_t *const bottom, const int w, const int h,
                           const enum CdefEdgeFlags edges)
{
    const u16x8 fill = vec_splats((uint16_t)INT16_MAX);

    u16x8 l0;
    u16x8 l1;

    int y_start = -2, y_end = h + 2;

    // Copy top and bottom first
    if (!(edges & CDEF_HAVE_TOP)) {
        l0 = fill;
        l1 = fill;
        y_start = 0;
    } else {
        l0 = u8h_to_u16(vec_vsx_ld(0, top + 0 * src_stride - 2));
        l1 = u8h_to_u16(vec_vsx_ld(0, top + 1 * src_stride - 2));
    }

    vec_st(l0, 0, tmp - 2 * 8);
    vec_st(l1, 0, tmp - 1 * 8);

    if (!(edges & CDEF_HAVE_BOTTOM)) {
        l0 = fill;
        l1 = fill;
        y_end -= 2;
    } else {
        l0 = u8h_to_u16(vec_vsx_ld(0, bottom + 0 * src_stride - 2));
        l1 = u8h_to_u16(vec_vsx_ld(0, bottom + 1 * src_stride - 2));
    }

    vec_st(l0, 0, tmp + (h + 0) * 8);
    vec_st(l1, 0, tmp + (h + 1) * 8);

    int y_with_left_edge = 0;
    if (!(edges & CDEF_HAVE_LEFT)) {
        u16x8 l = u8h_to_u16(vec_vsx_ld(0, src));
        vec_vsx_st(l, 0, tmp + 2);

        y_with_left_edge = 1;
    }

    for (int y = y_with_left_edge; y < h; y++) {
        u16x8 l = u8h_to_u16(vec_vsx_ld(0, src - 2 + y * src_stride));
        vec_st(l, 0, tmp + y * 8);
    }

    if (!(edges & CDEF_HAVE_LEFT)) {
        for (int y = y_start; y < y_end; y++) {
            tmp[y * 8] = INT16_MAX;
            tmp[1 + y * 8] = INT16_MAX;
        }
    } else {
        for (int y = 0; y < h; y++) {
            tmp[y * 8] = left[y][0];
            tmp[1 + y * 8] = left[y][1];
        }
    }
    if (!(edges & CDEF_HAVE_RIGHT)) {
        for (int y = y_start; y < y_end; y++) {
            tmp[- 2 + (y + 1) * 8] = INT16_MAX;
            tmp[- 1 + (y + 1) * 8] = INT16_MAX;
        }
    }
}

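/* As copy4xN, but for an 8-pixel-wide block: tmp has a row stride of 16
 * uint16_t and each row needs two u16x8 stores. */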
static inline void copy8xN(uint16_t *tmp,
                           const uint8_t *src, const ptrdiff_t src_stride,
                           const uint8_t (*left)[2], const uint8_t *const top,
                           const uint8_t *const bottom, const int w, const int h,
                           const enum CdefEdgeFlags edges)
{
    const u16x8 fill = vec_splats((uint16_t)INT16_MAX);

    u16x8 l0h, l0l;
    u16x8 l1h, l1l;

    int y_start = -2, y_end = h + 2;

    // Copy top and bottom first
    if (!(edges & CDEF_HAVE_TOP)) {
        l0h = fill;
        l0l = fill;
        l1h = fill;
        l1l = fill;
        y_start = 0;
    } else {
        u8x16 l0 = vec_vsx_ld(0, top + 0 * src_stride - 2);
        u8x16 l1 = vec_vsx_ld(0, top + 1 * src_stride - 2);
        l0h = u8h_to_u16(l0);
        l0l = u8l_to_u16(l0);
        l1h = u8h_to_u16(l1);
        l1l = u8l_to_u16(l1);
    }

    vec_st(l0h, 0, tmp - 4 * 8);
    vec_st(l0l, 0, tmp - 3 * 8);
    vec_st(l1h, 0, tmp - 2 * 8);
    vec_st(l1l, 0, tmp - 1 * 8);

    if (!(edges & CDEF_HAVE_BOTTOM)) {
        l0h = fill;
        l0l = fill;
        l1h = fill;
        l1l = fill;
        y_end -= 2;
    } else {
        u8x16 l0 = vec_vsx_ld(0, bottom + 0 * src_stride - 2);
        u8x16 l1 = vec_vsx_ld(0, bottom + 1 * src_stride - 2);
        l0h = u8h_to_u16(l0);
        l0l = u8l_to_u16(l0);
        l1h = u8h_to_u16(l1);
        l1l = u8l_to_u16(l1);
    }

    vec_st(l0h, 0, tmp + (h + 0) * 16);
    vec_st(l0l, 0, tmp + (h + 0) * 16 + 8);
    vec_st(l1h, 0, tmp + (h + 1) * 16);
    vec_st(l1l, 0, tmp + (h + 1) * 16 + 8);

    int y_with_left_edge = 0;
    if (!(edges & CDEF_HAVE_LEFT)) {
        u8x16 l = vec_vsx_ld(0, src);
        u16x8 lh = u8h_to_u16(l);
        u16x8 ll = u8l_to_u16(l);
        vec_vsx_st(lh, 0, tmp + 2);
        vec_vsx_st(ll, 0, tmp + 8 + 2);

        y_with_left_edge = 1;
    }

    for (int y = y_with_left_edge; y < h; y++) {
        u8x16 l = vec_vsx_ld(0, src - 2 + y * src_stride);
        u16x8 lh = u8h_to_u16(l);
        u16x8 ll = u8l_to_u16(l);
        vec_st(lh, 0, tmp + y * 16);
        vec_st(ll, 0, tmp + 8 + y * 16);
    }

    if (!(edges & CDEF_HAVE_LEFT)) {
        for (int y = y_start; y < y_end; y++) {
            tmp[y * 16] = INT16_MAX;
            tmp[1 + y * 16] = INT16_MAX;
        }
    } else {
        for (int y = 0; y < h; y++) {
            tmp[y * 16] = left[y][0];
            tmp[1 + y * 16] = left[y][1];
        }
    }
    if (!(edges & CDEF_HAVE_RIGHT)) {
        for (int y = y_start; y < y_end; y++) {
            tmp[- 6 + (y + 1) * 16] = INT16_MAX;
            tmp[- 5 + (y + 1) * 16] = INT16_MAX;
        }
    }
}

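/* max() that ignores the INT16_MAX edge fill: a lane of a equal to
 * INT16_MAX came from a missing edge, so b is used for that lane instead
 * of letting the fill value win the comparison. */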
static inline i16x8 max_mask(i16x8 a, i16x8 b) {
    const i16x8 I16X8_INT16_MAX = vec_splats((int16_t)INT16_MAX);

    const b16x8 mask = vec_cmpeq(a, I16X8_INT16_MAX);

    const i16x8 val = vec_sel(a, b, mask);

    return vec_max(val, b);
}

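/* Load the centre pixels into px and zero the accumulator.  LOAD_PIX
 * reads one 8-wide row; LOAD_PIX4 packs one 4-wide row (plus its border)
 * into each half of the vector via vec_xxpermdi, so two rows are
 * filtered per iteration. */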
#define LOAD_PIX(addr) \
    const i16x8 px = (i16x8)vec_vsx_ld(0, addr); \
    i16x8 sum = vec_splat_s16(0);

#define LOAD_PIX4(addr) \
    const i16x8 a = (i16x8)vec_vsx_ld(0, addr); \
    const i16x8 b = (i16x8)vec_vsx_ld(0, addr + 8); \
    const i16x8 px = vec_xxpermdi(a, b, 0); \
    i16x8 sum = vec_splat_s16(0);

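/* Load the four tap pixels of one pass: the two offsets o0/o1 along the
 * filter direction plus their mirrored counterparts (addr - o0/o1). */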
#define LOAD_DIR(p, addr, o0, o1) \
    const i16x8 p ## 0 = (i16x8)vec_vsx_ld(0, addr + o0); \
    const i16x8 p ## 1 = (i16x8)vec_vsx_ld(0, addr - o0); \
    const i16x8 p ## 2 = (i16x8)vec_vsx_ld(0, addr + o1); \
    const i16x8 p ## 3 = (i16x8)vec_vsx_ld(0, addr - o1);

#define LOAD_DIR4(p, addr, o0, o1) \
    LOAD_DIR(p ## a, addr, o0, o1) \
    LOAD_DIR(p ## b, addr + 8, o0, o1) \
    const i16x8 p ## 0 = vec_xxpermdi(p ## a ## 0, p ## b ## 0, 0); \
    const i16x8 p ## 1 = vec_xxpermdi(p ## a ## 1, p ## b ## 1, 0); \
    const i16x8 p ## 2 = vec_xxpermdi(p ## a ## 2, p ## b ## 2, 0); \
    const i16x8 p ## 3 = vec_xxpermdi(p ## a ## 3, p ## b ## 3, 0);

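/* Constrained differences between each tap pixel and the centre pixel:
 * p_cN = vconstrain(pN - px, strength, shift). */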
#define CONSTRAIN(p, strength, shift) \
    const i16x8 p ## _d0 = vec_sub(p ## 0, px); \
    const i16x8 p ## _d1 = vec_sub(p ## 1, px); \
    const i16x8 p ## _d2 = vec_sub(p ## 2, px); \
    const i16x8 p ## _d3 = vec_sub(p ## 3, px); \
\
    i16x8 p ## _c0 = vconstrain(p ## _d0, strength, shift); \
    i16x8 p ## _c1 = vconstrain(p ## _d1, strength, shift); \
    i16x8 p ## _c2 = vconstrain(p ## _d2, strength, shift); \
    i16x8 p ## _c3 = vconstrain(p ## _d3, strength, shift);

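/* Track the running min/max over the centre pixel and every tap pixel;
 * the max side goes through max_mask so the INT16_MAX edge fill never
 * widens the clamping range. */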
#define SETUP_MINMAX \
    i16x8 max = px; \
    i16x8 min = px;

#define MIN_MAX(p) \
    max = max_mask(p ## 0, max); \
    min = vec_min(p ## 0, min); \
    max = max_mask(p ## 1, max); \
    min = vec_min(p ## 1, min); \
    max = max_mask(p ## 2, max); \
    min = vec_min(p ## 2, min); \
    max = max_mask(p ## 3, max); \
    min = vec_min(p ## 3, min);

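/* Primary tap weights: (4, 2) when the bitdepth-scaled primary strength
 * is even, (3, 3) when it is odd, matching the CDEF primary taps. */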
#define MAKE_TAPS \
    const int16_t tap_odd = (pri_strength >> bitdepth_min_8) & 1; \
    const i16x8 tap0 = vec_splats((int16_t)(4 - tap_odd)); \
    const i16x8 tap1 = vec_splats((int16_t)(2 + tap_odd));

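/* Accumulate the weighted, constrained differences: the primary pass
 * uses tap0/tap1, secondary pass 1 taps weigh 2 and secondary pass 2
 * taps weigh 1, matching the CDEF secondary taps. */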
#define PRI_0_UPDATE_SUM(p) \
    sum = vec_madd(tap0, p ## _c0, sum); \
    sum = vec_madd(tap0, p ## _c1, sum); \
    sum = vec_madd(tap1, p ## _c2, sum); \
    sum = vec_madd(tap1, p ## _c3, sum);

#define UPDATE_SUM(p) \
    const i16x8 p ## sum0 = vec_add(p ## _c0, p ## _c1); \
    const i16x8 p ## sum1 = vec_add(p ## _c2, p ## _c3); \
    sum = vec_add(sum, p ## sum0); \
    sum = vec_add(sum, p ## sum1);

#define SEC_0_UPDATE_SUM(p) \
    sum = vec_madd(vec_splat_s16(2), p ## _c0, sum); \
    sum = vec_madd(vec_splat_s16(2), p ## _c1, sum); \
    sum = vec_madd(vec_splat_s16(2), p ## _c2, sum); \
    sum = vec_madd(vec_splat_s16(2), p ## _c3, sum);

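/* Rounding bias for the final (sum + bias) >> 4: 8, reduced to 7 for
 * negative sums so the shift rounds towards zero. */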
#define BIAS \
    i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1)); \
    bias = vec_sub(vec_splat_s16(8), bias);

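/* Write back the filtered pixels: dst = px + ((sum + bias) >> 4).  The
 * _CLAMPED variants additionally clip the result to the [min, max] range
 * gathered from the tap pixels.  STORE4 emits two 4-pixel rows per
 * iteration, STORE8 one 8-pixel row. */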
#define STORE4 \
    dst[0] = vdst[0]; \
    dst[1] = vdst[1]; \
    dst[2] = vdst[2]; \
    dst[3] = vdst[3]; \
\
    tmp += 8; \
    dst += PXSTRIDE(dst_stride); \
    dst[0] = vdst[4]; \
    dst[1] = vdst[5]; \
    dst[2] = vdst[6]; \
    dst[3] = vdst[7]; \
\
    tmp += 8; \
    dst += PXSTRIDE(dst_stride);

#define STORE4_CLAMPED \
    BIAS \
    i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
    i16x8 vdst = vec_max(vec_min(unclamped, max), min); \
    STORE4

#define STORE4_UNCLAMPED \
    BIAS \
    i16x8 vdst = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
    STORE4

#define STORE8 \
    dst[0] = vdst[0]; \
    dst[1] = vdst[1]; \
    dst[2] = vdst[2]; \
    dst[3] = vdst[3]; \
    dst[4] = vdst[4]; \
    dst[5] = vdst[5]; \
    dst[6] = vdst[6]; \
    dst[7] = vdst[7]; \
\
    tmp += 16; \
    dst += PXSTRIDE(dst_stride);

#define STORE8_CLAMPED \
    BIAS \
    i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
    i16x8 vdst = vec_max(vec_min(unclamped, max), min); \
    STORE8

#define STORE8_UNCLAMPED \
    BIAS \
    i16x8 vdst = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
    STORE8

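/* Per-direction tap offsets into the tmp buffer (in uint16_t units), two
 * taps per direction, for a row stride of 8 (4-wide) or 16 (8-wide). */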
#define DIRECTIONS(w, tmp_stride) \
    static const int8_t cdef_directions##w[8 /* dir */][2 /* pass */] = { \
        { -1 * tmp_stride + 1, -2 * tmp_stride + 2 }, \
        {  0 * tmp_stride + 1, -1 * tmp_stride + 2 }, \
        {  0 * tmp_stride + 1,  0 * tmp_stride + 2 }, \
        {  0 * tmp_stride + 1,  1 * tmp_stride + 2 }, \
        {  1 * tmp_stride + 1,  2 * tmp_stride + 2 }, \
        {  1 * tmp_stride + 0,  2 * tmp_stride + 1 }, \
        {  1 * tmp_stride + 0,  2 * tmp_stride + 0 }, \
        {  1 * tmp_stride + 0,  2 * tmp_stride - 1 } \
    };

DIRECTIONS(4, 8)
DIRECTIONS(8, 16)

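/* Combined primary + secondary filter for a 4-pixel-wide block; two rows
 * are processed per loop iteration. */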
static inline void
filter_4xN(pixel *dst, const ptrdiff_t dst_stride,
           const pixel (*left)[2], const pixel *const top,
           const pixel *const bottom, const int w, const int h,
           const int pri_strength, const int sec_strength, const int dir,
           const int pri_shift, const int sec_shift,
           const enum CdefEdgeFlags edges, uint16_t *tmp)
{
    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
    const int off1 = cdef_directions4[dir][0];
    const int off1_1 = cdef_directions4[dir][1];

    const int off2 = cdef_directions4[(dir + 2) & 7][0];
    const int off3 = cdef_directions4[(dir + 6) & 7][0];

    const int off2_1 = cdef_directions4[(dir + 2) & 7][1];
    const int off3_1 = cdef_directions4[(dir + 6) & 7][1];

    MAKE_TAPS

    for (int y = 0; y < h / 2; y++) {
        LOAD_PIX4(tmp)

        SETUP_MINMAX

        // Primary pass
        LOAD_DIR4(p, tmp, off1, off1_1)

        CONSTRAIN(p, pri_strength, pri_shift)

        MIN_MAX(p)

        PRI_0_UPDATE_SUM(p)

        // Secondary pass 1
        LOAD_DIR4(s, tmp, off2, off3)

        CONSTRAIN(s, sec_strength, sec_shift)

        MIN_MAX(s)

        SEC_0_UPDATE_SUM(s)

        // Secondary pass 2
        LOAD_DIR4(s2, tmp, off2_1, off3_1)

        CONSTRAIN(s2, sec_strength, sec_shift)

        MIN_MAX(s2)

        UPDATE_SUM(s2)

        // Store
        STORE4_CLAMPED
    }
}

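/* Primary-only filter for a 4-pixel-wide block; the single-pass result
 * is stored without min/max clamping. */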
static inline void
filter_4xN_pri(pixel *dst, const ptrdiff_t dst_stride,
               const pixel (*left)[2], const pixel *const top,
               const pixel *const bottom, const int w, const int h,
               const int pri_strength, const int dir,
               const int pri_shift, const enum CdefEdgeFlags edges,
               uint16_t *tmp)
{
    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
    const int off1 = cdef_directions4[dir][0];
    const int off1_1 = cdef_directions4[dir][1];

    MAKE_TAPS

    for (int y = 0; y < h / 2; y++) {
        LOAD_PIX4(tmp)

        // Primary pass
        LOAD_DIR4(p, tmp, off1, off1_1)

        CONSTRAIN(p, pri_strength, pri_shift)

        PRI_0_UPDATE_SUM(p)

        STORE4_UNCLAMPED
    }
}

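/* Secondary-only filter for a 4-pixel-wide block; the result is likewise
 * stored unclamped. */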
static inline void
filter_4xN_sec(pixel *dst, const ptrdiff_t dst_stride,
               const pixel (*left)[2], const pixel *const top,
               const pixel *const bottom, const int w, const int h,
               const int sec_strength, const int dir,
               const int sec_shift, const enum CdefEdgeFlags edges,
               uint16_t *tmp)
{
    const int off2 = cdef_directions4[(dir + 2) & 7][0];
    const int off3 = cdef_directions4[(dir + 6) & 7][0];

    const int off2_1 = cdef_directions4[(dir + 2) & 7][1];
    const int off3_1 = cdef_directions4[(dir + 6) & 7][1];

    for (int y = 0; y < h / 2; y++) {
        LOAD_PIX4(tmp)

        // Secondary pass 1
        LOAD_DIR4(s, tmp, off2, off3)

        CONSTRAIN(s, sec_strength, sec_shift)

        SEC_0_UPDATE_SUM(s)

        // Secondary pass 2
        LOAD_DIR4(s2, tmp, off2_1, off3_1)

        CONSTRAIN(s2, sec_strength, sec_shift)

        UPDATE_SUM(s2)

        STORE4_UNCLAMPED
    }
}

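/* 8-pixel-wide variants of the three filters above, processing one row
 * per loop iteration. */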
static inline void
filter_8xN(pixel *dst, const ptrdiff_t dst_stride,
           const pixel (*left)[2], const pixel *const top,
           const pixel *const bottom, const int w, const int h,
           const int pri_strength, const int sec_strength, const int dir,
           const int pri_shift, const int sec_shift, const enum CdefEdgeFlags edges,
           uint16_t *tmp)
{
    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;

    const int off1 = cdef_directions8[dir][0];
    const int off1_1 = cdef_directions8[dir][1];

    const int off2 = cdef_directions8[(dir + 2) & 7][0];
    const int off3 = cdef_directions8[(dir + 6) & 7][0];

    const int off2_1 = cdef_directions8[(dir + 2) & 7][1];
    const int off3_1 = cdef_directions8[(dir + 6) & 7][1];

    MAKE_TAPS

    for (int y = 0; y < h; y++) {
        LOAD_PIX(tmp)

        SETUP_MINMAX

        // Primary pass
        LOAD_DIR(p, tmp, off1, off1_1)

        CONSTRAIN(p, pri_strength, pri_shift)

        MIN_MAX(p)

        PRI_0_UPDATE_SUM(p)

        // Secondary pass 1
        LOAD_DIR(s, tmp, off2, off3)

        CONSTRAIN(s, sec_strength, sec_shift)

        MIN_MAX(s)

        SEC_0_UPDATE_SUM(s)

        // Secondary pass 2
        LOAD_DIR(s2, tmp, off2_1, off3_1)

        CONSTRAIN(s2, sec_strength, sec_shift)

        MIN_MAX(s2)

        UPDATE_SUM(s2)

        // Store
        STORE8_CLAMPED
    }
}

static inline void
filter_8xN_pri(pixel *dst, const ptrdiff_t dst_stride,
               const pixel (*left)[2], const pixel *const top,
               const pixel *const bottom, const int w, const int h,
               const int pri_strength, const int dir,
               const int pri_shift, const enum CdefEdgeFlags edges,
               uint16_t *tmp)
{
    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
    const int off1 = cdef_directions8[dir][0];
    const int off1_1 = cdef_directions8[dir][1];

    MAKE_TAPS

    for (int y = 0; y < h; y++) {
        LOAD_PIX(tmp)

        // Primary pass
        LOAD_DIR(p, tmp, off1, off1_1)

        CONSTRAIN(p, pri_strength, pri_shift)

        PRI_0_UPDATE_SUM(p)

        STORE8_UNCLAMPED
    }
}

static inline void
filter_8xN_sec(pixel *dst, const ptrdiff_t dst_stride,
               const pixel (*left)[2], const pixel *const top,
               const pixel *const bottom, const int w, const int h,
               const int sec_strength, const int dir,
               const int sec_shift, const enum CdefEdgeFlags edges,
               uint16_t *tmp)
{
    const int off2 = cdef_directions8[(dir + 2) & 7][0];
    const int off3 = cdef_directions8[(dir + 6) & 7][0];

    const int off2_1 = cdef_directions8[(dir + 2) & 7][1];
    const int off3_1 = cdef_directions8[(dir + 6) & 7][1];

    for (int y = 0; y < h; y++) {
        LOAD_PIX(tmp)

        // Secondary pass 1
        LOAD_DIR(s, tmp, off2, off3)

        CONSTRAIN(s, sec_strength, sec_shift)

        SEC_0_UPDATE_SUM(s)

        // Secondary pass 2
        LOAD_DIR(s2, tmp, off2_1, off3_1)

        CONSTRAIN(s2, sec_strength, sec_shift)

        UPDATE_SUM(s2)

        STORE8_UNCLAMPED
    }
}

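/* Public entry points: stage the source block and its borders into a
 * padded scratch buffer, then dispatch on which strengths are non-zero.
 * Only the combined primary + secondary path clamps its output to the
 * local [min, max] range. */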
#define cdef_fn(w, h, tmp_stride) \
void dav1d_cdef_filter_##w##x##h##_vsx(pixel *const dst, \
                                       const ptrdiff_t dst_stride, \
                                       const pixel (*left)[2], \
                                       const pixel *const top, \
                                       const pixel *const bottom, \
                                       const int pri_strength, \
                                       const int sec_strength, \
                                       const int dir, \
                                       const int damping, \
                                       const enum CdefEdgeFlags edges) \
{ \
    ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \
    uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \
    copy##w##xN(tmp - 2, dst, dst_stride, left, top, bottom, w, h, edges); \
    if (pri_strength) { \
        const int pri_shift = imax(0, damping - ulog2(pri_strength)); \
        if (sec_strength) { \
            const int sec_shift = damping - ulog2(sec_strength); \
            filter_##w##xN(dst, dst_stride, left, top, bottom, w, h, pri_strength, \
                           sec_strength, dir, pri_shift, sec_shift, edges, tmp); \
        } else { \
            filter_##w##xN_pri(dst, dst_stride, left, top, bottom, w, h, pri_strength, \
                               dir, pri_shift, edges, tmp); \
        } \
    } else { \
        const int sec_shift = damping - ulog2(sec_strength); \
        filter_##w##xN_sec(dst, dst_stride, left, top, bottom, w, h, sec_strength, \
                           dir, sec_shift, edges, tmp); \
    } \
}

cdef_fn(4, 4, 8);
cdef_fn(4, 8, 8);
cdef_fn(8, 8, 16);
#endif