xref: /aosp_15_r20/external/libdav1d/src/arm/64/cdef.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1/*
2 * Copyright © 2018, VideoLAN and dav1d authors
3 * Copyright © 2019, Martin Storsjo
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice, this
10 *    list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 *    this list of conditions and the following disclaimer in the documentation
14 *    and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "src/arm/asm.S"
29#include "util.S"
30#include "cdef_tmpl.S"
31
// Pad one two-row strip (the top or bottom edge rows) of the 16-bit
// intermediate buffer, widening 8-bit source pixels to 16 bits.
// Positions with no source pixel (missing left/right edges) are filled
// from s31, which the caller (padding_func) has set to the 0x8000
// sentinel.
//   s1, s2: pointers to the two source rows
//   w:      block width in pixels (4 or 8)
//   stride: buffer stride in 16-bit pixels
//   rn:     narrow register prefix (s/d) for the 8-bit loads
//   rw:     wide register prefix (d/q) for the 16-bit stores
//   ret:    1 = return from the function afterwards,
//           0 = advance x0 and fall through past local label 3
// w7 holds the CdefEdgeFlags bitmask; x0 points at the output rows.
.macro pad_top_bottom s1, s2, w, stride, rn, rw, ret
        tst             w7,  #1 // CDEF_HAVE_LEFT
        b.eq            2f
        // CDEF_HAVE_LEFT
        // Step the source pointers back 2 pixels so the main load also
        // picks up the left edge.
        sub             \s1,  \s1,  #2
        sub             \s2,  \s2,  #2
        tst             w7,  #2 // CDEF_HAVE_RIGHT
        b.eq            1f
        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
        // Per row: w pixels in \rn\()N, plus 4 trailing pixels (last 2
        // of the row + 2 right-edge pixels) in the s register.
        ldr             \rn\()0, [\s1]
        ldr             s1,      [\s1, #\w]
        ldr             \rn\()2, [\s2]
        ldr             s3,      [\s2, #\w]
        uxtl            v0.8h,   v0.8b
        uxtl            v1.8h,   v1.8b
        uxtl            v2.8h,   v2.8b
        uxtl            v3.8h,   v3.8b
        str             \rw\()0, [x0]
        str             d1,      [x0, #2*\w]
        add             x0,  x0,  #2*\stride
        str             \rw\()2, [x0]
        str             d3,      [x0, #2*\w]
.if \ret
        ret
.else
        add             x0,  x0,  #2*\stride
        b               3f
.endif

1:
        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        // Only 2 valid trailing pixels (h register); pad the missing
        // right edge with the sentinel from s31.
        ldr             \rn\()0, [\s1]
        ldr             h1,      [\s1, #\w]
        ldr             \rn\()2, [\s2]
        ldr             h3,      [\s2, #\w]
        uxtl            v0.8h,   v0.8b
        uxtl            v1.8h,   v1.8b
        uxtl            v2.8h,   v2.8b
        uxtl            v3.8h,   v3.8b
        str             \rw\()0, [x0]
        str             s1,      [x0, #2*\w]
        str             s31,     [x0, #2*\w+4]
        add             x0,  x0,  #2*\stride
        str             \rw\()2, [x0]
        str             s3,      [x0, #2*\w]
        str             s31,     [x0, #2*\w+4]
.if \ret
        ret
.else
        add             x0,  x0,  #2*\stride
        b               3f
.endif

2:
        // !CDEF_HAVE_LEFT
        tst             w7,  #2 // CDEF_HAVE_RIGHT
        b.eq            1f
        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
        // Sentinel on the left; the widened pixels start 4 bytes in,
        // so stur handles the unaligned-immediate stores.
        ldr             \rn\()0, [\s1]
        ldr             h1,      [\s1, #\w]
        ldr             \rn\()2, [\s2]
        ldr             h3,      [\s2, #\w]
        uxtl            v0.8h,  v0.8b
        uxtl            v1.8h,  v1.8b
        uxtl            v2.8h,  v2.8b
        uxtl            v3.8h,  v3.8b
        str             s31, [x0]
        stur            \rw\()0, [x0, #4]
        str             s1,      [x0, #4+2*\w]
        add             x0,  x0,  #2*\stride
        str             s31, [x0]
        stur            \rw\()2, [x0, #4]
        str             s3,      [x0, #4+2*\w]
.if \ret
        ret
.else
        add             x0,  x0,  #2*\stride
        b               3f
.endif

1:
        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        // Sentinel on both sides; only the w middle pixels are real.
        ldr             \rn\()0, [\s1]
        ldr             \rn\()1, [\s2]
        uxtl            v0.8h,  v0.8b
        uxtl            v1.8h,  v1.8b
        str             s31,     [x0]
        stur            \rw\()0, [x0, #4]
        str             s31,     [x0, #4+2*\w]
        add             x0,  x0,  #2*\stride
        str             s31,     [x0]
        stur            \rw\()1, [x0, #4]
        str             s31,     [x0, #4+2*\w]
.if \ret
        ret
.else
        add             x0,  x0,  #2*\stride
.endif
3:
.endm
132
// Load w source pixels into dst and post-increment src by incr:
// a single 32-bit lane for w == 4, a full 8-byte vector for w == 8.
.macro load_n_incr dst, src, incr, w
.if \w == 4
        ld1             {\dst\().s}[0], [\src], \incr
.else
        ld1             {\dst\().8b},   [\src], \incr
.endif
.endm
140
141// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
142//                                    ptrdiff_t src_stride, const pixel (*left)[2],
143//                                    const pixel *const top,
144//                                    const pixel *const bottom, int h,
145//                                    enum CdefEdgeFlags edges);
146
// Expand an 8 bpc source block plus its 2-pixel border into the padded
// 16-bit temporary buffer.  Register use follows the prototype comment
// above:
//   x0 = tmp, x1 = src, x2 = src_stride, x3 = left,
//   x4 = top, x5 = bottom, w6 = h, w7 = edges
.macro padding_func w, stride, rn, rw
function cdef_padding\w\()_8bpc_neon, export=1
        cmp             w7,  #0xf // fully edged
        b.eq            cdef_padding\w\()_edged_8bpc_neon // all pixels valid: take the 8-bit fast path
        // v30/v31 = 0x8000 per lane; sentinel used for missing pixels.
        movi            v30.8h,  #0x80, lsl #8
        mov             v31.16b, v30.16b
        // Rewind x0 to the top-left corner of the padded area
        // (2 rows up, 2 pixels left, in 16-bit units).
        sub             x0,  x0,  #2*(2*\stride+2)
        tst             w7,  #4 // CDEF_HAVE_TOP
        b.ne            1f
        // !CDEF_HAVE_TOP
        // Fill both top rows (2*stride halfwords) with the sentinel.
        st1             {v30.8h, v31.8h}, [x0], #32
.if \w == 8
        st1             {v30.8h, v31.8h}, [x0], #32
.endif
        b               3f
1:
        // CDEF_HAVE_TOP
        add             x9,  x4,  x2
        pad_top_bottom  x4,  x9, \w, \stride, \rn, \rw, 0

        // Middle section
3:
        tst             w7,  #1 // CDEF_HAVE_LEFT
        b.eq            2f
        // CDEF_HAVE_LEFT
        tst             w7,  #2 // CDEF_HAVE_RIGHT
        b.eq            1f
        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
        // Per row: 2 pixels from the left[] array, w from src, and 2
        // right-edge pixels read just past the row.
        ld1             {v0.h}[0], [x3], #2
        ldr             h2,      [x1, #\w]
        load_n_incr     v1,  x1,  x2,  \w
        subs            w6,  w6,  #1
        uxtl            v0.8h,  v0.8b
        uxtl            v1.8h,  v1.8b
        uxtl            v2.8h,  v2.8b
        str             s0,      [x0]
        stur            \rw\()1, [x0, #4]
        str             s2,      [x0, #4+2*\w]
        add             x0,  x0,  #2*\stride
        b.gt            0b
        b               3f
1:
        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        ld1             {v0.h}[0], [x3], #2
        load_n_incr     v1,  x1,  x2,  \w
        subs            w6,  w6,  #1
        uxtl            v0.8h,  v0.8b
        uxtl            v1.8h,  v1.8b
        str             s0,      [x0]
        stur            \rw\()1, [x0, #4]
        str             s31,     [x0, #4+2*\w] // sentinel on the right
        add             x0,  x0,  #2*\stride
        b.gt            1b
        b               3f
2:
        tst             w7,  #2 // CDEF_HAVE_RIGHT
        b.eq            1f
        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
        ldr             h1,      [x1, #\w]
        load_n_incr     v0,  x1,  x2,  \w
        subs            w6,  w6,  #1
        uxtl            v0.8h,  v0.8b
        uxtl            v1.8h,  v1.8b
        str             s31,     [x0] // sentinel on the left
        stur            \rw\()0, [x0, #4]
        str             s1,      [x0, #4+2*\w]
        add             x0,  x0,  #2*\stride
        b.gt            0b
        b               3f
1:
        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        load_n_incr     v0,  x1,  x2,  \w
        subs            w6,  w6,  #1
        uxtl            v0.8h,  v0.8b
        str             s31,     [x0] // sentinel on both sides
        stur            \rw\()0, [x0, #4]
        str             s31,     [x0, #4+2*\w]
        add             x0,  x0,  #2*\stride
        b.gt            1b

3:
        tst             w7,  #8 // CDEF_HAVE_BOTTOM
        b.ne            1f
        // !CDEF_HAVE_BOTTOM
        // Fill both bottom rows with the sentinel.
        st1             {v30.8h, v31.8h}, [x0], #32
.if \w == 8
        st1             {v30.8h, v31.8h}, [x0], #32
.endif
        ret
1:
        // CDEF_HAVE_BOTTOM
        add             x9,  x5,  x2
        pad_top_bottom  x5,  x9, \w, \stride, \rn, \rw, 1
endfunc
.endm
244
// tmp is 16-bit here: w=8 uses a 16-pixel stride with d/q registers,
// w=4 an 8-pixel stride with s/d registers.
padding_func 8, 16, d, q
padding_func 4, 8,  s, d
247
248// void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src,
249//                                    ptrdiff_t src_stride, const pixel (*left)[2],
250//                                    const pixel *const top,
251//                                    const pixel *const bottom, int h,
252//                                    enum CdefEdgeFlags edges);
253
// Fast-path padding for fully-edged blocks: every neighbouring pixel
// exists, so the temporary buffer stays 8-bit and no sentinel filling
// is needed.  Same argument registers as the 16-bit padding above
// (x0 = tmp, x1 = src, x2 = src_stride, x3 = left, x4 = top,
// x5 = bottom, w6 = h); reg is the register prefix (s/d) covering w
// pixels.
.macro padding_func_edged w, stride, reg
function cdef_padding\w\()_edged_8bpc_neon, export=1
        // Step back 2 pixels to include the left edge of the
        // top/bottom rows; likewise shift tmp to the padded corner.
        sub             x4,  x4,  #2
        sub             x5,  x5,  #2
        sub             x0,  x0,  #(2*\stride+2)

        // Copy the two top rows (2+w+2 pixels each).
.if \w == 4
        ldr             d0, [x4]
        ldr             d1, [x4, x2]
        st1             {v0.8b, v1.8b}, [x0], #16
.else
        add             x9,  x4,  x2
        ldr             d0, [x4]
        ldr             s1, [x4, #8]
        ldr             d2, [x9]
        ldr             s3, [x9, #8]
        str             d0, [x0]
        str             s1, [x0, #8]
        str             d2, [x0, #\stride]
        str             s3, [x0, #\stride+8]
        add             x0,  x0,  #2*\stride
.endif

0:
        // Middle rows: 2 pixels from left[], w from src, 2 read just
        // past the row end for the right edge.
        ld1             {v0.h}[0], [x3], #2
        ldr             h2,      [x1, #\w]
        load_n_incr     v1,  x1,  x2,  \w
        subs            w6,  w6,  #1
        str             h0,      [x0]
        stur            \reg\()1, [x0, #2]
        str             h2,      [x0, #2+\w]
        add             x0,  x0,  #\stride
        b.gt            0b

        // Copy the two bottom rows.
.if \w == 4
        ldr             d0, [x5]
        ldr             d1, [x5, x2]
        st1             {v0.8b, v1.8b}, [x0], #16
.else
        add             x9,  x5,  x2
        ldr             d0, [x5]
        ldr             s1, [x5, #8]
        ldr             d2, [x9]
        ldr             s3, [x9, #8]
        str             d0, [x0]
        str             s1, [x0, #8]
        str             d2, [x0, #\stride]
        str             s3, [x0, #\stride+8]
.endif
        ret
endfunc
.endm
306
// 8-bit tmp: w=8 uses a 16-byte stride (d regs), w=4 an 8-byte stride
// (s regs).
padding_func_edged 8, 16, d
padding_func_edged 4, 8,  s
309
// Emit the shared lookup tables and the generic (non-edged) 8 bpc
// filter and direction functions from cdef_tmpl.S.
tables

filter 8, 8
filter 4, 8

find_dir 8
316
// Load one tap's pixel pair for the edged (8-bit) filter:
// d1 = pixels at x + off (p0), d2 = pixels at x - off (p1), for every
// row handled per iteration (2 rows of 8 for w == 8, 4 rows of 4 for
// w == 4).  w9 holds the signed byte offset from the direction table;
// x2 points at the current tmp row.  Clobbers x6 and x9.
.macro load_px_8 d1, d2, w
.if \w == 8
        add             x6,  x2,  w9, sxtb          // x + off
        sub             x9,  x2,  w9, sxtb          // x - off
        ld1             {\d1\().d}[0], [x6]         // p0
        add             x6,  x6,  #16               // += stride
        ld1             {\d2\().d}[0], [x9]         // p1
        add             x9,  x9,  #16               // += stride
        ld1             {\d1\().d}[1], [x6]         // p0
        ld1             {\d2\().d}[1], [x9]         // p1
.else
        add             x6,  x2,  w9, sxtb          // x + off
        sub             x9,  x2,  w9, sxtb          // x - off
        ld1             {\d1\().s}[0], [x6]         // p0
        add             x6,  x6,  #8                // += stride
        ld1             {\d2\().s}[0], [x9]         // p1
        add             x9,  x9,  #8                // += stride
        ld1             {\d1\().s}[1], [x6]         // p0
        add             x6,  x6,  #8                // += stride
        ld1             {\d2\().s}[1], [x9]         // p1
        add             x9,  x9,  #8                // += stride
        ld1             {\d1\().s}[2], [x6]         // p0
        add             x6,  x6,  #8                // += stride
        ld1             {\d2\().s}[2], [x9]         // p1
        add             x9,  x9,  #8                // += stride
        ld1             {\d1\().s}[3], [x6]         // p0
        ld1             {\d2\().s}[3], [x9]         // p1
.endif
.endm
// Accumulate one tap's contribution for both pixel vectors:
//   sum += taps[k] * constrain(pN - px, threshold, shift)
// s1/s2 are the p0/p1 vectors from load_px_8, thresh_vec the per-lane
// threshold, shift the negated shift amount (ushl with a negative
// count shifts right), tap the tap weight.  If min != 0, also track
// the per-lane min/max of all read pixels in v3/v4 for the final
// clip.  v0 holds px; the sum is kept split across v1 and v2.
.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
.if \min
        umin            v3.16b,  v3.16b,  \s1\().16b
        umax            v4.16b,  v4.16b,  \s1\().16b
        umin            v3.16b,  v3.16b,  \s2\().16b
        umax            v4.16b,  v4.16b,  \s2\().16b
.endif
        uabd            v16.16b, v0.16b,  \s1\().16b  // abs(diff)
        uabd            v20.16b, v0.16b,  \s2\().16b  // abs(diff)
        ushl            v17.16b, v16.16b, \shift      // abs(diff) >> shift
        ushl            v21.16b, v20.16b, \shift      // abs(diff) >> shift
        uqsub           v17.16b, \thresh_vec, v17.16b // clip = imax(0, threshold - (abs(diff) >> shift))
        uqsub           v21.16b, \thresh_vec, v21.16b // clip = imax(0, threshold - (abs(diff) >> shift))
        cmhi            v18.16b, v0.16b,  \s1\().16b  // px > p0
        cmhi            v22.16b, v0.16b,  \s2\().16b  // px > p1
        umin            v17.16b, v17.16b, v16.16b     // imin(abs(diff), clip)
        umin            v21.16b, v21.16b, v20.16b     // imin(abs(diff), clip)
        dup             v19.16b, \tap                 // taps[k]
        neg             v16.16b, v17.16b              // -imin()
        neg             v20.16b, v21.16b              // -imin()
        bsl             v18.16b, v16.16b, v17.16b     // constrain() = apply_sign()
        bsl             v22.16b, v20.16b, v21.16b     // constrain() = apply_sign()
        mla             v1.16b,  v18.16b, v19.16b     // sum += taps[k] * constrain()
        mla             v2.16b,  v22.16b, v19.16b     // sum += taps[k] * constrain()
.endm
371
372// void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
373//                                   const uint8_t *tmp, int pri_strength,
374//                                   int sec_strength, int dir, int damping,
375//                                   int h);
// Edged-path CDEF filter: all source pixels are valid (no 0x8000
// sentinels), so the whole filter runs on 8-bit data.
//   x0 = dst, x1 = dst_stride, x2 = tmp, w3 = pri_strength,
//   w4 = sec_strength, w5 = dir, w6 = damping, w7 = h
// pri/sec/min select which strength terms are compiled in; suffix
// names the generated function variant.
.macro filter_func_8 w, pri, sec, min, suffix
function cdef_filter\w\suffix\()_edged_8bpc_neon
.if \pri
        movrel          x8,  pri_taps
        and             w9,  w3,  #1
        add             x8,  x8,  w9, uxtw #1       // pri_taps + 2 * (pri_strength & 1)
.endif
        movrel          x9,  directions\w
        add             x5,  x9,  w5, uxtw #1       // x5 = &directions[dir]
        movi            v30.8b,  #7                 // 7 - clz(x) = ulog2(x) for 8-bit x
        dup             v28.8b,  w6                 // damping

.if \pri
        dup             v25.16b, w3                 // threshold
.endif
.if \sec
        dup             v27.16b, w4                 // threshold
.endif
        trn1            v24.8b,  v25.8b, v27.8b     // lane 0 = pri threshold, lane 1 = sec
        clz             v24.8b,  v24.8b             // clz(threshold)
        sub             v24.8b,  v30.8b, v24.8b     // ulog2(threshold)
        uqsub           v24.8b,  v28.8b, v24.8b     // shift = imax(0, damping - ulog2(threshold))
        neg             v24.8b,  v24.8b             // -shift
.if \sec
        dup             v26.16b, v24.b[1]           // -shift for sec
.endif
.if \pri
        dup             v24.16b, v24.b[0]           // -shift for pri
.endif

1:
.if \w == 8
        add             x12, x2,  #16
        ld1             {v0.d}[0], [x2]             // px
        ld1             {v0.d}[1], [x12]            // px
.else
        add             x12, x2,  #1*8
        add             x13, x2,  #2*8
        add             x14, x2,  #3*8
        ld1             {v0.s}[0], [x2]             // px
        ld1             {v0.s}[1], [x12]            // px
        ld1             {v0.s}[2], [x13]            // px
        ld1             {v0.s}[3], [x14]            // px
.endif

        // We need 9-bits or two 8-bit accumulators to fit the sum.
        // Max of |sum| > 15*2*6(pri) + 4*4*3(sec) = 228.
        // Start sum at -1 instead of 0 to help handle rounding later.
        movi            v1.16b, #255                // sum (= -1)
        movi            v2.16b, #0                  // sum
.if \min
        mov             v3.16b, v0.16b              // min
        mov             v4.16b, v0.16b              // max
.endif

        // Instead of loading sec_taps 2, 1 from memory, just set it
        // to 2 initially and decrease for the second round.
        // This is also used as loop counter.
        mov             w11, #2                     // sec_taps[0]

2:
.if \pri
        ldrb            w9,  [x5]                   // off1

        load_px_8       v5,  v6, \w
.endif

.if \sec
        add             x5,  x5,  #4                // +2*2
        ldrb            w9,  [x5]                   // off2
        // v28/v29 are free as scratch here; the damping value in v28
        // was already folded into the shifts above.
        load_px_8       v28, v29, \w
.endif

.if \pri
        ldrb            w10, [x8]                   // *pri_taps

        handle_pixel_8  v5,  v6,  v25.16b, v24.16b, w10, \min
.endif

.if \sec
        add             x5,  x5,  #8                // +2*4
        ldrb            w9,  [x5]                   // off3
        load_px_8       v5,  v6,  \w

        handle_pixel_8  v28, v29, v27.16b, v26.16b, w11, \min

        handle_pixel_8  v5,  v6,  v27.16b, v26.16b, w11, \min

        sub             x5,  x5,  #11               // x5 -= 2*(2+4); x5 += 1;
.else
        add             x5,  x5,  #1                // x5 += 1
.endif
        subs            w11, w11, #1                // sec_tap-- (value)
.if \pri
        add             x8,  x8,  #1                // pri_taps++ (pointer)
.endif
        b.ne            2b

        // Perform halving adds since the value won't fit otherwise.
        // To handle the offset for negative values, use both halving w/ and w/o rounding.
        srhadd          v5.16b,  v1.16b,  v2.16b    // sum >> 1
        shadd           v6.16b,  v1.16b,  v2.16b    // (sum - 1) >> 1
        cmlt            v1.16b,  v5.16b,  #0        // sum < 0
        bsl             v1.16b,  v6.16b,  v5.16b    // (sum - (sum < 0)) >> 1

        srshr           v1.16b,  v1.16b,  #3        // (8 + sum - (sum < 0)) >> 4

        usqadd          v0.16b,  v1.16b             // px + (8 + sum ...) >> 4
.if \min
        umin            v0.16b,  v0.16b,  v4.16b
        umax            v0.16b,  v0.16b,  v3.16b    // iclip(px + .., min, max)
.endif
.if \w == 8
        st1             {v0.d}[0], [x0], x1
        add             x2,  x2,  #2*16             // tmp += 2*tmp_stride
        subs            w7,  w7,  #2                // h -= 2
        st1             {v0.d}[1], [x0], x1
.else
        st1             {v0.s}[0], [x0], x1
        add             x2,  x2,  #4*8              // tmp += 4*tmp_stride
        st1             {v0.s}[1], [x0], x1
        subs            w7,  w7,  #4                // h -= 4
        st1             {v0.s}[2], [x0], x1
        st1             {v0.s}[3], [x0], x1
.endif

        // Reset pri_taps and directions back to the original point
        sub             x5,  x5,  #2
.if \pri
        sub             x8,  x8,  #2
.endif

        b.gt            1b
        ret
endfunc
.endm
512
// Instantiate the three edged filter variants for width w:
// pri-only, sec-only, and combined pri+sec (which also needs the
// min/max clipping, hence min=1).
.macro filter_8 w
filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
.endm
518
// Emit the edged filters for 8x- and 4x-wide blocks.
filter_8 8
filter_8 4
521