xref: /aosp_15_r20/external/libdav1d/src/arm/32/cdef.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
27
#include "src/arm/asm.S"
#include "util.S"
#include "cdef_tmpl.S"
31
// Register-name macro arguments, per width (w == 4 / w == 8):
// n1 = s0/d0
// w1 = d0/q0
// n2 = s4/d2
// w2 = d2/q1
// Pad two rows (the top or the bottom border) of the 16-bit tmp buffer.
// Widens 8-bit source pixels to 16 bits; columns that are missing
// according to the CDEF_HAVE_LEFT/CDEF_HAVE_RIGHT flags are filled with
// 0x8000, the "invalid pixel" constant the caller keeps in q3 (s12).
//   r0       = dst pointer into the 16-bit tmp buffer, advanced past
//              both rows on exit (when not returning)
//   r7       = edge flags (bit 0 = LEFT, bit 1 = RIGHT)
//   r12      = scratch
//   \s1, \s2 = pointers to the two 8-bit source rows
//   \n1/\w1, \n2/\w2 = narrow/wide register names (see table above)
//   \ret     = 1: pop and return from the function; 0: fall through to 3:
.macro pad_top_bottom s1, s2, w, stride, n1, w1, n2, w2, align, ret
        tst             r7,  #1 // CDEF_HAVE_LEFT
        beq             2f
        // CDEF_HAVE_LEFT
        tst             r7,  #2 // CDEF_HAVE_RIGHT
        beq             1f
        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
        // Gather the four edge pairs (left/right of both rows) into d4,
        // widen all of it, then store each row with its side columns.
        ldrh            r12, [\s1, #-2]
        vldr            \n1, [\s1]
        vdup.16         d4,  r12
        ldrh            r12, [\s1, #\w]
        vmov.16         d4[1], r12
        ldrh            r12, [\s2, #-2]
        vldr            \n2, [\s2]
        vmov.16         d4[2], r12
        ldrh            r12, [\s2, #\w]
        vmovl.u8        q0,  d0
        vmov.16         d4[3], r12
        vmovl.u8        q1,  d2
        vmovl.u8        q2,  d4
        vstr            s8,  [r0, #-4]          // left edge, row 1
        vst1.16         {\w1}, [r0, :\align]
        vstr            s9,  [r0, #2*\w]        // right edge, row 1
        add             r0,  r0,  #2*\stride
        vstr            s10, [r0, #-4]          // left edge, row 2
        vst1.16         {\w2}, [r0, :\align]
        vstr            s11, [r0, #2*\w]        // right edge, row 2
.if \ret
        pop             {r4-r8,pc}
.else
        add             r0,  r0,  #2*\stride
        b               3f
.endif

1:
        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        // Missing right columns are filled with 0x8000 from s12.
        ldrh            r12, [\s1, #-2]
        vldr            \n1, [\s1]
        vdup.16         d4,  r12
        ldrh            r12, [\s2, #-2]
        vldr            \n2, [\s2]
        vmovl.u8        q0,  d0
        vmov.16         d4[1], r12
        vmovl.u8        q1,  d2
        vmovl.u8        q2,  d4
        vstr            s8,  [r0, #-4]          // left edge, row 1
        vst1.16         {\w1}, [r0, :\align]
        vstr            s12, [r0, #2*\w]        // right edge = 0x8000
        add             r0,  r0,  #2*\stride
        vstr            s9,  [r0, #-4]          // left edge, row 2
        vst1.16         {\w2}, [r0, :\align]
        vstr            s12, [r0, #2*\w]        // right edge = 0x8000
.if \ret
        pop             {r4-r8,pc}
.else
        add             r0,  r0,  #2*\stride
        b               3f
.endif

2:
        // !CDEF_HAVE_LEFT
        tst             r7,  #2 // CDEF_HAVE_RIGHT
        beq             1f
        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
        vldr            \n1, [\s1]
        ldrh            r12, [\s1, #\w]
        vldr            \n2, [\s2]
        vdup.16         d4,  r12
        ldrh            r12, [\s2, #\w]
        vmovl.u8        q0,  d0
        vmov.16         d4[1], r12
        vmovl.u8        q1,  d2
        vmovl.u8        q2,  d4
        vstr            s12, [r0, #-4]          // left edge = 0x8000
        vst1.16         {\w1}, [r0, :\align]
        vstr            s8,  [r0, #2*\w]        // right edge, row 1
        add             r0,  r0,  #2*\stride
        vstr            s12, [r0, #-4]          // left edge = 0x8000
        vst1.16         {\w2}, [r0, :\align]
        vstr            s9,  [r0, #2*\w]        // right edge, row 2
.if \ret
        pop             {r4-r8,pc}
.else
        add             r0,  r0,  #2*\stride
        b               3f
.endif

1:
        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        // Both side columns filled with 0x8000.
        vldr            \n1, [\s1]
        vldr            \n2, [\s2]
        vmovl.u8        q0,  d0
        vmovl.u8        q1,  d2
        vstr            s12, [r0, #-4]
        vst1.16         {\w1}, [r0, :\align]
        vstr            s12, [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        vstr            s12, [r0, #-4]
        vst1.16         {\w2}, [r0, :\align]
        vstr            s12, [r0, #2*\w]
.if \ret
        pop             {r4-r8,pc}
.else
        add             r0,  r0,  #2*\stride
.endif
3:
.endm
143
// Load \w 8-bit pixels from \src into \dst and post-increment \src by
// \incr.  w == 4 loads a single aligned 32-bit lane; w == 8 loads a
// full 64-bit aligned d register.
.macro load_n_incr dst, src, incr, w
.if \w == 4
        vld1.32         {\dst\()[0]}, [\src, :32], \incr
.else
        vld1.8          {\dst\()},    [\src, :64], \incr
.endif
.endm
151
// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
//                                    ptrdiff_t src_stride, const pixel (*left)[2],
//                                    const pixel *const top,
//                                    const pixel *const bottom, int h,
//                                    enum CdefEdgeFlags edges);

// n1 = s0/d0
// w1 = d0/q0
// n2 = s4/d2
// w2 = d2/q1
// Expand a w x h block of 8-bit pixels into the 16-bit tmp buffer with a
// 2-pixel border on every side.  Border pixels come from top/bottom/
// left/src as dictated by the edge flags; missing borders are filled
// with 0x8000, kept in q3 (s12) for the whole function.
//   r0 = tmp (16-bit; on entry points at the first real pixel)
//   r1 = src, r2 = src_stride, r3 = left
//   r4 = top, r5 = bottom, r6 = h, r7 = edges (loaded from the stack)
.macro padding_func w, stride, n1, w1, n2, w2, align
function cdef_padding\w\()_8bpc_neon, export=1
        push            {r4-r8,lr}
        ldrd            r4,  r5,  [sp, #24]     // r4 = top, r5 = bottom
        ldrd            r6,  r7,  [sp, #32]     // r6 = h,   r7 = edges
        cmp             r7,  #0xf // fully edged
        beq             cdef_padding\w\()_edged_8bpc_neon
        vmov.i16        q3,  #0x8000            // "invalid pixel" filler
        tst             r7,  #4 // CDEF_HAVE_TOP
        bne             1f
        // !CDEF_HAVE_TOP
        // Fill both top border rows (including their side columns)
        // with 0x8000.
        sub             r12, r0,  #2*(2*\stride+2)
        vmov.i16        q2,  #0x8000
        vst1.16         {q2,q3}, [r12]!
.if \w == 8
        vst1.16         {q2,q3}, [r12]!
.endif
        b               3f
1:
        // CDEF_HAVE_TOP
        add             r8,  r4,  r2            // r8 = second top row
        sub             r0,  r0,  #2*(2*\stride)
        pad_top_bottom  r4,  r8,  \w, \stride, \n1, \w1, \n2, \w2, \align, 0

        // Middle section
3:
        tst             r7,  #1 // CDEF_HAVE_LEFT
        beq             2f
        // CDEF_HAVE_LEFT
        tst             r7,  #2 // CDEF_HAVE_RIGHT
        beq             1f
        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
        vld1.16         {d2[]}, [r3, :16]!      // two left-edge pixels
        ldrh            r12, [r1, #\w]          // two right-edge pixels
        load_n_incr     d0,  r1,  r2,  \w
        subs            r6,  r6,  #1
        vmov.16         d2[1], r12
        vmovl.u8        q0,  d0
        vmovl.u8        q1,  d2
        vstr            s4,  [r0, #-4]
        vst1.16         {\w1}, [r0, :\align]
        vstr            s5,  [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        bgt             0b
        b               3f
1:
        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        vld1.16         {d2[]}, [r3, :16]!
        load_n_incr     d0,  r1,  r2,  \w
        subs            r6,  r6,  #1
        vmovl.u8        q0,  d0
        vmovl.u8        q1,  d2
        vstr            s4,  [r0, #-4]
        vst1.16         {\w1}, [r0, :\align]
        vstr            s12, [r0, #2*\w]        // right border = 0x8000
        add             r0,  r0,  #2*\stride
        bgt             1b
        b               3f
2:
        tst             r7,  #2 // CDEF_HAVE_RIGHT
        beq             1f
        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
0:
        ldrh            r12, [r1, #\w]
        load_n_incr     d0,  r1,  r2,  \w
        vdup.16         d2,  r12
        subs            r6,  r6,  #1
        vmovl.u8        q0,  d0
        vmovl.u8        q1,  d2
        vstr            s12, [r0, #-4]          // left border = 0x8000
        vst1.16         {\w1}, [r0, :\align]
        vstr            s4,  [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        bgt             0b
        b               3f
1:
        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        load_n_incr     d0,  r1,  r2,  \w
        subs            r6,  r6,  #1
        vmovl.u8        q0,  d0
        vstr            s12, [r0, #-4]          // both borders = 0x8000
        vst1.16         {\w1}, [r0, :\align]
        vstr            s12, [r0, #2*\w]
        add             r0,  r0,  #2*\stride
        bgt             1b

3:
        tst             r7,  #8 // CDEF_HAVE_BOTTOM
        bne             1f
        // !CDEF_HAVE_BOTTOM
        // Fill both bottom border rows with 0x8000 and return.
        sub             r12, r0,  #4
        vmov.i16        q2,  #0x8000
        vst1.16         {q2,q3}, [r12]!
.if \w == 8
        vst1.16         {q2,q3}, [r12]!
.endif
        pop             {r4-r8,pc}
1:
        // CDEF_HAVE_BOTTOM
        add             r8,  r5,  r2            // r8 = second bottom row
        pad_top_bottom  r5,  r8,  \w, \stride, \n1, \w1, \n2, \w2, \align, 1
endfunc
.endm
266
// Instantiate cdef_padding8_8bpc_neon (tmp stride 16 halfwords, d/q
// registers, 128-bit alignment) and cdef_padding4_8bpc_neon (stride 8,
// s/d registers, 64-bit alignment).
padding_func 8, 16, d0, q0, d2, q1, 128
padding_func 4, 8,  s0, d0, s4, d2, 64
269
// void cdef_paddingX_edged_8bpc_neon(uint16_t *tmp, const pixel *src,
//                                    ptrdiff_t src_stride, const pixel (*left)[2],
//                                    const pixel *const top,
//                                    const pixel *const bottom, int h,
//                                    enum CdefEdgeFlags edges);

// Fully-edged fast path: all four borders exist, so no 0x8000 filler is
// needed and the padded block stays at 8 bits per pixel.  Lays out a
// (w+4) x (h+4) 8-bit block in tmp (row stride \stride bytes): two rows
// from top (r4) and top+src_stride, h rows from src (r1) with their left
// columns from r3, then two rows from bottom (r5) and bottom+src_stride.
// Entered by branch from cdef_padding\w\()_8bpc_neon with {r4-r8,lr}
// already pushed and r4-r7 loaded.
.macro padding_func_edged w, stride, reg, align
function cdef_padding\w\()_edged_8bpc_neon
        sub             r0,  r0,  #(2*\stride)

        // Two top rows, each with 2 edge pixels on either side.
        ldrh            r12, [r4, #-2]
        vldr            \reg, [r4]
        add             r8,  r4,  r2
        strh            r12, [r0, #-2]
        ldrh            r12, [r4, #\w]
        vstr            \reg, [r0]
        strh            r12, [r0, #\w]

        ldrh            r12, [r8, #-2]
        vldr            \reg, [r8]
        strh            r12, [r0, #\stride-2]
        ldrh            r12, [r8, #\w]
        vstr            \reg, [r0, #\stride]
        strh            r12, [r0, #\stride+\w]
        add             r0,  r0,  #2*\stride

0:
        // Middle rows: left pixels from r3, the row plus two right-edge
        // pixels from src.  The 32-bit str stores write 2 extra bytes:
        // on the left they are overwritten by the vstr that follows; on
        // the right they spill into row padding / the next row's
        // left-edge bytes, which are written afterwards.
        ldrh            r12, [r3], #2
        vldr            \reg, [r1]
        str             r12, [r0, #-2]
        ldrh            r12, [r1, #\w]
        add             r1,  r1,  r2
        subs            r6,  r6,  #1
        vstr            \reg, [r0]
        str             r12, [r0, #\w]
        add             r0,  r0,  #\stride
        bgt             0b

        // Two bottom rows, same pattern as the top ones.
        ldrh            r12, [r5, #-2]
        vldr            \reg, [r5]
        add             r8,  r5,  r2
        strh            r12, [r0, #-2]
        ldrh            r12, [r5, #\w]
        vstr            \reg, [r0]
        strh            r12, [r0, #\w]

        ldrh            r12, [r8, #-2]
        vldr            \reg, [r8]
        strh            r12, [r0, #\stride-2]
        ldrh            r12, [r8, #\w]
        vstr            \reg, [r0, #\stride]
        strh            r12, [r0, #\stride+\w]

        pop             {r4-r8,pc}
endfunc
.endm
326
// Instantiate the 8-bit (edged) padding functions: width 8 with a
// 16-byte row stride, width 4 with an 8-byte row stride.
padding_func_edged 8, 16, d0, 64
padding_func_edged 4, 8,  s0, 32

// `tables`, `filter` and `find_dir` are defined in the included
// cdef_tmpl.S: constant tables, the generic filter functions
// (width, bitdepth) and the direction search (bitdepth).
tables

filter 8, 8
filter 4, 8

find_dir 8
336
// Load the CDEF tap pixels p0 (at x + off) and p1 (at x - off) for each
// row of the current row group from the 8-bit edged tmp buffer.
//   r2 = tmp pointer for the current rows
//   r9 = byte offset `off` (clobbered: becomes x - off)
//   r6 = scratch
// The tmp row stride is 16 bytes for w == 8 (2 rows per group) and
// 8 bytes for w == 4 (4 rows per group, two per d register).
.macro load_px_8 d11, d12, d21, d22, w
.if \w == 8
        add             r6,  r2,  r9         // x + off
        sub             r9,  r2,  r9         // x - off
        vld1.8          {\d11}, [r6]         // p0
        add             r6,  r6,  #16        // += stride
        vld1.8          {\d21}, [r9]         // p1
        add             r9,  r9,  #16        // += stride
        vld1.8          {\d12}, [r6]         // p0
        vld1.8          {\d22}, [r9]         // p1
.else
        add             r6,  r2,  r9         // x + off
        sub             r9,  r2,  r9         // x - off
        vld1.32         {\d11[0]}, [r6]      // p0
        add             r6,  r6,  #8         // += stride
        vld1.32         {\d21[0]}, [r9]      // p1
        add             r9,  r9,  #8         // += stride
        vld1.32         {\d11[1]}, [r6]      // p0
        add             r6,  r6,  #8         // += stride
        vld1.32         {\d21[1]}, [r9]      // p1
        add             r9,  r9,  #8         // += stride
        vld1.32         {\d12[0]}, [r6]      // p0
        add             r6,  r6,  #8         // += stride
        vld1.32         {\d22[0]}, [r9]      // p1
        add             r9,  r9,  #8         // += stride
        vld1.32         {\d12[1]}, [r6]      // p0
        vld1.32         {\d22[1]}, [r9]      // p1
.endif
.endm
// Accumulate one CDEF tap for both tap pixels:
//   sum += taps[k] * constrain(p - px, threshold, shift)
// with px in q0, p0 rows in \s1 and p1 rows in \s2.  Sums are kept as
// 16-bit lanes in q1 (low d registers) and q2 (high d registers).
// \shift holds -shift per lane (so vshl.u8 shifts right).  With \min,
// also fold \s1/\s2 into the running min (q3) and max (q4).
// Clobbers q8-q13 (d18 carries the broadcast tap weight).
.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
.if \min
        vmin.u8         q3,  q3,  \s1
        vmax.u8         q4,  q4,  \s1
        vmin.u8         q3,  q3,  \s2
        vmax.u8         q4,  q4,  \s2
.endif
        vabd.u8         q8,  q0,  \s1        // abs(diff)
        vabd.u8         q11, q0,  \s2        // abs(diff)
        vshl.u8         q9,  q8,  \shift     // abs(diff) >> shift
        vshl.u8         q12, q11, \shift     // abs(diff) >> shift
        vqsub.u8        q9,  \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
        vqsub.u8        q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
        vcgt.u8         q10, q0,  \s1        // px > p0
        vcgt.u8         q13, q0,  \s2        // px > p1
        vmin.u8         q9,  q9,  q8         // imin(abs(diff), clip)
        vmin.u8         q12, q12, q11        // imin(abs(diff), clip)
        vneg.s8         q8,  q9              // -imin()
        vneg.s8         q11, q12             // -imin()
        // Where px > p the signed diff is negative, so the bitwise
        // select picks the negated magnitude.
        vbsl            q10, q8,  q9         // constrain() = imax(imin(diff, clip), -clip)
        vdup.8          d18, \tap            // taps[k]
        vbsl            q13, q11, q12        // constrain() = imax(imin(diff, clip), -clip)
        vmlal.s8        q1,  d20, d18        // sum += taps[k] * constrain()
        vmlal.s8        q1,  d26, d18        // sum += taps[k] * constrain()
        vmlal.s8        q2,  d21, d18        // sum += taps[k] * constrain()
        vmlal.s8        q2,  d27, d18        // sum += taps[k] * constrain()
.endm
393
// void cdef_filterX_edged_neon(pixel *dst, ptrdiff_t dst_stride,
//                              const uint16_t *tmp, int pri_strength,
//                              int sec_strength, int dir, int damping,
//                              int h, size_t edges);
// Body of cdef_filter{4,8}{_pri,_sec,_pri_sec}_edged_neon.  All edges
// are present, so the tmp buffer holds plain 8-bit pixels (as written
// by cdef_padding{4,8}_edged_8bpc_neon) and the whole filter runs in
// 8-bit arithmetic with 16-bit accumulation.
//   r0 = dst, r1 = dst_stride, r2 = tmp, r3 = pri_strength,
//   r4 = sec_strength, r5 = dir, r6 = damping, r7 = h
// Processes 2 rows per iteration for w == 8, 4 rows for w == 4.
// NOTE(review): there is no push/vpush here but the epilogue does
// vpop {q4-q7} / pop {r4-r9,pc}; entry presumably happens via a branch
// from a wrapper that performed the matching pushes and loaded r4-r7 —
// confirm against the generic filter entry points in cdef_tmpl.S.
.macro filter_func_8 w, pri, sec, min, suffix
function cdef_filter\w\suffix\()_edged_neon
.if \pri
        movrel_local    r8,  pri_taps
        and             r9,  r3,  #1         // pri_strength & 1 picks the tap set
        add             r8,  r8,  r9, lsl #1
.endif
        movrel_local    r9,  directions\w
        add             r5,  r9,  r5, lsl #1 // r5 = &directions\w[dir]
        vmov.u8         d17, #7
        vdup.8          d16, r6              // damping

        // Per-strength shift = imax(0, damping - ulog2(strength)),
        // stored negated so vshl.u8 acts as a right shift.
        vmov.8          d8[0], r3
        vmov.8          d8[1], r4
        vclz.i8         d8,  d8              // clz(threshold)
        vsub.i8         d8,  d17, d8         // ulog2(threshold)
        vqsub.u8        d8,  d16, d8         // shift = imax(0, damping - ulog2(threshold))
        vneg.s8         d8,  d8              // -shift
.if \sec
        vdup.8          q6,  d8[1]           // -sec shift
.endif
.if \pri
        vdup.8          q5,  d8[0]           // -pri shift
.endif

1:
        // Load the px center pixels for this row group.
.if \w == 8
        add             r12, r2,  #16
        vld1.8          {d0},  [r2,  :64]    // px
        vld1.8          {d1},  [r12, :64]    // px
.else
        add             r12, r2,  #8
        vld1.32         {d0[0]},  [r2,  :32] // px
        add             r9,  r2,  #2*8
        vld1.32         {d0[1]},  [r12, :32] // px
        add             r12, r12, #2*8
        vld1.32         {d1[0]},  [r9,  :32] // px
        vld1.32         {d1[1]},  [r12, :32] // px
.endif

        vmov.u8         q1,  #0              // sum
        vmov.u8         q2,  #0              // sum
.if \min
        vmov.u16        q3,  q0              // min
        vmov.u16        q4,  q0              // max
.endif

        // Instead of loading sec_taps 2, 1 from memory, just set it
        // to 2 initially and decrease for the second round.
        // This is also used as loop counter.
        mov             lr,  #2              // sec_taps[0]

2:
.if \pri
        ldrsb           r9,  [r5]            // off1

        load_px_8       d28, d29, d30, d31, \w
.endif

.if \sec
        add             r5,  r5,  #4         // +2*2
        ldrsb           r9,  [r5]            // off2
.endif

.if \pri
        ldrb            r12, [r8]            // *pri_taps
        vdup.8          q7,  r3              // threshold

        handle_pixel_8  q14, q15, q7,  q5,  r12, \min
.endif

.if \sec
        load_px_8       d28, d29, d30, d31, \w

        add             r5,  r5,  #8         // +2*4
        ldrsb           r9,  [r5]            // off3

        vdup.8          q7,  r4              // threshold

        handle_pixel_8  q14, q15, q7,  q6,  lr, \min

        load_px_8       d28, d29, d30, d31, \w

        handle_pixel_8  q14, q15, q7,  q6,  lr, \min

        sub             r5,  r5,  #11        // r5 -= 2*(2+4); r5 += 1;
.else
        add             r5,  r5,  #1         // r5 += 1
.endif
        subs            lr,  lr,  #1         // sec_tap-- (value)
.if \pri
        add             r8,  r8,  #1         // pri_taps++ (pointer)
.endif
        bne             2b

        // Rounded result: px + ((8 + sum - (sum < 0)) >> 4), saturated.
        vshr.s16        q14, q1,  #15        // -(sum < 0)
        vshr.s16        q15, q2,  #15        // -(sum < 0)
        vadd.i16        q1,  q1,  q14        // sum - (sum < 0)
        vadd.i16        q2,  q2,  q15        // sum - (sum < 0)
        vrshr.s16       q1,  q1,  #4         // (8 + sum - (sum < 0)) >> 4
        vrshr.s16       q2,  q2,  #4         // (8 + sum - (sum < 0)) >> 4
        vaddw.u8        q1,  q1,  d0         // px + (8 + sum ...) >> 4
        vaddw.u8        q2,  q2,  d1         // px + (8 + sum ...) >> 4
        vqmovun.s16     d0,  q1
        vqmovun.s16     d1,  q2
.if \min
        vmin.u8         q0,  q0,  q4
        vmax.u8         q0,  q0,  q3         // iclip(px + .., min, max)
.endif
.if \w == 8
        vst1.8          {d0}, [r0, :64], r1
        add             r2,  r2,  #2*16      // tmp += 2*tmp_stride
        subs            r7,  r7,  #2         // h -= 2
        vst1.8          {d1}, [r0, :64], r1
.else
        vst1.32         {d0[0]}, [r0, :32], r1
        add             r2,  r2,  #4*8       // tmp += 4*tmp_stride
        vst1.32         {d0[1]}, [r0, :32], r1
        subs            r7,  r7,  #4         // h -= 4
        vst1.32         {d1[0]}, [r0, :32], r1
        vst1.32         {d1[1]}, [r0, :32], r1
.endif

        // Reset pri_taps and directions back to the original point
        // (both advanced by 1 in each of the two rounds of loop 2).
        sub             r5,  r5,  #2
.if \pri
        sub             r8,  r8,  #2
.endif

        bgt             1b
        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc
.endm
532
// Instantiate the _pri, _sec and _pri_sec edged filter variants for
// width \w (the _pri_sec variant additionally tracks min/max clipping).
.macro filter_8 w
filter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
filter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
filter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
.endm

filter_8 8
filter_8 4
541