xref: /aosp_15_r20/external/libdav1d/src/arm/64/cdef_tmpl.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker/*
2*c0909341SAndroid Build Coastguard Worker * Copyright © 2018, VideoLAN and dav1d authors
3*c0909341SAndroid Build Coastguard Worker * Copyright © 2020, Martin Storsjo
4*c0909341SAndroid Build Coastguard Worker * All rights reserved.
5*c0909341SAndroid Build Coastguard Worker *
6*c0909341SAndroid Build Coastguard Worker * Redistribution and use in source and binary forms, with or without
7*c0909341SAndroid Build Coastguard Worker * modification, are permitted provided that the following conditions are met:
8*c0909341SAndroid Build Coastguard Worker *
9*c0909341SAndroid Build Coastguard Worker * 1. Redistributions of source code must retain the above copyright notice, this
10*c0909341SAndroid Build Coastguard Worker *    list of conditions and the following disclaimer.
11*c0909341SAndroid Build Coastguard Worker *
12*c0909341SAndroid Build Coastguard Worker * 2. Redistributions in binary form must reproduce the above copyright notice,
13*c0909341SAndroid Build Coastguard Worker *    this list of conditions and the following disclaimer in the documentation
14*c0909341SAndroid Build Coastguard Worker *    and/or other materials provided with the distribution.
15*c0909341SAndroid Build Coastguard Worker *
16*c0909341SAndroid Build Coastguard Worker * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17*c0909341SAndroid Build Coastguard Worker * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18*c0909341SAndroid Build Coastguard Worker * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19*c0909341SAndroid Build Coastguard Worker * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20*c0909341SAndroid Build Coastguard Worker * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21*c0909341SAndroid Build Coastguard Worker * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22*c0909341SAndroid Build Coastguard Worker * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23*c0909341SAndroid Build Coastguard Worker * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24*c0909341SAndroid Build Coastguard Worker * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25*c0909341SAndroid Build Coastguard Worker * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*c0909341SAndroid Build Coastguard Worker */
27*c0909341SAndroid Build Coastguard Worker
28*c0909341SAndroid Build Coastguard Worker#include "src/arm/asm.S"
29*c0909341SAndroid Build Coastguard Worker#include "util.S"
30*c0909341SAndroid Build Coastguard Worker
// dir_table w, stride: emit the constant byte table directions\w.
// Each row holds the two signed tap offsets of one direction, written as
// row * \stride + column. load_px scales an offset by two bytes, so the unit
// is one 16-bit element. filter_func reads the rows at dir, dir + 2 and
// dir + 6, which is why the first six of the eight rows are appended again.
31*c0909341SAndroid Build Coastguard Worker.macro dir_table w, stride
32*c0909341SAndroid Build Coastguard Workerconst directions\w
33*c0909341SAndroid Build Coastguard Worker        .byte           -1 * \stride + 1, -2 * \stride + 2
34*c0909341SAndroid Build Coastguard Worker        .byte            0 * \stride + 1, -1 * \stride + 2
35*c0909341SAndroid Build Coastguard Worker        .byte            0 * \stride + 1,  0 * \stride + 2
36*c0909341SAndroid Build Coastguard Worker        .byte            0 * \stride + 1,  1 * \stride + 2
37*c0909341SAndroid Build Coastguard Worker        .byte            1 * \stride + 1,  2 * \stride + 2
38*c0909341SAndroid Build Coastguard Worker        .byte            1 * \stride + 0,  2 * \stride + 1
39*c0909341SAndroid Build Coastguard Worker        .byte            1 * \stride + 0,  2 * \stride + 0
40*c0909341SAndroid Build Coastguard Worker        .byte            1 * \stride + 0,  2 * \stride - 1
41*c0909341SAndroid Build Coastguard Worker// Repeated, to avoid & 7
42*c0909341SAndroid Build Coastguard Worker        .byte           -1 * \stride + 1, -2 * \stride + 2
43*c0909341SAndroid Build Coastguard Worker        .byte            0 * \stride + 1, -1 * \stride + 2
44*c0909341SAndroid Build Coastguard Worker        .byte            0 * \stride + 1,  0 * \stride + 2
45*c0909341SAndroid Build Coastguard Worker        .byte            0 * \stride + 1,  1 * \stride + 2
46*c0909341SAndroid Build Coastguard Worker        .byte            1 * \stride + 1,  2 * \stride + 2
47*c0909341SAndroid Build Coastguard Worker        .byte            1 * \stride + 0,  2 * \stride + 1
48*c0909341SAndroid Build Coastguard Workerendconst
49*c0909341SAndroid Build Coastguard Worker.endm
50*c0909341SAndroid Build Coastguard Worker
// tables: instantiate the constant tables used by the filter functions.
//   directions8 - tap offsets for a row stride of 16 elements
//   directions4 - tap offsets for a row stride of 8 elements
//   pri_taps    - two 2-byte tap pairs, {4, 2} and {3, 3}; filter_func starts
//                 at byte offset 2 * (pri_strength & 1), after first shifting
//                 pri_strength right by bitdepth_min_8 in the 16 bpc build
// NOTE(review): presumably expanded once by the source that includes this
// template - confirm at the point of use.
51*c0909341SAndroid Build Coastguard Worker.macro tables
52*c0909341SAndroid Build Coastguard Workerdir_table 8, 16
53*c0909341SAndroid Build Coastguard Workerdir_table 4, 8
54*c0909341SAndroid Build Coastguard Worker
55*c0909341SAndroid Build Coastguard Workerconst pri_taps
56*c0909341SAndroid Build Coastguard Worker        .byte           4, 2, 3, 3
57*c0909341SAndroid Build Coastguard Workerendconst
58*c0909341SAndroid Build Coastguard Worker.endm
59*c0909341SAndroid Build Coastguard Worker
// load_px d1, d2, w: fetch the pixels at +off and -off around the current
// position.
//   x2   in   pointer to the current pixel(s) in tmp
//   w9   in   offset in 16-bit elements; only the low byte is used and it is
//             sign-extended
//   \d1  out  pixels at x2 + 2*off (p0)
//   \d2  out  pixels at x2 - 2*off (p1)
//   x6 and x9 are clobbered; x9 ends up holding an address, not the offset.
// \w == 8 loads one row of eight halfwords. Otherwise two rows of four
// halfwords are loaded, the second one 2*8 bytes further on, into the upper
// 64 bits of the destination registers.
60*c0909341SAndroid Build Coastguard Worker.macro load_px d1, d2, w
61*c0909341SAndroid Build Coastguard Worker.if \w == 8
62*c0909341SAndroid Build Coastguard Worker        add             x6,  x2,  w9, sxtb #1       // x + off
63*c0909341SAndroid Build Coastguard Worker        sub             x9,  x2,  w9, sxtb #1       // x - off
64*c0909341SAndroid Build Coastguard Worker        ld1             {\d1\().8h}, [x6]           // p0
65*c0909341SAndroid Build Coastguard Worker        ld1             {\d2\().8h}, [x9]           // p1
66*c0909341SAndroid Build Coastguard Worker.else
67*c0909341SAndroid Build Coastguard Worker        add             x6,  x2,  w9, sxtb #1       // x + off
68*c0909341SAndroid Build Coastguard Worker        sub             x9,  x2,  w9, sxtb #1       // x - off
69*c0909341SAndroid Build Coastguard Worker        ld1             {\d1\().4h}, [x6]           // p0
70*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  #2*8              // += stride
71*c0909341SAndroid Build Coastguard Worker        ld1             {\d2\().4h}, [x9]           // p1
72*c0909341SAndroid Build Coastguard Worker        add             x9,  x9,  #2*8              // += stride
73*c0909341SAndroid Build Coastguard Worker        ld1             {\d1\().d}[1], [x6]         // p0
74*c0909341SAndroid Build Coastguard Worker        ld1             {\d2\().d}[1], [x9]         // p1
75*c0909341SAndroid Build Coastguard Worker.endif
76*c0909341SAndroid Build Coastguard Worker.endm
// handle_pixel s1, s2, thresh_vec, shift, tap, min
// Adds tap * constrain(p - px) for both neighbour vectors \s1 and \s2 to the
// running sum in v1.
//   v0           in      px
//   \s1, \s2     in      neighbour pixels (p0, p1)
//   \thresh_vec  in      threshold, replicated in every lane
//   \shift       in      per-lane shift count for ushl; filter_func passes
//                        -shift so that ushl shifts right
//   \tap         in      general-purpose register holding taps[k]
//   \min         in      assemble-time flag; when non-zero \s1 and \s2 are
//                        also folded into the running minimum (v2) and
//                        maximum (v3)
//   v1           in/out  sum
//   v16-v22 are clobbered.
77*c0909341SAndroid Build Coastguard Worker.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
78*c0909341SAndroid Build Coastguard Worker.if \min
        // NOTE(review): the minimum uses an unsigned and the maximum a signed
        // comparison; presumably so that a padding value with the top bit set
        // is ignored by both - confirm against the code that fills tmp.
79*c0909341SAndroid Build Coastguard Worker        umin            v2.8h,   v2.8h,  \s1\().8h
80*c0909341SAndroid Build Coastguard Worker        smax            v3.8h,   v3.8h,  \s1\().8h
81*c0909341SAndroid Build Coastguard Worker        umin            v2.8h,   v2.8h,  \s2\().8h
82*c0909341SAndroid Build Coastguard Worker        smax            v3.8h,   v3.8h,  \s2\().8h
83*c0909341SAndroid Build Coastguard Worker.endif
        // The two neighbours are processed in lock step: v16-v18 belong to
        // \s1, v20-v22 to \s2, and v19 holds the broadcast tap for both.
84*c0909341SAndroid Build Coastguard Worker        uabd            v16.8h, v0.8h,  \s1\().8h   // abs(diff)
85*c0909341SAndroid Build Coastguard Worker        uabd            v20.8h, v0.8h,  \s2\().8h   // abs(diff)
86*c0909341SAndroid Build Coastguard Worker        ushl            v17.8h, v16.8h, \shift      // abs(diff) >> shift
87*c0909341SAndroid Build Coastguard Worker        ushl            v21.8h, v20.8h, \shift      // abs(diff) >> shift
88*c0909341SAndroid Build Coastguard Worker        uqsub           v17.8h, \thresh_vec, v17.8h // clip = imax(0, threshold - (abs(diff) >> shift))
89*c0909341SAndroid Build Coastguard Worker        uqsub           v21.8h, \thresh_vec, v21.8h // clip = imax(0, threshold - (abs(diff) >> shift))
90*c0909341SAndroid Build Coastguard Worker        sub             v18.8h, \s1\().8h,  v0.8h   // diff = p0 - px
91*c0909341SAndroid Build Coastguard Worker        sub             v22.8h, \s2\().8h,  v0.8h   // diff = p1 - px
92*c0909341SAndroid Build Coastguard Worker        neg             v16.8h, v17.8h              // -clip
93*c0909341SAndroid Build Coastguard Worker        neg             v20.8h, v21.8h              // -clip
94*c0909341SAndroid Build Coastguard Worker        smin            v18.8h, v18.8h, v17.8h      // imin(diff, clip)
95*c0909341SAndroid Build Coastguard Worker        smin            v22.8h, v22.8h, v21.8h      // imin(diff, clip)
96*c0909341SAndroid Build Coastguard Worker        dup             v19.8h, \tap                // taps[k]
97*c0909341SAndroid Build Coastguard Worker        smax            v18.8h, v18.8h, v16.8h      // constrain() = imax(imin(diff, clip), -clip)
98*c0909341SAndroid Build Coastguard Worker        smax            v22.8h, v22.8h, v20.8h      // constrain() = imax(imin(diff, clip), -clip)
99*c0909341SAndroid Build Coastguard Worker        mla             v1.8h,  v18.8h, v19.8h      // sum += taps[k] * constrain()
100*c0909341SAndroid Build Coastguard Worker        mla             v1.8h,  v22.8h, v19.8h      // sum += taps[k] * constrain()
101*c0909341SAndroid Build Coastguard Worker.endm
102*c0909341SAndroid Build Coastguard Worker
103*c0909341SAndroid Build Coastguard Worker// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride,
104*c0909341SAndroid Build Coastguard Worker//                                   const uint16_t *tmp, int pri_strength,
105*c0909341SAndroid Build Coastguard Worker//                                   int sec_strength, int dir, int damping,
106*c0909341SAndroid Build Coastguard Worker//                                   int h, size_t edges);
// filter_func w, bpc, pri, sec, min, suffix
// Emits cdef_filter\w\suffix\()_\bpc\()bpc_neon (declared without export=1).
// \pri and \sec select which tap sets are assembled in; \min enables the
// final clamp to the minimum/maximum of the pixels that were read.
// Register use, as established by the code below:
//   x0        dst, advanced by dst_stride after every stored row
//   x1        dst_stride
//   x2        tmp (16-bit pixels), advanced by 2*16 bytes per outer iteration
//   w3        pri_strength
//   w4        sec_strength
//   w5        dir on entry; x5 is then reused as a pointer into directions\w
//   w6        damping
//   w7        h, counted down until it is no longer positive
//   [sp]      edges (read in the 8 bpc build only)
//   [sp, #8]  bitdepth_max (read in the 16 bpc build when \pri is set)
// Scratch: x6, x8-x12, v0-v7, v16-v22, v24-v28 and v30.
107*c0909341SAndroid Build Coastguard Worker.macro filter_func w, bpc, pri, sec, min, suffix
108*c0909341SAndroid Build Coastguard Workerfunction cdef_filter\w\suffix\()_\bpc\()bpc_neon
        // 8 bpc only: when edges == 0xf, branch to the matching _edged
        // variant, which is defined elsewhere.
109*c0909341SAndroid Build Coastguard Worker.if \bpc == 8
110*c0909341SAndroid Build Coastguard Worker        ldr             w8,  [sp]                   // edges
111*c0909341SAndroid Build Coastguard Worker        cmp             w8,  #0xf
112*c0909341SAndroid Build Coastguard Worker        b.eq            cdef_filter\w\suffix\()_edged_8bpc_neon
113*c0909341SAndroid Build Coastguard Worker.endif
        // x8 = pri_taps + 2 * ((pri_strength >> bitdepth_min_8) & 1), i.e. the
        // start of one of the two 2-byte tap pairs.
114*c0909341SAndroid Build Coastguard Worker.if \pri
115*c0909341SAndroid Build Coastguard Worker.if \bpc == 16
116*c0909341SAndroid Build Coastguard Worker        ldr             w9,  [sp, #8]               // bitdepth_max
117*c0909341SAndroid Build Coastguard Worker        clz             w9,  w9
118*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #24               // -bitdepth_min_8
119*c0909341SAndroid Build Coastguard Worker        neg             w9,  w9                     // bitdepth_min_8
120*c0909341SAndroid Build Coastguard Worker.endif
121*c0909341SAndroid Build Coastguard Worker        movrel          x8,  pri_taps
122*c0909341SAndroid Build Coastguard Worker.if \bpc == 16
123*c0909341SAndroid Build Coastguard Worker        lsr             w9,  w3,  w9                // pri_strength >> bitdepth_min_8
124*c0909341SAndroid Build Coastguard Worker        and             w9,  w9,  #1                // (pri_strength >> bitdepth_min_8) & 1
125*c0909341SAndroid Build Coastguard Worker.else
126*c0909341SAndroid Build Coastguard Worker        and             w9,  w3,  #1
127*c0909341SAndroid Build Coastguard Worker.endif
128*c0909341SAndroid Build Coastguard Worker        add             x8,  x8,  w9, uxtw #1
129*c0909341SAndroid Build Coastguard Worker.endif
        // x5 = directions\w + 2 * dir (two bytes per direction).
130*c0909341SAndroid Build Coastguard Worker        movrel          x9,  directions\w
131*c0909341SAndroid Build Coastguard Worker        add             x5,  x9,  w5, uxtw #1
        // 15 - clz(x) is ulog2(x) for 16-bit lanes.
132*c0909341SAndroid Build Coastguard Worker        movi            v30.4h,   #15
133*c0909341SAndroid Build Coastguard Worker        dup             v28.4h,   w6                // damping
134*c0909341SAndroid Build Coastguard Worker
135*c0909341SAndroid Build Coastguard Worker.if \pri
136*c0909341SAndroid Build Coastguard Worker        dup             v25.8h, w3                  // threshold
137*c0909341SAndroid Build Coastguard Worker.endif
138*c0909341SAndroid Build Coastguard Worker.if \sec
139*c0909341SAndroid Build Coastguard Worker        dup             v27.8h, w4                  // threshold
140*c0909341SAndroid Build Coastguard Worker.endif
        // Even lanes take the pri threshold, odd lanes the sec threshold, so
        // both shifts are computed in one pass. When only one of \pri / \sec
        // is set the other source register was not written above; the results
        // in its lanes are not used by the dup instructions below.
141*c0909341SAndroid Build Coastguard Worker        trn1            v24.4h, v25.4h, v27.4h
142*c0909341SAndroid Build Coastguard Worker        clz             v24.4h, v24.4h              // clz(threshold)
143*c0909341SAndroid Build Coastguard Worker        sub             v24.4h, v30.4h, v24.4h      // ulog2(threshold)
144*c0909341SAndroid Build Coastguard Worker        uqsub           v24.4h, v28.4h, v24.4h      // shift = imax(0, damping - ulog2(threshold))
145*c0909341SAndroid Build Coastguard Worker        neg             v24.4h, v24.4h              // -shift
146*c0909341SAndroid Build Coastguard Worker.if \sec
147*c0909341SAndroid Build Coastguard Worker        dup             v26.8h, v24.h[1]
148*c0909341SAndroid Build Coastguard Worker.endif
149*c0909341SAndroid Build Coastguard Worker.if \pri
150*c0909341SAndroid Build Coastguard Worker        dup             v24.8h, v24.h[0]
151*c0909341SAndroid Build Coastguard Worker.endif
152*c0909341SAndroid Build Coastguard Worker
        // Outer loop: one row per iteration for \w == 8, two rows otherwise.
153*c0909341SAndroid Build Coastguard Worker1:
154*c0909341SAndroid Build Coastguard Worker.if \w == 8
155*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h}, [x2]               // px
156*c0909341SAndroid Build Coastguard Worker.else
157*c0909341SAndroid Build Coastguard Worker        add             x12, x2,  #2*8
158*c0909341SAndroid Build Coastguard Worker        ld1             {v0.4h},   [x2]             // px
159*c0909341SAndroid Build Coastguard Worker        ld1             {v0.d}[1], [x12]            // px
160*c0909341SAndroid Build Coastguard Worker.endif
161*c0909341SAndroid Build Coastguard Worker
162*c0909341SAndroid Build Coastguard Worker        movi            v1.8h,  #0                  // sum
163*c0909341SAndroid Build Coastguard Worker.if \min
164*c0909341SAndroid Build Coastguard Worker        mov             v2.16b, v0.16b              // min
165*c0909341SAndroid Build Coastguard Worker        mov             v3.16b, v0.16b              // max
166*c0909341SAndroid Build Coastguard Worker.endif
167*c0909341SAndroid Build Coastguard Worker
168*c0909341SAndroid Build Coastguard Worker        // Instead of loading sec_taps 2, 1 from memory, just set it
169*c0909341SAndroid Build Coastguard Worker        // to 2 initially and decrease for the second round.
170*c0909341SAndroid Build Coastguard Worker        // This is also used as loop counter.
171*c0909341SAndroid Build Coastguard Worker        mov             w11, #2                     // sec_taps[0]
172*c0909341SAndroid Build Coastguard Worker
        // Inner loop: runs twice (w11 = 2, then 1). x5 walks the direction
        // table and x8 the pri tap pair, one byte per round.
173*c0909341SAndroid Build Coastguard Worker2:
174*c0909341SAndroid Build Coastguard Worker.if \pri
175*c0909341SAndroid Build Coastguard Worker        ldrb            w9,  [x5]                   // off1
176*c0909341SAndroid Build Coastguard Worker
177*c0909341SAndroid Build Coastguard Worker        load_px         v4,  v5, \w
178*c0909341SAndroid Build Coastguard Worker.endif
179*c0909341SAndroid Build Coastguard Worker
180*c0909341SAndroid Build Coastguard Worker.if \sec
181*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  #4                // +2*2
182*c0909341SAndroid Build Coastguard Worker        ldrb            w9,  [x5]                   // off2
183*c0909341SAndroid Build Coastguard Worker        load_px         v6,  v7,  \w
184*c0909341SAndroid Build Coastguard Worker.endif
185*c0909341SAndroid Build Coastguard Worker
186*c0909341SAndroid Build Coastguard Worker.if \pri
187*c0909341SAndroid Build Coastguard Worker        ldrb            w10, [x8]                   // *pri_taps
188*c0909341SAndroid Build Coastguard Worker
189*c0909341SAndroid Build Coastguard Worker        handle_pixel    v4,  v5,  v25.8h, v24.8h, w10, \min
190*c0909341SAndroid Build Coastguard Worker.endif
191*c0909341SAndroid Build Coastguard Worker
192*c0909341SAndroid Build Coastguard Worker.if \sec
193*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  #8                // +2*4
194*c0909341SAndroid Build Coastguard Worker        ldrb            w9,  [x5]                   // off3
        // v4/v5 are free again here: the pri taps that used them were
        // consumed by the handle_pixel above.
195*c0909341SAndroid Build Coastguard Worker        load_px         v4,  v5,  \w
196*c0909341SAndroid Build Coastguard Worker
197*c0909341SAndroid Build Coastguard Worker        handle_pixel    v6,  v7,  v27.8h, v26.8h, w11, \min
198*c0909341SAndroid Build Coastguard Worker
199*c0909341SAndroid Build Coastguard Worker        handle_pixel    v4,  v5,  v27.8h, v26.8h, w11, \min
200*c0909341SAndroid Build Coastguard Worker
201*c0909341SAndroid Build Coastguard Worker        sub             x5,  x5,  #11               // x5 -= 2*(2+4); x5 += 1;
202*c0909341SAndroid Build Coastguard Worker.else
203*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  #1                // x5 += 1
204*c0909341SAndroid Build Coastguard Worker.endif
205*c0909341SAndroid Build Coastguard Worker        subs            w11, w11, #1                // sec_tap-- (value)
206*c0909341SAndroid Build Coastguard Worker.if \pri
207*c0909341SAndroid Build Coastguard Worker        add             x8,  x8,  #1                // pri_taps++ (pointer)
208*c0909341SAndroid Build Coastguard Worker.endif
209*c0909341SAndroid Build Coastguard Worker        b.ne            2b
210*c0909341SAndroid Build Coastguard Worker
211*c0909341SAndroid Build Coastguard Worker        cmlt            v4.8h,  v1.8h,  #0          // -(sum < 0)
212*c0909341SAndroid Build Coastguard Worker        add             v1.8h,  v1.8h,  v4.8h       // sum - (sum < 0)
213*c0909341SAndroid Build Coastguard Worker        srshr           v1.8h,  v1.8h,  #4          // (8 + sum - (sum < 0)) >> 4
214*c0909341SAndroid Build Coastguard Worker        add             v0.8h,  v0.8h,  v1.8h       // px + (8 + sum ...) >> 4
215*c0909341SAndroid Build Coastguard Worker.if \min
216*c0909341SAndroid Build Coastguard Worker        smin            v0.8h,  v0.8h,  v3.8h
217*c0909341SAndroid Build Coastguard Worker        smax            v0.8h,  v0.8h,  v2.8h       // iclip(px + .., min, max)
218*c0909341SAndroid Build Coastguard Worker.endif
219*c0909341SAndroid Build Coastguard Worker.if \bpc == 8
220*c0909341SAndroid Build Coastguard Worker        xtn             v0.8b,  v0.8h
221*c0909341SAndroid Build Coastguard Worker.endif
222*c0909341SAndroid Build Coastguard Worker.if \w == 8
223*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #2*16             // tmp += tmp_stride
224*c0909341SAndroid Build Coastguard Worker        subs            w7,  w7,  #1                // h--
225*c0909341SAndroid Build Coastguard Worker.if \bpc == 8
226*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b}, [x0], x1
227*c0909341SAndroid Build Coastguard Worker.else
228*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h}, [x0], x1
229*c0909341SAndroid Build Coastguard Worker.endif
230*c0909341SAndroid Build Coastguard Worker.else
231*c0909341SAndroid Build Coastguard Worker.if \bpc == 8
232*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0], [x0], x1
233*c0909341SAndroid Build Coastguard Worker.else
234*c0909341SAndroid Build Coastguard Worker        st1             {v0.d}[0], [x0], x1
235*c0909341SAndroid Build Coastguard Worker.endif
236*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #2*16             // tmp += 2*tmp_stride
237*c0909341SAndroid Build Coastguard Worker        subs            w7,  w7,  #2                // h -= 2
238*c0909341SAndroid Build Coastguard Worker.if \bpc == 8
239*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[1], [x0], x1
240*c0909341SAndroid Build Coastguard Worker.else
241*c0909341SAndroid Build Coastguard Worker        st1             {v0.d}[1], [x0], x1
242*c0909341SAndroid Build Coastguard Worker.endif
243*c0909341SAndroid Build Coastguard Worker.endif
244*c0909341SAndroid Build Coastguard Worker
245*c0909341SAndroid Build Coastguard Worker        // Reset pri_taps and directions back to the original point
246*c0909341SAndroid Build Coastguard Worker        sub             x5,  x5,  #2
247*c0909341SAndroid Build Coastguard Worker.if \pri
248*c0909341SAndroid Build Coastguard Worker        sub             x8,  x8,  #2
249*c0909341SAndroid Build Coastguard Worker.endif
250*c0909341SAndroid Build Coastguard Worker
        // The condition flags still come from the subs on w7 above; the
        // stores and plain sub instructions in between do not set flags.
251*c0909341SAndroid Build Coastguard Worker        b.gt            1b
252*c0909341SAndroid Build Coastguard Worker        ret
253*c0909341SAndroid Build Coastguard Workerendfunc
254*c0909341SAndroid Build Coastguard Worker.endm
255*c0909341SAndroid Build Coastguard Worker
// filter w, bpc: emit the three specialised filters (_pri, _sec, _pri_sec;
// only _pri_sec is built with min=1) and the exported entry point
// cdef_filter\w\()_\bpc\()bpc_neon, which selects one from the two strengths:
//   pri_strength (w3) == 0                          -> _sec
//   pri_strength != 0 and sec_strength (w4) == 0    -> _pri
//   both non-zero                                   -> _pri_sec
// Each target is reached with a plain branch (b, not bl), so the selected
// filter returns straight to the caller of the entry point.
256*c0909341SAndroid Build Coastguard Worker.macro filter w, bpc
257*c0909341SAndroid Build Coastguard Workerfilter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri
258*c0909341SAndroid Build Coastguard Workerfilter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec
259*c0909341SAndroid Build Coastguard Workerfilter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec
260*c0909341SAndroid Build Coastguard Worker
261*c0909341SAndroid Build Coastguard Workerfunction cdef_filter\w\()_\bpc\()bpc_neon, export=1
262*c0909341SAndroid Build Coastguard Worker        cbnz            w3,  1f // pri_strength
263*c0909341SAndroid Build Coastguard Worker        b               cdef_filter\w\()_sec_\bpc\()bpc_neon     // only sec
264*c0909341SAndroid Build Coastguard Worker1:
265*c0909341SAndroid Build Coastguard Worker        cbnz            w4,  1f // sec_strength
266*c0909341SAndroid Build Coastguard Worker        b               cdef_filter\w\()_pri_\bpc\()bpc_neon     // only pri
267*c0909341SAndroid Build Coastguard Worker1:
268*c0909341SAndroid Build Coastguard Worker        b               cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec
269*c0909341SAndroid Build Coastguard Workerendfunc
270*c0909341SAndroid Build Coastguard Worker.endm
271*c0909341SAndroid Build Coastguard Worker
// div_table[i] = 840 / (i + 1) for i = 0..7, stored as 16-bit values.
// NOTE(review): presumably the weights applied to the squared diagonal sums
// in cdef_find_dir - confirm at the point of use.
272*c0909341SAndroid Build Coastguard Workerconst div_table
273*c0909341SAndroid Build Coastguard Worker        .short         840, 420, 280, 210, 168, 140, 120, 105
274*c0909341SAndroid Build Coastguard Workerendconst
275*c0909341SAndroid Build Coastguard Worker
// Twelve 16-bit factors: 840/2, 840/4, 840/6, five times 840/8, then 840/6,
// 840/4, 840/2 and a trailing zero.
// NOTE(review): presumably widened into v29-v31 as the per-lane factors for
// cost_alt - confirm at the point of use.
276*c0909341SAndroid Build Coastguard Workerconst alt_fact
277*c0909341SAndroid Build Coastguard Worker        .short         420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
278*c0909341SAndroid Build Coastguard Workerendconst
279*c0909341SAndroid Build Coastguard Worker
// cost_alt d1, d2, s1, s2, s3, s4
// Computes two costs, each the sum over twelve lanes of a squared 16-bit
// value times a 32-bit per-lane factor:
//   \d1 = sum(\s1[0..3]^2 * v29) + sum(\s1[4..7]^2 * v30) + sum(\s2[0..3]^2 * v31)
//   \d2 = the same expression with \s3 / \s4 in place of \s1 / \s2
// v29-v31 must hold the factors on entry. Only the low four lanes of \s2 and
// \s4 contribute. \d1 and \d2 are the scalar destinations of addv.
// Clobbers v22-v27.
280*c0909341SAndroid Build Coastguard Worker.macro cost_alt d1, d2, s1, s2, s3, s4
281*c0909341SAndroid Build Coastguard Worker        smull           v22.4s,  \s1\().4h, \s1\().4h // sum_alt[n]*sum_alt[n]
282*c0909341SAndroid Build Coastguard Worker        smull2          v23.4s,  \s1\().8h, \s1\().8h
283*c0909341SAndroid Build Coastguard Worker        smull           v24.4s,  \s2\().4h, \s2\().4h
284*c0909341SAndroid Build Coastguard Worker        smull           v25.4s,  \s3\().4h, \s3\().4h // sum_alt[n]*sum_alt[n]
285*c0909341SAndroid Build Coastguard Worker        smull2          v26.4s,  \s3\().8h, \s3\().8h
286*c0909341SAndroid Build Coastguard Worker        smull           v27.4s,  \s4\().4h, \s4\().4h
287*c0909341SAndroid Build Coastguard Worker        mul             v22.4s,  v22.4s,  v29.4s      // sum_alt[n]^2*fact
288*c0909341SAndroid Build Coastguard Worker        mla             v22.4s,  v23.4s,  v30.4s
289*c0909341SAndroid Build Coastguard Worker        mla             v22.4s,  v24.4s,  v31.4s
290*c0909341SAndroid Build Coastguard Worker        mul             v25.4s,  v25.4s,  v29.4s      // sum_alt[n]^2*fact
291*c0909341SAndroid Build Coastguard Worker        mla             v25.4s,  v26.4s,  v30.4s
292*c0909341SAndroid Build Coastguard Worker        mla             v25.4s,  v27.4s,  v31.4s
293*c0909341SAndroid Build Coastguard Worker        addv            \d1, v22.4s                   // *cost_ptr
294*c0909341SAndroid Build Coastguard Worker        addv            \d2, v25.4s                   // *cost_ptr
295*c0909341SAndroid Build Coastguard Worker.endm
296*c0909341SAndroid Build Coastguard Worker
// find_best s1, s2, s3: update the running best direction.
//   w0  in/out  best_dir
//   w1  in/out  best_cost
//   w3  in/out  n, the direction index that belongs to the cost in w4
//   w4  in      cost[n]
// When \s2 is given, a second candidate cost[n + 1] is taken from
// \s2\().s[0] (through w5), w4 is refilled from \s3\().s[0] for the following
// invocation and n advances by two. With \s2 blank only w4 is examined and n
// is left unchanged. \s1 is not referenced by the body.
// The comparison is signed and strict (gt), so on equal cost the earlier
// direction is kept.
297*c0909341SAndroid Build Coastguard Worker.macro find_best s1, s2, s3
298*c0909341SAndroid Build Coastguard Worker.ifnb \s2
299*c0909341SAndroid Build Coastguard Worker        mov             w5,  \s2\().s[0]
300*c0909341SAndroid Build Coastguard Worker.endif
301*c0909341SAndroid Build Coastguard Worker        cmp             w4,  w1                       // cost[n] > best_cost
302*c0909341SAndroid Build Coastguard Worker        csel            w0,  w3,  w0,  gt             // best_dir = n
303*c0909341SAndroid Build Coastguard Worker        csel            w1,  w4,  w1,  gt             // best_cost = cost[n]
304*c0909341SAndroid Build Coastguard Worker.ifnb \s2
305*c0909341SAndroid Build Coastguard Worker        add             w3,  w3,  #1                  // n++
306*c0909341SAndroid Build Coastguard Worker        cmp             w5,  w1                       // cost[n] > best_cost
        // mov from a vector lane does not set flags, so it can sit between
        // the cmp and the csel pair that consume them.
307*c0909341SAndroid Build Coastguard Worker        mov             w4,  \s3\().s[0]
308*c0909341SAndroid Build Coastguard Worker        csel            w0,  w3,  w0,  gt             // best_dir = n
309*c0909341SAndroid Build Coastguard Worker        csel            w1,  w5,  w1,  gt             // best_cost = cost[n]
310*c0909341SAndroid Build Coastguard Worker        add             w3,  w3,  #1                  // n++
311*c0909341SAndroid Build Coastguard Worker.endif
312*c0909341SAndroid Build Coastguard Worker.endm
313*c0909341SAndroid Build Coastguard Worker
314*c0909341SAndroid Build Coastguard Worker// Steps for loading and preparing each row
// dir_load_step1 s1, bpc: load one row of eight pixels from [x0] into \s1
// (eight bytes for 8 bpc, eight halfwords otherwise), then advance x0 by x1.
315*c0909341SAndroid Build Coastguard Worker.macro dir_load_step1 s1, bpc
316*c0909341SAndroid Build Coastguard Worker.if \bpc == 8
317*c0909341SAndroid Build Coastguard Worker        ld1             {\s1\().8b}, [x0], x1
318*c0909341SAndroid Build Coastguard Worker.else
319*c0909341SAndroid Build Coastguard Worker        ld1             {\s1\().8h}, [x0], x1
320*c0909341SAndroid Build Coastguard Worker.endif
321*c0909341SAndroid Build Coastguard Worker.endm
322*c0909341SAndroid Build Coastguard Worker
// dir_load_step2 s1, bpc: second preparation step for a loaded row.
//   8 bpc:     widen the bytes to 16 bits while subtracting v31.8b, which
//              cdef_find_dir sets to 128 in every byte.
//   otherwise: shift every halfword by v8.8h, which cdef_find_dir sets to
//              -bitdepth_min_8, so ushl performs a right shift.
323*c0909341SAndroid Build Coastguard Worker.macro dir_load_step2 s1, bpc
324*c0909341SAndroid Build Coastguard Worker.if \bpc == 8
325*c0909341SAndroid Build Coastguard Worker        usubl           \s1\().8h,  \s1\().8b, v31.8b
326*c0909341SAndroid Build Coastguard Worker.else
327*c0909341SAndroid Build Coastguard Worker        ushl            \s1\().8h,  \s1\().8h, v8.8h
328*c0909341SAndroid Build Coastguard Worker.endif
329*c0909341SAndroid Build Coastguard Worker.endm
330*c0909341SAndroid Build Coastguard Worker
// dir_load_step3 s1, bpc: third preparation step for a loaded row. For any
// \bpc other than 8, subtract v31.8h (128 in every halfword, set by
// cdef_find_dir) from the shifted pixels; the 8 bpc path already did the
// subtraction while widening in dir_load_step2.
331*c0909341SAndroid Build Coastguard Worker.macro dir_load_step3 s1, bpc
332*c0909341SAndroid Build Coastguard Worker// Nothing for \bpc == 8
333*c0909341SAndroid Build Coastguard Worker.if \bpc != 8
334*c0909341SAndroid Build Coastguard Worker        sub             \s1\().8h,  \s1\().8h, v31.8h
335*c0909341SAndroid Build Coastguard Worker.endif
336*c0909341SAndroid Build Coastguard Worker.endm
337*c0909341SAndroid Build Coastguard Worker
338*c0909341SAndroid Build Coastguard Worker// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
339*c0909341SAndroid Build Coastguard Worker//                                   unsigned *const var)
340*c0909341SAndroid Build Coastguard Worker.macro find_dir bpc
341*c0909341SAndroid Build Coastguard Workerfunction cdef_find_dir_\bpc\()bpc_neon, export=1
342*c0909341SAndroid Build Coastguard Worker.if \bpc == 16
343*c0909341SAndroid Build Coastguard Worker        str             d8,  [sp, #-0x10]!
344*c0909341SAndroid Build Coastguard Worker        clz             w3,  w3                       // clz(bitdepth_max)
345*c0909341SAndroid Build Coastguard Worker        sub             w3,  w3,  #24                 // -bitdepth_min_8
346*c0909341SAndroid Build Coastguard Worker        dup             v8.8h,   w3
347*c0909341SAndroid Build Coastguard Worker.endif
348*c0909341SAndroid Build Coastguard Worker        sub             sp,  sp,  #32 // cost
349*c0909341SAndroid Build Coastguard Worker        mov             w3,  #8
350*c0909341SAndroid Build Coastguard Worker.if \bpc == 8
351*c0909341SAndroid Build Coastguard Worker        movi            v31.16b, #128
352*c0909341SAndroid Build Coastguard Worker.else
353*c0909341SAndroid Build Coastguard Worker        movi            v31.8h,  #128
354*c0909341SAndroid Build Coastguard Worker.endif
355*c0909341SAndroid Build Coastguard Worker        movi            v30.16b, #0
356*c0909341SAndroid Build Coastguard Worker        movi            v1.8h,   #0 // v0-v1 sum_diag[0]
357*c0909341SAndroid Build Coastguard Worker        movi            v3.8h,   #0 // v2-v3 sum_diag[1]
358*c0909341SAndroid Build Coastguard Worker        movi            v5.8h,   #0 // v4-v5 sum_hv[0-1]
359*c0909341SAndroid Build Coastguard Worker        movi            v7.8h,   #0 // v6-v7 sum_alt[0]
360*c0909341SAndroid Build Coastguard Worker        dir_load_step1  v26, \bpc       // Setup first row early
361*c0909341SAndroid Build Coastguard Worker        movi            v17.8h,  #0 // v16-v17 sum_alt[1]
362*c0909341SAndroid Build Coastguard Worker        movi            v18.8h,  #0 // v18-v19 sum_alt[2]
363*c0909341SAndroid Build Coastguard Worker        dir_load_step2  v26, \bpc
364*c0909341SAndroid Build Coastguard Worker        movi            v19.8h,  #0
365*c0909341SAndroid Build Coastguard Worker        dir_load_step3  v26, \bpc
366*c0909341SAndroid Build Coastguard Worker        movi            v21.8h,  #0 // v20-v21 sum_alt[3]
367*c0909341SAndroid Build Coastguard Worker
368*c0909341SAndroid Build Coastguard Worker.irpc i, 01234567
369*c0909341SAndroid Build Coastguard Worker        addv            h25,     v26.8h               // [y]
370*c0909341SAndroid Build Coastguard Worker        rev64           v27.8h,  v26.8h
371*c0909341SAndroid Build Coastguard Worker        addp            v28.8h,  v26.8h,  v30.8h      // [(x >> 1)]
372*c0909341SAndroid Build Coastguard Worker        add             v5.8h,   v5.8h,   v26.8h      // sum_hv[1]
373*c0909341SAndroid Build Coastguard Worker        ext             v27.16b, v27.16b, v27.16b, #8 // [-x]
374*c0909341SAndroid Build Coastguard Worker        rev64           v29.4h,  v28.4h               // [-(x >> 1)]
375*c0909341SAndroid Build Coastguard Worker        ins             v4.h[\i], v25.h[0]            // sum_hv[0]
376*c0909341SAndroid Build Coastguard Worker.if \i < 6
377*c0909341SAndroid Build Coastguard Worker        ext             v22.16b, v30.16b, v26.16b, #(16-2*(3-(\i/2)))
378*c0909341SAndroid Build Coastguard Worker        ext             v23.16b, v26.16b, v30.16b, #(16-2*(3-(\i/2)))
379*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v22.8h      // sum_alt[2]
380*c0909341SAndroid Build Coastguard Worker        add             v19.4h,  v19.4h,  v23.4h      // sum_alt[2]
381*c0909341SAndroid Build Coastguard Worker.else
382*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v26.8h      // sum_alt[2]
383*c0909341SAndroid Build Coastguard Worker.endif
384*c0909341SAndroid Build Coastguard Worker.if \i == 0
385*c0909341SAndroid Build Coastguard Worker        mov             v20.16b, v26.16b              // sum_alt[3]
386*c0909341SAndroid Build Coastguard Worker.elseif \i == 1
387*c0909341SAndroid Build Coastguard Worker        add             v20.8h,  v20.8h,  v26.8h      // sum_alt[3]
388*c0909341SAndroid Build Coastguard Worker.else
389*c0909341SAndroid Build Coastguard Worker        ext             v24.16b, v30.16b, v26.16b, #(16-2*(\i/2))
390*c0909341SAndroid Build Coastguard Worker        ext             v25.16b, v26.16b, v30.16b, #(16-2*(\i/2))
391*c0909341SAndroid Build Coastguard Worker        add             v20.8h,  v20.8h,  v24.8h      // sum_alt[3]
392*c0909341SAndroid Build Coastguard Worker        add             v21.4h,  v21.4h,  v25.4h      // sum_alt[3]
393*c0909341SAndroid Build Coastguard Worker.endif
394*c0909341SAndroid Build Coastguard Worker.if \i == 0
395*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v26.16b              // sum_diag[0]
396*c0909341SAndroid Build Coastguard Worker        dir_load_step1  v26, \bpc
397*c0909341SAndroid Build Coastguard Worker        mov             v2.16b,  v27.16b              // sum_diag[1]
398*c0909341SAndroid Build Coastguard Worker        dir_load_step2  v26, \bpc
399*c0909341SAndroid Build Coastguard Worker        mov             v6.16b,  v28.16b              // sum_alt[0]
400*c0909341SAndroid Build Coastguard Worker        dir_load_step3  v26, \bpc
401*c0909341SAndroid Build Coastguard Worker        mov             v16.16b, v29.16b              // sum_alt[1]
402*c0909341SAndroid Build Coastguard Worker.else
403*c0909341SAndroid Build Coastguard Worker        ext             v22.16b, v30.16b, v26.16b, #(16-2*\i)
404*c0909341SAndroid Build Coastguard Worker        ext             v23.16b, v26.16b, v30.16b, #(16-2*\i)
405*c0909341SAndroid Build Coastguard Worker        ext             v24.16b, v30.16b, v27.16b, #(16-2*\i)
406*c0909341SAndroid Build Coastguard Worker        ext             v25.16b, v27.16b, v30.16b, #(16-2*\i)
407*c0909341SAndroid Build Coastguard Worker.if \i != 7 // Nothing to load for the final row
408*c0909341SAndroid Build Coastguard Worker        dir_load_step1  v26, \bpc // Start setting up the next row early.
409*c0909341SAndroid Build Coastguard Worker.endif
410*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v22.8h      // sum_diag[0]
411*c0909341SAndroid Build Coastguard Worker        add             v1.8h,   v1.8h,   v23.8h      // sum_diag[0]
412*c0909341SAndroid Build Coastguard Worker        add             v2.8h,   v2.8h,   v24.8h      // sum_diag[1]
413*c0909341SAndroid Build Coastguard Worker        add             v3.8h,   v3.8h,   v25.8h      // sum_diag[1]
414*c0909341SAndroid Build Coastguard Worker.if \i != 7
415*c0909341SAndroid Build Coastguard Worker        dir_load_step2  v26, \bpc
416*c0909341SAndroid Build Coastguard Worker.endif
417*c0909341SAndroid Build Coastguard Worker        ext             v22.16b, v30.16b, v28.16b, #(16-2*\i)
418*c0909341SAndroid Build Coastguard Worker        ext             v23.16b, v28.16b, v30.16b, #(16-2*\i)
419*c0909341SAndroid Build Coastguard Worker        ext             v24.16b, v30.16b, v29.16b, #(16-2*\i)
420*c0909341SAndroid Build Coastguard Worker        ext             v25.16b, v29.16b, v30.16b, #(16-2*\i)
421*c0909341SAndroid Build Coastguard Worker.if \i != 7
422*c0909341SAndroid Build Coastguard Worker        dir_load_step3  v26, \bpc
423*c0909341SAndroid Build Coastguard Worker.endif
424*c0909341SAndroid Build Coastguard Worker        add             v6.8h,   v6.8h,   v22.8h      // sum_alt[0]
425*c0909341SAndroid Build Coastguard Worker        add             v7.4h,   v7.4h,   v23.4h      // sum_alt[0]
426*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v24.8h      // sum_alt[1]
427*c0909341SAndroid Build Coastguard Worker        add             v17.4h,  v17.4h,  v25.4h      // sum_alt[1]
428*c0909341SAndroid Build Coastguard Worker.endif
429*c0909341SAndroid Build Coastguard Worker.endr
430*c0909341SAndroid Build Coastguard Worker
431*c0909341SAndroid Build Coastguard Worker        movi            v31.4s,  #105
432*c0909341SAndroid Build Coastguard Worker
433*c0909341SAndroid Build Coastguard Worker        smull           v26.4s,  v4.4h,   v4.4h       // sum_hv[0]*sum_hv[0]
434*c0909341SAndroid Build Coastguard Worker        smlal2          v26.4s,  v4.8h,   v4.8h
435*c0909341SAndroid Build Coastguard Worker        smull           v27.4s,  v5.4h,   v5.4h       // sum_hv[1]*sum_hv[1]
436*c0909341SAndroid Build Coastguard Worker        smlal2          v27.4s,  v5.8h,   v5.8h
437*c0909341SAndroid Build Coastguard Worker        mul             v26.4s,  v26.4s,  v31.4s      // cost[2] *= 105
438*c0909341SAndroid Build Coastguard Worker        mul             v27.4s,  v27.4s,  v31.4s      // cost[6] *= 105
439*c0909341SAndroid Build Coastguard Worker        addv            s4,  v26.4s                   // cost[2]
440*c0909341SAndroid Build Coastguard Worker        addv            s5,  v27.4s                   // cost[6]
441*c0909341SAndroid Build Coastguard Worker
442*c0909341SAndroid Build Coastguard Worker        rev64           v1.8h,   v1.8h
443*c0909341SAndroid Build Coastguard Worker        rev64           v3.8h,   v3.8h
444*c0909341SAndroid Build Coastguard Worker        ext             v1.16b,  v1.16b,  v1.16b, #10 // sum_diag[0][14-n]
445*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v3.16b,  v3.16b, #10 // sum_diag[1][14-n]
446*c0909341SAndroid Build Coastguard Worker
447*c0909341SAndroid Build Coastguard Worker        str             s4,  [sp, #2*4]               // cost[2]
448*c0909341SAndroid Build Coastguard Worker        str             s5,  [sp, #6*4]               // cost[6]
449*c0909341SAndroid Build Coastguard Worker
450*c0909341SAndroid Build Coastguard Worker        movrel          x4,  div_table
451*c0909341SAndroid Build Coastguard Worker        ld1             {v31.8h}, [x4]
452*c0909341SAndroid Build Coastguard Worker
453*c0909341SAndroid Build Coastguard Worker        smull           v22.4s,  v0.4h,   v0.4h       // sum_diag[0]*sum_diag[0]
454*c0909341SAndroid Build Coastguard Worker        smull2          v23.4s,  v0.8h,   v0.8h
455*c0909341SAndroid Build Coastguard Worker        smlal           v22.4s,  v1.4h,   v1.4h
456*c0909341SAndroid Build Coastguard Worker        smlal2          v23.4s,  v1.8h,   v1.8h
457*c0909341SAndroid Build Coastguard Worker        smull           v24.4s,  v2.4h,   v2.4h       // sum_diag[1]*sum_diag[1]
458*c0909341SAndroid Build Coastguard Worker        smull2          v25.4s,  v2.8h,   v2.8h
459*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v3.4h,   v3.4h
460*c0909341SAndroid Build Coastguard Worker        smlal2          v25.4s,  v3.8h,   v3.8h
461*c0909341SAndroid Build Coastguard Worker        uxtl            v30.4s,  v31.4h               // div_table
462*c0909341SAndroid Build Coastguard Worker        uxtl2           v31.4s,  v31.8h
463*c0909341SAndroid Build Coastguard Worker        mul             v22.4s,  v22.4s,  v30.4s      // cost[0]
464*c0909341SAndroid Build Coastguard Worker        mla             v22.4s,  v23.4s,  v31.4s      // cost[0]
465*c0909341SAndroid Build Coastguard Worker        mul             v24.4s,  v24.4s,  v30.4s      // cost[4]
466*c0909341SAndroid Build Coastguard Worker        mla             v24.4s,  v25.4s,  v31.4s      // cost[4]
467*c0909341SAndroid Build Coastguard Worker        addv            s0,  v22.4s                   // cost[0]
468*c0909341SAndroid Build Coastguard Worker        addv            s2,  v24.4s                   // cost[4]
469*c0909341SAndroid Build Coastguard Worker
470*c0909341SAndroid Build Coastguard Worker        movrel          x5,  alt_fact
471*c0909341SAndroid Build Coastguard Worker        ld1             {v29.4h, v30.4h, v31.4h}, [x5]// div_table[2*m+1] + 105
472*c0909341SAndroid Build Coastguard Worker
473*c0909341SAndroid Build Coastguard Worker        str             s0,  [sp, #0*4]               // cost[0]
474*c0909341SAndroid Build Coastguard Worker        str             s2,  [sp, #4*4]               // cost[4]
475*c0909341SAndroid Build Coastguard Worker
476*c0909341SAndroid Build Coastguard Worker        uxtl            v29.4s,  v29.4h               // div_table[2*m+1] + 105
477*c0909341SAndroid Build Coastguard Worker        uxtl            v30.4s,  v30.4h
478*c0909341SAndroid Build Coastguard Worker        uxtl            v31.4s,  v31.4h
479*c0909341SAndroid Build Coastguard Worker
480*c0909341SAndroid Build Coastguard Worker        cost_alt        s6,  s16, v6,  v7,  v16, v17  // cost[1], cost[3]
481*c0909341SAndroid Build Coastguard Worker        cost_alt        s18, s20, v18, v19, v20, v21  // cost[5], cost[7]
482*c0909341SAndroid Build Coastguard Worker        str             s6,  [sp, #1*4]               // cost[1]
483*c0909341SAndroid Build Coastguard Worker        str             s16, [sp, #3*4]               // cost[3]
484*c0909341SAndroid Build Coastguard Worker
485*c0909341SAndroid Build Coastguard Worker        mov             w0,  #0                       // best_dir
486*c0909341SAndroid Build Coastguard Worker        mov             w1,  v0.s[0]                  // best_cost
487*c0909341SAndroid Build Coastguard Worker        mov             w3,  #1                       // n
488*c0909341SAndroid Build Coastguard Worker
489*c0909341SAndroid Build Coastguard Worker        str             s18, [sp, #5*4]               // cost[5]
490*c0909341SAndroid Build Coastguard Worker        str             s20, [sp, #7*4]               // cost[7]
491*c0909341SAndroid Build Coastguard Worker
492*c0909341SAndroid Build Coastguard Worker        mov             w4,  v6.s[0]
493*c0909341SAndroid Build Coastguard Worker
494*c0909341SAndroid Build Coastguard Worker        find_best       v6,  v4, v16
495*c0909341SAndroid Build Coastguard Worker        find_best       v16, v2, v18
496*c0909341SAndroid Build Coastguard Worker        find_best       v18, v5, v20
497*c0909341SAndroid Build Coastguard Worker        find_best       v20
498*c0909341SAndroid Build Coastguard Worker
499*c0909341SAndroid Build Coastguard Worker        eor             w3,  w0,  #4                  // best_dir ^4
500*c0909341SAndroid Build Coastguard Worker        ldr             w4,  [sp, w3, uxtw #2]
501*c0909341SAndroid Build Coastguard Worker        sub             w1,  w1,  w4                  // best_cost - cost[best_dir ^ 4]
502*c0909341SAndroid Build Coastguard Worker        lsr             w1,  w1,  #10
503*c0909341SAndroid Build Coastguard Worker        str             w1,  [x2]                     // *var
504*c0909341SAndroid Build Coastguard Worker
505*c0909341SAndroid Build Coastguard Worker        add             sp,  sp,  #32
506*c0909341SAndroid Build Coastguard Worker.if \bpc == 16
507*c0909341SAndroid Build Coastguard Worker        ldr             d8,  [sp], 0x10
508*c0909341SAndroid Build Coastguard Worker.endif
509*c0909341SAndroid Build Coastguard Worker        ret
510*c0909341SAndroid Build Coastguard Workerendfunc
511*c0909341SAndroid Build Coastguard Worker.endm
512