xref: /aosp_15_r20/external/libdav1d/src/arm/64/cdef.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker/*
2*c0909341SAndroid Build Coastguard Worker * Copyright © 2018, VideoLAN and dav1d authors
3*c0909341SAndroid Build Coastguard Worker * Copyright © 2019, Martin Storsjo
4*c0909341SAndroid Build Coastguard Worker * All rights reserved.
5*c0909341SAndroid Build Coastguard Worker *
6*c0909341SAndroid Build Coastguard Worker * Redistribution and use in source and binary forms, with or without
7*c0909341SAndroid Build Coastguard Worker * modification, are permitted provided that the following conditions are met:
8*c0909341SAndroid Build Coastguard Worker *
9*c0909341SAndroid Build Coastguard Worker * 1. Redistributions of source code must retain the above copyright notice, this
10*c0909341SAndroid Build Coastguard Worker *    list of conditions and the following disclaimer.
11*c0909341SAndroid Build Coastguard Worker *
12*c0909341SAndroid Build Coastguard Worker * 2. Redistributions in binary form must reproduce the above copyright notice,
13*c0909341SAndroid Build Coastguard Worker *    this list of conditions and the following disclaimer in the documentation
14*c0909341SAndroid Build Coastguard Worker *    and/or other materials provided with the distribution.
15*c0909341SAndroid Build Coastguard Worker *
16*c0909341SAndroid Build Coastguard Worker * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17*c0909341SAndroid Build Coastguard Worker * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18*c0909341SAndroid Build Coastguard Worker * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19*c0909341SAndroid Build Coastguard Worker * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20*c0909341SAndroid Build Coastguard Worker * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21*c0909341SAndroid Build Coastguard Worker * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22*c0909341SAndroid Build Coastguard Worker * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23*c0909341SAndroid Build Coastguard Worker * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24*c0909341SAndroid Build Coastguard Worker * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25*c0909341SAndroid Build Coastguard Worker * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*c0909341SAndroid Build Coastguard Worker */
27*c0909341SAndroid Build Coastguard Worker
28*c0909341SAndroid Build Coastguard Worker#include "src/arm/asm.S"
29*c0909341SAndroid Build Coastguard Worker#include "util.S"
30*c0909341SAndroid Build Coastguard Worker#include "cdef_tmpl.S"
31*c0909341SAndroid Build Coastguard Worker
// pad_top_bottom: copy two rows of 8-bit pixels into the 16-bit buffer at x0.
//
// Each row is zero-extended to 16 bit with uxtl and stored together with two
// extra columns on either side. Columns whose edge flag is clear in w7 are
// filled from s31 instead of being read from memory. s31 is not set up here;
// the invoking code must have loaded the fill pattern into v31 beforehand.
//
// Parameters:
//   s1, s2  general registers pointing at the first and second source row;
//           both are moved back by 2 bytes when CDEF_HAVE_LEFT is set
//   w       block width in pixels, used as the byte offset of the second load
//   stride  row pitch of the destination in 16-bit elements
//           (x0 advances by 2*stride bytes per row)
//   rn      register-size prefix used to load the first part of a row
//   rw      register-size prefix used to store that part once widened
//   ret     non-zero: return to the caller after the second row;
//           zero: step x0 past the second row and continue at the local
//           label 3 that ends the macro
//
// Reads w7 (bit 0 = CDEF_HAVE_LEFT, bit 1 = CDEF_HAVE_RIGHT).
// Clobbers v0-v3, the condition flags, x0 and, with CDEF_HAVE_LEFT, \s1/\s2.
//
// Note: a plain s1/h1/d1 below is the SIMD register v1, while \s1 with the
// backslash is the macro argument holding the row pointer.
32*c0909341SAndroid Build Coastguard Worker.macro pad_top_bottom s1, s2, w, stride, rn, rw, ret
33*c0909341SAndroid Build Coastguard Worker        tst             w7,  #1 // CDEF_HAVE_LEFT
34*c0909341SAndroid Build Coastguard Worker        b.eq            2f
35*c0909341SAndroid Build Coastguard Worker        // CDEF_HAVE_LEFT
        // Start reading 2 bytes early so the first load also picks up the
        // two columns to the left of the block.
36*c0909341SAndroid Build Coastguard Worker        sub             \s1,  \s1,  #2
37*c0909341SAndroid Build Coastguard Worker        sub             \s2,  \s2,  #2
38*c0909341SAndroid Build Coastguard Worker        tst             w7,  #2 // CDEF_HAVE_RIGHT
39*c0909341SAndroid Build Coastguard Worker        b.eq            1f
40*c0909341SAndroid Build Coastguard Worker        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
        // The 4-byte tail load at offset \w covers the rest of the row
        // including the two columns to the right of the block.
41*c0909341SAndroid Build Coastguard Worker        ldr             \rn\()0, [\s1]
42*c0909341SAndroid Build Coastguard Worker        ldr             s1,      [\s1, #\w]
43*c0909341SAndroid Build Coastguard Worker        ldr             \rn\()2, [\s2]
44*c0909341SAndroid Build Coastguard Worker        ldr             s3,      [\s2, #\w]
45*c0909341SAndroid Build Coastguard Worker        uxtl            v0.8h,   v0.8b
46*c0909341SAndroid Build Coastguard Worker        uxtl            v1.8h,   v1.8b
47*c0909341SAndroid Build Coastguard Worker        uxtl            v2.8h,   v2.8b
48*c0909341SAndroid Build Coastguard Worker        uxtl            v3.8h,   v3.8b
49*c0909341SAndroid Build Coastguard Worker        str             \rw\()0, [x0]
50*c0909341SAndroid Build Coastguard Worker        str             d1,      [x0, #2*\w]
51*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #2*\stride
52*c0909341SAndroid Build Coastguard Worker        str             \rw\()2, [x0]
53*c0909341SAndroid Build Coastguard Worker        str             d3,      [x0, #2*\w]
54*c0909341SAndroid Build Coastguard Worker.if \ret
55*c0909341SAndroid Build Coastguard Worker        ret
56*c0909341SAndroid Build Coastguard Worker.else
57*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #2*\stride
58*c0909341SAndroid Build Coastguard Worker        b               3f
59*c0909341SAndroid Build Coastguard Worker.endif
60*c0909341SAndroid Build Coastguard Worker
61*c0909341SAndroid Build Coastguard Worker1:
62*c0909341SAndroid Build Coastguard Worker        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        // Only 2 bytes are read at offset \w; the two columns to the right
        // are written from s31 instead.
63*c0909341SAndroid Build Coastguard Worker        ldr             \rn\()0, [\s1]
64*c0909341SAndroid Build Coastguard Worker        ldr             h1,      [\s1, #\w]
65*c0909341SAndroid Build Coastguard Worker        ldr             \rn\()2, [\s2]
66*c0909341SAndroid Build Coastguard Worker        ldr             h3,      [\s2, #\w]
67*c0909341SAndroid Build Coastguard Worker        uxtl            v0.8h,   v0.8b
68*c0909341SAndroid Build Coastguard Worker        uxtl            v1.8h,   v1.8b
69*c0909341SAndroid Build Coastguard Worker        uxtl            v2.8h,   v2.8b
70*c0909341SAndroid Build Coastguard Worker        uxtl            v3.8h,   v3.8b
71*c0909341SAndroid Build Coastguard Worker        str             \rw\()0, [x0]
72*c0909341SAndroid Build Coastguard Worker        str             s1,      [x0, #2*\w]
73*c0909341SAndroid Build Coastguard Worker        str             s31,     [x0, #2*\w+4]
74*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #2*\stride
75*c0909341SAndroid Build Coastguard Worker        str             \rw\()2, [x0]
76*c0909341SAndroid Build Coastguard Worker        str             s3,      [x0, #2*\w]
77*c0909341SAndroid Build Coastguard Worker        str             s31,     [x0, #2*\w+4]
78*c0909341SAndroid Build Coastguard Worker.if \ret
79*c0909341SAndroid Build Coastguard Worker        ret
80*c0909341SAndroid Build Coastguard Worker.else
81*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #2*\stride
82*c0909341SAndroid Build Coastguard Worker        b               3f
83*c0909341SAndroid Build Coastguard Worker.endif
84*c0909341SAndroid Build Coastguard Worker
85*c0909341SAndroid Build Coastguard Worker2:
86*c0909341SAndroid Build Coastguard Worker        // !CDEF_HAVE_LEFT
        // The row pointers stay where they are: loads begin at the first
        // pixel of the block and the two left columns are written from s31.
87*c0909341SAndroid Build Coastguard Worker        tst             w7,  #2 // CDEF_HAVE_RIGHT
88*c0909341SAndroid Build Coastguard Worker        b.eq            1f
89*c0909341SAndroid Build Coastguard Worker        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
90*c0909341SAndroid Build Coastguard Worker        ldr             \rn\()0, [\s1]
91*c0909341SAndroid Build Coastguard Worker        ldr             h1,      [\s1, #\w]
92*c0909341SAndroid Build Coastguard Worker        ldr             \rn\()2, [\s2]
93*c0909341SAndroid Build Coastguard Worker        ldr             h3,      [\s2, #\w]
94*c0909341SAndroid Build Coastguard Worker        uxtl            v0.8h,  v0.8b
95*c0909341SAndroid Build Coastguard Worker        uxtl            v1.8h,  v1.8b
96*c0909341SAndroid Build Coastguard Worker        uxtl            v2.8h,  v2.8b
97*c0909341SAndroid Build Coastguard Worker        uxtl            v3.8h,  v3.8b
98*c0909341SAndroid Build Coastguard Worker        str             s31, [x0]
99*c0909341SAndroid Build Coastguard Worker        stur            \rw\()0, [x0, #4]
100*c0909341SAndroid Build Coastguard Worker        str             s1,      [x0, #4+2*\w]
101*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #2*\stride
102*c0909341SAndroid Build Coastguard Worker        str             s31, [x0]
103*c0909341SAndroid Build Coastguard Worker        stur            \rw\()2, [x0, #4]
104*c0909341SAndroid Build Coastguard Worker        str             s3,      [x0, #4+2*\w]
105*c0909341SAndroid Build Coastguard Worker.if \ret
106*c0909341SAndroid Build Coastguard Worker        ret
107*c0909341SAndroid Build Coastguard Worker.else
108*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #2*\stride
109*c0909341SAndroid Build Coastguard Worker        b               3f
110*c0909341SAndroid Build Coastguard Worker.endif
111*c0909341SAndroid Build Coastguard Worker
112*c0909341SAndroid Build Coastguard Worker1:
113*c0909341SAndroid Build Coastguard Worker        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
        // Only the block columns are read; both side borders come from s31.
        // This variant keeps the second row in v1 rather than v2.
114*c0909341SAndroid Build Coastguard Worker        ldr             \rn\()0, [\s1]
115*c0909341SAndroid Build Coastguard Worker        ldr             \rn\()1, [\s2]
116*c0909341SAndroid Build Coastguard Worker        uxtl            v0.8h,  v0.8b
117*c0909341SAndroid Build Coastguard Worker        uxtl            v1.8h,  v1.8b
118*c0909341SAndroid Build Coastguard Worker        str             s31,     [x0]
119*c0909341SAndroid Build Coastguard Worker        stur            \rw\()0, [x0, #4]
120*c0909341SAndroid Build Coastguard Worker        str             s31,     [x0, #4+2*\w]
121*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #2*\stride
122*c0909341SAndroid Build Coastguard Worker        str             s31,     [x0]
123*c0909341SAndroid Build Coastguard Worker        stur            \rw\()1, [x0, #4]
124*c0909341SAndroid Build Coastguard Worker        str             s31,     [x0, #4+2*\w]
125*c0909341SAndroid Build Coastguard Worker.if \ret
126*c0909341SAndroid Build Coastguard Worker        ret
127*c0909341SAndroid Build Coastguard Worker.else
128*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #2*\stride
129*c0909341SAndroid Build Coastguard Worker.endif
        // Common exit for the ret == 0 expansions; with ret != 0 every
        // variant has already returned and nothing branches here.
130*c0909341SAndroid Build Coastguard Worker3:
131*c0909341SAndroid Build Coastguard Worker.endm
132*c0909341SAndroid Build Coastguard Worker
// load_n_incr: load one row of pixels and step the source pointer.
//   dst   vector register name without arrangement suffix (e.g. v1)
//   src   general register holding the address; post-incremented by \incr
//   incr  post-index increment (the invocations in this file pass x2)
//   w     4 loads 4 bytes into lane s[0] only; any other value loads
//         8 bytes into the low half of \dst
133*c0909341SAndroid Build Coastguard Worker.macro load_n_incr dst, src, incr, w
134*c0909341SAndroid Build Coastguard Worker.if \w == 4
135*c0909341SAndroid Build Coastguard Worker        ld1             {\dst\().s}[0], [\src], \incr
136*c0909341SAndroid Build Coastguard Worker.else
137*c0909341SAndroid Build Coastguard Worker        ld1             {\dst\().8b},   [\src], \incr
138*c0909341SAndroid Build Coastguard Worker.endif
139*c0909341SAndroid Build Coastguard Worker.endm
140*c0909341SAndroid Build Coastguard Worker
141*c0909341SAndroid Build Coastguard Worker// void dav1d_cdef_paddingX_8bpc_neon(uint16_t *tmp, const pixel *src,
142*c0909341SAndroid Build Coastguard Worker//                                    ptrdiff_t src_stride, const pixel (*left)[2],
143*c0909341SAndroid Build Coastguard Worker//                                    const pixel *const top,
144*c0909341SAndroid Build Coastguard Worker//                                    const pixel *const bottom, int h,
145*c0909341SAndroid Build Coastguard Worker//                                    enum CdefEdgeFlags edges);
146*c0909341SAndroid Build Coastguard Worker
// padding_func: emit cdef_padding\w\()_8bpc_neon, which builds the 16-bit
// working copy of a block plus a border of two rows and two columns.
//
// Macro parameters:
//   w       block width in pixels
//   stride  pitch of tmp in 16-bit elements
//   rn, rw  register-size prefixes handed to pad_top_bottom; rw is also
//           used to store the widened block row in the middle section
//
// Register use, following the prototype in the comment preceding this macro:
//   x0 tmp, x1 src, x2 src_stride, x3 left, x4 top, x5 bottom,
//   w6 h (row counter, counted down), w7 edges
//   (1 = left, 2 = right, 4 = top, 8 = bottom)
// Scratch: x9, v0-v3, v30, v31, condition flags.
//
// Border positions whose edge flag is clear are filled with 0x8000.
// The row loops test the counter after the body, so each runs at least once.
147*c0909341SAndroid Build Coastguard Worker.macro padding_func w, stride, rn, rw
148*c0909341SAndroid Build Coastguard Workerfunction cdef_padding\w\()_8bpc_neon, export=1
        // With all four edges available, hand over to the edged variant
        // (a plain branch, so that function returns to our caller).
149*c0909341SAndroid Build Coastguard Worker        cmp             w7,  #0xf // fully edged
150*c0909341SAndroid Build Coastguard Worker        b.eq            cdef_padding\w\()_edged_8bpc_neon
        // v30 = v31 = 0x8000 in every 16-bit lane: the fill value.
151*c0909341SAndroid Build Coastguard Worker        movi            v30.8h,  #0x80, lsl #8
152*c0909341SAndroid Build Coastguard Worker        mov             v31.16b, v30.16b
        // Step tmp back by two rows and two elements (2 bytes each).
153*c0909341SAndroid Build Coastguard Worker        sub             x0,  x0,  #2*(2*\stride+2)
154*c0909341SAndroid Build Coastguard Worker        tst             w7,  #4 // CDEF_HAVE_TOP
155*c0909341SAndroid Build Coastguard Worker        b.ne            1f
156*c0909341SAndroid Build Coastguard Worker        // !CDEF_HAVE_TOP
        // Fill two whole rows: 32 bytes, or 64 bytes when \w == 8.
157*c0909341SAndroid Build Coastguard Worker        st1             {v30.8h, v31.8h}, [x0], #32
158*c0909341SAndroid Build Coastguard Worker.if \w == 8
159*c0909341SAndroid Build Coastguard Worker        st1             {v30.8h, v31.8h}, [x0], #32
160*c0909341SAndroid Build Coastguard Worker.endif
161*c0909341SAndroid Build Coastguard Worker        b               3f
162*c0909341SAndroid Build Coastguard Worker1:
163*c0909341SAndroid Build Coastguard Worker        // CDEF_HAVE_TOP
        // x9 = second top row; ret = 0 so execution continues below.
164*c0909341SAndroid Build Coastguard Worker        add             x9,  x4,  x2
165*c0909341SAndroid Build Coastguard Worker        pad_top_bottom  x4,  x9, \w, \stride, \rn, \rw, 0
166*c0909341SAndroid Build Coastguard Worker
167*c0909341SAndroid Build Coastguard Worker        // Middle section
        // One iteration per block row: two left pixels come from x3, the
        // row itself from x1, and two right pixels from x1 + \w.
168*c0909341SAndroid Build Coastguard Worker3:
169*c0909341SAndroid Build Coastguard Worker        tst             w7,  #1 // CDEF_HAVE_LEFT
170*c0909341SAndroid Build Coastguard Worker        b.eq            2f
171*c0909341SAndroid Build Coastguard Worker        // CDEF_HAVE_LEFT
172*c0909341SAndroid Build Coastguard Worker        tst             w7,  #2 // CDEF_HAVE_RIGHT
173*c0909341SAndroid Build Coastguard Worker        b.eq            1f
174*c0909341SAndroid Build Coastguard Worker        // CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
175*c0909341SAndroid Build Coastguard Worker0:
176*c0909341SAndroid Build Coastguard Worker        ld1             {v0.h}[0], [x3], #2
        // The right-hand pixels must be read before load_n_incr moves x1
        // on to the next row.
177*c0909341SAndroid Build Coastguard Worker        ldr             h2,      [x1, #\w]
178*c0909341SAndroid Build Coastguard Worker        load_n_incr     v1,  x1,  x2,  \w
        // subs sets the flags consumed by b.gt at the end of the loop; the
        // instructions in between do not alter them.
179*c0909341SAndroid Build Coastguard Worker        subs            w6,  w6,  #1
180*c0909341SAndroid Build Coastguard Worker        uxtl            v0.8h,  v0.8b
181*c0909341SAndroid Build Coastguard Worker        uxtl            v1.8h,  v1.8b
182*c0909341SAndroid Build Coastguard Worker        uxtl            v2.8h,  v2.8b
183*c0909341SAndroid Build Coastguard Worker        str             s0,      [x0]
184*c0909341SAndroid Build Coastguard Worker        stur            \rw\()1, [x0, #4]
185*c0909341SAndroid Build Coastguard Worker        str             s2,      [x0, #4+2*\w]
186*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #2*\stride
187*c0909341SAndroid Build Coastguard Worker        b.gt            0b
188*c0909341SAndroid Build Coastguard Worker        b               3f
189*c0909341SAndroid Build Coastguard Worker1:
190*c0909341SAndroid Build Coastguard Worker        // CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
191*c0909341SAndroid Build Coastguard Worker        ld1             {v0.h}[0], [x3], #2
192*c0909341SAndroid Build Coastguard Worker        load_n_incr     v1,  x1,  x2,  \w
193*c0909341SAndroid Build Coastguard Worker        subs            w6,  w6,  #1
194*c0909341SAndroid Build Coastguard Worker        uxtl            v0.8h,  v0.8b
195*c0909341SAndroid Build Coastguard Worker        uxtl            v1.8h,  v1.8b
196*c0909341SAndroid Build Coastguard Worker        str             s0,      [x0]
197*c0909341SAndroid Build Coastguard Worker        stur            \rw\()1, [x0, #4]
198*c0909341SAndroid Build Coastguard Worker        str             s31,     [x0, #4+2*\w]
199*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #2*\stride
200*c0909341SAndroid Build Coastguard Worker        b.gt            1b
201*c0909341SAndroid Build Coastguard Worker        b               3f
202*c0909341SAndroid Build Coastguard Worker2:
        // !CDEF_HAVE_LEFT: x3 is not read, the left columns come from s31.
203*c0909341SAndroid Build Coastguard Worker        tst             w7,  #2 // CDEF_HAVE_RIGHT
204*c0909341SAndroid Build Coastguard Worker        b.eq            1f
205*c0909341SAndroid Build Coastguard Worker        // !CDEF_HAVE_LEFT+CDEF_HAVE_RIGHT
206*c0909341SAndroid Build Coastguard Worker0:
207*c0909341SAndroid Build Coastguard Worker        ldr             h1,      [x1, #\w]
208*c0909341SAndroid Build Coastguard Worker        load_n_incr     v0,  x1,  x2,  \w
209*c0909341SAndroid Build Coastguard Worker        subs            w6,  w6,  #1
210*c0909341SAndroid Build Coastguard Worker        uxtl            v0.8h,  v0.8b
211*c0909341SAndroid Build Coastguard Worker        uxtl            v1.8h,  v1.8b
212*c0909341SAndroid Build Coastguard Worker        str             s31,     [x0]
213*c0909341SAndroid Build Coastguard Worker        stur            \rw\()0, [x0, #4]
214*c0909341SAndroid Build Coastguard Worker        str             s1,      [x0, #4+2*\w]
215*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #2*\stride
216*c0909341SAndroid Build Coastguard Worker        b.gt            0b
217*c0909341SAndroid Build Coastguard Worker        b               3f
218*c0909341SAndroid Build Coastguard Worker1:
219*c0909341SAndroid Build Coastguard Worker        // !CDEF_HAVE_LEFT+!CDEF_HAVE_RIGHT
220*c0909341SAndroid Build Coastguard Worker        load_n_incr     v0,  x1,  x2,  \w
221*c0909341SAndroid Build Coastguard Worker        subs            w6,  w6,  #1
222*c0909341SAndroid Build Coastguard Worker        uxtl            v0.8h,  v0.8b
223*c0909341SAndroid Build Coastguard Worker        str             s31,     [x0]
224*c0909341SAndroid Build Coastguard Worker        stur            \rw\()0, [x0, #4]
225*c0909341SAndroid Build Coastguard Worker        str             s31,     [x0, #4+2*\w]
226*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #2*\stride
227*c0909341SAndroid Build Coastguard Worker        b.gt            1b
228*c0909341SAndroid Build Coastguard Worker
        // Bottom border: same scheme as the top, then return.
229*c0909341SAndroid Build Coastguard Worker3:
230*c0909341SAndroid Build Coastguard Worker        tst             w7,  #8 // CDEF_HAVE_BOTTOM
231*c0909341SAndroid Build Coastguard Worker        b.ne            1f
232*c0909341SAndroid Build Coastguard Worker        // !CDEF_HAVE_BOTTOM
233*c0909341SAndroid Build Coastguard Worker        st1             {v30.8h, v31.8h}, [x0], #32
234*c0909341SAndroid Build Coastguard Worker.if \w == 8
235*c0909341SAndroid Build Coastguard Worker        st1             {v30.8h, v31.8h}, [x0], #32
236*c0909341SAndroid Build Coastguard Worker.endif
237*c0909341SAndroid Build Coastguard Worker        ret
238*c0909341SAndroid Build Coastguard Worker1:
239*c0909341SAndroid Build Coastguard Worker        // CDEF_HAVE_BOTTOM
        // ret = 1: every variant inside the macro ends in ret, so nothing
        // falls through to endfunc.
240*c0909341SAndroid Build Coastguard Worker        add             x9,  x5,  x2
241*c0909341SAndroid Build Coastguard Worker        pad_top_bottom  x5,  x9, \w, \stride, \rn, \rw, 1
242*c0909341SAndroid Build Coastguard Workerendfunc
243*c0909341SAndroid Build Coastguard Worker.endm
244*c0909341SAndroid Build Coastguard Worker
// Instantiate the padding function for both block widths.
// Arguments: width, tmp pitch in 16-bit elements, load prefix, store prefix.
//   width 8: pitch 16, rows loaded through d registers, stored through q
//   width 4: pitch 8,  rows loaded through s registers, stored through d
245*c0909341SAndroid Build Coastguard Workerpadding_func 8, 16, d, q
246*c0909341SAndroid Build Coastguard Workerpadding_func 4, 8,  s, d
247*c0909341SAndroid Build Coastguard Worker
248*c0909341SAndroid Build Coastguard Worker// void cdef_paddingX_edged_8bpc_neon(uint8_t *tmp, const pixel *src,
249*c0909341SAndroid Build Coastguard Worker//                                    ptrdiff_t src_stride, const pixel (*left)[2],
250*c0909341SAndroid Build Coastguard Worker//                                    const pixel *const top,
251*c0909341SAndroid Build Coastguard Worker//                                    const pixel *const bottom, int h,
252*c0909341SAndroid Build Coastguard Worker//                                    enum CdefEdgeFlags edges);
253*c0909341SAndroid Build Coastguard Worker
// padding_func_edged: emit cdef_padding\w\()_edged_8bpc_neon, the variant
// that copies the block and its border as 8-bit pixels without widening.
//
// Macro parameters:
//   w       block width in pixels
//   stride  pitch of tmp in bytes
//   reg     register-size prefix used to store one block row
//
// Register use, following the prototype in the comment preceding this macro:
//   x0 tmp, x1 src, x2 src_stride, x3 left, x4 top, x5 bottom,
//   w6 h (row counter, counted down)
// Scratch: x9 (only when \w != 4), v0-v3, condition flags.
//
// No edge flag is tested here: every border pixel is read from memory, so
// all four neighbours must be readable. cdef_padding\w\()_8bpc_neon
// branches here when w7 == 0xf.
254*c0909341SAndroid Build Coastguard Worker.macro padding_func_edged w, stride, reg
255*c0909341SAndroid Build Coastguard Workerfunction cdef_padding\w\()_edged_8bpc_neon, export=1
        // Back up top/bottom by two pixels so the loads include the left
        // border; back up tmp by two rows and two bytes.
256*c0909341SAndroid Build Coastguard Worker        sub             x4,  x4,  #2
257*c0909341SAndroid Build Coastguard Worker        sub             x5,  x5,  #2
258*c0909341SAndroid Build Coastguard Worker        sub             x0,  x0,  #(2*\stride+2)
259*c0909341SAndroid Build Coastguard Worker
        // Top border, two rows.
260*c0909341SAndroid Build Coastguard Worker.if \w == 4
        // 8 bytes per row; a single 16-byte store writes both rows back to
        // back, which matches the pitch of 8 used for the width 4 instance.
261*c0909341SAndroid Build Coastguard Worker        ldr             d0, [x4]
262*c0909341SAndroid Build Coastguard Worker        ldr             d1, [x4, x2]
263*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b, v1.8b}, [x0], #16
264*c0909341SAndroid Build Coastguard Worker.else
        // 12 bytes per row, moved as an 8-byte and a 4-byte piece.
265*c0909341SAndroid Build Coastguard Worker        add             x9,  x4,  x2
266*c0909341SAndroid Build Coastguard Worker        ldr             d0, [x4]
267*c0909341SAndroid Build Coastguard Worker        ldr             s1, [x4, #8]
268*c0909341SAndroid Build Coastguard Worker        ldr             d2, [x9]
269*c0909341SAndroid Build Coastguard Worker        ldr             s3, [x9, #8]
270*c0909341SAndroid Build Coastguard Worker        str             d0, [x0]
271*c0909341SAndroid Build Coastguard Worker        str             s1, [x0, #8]
272*c0909341SAndroid Build Coastguard Worker        str             d2, [x0, #\stride]
273*c0909341SAndroid Build Coastguard Worker        str             s3, [x0, #\stride+8]
274*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #2*\stride
275*c0909341SAndroid Build Coastguard Worker.endif
276*c0909341SAndroid Build Coastguard Worker
        // Block rows: two left pixels from x3, the row from x1, two right
        // pixels from x1 + \w. The loop body runs at least once.
277*c0909341SAndroid Build Coastguard Worker0:
278*c0909341SAndroid Build Coastguard Worker        ld1             {v0.h}[0], [x3], #2
        // Read the right-hand pixels before load_n_incr advances x1.
279*c0909341SAndroid Build Coastguard Worker        ldr             h2,      [x1, #\w]
280*c0909341SAndroid Build Coastguard Worker        load_n_incr     v1,  x1,  x2,  \w
281*c0909341SAndroid Build Coastguard Worker        subs            w6,  w6,  #1
282*c0909341SAndroid Build Coastguard Worker        str             h0,      [x0]
283*c0909341SAndroid Build Coastguard Worker        stur            \reg\()1, [x0, #2]
284*c0909341SAndroid Build Coastguard Worker        str             h2,      [x0, #2+\w]
285*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #\stride
286*c0909341SAndroid Build Coastguard Worker        b.gt            0b
287*c0909341SAndroid Build Coastguard Worker
        // Bottom border, two rows, same layout as the top border.
288*c0909341SAndroid Build Coastguard Worker.if \w == 4
289*c0909341SAndroid Build Coastguard Worker        ldr             d0, [x5]
290*c0909341SAndroid Build Coastguard Worker        ldr             d1, [x5, x2]
291*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b, v1.8b}, [x0], #16
292*c0909341SAndroid Build Coastguard Worker.else
293*c0909341SAndroid Build Coastguard Worker        add             x9,  x5,  x2
294*c0909341SAndroid Build Coastguard Worker        ldr             d0, [x5]
295*c0909341SAndroid Build Coastguard Worker        ldr             s1, [x5, #8]
296*c0909341SAndroid Build Coastguard Worker        ldr             d2, [x9]
297*c0909341SAndroid Build Coastguard Worker        ldr             s3, [x9, #8]
298*c0909341SAndroid Build Coastguard Worker        str             d0, [x0]
299*c0909341SAndroid Build Coastguard Worker        str             s1, [x0, #8]
300*c0909341SAndroid Build Coastguard Worker        str             d2, [x0, #\stride]
301*c0909341SAndroid Build Coastguard Worker        str             s3, [x0, #\stride+8]
302*c0909341SAndroid Build Coastguard Worker.endif
303*c0909341SAndroid Build Coastguard Worker        ret
304*c0909341SAndroid Build Coastguard Workerendfunc
305*c0909341SAndroid Build Coastguard Worker.endm
306*c0909341SAndroid Build Coastguard Worker
// Instantiate the edged padding function for both block widths.
// Arguments: width, tmp pitch in bytes, store prefix for one block row.
//   width 8: pitch 16, block row stored through a d register (8 bytes)
//   width 4: pitch 8,  block row stored through an s register (4 bytes)
307*c0909341SAndroid Build Coastguard Workerpadding_func_edged 8, 16, d
308*c0909341SAndroid Build Coastguard Workerpadding_func_edged 4, 8,  s
309*c0909341SAndroid Build Coastguard Worker
// Invocations of macros that are not defined in the visible part of this
// file; presumably they come from the included cdef_tmpl.S (TODO confirm).
// NOTE(review): filter_func_8 further down references pri_taps and
// directions\w, which are presumably emitted by "tables" - verify.
310*c0909341SAndroid Build Coastguard Workertables
311*c0909341SAndroid Build Coastguard Worker
312*c0909341SAndroid Build Coastguard Workerfilter 8, 8
313*c0909341SAndroid Build Coastguard Workerfilter 4, 8
314*c0909341SAndroid Build Coastguard Worker
315*c0909341SAndroid Build Coastguard Workerfind_dir 8
316*c0909341SAndroid Build Coastguard Worker
// load_px_8: gather the two neighbour pixel sets for one filter tap from
// the 8-bit tmp buffer.
//   d1  vector register name receiving the pixels at x2 + off (p0)
//   d2  vector register name receiving the pixels at x2 - off (p1)
//   w   8: two rows of 8 bytes, 16 bytes apart, fill the two d lanes;
//       otherwise: four rows of 4 bytes, 8 bytes apart, fill the four
//       s lanes
// Inputs:   x2 = base address, low byte of w9 = signed byte offset.
// Clobbers: x6, x9 (so the offset in w9 is gone afterwards).
317*c0909341SAndroid Build Coastguard Worker.macro load_px_8 d1, d2, w
318*c0909341SAndroid Build Coastguard Worker.if \w == 8
319*c0909341SAndroid Build Coastguard Worker        add             x6,  x2,  w9, sxtb          // x + off
320*c0909341SAndroid Build Coastguard Worker        sub             x9,  x2,  w9, sxtb          // x - off
321*c0909341SAndroid Build Coastguard Worker        ld1             {\d1\().d}[0], [x6]         // p0
322*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  #16               // += stride
323*c0909341SAndroid Build Coastguard Worker        ld1             {\d2\().d}[0], [x9]         // p1
324*c0909341SAndroid Build Coastguard Worker        add             x9,  x9,  #16               // += stride
325*c0909341SAndroid Build Coastguard Worker        ld1             {\d1\().d}[1], [x6]         // p0
326*c0909341SAndroid Build Coastguard Worker        ld1             {\d2\().d}[1], [x9]         // p1
327*c0909341SAndroid Build Coastguard Worker.else
328*c0909341SAndroid Build Coastguard Worker        add             x6,  x2,  w9, sxtb          // x + off
329*c0909341SAndroid Build Coastguard Worker        sub             x9,  x2,  w9, sxtb          // x - off
330*c0909341SAndroid Build Coastguard Worker        ld1             {\d1\().s}[0], [x6]         // p0
331*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  #8                // += stride
332*c0909341SAndroid Build Coastguard Worker        ld1             {\d2\().s}[0], [x9]         // p1
333*c0909341SAndroid Build Coastguard Worker        add             x9,  x9,  #8                // += stride
334*c0909341SAndroid Build Coastguard Worker        ld1             {\d1\().s}[1], [x6]         // p0
335*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  #8                // += stride
336*c0909341SAndroid Build Coastguard Worker        ld1             {\d2\().s}[1], [x9]         // p1
337*c0909341SAndroid Build Coastguard Worker        add             x9,  x9,  #8                // += stride
338*c0909341SAndroid Build Coastguard Worker        ld1             {\d1\().s}[2], [x6]         // p0
339*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  #8                // += stride
340*c0909341SAndroid Build Coastguard Worker        ld1             {\d2\().s}[2], [x9]         // p1
341*c0909341SAndroid Build Coastguard Worker        add             x9,  x9,  #8                // += stride
342*c0909341SAndroid Build Coastguard Worker        ld1             {\d1\().s}[3], [x6]         // p0
343*c0909341SAndroid Build Coastguard Worker        ld1             {\d2\().s}[3], [x9]         // p1
344*c0909341SAndroid Build Coastguard Worker.endif
345*c0909341SAndroid Build Coastguard Worker.endm
// handle_pixel_8: add the contribution of one tap (two neighbour pixel
// sets) to the running 8-bit sums, working on 16 pixels at a time.
//   s1, s2      vector register names holding the neighbours p0 and p1
//   thresh_vec  vector operand with the threshold in every byte
//   shift       vector operand with the per-byte shift count for ushl; the
//               in-line comments treat it as a right shift, which ushl
//               performs for negative counts
//   tap         source operand for dup: the weight taps[k]
//   min         non-zero: also fold p0/p1 into the running minimum in v3
//               and the running maximum in v4
// Inputs:   v0 = centre pixels px.
// Updates:  v1 (sum over p0 terms), v2 (sum over p1 terms); v3/v4 if \min.
// Clobbers: v16-v22.
// The p0 and p1 computations are independent and interleaved line by line.
346*c0909341SAndroid Build Coastguard Worker.macro handle_pixel_8 s1, s2, thresh_vec, shift, tap, min
347*c0909341SAndroid Build Coastguard Worker.if \min
348*c0909341SAndroid Build Coastguard Worker        umin            v3.16b,  v3.16b,  \s1\().16b
349*c0909341SAndroid Build Coastguard Worker        umax            v4.16b,  v4.16b,  \s1\().16b
350*c0909341SAndroid Build Coastguard Worker        umin            v3.16b,  v3.16b,  \s2\().16b
351*c0909341SAndroid Build Coastguard Worker        umax            v4.16b,  v4.16b,  \s2\().16b
352*c0909341SAndroid Build Coastguard Worker.endif
353*c0909341SAndroid Build Coastguard Worker        uabd            v16.16b, v0.16b,  \s1\().16b  // abs(diff)
354*c0909341SAndroid Build Coastguard Worker        uabd            v20.16b, v0.16b,  \s2\().16b  // abs(diff)
355*c0909341SAndroid Build Coastguard Worker        ushl            v17.16b, v16.16b, \shift      // abs(diff) >> shift
356*c0909341SAndroid Build Coastguard Worker        ushl            v21.16b, v20.16b, \shift      // abs(diff) >> shift
357*c0909341SAndroid Build Coastguard Worker        uqsub           v17.16b, \thresh_vec, v17.16b // clip = imax(0, threshold - (abs(diff) >> shift))
358*c0909341SAndroid Build Coastguard Worker        uqsub           v21.16b, \thresh_vec, v21.16b // clip = imax(0, threshold - (abs(diff) >> shift))
359*c0909341SAndroid Build Coastguard Worker        cmhi            v18.16b, v0.16b,  \s1\().16b  // px > p0
360*c0909341SAndroid Build Coastguard Worker        cmhi            v22.16b, v0.16b,  \s2\().16b  // px > p1
361*c0909341SAndroid Build Coastguard Worker        umin            v17.16b, v17.16b, v16.16b     // imin(abs(diff), clip)
362*c0909341SAndroid Build Coastguard Worker        umin            v21.16b, v21.16b, v20.16b     // imin(abs(diff), clip)
363*c0909341SAndroid Build Coastguard Worker        dup             v19.16b, \tap                 // taps[k]
364*c0909341SAndroid Build Coastguard Worker        neg             v16.16b, v17.16b              // -imin()
365*c0909341SAndroid Build Coastguard Worker        neg             v20.16b, v21.16b              // -imin()
        // v18/v22 are all-ones where px > p, so bsl selects the negated
        // magnitude there and the positive magnitude everywhere else.
366*c0909341SAndroid Build Coastguard Worker        bsl             v18.16b, v16.16b, v17.16b     // constrain() = apply_sign()
367*c0909341SAndroid Build Coastguard Worker        bsl             v22.16b, v20.16b, v21.16b     // constrain() = apply_sign()
368*c0909341SAndroid Build Coastguard Worker        mla             v1.16b,  v18.16b, v19.16b     // sum += taps[k] * constrain()
369*c0909341SAndroid Build Coastguard Worker        mla             v2.16b,  v22.16b, v19.16b     // sum += taps[k] * constrain()
370*c0909341SAndroid Build Coastguard Worker.endm
371*c0909341SAndroid Build Coastguard Worker
372*c0909341SAndroid Build Coastguard Worker// void cdef_filterX_edged_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
373*c0909341SAndroid Build Coastguard Worker//                                   const uint8_t *tmp, int pri_strength,
374*c0909341SAndroid Build Coastguard Worker//                                   int sec_strength, int dir, int damping,
375*c0909341SAndroid Build Coastguard Worker//                                   int h);
// filter_func_8: emits one function named
// cdef_filter<w><suffix>_edged_8bpc_neon per invocation.
//
// Macro parameters:
//   w      - block width in pixels; selects the directions<w> table and the
//            row layout used for loads/stores (this file instantiates 8 and 4).
//   pri    - nonzero: assemble the primary-tap path (uses pri_strength).
//   sec    - nonzero: assemble the secondary-tap path (uses sec_strength).
//   min    - nonzero: seed v3/v4 from px, pass the flag on to handle_pixel_8
//            and clamp the result to [v3, v4] before storing.
//   suffix - text spliced into the function name.
//
// Arguments, following the C prototype above with AAPCS64 register assignment:
//   x0 = dst, x1 = dst_stride, x2 = tmp, w3 = pri_strength,
//   w4 = sec_strength, w5 = dir, w6 = damping, w7 = h.
//
// Register roles established in this macro:
//   x5      = byte pointer into directions<w> (starts at table + 2*dir)
//   x8      = byte pointer into pri_taps (only when pri)
//   v25/v27 = pri/sec strength splat; v24/v26 = pri/sec "-shift" splat
//   v0      = px (16 pixels), v1/v2 = the two 8-bit sum accumulators
//   v3/v4   = min/max (only when min)
//   w9      = offset byte read from the directions table
//   w10     = primary tap value, w11 = secondary tap value and loop-2 counter
//
// load_px_8 and handle_pixel_8 are macros defined outside this section.
// NOTE(review): presumably load_px_8 reads x2/w9 and handle_pixel_8 updates
// v1/v2 (and v3/v4 when min) - confirm against their definitions.
376*c0909341SAndroid Build Coastguard Worker.macro filter_func_8 w, pri, sec, min, suffix
377*c0909341SAndroid Build Coastguard Workerfunction cdef_filter\w\suffix\()_edged_8bpc_neon
378*c0909341SAndroid Build Coastguard Worker.if \pri
        // x8 = pri_taps + 2 * (pri_strength & 1)
379*c0909341SAndroid Build Coastguard Worker        movrel          x8,  pri_taps
380*c0909341SAndroid Build Coastguard Worker        and             w9,  w3,  #1
381*c0909341SAndroid Build Coastguard Worker        add             x8,  x8,  w9, uxtw #1
382*c0909341SAndroid Build Coastguard Worker.endif
        // x5 = directions<w> + 2 * dir
383*c0909341SAndroid Build Coastguard Worker        movrel          x9,  directions\w
384*c0909341SAndroid Build Coastguard Worker        add             x5,  x9,  w5, uxtw #1
385*c0909341SAndroid Build Coastguard Worker        movi            v30.8b,  #7
386*c0909341SAndroid Build Coastguard Worker        dup             v28.8b,  w6                 // damping
387*c0909341SAndroid Build Coastguard Worker
388*c0909341SAndroid Build Coastguard Worker.if \pri
389*c0909341SAndroid Build Coastguard Worker        dup             v25.16b, w3                 // threshold
390*c0909341SAndroid Build Coastguard Worker.endif
391*c0909341SAndroid Build Coastguard Worker.if \sec
392*c0909341SAndroid Build Coastguard Worker        dup             v27.16b, w4                 // threshold
393*c0909341SAndroid Build Coastguard Worker.endif
        // Interleave so that byte lane 0 carries the pri threshold and byte
        // lane 1 the sec threshold; both shifts are then computed in one go.
        // In the pri-only / sec-only variants the other trn1 input is not
        // written by this function, but its lane is never read back below.
394*c0909341SAndroid Build Coastguard Worker        trn1            v24.8b,  v25.8b, v27.8b
395*c0909341SAndroid Build Coastguard Worker        clz             v24.8b,  v24.8b             // clz(threshold)
396*c0909341SAndroid Build Coastguard Worker        sub             v24.8b,  v30.8b, v24.8b     // ulog2(threshold)
397*c0909341SAndroid Build Coastguard Worker        uqsub           v24.8b,  v28.8b, v24.8b     // shift = imax(0, damping - ulog2(threshold))
398*c0909341SAndroid Build Coastguard Worker        neg             v24.8b,  v24.8b             // -shift
399*c0909341SAndroid Build Coastguard Worker.if \sec
400*c0909341SAndroid Build Coastguard Worker        dup             v26.16b, v24.b[1]
401*c0909341SAndroid Build Coastguard Worker.endif
402*c0909341SAndroid Build Coastguard Worker.if \pri
403*c0909341SAndroid Build Coastguard Worker        dup             v24.16b, v24.b[0]
404*c0909341SAndroid Build Coastguard Worker.endif
405*c0909341SAndroid Build Coastguard Worker
        // Loop 1 (outer): each pass filters 16 pixels held in v0, i.e.
        // 2 rows of 8 (rows 16 bytes apart in tmp) or 4 rows of 4
        // (rows 8 bytes apart in tmp).
406*c0909341SAndroid Build Coastguard Worker1:
407*c0909341SAndroid Build Coastguard Worker.if \w == 8
408*c0909341SAndroid Build Coastguard Worker        add             x12, x2,  #16
409*c0909341SAndroid Build Coastguard Worker        ld1             {v0.d}[0], [x2]             // px
410*c0909341SAndroid Build Coastguard Worker        ld1             {v0.d}[1], [x12]            // px
411*c0909341SAndroid Build Coastguard Worker.else
412*c0909341SAndroid Build Coastguard Worker        add             x12, x2,  #1*8
413*c0909341SAndroid Build Coastguard Worker        add             x13, x2,  #2*8
414*c0909341SAndroid Build Coastguard Worker        add             x14, x2,  #3*8
415*c0909341SAndroid Build Coastguard Worker        ld1             {v0.s}[0], [x2]             // px
416*c0909341SAndroid Build Coastguard Worker        ld1             {v0.s}[1], [x12]            // px
417*c0909341SAndroid Build Coastguard Worker        ld1             {v0.s}[2], [x13]            // px
418*c0909341SAndroid Build Coastguard Worker        ld1             {v0.s}[3], [x14]            // px
419*c0909341SAndroid Build Coastguard Worker.endif
420*c0909341SAndroid Build Coastguard Worker
421*c0909341SAndroid Build Coastguard Worker        // We need 9-bits or two 8-bit accumulators to fit the sum.
422*c0909341SAndroid Build Coastguard Worker        // Max of |sum| > 15*2*6(pri) + 4*4*3(sec) = 228.
423*c0909341SAndroid Build Coastguard Worker        // Start sum at -1 instead of 0 to help handle rounding later.
424*c0909341SAndroid Build Coastguard Worker        movi            v1.16b, #255                // sum
425*c0909341SAndroid Build Coastguard Worker        movi            v2.16b, #0                  // sum
426*c0909341SAndroid Build Coastguard Worker.if \min
427*c0909341SAndroid Build Coastguard Worker        mov             v3.16b, v0.16b              // min
428*c0909341SAndroid Build Coastguard Worker        mov             v4.16b, v0.16b              // max
429*c0909341SAndroid Build Coastguard Worker.endif
430*c0909341SAndroid Build Coastguard Worker
431*c0909341SAndroid Build Coastguard Worker        // Instead of loading sec_taps 2, 1 from memory, just set it
432*c0909341SAndroid Build Coastguard Worker        // to 2 initially and decrease for the second round.
433*c0909341SAndroid Build Coastguard Worker        // This is also used as loop counter.
434*c0909341SAndroid Build Coastguard Worker        mov             w11, #2                     // sec_taps[0]
435*c0909341SAndroid Build Coastguard Worker
        // Loop 2 (inner): runs exactly twice (w11 = 2, then 1). Each pass
        // reads up to three offset bytes from the directions table at
        // x5, x5+4 and x5+12, and leaves x5 advanced by one byte net.
436*c0909341SAndroid Build Coastguard Worker2:
437*c0909341SAndroid Build Coastguard Worker.if \pri
438*c0909341SAndroid Build Coastguard Worker        ldrb            w9,  [x5]                   // off1
439*c0909341SAndroid Build Coastguard Worker
440*c0909341SAndroid Build Coastguard Worker        load_px_8       v5,  v6, \w
441*c0909341SAndroid Build Coastguard Worker.endif
442*c0909341SAndroid Build Coastguard Worker
443*c0909341SAndroid Build Coastguard Worker.if \sec
444*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  #4                // +2*2
445*c0909341SAndroid Build Coastguard Worker        ldrb            w9,  [x5]                   // off2
446*c0909341SAndroid Build Coastguard Worker        load_px_8       v28, v29, \w
447*c0909341SAndroid Build Coastguard Worker.endif
448*c0909341SAndroid Build Coastguard Worker
449*c0909341SAndroid Build Coastguard Worker.if \pri
450*c0909341SAndroid Build Coastguard Worker        ldrb            w10, [x8]                   // *pri_taps
451*c0909341SAndroid Build Coastguard Worker
452*c0909341SAndroid Build Coastguard Worker        handle_pixel_8  v5,  v6,  v25.16b, v24.16b, w10, \min
453*c0909341SAndroid Build Coastguard Worker.endif
454*c0909341SAndroid Build Coastguard Worker
455*c0909341SAndroid Build Coastguard Worker.if \sec
456*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  #8                // +2*4
457*c0909341SAndroid Build Coastguard Worker        ldrb            w9,  [x5]                   // off3
458*c0909341SAndroid Build Coastguard Worker        load_px_8       v5,  v6,  \w
459*c0909341SAndroid Build Coastguard Worker
460*c0909341SAndroid Build Coastguard Worker        handle_pixel_8  v28, v29, v27.16b, v26.16b, w11, \min
461*c0909341SAndroid Build Coastguard Worker
462*c0909341SAndroid Build Coastguard Worker        handle_pixel_8  v5,  v6,  v27.16b, v26.16b, w11, \min
463*c0909341SAndroid Build Coastguard Worker
464*c0909341SAndroid Build Coastguard Worker        sub             x5,  x5,  #11               // x5 -= 2*(2+4); x5 += 1;
465*c0909341SAndroid Build Coastguard Worker.else
466*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  #1                // x5 += 1
467*c0909341SAndroid Build Coastguard Worker.endif
        // The add below does not write the flags, so b.ne still tests the
        // result of this subs.
468*c0909341SAndroid Build Coastguard Worker        subs            w11, w11, #1                // sec_tap-- (value)
469*c0909341SAndroid Build Coastguard Worker.if \pri
470*c0909341SAndroid Build Coastguard Worker        add             x8,  x8,  #1                // pri_taps++ (pointer)
471*c0909341SAndroid Build Coastguard Worker.endif
472*c0909341SAndroid Build Coastguard Worker        b.ne            2b
473*c0909341SAndroid Build Coastguard Worker
474*c0909341SAndroid Build Coastguard Worker        // Perform halving adds since the value won't fit otherwise.
475*c0909341SAndroid Build Coastguard Worker        // To handle the offset for negative values, use both halving w/ and w/o rounding.
476*c0909341SAndroid Build Coastguard Worker        srhadd          v5.16b,  v1.16b,  v2.16b    // sum >> 1
477*c0909341SAndroid Build Coastguard Worker        shadd           v6.16b,  v1.16b,  v2.16b    // (sum - 1) >> 1
478*c0909341SAndroid Build Coastguard Worker        cmlt            v1.16b,  v5.16b,  #0        // sum < 0
479*c0909341SAndroid Build Coastguard Worker        bsl             v1.16b,  v6.16b,  v5.16b    // (sum - (sum < 0)) >> 1
480*c0909341SAndroid Build Coastguard Worker
481*c0909341SAndroid Build Coastguard Worker        srshr           v1.16b,  v1.16b,  #3        // (8 + sum - (sum < 0)) >> 4
482*c0909341SAndroid Build Coastguard Worker
        // usqadd adds the signed correction in v1 to the unsigned pixels in
        // v0 with unsigned saturation.
483*c0909341SAndroid Build Coastguard Worker        usqadd          v0.16b,  v1.16b             // px + (8 + sum ...) >> 4
484*c0909341SAndroid Build Coastguard Worker.if \min
485*c0909341SAndroid Build Coastguard Worker        umin            v0.16b,  v0.16b,  v4.16b
486*c0909341SAndroid Build Coastguard Worker        umax            v0.16b,  v0.16b,  v3.16b    // iclip(px + .., min, max)
487*c0909341SAndroid Build Coastguard Worker.endif
        // Store the rows back to dst (post-incrementing x0 by dst_stride),
        // advance tmp, and decrement h. The flags set by "subs w7" here are
        // the ones consumed by the b.gt at the end of loop 1.
488*c0909341SAndroid Build Coastguard Worker.if \w == 8
489*c0909341SAndroid Build Coastguard Worker        st1             {v0.d}[0], [x0], x1
490*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #2*16             // tmp += 2*tmp_stride
491*c0909341SAndroid Build Coastguard Worker        subs            w7,  w7,  #2                // h -= 2
492*c0909341SAndroid Build Coastguard Worker        st1             {v0.d}[1], [x0], x1
493*c0909341SAndroid Build Coastguard Worker.else
494*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0], [x0], x1
495*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #4*8              // tmp += 4*tmp_stride
496*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[1], [x0], x1
497*c0909341SAndroid Build Coastguard Worker        subs            w7,  w7,  #4                // h -= 4
498*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[2], [x0], x1
499*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[3], [x0], x1
500*c0909341SAndroid Build Coastguard Worker.endif
501*c0909341SAndroid Build Coastguard Worker
502*c0909341SAndroid Build Coastguard Worker        // Reset pri_taps and directions back to the original point
        // (loop 2 ran twice and advanced each pointer by one byte per pass).
        // These plain sub instructions leave the flags untouched.
503*c0909341SAndroid Build Coastguard Worker        sub             x5,  x5,  #2
504*c0909341SAndroid Build Coastguard Worker.if \pri
505*c0909341SAndroid Build Coastguard Worker        sub             x8,  x8,  #2
506*c0909341SAndroid Build Coastguard Worker.endif
507*c0909341SAndroid Build Coastguard Worker
        // Continue while h > 0 (flags from the "subs w7" above).
508*c0909341SAndroid Build Coastguard Worker        b.gt            1b
509*c0909341SAndroid Build Coastguard Worker        ret
510*c0909341SAndroid Build Coastguard Workerendfunc
511*c0909341SAndroid Build Coastguard Worker.endm
512*c0909341SAndroid Build Coastguard Worker
// filter_8: instantiates the three filter_func_8 variants for block width \w:
//   _pri     - primary taps only, no min/max clamp
//   _sec     - secondary taps only, no min/max clamp
//   _pri_sec - primary and secondary taps, with min/max clamp
513*c0909341SAndroid Build Coastguard Worker.macro filter_8 w
514*c0909341SAndroid Build Coastguard Workerfilter_func_8 \w, pri=1, sec=0, min=0, suffix=_pri
515*c0909341SAndroid Build Coastguard Workerfilter_func_8 \w, pri=0, sec=1, min=0, suffix=_sec
516*c0909341SAndroid Build Coastguard Workerfilter_func_8 \w, pri=1, sec=1, min=1, suffix=_pri_sec
517*c0909341SAndroid Build Coastguard Worker.endm
518*c0909341SAndroid Build Coastguard Worker
// Emit the edged 8bpc filter functions for 8- and 4-pixel-wide blocks.
519*c0909341SAndroid Build Coastguard Workerfilter_8 8
520*c0909341SAndroid Build Coastguard Workerfilter_8 4
521