xref: /aosp_15_r20/external/libdav1d/src/arm/32/cdef_tmpl.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
27*c0909341SAndroid Build Coastguard Worker
28*c0909341SAndroid Build Coastguard Worker#include "src/arm/asm.S"
29*c0909341SAndroid Build Coastguard Worker#include "util.S"
30*c0909341SAndroid Build Coastguard Worker
// dir_table w, stride
//
// Emits the constant table directions<w>. Each row holds the two signed
// byte offsets used for one direction, written as dy * stride + dx and
// counted in pixels of the temporary buffer (the filter scales them by 2
// to get byte offsets, since the buffer holds 16 bit pixels).
//
// The table holds the 8 directions followed by a copy of the first 6.
// The filter reads rows dir, dir + 2 and dir + 6, so the highest row
// touched is 7 + 6 = 13; the duplicate rows replace a "& 7" wrap.
.macro dir_table w, stride
const directions\w
        .byte           -1 * \stride + 1, -2 * \stride + 2
        .byte            0 * \stride + 1, -1 * \stride + 2
        .byte            0 * \stride + 1,  0 * \stride + 2
        .byte            0 * \stride + 1,  1 * \stride + 2
        .byte            1 * \stride + 1,  2 * \stride + 2
        .byte            1 * \stride + 0,  2 * \stride + 1
        .byte            1 * \stride + 0,  2 * \stride + 0
        .byte            1 * \stride + 0,  2 * \stride - 1
// Repeated, to avoid & 7
        .byte           -1 * \stride + 1, -2 * \stride + 2
        .byte            0 * \stride + 1, -1 * \stride + 2
        .byte            0 * \stride + 1,  0 * \stride + 2
        .byte            0 * \stride + 1,  1 * \stride + 2
        .byte            1 * \stride + 1,  2 * \stride + 2
        .byte            1 * \stride + 0,  2 * \stride + 1
endconst
.endm
50*c0909341SAndroid Build Coastguard Worker
// tables
//
// Instantiates the constant data shared by the filter functions:
//   directions8  direction offsets for a row stride of 16 pixels
//   directions4  direction offsets for a row stride of 8 pixels
//   pri_taps     two pairs of primary tap weights, {4, 2} and {3, 3};
//                the filter picks a pair from one bit of pri_strength
//                and then walks through the two weights of that pair.
.macro tables
dir_table 8, 16
dir_table 4, 8

const pri_taps
        .byte           4, 2, 3, 3
endconst
.endm
59*c0909341SAndroid Build Coastguard Worker
// load_px d11, d12, d21, d22, w
//
// Loads the two neighbours of the current pixels that lie at +off and -off.
//
// In:   r2  address of the current pixels in the 16 bit temporary buffer
//       r9  signed offset in pixels (scaled by 2 here to get bytes)
// Out:  d11:d12  p0, the pixels at x + off
//       d21:d22  p1, the pixels at x - off
// Clobbers: r6 and r9 (both are left pointing into the buffer).
//
// For w == 8 each neighbour is one row of 8 pixels. Otherwise each
// neighbour is two rows of 4 pixels, the second row 2*8 bytes after the
// first, packed into the low and high d register of the pair.
//
// The address computation is the same for both widths, so it is emitted
// once ahead of the width-specific loads.
.macro load_px d11, d12, d21, d22, w
        add             r6,  r2,  r9, lsl #1 // x + off
        sub             r9,  r2,  r9, lsl #1 // x - off
.if \w == 8
        vld1.16         {\d11,\d12}, [r6]    // p0
        vld1.16         {\d21,\d22}, [r9]    // p1
.else
        vld1.16         {\d11}, [r6]         // p0
        add             r6,  r6,  #2*8       // += stride
        vld1.16         {\d21}, [r9]         // p1
        add             r9,  r9,  #2*8       // += stride
        vld1.16         {\d12}, [r6]         // p0
        vld1.16         {\d22}, [r9]         // p1
.endif
.endm
// handle_pixel s1, s2, thresh_vec, shift, tap, min
//
// Accumulates the contribution of one neighbour pair into the running sum:
//   sum += tap * (constrain(p0 - px) + constrain(p1 - px))
// where constrain() limits diff to +/- clip and
//   clip = imax(0, threshold - (abs(diff) >> shift)).
//
// Macro parameters:
//   s1, s2      q registers holding p0 and p1
//   thresh_vec  q register with the threshold in every lane
//   shift       q register with the negated shift count in every lane
//               (vshl by a negative count shifts right)
//   tap         core register holding the tap weight
//   min         when non-zero, also fold p0/p1 into the running min and max
//
// Fixed registers: q0 = px (read), q1 = sum (updated),
//                  q2 = min and q3 = max (updated when min is non-zero).
// Clobbers: q8-q13.
//
// The min uses an unsigned compare and the max a signed compare on the
// same data. NOTE(review): presumably so that a large filler value written
// for unavailable pixels loses both comparisons; the buffer setup is not
// visible here, confirm against the padding code.
.macro handle_pixel s1, s2, thresh_vec, shift, tap, min
.if \min
        vmin.u16        q2,  q2,  \s1
        vmax.s16        q3,  q3,  \s1
        vmin.u16        q2,  q2,  \s2
        vmax.s16        q3,  q3,  \s2
.endif
        vabd.u16        q8,  q0,  \s1        // abs(diff)
        vabd.u16        q11, q0,  \s2        // abs(diff)
        vshl.u16        q9,  q8,  \shift     // abs(diff) >> shift
        vshl.u16        q12, q11, \shift     // abs(diff) >> shift
        vqsub.u16       q9,  \thresh_vec, q9 // clip = imax(0, threshold - (abs(diff) >> shift))
        vqsub.u16       q12, \thresh_vec, q12// clip = imax(0, threshold - (abs(diff) >> shift))
        vsub.i16        q10, \s1, q0         // diff = p0 - px
        vsub.i16        q13, \s2, q0         // diff = p1 - px
        vneg.s16        q8,  q9              // -clip
        vneg.s16        q11, q12             // -clip
        vmin.s16        q10, q10, q9         // imin(diff, clip)
        vmin.s16        q13, q13, q12        // imin(diff, clip)
        vdup.16         q9,  \tap            // taps[k]
        vmax.s16        q10, q10, q8         // constrain() = imax(imin(diff, clip), -clip)
        vmax.s16        q13, q13, q11        // constrain() = imax(imin(diff, clip), -clip)
        vmla.i16        q1,  q10, q9         // sum += taps[k] * constrain()
        vmla.i16        q1,  q13, q9         // sum += taps[k] * constrain()
.endm
102*c0909341SAndroid Build Coastguard Worker
// void dav1d_cdef_filterX_Ybpc_neon(pixel *dst, ptrdiff_t dst_stride,
//                                   const uint16_t *tmp, int pri_strength,
//                                   int sec_strength, int dir, int damping,
//                                   int h, size_t edges);
// The 16 bpc variants receive one more trailing argument, which the code
// below treats as bitdepth_max.
//
// filter_func w, bpc, pri, sec, min, suffix
//
// Emits cdef_filter<w><suffix>_<bpc>bpc_neon, one specialisation of the
// filter. It has no prologue of its own: the exported dispatcher generated
// by the filter macro pushes r4-r9/lr and q4-q7, loads the stack arguments
// and branches here; this function ends with the matching vpop/pop.
//
// Macro parameters:
//   w       block width in pixels, 8 or 4 (4 handles two rows per iteration)
//   bpc     8 or 16
//   pri     1 to apply the primary taps
//   sec     1 to apply the secondary taps
//   min     1 to clamp the result to the min/max of the pixels that were read
//   suffix  inserted into the function name (_pri, _sec, _pri_sec)
//
// Registers on entry:
//   r0 dst, r1 dst_stride, r2 tmp, r3 pri_strength, r4 sec_strength,
//   r5 dir, r6 damping, r7 h, r8 edges, r9 bitdepth_max (16 bpc only)
//
// Registers inside the row loop:
//   r5  pointer into directions<w>       r8  pointer into pri_taps
//   r6, r9, r12  scratch                 lr  secondary tap and tap counter
//   q0  px       q1  sum      q2  min    q3  max
//   q4  -shift (primary)      q5  primary threshold
//   q6  -shift (secondary)    q7  secondary threshold
//   q8-q13  scratch of handle_pixel      q14, q15  p0, p1
.macro filter_func w, bpc, pri, sec, min, suffix
function cdef_filter\w\suffix\()_\bpc\()bpc_neon
.if \bpc == 8
        // edges == 0xf is handed over to the _edged variant, which is
        // defined elsewhere.
        cmp             r8,  #0xf
        beq             cdef_filter\w\suffix\()_edged_neon
.endif
.if \pri
.if \bpc == 16
        clz             r9,  r9
        sub             r9,  r9,  #24        // -bitdepth_min_8
        neg             r9,  r9              // bitdepth_min_8
.endif
        movrel_local    r8,  pri_taps
.if \bpc == 16
        lsr             r9,  r3,  r9         // pri_strength >> bitdepth_min_8
        and             r9,  r9,  #1         // (pri_strength >> bitdepth_min_8) & 1
.else
        and             r9,  r3,  #1
.endif
        add             r8,  r8,  r9, lsl #1 // select the {4,2} or the {3,3} pair
.endif
        movrel_local    r9,  directions\w
        add             r5,  r9,  r5, lsl #1 // two bytes per direction
        vmov.u16        d17, #15
        vdup.16         d16, r6              // damping

.if \pri
        vdup.16         q5,  r3              // threshold
.endif
.if \sec
        vdup.16         q7,  r4              // threshold
.endif
        // Lane 0 handles the primary strength, lane 1 the secondary one.
        vmov.16         d8[0], r3
        vmov.16         d8[1], r4
        vclz.i16        d8,  d8              // clz(threshold)
        vsub.i16        d8,  d17, d8         // ulog2(threshold)
        vqsub.u16       d8,  d16, d8         // shift = imax(0, damping - ulog2(threshold))
        vneg.s16        d8,  d8              // -shift
.if \sec
        vdup.16         q6,  d8[1]
.endif
.if \pri
        vdup.16         q4,  d8[0]
.endif

1:
.if \w == 8
        vld1.16         {q0},  [r2, :128]    // px
.else
        // Two rows of 4 pixels, 2*8 bytes apart.
        add             r12, r2,  #2*8
        vld1.16         {d0},  [r2,  :64]    // px
        vld1.16         {d1},  [r12, :64]    // px
.endif

        vmov.u16        q1,  #0              // sum
.if \min
        vmov.u16        q2,  q0              // min
        vmov.u16        q3,  q0              // max
.endif

        // Instead of loading sec_taps 2, 1 from memory, just set it
        // to 2 initially and decrease for the second round.
        // This is also used as loop counter.
        mov             lr,  #2              // sec_taps[0]

2:
.if \pri
        ldrsb           r9,  [r5]            // off1

        load_px         d28, d29, d30, d31, \w
.endif

.if \sec
        add             r5,  r5,  #4         // +2*2
        ldrsb           r9,  [r5]            // off2
.endif

.if \pri
        ldrb            r12, [r8]            // *pri_taps

        handle_pixel    q14, q15, q5,  q4,  r12, \min
.endif

.if \sec
        load_px         d28, d29, d30, d31, \w

        add             r5,  r5,  #8         // +2*4
        ldrsb           r9,  [r5]            // off3

        handle_pixel    q14, q15, q7,  q6,  lr, \min

        load_px         d28, d29, d30, d31, \w

        handle_pixel    q14, q15, q7,  q6,  lr, \min

        // r5 was advanced by 4 + 8 = 12 above.
        sub             r5,  r5,  #11        // r5 -= 2*(2+4); r5 += 1;
.else
        add             r5,  r5,  #1         // r5 += 1
.endif
        subs            lr,  lr,  #1         // sec_tap-- (value)
.if \pri
        add             r8,  r8,  #1         // pri_taps++ (pointer)
.endif
        bne             2b

        vshr.s16        q14, q1,  #15        // -(sum < 0)
        vadd.i16        q1,  q1,  q14        // sum - (sum < 0)
        vrshr.s16       q1,  q1,  #4         // (8 + sum - (sum < 0)) >> 4
        vadd.i16        q0,  q0,  q1         // px + (8 + sum ...) >> 4
.if \min
        vmin.s16        q0,  q0,  q3
        vmax.s16        q0,  q0,  q2         // iclip(px + .., min, max)
.endif
.if \bpc == 8
        vmovn.u16       d0,  q0
.endif
.if \w == 8
        add             r2,  r2,  #2*16      // tmp += tmp_stride
        subs            r7,  r7,  #1         // h--
.if \bpc == 8
        vst1.8          {d0}, [r0, :64], r1
.else
        vst1.16         {q0}, [r0, :128], r1
.endif
.else
.if \bpc == 8
        vst1.32         {d0[0]}, [r0, :32], r1
.else
        vst1.16         {d0},    [r0, :64], r1
.endif
        add             r2,  r2,  #2*16      // tmp += 2*tmp_stride
        subs            r7,  r7,  #2         // h -= 2
.if \bpc == 8
        vst1.32         {d0[1]}, [r0, :32], r1
.else
        vst1.16         {d1},    [r0, :64], r1
.endif
.endif

        // Reset pri_taps and directions back to the original point
        // (the tap loop ran twice and advanced each pointer by one per
        // round). Nothing between the subs on r7 above and the bgt below
        // writes the flags.
        sub             r5,  r5,  #2
.if \pri
        sub             r8,  r8,  #2
.endif

        bgt             1b
        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc
.endm
257*c0909341SAndroid Build Coastguard Worker
// filter w, bpc
//
// Emits the three internal specialisations (_pri, _sec, _pri_sec) and the
// exported entry point cdef_filter<w>_<bpc>bpc_neon that dispatches to one
// of them.
//
// The entry point saves r4-r9/lr (7 * 4 = 28 bytes) and q4-q7 (4 * 16 = 64
// bytes), so the first stack argument sits at sp + 92. It loads
//   r4 = sec_strength, r5 = dir      from sp + 92
//   r6 = damping,      r7 = h        from sp + 100
//   r8 = edges                       from sp + 108
//   r9 = the following stack word    (16 bpc only, used as bitdepth_max)
// and then branches, without linking, to
//   _sec      when pri_strength == 0
//   _pri      when sec_strength == 0
//   _pri_sec  otherwise
// The selected function restores the saved registers and returns to the
// original caller.
.macro filter w, bpc
filter_func \w, \bpc, pri=1, sec=0, min=0, suffix=_pri
filter_func \w, \bpc, pri=0, sec=1, min=0, suffix=_sec
filter_func \w, \bpc, pri=1, sec=1, min=1, suffix=_pri_sec

function cdef_filter\w\()_\bpc\()bpc_neon, export=1
        push            {r4-r9,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #92]
        ldrd            r6,  r7,  [sp, #100]
.if \bpc == 16
        ldrd            r8,  r9,  [sp, #108]
.else
        ldr             r8,  [sp, #108]
.endif
        cmp             r3,  #0 // pri_strength
        bne             1f
        b               cdef_filter\w\()_sec_\bpc\()bpc_neon // only sec
1:
        cmp             r4,  #0 // sec_strength
        bne             1f
        b               cdef_filter\w\()_pri_\bpc\()bpc_neon // only pri
1:
        b               cdef_filter\w\()_pri_sec_\bpc\()bpc_neon // both pri and sec
endfunc
.endm
284*c0909341SAndroid Build Coastguard Worker
// div_table: eight 16 bit weights, entry n being 840 / (n + 1).
// NOTE(review): presumably the per-length weights for the diagonal sums in
// the direction search; the code reading this table is not visible here.
const div_table, align=4
        .short         840, 420, 280, 210, 168, 140, 120, 105
endconst
288*c0909341SAndroid Build Coastguard Worker
// alt_fact: twelve 16 bit weights; the final 0 pads the list to 12 entries.
// NOTE(review): presumably widened to 32 bit and kept in q13-q15 as the
// multipliers used by cost_alt; the load is not visible here, confirm
// against the direction search code.
const alt_fact, align=4
        .short         420, 210, 140, 105, 105, 105, 105, 105, 140, 210, 420, 0
endconst
292*c0909341SAndroid Build Coastguard Worker
// cost_alt dest, s1, s2, s3, s4, s5, s6
//
// Computes two weighted sums of squares in one go:
//   dest[0] = sum over lanes of (s1^2 * q13 + s2^2 * q14 + s3^2 * q15)
//   dest[1] = sum over lanes of (s4^2 * q13 + s5^2 * q14 + s6^2 * q15)
//
// Macro parameters:
//   dest     d register receiving the two 32 bit results
//   s1..s6   d registers, each holding four signed 16 bit sums
//
// In:       q13, q14, q15 hold the 32 bit multipliers and must be set up
//           by the caller.
// Clobbers: q1, q2, q3, q5, q6, q12.
.macro cost_alt dest, s1, s2, s3, s4, s5, s6
        vmull.s16       q1,  \s1, \s1          // sum_alt[n]*sum_alt[n]
        vmull.s16       q2,  \s2, \s2
        vmull.s16       q3,  \s3, \s3
        vmull.s16       q5,  \s4, \s4          // sum_alt[n]*sum_alt[n]
        vmull.s16       q12, \s5, \s5
        vmull.s16       q6,  \s6, \s6          // q6 overlaps the first \s1-\s2 here
        vmul.i32        q1,  q1,  q13          // sum_alt[n]^2*fact
        vmla.i32        q1,  q2,  q14
        vmla.i32        q1,  q3,  q15
        vmul.i32        q5,  q5,  q13          // sum_alt[n]^2*fact
        vmla.i32        q5,  q12, q14
        vmla.i32        q5,  q6,  q15
        vadd.i32        d2,  d2,  d3           // fold the first result to 2 lanes
        vadd.i32        d3,  d10, d11          // fold the second result to 2 lanes
        vpadd.i32       \dest, d2, d3          // *cost_ptr
.endm
310*c0909341SAndroid Build Coastguard Worker
// find_best s1, s2, s3
//
// One step of the running search for the largest cost.
//
// Fixed registers:
//   r0   best_dir  (updated)
//   r1   best_cost (updated)
//   r3   n, the index of the cost currently held in r12
//   r12  cost[n]; must already be loaded on entry
//   lr   scratch
//
// Macro parameters:
//   s1  accepted for symmetry at the call site; it is never referenced,
//       the value is expected in r12 already
//   s2  optional NEON scalar holding cost[n + 1]
//   s3  NEON scalar holding the cost for the next invocation; it is
//       moved into r12 before this one ends (only used when s2 is given)
//
// With s2 given, two costs are examined and n advances by 2. Without it,
// only r12 is examined and n is left unchanged.
// The comparisons are signed (gt), and a cost only replaces the best one
// when it is strictly larger, so the lowest index wins a tie.
.macro find_best s1, s2, s3
.ifnb \s2
        vmov.32         lr,  \s2
.endif
        cmp             r12, r1                // cost[n] > best_cost
        itt             gt
        movgt           r0,  r3                // best_dir = n
        movgt           r1,  r12               // best_cost = cost[n]
.ifnb \s2
        add             r3,  r3,  #1           // n++
        cmp             lr,  r1                // cost[n] > best_cost
        vmov.32         r12, \s3
        itt             gt
        movgt           r0,  r3                // best_dir = n
        movgt           r1,  lr                // best_cost = cost[n]
        add             r3,  r3,  #1           // n++
.endif
.endm
329*c0909341SAndroid Build Coastguard Worker
// int dav1d_cdef_find_dir_Xbpc_neon(const pixel *img, const ptrdiff_t stride,
//                                   unsigned *const var)
// The 16 bpc variant takes one more argument, read from r3 as bitdepth_max.
332*c0909341SAndroid Build Coastguard Worker.macro find_dir bpc
333*c0909341SAndroid Build Coastguard Workerfunction cdef_find_dir_\bpc\()bpc_neon, export=1
334*c0909341SAndroid Build Coastguard Worker        push            {lr}
335*c0909341SAndroid Build Coastguard Worker        vpush           {q4-q7}
336*c0909341SAndroid Build Coastguard Worker.if \bpc == 16
337*c0909341SAndroid Build Coastguard Worker        clz             r3,  r3                // clz(bitdepth_max)
338*c0909341SAndroid Build Coastguard Worker        sub             lr,  r3,  #24          // -bitdepth_min_8
339*c0909341SAndroid Build Coastguard Worker.endif
340*c0909341SAndroid Build Coastguard Worker        sub             sp,  sp,  #32          // cost
341*c0909341SAndroid Build Coastguard Worker        mov             r3,  #8
342*c0909341SAndroid Build Coastguard Worker        vmov.u16        q1,  #0                // q0-q1   sum_diag[0]
343*c0909341SAndroid Build Coastguard Worker        vmov.u16        q3,  #0                // q2-q3   sum_diag[1]
344*c0909341SAndroid Build Coastguard Worker        vmov.u16        q5,  #0                // q4-q5   sum_hv[0-1]
345*c0909341SAndroid Build Coastguard Worker        vmov.u16        q8,  #0                // q6,d16  sum_alt[0]
346*c0909341SAndroid Build Coastguard Worker                                               // q7,d17  sum_alt[1]
347*c0909341SAndroid Build Coastguard Worker        vmov.u16        q9,  #0                // q9,d22  sum_alt[2]
348*c0909341SAndroid Build Coastguard Worker        vmov.u16        q11, #0
349*c0909341SAndroid Build Coastguard Worker        vmov.u16        q10, #0                // q10,d23 sum_alt[3]
350*c0909341SAndroid Build Coastguard Worker
351*c0909341SAndroid Build Coastguard Worker
352*c0909341SAndroid Build Coastguard Worker.irpc i, 01234567
353*c0909341SAndroid Build Coastguard Worker.if \bpc == 8
354*c0909341SAndroid Build Coastguard Worker        vld1.8          {d30}, [r0, :64], r1
355*c0909341SAndroid Build Coastguard Worker        vmov.u8         d31, #128
356*c0909341SAndroid Build Coastguard Worker        vsubl.u8        q15, d30, d31          // img[x] - 128
357*c0909341SAndroid Build Coastguard Worker.else
358*c0909341SAndroid Build Coastguard Worker        vld1.16         {q15}, [r0, :128], r1
359*c0909341SAndroid Build Coastguard Worker        vdup.16         q14, lr                // -bitdepth_min_8
360*c0909341SAndroid Build Coastguard Worker        vshl.u16        q15, q15, q14
361*c0909341SAndroid Build Coastguard Worker        vmov.u16        q14, #128
362*c0909341SAndroid Build Coastguard Worker        vsub.i16        q15, q15, q14          // img[x] - 128
363*c0909341SAndroid Build Coastguard Worker.endif
364*c0909341SAndroid Build Coastguard Worker        vmov.u16        q14, #0
365*c0909341SAndroid Build Coastguard Worker
366*c0909341SAndroid Build Coastguard Worker.if \i == 0
367*c0909341SAndroid Build Coastguard Worker        vmov            q0,  q15               // sum_diag[0]
368*c0909341SAndroid Build Coastguard Worker.else
369*c0909341SAndroid Build Coastguard Worker        vext.8          q12, q14, q15, #(16-2*\i)
370*c0909341SAndroid Build Coastguard Worker        vext.8          q13, q15, q14, #(16-2*\i)
371*c0909341SAndroid Build Coastguard Worker        vadd.i16        q0,  q0,  q12          // sum_diag[0]
372*c0909341SAndroid Build Coastguard Worker        vadd.i16        q1,  q1,  q13          // sum_diag[0]
373*c0909341SAndroid Build Coastguard Worker.endif
374*c0909341SAndroid Build Coastguard Worker        vrev64.16       q13, q15
375*c0909341SAndroid Build Coastguard Worker        vswp            d26, d27               // [-x]
376*c0909341SAndroid Build Coastguard Worker.if \i == 0
377*c0909341SAndroid Build Coastguard Worker        vmov            q2,  q13               // sum_diag[1]
378*c0909341SAndroid Build Coastguard Worker.else
379*c0909341SAndroid Build Coastguard Worker        vext.8          q12, q14, q13, #(16-2*\i)
380*c0909341SAndroid Build Coastguard Worker        vext.8          q13, q13, q14, #(16-2*\i)
381*c0909341SAndroid Build Coastguard Worker        vadd.i16        q2,  q2,  q12          // sum_diag[1]
382*c0909341SAndroid Build Coastguard Worker        vadd.i16        q3,  q3,  q13          // sum_diag[1]
383*c0909341SAndroid Build Coastguard Worker.endif
384*c0909341SAndroid Build Coastguard Worker
385*c0909341SAndroid Build Coastguard Worker        vpadd.u16       d26, d30, d31          // [(x >> 1)]
386*c0909341SAndroid Build Coastguard Worker        vmov.u16        d27, #0
387*c0909341SAndroid Build Coastguard Worker        vpadd.u16       d24, d26, d28
388*c0909341SAndroid Build Coastguard Worker        vpadd.u16       d24, d24, d28          // [y]
389*c0909341SAndroid Build Coastguard Worker        vmov.u16        r12, d24[0]
390*c0909341SAndroid Build Coastguard Worker        vadd.i16        q5,  q5,  q15          // sum_hv[1]
391*c0909341SAndroid Build Coastguard Worker.if \i < 4
392*c0909341SAndroid Build Coastguard Worker        vmov.16         d8[\i],   r12          // sum_hv[0]
393*c0909341SAndroid Build Coastguard Worker.else
394*c0909341SAndroid Build Coastguard Worker        vmov.16         d9[\i-4], r12          // sum_hv[0]
395*c0909341SAndroid Build Coastguard Worker.endif
396*c0909341SAndroid Build Coastguard Worker
397*c0909341SAndroid Build Coastguard Worker.if \i == 0
398*c0909341SAndroid Build Coastguard Worker        vmov.u16        q6,  q13               // sum_alt[0]
399*c0909341SAndroid Build Coastguard Worker.else
400*c0909341SAndroid Build Coastguard Worker        vext.8          q12, q14, q13, #(16-2*\i)
401*c0909341SAndroid Build Coastguard Worker        vext.8          q14, q13, q14, #(16-2*\i)
402*c0909341SAndroid Build Coastguard Worker        vadd.i16        q6,  q6,  q12          // sum_alt[0]
403*c0909341SAndroid Build Coastguard Worker        vadd.i16        d16, d16, d28          // sum_alt[0]
404*c0909341SAndroid Build Coastguard Worker.endif
405*c0909341SAndroid Build Coastguard Worker        vrev64.16       d26, d26               // [-(x >> 1)]
406*c0909341SAndroid Build Coastguard Worker        vmov.u16        q14, #0
407*c0909341SAndroid Build Coastguard Worker.if \i == 0
408*c0909341SAndroid Build Coastguard Worker        vmov            q7,  q13               // sum_alt[1]
409*c0909341SAndroid Build Coastguard Worker.else
410*c0909341SAndroid Build Coastguard Worker        vext.8          q12, q14, q13, #(16-2*\i)
411*c0909341SAndroid Build Coastguard Worker        vext.8          q13, q13, q14, #(16-2*\i)
412*c0909341SAndroid Build Coastguard Worker        vadd.i16        q7,  q7,  q12          // sum_alt[1]
413*c0909341SAndroid Build Coastguard Worker        vadd.i16        d17, d17, d26          // sum_alt[1]
414*c0909341SAndroid Build Coastguard Worker.endif
415*c0909341SAndroid Build Coastguard Worker
416*c0909341SAndroid Build Coastguard Worker.if \i < 6
417*c0909341SAndroid Build Coastguard Worker        vext.8          q12, q14, q15, #(16-2*(3-(\i/2)))
418*c0909341SAndroid Build Coastguard Worker        vext.8          q13, q15, q14, #(16-2*(3-(\i/2)))
419*c0909341SAndroid Build Coastguard Worker        vadd.i16        q9,  q9,  q12          // sum_alt[2]
420*c0909341SAndroid Build Coastguard Worker        vadd.i16        d22, d22, d26          // sum_alt[2]
421*c0909341SAndroid Build Coastguard Worker.else
422*c0909341SAndroid Build Coastguard Worker        vadd.i16        q9,  q9,  q15          // sum_alt[2]
423*c0909341SAndroid Build Coastguard Worker.endif
424*c0909341SAndroid Build Coastguard Worker.if \i == 0
425*c0909341SAndroid Build Coastguard Worker        vmov            q10, q15               // sum_alt[3]
426*c0909341SAndroid Build Coastguard Worker.elseif \i == 1
427*c0909341SAndroid Build Coastguard Worker        vadd.i16        q10, q10, q15          // sum_alt[3]
428*c0909341SAndroid Build Coastguard Worker.else
429*c0909341SAndroid Build Coastguard Worker        vext.8          q12, q14, q15, #(16-2*(\i/2))
430*c0909341SAndroid Build Coastguard Worker        vext.8          q13, q15, q14, #(16-2*(\i/2))
431*c0909341SAndroid Build Coastguard Worker        vadd.i16        q10, q10, q12          // sum_alt[3]
432*c0909341SAndroid Build Coastguard Worker        vadd.i16        d23, d23, d26          // sum_alt[3]
433*c0909341SAndroid Build Coastguard Worker.endif
434*c0909341SAndroid Build Coastguard Worker.endr
435*c0909341SAndroid Build Coastguard Worker
436*c0909341SAndroid Build Coastguard Worker        vmov.u32        q15, #105
437*c0909341SAndroid Build Coastguard Worker
438*c0909341SAndroid Build Coastguard Worker        vmull.s16       q12, d8,  d8           // sum_hv[0]*sum_hv[0]
439*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q12, d9,  d9
440*c0909341SAndroid Build Coastguard Worker        vmull.s16       q13, d10, d10          // sum_hv[1]*sum_hv[1]
441*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q13, d11, d11
442*c0909341SAndroid Build Coastguard Worker        vadd.s32        d8,  d24, d25
443*c0909341SAndroid Build Coastguard Worker        vadd.s32        d9,  d26, d27
444*c0909341SAndroid Build Coastguard Worker        vpadd.s32       d8,  d8,  d9           // cost[2,6] (s16, s17)
445*c0909341SAndroid Build Coastguard Worker        vmul.i32        d8,  d8,  d30          // cost[2,6] *= 105
446*c0909341SAndroid Build Coastguard Worker
447*c0909341SAndroid Build Coastguard Worker        vrev64.16       q1,  q1
448*c0909341SAndroid Build Coastguard Worker        vrev64.16       q3,  q3
449*c0909341SAndroid Build Coastguard Worker        vext.8          q1,  q1,  q1,  #10     // sum_diag[0][14-n]
450*c0909341SAndroid Build Coastguard Worker        vext.8          q3,  q3,  q3,  #10     // sum_diag[1][14-n]
451*c0909341SAndroid Build Coastguard Worker
452*c0909341SAndroid Build Coastguard Worker        vstr            s16, [sp, #2*4]        // cost[2]
453*c0909341SAndroid Build Coastguard Worker        vstr            s17, [sp, #6*4]        // cost[6]
454*c0909341SAndroid Build Coastguard Worker
455*c0909341SAndroid Build Coastguard Worker        movrel_local    r12, div_table
456*c0909341SAndroid Build Coastguard Worker        vld1.16         {q14}, [r12, :128]
457*c0909341SAndroid Build Coastguard Worker
458*c0909341SAndroid Build Coastguard Worker        vmull.s16       q5,  d0,  d0           // sum_diag[0]*sum_diag[0]
459*c0909341SAndroid Build Coastguard Worker        vmull.s16       q12, d1,  d1
460*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q5,  d2,  d2
461*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q12, d3,  d3
462*c0909341SAndroid Build Coastguard Worker        vmull.s16       q0,  d4,  d4           // sum_diag[1]*sum_diag[1]
463*c0909341SAndroid Build Coastguard Worker        vmull.s16       q1,  d5,  d5
464*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q0,  d6,  d6
465*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q1,  d7,  d7
466*c0909341SAndroid Build Coastguard Worker        vmovl.u16       q13, d28               // div_table
467*c0909341SAndroid Build Coastguard Worker        vmovl.u16       q14, d29
468*c0909341SAndroid Build Coastguard Worker        vmul.i32        q5,  q5,  q13          // cost[0]
469*c0909341SAndroid Build Coastguard Worker        vmla.i32        q5,  q12, q14
470*c0909341SAndroid Build Coastguard Worker        vmul.i32        q0,  q0,  q13          // cost[4]
471*c0909341SAndroid Build Coastguard Worker        vmla.i32        q0,  q1,  q14
472*c0909341SAndroid Build Coastguard Worker        vadd.i32        d10, d10, d11
473*c0909341SAndroid Build Coastguard Worker        vadd.i32        d0,  d0,  d1
474*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d0,  d10, d0           // cost[0,4] = s0,s1
475*c0909341SAndroid Build Coastguard Worker
476*c0909341SAndroid Build Coastguard Worker        movrel_local    r12, alt_fact
477*c0909341SAndroid Build Coastguard Worker        vld1.16         {d29, d30, d31}, [r12, :64] // div_table[2*m+1] + 105
478*c0909341SAndroid Build Coastguard Worker
479*c0909341SAndroid Build Coastguard Worker        vstr            s0,  [sp, #0*4]        // cost[0]
480*c0909341SAndroid Build Coastguard Worker        vstr            s1,  [sp, #4*4]        // cost[4]
481*c0909341SAndroid Build Coastguard Worker
482*c0909341SAndroid Build Coastguard Worker        vmovl.u16       q13, d29               // div_table[2*m+1] + 105
483*c0909341SAndroid Build Coastguard Worker        vmovl.u16       q14, d30
484*c0909341SAndroid Build Coastguard Worker        vmovl.u16       q15, d31
485*c0909341SAndroid Build Coastguard Worker
486*c0909341SAndroid Build Coastguard Worker        cost_alt        d14, d12, d13, d16, d14, d15, d17 // cost[1], cost[3]
487*c0909341SAndroid Build Coastguard Worker        cost_alt        d15, d18, d19, d22, d20, d21, d23 // cost[5], cost[7]
488*c0909341SAndroid Build Coastguard Worker        vstr            s28, [sp, #1*4]        // cost[1]
489*c0909341SAndroid Build Coastguard Worker        vstr            s29, [sp, #3*4]        // cost[3]
490*c0909341SAndroid Build Coastguard Worker
491*c0909341SAndroid Build Coastguard Worker        mov             r0,  #0                // best_dir
492*c0909341SAndroid Build Coastguard Worker        vmov.32         r1,  d0[0]             // best_cost
493*c0909341SAndroid Build Coastguard Worker        mov             r3,  #1                // n
494*c0909341SAndroid Build Coastguard Worker
495*c0909341SAndroid Build Coastguard Worker        vstr            s30, [sp, #5*4]        // cost[5]
496*c0909341SAndroid Build Coastguard Worker        vstr            s31, [sp, #7*4]        // cost[7]
497*c0909341SAndroid Build Coastguard Worker
498*c0909341SAndroid Build Coastguard Worker        vmov.32         r12, d14[0]
499*c0909341SAndroid Build Coastguard Worker
500*c0909341SAndroid Build Coastguard Worker        find_best       d14[0], d8[0], d14[1]
501*c0909341SAndroid Build Coastguard Worker        find_best       d14[1], d0[1], d15[0]
502*c0909341SAndroid Build Coastguard Worker        find_best       d15[0], d8[1], d15[1]
503*c0909341SAndroid Build Coastguard Worker        find_best       d15[1]
504*c0909341SAndroid Build Coastguard Worker
505*c0909341SAndroid Build Coastguard Worker        eor             r3,  r0,  #4           // best_dir ^4
506*c0909341SAndroid Build Coastguard Worker        ldr             r12, [sp, r3, lsl #2]
507*c0909341SAndroid Build Coastguard Worker        sub             r1,  r1,  r12          // best_cost - cost[best_dir ^ 4]
508*c0909341SAndroid Build Coastguard Worker        lsr             r1,  r1,  #10
509*c0909341SAndroid Build Coastguard Worker        str             r1,  [r2]              // *var
510*c0909341SAndroid Build Coastguard Worker
511*c0909341SAndroid Build Coastguard Worker        add             sp,  sp,  #32
512*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q7}
513*c0909341SAndroid Build Coastguard Worker        pop             {pc}
514*c0909341SAndroid Build Coastguard Workerendfunc
515*c0909341SAndroid Build Coastguard Worker.endm
516