xref: /aosp_15_r20/external/libdav1d/src/arm/32/looprestoration16.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker/*
2*c0909341SAndroid Build Coastguard Worker * Copyright © 2018, VideoLAN and dav1d authors
3*c0909341SAndroid Build Coastguard Worker * Copyright © 2020, Martin Storsjo
4*c0909341SAndroid Build Coastguard Worker * All rights reserved.
5*c0909341SAndroid Build Coastguard Worker *
6*c0909341SAndroid Build Coastguard Worker * Redistribution and use in source and binary forms, with or without
7*c0909341SAndroid Build Coastguard Worker * modification, are permitted provided that the following conditions are met:
8*c0909341SAndroid Build Coastguard Worker *
9*c0909341SAndroid Build Coastguard Worker * 1. Redistributions of source code must retain the above copyright notice, this
10*c0909341SAndroid Build Coastguard Worker *    list of conditions and the following disclaimer.
11*c0909341SAndroid Build Coastguard Worker *
12*c0909341SAndroid Build Coastguard Worker * 2. Redistributions in binary form must reproduce the above copyright notice,
13*c0909341SAndroid Build Coastguard Worker *    this list of conditions and the following disclaimer in the documentation
14*c0909341SAndroid Build Coastguard Worker *    and/or other materials provided with the distribution.
15*c0909341SAndroid Build Coastguard Worker *
16*c0909341SAndroid Build Coastguard Worker * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17*c0909341SAndroid Build Coastguard Worker * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18*c0909341SAndroid Build Coastguard Worker * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19*c0909341SAndroid Build Coastguard Worker * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20*c0909341SAndroid Build Coastguard Worker * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21*c0909341SAndroid Build Coastguard Worker * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22*c0909341SAndroid Build Coastguard Worker * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23*c0909341SAndroid Build Coastguard Worker * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24*c0909341SAndroid Build Coastguard Worker * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25*c0909341SAndroid Build Coastguard Worker * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*c0909341SAndroid Build Coastguard Worker */
27*c0909341SAndroid Build Coastguard Worker
28*c0909341SAndroid Build Coastguard Worker#include "src/arm/asm.S"
29*c0909341SAndroid Build Coastguard Worker#include "util.S"
30*c0909341SAndroid Build Coastguard Worker
// Byte mask table used for right-edge padding in wiener_filter_h_16bpc_neon:
// 32 bytes of 0x00 immediately followed, at right_ext_mask, by 32 bytes of 0xff.
// The filter loads 32 bytes starting (6 + 2*w) bytes before right_ext_mask,
// which yields a mask that is zero for the first w+3 16-bit lanes (pixels to
// keep) and all-ones for the remaining lanes (pixels to replace with padding).
// The two halves must stay contiguous and in this order for that to work.
31*c0909341SAndroid Build Coastguard Workerconst right_ext_mask_buf
32*c0909341SAndroid Build Coastguard Worker        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
33*c0909341SAndroid Build Coastguard Worker        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
34*c0909341SAndroid Build Coastguard Worker        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
35*c0909341SAndroid Build Coastguard Worker        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        // Mask loads are addressed backwards from this label.
36*c0909341SAndroid Build Coastguard Workerright_ext_mask:
37*c0909341SAndroid Build Coastguard Worker        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
38*c0909341SAndroid Build Coastguard Worker        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
39*c0909341SAndroid Build Coastguard Worker        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
40*c0909341SAndroid Build Coastguard Worker        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
41*c0909341SAndroid Build Coastguard Workerendconst
42*c0909341SAndroid Build Coastguard Worker
43*c0909341SAndroid Build Coastguard Worker// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4],
44*c0909341SAndroid Build Coastguard Worker//                                       const pixel *src, ptrdiff_t stride,
45*c0909341SAndroid Build Coastguard Worker//                                       const int16_t fh[7], const intptr_t w,
46*c0909341SAndroid Build Coastguard Worker//                                       int h, enum LrEdgeFlags edges,
47*c0909341SAndroid Build Coastguard Worker//                                       const int bitdepth_max);
// Horizontal pass of the 16 bpc Wiener filter. Two source rows are filtered
// per vertical iteration; 8 output values per row are produced per horizontal
// iteration and stored as 16-bit intermediates.
//
// Register roles once the prologue has run:
//   r0  = dst, first output row         r12 = dst + mid_stride, second output row
//   r1  = left (may be NULL; advanced as left-edge pixels are consumed)
//   r2  = src, first input row          lr  = src + stride, second input row
//   r3  = byte step that moves r2/lr from the end of one row pair to the next
//   r4  = fh (later reused as the mask pointer)
//   r5  = w still to do in this row     r8  = original w, used to reload r5
//   r6  = h still to do                 r7  = edges (bit 0 LEFT, bit 1 RIGHT)
//   r10 = byte step that moves r0/r12 from the end of one row pair to the next
//   q0  = 8 halfwords loaded from fh (only lanes 3..6 are used below)
//   q13 = -round_bits_h, q14 = 1 << (bitdepth + 6), q15 = 8192 per 16-bit lane
//   q11/q12 = right padding pixel for row 0 / row 1 (only when !LR_HAVE_RIGHT)
48*c0909341SAndroid Build Coastguard Workerfunction wiener_filter_h_16bpc_neon, export=1
49*c0909341SAndroid Build Coastguard Worker        push            {r4-r11,lr}
50*c0909341SAndroid Build Coastguard Worker        vpush           {q4-q7}
        // 9 core registers (36 bytes) + q4-q7 (64 bytes) were pushed, so the
        // first stack-passed argument is at sp + 100.
51*c0909341SAndroid Build Coastguard Worker        ldrd            r4,  r5,  [sp, #100] // r4 = fh, r5 = w
52*c0909341SAndroid Build Coastguard Worker        ldrd            r6,  r7,  [sp, #108] // r6 = h, r7 = edges
53*c0909341SAndroid Build Coastguard Worker        ldr             r8,       [sp, #116] // bitdepth_max
54*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0}, [r4, :128]
        // The shift amounts are derived from the leading-zero count of
        // bitdepth_max.
55*c0909341SAndroid Build Coastguard Worker        clz             r8,  r8
56*c0909341SAndroid Build Coastguard Worker        vmov.i32        q14, #1
57*c0909341SAndroid Build Coastguard Worker        sub             r9,  r8,  #38  // -(bitdepth + 6)
58*c0909341SAndroid Build Coastguard Worker        sub             r8,  r8,  #25  // -round_bits_h
59*c0909341SAndroid Build Coastguard Worker        neg             r9,  r9        // bitdepth + 6
        // q1 is only a temporary shift count here; it is reused for left-edge
        // pixels inside the loop.
60*c0909341SAndroid Build Coastguard Worker        vdup.32         q1,  r9
61*c0909341SAndroid Build Coastguard Worker        vdup.32         q13, r8        // -round_bits_h
62*c0909341SAndroid Build Coastguard Worker        vmov.i16        q15, #8192
63*c0909341SAndroid Build Coastguard Worker        vshl.u32        q14, q14, q1   // 1 << (bitdepth + 6)
        // Keep the full width in r8; r5 is counted down per row.
64*c0909341SAndroid Build Coastguard Worker        mov             r8,  r5
65*c0909341SAndroid Build Coastguard Worker        // Calculate mid_stride
        // r10 = (w rounded up to a multiple of 8) * 2 bytes per value.
66*c0909341SAndroid Build Coastguard Worker        add             r10, r5,  #7
67*c0909341SAndroid Build Coastguard Worker        bic             r10, r10, #7
68*c0909341SAndroid Build Coastguard Worker        lsl             r10, r10, #1
69*c0909341SAndroid Build Coastguard Worker
70*c0909341SAndroid Build Coastguard Worker        // Set up pointers for reading/writing alternate rows
        // Both strides are doubled because each iteration handles two rows.
71*c0909341SAndroid Build Coastguard Worker        add             r12, r0,  r10
72*c0909341SAndroid Build Coastguard Worker        lsl             r10, r10, #1
73*c0909341SAndroid Build Coastguard Worker        add             lr,  r2,  r3
74*c0909341SAndroid Build Coastguard Worker        lsl             r3,  r3,  #1
75*c0909341SAndroid Build Coastguard Worker
76*c0909341SAndroid Build Coastguard Worker        // Subtract the aligned width from mid_stride
        // The stores post-increment r0/r12 by the aligned width, so r10 only
        // needs to cover the remainder.
77*c0909341SAndroid Build Coastguard Worker        add             r11, r5,  #7
78*c0909341SAndroid Build Coastguard Worker        bic             r11, r11, #7
79*c0909341SAndroid Build Coastguard Worker        sub             r10, r10, r11, lsl #1
80*c0909341SAndroid Build Coastguard Worker
81*c0909341SAndroid Build Coastguard Worker        // Subtract the number of pixels read from the source stride
        // Per row: 16 pixels in the initial load plus 8 more for every further
        // horizontal iteration, i.e. aligned width + 8 pixels in total.
82*c0909341SAndroid Build Coastguard Worker        add             r11, r11, #8
83*c0909341SAndroid Build Coastguard Worker        sub             r3,  r3,  r11, lsl #1
84*c0909341SAndroid Build Coastguard Worker
85*c0909341SAndroid Build Coastguard Worker        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
86*c0909341SAndroid Build Coastguard Worker        tst             r7,  #1 // LR_HAVE_LEFT
87*c0909341SAndroid Build Coastguard Worker        beq             2f
88*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT
89*c0909341SAndroid Build Coastguard Worker        cmp             r1,  #0
90*c0909341SAndroid Build Coastguard Worker        bne             0f
91*c0909341SAndroid Build Coastguard Worker        // left == NULL
        // The 3 left pixels (6 bytes) are read straight from the source rows.
92*c0909341SAndroid Build Coastguard Worker        sub             r2,  r2,  #6
93*c0909341SAndroid Build Coastguard Worker        sub             lr,  lr,  #6
94*c0909341SAndroid Build Coastguard Worker        b               1f
        // Both remaining cases rewind r2/lr by 6 bytes inside the loop, so
        // they share the same +6 stride compensation.
95*c0909341SAndroid Build Coastguard Worker0:      // LR_HAVE_LEFT, left != NULL
96*c0909341SAndroid Build Coastguard Worker2:      // !LR_HAVE_LEFT, increase the stride.
97*c0909341SAndroid Build Coastguard Worker        // For this case we don't read the left 3 pixels from the src pointer,
98*c0909341SAndroid Build Coastguard Worker        // but shift it as if we had done that.
99*c0909341SAndroid Build Coastguard Worker        add             r3,  r3,  #6
100*c0909341SAndroid Build Coastguard Worker
101*c0909341SAndroid Build Coastguard Worker
102*c0909341SAndroid Build Coastguard Worker1:      // Loop vertically
        // Load the first 16 pixels of both rows: row 0 into q2/q3, row 1 into
        // q4/q5.
103*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2, q3}, [r2]!
104*c0909341SAndroid Build Coastguard Worker        vld1.16         {q4, q5}, [lr]!
105*c0909341SAndroid Build Coastguard Worker
106*c0909341SAndroid Build Coastguard Worker        tst             r7,  #1 // LR_HAVE_LEFT
107*c0909341SAndroid Build Coastguard Worker        beq             0f
108*c0909341SAndroid Build Coastguard Worker        cmp             r1,  #0
        // left == NULL: the left pixels were already part of the load above.
109*c0909341SAndroid Build Coastguard Worker        beq             2f
110*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT, left != NULL
        // d3 is the upper half of q1 and d13 the upper half of q6; each gets
        // 4 pixels from left (row 0, then row 1). The vext by 10 bytes below
        // keeps the last 3 of them in front of the row data.
111*c0909341SAndroid Build Coastguard Worker        vld1.16         {d3},  [r1]!
112*c0909341SAndroid Build Coastguard Worker        // Move r2/lr back to account for the last 3 pixels we loaded earlier,
113*c0909341SAndroid Build Coastguard Worker        // which we'll shift out.
114*c0909341SAndroid Build Coastguard Worker        sub             r2,  r2,  #6
115*c0909341SAndroid Build Coastguard Worker        sub             lr,  lr,  #6
116*c0909341SAndroid Build Coastguard Worker        vld1.16         {d13}, [r1]!
        // The upper register of each pair must be shifted first, because it
        // still needs the unshifted lower register as input.
117*c0909341SAndroid Build Coastguard Worker        vext.8          q3,  q2,  q3,  #10
118*c0909341SAndroid Build Coastguard Worker        vext.8          q2,  q1,  q2,  #10
119*c0909341SAndroid Build Coastguard Worker        vext.8          q5,  q4,  q5,  #10
120*c0909341SAndroid Build Coastguard Worker        vext.8          q4,  q6,  q4,  #10
121*c0909341SAndroid Build Coastguard Worker        b               2f
122*c0909341SAndroid Build Coastguard Worker0:
123*c0909341SAndroid Build Coastguard Worker        // !LR_HAVE_LEFT, fill q1 with the leftmost pixel
124*c0909341SAndroid Build Coastguard Worker        // and shift q2/q3 to have 3x the first pixel at the front.
        // q6 and q4/q5 get the same treatment for the second row.
125*c0909341SAndroid Build Coastguard Worker        vdup.16         q1,  d4[0]
126*c0909341SAndroid Build Coastguard Worker        vdup.16         q6,  d8[0]
127*c0909341SAndroid Build Coastguard Worker        // Move r2 back to account for the last 3 pixels we loaded before,
128*c0909341SAndroid Build Coastguard Worker        // which we shifted out.
129*c0909341SAndroid Build Coastguard Worker        sub             r2,  r2,  #6
130*c0909341SAndroid Build Coastguard Worker        sub             lr,  lr,  #6
131*c0909341SAndroid Build Coastguard Worker        vext.8          q3,  q2,  q3,  #10
132*c0909341SAndroid Build Coastguard Worker        vext.8          q2,  q1,  q2,  #10
133*c0909341SAndroid Build Coastguard Worker        vext.8          q5,  q4,  q5,  #10
134*c0909341SAndroid Build Coastguard Worker        vext.8          q4,  q6,  q4,  #10
135*c0909341SAndroid Build Coastguard Worker
136*c0909341SAndroid Build Coastguard Worker2:
137*c0909341SAndroid Build Coastguard Worker
138*c0909341SAndroid Build Coastguard Worker        tst             r7,  #2 // LR_HAVE_RIGHT
139*c0909341SAndroid Build Coastguard Worker        bne             4f
140*c0909341SAndroid Build Coastguard Worker        // If we'll need to pad the right edge, load that pixel to pad with
141*c0909341SAndroid Build Coastguard Worker        // here since we can find it pretty easily from here.
        // In every left-edge case r2/lr now point 13 pixels past the row's
        // src start, so offset (w - 14) * 2 bytes addresses pixel w - 1.
142*c0909341SAndroid Build Coastguard Worker        sub             r9,  r5,  #14
143*c0909341SAndroid Build Coastguard Worker        lsl             r9,  r9,  #1
144*c0909341SAndroid Build Coastguard Worker        ldrh            r11, [r2, r9]
145*c0909341SAndroid Build Coastguard Worker        ldrh            r9,  [lr, r9]
146*c0909341SAndroid Build Coastguard Worker        // Fill q11/q12 with the right padding pixel
147*c0909341SAndroid Build Coastguard Worker        vdup.16         q11, r11
148*c0909341SAndroid Build Coastguard Worker        vdup.16         q12, r9
149*c0909341SAndroid Build Coastguard Worker3:      // !LR_HAVE_RIGHT
150*c0909341SAndroid Build Coastguard Worker
151*c0909341SAndroid Build Coastguard Worker        // Check whether we need to pad the right edge
152*c0909341SAndroid Build Coastguard Worker        cmp             r5,  #11
153*c0909341SAndroid Build Coastguard Worker        bge             4f   // If w >= 11, all used input pixels are valid
154*c0909341SAndroid Build Coastguard Worker
155*c0909341SAndroid Build Coastguard Worker        // 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10,
156*c0909341SAndroid Build Coastguard Worker        // this ends up called again; it's not strictly needed in those
157*c0909341SAndroid Build Coastguard Worker        // cases (we pad enough here), but keeping the code as simple as possible.
158*c0909341SAndroid Build Coastguard Worker
159*c0909341SAndroid Build Coastguard Worker        // Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the
160*c0909341SAndroid Build Coastguard Worker        // buffer pointer.
        // r4 (fh pointer, no longer needed since q0 holds the coefficients)
        // becomes right_ext_mask - 6 - 2*w. The 32 bytes loaded from there are
        // 0x00 for lanes to keep and 0xff for lanes to overwrite.
161*c0909341SAndroid Build Coastguard Worker        movrel_local    r4,  right_ext_mask, -6
162*c0909341SAndroid Build Coastguard Worker        sub             r4,  r4,  r5,  lsl #1
163*c0909341SAndroid Build Coastguard Worker        vld1.8          {q9, q10}, [r4]
164*c0909341SAndroid Build Coastguard Worker
        // vbit copies the padding pixel into the destination wherever the mask
        // bits are set.
165*c0909341SAndroid Build Coastguard Worker        vbit            q2,  q11, q9
166*c0909341SAndroid Build Coastguard Worker        vbit            q3,  q11, q10
167*c0909341SAndroid Build Coastguard Worker        vbit            q4,  q12, q9
168*c0909341SAndroid Build Coastguard Worker        vbit            q5,  q12, q10
169*c0909341SAndroid Build Coastguard Worker
170*c0909341SAndroid Build Coastguard Worker4:      // Loop horizontally
        // Row 0. The taps are applied symmetrically around the centre pixel
        // (offset 3): pixels at offsets 2+4, 1+5 and 0+6 are summed first and
        // multiplied by lanes 4, 5 and 6 of q0; the centre uses lane 3.
        // q6 accumulates outputs 0-3 and q7 outputs 4-7 as 32-bit values.
171*c0909341SAndroid Build Coastguard Worker        vext.8          q7,  q2,  q3,  #4
172*c0909341SAndroid Build Coastguard Worker        vext.8          q8,  q2,  q3,  #8
173*c0909341SAndroid Build Coastguard Worker        vext.8          q6,  q2,  q3,  #2
174*c0909341SAndroid Build Coastguard Worker        vext.8          q9,  q2,  q3,  #10
175*c0909341SAndroid Build Coastguard Worker        vadd.i16        q8,  q8,  q7
176*c0909341SAndroid Build Coastguard Worker        vadd.i16        q9,  q9,  q6
177*c0909341SAndroid Build Coastguard Worker        vext.8          q6,  q2,  q3,  #12
178*c0909341SAndroid Build Coastguard Worker        vext.8          q7,  q2,  q3,  #6
        // q2 is overwritten with the outer pair sum; the raw row data needed
        // for the next iteration is still available in q3.
179*c0909341SAndroid Build Coastguard Worker        vadd.i16        q2,  q2,  q6
180*c0909341SAndroid Build Coastguard Worker        vmull.s16       q6,  d14, d0[3]
181*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q6,  d16, d1[0]
182*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q6,  d18, d1[1]
183*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q6,  d4,  d1[2]
184*c0909341SAndroid Build Coastguard Worker        vmull.s16       q7,  d15, d0[3]
185*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q7,  d17, d1[0]
186*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q7,  d19, d1[1]
187*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q7,  d5,  d1[2]
188*c0909341SAndroid Build Coastguard Worker
        // Row 1: same computation on q4/q5, accumulating into q8 (outputs 0-3)
        // and q9 (outputs 4-7). q2 and q10 serve as temporaries here.
189*c0909341SAndroid Build Coastguard Worker        vext.8          q8,  q4,  q5,  #4
190*c0909341SAndroid Build Coastguard Worker        vext.8          q10, q4,  q5,  #8
191*c0909341SAndroid Build Coastguard Worker        vext.8          q9,  q4,  q5,  #2
192*c0909341SAndroid Build Coastguard Worker        vext.8          q2,  q4,  q5,  #10
193*c0909341SAndroid Build Coastguard Worker        vadd.i16        q10, q10, q8
194*c0909341SAndroid Build Coastguard Worker        vadd.i16        q2,  q2,  q9
195*c0909341SAndroid Build Coastguard Worker        vext.8          q8,  q4,  q5,  #12
196*c0909341SAndroid Build Coastguard Worker        vext.8          q9,  q4,  q5,  #6
197*c0909341SAndroid Build Coastguard Worker        vadd.i16        q4,  q4,  q8
198*c0909341SAndroid Build Coastguard Worker        vmull.s16       q8,  d18, d0[3]
199*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q8,  d20, d1[0]
200*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q8,  d4,  d1[1]
201*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q8,  d8,  d1[2]
202*c0909341SAndroid Build Coastguard Worker        vmull.s16       q9,  d19, d0[3]
203*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q9,  d21, d1[0]
204*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q9,  d5,  d1[1]
205*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q9,  d9,  d1[2]
206*c0909341SAndroid Build Coastguard Worker
        // Post-processing for both rows: add the 1 << (bitdepth + 6) offset,
        // rounding-shift right by round_bits_h (vrshl with a negative count),
        // saturate to unsigned 16 bit, clamp to 0x7fff and subtract 8192.
207*c0909341SAndroid Build Coastguard Worker        vmvn.i16        q10, #0x8000 // 0x7fff = (1 << 15) - 1
208*c0909341SAndroid Build Coastguard Worker        vadd.i32        q6,  q6,  q14
209*c0909341SAndroid Build Coastguard Worker        vadd.i32        q7,  q7,  q14
210*c0909341SAndroid Build Coastguard Worker        vadd.i32        q8,  q8,  q14
211*c0909341SAndroid Build Coastguard Worker        vadd.i32        q9,  q9,  q14
212*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q6,  q6,  q13
213*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q7,  q7,  q13
214*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q8,  q8,  q13
215*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q9,  q9,  q13
        // Narrowing packs row 0 into q6 (d12/d13) and row 1 into q7 (d14/d15).
216*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d12, q6
217*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d13, q7
218*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d14, q8
219*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d15, q9
220*c0909341SAndroid Build Coastguard Worker        vmin.u16        q6,  q6,  q10
221*c0909341SAndroid Build Coastguard Worker        vmin.u16        q7,  q7,  q10
222*c0909341SAndroid Build Coastguard Worker        vsub.i16        q6,  q6,  q15
223*c0909341SAndroid Build Coastguard Worker        vsub.i16        q7,  q7,  q15
        // The flags set by this subs are consumed by the ble below; the NEON
        // stores in between leave them untouched.
224*c0909341SAndroid Build Coastguard Worker        subs            r5,  r5,  #8
225*c0909341SAndroid Build Coastguard Worker        vst1.16         {q6}, [r0,  :128]!
226*c0909341SAndroid Build Coastguard Worker        vst1.16         {q7}, [r12, :128]!
227*c0909341SAndroid Build Coastguard Worker
228*c0909341SAndroid Build Coastguard Worker        ble             9f
        // More pixels remain in this row: slide the window by 8 pixels and
        // load the next 8. The tst result is consumed by the bne after the
        // NEON moves and loads.
229*c0909341SAndroid Build Coastguard Worker        tst             r7,  #2 // LR_HAVE_RIGHT
230*c0909341SAndroid Build Coastguard Worker        vmov            q2,  q3
231*c0909341SAndroid Build Coastguard Worker        vmov            q4,  q5
232*c0909341SAndroid Build Coastguard Worker        vld1.16         {q3}, [r2]!
233*c0909341SAndroid Build Coastguard Worker        vld1.16         {q5}, [lr]!
234*c0909341SAndroid Build Coastguard Worker        bne             4b // If we don't need to pad, just keep filtering.
235*c0909341SAndroid Build Coastguard Worker        b               3b // If we need to pad, check how many pixels we have left.
236*c0909341SAndroid Build Coastguard Worker
237*c0909341SAndroid Build Coastguard Worker9:
        // End of a row pair: two rows were processed, so h drops by 2.
238*c0909341SAndroid Build Coastguard Worker        subs            r6,  r6,  #2
239*c0909341SAndroid Build Coastguard Worker        ble             0f
240*c0909341SAndroid Build Coastguard Worker        // Jump to the next row and loop horizontally
241*c0909341SAndroid Build Coastguard Worker        add             r0,  r0,  r10
242*c0909341SAndroid Build Coastguard Worker        add             r12, r12, r10
243*c0909341SAndroid Build Coastguard Worker        add             r2,  r2,  r3
244*c0909341SAndroid Build Coastguard Worker        add             lr,  lr,  r3
        // Restore the full width for the next pair of rows.
245*c0909341SAndroid Build Coastguard Worker        mov             r5,  r8
246*c0909341SAndroid Build Coastguard Worker        b               1b
247*c0909341SAndroid Build Coastguard Worker0:
        // Restore the callee-saved registers pushed in the prologue and
        // return by popping the saved lr into pc.
248*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q7}
249*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
250*c0909341SAndroid Build Coastguard Workerendfunc
251*c0909341SAndroid Build Coastguard Worker
252*c0909341SAndroid Build Coastguard Worker// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride,
253*c0909341SAndroid Build Coastguard Worker//                                       const int16_t *mid, int w, int h,
254*c0909341SAndroid Build Coastguard Worker//                                       const int16_t fv[7], enum LrEdgeFlags edges,
255*c0909341SAndroid Build Coastguard Worker//                                       ptrdiff_t mid_stride, const int bitdepth_max);
256*c0909341SAndroid Build Coastguard Workerfunction wiener_filter_v_16bpc_neon, export=1
257*c0909341SAndroid Build Coastguard Worker        push            {r4-r7,lr}
258*c0909341SAndroid Build Coastguard Worker        vpush           {q4-q5}
259*c0909341SAndroid Build Coastguard Worker        ldrd            r4,  r5,  [sp, #52]
260*c0909341SAndroid Build Coastguard Worker        ldrd            r6,  r7,  [sp, #60]
261*c0909341SAndroid Build Coastguard Worker        ldr             lr,       [sp, #68] // bitdepth_max
262*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0},  [r5, :128]
263*c0909341SAndroid Build Coastguard Worker        vdup.16         q5,  lr
264*c0909341SAndroid Build Coastguard Worker        clz             lr,  lr
265*c0909341SAndroid Build Coastguard Worker        sub             lr,  lr,  #11   // round_bits_v
266*c0909341SAndroid Build Coastguard Worker        vdup.32         q4,  lr
267*c0909341SAndroid Build Coastguard Worker        mov             lr,  r4
268*c0909341SAndroid Build Coastguard Worker        vneg.s32        q4,  q4         // -round_bits_v
269*c0909341SAndroid Build Coastguard Worker
270*c0909341SAndroid Build Coastguard Worker        // Calculate the number of rows to move back when looping vertically
271*c0909341SAndroid Build Coastguard Worker        mov             r12, r4
272*c0909341SAndroid Build Coastguard Worker        tst             r6,  #4 // LR_HAVE_TOP
273*c0909341SAndroid Build Coastguard Worker        beq             0f
274*c0909341SAndroid Build Coastguard Worker        sub             r2,  r2,  r7, lsl #1
275*c0909341SAndroid Build Coastguard Worker        add             r12, r12, #2
276*c0909341SAndroid Build Coastguard Worker0:
277*c0909341SAndroid Build Coastguard Worker        tst             r6,  #8 // LR_HAVE_BOTTOM
278*c0909341SAndroid Build Coastguard Worker        beq             1f
279*c0909341SAndroid Build Coastguard Worker        add             r12, r12, #2
280*c0909341SAndroid Build Coastguard Worker
281*c0909341SAndroid Build Coastguard Worker1:      // Start of horizontal loop; start one vertical filter slice.
282*c0909341SAndroid Build Coastguard Worker        // Load rows into q8-q11 and pad properly.
283*c0909341SAndroid Build Coastguard Worker        tst             r6,  #4 // LR_HAVE_TOP
284*c0909341SAndroid Build Coastguard Worker        vld1.16         {q8},  [r2, :128], r7
285*c0909341SAndroid Build Coastguard Worker        beq             2f
286*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_TOP
287*c0909341SAndroid Build Coastguard Worker        vld1.16         {q10}, [r2, :128], r7
288*c0909341SAndroid Build Coastguard Worker        vmov            q9,  q8
289*c0909341SAndroid Build Coastguard Worker        vld1.16         {q11}, [r2, :128], r7
290*c0909341SAndroid Build Coastguard Worker        b               3f
291*c0909341SAndroid Build Coastguard Worker2:      // !LR_HAVE_TOP
292*c0909341SAndroid Build Coastguard Worker        vmov            q9,  q8
293*c0909341SAndroid Build Coastguard Worker        vmov            q10, q8
294*c0909341SAndroid Build Coastguard Worker        vmov            q11, q8
295*c0909341SAndroid Build Coastguard Worker
296*c0909341SAndroid Build Coastguard Worker3:
297*c0909341SAndroid Build Coastguard Worker        cmp             r4,  #4
298*c0909341SAndroid Build Coastguard Worker        blt             5f
299*c0909341SAndroid Build Coastguard Worker        // Start filtering normally; fill in q12-q14 with unique rows.
300*c0909341SAndroid Build Coastguard Worker        vld1.16         {q12}, [r2, :128], r7
301*c0909341SAndroid Build Coastguard Worker        vld1.16         {q13}, [r2, :128], r7
302*c0909341SAndroid Build Coastguard Worker        vld1.16         {q14}, [r2, :128], r7
303*c0909341SAndroid Build Coastguard Worker
304*c0909341SAndroid Build Coastguard Worker4:
// filter compare: emit one output row of the vertical 7-tap filter and slide
// the row window down by one.
//   Inputs:  q8-q14 = seven consecutive input rows (8 values each),
//            q0     = filter coefficients (lanes 0..6 are used),
//            q4     = -round_bits_v, q5 = bitdepth_max per 16-bit lane,
//            r0/r1  = output pointer and its per-row increment,
//            r4     = remaining row count.
//   Outputs: one row of 8 pixels stored at [r0]; r0 advanced by r1;
//            r4 decremented; q8-q13 shifted to hold the old q9-q14.
//            q14 is left unchanged for the invoker to refill.
//   Clobbers: q2, q3, condition flags.
//   compare != 0: ends with flags from "cmp r4, #4" for the invoker to test.
//   compare == 0: branches to local label 9 (defined outside the visible part
//                 of this file) once the decremented r4 is <= 0.
305*c0909341SAndroid Build Coastguard Worker.macro filter compare
        // These flags are only consumed by the ble in the compare == 0
        // variant; no instruction in between modifies them.
306*c0909341SAndroid Build Coastguard Worker        subs            r4,  r4,  #1
307*c0909341SAndroid Build Coastguard Worker        // Interleaving the mul/mla chains actually hurts performance
308*c0909341SAndroid Build Coastguard Worker        // significantly on Cortex A53, thus keeping mul/mla tightly
309*c0909341SAndroid Build Coastguard Worker        // chained like this.
        // q2 accumulates the low 4 lanes of each row (even d registers).
310*c0909341SAndroid Build Coastguard Worker        vmull.s16       q2,  d16, d0[0]
311*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d18, d0[1]
312*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d20, d0[2]
313*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d22, d0[3]
314*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d24, d1[0]
315*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d26, d1[1]
316*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d28, d1[2]
        // q3 accumulates the high 4 lanes of each row (odd d registers).
317*c0909341SAndroid Build Coastguard Worker        vmull.s16       q3,  d17, d0[0]
318*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d19, d0[1]
319*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d21, d0[2]
320*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d23, d0[3]
321*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d25, d1[0]
322*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d27, d1[1]
323*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d29, d1[2]
        // q4 holds a negative count, so vrshl performs a rounding right shift.
324*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q2,  q2,  q4    // round_bits_v
325*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q3,  q3,  q4
        // Saturate to unsigned 16 bit, packing both halves into q2, then clamp
        // to the maximum pixel value.
326*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d4,  q2
327*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d5,  q3
328*c0909341SAndroid Build Coastguard Worker        vmin.u16        q2,  q2,  q5    // bitdepth_max
329*c0909341SAndroid Build Coastguard Worker        vst1.16         {q2}, [r0, :128], r1
330*c0909341SAndroid Build Coastguard Worker.if \compare
331*c0909341SAndroid Build Coastguard Worker        cmp             r4,  #4
332*c0909341SAndroid Build Coastguard Worker.else
333*c0909341SAndroid Build Coastguard Worker        ble             9f
334*c0909341SAndroid Build Coastguard Worker.endif
        // Slide the 7-row window by one row. The vmov instructions leave the
        // flags from the cmp above intact for the invoker.
335*c0909341SAndroid Build Coastguard Worker        vmov            q8,  q9
336*c0909341SAndroid Build Coastguard Worker        vmov            q9,  q10
337*c0909341SAndroid Build Coastguard Worker        vmov            q10, q11
338*c0909341SAndroid Build Coastguard Worker        vmov            q11, q12
339*c0909341SAndroid Build Coastguard Worker        vmov            q12, q13
340*c0909341SAndroid Build Coastguard Worker        vmov            q13, q14
341*c0909341SAndroid Build Coastguard Worker.endm
342*c0909341SAndroid Build Coastguard Worker        filter          1
343*c0909341SAndroid Build Coastguard Worker        blt             7f
344*c0909341SAndroid Build Coastguard Worker        vld1.16         {q14}, [r2, :128], r7
345*c0909341SAndroid Build Coastguard Worker        b               4b
346*c0909341SAndroid Build Coastguard Worker
347*c0909341SAndroid Build Coastguard Worker5:      // Less than 4 rows in total; not all of q12-q13 are filled yet.
348*c0909341SAndroid Build Coastguard Worker        tst             r6,  #8 // LR_HAVE_BOTTOM
349*c0909341SAndroid Build Coastguard Worker        beq             6f
350*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_BOTTOM
351*c0909341SAndroid Build Coastguard Worker        cmp             r4,  #2
352*c0909341SAndroid Build Coastguard Worker        // We load at least 2 rows in all cases.
353*c0909341SAndroid Build Coastguard Worker        vld1.16         {q12}, [r2, :128], r7
354*c0909341SAndroid Build Coastguard Worker        vld1.16         {q13}, [r2, :128], r7
355*c0909341SAndroid Build Coastguard Worker        bgt             53f // 3 rows in total
356*c0909341SAndroid Build Coastguard Worker        beq             52f // 2 rows in total
357*c0909341SAndroid Build Coastguard Worker51:     // 1 row in total, q11 already loaded, load edge into q12-q14.
358*c0909341SAndroid Build Coastguard Worker        vmov            q13, q12
359*c0909341SAndroid Build Coastguard Worker        b               8f
360*c0909341SAndroid Build Coastguard Worker52:     // 2 rows in total, q11 already loaded, load q12 with content data
361*c0909341SAndroid Build Coastguard Worker        // and 2 rows of edge.
362*c0909341SAndroid Build Coastguard Worker        vld1.16         {q14}, [r2, :128], r7
363*c0909341SAndroid Build Coastguard Worker        vmov            q15, q14
364*c0909341SAndroid Build Coastguard Worker        b               8f
365*c0909341SAndroid Build Coastguard Worker53:
366*c0909341SAndroid Build Coastguard Worker        // 3 rows in total, q11 already loaded, load q12 and q13 with content
367*c0909341SAndroid Build Coastguard Worker        // and 2 rows of edge.
368*c0909341SAndroid Build Coastguard Worker        vld1.16         {q14}, [r2, :128], r7
369*c0909341SAndroid Build Coastguard Worker        vld1.16         {q15}, [r2, :128], r7
370*c0909341SAndroid Build Coastguard Worker        vmov            q1,  q15
371*c0909341SAndroid Build Coastguard Worker        b               8f
372*c0909341SAndroid Build Coastguard Worker
373*c0909341SAndroid Build Coastguard Worker6:
374*c0909341SAndroid Build Coastguard Worker        // !LR_HAVE_BOTTOM
375*c0909341SAndroid Build Coastguard Worker        cmp             r4,  #2
376*c0909341SAndroid Build Coastguard Worker        bgt             63f // 3 rows in total
377*c0909341SAndroid Build Coastguard Worker        beq             62f // 2 rows in total
378*c0909341SAndroid Build Coastguard Worker61:     // 1 row in total, q11 already loaded, pad that into q12-q14.
379*c0909341SAndroid Build Coastguard Worker        vmov            q12, q11
380*c0909341SAndroid Build Coastguard Worker        vmov            q13, q11
381*c0909341SAndroid Build Coastguard Worker        vmov            q14, q11
382*c0909341SAndroid Build Coastguard Worker        b               8f
383*c0909341SAndroid Build Coastguard Worker62:     // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15.
384*c0909341SAndroid Build Coastguard Worker        vld1.16         {q12}, [r2, :128], r7
385*c0909341SAndroid Build Coastguard Worker        vmov            q13, q12
386*c0909341SAndroid Build Coastguard Worker        vmov            q14, q12
387*c0909341SAndroid Build Coastguard Worker        vmov            q15, q12
388*c0909341SAndroid Build Coastguard Worker        b               8f
389*c0909341SAndroid Build Coastguard Worker63:
390*c0909341SAndroid Build Coastguard Worker        // 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
391*c0909341SAndroid Build Coastguard Worker        vld1.16         {q12}, [r2, :128], r7
392*c0909341SAndroid Build Coastguard Worker        vld1.16         {q13}, [r2, :128], r7
393*c0909341SAndroid Build Coastguard Worker        vmov            q14, q13
394*c0909341SAndroid Build Coastguard Worker        vmov            q15, q13
395*c0909341SAndroid Build Coastguard Worker        vmov            q1,  q13
396*c0909341SAndroid Build Coastguard Worker        b               8f
397*c0909341SAndroid Build Coastguard Worker
398*c0909341SAndroid Build Coastguard Worker7:
399*c0909341SAndroid Build Coastguard Worker        // All registers up to q13 are filled already, 3 valid rows left.
400*c0909341SAndroid Build Coastguard Worker        // < 4 valid rows left; fill in padding and filter the last
401*c0909341SAndroid Build Coastguard Worker        // few rows.
402*c0909341SAndroid Build Coastguard Worker        tst             r6,  #8 // LR_HAVE_BOTTOM
403*c0909341SAndroid Build Coastguard Worker        beq             71f
404*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_BOTTOM; load 2 rows of edge.
405*c0909341SAndroid Build Coastguard Worker        vld1.16         {q14}, [r2, :128], r7
406*c0909341SAndroid Build Coastguard Worker        vld1.16         {q15}, [r2, :128], r7
407*c0909341SAndroid Build Coastguard Worker        vmov            q1,  q15
408*c0909341SAndroid Build Coastguard Worker        b               8f
409*c0909341SAndroid Build Coastguard Worker71:
410*c0909341SAndroid Build Coastguard Worker        // !LR_HAVE_BOTTOM, pad 3 rows
411*c0909341SAndroid Build Coastguard Worker        vmov            q14, q13
412*c0909341SAndroid Build Coastguard Worker        vmov            q15, q13
413*c0909341SAndroid Build Coastguard Worker        vmov            q1,  q13
414*c0909341SAndroid Build Coastguard Worker
415*c0909341SAndroid Build Coastguard Worker8:      // At this point, all registers up to q14-q15,q1 are loaded with
416*c0909341SAndroid Build Coastguard Worker        // edge/padding (depending on how many rows are left).
417*c0909341SAndroid Build Coastguard Worker        filter          0 // This branches to 9f when done
418*c0909341SAndroid Build Coastguard Worker        vmov            q14, q15
419*c0909341SAndroid Build Coastguard Worker        vmov            q15, q1
420*c0909341SAndroid Build Coastguard Worker        b               8b
421*c0909341SAndroid Build Coastguard Worker
422*c0909341SAndroid Build Coastguard Worker9:      // End of one vertical slice.
423*c0909341SAndroid Build Coastguard Worker        subs            r3,  r3,  #8
424*c0909341SAndroid Build Coastguard Worker        ble             0f
425*c0909341SAndroid Build Coastguard Worker        // Move pointers back up to the top and loop horizontally.
426*c0909341SAndroid Build Coastguard Worker        mls             r0,  r1,  lr,  r0
427*c0909341SAndroid Build Coastguard Worker        mls             r2,  r7,  r12, r2
428*c0909341SAndroid Build Coastguard Worker        add             r0,  r0,  #16
429*c0909341SAndroid Build Coastguard Worker        add             r2,  r2,  #16
430*c0909341SAndroid Build Coastguard Worker        mov             r4,  lr
431*c0909341SAndroid Build Coastguard Worker        b               1b
432*c0909341SAndroid Build Coastguard Worker
433*c0909341SAndroid Build Coastguard Worker0:
434*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q5}
435*c0909341SAndroid Build Coastguard Worker        pop             {r4-r7,pc}
436*c0909341SAndroid Build Coastguard Worker.purgem filter
437*c0909341SAndroid Build Coastguard Workerendfunc
438*c0909341SAndroid Build Coastguard Worker
// Row stride, in elements, of the sumsq (int32_t) and sum (int16_t) buffers.
// The box functions below scale it by the element size (4*SUM_STRIDE and
// 2*SUM_STRIDE bytes respectively) to step from one output row to the next.
439*c0909341SAndroid Build Coastguard Worker#define SUM_STRIDE (384+16)
440*c0909341SAndroid Build Coastguard Worker
441*c0909341SAndroid Build Coastguard Worker#include "looprestoration_tmpl.S"
442*c0909341SAndroid Build Coastguard Worker
443*c0909341SAndroid Build Coastguard Worker// void dav1d_sgr_box3_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
444*c0909341SAndroid Build Coastguard Worker//                                  const pixel (*left)[4],
445*c0909341SAndroid Build Coastguard Worker//                                  const pixel *src, const ptrdiff_t stride,
446*c0909341SAndroid Build Coastguard Worker//                                  const int w, const int h,
447*c0909341SAndroid Build Coastguard Worker//                                  const enum LrEdgeFlags edges);
//
// Horizontal pass of the 3-pixel box sum on 16-bit pixels. For each output
// position it stores the sum of 3 horizontally adjacent pixels to sum
// (16 bit) and the sum of their squares to sumsq (32 bit). Two rows are
// handled per iteration of the vertical loop, and 8 output positions per
// row per iteration of the horizontal loop.
//
// Arguments: r0 = sumsq, r1 = sum, r2 = left (may be NULL), r3 = src;
// stride, w, h and edges are read from the stack.
//
// Register roles after the setup code:
//   r0 / r10   sumsq write pointer for the first / second row of the pair
//   r1 / r11   sum write pointer for the first / second row of the pair
//   r3 / r12   src read pointer for the first / second row of the pair
//   r4         bytes to add to r3/r12 at the end of a row pair
//   r5         remaining width counter (starts at w + 2)
//   r6         remaining row counter (h), decremented by 2 per row pair
//   r7         edges (bit 0 tested as LR_HAVE_LEFT, bit 1 as LR_HAVE_RIGHT)
//   r8         saved w + 2, used to reload r5 for each row pair
//   r9         bytes to add to r1/r11 at the end of a row pair (r0/r10 use
//              twice that, as their elements are twice as wide)
//   lr         scratch
//
// The stores to sumsq/sum use :128 alignment hints, so those pointers have
// to be 16-byte aligned.
448*c0909341SAndroid Build Coastguard Workerfunction sgr_box3_h_16bpc_neon, export=1
449*c0909341SAndroid Build Coastguard Worker        push            {r4-r11,lr}
450*c0909341SAndroid Build Coastguard Worker        vpush           {q4-q7}
        // 9 core registers (36 bytes) plus q4-q7 (64 bytes) were pushed, so
        // the stack arguments now start at sp + 100:
        // r4 = stride, r5 = w, r6 = h, r7 = edges.
451*c0909341SAndroid Build Coastguard Worker        ldrd            r4,  r5,  [sp, #100]
452*c0909341SAndroid Build Coastguard Worker        ldrd            r6,  r7,  [sp, #108]
453*c0909341SAndroid Build Coastguard Worker        add             r5,  r5,  #2 // w += 2
454*c0909341SAndroid Build Coastguard Worker
455*c0909341SAndroid Build Coastguard Worker        // Set up pointers for reading/writing alternate rows
456*c0909341SAndroid Build Coastguard Worker        add             r10, r0,  #(4*SUM_STRIDE)   // sumsq
457*c0909341SAndroid Build Coastguard Worker        add             r11, r1,  #(2*SUM_STRIDE)   // sum
458*c0909341SAndroid Build Coastguard Worker        add             r12, r3,  r4                // src
        // Each pointer now advances two rows at a time, so double the
        // src stride.
459*c0909341SAndroid Build Coastguard Worker        lsl             r4,  r4,  #1
460*c0909341SAndroid Build Coastguard Worker        mov             r9,       #(2*2*SUM_STRIDE) // double sum stride
461*c0909341SAndroid Build Coastguard Worker
462*c0909341SAndroid Build Coastguard Worker        // Subtract the aligned width from the output stride.
        // lr = (w + 2) rounded up to a multiple of 8; the horizontal loop
        // writes that many 16 bit sums per row.
463*c0909341SAndroid Build Coastguard Worker        add             lr,  r5,  #7
464*c0909341SAndroid Build Coastguard Worker        bic             lr,  lr,  #7
465*c0909341SAndroid Build Coastguard Worker        sub             r9,  r9,  lr, lsl #1
466*c0909341SAndroid Build Coastguard Worker
467*c0909341SAndroid Build Coastguard Worker        // Store the width for the vertical loop
468*c0909341SAndroid Build Coastguard Worker        mov             r8,  r5
469*c0909341SAndroid Build Coastguard Worker
470*c0909341SAndroid Build Coastguard Worker        // Subtract the number of pixels read from the input from the stride
        // 16 pixels are loaded up front and 8 more for each further
        // iteration of the horizontal loop, i.e. 8 more than the aligned
        // width; each pixel is 2 bytes.
471*c0909341SAndroid Build Coastguard Worker        add             lr,  lr,  #8
472*c0909341SAndroid Build Coastguard Worker        sub             r4,  r4,  lr, lsl #1
473*c0909341SAndroid Build Coastguard Worker
474*c0909341SAndroid Build Coastguard Worker        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
475*c0909341SAndroid Build Coastguard Worker        tst             r7,  #1 // LR_HAVE_LEFT
476*c0909341SAndroid Build Coastguard Worker        beq             2f
477*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT
478*c0909341SAndroid Build Coastguard Worker        cmp             r2,  #0
479*c0909341SAndroid Build Coastguard Worker        bne             0f
480*c0909341SAndroid Build Coastguard Worker        // left == NULL
        // The 2 left edge pixels (4 bytes) are read straight from the
        // src rows.
481*c0909341SAndroid Build Coastguard Worker        sub             r3,  r3,  #4
482*c0909341SAndroid Build Coastguard Worker        sub             r12, r12, #4
483*c0909341SAndroid Build Coastguard Worker        b               1f
484*c0909341SAndroid Build Coastguard Worker0:      // LR_HAVE_LEFT, left != NULL
485*c0909341SAndroid Build Coastguard Worker2:      // !LR_HAVE_LEFT, increase the stride.
486*c0909341SAndroid Build Coastguard Worker        // For this case we don't read the left 2 pixels from the src pointer,
487*c0909341SAndroid Build Coastguard Worker        // but shift it as if we had done that.
        // In both of these cases the vertical loop rewinds r3/r12 by
        // 4 bytes after the first load, so the row advance has to be
        // 4 bytes larger.
488*c0909341SAndroid Build Coastguard Worker        add             r4,  r4,  #4
489*c0909341SAndroid Build Coastguard Worker
490*c0909341SAndroid Build Coastguard Worker
491*c0909341SAndroid Build Coastguard Worker1:      // Loop vertically
        // Load the first 16 pixels of both rows: q0-q1 hold the first
        // row, q4-q5 the second.
492*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0, q1}, [r3]!
493*c0909341SAndroid Build Coastguard Worker        vld1.16         {q4, q5}, [r12]!
494*c0909341SAndroid Build Coastguard Worker
495*c0909341SAndroid Build Coastguard Worker        tst             r7,  #1 // LR_HAVE_LEFT
496*c0909341SAndroid Build Coastguard Worker        beq             0f
497*c0909341SAndroid Build Coastguard Worker        cmp             r2,  #0
498*c0909341SAndroid Build Coastguard Worker        beq             2f
499*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT, left != NULL
        // Load one left[] entry (4 pixels) per row into the upper half
        // of q2 (d5) and of q6 (d13); only the last 2 pixels of each are
        // used by the vext below.
500*c0909341SAndroid Build Coastguard Worker        vld1.16         {d5}, [r2]!
501*c0909341SAndroid Build Coastguard Worker        // Move r3/r12 back to account for the last 2 pixels we loaded earlier,
502*c0909341SAndroid Build Coastguard Worker        // which we'll shift out.
503*c0909341SAndroid Build Coastguard Worker        sub             r3,  r3,  #4
504*c0909341SAndroid Build Coastguard Worker        sub             r12, r12, #4
505*c0909341SAndroid Build Coastguard Worker        vld1.16         {d13}, [r2]!
        // Shift the row data up by 2 pixels and insert the last 2 pixels
        // of q2/q6 at the front. q1/q5 are updated first since they need
        // the unshifted q0/q4.
506*c0909341SAndroid Build Coastguard Worker        vext.8          q1,  q0,  q1,  #12
507*c0909341SAndroid Build Coastguard Worker        vext.8          q0,  q2,  q0,  #12
508*c0909341SAndroid Build Coastguard Worker        vext.8          q5,  q4,  q5,  #12
509*c0909341SAndroid Build Coastguard Worker        vext.8          q4,  q6,  q4,  #12
510*c0909341SAndroid Build Coastguard Worker        b               2f
511*c0909341SAndroid Build Coastguard Worker0:
512*c0909341SAndroid Build Coastguard Worker        // !LR_HAVE_LEFT, fill q2/q6 with the leftmost pixel of each row
513*c0909341SAndroid Build Coastguard Worker        // and shift q0/q4 to have 2x the first pixel at the front.
514*c0909341SAndroid Build Coastguard Worker        vdup.16         q2,  d0[0]
515*c0909341SAndroid Build Coastguard Worker        vdup.16         q6,  d8[0]
516*c0909341SAndroid Build Coastguard Worker        // Move r3/r12 back to account for the last 2 pixels we loaded before,
517*c0909341SAndroid Build Coastguard Worker        // which we shifted out.
518*c0909341SAndroid Build Coastguard Worker        sub             r3,  r3,  #4
519*c0909341SAndroid Build Coastguard Worker        sub             r12, r12, #4
520*c0909341SAndroid Build Coastguard Worker        vext.8          q1,  q0,  q1,  #12
521*c0909341SAndroid Build Coastguard Worker        vext.8          q0,  q2,  q0,  #12
522*c0909341SAndroid Build Coastguard Worker        vext.8          q5,  q4,  q5,  #12
523*c0909341SAndroid Build Coastguard Worker        vext.8          q4,  q6,  q4,  #12
524*c0909341SAndroid Build Coastguard Worker
525*c0909341SAndroid Build Coastguard Worker2:
526*c0909341SAndroid Build Coastguard Worker        tst             r7,  #2 // LR_HAVE_RIGHT
527*c0909341SAndroid Build Coastguard Worker        bne             4f
528*c0909341SAndroid Build Coastguard Worker        // If we'll need to pad the right edge, load that pixel to pad with
529*c0909341SAndroid Build Coastguard Worker        // here since we can find it pretty easily from here.
        // The pixel is read at an offset of (r5 - 17) pixels from the
        // current r3/r12. r11 and lr serve as temporaries for the two
        // loaded values.
530*c0909341SAndroid Build Coastguard Worker        sub             lr,  r5,  #(2 + 16 - 2 + 1)
531*c0909341SAndroid Build Coastguard Worker        lsl             lr,  lr,  #1
532*c0909341SAndroid Build Coastguard Worker        ldrh            r11, [r3,  lr]
533*c0909341SAndroid Build Coastguard Worker        ldrh            lr,  [r12, lr]
534*c0909341SAndroid Build Coastguard Worker        // Fill q14/q15 with the right padding pixel
535*c0909341SAndroid Build Coastguard Worker        vdup.16         q14, r11
536*c0909341SAndroid Build Coastguard Worker        vdup.16         q15, lr
537*c0909341SAndroid Build Coastguard Worker        // Restore r11 after using it for a temporary value
        // (r1 has not been advanced within this row pair yet, so this
        // is the same value r11 had before.)
538*c0909341SAndroid Build Coastguard Worker        add             r11, r1,  #(2*SUM_STRIDE)
539*c0909341SAndroid Build Coastguard Worker3:      // !LR_HAVE_RIGHT
540*c0909341SAndroid Build Coastguard Worker
541*c0909341SAndroid Build Coastguard Worker        // Check whether we need to pad the right edge
542*c0909341SAndroid Build Coastguard Worker        cmp             r5,  #10
543*c0909341SAndroid Build Coastguard Worker        bge             4f   // If w >= 10, all used input pixels are valid
544*c0909341SAndroid Build Coastguard Worker
545*c0909341SAndroid Build Coastguard Worker        // 1 <= w < 10, w pixels valid in q0-q1. For w=9, this ends up called
546*c0909341SAndroid Build Coastguard Worker        // again; it's not strictly needed in those cases (we pad enough here),
547*c0909341SAndroid Build Coastguard Worker        // but keeping the code as simple as possible.
548*c0909341SAndroid Build Coastguard Worker
549*c0909341SAndroid Build Coastguard Worker        // Insert padding in q0/1.h[w] onwards
        // A 32 byte mask is loaded from 2*r5 bytes before right_ext_mask;
        // vbit copies the padding pixel into the lanes where the mask
        // bits are set.
550*c0909341SAndroid Build Coastguard Worker        movrel_local    lr,  right_ext_mask
551*c0909341SAndroid Build Coastguard Worker        sub             lr,  lr,  r5,  lsl #1
552*c0909341SAndroid Build Coastguard Worker        vld1.8          {q12, q13}, [lr]
553*c0909341SAndroid Build Coastguard Worker
554*c0909341SAndroid Build Coastguard Worker        vbit            q0,  q14, q12
555*c0909341SAndroid Build Coastguard Worker        vbit            q1,  q14, q13
556*c0909341SAndroid Build Coastguard Worker        vbit            q4,  q15, q12
557*c0909341SAndroid Build Coastguard Worker        vbit            q5,  q15, q13
558*c0909341SAndroid Build Coastguard Worker
559*c0909341SAndroid Build Coastguard Worker4:      // Loop horizontally
        // q8/q9 (first row) and q10/q11 (second row) hold the input
        // shifted by 1 and 2 pixels; q2/q3 accumulate the 3-pixel sums.
560*c0909341SAndroid Build Coastguard Worker        vext.8          q8,  q0,  q1,  #2
561*c0909341SAndroid Build Coastguard Worker        vext.8          q10, q4,  q5,  #2
562*c0909341SAndroid Build Coastguard Worker        vext.8          q9,  q0,  q1,  #4
563*c0909341SAndroid Build Coastguard Worker        vext.8          q11, q4,  q5,  #4
564*c0909341SAndroid Build Coastguard Worker        vadd.i16        q2,  q0,  q8
565*c0909341SAndroid Build Coastguard Worker        vadd.i16        q3,  q4,  q10
566*c0909341SAndroid Build Coastguard Worker        vadd.i16        q2,  q2,  q9
567*c0909341SAndroid Build Coastguard Worker        vadd.i16        q3,  q3,  q11
568*c0909341SAndroid Build Coastguard Worker
        // Sums of squares, widened to 32 bit: q6/q7 hold the low/high
        // 4 outputs of the first row, q12/q13 those of the second row.
569*c0909341SAndroid Build Coastguard Worker        vmull.u16       q6,  d0,  d0
570*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q6,  d16, d16
571*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q6,  d18, d18
572*c0909341SAndroid Build Coastguard Worker        vmull.u16       q12, d8,  d8
573*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q12, d20, d20
574*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q12, d22, d22
575*c0909341SAndroid Build Coastguard Worker        vmull.u16       q7,  d1,  d1
576*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q7,  d17, d17
577*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q7,  d19, d19
578*c0909341SAndroid Build Coastguard Worker        vmull.u16       q13, d9,  d9
579*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q13, d21, d21
580*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q13, d23, d23
        // The flags set by this subs are consumed by the ble after the
        // stores; the NEON stores in between leave the flags untouched.
581*c0909341SAndroid Build Coastguard Worker        subs            r5,  r5,  #8
582*c0909341SAndroid Build Coastguard Worker        vst1.16         {q2},       [r1,  :128]!
583*c0909341SAndroid Build Coastguard Worker        vst1.16         {q3},       [r11, :128]!
584*c0909341SAndroid Build Coastguard Worker        vst1.32         {q6,  q7},  [r0,  :128]!
585*c0909341SAndroid Build Coastguard Worker        vst1.32         {q12, q13}, [r10, :128]!
586*c0909341SAndroid Build Coastguard Worker
587*c0909341SAndroid Build Coastguard Worker        ble             9f
        // More output left in this row pair: slide the window by 8 pixels
        // and load the next 8 pixels of each row. The tst result is used
        // by the bne below; the NEON moves/loads in between do not
        // modify the flags.
588*c0909341SAndroid Build Coastguard Worker        tst             r7,  #2 // LR_HAVE_RIGHT
589*c0909341SAndroid Build Coastguard Worker        vmov            q0,  q1
590*c0909341SAndroid Build Coastguard Worker        vmov            q4,  q5
591*c0909341SAndroid Build Coastguard Worker        vld1.16         {q1}, [r3]!
592*c0909341SAndroid Build Coastguard Worker        vld1.16         {q5}, [r12]!
593*c0909341SAndroid Build Coastguard Worker
594*c0909341SAndroid Build Coastguard Worker        bne             4b // If we don't need to pad, just keep summing.
595*c0909341SAndroid Build Coastguard Worker        b               3b // If we need to pad, check how many pixels we have left.
596*c0909341SAndroid Build Coastguard Worker
597*c0909341SAndroid Build Coastguard Worker9:
        // End of a row pair; two rows were consumed.
598*c0909341SAndroid Build Coastguard Worker        subs            r6,  r6,  #2
599*c0909341SAndroid Build Coastguard Worker        ble             0f
600*c0909341SAndroid Build Coastguard Worker        // Jump to the next row and loop horizontally
        // r9 is in bytes of 16 bit sum elements; the 32 bit sumsq
        // pointers advance by twice that.
601*c0909341SAndroid Build Coastguard Worker        add             r0,  r0,  r9, lsl #1
602*c0909341SAndroid Build Coastguard Worker        add             r10, r10, r9, lsl #1
603*c0909341SAndroid Build Coastguard Worker        add             r1,  r1,  r9
604*c0909341SAndroid Build Coastguard Worker        add             r11, r11, r9
605*c0909341SAndroid Build Coastguard Worker        add             r3,  r3,  r4
606*c0909341SAndroid Build Coastguard Worker        add             r12, r12, r4
        // Reload the width counter for the next row pair.
607*c0909341SAndroid Build Coastguard Worker        mov             r5,  r8
608*c0909341SAndroid Build Coastguard Worker        b               1b
609*c0909341SAndroid Build Coastguard Worker0:
610*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q7}
611*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
612*c0909341SAndroid Build Coastguard Workerendfunc
613*c0909341SAndroid Build Coastguard Worker
614*c0909341SAndroid Build Coastguard Worker// void dav1d_sgr_box5_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
615*c0909341SAndroid Build Coastguard Worker//                                  const pixel (*left)[4],
616*c0909341SAndroid Build Coastguard Worker//                                  const pixel *src, const ptrdiff_t stride,
617*c0909341SAndroid Build Coastguard Worker//                                  const int w, const int h,
618*c0909341SAndroid Build Coastguard Worker//                                  const enum LrEdgeFlags edges);
//
// Horizontal pass of the 5-pixel box sum on 16-bit pixels. For each output
// position it stores the sum of 5 horizontally adjacent pixels to sum
// (16 bit) and the sum of their squares to sumsq (32 bit). Two rows are
// handled per iteration of the vertical loop, and 8 output positions per
// row per iteration of the horizontal loop.
//
// Arguments: r0 = sumsq, r1 = sum, r2 = left (may be NULL), r3 = src;
// stride, w, h and edges are read from the stack.
//
// Register roles after the setup code:
//   r0 / r10   sumsq write pointer for the first / second row of the pair
//   r1 / r11   sum write pointer for the first / second row of the pair
//   r3 / r12   src read pointer for the first / second row of the pair
//   r4         bytes to add to r3/r12 at the end of a row pair
//   r5         remaining width counter (starts at w + 2)
//   r6         remaining row counter (h), decremented by 2 per row pair
//   r7         edges (bit 0 tested as LR_HAVE_LEFT, bit 1 as LR_HAVE_RIGHT)
//   r8         saved w + 2, used to reload r5 for each row pair
//   r9         bytes to add to r1/r11 at the end of a row pair (r0/r10 use
//              twice that, as their elements are twice as wide)
//   lr         scratch
//
// The stores to sumsq/sum use :128 alignment hints, so those pointers have
// to be 16-byte aligned.
619*c0909341SAndroid Build Coastguard Workerfunction sgr_box5_h_16bpc_neon, export=1
620*c0909341SAndroid Build Coastguard Worker        push            {r4-r11,lr}
621*c0909341SAndroid Build Coastguard Worker        vpush           {q4-q7}
        // 9 core registers (36 bytes) plus q4-q7 (64 bytes) were pushed, so
        // the stack arguments now start at sp + 100:
        // r4 = stride, r5 = w, r6 = h, r7 = edges.
622*c0909341SAndroid Build Coastguard Worker        ldrd            r4,  r5,  [sp, #100]
623*c0909341SAndroid Build Coastguard Worker        ldrd            r6,  r7,  [sp, #108]
624*c0909341SAndroid Build Coastguard Worker        add             r5,  r5,  #2 // w += 2
625*c0909341SAndroid Build Coastguard Worker
626*c0909341SAndroid Build Coastguard Worker        // Set up pointers for reading/writing alternate rows
627*c0909341SAndroid Build Coastguard Worker        add             r10, r0,  #(4*SUM_STRIDE)   // sumsq
628*c0909341SAndroid Build Coastguard Worker        add             r11, r1,  #(2*SUM_STRIDE)   // sum
629*c0909341SAndroid Build Coastguard Worker        add             r12, r3,  r4                // src
        // Each pointer now advances two rows at a time, so double the
        // src stride.
630*c0909341SAndroid Build Coastguard Worker        lsl             r4,  r4,  #1
631*c0909341SAndroid Build Coastguard Worker        mov             r9,       #(2*2*SUM_STRIDE) // double sum stride
632*c0909341SAndroid Build Coastguard Worker
633*c0909341SAndroid Build Coastguard Worker        // Subtract the aligned width from the output stride.
        // lr = (w + 2) rounded up to a multiple of 8; the horizontal loop
        // writes that many 16 bit sums per row. The input side reads
        // 8 pixels more than that per row (16 up front, then 8 per further
        // iteration), which is subtracted from the doubled src stride.
634*c0909341SAndroid Build Coastguard Worker        add             lr,  r5,  #7
635*c0909341SAndroid Build Coastguard Worker        bic             lr,  lr,  #7
636*c0909341SAndroid Build Coastguard Worker        sub             r9,  r9,  lr, lsl #1
637*c0909341SAndroid Build Coastguard Worker        add             lr,  lr,  #8
638*c0909341SAndroid Build Coastguard Worker        sub             r4,  r4,  lr, lsl #1
639*c0909341SAndroid Build Coastguard Worker
640*c0909341SAndroid Build Coastguard Worker        // Store the width for the vertical loop
641*c0909341SAndroid Build Coastguard Worker        mov             r8,  r5
642*c0909341SAndroid Build Coastguard Worker
643*c0909341SAndroid Build Coastguard Worker        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
644*c0909341SAndroid Build Coastguard Worker        tst             r7,  #1 // LR_HAVE_LEFT
645*c0909341SAndroid Build Coastguard Worker        beq             2f
646*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT
647*c0909341SAndroid Build Coastguard Worker        cmp             r2,  #0
648*c0909341SAndroid Build Coastguard Worker        bne             0f
649*c0909341SAndroid Build Coastguard Worker        // left == NULL
        // The 3 left edge pixels (6 bytes) are read straight from the
        // src rows.
650*c0909341SAndroid Build Coastguard Worker        sub             r3,  r3,  #6
651*c0909341SAndroid Build Coastguard Worker        sub             r12, r12, #6
652*c0909341SAndroid Build Coastguard Worker        b               1f
653*c0909341SAndroid Build Coastguard Worker0:      // LR_HAVE_LEFT, left != NULL
654*c0909341SAndroid Build Coastguard Worker2:      // !LR_HAVE_LEFT, increase the stride.
655*c0909341SAndroid Build Coastguard Worker        // For this case we don't read the left 3 pixels from the src pointer,
656*c0909341SAndroid Build Coastguard Worker        // but shift it as if we had done that.
        // In both of these cases the vertical loop rewinds r3/r12 by
        // 6 bytes after the first load, so the row advance has to be
        // 6 bytes larger.
657*c0909341SAndroid Build Coastguard Worker        add             r4,  r4,  #6
658*c0909341SAndroid Build Coastguard Worker
659*c0909341SAndroid Build Coastguard Worker1:      // Loop vertically
        // Load the first 16 pixels of both rows: q0-q1 hold the first
        // row, q4-q5 the second.
660*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0, q1}, [r3]!
661*c0909341SAndroid Build Coastguard Worker        vld1.16         {q4, q5}, [r12]!
662*c0909341SAndroid Build Coastguard Worker
663*c0909341SAndroid Build Coastguard Worker        tst             r7,  #1 // LR_HAVE_LEFT
664*c0909341SAndroid Build Coastguard Worker        beq             0f
665*c0909341SAndroid Build Coastguard Worker        cmp             r2,  #0
666*c0909341SAndroid Build Coastguard Worker        beq             2f
667*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT, left != NULL
        // Load one left[] entry (4 pixels) per row into the upper half
        // of q2 (d5) and of q6 (d13); only the last 3 pixels of each are
        // used by the vext below.
668*c0909341SAndroid Build Coastguard Worker        vld1.16         {d5}, [r2]!
669*c0909341SAndroid Build Coastguard Worker        // Move r3/r12 back to account for the last 3 pixels we loaded earlier,
670*c0909341SAndroid Build Coastguard Worker        // which we'll shift out.
671*c0909341SAndroid Build Coastguard Worker        sub             r3,  r3,  #6
672*c0909341SAndroid Build Coastguard Worker        sub             r12, r12, #6
673*c0909341SAndroid Build Coastguard Worker        vld1.16         {d13}, [r2]!
        // Shift the row data up by 3 pixels and insert the last 3 pixels
        // of q2/q6 at the front. q1/q5 are updated first since they need
        // the unshifted q0/q4.
674*c0909341SAndroid Build Coastguard Worker        vext.8          q1,  q0,  q1,  #10
675*c0909341SAndroid Build Coastguard Worker        vext.8          q0,  q2,  q0,  #10
676*c0909341SAndroid Build Coastguard Worker        vext.8          q5,  q4,  q5,  #10
677*c0909341SAndroid Build Coastguard Worker        vext.8          q4,  q6,  q4,  #10
678*c0909341SAndroid Build Coastguard Worker        b               2f
679*c0909341SAndroid Build Coastguard Worker0:
680*c0909341SAndroid Build Coastguard Worker        // !LR_HAVE_LEFT, fill q2/q6 with the leftmost pixel of each row
681*c0909341SAndroid Build Coastguard Worker        // and shift q0/q4 to have 3x the first pixel at the front.
682*c0909341SAndroid Build Coastguard Worker        vdup.16         q2,  d0[0]
683*c0909341SAndroid Build Coastguard Worker        vdup.16         q6,  d8[0]
684*c0909341SAndroid Build Coastguard Worker        // Move r3/r12 back to account for the last 3 pixels we loaded before,
685*c0909341SAndroid Build Coastguard Worker        // which we shifted out.
686*c0909341SAndroid Build Coastguard Worker        sub             r3,  r3,  #6
687*c0909341SAndroid Build Coastguard Worker        sub             r12, r12, #6
688*c0909341SAndroid Build Coastguard Worker        vext.8          q1,  q0,  q1,  #10
689*c0909341SAndroid Build Coastguard Worker        vext.8          q0,  q2,  q0,  #10
690*c0909341SAndroid Build Coastguard Worker        vext.8          q5,  q4,  q5,  #10
691*c0909341SAndroid Build Coastguard Worker        vext.8          q4,  q6,  q4,  #10
692*c0909341SAndroid Build Coastguard Worker
693*c0909341SAndroid Build Coastguard Worker2:
694*c0909341SAndroid Build Coastguard Worker        tst             r7,  #2 // LR_HAVE_RIGHT
695*c0909341SAndroid Build Coastguard Worker        bne             4f
696*c0909341SAndroid Build Coastguard Worker        // If we'll need to pad the right edge, load that pixel to pad with
697*c0909341SAndroid Build Coastguard Worker        // here since we can find it pretty easily from here.
        // The pixel is read at an offset of (r5 - 16) pixels from the
        // current r3/r12. r11 and lr serve as temporaries for the two
        // loaded values.
698*c0909341SAndroid Build Coastguard Worker        sub             lr,  r5,  #(2 + 16 - 3 + 1)
699*c0909341SAndroid Build Coastguard Worker        lsl             lr,  lr,  #1
700*c0909341SAndroid Build Coastguard Worker        ldrh            r11, [r3,  lr]
701*c0909341SAndroid Build Coastguard Worker        ldrh            lr,  [r12, lr]
702*c0909341SAndroid Build Coastguard Worker        // Fill q14/q15 with the right padding pixel
703*c0909341SAndroid Build Coastguard Worker        vdup.16         q14, r11
704*c0909341SAndroid Build Coastguard Worker        vdup.16         q15, lr
705*c0909341SAndroid Build Coastguard Worker        // Restore r11 after using it for a temporary value
        // (r1 has not been advanced within this row pair yet, so this
        // is the same value r11 had before.)
706*c0909341SAndroid Build Coastguard Worker        add             r11, r1,  #(2*SUM_STRIDE)
707*c0909341SAndroid Build Coastguard Worker3:      // !LR_HAVE_RIGHT
708*c0909341SAndroid Build Coastguard Worker
709*c0909341SAndroid Build Coastguard Worker        // Check whether we need to pad the right edge
710*c0909341SAndroid Build Coastguard Worker        cmp             r5,  #11
711*c0909341SAndroid Build Coastguard Worker        bge             4f   // If w >= 11, all used input pixels are valid
712*c0909341SAndroid Build Coastguard Worker
713*c0909341SAndroid Build Coastguard Worker        // 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10,
714*c0909341SAndroid Build Coastguard Worker        // this ends up called again; it's not strictly needed in those
715*c0909341SAndroid Build Coastguard Worker        // cases (we pad enough here), but keeping the code as simple as possible.
716*c0909341SAndroid Build Coastguard Worker
717*c0909341SAndroid Build Coastguard Worker        // Insert padding in q0/1.h[w+1] onwards; fuse the +1 into the
718*c0909341SAndroid Build Coastguard Worker        // buffer pointer.
        // A 32 byte mask is loaded from (2*r5 + 2) bytes before
        // right_ext_mask; vbit copies the padding pixel into the lanes
        // where the mask bits are set.
719*c0909341SAndroid Build Coastguard Worker        movrel_local    lr,  right_ext_mask, -2
720*c0909341SAndroid Build Coastguard Worker        sub             lr,  lr,  r5,  lsl #1
721*c0909341SAndroid Build Coastguard Worker        vld1.8          {q12, q13}, [lr]
722*c0909341SAndroid Build Coastguard Worker
723*c0909341SAndroid Build Coastguard Worker        vbit            q0,  q14, q12
724*c0909341SAndroid Build Coastguard Worker        vbit            q1,  q14, q13
725*c0909341SAndroid Build Coastguard Worker        vbit            q4,  q15, q12
726*c0909341SAndroid Build Coastguard Worker        vbit            q5,  q15, q13
727*c0909341SAndroid Build Coastguard Worker
728*c0909341SAndroid Build Coastguard Worker4:      // Loop horizontally
        // First the taps at offsets 0, 1 and 2: q8/q9 (first row) and
        // q10/q11 (second row) hold the input shifted by 1 and 2 pixels;
        // q2/q3 accumulate the sums.
729*c0909341SAndroid Build Coastguard Worker        vext.8          q8,  q0,  q1,  #2
730*c0909341SAndroid Build Coastguard Worker        vext.8          q10, q4,  q5,  #2
731*c0909341SAndroid Build Coastguard Worker        vext.8          q9,  q0,  q1,  #4
732*c0909341SAndroid Build Coastguard Worker        vext.8          q11, q4,  q5,  #4
733*c0909341SAndroid Build Coastguard Worker        vadd.i16        q2,  q0,  q8
734*c0909341SAndroid Build Coastguard Worker        vadd.i16        q3,  q4,  q10
735*c0909341SAndroid Build Coastguard Worker        vadd.i16        q2,  q2,  q9
736*c0909341SAndroid Build Coastguard Worker        vadd.i16        q3,  q3,  q11
737*c0909341SAndroid Build Coastguard Worker
        // Sums of squares, widened to 32 bit: q6/q7 hold the low/high
        // 4 outputs of the first row, q12/q13 those of the second row.
738*c0909341SAndroid Build Coastguard Worker        vmull.u16       q6,  d0,  d0
739*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q6,  d16, d16
740*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q6,  d18, d18
741*c0909341SAndroid Build Coastguard Worker        vmull.u16       q12, d8,  d8
742*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q12, d20, d20
743*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q12, d22, d22
744*c0909341SAndroid Build Coastguard Worker        vmull.u16       q7,  d1,  d1
745*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q7,  d17, d17
746*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q7,  d19, d19
747*c0909341SAndroid Build Coastguard Worker        vmull.u16       q13, d9,  d9
748*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q13, d21, d21
749*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q13, d23, d23
750*c0909341SAndroid Build Coastguard Worker
        // Then the taps at offsets 3 and 4, reusing q8-q11 for the input
        // shifted by 3 and 4 pixels.
751*c0909341SAndroid Build Coastguard Worker        vext.8          q8,  q0,  q1,  #6
752*c0909341SAndroid Build Coastguard Worker        vext.8          q10, q4,  q5,  #6
753*c0909341SAndroid Build Coastguard Worker        vext.8          q9,  q0,  q1,  #8
754*c0909341SAndroid Build Coastguard Worker        vext.8          q11, q4,  q5,  #8
755*c0909341SAndroid Build Coastguard Worker        vadd.i16        q2,  q2,  q8
756*c0909341SAndroid Build Coastguard Worker        vadd.i16        q3,  q3,  q10
757*c0909341SAndroid Build Coastguard Worker        vadd.i16        q2,  q2,  q9
758*c0909341SAndroid Build Coastguard Worker        vadd.i16        q3,  q3,  q11
759*c0909341SAndroid Build Coastguard Worker
        // With a shift of 8 bytes, the low half of q9 (d18) equals d1 and
        // the low half of q11 (d22) equals d9, so d1/d9 are squared
        // directly for the low outputs.
760*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q6,  d16, d16
761*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q6,  d1,  d1
762*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q12, d20, d20
763*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q12, d9,  d9
764*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q7,  d17, d17
765*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q7,  d19, d19
766*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q13, d21, d21
767*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q13, d23, d23
768*c0909341SAndroid Build Coastguard Worker
        // The flags set by this subs are consumed by the ble after the
        // stores; the NEON stores in between leave the flags untouched.
769*c0909341SAndroid Build Coastguard Worker        subs            r5,  r5,  #8
770*c0909341SAndroid Build Coastguard Worker        vst1.16         {q2},       [r1,  :128]!
771*c0909341SAndroid Build Coastguard Worker        vst1.16         {q3},       [r11, :128]!
772*c0909341SAndroid Build Coastguard Worker        vst1.32         {q6,  q7},  [r0,  :128]!
773*c0909341SAndroid Build Coastguard Worker        vst1.32         {q12, q13}, [r10, :128]!
774*c0909341SAndroid Build Coastguard Worker
775*c0909341SAndroid Build Coastguard Worker        ble             9f
        // More output left in this row pair: slide the window by 8 pixels
        // and load the next 8 pixels of each row. The tst result is used
        // by the bne below; the NEON moves/loads in between do not
        // modify the flags.
776*c0909341SAndroid Build Coastguard Worker        tst             r7,  #2 // LR_HAVE_RIGHT
777*c0909341SAndroid Build Coastguard Worker        vmov            q0,  q1
778*c0909341SAndroid Build Coastguard Worker        vmov            q4,  q5
779*c0909341SAndroid Build Coastguard Worker        vld1.16         {q1}, [r3]!
780*c0909341SAndroid Build Coastguard Worker        vld1.16         {q5}, [r12]!
781*c0909341SAndroid Build Coastguard Worker        bne             4b // If we don't need to pad, just keep summing.
782*c0909341SAndroid Build Coastguard Worker        b               3b // If we need to pad, check how many pixels we have left.
783*c0909341SAndroid Build Coastguard Worker
784*c0909341SAndroid Build Coastguard Worker9:
        // End of a row pair; two rows were consumed.
785*c0909341SAndroid Build Coastguard Worker        subs            r6,  r6,  #2
786*c0909341SAndroid Build Coastguard Worker        ble             0f
787*c0909341SAndroid Build Coastguard Worker        // Jump to the next row and loop horizontally
        // r9 is in bytes of 16 bit sum elements; the 32 bit sumsq
        // pointers advance by twice that.
788*c0909341SAndroid Build Coastguard Worker        add             r0,  r0,  r9, lsl #1
789*c0909341SAndroid Build Coastguard Worker        add             r10, r10, r9, lsl #1
790*c0909341SAndroid Build Coastguard Worker        add             r1,  r1,  r9
791*c0909341SAndroid Build Coastguard Worker        add             r11, r11, r9
792*c0909341SAndroid Build Coastguard Worker        add             r3,  r3,  r4
793*c0909341SAndroid Build Coastguard Worker        add             r12, r12, r4
        // Reload the width counter for the next row pair.
794*c0909341SAndroid Build Coastguard Worker        mov             r5,  r8
795*c0909341SAndroid Build Coastguard Worker        b               1b
796*c0909341SAndroid Build Coastguard Worker0:
797*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q7}
798*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
799*c0909341SAndroid Build Coastguard Workerendfunc
800*c0909341SAndroid Build Coastguard Worker
// Expand the sgr_funcs macro with argument 16.
// NOTE(review): the macro is not defined in this part of the file; presumably
// it comes from looprestoration_tmpl.S included above and 16 is the bit
// depth — confirm.
801*c0909341SAndroid Build Coastguard Workersgr_funcs 16
802