xref: /aosp_15_r20/external/libdav1d/src/arm/64/looprestoration.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker/*
2*c0909341SAndroid Build Coastguard Worker * Copyright © 2018, VideoLAN and dav1d authors
3*c0909341SAndroid Build Coastguard Worker * Copyright © 2018, Martin Storsjo
4*c0909341SAndroid Build Coastguard Worker * All rights reserved.
5*c0909341SAndroid Build Coastguard Worker *
6*c0909341SAndroid Build Coastguard Worker * Redistribution and use in source and binary forms, with or without
7*c0909341SAndroid Build Coastguard Worker * modification, are permitted provided that the following conditions are met:
8*c0909341SAndroid Build Coastguard Worker *
9*c0909341SAndroid Build Coastguard Worker * 1. Redistributions of source code must retain the above copyright notice, this
10*c0909341SAndroid Build Coastguard Worker *    list of conditions and the following disclaimer.
11*c0909341SAndroid Build Coastguard Worker *
12*c0909341SAndroid Build Coastguard Worker * 2. Redistributions in binary form must reproduce the above copyright notice,
13*c0909341SAndroid Build Coastguard Worker *    this list of conditions and the following disclaimer in the documentation
14*c0909341SAndroid Build Coastguard Worker *    and/or other materials provided with the distribution.
15*c0909341SAndroid Build Coastguard Worker *
16*c0909341SAndroid Build Coastguard Worker * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17*c0909341SAndroid Build Coastguard Worker * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18*c0909341SAndroid Build Coastguard Worker * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19*c0909341SAndroid Build Coastguard Worker * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20*c0909341SAndroid Build Coastguard Worker * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21*c0909341SAndroid Build Coastguard Worker * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22*c0909341SAndroid Build Coastguard Worker * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23*c0909341SAndroid Build Coastguard Worker * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24*c0909341SAndroid Build Coastguard Worker * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25*c0909341SAndroid Build Coastguard Worker * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*c0909341SAndroid Build Coastguard Worker */
27*c0909341SAndroid Build Coastguard Worker
28*c0909341SAndroid Build Coastguard Worker#include "src/arm/asm.S"
29*c0909341SAndroid Build Coastguard Worker#include "util.S"
30*c0909341SAndroid Build Coastguard Worker
// Sliding byte-mask table used for right-edge padding.
// Layout: 48 bytes of 0x00 (right_ext_mask_buf) immediately followed by
// 48 bytes of 0xff (right_ext_mask).
// wiener_filter7_h_8bpc_neon loads 48 bytes starting at
// right_ext_mask - 6 - 2*w and uses them as the selector for `bit`:
// 16-bit lanes 0..w+2 see 0x00 and keep their pixel, lanes w+3 and up see
// 0xff and are overwritten with the padding pixel.
31*c0909341SAndroid Build Coastguard Workerconst right_ext_mask_buf
32*c0909341SAndroid Build Coastguard Worker        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
33*c0909341SAndroid Build Coastguard Worker        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
34*c0909341SAndroid Build Coastguard Worker        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
35*c0909341SAndroid Build Coastguard Worker        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
36*c0909341SAndroid Build Coastguard Worker        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
37*c0909341SAndroid Build Coastguard Worker        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
// Boundary label: everything before it is "keep", everything after "replace".
38*c0909341SAndroid Build Coastguard Workerright_ext_mask:
39*c0909341SAndroid Build Coastguard Worker        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
40*c0909341SAndroid Build Coastguard Worker        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
41*c0909341SAndroid Build Coastguard Worker        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
42*c0909341SAndroid Build Coastguard Worker        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
43*c0909341SAndroid Build Coastguard Worker        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
44*c0909341SAndroid Build Coastguard Worker        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
45*c0909341SAndroid Build Coastguard Workerendconst
46*c0909341SAndroid Build Coastguard Worker
47*c0909341SAndroid Build Coastguard Worker// void dav1d_wiener_filter7_8bpc_neon(pixel *p, const ptrdiff_t stride,
48*c0909341SAndroid Build Coastguard Worker//                                     const pixel (*left)[4], const pixel *lpf,
49*c0909341SAndroid Build Coastguard Worker//                                     const int w, int h,
50*c0909341SAndroid Build Coastguard Worker//                                     const int16_t filter[2][8],
51*c0909341SAndroid Build Coastguard Worker//                                     const enum LrEdgeFlags edges);
// Driver for the 7-tap Wiener filter, 8 bpc. It sequences the horizontal (h),
// combined horizontal+vertical (hv) and vertical-only (v) helpers over the
// rows of the block.
//
// Arguments, in the register order of the C prototype above:
//   x0 = p, x1 = stride, x2 = left, x3 = lpf, w4 = w, w5 = h,
//   x6 = filter, w7 = edges
// Edge bits tested in this routine: #4 = LR_HAVE_TOP, #8 = LR_HAVE_BOTTOM.
//
// Long-lived state set up here and read by the helpers:
//   v0  = filter[0], used by the horizontal pass
//   v1  = filter[1], used by the vertical pass
//   v30 = (1 << 14) - (1 << 2) in every 16-bit lane
//   v31 = 8 << 8 in every 16-bit lane
//   x9..x15 = pointers to intermediate rows t6..t0, each 384*2 bytes apart,
//             inside a 384*2*6 byte scratch area below the frame record
//   x16 = saved `left` while the top rows are filtered, afterwards the saved
//         lpf pointer for the bottom rows
// The scratch area is released by restoring sp from x29 in the epilogue.
52*c0909341SAndroid Build Coastguard Workerfunction wiener_filter7_8bpc_neon, export=1
        // NOTE(review): AARCH64_SIGN_LINK_REGISTER / AARCH64_VALIDATE_LINK_REGISTER
        // are macros from outside this chunk; presumably return-address
        // signing hooks - confirm in src/arm/asm.S.
53*c0909341SAndroid Build Coastguard Worker        AARCH64_SIGN_LINK_REGISTER
54*c0909341SAndroid Build Coastguard Worker        stp             x29, x30, [sp, #-16]!
55*c0909341SAndroid Build Coastguard Worker        mov             x29, sp
        // Load both 8-coefficient rows of filter[2][8] in one go.
56*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h},  [x6]
        // The flags set here are only consumed by the b.eq L(no_top_7) below,
        // so nothing in between may change them.
        // NOTE(review): sub_sp is defined elsewhere; presumably it lowers sp
        // by the given size without touching the flags - confirm.
57*c0909341SAndroid Build Coastguard Worker        tst             w7,  #4               // LR_HAVE_TOP
58*c0909341SAndroid Build Coastguard Worker        sub_sp          384*2*6
59*c0909341SAndroid Build Coastguard Worker
        // Constants used by the horizontal pass.
60*c0909341SAndroid Build Coastguard Worker        mov             w17, #(1 << 14) - (1 << 2)
61*c0909341SAndroid Build Coastguard Worker        dup             v30.8h,  w17
62*c0909341SAndroid Build Coastguard Worker        movi            v31.8h,  #8, lsl #8
63*c0909341SAndroid Build Coastguard Worker
64*c0909341SAndroid Build Coastguard Worker        // x9  - t6
65*c0909341SAndroid Build Coastguard Worker        // x10 - t5
66*c0909341SAndroid Build Coastguard Worker        // x11 - t4
67*c0909341SAndroid Build Coastguard Worker        // x12 - t3
68*c0909341SAndroid Build Coastguard Worker        // x13 - t2
69*c0909341SAndroid Build Coastguard Worker        // x14 - t1
70*c0909341SAndroid Build Coastguard Worker        // x15 - t0
71*c0909341SAndroid Build Coastguard Worker        mov             x14, sp               // t1
72*c0909341SAndroid Build Coastguard Worker        b.eq            L(no_top_7)
73*c0909341SAndroid Build Coastguard Worker
        // LR_HAVE_TOP: run the horizontal pass over two rows read from lpf,
        // with left forced to NULL. The first of them backs both t6 and t5,
        // the second becomes t4.
74*c0909341SAndroid Build Coastguard Worker        mov             x16, x2               // backup left
75*c0909341SAndroid Build Coastguard Worker        mov             x2,  #0
76*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter7_h_8bpc_neon
77*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1          // lpf += stride
78*c0909341SAndroid Build Coastguard Worker        mov             x9,  x14              // t6
79*c0909341SAndroid Build Coastguard Worker        mov             x10, x14              // t5
80*c0909341SAndroid Build Coastguard Worker        add             x14, x14, #384*2      // t1 += 384*2
81*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter7_h_8bpc_neon
        // Together with the += stride above this leaves x3 at lpf + 6*stride,
        // the same address the no-top path saves in x16.
82*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1,  lsl #2
83*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1          // lpf += stride*5
84*c0909341SAndroid Build Coastguard Worker        mov             x11, x14              // t4
85*c0909341SAndroid Build Coastguard Worker        add             x14, x14, #384*2      // t1 += 384*2
        // From here on rows are read from p, using the caller's left column.
        // x16 changes role from "saved left" to "saved lpf".
86*c0909341SAndroid Build Coastguard Worker        mov             x2,  x16              // left
87*c0909341SAndroid Build Coastguard Worker        mov             x16, x3               // backup lpf
88*c0909341SAndroid Build Coastguard Worker        mov             x3,  x0               // lpf = p
89*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter7_h_8bpc_neon
90*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1          // h--
91*c0909341SAndroid Build Coastguard Worker        mov             x12, x14              // t3
92*c0909341SAndroid Build Coastguard Worker        mov             x13, x14              // t2
        // h was 1: a single vertical pass finishes the block.
93*c0909341SAndroid Build Coastguard Worker        b.eq            L(v1_7)
94*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1          // src += stride
95*c0909341SAndroid Build Coastguard Worker        add             x14, x14, #384*2      // t1 += 384*2
96*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter7_h_8bpc_neon
97*c0909341SAndroid Build Coastguard Worker        mov             x13, x14              // t2
98*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1          // h--
        // h was 2: two vertical passes remain.
99*c0909341SAndroid Build Coastguard Worker        b.eq            L(v2_7)
100*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1          // src += stride
101*c0909341SAndroid Build Coastguard Worker        add             x14, x14, #384*2      // t1 += 384*2
102*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter7_h_8bpc_neon
103*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1          // h--
        // h was 3: three vertical passes remain.
104*c0909341SAndroid Build Coastguard Worker        b.eq            L(v3_7)
105*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1          // src += stride
106*c0909341SAndroid Build Coastguard Worker
        // Steady state: one hv call per remaining row of p.
        // NOTE(review): x3 is not advanced between hv calls here, so the hv
        // helper is presumably responsible for stepping the source pointer;
        // its body is not fully visible in this chunk - confirm.
107*c0909341SAndroid Build Coastguard WorkerL(main_7):
108*c0909341SAndroid Build Coastguard Worker        add             x15, x14, #384*2      // t0 = t1 + 384*2
109*c0909341SAndroid Build Coastguard WorkerL(main_loop_7):
110*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter7_hv_8bpc_neon
111*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1          // h--
112*c0909341SAndroid Build Coastguard Worker        b.ne            L(main_loop_7)
        // All rows of p consumed. Without LR_HAVE_BOTTOM finish with three
        // vertical-only passes; otherwise feed two more rows from the saved
        // lpf pointer through hv and finish with a single vertical pass.
113*c0909341SAndroid Build Coastguard Worker        tst             w7,  #8 // LR_HAVE_BOTTOM
114*c0909341SAndroid Build Coastguard Worker        b.eq            L(v3_7)
115*c0909341SAndroid Build Coastguard Worker
116*c0909341SAndroid Build Coastguard Worker        mov             x3,  x16              // restore lpf
117*c0909341SAndroid Build Coastguard Worker        mov             x2,  #0               // left = NULL
118*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter7_hv_8bpc_neon
119*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter7_hv_8bpc_neon
        // Common exit: last vertical pass, then drop the scratch rows by
        // restoring sp from the frame pointer.
120*c0909341SAndroid Build Coastguard WorkerL(v1_7):
121*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter7_v_8bpc_neon
122*c0909341SAndroid Build Coastguard Worker
123*c0909341SAndroid Build Coastguard Worker        mov             sp,  x29
124*c0909341SAndroid Build Coastguard Worker        ldp             x29, x30, [sp], #16
125*c0909341SAndroid Build Coastguard Worker        AARCH64_VALIDATE_LINK_REGISTER
126*c0909341SAndroid Build Coastguard Worker        ret
127*c0909341SAndroid Build Coastguard Worker
        // !LR_HAVE_TOP: no rows are read from lpf up front. Only the address
        // lpf + 6*stride is saved for the bottom rows, and the first row of p
        // stands in for t6..t2.
128*c0909341SAndroid Build Coastguard WorkerL(no_top_7):
129*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1,  lsl #2
130*c0909341SAndroid Build Coastguard Worker        add             x16, x3,  x1,  lsl #1 // lpf += stride*6, backup
131*c0909341SAndroid Build Coastguard Worker        mov             x3,  x0               // lpf = p
132*c0909341SAndroid Build Coastguard Worker
133*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter7_h_8bpc_neon
134*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1          // h--
        // The mov instructions leave the flags from subs intact for the b.eq.
135*c0909341SAndroid Build Coastguard Worker        mov             x9,  x14              // t6
136*c0909341SAndroid Build Coastguard Worker        mov             x10, x14              // t5
137*c0909341SAndroid Build Coastguard Worker        mov             x11, x14              // t4
138*c0909341SAndroid Build Coastguard Worker        mov             x12, x14              // t3
139*c0909341SAndroid Build Coastguard Worker        mov             x13, x14              // t2
140*c0909341SAndroid Build Coastguard Worker        b.eq            L(v1_7)
141*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1          // src += stride
142*c0909341SAndroid Build Coastguard Worker        add             x14, x14, #384*2      // t1 += 384*2
143*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter7_h_8bpc_neon
144*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1          // h--
145*c0909341SAndroid Build Coastguard Worker        mov             x13, x14              // t2
146*c0909341SAndroid Build Coastguard Worker        b.eq            L(v2_7)
147*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1          // src += stride
148*c0909341SAndroid Build Coastguard Worker        add             x14, x14, #384*2      // t1 += 384*2
149*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter7_h_8bpc_neon
150*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1          // h--
151*c0909341SAndroid Build Coastguard Worker        b.eq            L(v3_7)
152*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1          // src += stride
        // Only three of the six scratch rows are in use so far. The next two
        // hv outputs go to fresh rows before the regular rotation in
        // L(main_7) takes over.
153*c0909341SAndroid Build Coastguard Worker        add             x15, x14, #384*2      // t0 = t1 + 384*2
154*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter7_hv_8bpc_neon
155*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1          // h--
156*c0909341SAndroid Build Coastguard Worker        b.eq            L(v3_7)
        // Per the header comment of the hv helper, x15 comes back equal to
        // x9, which still aliases rows in use here; move it four rows on.
157*c0909341SAndroid Build Coastguard Worker        add             x15, x15, #384*2*4    // t0 += 384*2*4
158*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter7_hv_8bpc_neon
159*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1          // h--
160*c0909341SAndroid Build Coastguard Worker        b.ne            L(main_7)
        // Flush: v3_7 falls through v2_7 and then jumps to v1_7, so entering
        // at v3_7 / v2_7 / v1_7 runs three / two / one vertical-only passes.
161*c0909341SAndroid Build Coastguard WorkerL(v3_7):
162*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter7_v_8bpc_neon
163*c0909341SAndroid Build Coastguard WorkerL(v2_7):
164*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter7_v_8bpc_neon
165*c0909341SAndroid Build Coastguard Worker        b               L(v1_7)
166*c0909341SAndroid Build Coastguard Workerendfunc
167*c0909341SAndroid Build Coastguard Worker
168*c0909341SAndroid Build Coastguard Worker
// Horizontal 7-tap pass over one row of 8-bit pixels, producing 16-bit
// intermediates. Internal helper (not exported), reached with bl from
// wiener_filter7_8bpc_neon.
//
// In:
//   x2  = left column pointer; only read when LR_HAVE_LEFT is set, and
//         NULL means "read the 3 pixels in front of x3 from memory"
//   x3  = first pixel of the source row
//   w4  = width in pixels
//   w7  = edge flags (#1 = LR_HAVE_LEFT, #2 = LR_HAVE_RIGHT)
//   x14 = destination row (16-bit values)
//   v0  = coefficients, lanes 3..6 are used
//   v30, v31 = per-lane constants set up by wiener_filter7_8bpc_neon
// Out:
//   x3, x4 and x14 are restored from the stack before returning.
//   x2 is advanced by 4 bytes when a non-NULL left column was consumed.
// Written without being restored: x6 and w17 (padding path only), v2-v4,
//   v6, v7, v16-v23, v25-v28 and the condition flags.
// The row is processed in groups of 16 outputs and each group is stored in
// full, so up to 15 extra 16-bit values are written when w is not a multiple
// of 16.
169*c0909341SAndroid Build Coastguard Workerfunction wiener_filter7_h_8bpc_neon
        // 32-byte frame: x3/x4 at [sp], x14 at [sp, #16]; the remaining
        // 8 bytes are not written.
170*c0909341SAndroid Build Coastguard Worker        stp             x3,  x4,  [sp, #-32]!
171*c0909341SAndroid Build Coastguard Worker        str             x14,      [sp, #16]
172*c0909341SAndroid Build Coastguard Worker
173*c0909341SAndroid Build Coastguard Worker        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
174*c0909341SAndroid Build Coastguard Worker        tst             w7,  #1 // LR_HAVE_LEFT
175*c0909341SAndroid Build Coastguard Worker        b.eq            1f
176*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT
177*c0909341SAndroid Build Coastguard Worker        cbnz            x2,  0f
178*c0909341SAndroid Build Coastguard Worker        // left == NULL
        // The 3 left-context pixels sit directly in front of the row.
179*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #3
180*c0909341SAndroid Build Coastguard Worker        ld1             {v3.16b}, [x3], #16
181*c0909341SAndroid Build Coastguard Worker        b               2f
182*c0909341SAndroid Build Coastguard Worker
183*c0909341SAndroid Build Coastguard Worker0:
184*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT, left != NULL
        // Four bytes are read from left[] into the top lane of v2; ext #13
        // then keeps the last three of them in front of the first 13 row
        // pixels.
185*c0909341SAndroid Build Coastguard Worker        ld1             {v3.16b},  [x3], #16
186*c0909341SAndroid Build Coastguard Worker        ld1             {v2.s}[3], [x2], #4
187*c0909341SAndroid Build Coastguard Worker        // Move x3 back to account for the last 3 bytes we loaded earlier,
188*c0909341SAndroid Build Coastguard Worker        // which we'll shift out.
189*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #3
190*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v2.16b,  v3.16b, #13
191*c0909341SAndroid Build Coastguard Worker        b               2f
192*c0909341SAndroid Build Coastguard Worker
193*c0909341SAndroid Build Coastguard Worker1:
194*c0909341SAndroid Build Coastguard Worker        ld1             {v3.16b}, [x3], #16
195*c0909341SAndroid Build Coastguard Worker        // !LR_HAVE_LEFT, fill v2 with the leftmost byte
196*c0909341SAndroid Build Coastguard Worker        // and shift v3 to have 3x the first byte at the front.
197*c0909341SAndroid Build Coastguard Worker        dup             v2.16b,  v3.b[0]
198*c0909341SAndroid Build Coastguard Worker        // Move x3 back to account for the last 3 bytes we loaded before,
199*c0909341SAndroid Build Coastguard Worker        // which we shifted out.
200*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #3
201*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v2.16b,  v3.16b, #13
202*c0909341SAndroid Build Coastguard Worker
        // All three entry paths arrive here with x3 = row start + 13.
        // After widening, v2/v3/v4 hold 24 consecutive 16-bit pixels that
        // begin 3 pixels to the left of the row start.
203*c0909341SAndroid Build Coastguard Worker2:
204*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8b}, [x3], #8
205*c0909341SAndroid Build Coastguard Worker        uxtl            v2.8h,   v3.8b
206*c0909341SAndroid Build Coastguard Worker        uxtl2           v3.8h,   v3.16b
207*c0909341SAndroid Build Coastguard Worker        uxtl            v4.8h,   v4.8b
208*c0909341SAndroid Build Coastguard Worker
209*c0909341SAndroid Build Coastguard Worker        tst             w7,  #2 // LR_HAVE_RIGHT
210*c0909341SAndroid Build Coastguard Worker        b.ne            4f
211*c0909341SAndroid Build Coastguard Worker
212*c0909341SAndroid Build Coastguard Worker3:      // !LR_HAVE_RIGHT
213*c0909341SAndroid Build Coastguard Worker
214*c0909341SAndroid Build Coastguard Worker        // Check whether we need to pad the right edge
215*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #19
216*c0909341SAndroid Build Coastguard Worker        b.ge            4f   // If w >= 19, all used input pixels are valid
217*c0909341SAndroid Build Coastguard Worker
218*c0909341SAndroid Build Coastguard Worker        // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
219*c0909341SAndroid Build Coastguard Worker        // this ends up called again; it's not strictly needed in those
220*c0909341SAndroid Build Coastguard Worker        // cases (we pad enough here), but keeping the code as simple as possible.
221*c0909341SAndroid Build Coastguard Worker
222*c0909341SAndroid Build Coastguard Worker        // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
223*c0909341SAndroid Build Coastguard Worker        // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
224*c0909341SAndroid Build Coastguard Worker        sub             w17, w4,  #22
225*c0909341SAndroid Build Coastguard Worker        // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
226*c0909341SAndroid Build Coastguard Worker        // buffer pointer.
        // x6 = right_ext_mask - 6 - 2*w. The byte load into b28 clears the
        // rest of v28, so the dup below broadcasts the pixel zero-extended
        // to 16 bits.
227*c0909341SAndroid Build Coastguard Worker        movrel          x6,  right_ext_mask, -6
228*c0909341SAndroid Build Coastguard Worker        ldr             b28, [x3,  w17, sxtw]
229*c0909341SAndroid Build Coastguard Worker        sub             x6,  x6,  w4,  uxtw #1
230*c0909341SAndroid Build Coastguard Worker        dup             v28.8h,  v28.h[0]
231*c0909341SAndroid Build Coastguard Worker        ld1             {v25.16b, v26.16b, v27.16b}, [x6]
232*c0909341SAndroid Build Coastguard Worker
        // Where the mask is 0xff take the padding pixel, elsewhere keep the
        // loaded pixel.
233*c0909341SAndroid Build Coastguard Worker        bit             v2.16b,  v28.16b, v25.16b
234*c0909341SAndroid Build Coastguard Worker        bit             v3.16b,  v28.16b, v26.16b
235*c0909341SAndroid Build Coastguard Worker        bit             v4.16b,  v28.16b, v27.16b
236*c0909341SAndroid Build Coastguard Worker
237*c0909341SAndroid Build Coastguard Worker4:      // Loop horizontally
238*c0909341SAndroid Build Coastguard Worker        // Interleaving the mul/mla chains actually hurts performance
239*c0909341SAndroid Build Coastguard Worker        // significantly on Cortex A53, thus keeping mul/mla tightly
240*c0909341SAndroid Build Coastguard Worker        // chained like this.
        // First 8 outputs. The ext results are the window shifted by 1..6
        // pixels (v16..v21; v18 is the centre tap at +3). Pixels at equal
        // distance from the centre are summed first, so one coefficient
        // serves both sides:
        //   v6 = c*h[3] + (p2+p4)*h[4] + (p1+p5)*h[5] + (p0+p6)*h[6]
241*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v2.16b,  v3.16b, #4
242*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v2.16b,  v3.16b, #8
243*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v2.16b,  v3.16b, #2
244*c0909341SAndroid Build Coastguard Worker        ext             v20.16b, v2.16b,  v3.16b, #10
245*c0909341SAndroid Build Coastguard Worker        ext             v21.16b, v2.16b,  v3.16b, #12
246*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v2.16b,  v3.16b, #6
247*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v17.8h
248*c0909341SAndroid Build Coastguard Worker        add             v20.8h,  v20.8h,  v16.8h
249*c0909341SAndroid Build Coastguard Worker        add             v21.8h,  v21.8h,  v2.8h
250*c0909341SAndroid Build Coastguard Worker        shl             v22.8h,  v18.8h,  #7
251*c0909341SAndroid Build Coastguard Worker        mul             v6.8h,   v18.8h,  v0.h[3]
252*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v19.8h,  v0.h[4]
253*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v20.8h,  v0.h[5]
254*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v21.8h,  v0.h[6]
255*c0909341SAndroid Build Coastguard Worker
        // Second 8 outputs, same scheme on the v3:v4 window.
256*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v3.16b,  v4.16b, #4
257*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v3.16b,  v4.16b, #8
258*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v3.16b,  v4.16b, #2
259*c0909341SAndroid Build Coastguard Worker        ext             v20.16b, v3.16b,  v4.16b, #10
260*c0909341SAndroid Build Coastguard Worker        ext             v21.16b, v3.16b,  v4.16b, #12
261*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v3.16b,  v4.16b, #6
262*c0909341SAndroid Build Coastguard Worker
263*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v17.8h
264*c0909341SAndroid Build Coastguard Worker        add             v20.8h,  v20.8h,  v16.8h
265*c0909341SAndroid Build Coastguard Worker        add             v21.8h,  v21.8h,  v3.8h
266*c0909341SAndroid Build Coastguard Worker        shl             v23.8h,  v18.8h,  #7
267*c0909341SAndroid Build Coastguard Worker        mul             v7.8h,   v18.8h,  v0.h[3]
268*c0909341SAndroid Build Coastguard Worker        mla             v7.8h,   v19.8h,  v0.h[4]
269*c0909341SAndroid Build Coastguard Worker        mla             v7.8h,   v20.8h,  v0.h[5]
270*c0909341SAndroid Build Coastguard Worker        mla             v7.8h,   v21.8h,  v0.h[6]
271*c0909341SAndroid Build Coastguard Worker
        // out = ((sum + ((centre << 7) - v30)) >> 3) + v31, where the
        // addition of the two terms saturates (sqadd) and the shift is
        // arithmetic.
272*c0909341SAndroid Build Coastguard Worker        sub             v22.8h,  v22.8h,  v30.8h
273*c0909341SAndroid Build Coastguard Worker        sub             v23.8h,  v23.8h,  v30.8h
274*c0909341SAndroid Build Coastguard Worker        sqadd           v6.8h,   v6.8h,   v22.8h
275*c0909341SAndroid Build Coastguard Worker        sqadd           v7.8h,   v7.8h,   v23.8h
276*c0909341SAndroid Build Coastguard Worker        sshr            v6.8h,   v6.8h,   #3
277*c0909341SAndroid Build Coastguard Worker        sshr            v7.8h,   v7.8h,   #3
278*c0909341SAndroid Build Coastguard Worker        add             v6.8h,   v6.8h,   v31.8h
279*c0909341SAndroid Build Coastguard Worker        add             v7.8h,   v7.8h,   v31.8h
280*c0909341SAndroid Build Coastguard Worker
        // The flags from this subs are consumed by the b.le after the store
        // (st1 does not change them).
281*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #16
282*c0909341SAndroid Build Coastguard Worker
283*c0909341SAndroid Build Coastguard Worker        st1             {v6.8h, v7.8h}, [x14], #32
284*c0909341SAndroid Build Coastguard Worker
285*c0909341SAndroid Build Coastguard Worker        b.le            0f
        // Slide the 24-pixel window on by 16: the old v4 becomes v2 and the
        // next 16 source bytes are widened into v3/v4.
286*c0909341SAndroid Build Coastguard Worker        mov             v2.16b,  v4.16b
287*c0909341SAndroid Build Coastguard Worker        ld1             {v4.16b}, [x3], #16
288*c0909341SAndroid Build Coastguard Worker        tst             w7,  #2 // LR_HAVE_RIGHT
289*c0909341SAndroid Build Coastguard Worker        uxtl            v3.8h,   v4.8b
290*c0909341SAndroid Build Coastguard Worker        uxtl2           v4.8h,   v4.16b
291*c0909341SAndroid Build Coastguard Worker        b.ne            4b // If we don't need to pad, just keep filtering.
292*c0909341SAndroid Build Coastguard Worker        b               3b // If we need to pad, check how many pixels we have left.
293*c0909341SAndroid Build Coastguard Worker
        // Done: restore the caller's x14, x3 and x4 and release the frame.
294*c0909341SAndroid Build Coastguard Worker0:
295*c0909341SAndroid Build Coastguard Worker        ldr             x14,      [sp, #16]
296*c0909341SAndroid Build Coastguard Worker        ldp             x3,  x4,  [sp], #32
297*c0909341SAndroid Build Coastguard Worker        ret
298*c0909341SAndroid Build Coastguard Workerendfunc
299*c0909341SAndroid Build Coastguard Worker
// Vertical-only 7-tap pass: combines the buffered 16-bit rows into one row
// of 8-bit output pixels. Internal helper (not exported), reached with bl
// from wiener_filter7_8bpc_neon.
//
// In:
//   x9..x14 = row pointers t6..t1 (naming from wiener_filter7_8bpc_neon)
//   x0  = destination pixel row
//   x1  = stride, added to x0 on return
//   w4  = width in pixels
//   v1  = coefficients, lanes 3..6 are used
// Out:
//   x0 = x0 + x1; w4 unchanged.
//   Row pointers rotated by one: x9 <- x10, x10 <- x11, x11 <- x12,
//   x12 <- x13, x13 <- x14, x14 unchanged.
// Written without being restored: v2-v5, v16-v29 and the condition flags.
// Each iteration stores a full 16 bytes, so up to 15 bytes beyond w are
// written when w is not a multiple of 16.
300*c0909341SAndroid Build Coastguard Workerfunction wiener_filter7_v_8bpc_neon
301*c0909341SAndroid Build Coastguard Worker        // Backing up/restoring registers shifted, so that x9 gets the value
302*c0909341SAndroid Build Coastguard Worker        // of x10, etc, afterwards.
        // 64-byte frame: [0] x10,x11  [16] x12,x13  [32] x14,x14  [48] x0,x4.
        // x14 is stored twice so that the shifted reload below leaves both
        // x13 and x14 equal to the incoming x14.
303*c0909341SAndroid Build Coastguard Worker        stp             x10, x11, [sp, #-64]!
304*c0909341SAndroid Build Coastguard Worker        stp             x12, x13, [sp, #16]
305*c0909341SAndroid Build Coastguard Worker        stp             x14, x14, [sp, #32]
306*c0909341SAndroid Build Coastguard Worker        stp             x0,  x4,  [sp, #48]
        // Per group of 16 pixels, with rows named after their pointers:
        //   acc = t3*h[3] + (t2+t4)*h[4] + (t1+t5)*h[5] + (t1+t6)*h[6]
        // The row at x14 (t1) is added in twice (v26/v27 feed both the h[5]
        // and the h[6] pair); no t0 row is read in this routine.
307*c0909341SAndroid Build Coastguard Worker1:
308*c0909341SAndroid Build Coastguard Worker        ld1             {v20.8h, v21.8h}, [x11], #32
309*c0909341SAndroid Build Coastguard Worker        ld1             {v24.8h, v25.8h}, [x13], #32
310*c0909341SAndroid Build Coastguard Worker
311*c0909341SAndroid Build Coastguard Worker        ld1             {v18.8h, v19.8h}, [x10], #32
312*c0909341SAndroid Build Coastguard Worker        add             v24.8h,  v24.8h,  v20.8h
313*c0909341SAndroid Build Coastguard Worker        ld1             {v26.8h, v27.8h}, [x14], #32
314*c0909341SAndroid Build Coastguard Worker
315*c0909341SAndroid Build Coastguard Worker        ld1             {v16.8h, v17.8h}, [x9],  #32
316*c0909341SAndroid Build Coastguard Worker        add             v28.8h,  v26.8h,  v18.8h
317*c0909341SAndroid Build Coastguard Worker        ld1             {v22.8h, v23.8h}, [x12], #32
318*c0909341SAndroid Build Coastguard Worker
319*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v26.8h,  v16.8h
320*c0909341SAndroid Build Coastguard Worker        add             v25.8h,  v25.8h,  v21.8h
321*c0909341SAndroid Build Coastguard Worker
        // 32-bit accumulation, four lanes at a time: v2/v3 cover pixels
        // 0-7, v4/v5 cover pixels 8-15.
322*c0909341SAndroid Build Coastguard Worker        smull           v2.4s,   v22.4h,  v1.h[3]
323*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v24.4h,  v1.h[4]
324*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v28.4h,  v1.h[5]
325*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v16.4h,  v1.h[6]
326*c0909341SAndroid Build Coastguard Worker        add             v29.8h,  v27.8h,  v19.8h
327*c0909341SAndroid Build Coastguard Worker        smull2          v3.4s,   v22.8h,  v1.h[3]
328*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v24.8h,  v1.h[4]
329*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v28.8h,  v1.h[5]
330*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v16.8h,  v1.h[6]
331*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v27.8h,  v17.8h
332*c0909341SAndroid Build Coastguard Worker        smull           v4.4s,   v23.4h,  v1.h[3]
333*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v25.4h,  v1.h[4]
334*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v29.4h,  v1.h[5]
335*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v17.4h,  v1.h[6]
336*c0909341SAndroid Build Coastguard Worker        smull2          v5.4s,   v23.8h,  v1.h[3]
337*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v25.8h,  v1.h[4]
338*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v29.8h,  v1.h[5]
339*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v17.8h,  v1.h[6]
        // Rounding shift right by 11 with unsigned saturation to 16 bits,
        // then a saturating narrow to unsigned 8-bit pixels.
340*c0909341SAndroid Build Coastguard Worker        sqrshrun        v2.4h,   v2.4s,   #11
341*c0909341SAndroid Build Coastguard Worker        sqrshrun2       v2.8h,   v3.4s,   #11
342*c0909341SAndroid Build Coastguard Worker        sqrshrun        v3.4h,   v4.4s,   #11
343*c0909341SAndroid Build Coastguard Worker        sqrshrun2       v3.8h,   v5.4s,   #11
344*c0909341SAndroid Build Coastguard Worker        sqxtun          v2.8b,   v2.8h
345*c0909341SAndroid Build Coastguard Worker        sqxtun2         v2.16b,  v3.8h
        // The flags from subs are consumed by the b.gt after the store.
346*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #16
347*c0909341SAndroid Build Coastguard Worker        st1             {v2.16b}, [x0], #16
348*c0909341SAndroid Build Coastguard Worker        b.gt            1b
349*c0909341SAndroid Build Coastguard Worker
        // Reload one slot lower than stored: this discards the pointers
        // advanced by the loop and performs the row rotation in one step.
350*c0909341SAndroid Build Coastguard Worker        ldp             x0,  x4,  [sp, #48]
351*c0909341SAndroid Build Coastguard Worker        ldp             x13, x14, [sp, #32]
352*c0909341SAndroid Build Coastguard Worker        ldp             x11, x12, [sp, #16]
353*c0909341SAndroid Build Coastguard Worker        ldp             x9,  x10, [sp], #64
354*c0909341SAndroid Build Coastguard Worker
        // Step the destination to the next output row.
355*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1
356*c0909341SAndroid Build Coastguard Worker        ret
357*c0909341SAndroid Build Coastguard Workerendfunc
358*c0909341SAndroid Build Coastguard Worker
359*c0909341SAndroid Build Coastguard Workerfunction wiener_filter7_hv_8bpc_neon
        // Filters one row of pixels horizontally with 7 taps and, in the same
        // pass, runs the vertical 7-tap filter over six previously stored rows
        // of 16-bit intermediates plus the row just computed, writing 8-bit
        // output pixels.
        //
        // Registers as used by the code below:
        //   x0       - output pixel pointer; 16 bytes are stored per iteration
        //   x1       - stride, added to both x0 and x3 before returning
        //   x2       - left edge pixels or NULL; only read if LR_HAVE_LEFT
        //   x3       - source pixel pointer
        //   w4       - width; 16 pixels are handled per loop iteration
        //   w7       - edge flags (bit 0 LR_HAVE_LEFT, bit 1 LR_HAVE_RIGHT)
        //   x9-x14   - six rows of 16-bit intermediates read by the vertical pass
        //   x15      - row that receives the new 16-bit horizontal output
        //   v0, v1   - coefficients; lanes h[3]-h[6] of v0 are used horizontally,
        //              lanes h[3]-h[6] of v1 vertically
        //   v30, v31 - constants that are only read here, so they must be set
        //              up by the caller (NOTE(review): the 7-tap caller is
        //              outside this chunk)
        //
        // On return x0 and x3 have advanced by one stride, w4 is restored and
        // x9-x15 are rotated as described below. x2 has advanced by 4 if the
        // left != NULL path was taken. v2-v4, v6, v7, v16-v29 and the flags are
        // clobbered, as are x6 and x17 when the right edge is padded.
        //
360*c0909341SAndroid Build Coastguard Worker        // Backing up/restoring registers shifted, so that x9 gets the value
361*c0909341SAndroid Build Coastguard Worker        // of x10, etc, and x15==x9, afterwards.
362*c0909341SAndroid Build Coastguard Worker        stp             x10, x11, [sp, #-80]!
363*c0909341SAndroid Build Coastguard Worker        stp             x12, x13, [sp, #16]
364*c0909341SAndroid Build Coastguard Worker        stp             x14, x15, [sp, #32]
365*c0909341SAndroid Build Coastguard Worker        stp             x10, x0,  [sp, #48] // x10 is stored twice; this copy is reloaded into x15
366*c0909341SAndroid Build Coastguard Worker        stp             x3,  x4,  [sp, #64]
367*c0909341SAndroid Build Coastguard Worker
368*c0909341SAndroid Build Coastguard Worker        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
369*c0909341SAndroid Build Coastguard Worker        tst             w7,  #1 // LR_HAVE_LEFT
370*c0909341SAndroid Build Coastguard Worker        b.eq            1f
371*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT
372*c0909341SAndroid Build Coastguard Worker        cbnz            x2,  0f
373*c0909341SAndroid Build Coastguard Worker        // left == NULL
        // The 3 pixels to the left are read straight from memory before x3.
374*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #3
375*c0909341SAndroid Build Coastguard Worker        ld1             {v3.16b}, [x3], #16
376*c0909341SAndroid Build Coastguard Worker        b               2f
377*c0909341SAndroid Build Coastguard Worker
378*c0909341SAndroid Build Coastguard Worker0:
379*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT, left != NULL
        // Four left pixels go into the top 32-bit lane of v2; the ext below
        // keeps only the last 3 of them in front of the first 13 row pixels.
380*c0909341SAndroid Build Coastguard Worker        ld1             {v3.16b},  [x3], #16
381*c0909341SAndroid Build Coastguard Worker        ld1             {v2.s}[3], [x2], #4
382*c0909341SAndroid Build Coastguard Worker        // Move x3 back to account for the last 3 bytes we loaded earlier,
383*c0909341SAndroid Build Coastguard Worker        // which we'll shift out.
384*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #3
385*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v2.16b,  v3.16b, #13
386*c0909341SAndroid Build Coastguard Worker        b               2f
387*c0909341SAndroid Build Coastguard Worker1:
388*c0909341SAndroid Build Coastguard Worker        ld1             {v3.16b}, [x3], #16
389*c0909341SAndroid Build Coastguard Worker        // !LR_HAVE_LEFT, fill v2 with the leftmost byte
390*c0909341SAndroid Build Coastguard Worker        // and shift v3 to have 3x the first byte at the front.
391*c0909341SAndroid Build Coastguard Worker        dup             v2.16b,  v3.b[0]
392*c0909341SAndroid Build Coastguard Worker        // Move x3 back to account for the last 3 bytes we loaded before,
393*c0909341SAndroid Build Coastguard Worker        // which we shifted out.
394*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #3
395*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v2.16b,  v3.16b, #13
396*c0909341SAndroid Build Coastguard Worker
397*c0909341SAndroid Build Coastguard Worker2:
        // Load 8 more pixels and widen everything to 16 bit, so that v2, v3
        // and v4 together hold 24 consecutive pixels as halfwords.
398*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8b}, [x3], #8
399*c0909341SAndroid Build Coastguard Worker        uxtl            v2.8h,   v3.8b
400*c0909341SAndroid Build Coastguard Worker        uxtl2           v3.8h,   v3.16b
401*c0909341SAndroid Build Coastguard Worker        uxtl            v4.8h,   v4.8b
402*c0909341SAndroid Build Coastguard Worker
403*c0909341SAndroid Build Coastguard Worker        tst             w7,  #2 // LR_HAVE_RIGHT
404*c0909341SAndroid Build Coastguard Worker        b.ne            4f
405*c0909341SAndroid Build Coastguard Worker
406*c0909341SAndroid Build Coastguard Worker3:      // !LR_HAVE_RIGHT
407*c0909341SAndroid Build Coastguard Worker
408*c0909341SAndroid Build Coastguard Worker        // Check whether we need to pad the right edge
409*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #19
410*c0909341SAndroid Build Coastguard Worker        b.ge            4f   // If w >= 19, all used input pixels are valid
411*c0909341SAndroid Build Coastguard Worker
412*c0909341SAndroid Build Coastguard Worker        // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
413*c0909341SAndroid Build Coastguard Worker        // this ends up called again; it's not strictly needed in those
414*c0909341SAndroid Build Coastguard Worker        // cases (we pad enough here), but keeping the code as simple as possible.
415*c0909341SAndroid Build Coastguard Worker
416*c0909341SAndroid Build Coastguard Worker        // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
417*c0909341SAndroid Build Coastguard Worker        // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
418*c0909341SAndroid Build Coastguard Worker        sub             w17, w4,  #22
419*c0909341SAndroid Build Coastguard Worker        // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
420*c0909341SAndroid Build Coastguard Worker        // buffer pointer.
        // NOTE(review): right_ext_mask is defined outside this chunk; the
        // address arithmetic below assumes it can be indexed backwards by
        // 2*(w+3) bytes.
421*c0909341SAndroid Build Coastguard Worker        movrel          x6,  right_ext_mask, -6
422*c0909341SAndroid Build Coastguard Worker        ldr             b28, [x3,  w17, sxtw]
423*c0909341SAndroid Build Coastguard Worker        sub             x6,  x6,  w4,  uxtw #1
        // The byte load cleared the rest of v28, so h[0] already is the
        // padding pixel zero-extended to 16 bit.
424*c0909341SAndroid Build Coastguard Worker        dup             v28.8h,  v28.h[0]
425*c0909341SAndroid Build Coastguard Worker        ld1             {v25.16b, v26.16b, v27.16b}, [x6]
426*c0909341SAndroid Build Coastguard Worker
        // Copy the padding pixel into every lane whose mask bits are set.
427*c0909341SAndroid Build Coastguard Worker        bit             v2.16b,  v28.16b, v25.16b
428*c0909341SAndroid Build Coastguard Worker        bit             v3.16b,  v28.16b, v26.16b
429*c0909341SAndroid Build Coastguard Worker        bit             v4.16b,  v28.16b, v27.16b
430*c0909341SAndroid Build Coastguard Worker
431*c0909341SAndroid Build Coastguard Worker4:      // Loop horizontally
        // With pN denoting the input shifted left by N pixels, the filter is
        // symmetric: v6 = p3*v0.h[3] + (p2+p4)*v0.h[4] + (p1+p5)*v0.h[5]
        //                 + (p0+p6)*v0.h[6]   for output pixels 0-7.
        // v22 keeps p3 << 7 for the offset/rounding step further down.
432*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v2.16b,  v3.16b, #4
433*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v2.16b,  v3.16b, #8
434*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v2.16b,  v3.16b, #2
435*c0909341SAndroid Build Coastguard Worker        ext             v20.16b, v2.16b,  v3.16b, #10
436*c0909341SAndroid Build Coastguard Worker        ext             v21.16b, v2.16b,  v3.16b, #12
437*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v2.16b,  v3.16b, #6
438*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v17.8h
439*c0909341SAndroid Build Coastguard Worker        add             v20.8h,  v20.8h,  v16.8h
440*c0909341SAndroid Build Coastguard Worker        add             v21.8h,  v21.8h,  v2.8h
441*c0909341SAndroid Build Coastguard Worker        shl             v22.8h,  v18.8h,  #7
442*c0909341SAndroid Build Coastguard Worker        mul             v6.8h,   v18.8h,  v0.h[3]
443*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v19.8h,  v0.h[4]
444*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v20.8h,  v0.h[5]
445*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v21.8h,  v0.h[6]
446*c0909341SAndroid Build Coastguard Worker
        // Same computation for output pixels 8-15, into v7/v23.
447*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v3.16b,  v4.16b, #4
448*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v3.16b,  v4.16b, #8
449*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v3.16b,  v4.16b, #2
450*c0909341SAndroid Build Coastguard Worker        ext             v20.16b, v3.16b,  v4.16b, #10
451*c0909341SAndroid Build Coastguard Worker        ext             v21.16b, v3.16b,  v4.16b, #12
452*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v3.16b,  v4.16b, #6
453*c0909341SAndroid Build Coastguard Worker
454*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v17.8h
455*c0909341SAndroid Build Coastguard Worker        add             v20.8h,  v20.8h,  v16.8h
456*c0909341SAndroid Build Coastguard Worker        add             v21.8h,  v21.8h,  v3.8h
457*c0909341SAndroid Build Coastguard Worker        shl             v23.8h,  v18.8h,  #7
458*c0909341SAndroid Build Coastguard Worker        mul             v7.8h,   v18.8h,  v0.h[3]
459*c0909341SAndroid Build Coastguard Worker        mla             v7.8h,   v19.8h,  v0.h[4]
460*c0909341SAndroid Build Coastguard Worker        mla             v7.8h,   v20.8h,  v0.h[5]
461*c0909341SAndroid Build Coastguard Worker        mla             v7.8h,   v21.8h,  v0.h[6]
462*c0909341SAndroid Build Coastguard Worker
        // Finish the horizontal result as ((sum + (p3 << 7) - v30) >> 3) + v31,
        // using a saturating add. The loads of the six stored rows for the
        // vertical pass are interleaved with this arithmetic.
463*c0909341SAndroid Build Coastguard Worker        ld1             {v20.8h, v21.8h}, [x11], #32
464*c0909341SAndroid Build Coastguard Worker
465*c0909341SAndroid Build Coastguard Worker        sub             v22.8h,  v22.8h,  v30.8h
466*c0909341SAndroid Build Coastguard Worker        sub             v23.8h,  v23.8h,  v30.8h
467*c0909341SAndroid Build Coastguard Worker        ld1             {v26.8h, v27.8h}, [x13], #32
468*c0909341SAndroid Build Coastguard Worker        sqadd           v6.8h,   v6.8h,   v22.8h
469*c0909341SAndroid Build Coastguard Worker        sqadd           v7.8h,   v7.8h,   v23.8h
470*c0909341SAndroid Build Coastguard Worker        ld1             {v18.8h, v19.8h}, [x10], #32
471*c0909341SAndroid Build Coastguard Worker        sshr            v6.8h,   v6.8h,   #3
472*c0909341SAndroid Build Coastguard Worker        sshr            v7.8h,   v7.8h,   #3
473*c0909341SAndroid Build Coastguard Worker        ld1             {v28.8h, v29.8h}, [x14], #32
474*c0909341SAndroid Build Coastguard Worker        add             v6.8h,   v6.8h,   v31.8h
475*c0909341SAndroid Build Coastguard Worker        add             v7.8h,   v7.8h,   v31.8h
476*c0909341SAndroid Build Coastguard Worker
        // Pair up the rows that share a vertical coefficient:
        //   v26/v27 = row x11 + row x13, v28/v29 = row x10 + row x14,
        //   v16/v17 = row x9 + the new row (v6/v7); row x12 is the centre tap.
477*c0909341SAndroid Build Coastguard Worker        ld1             {v16.8h, v17.8h}, [x9],  #32
478*c0909341SAndroid Build Coastguard Worker        add             v26.8h,  v20.8h,  v26.8h
479*c0909341SAndroid Build Coastguard Worker
480*c0909341SAndroid Build Coastguard Worker        ld1             {v24.8h, v25.8h}, [x12], #32
481*c0909341SAndroid Build Coastguard Worker        add             v28.8h,  v18.8h,  v28.8h
482*c0909341SAndroid Build Coastguard Worker
483*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v6.8h
484*c0909341SAndroid Build Coastguard Worker        add             v27.8h,  v21.8h,  v27.8h
485*c0909341SAndroid Build Coastguard Worker
        // Vertical filter with 32-bit accumulators (v18-v21, four lanes each).
486*c0909341SAndroid Build Coastguard Worker        smull           v18.4s,  v24.4h,  v1.h[3]
487*c0909341SAndroid Build Coastguard Worker        smlal           v18.4s,  v26.4h,  v1.h[4]
488*c0909341SAndroid Build Coastguard Worker        smlal           v18.4s,  v28.4h,  v1.h[5]
489*c0909341SAndroid Build Coastguard Worker        smlal           v18.4s,  v16.4h,  v1.h[6]
490*c0909341SAndroid Build Coastguard Worker        add             v29.8h,  v19.8h,  v29.8h
491*c0909341SAndroid Build Coastguard Worker        smull2          v19.4s,  v24.8h,  v1.h[3]
492*c0909341SAndroid Build Coastguard Worker        smlal2          v19.4s,  v26.8h,  v1.h[4]
493*c0909341SAndroid Build Coastguard Worker        smlal2          v19.4s,  v28.8h,  v1.h[5]
494*c0909341SAndroid Build Coastguard Worker        smlal2          v19.4s,  v16.8h,  v1.h[6]
495*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v7.8h
496*c0909341SAndroid Build Coastguard Worker        smull           v20.4s,  v25.4h,  v1.h[3]
497*c0909341SAndroid Build Coastguard Worker        smlal           v20.4s,  v27.4h,  v1.h[4]
498*c0909341SAndroid Build Coastguard Worker        smlal           v20.4s,  v29.4h,  v1.h[5]
499*c0909341SAndroid Build Coastguard Worker        smlal           v20.4s,  v17.4h,  v1.h[6]
500*c0909341SAndroid Build Coastguard Worker        smull2          v21.4s,  v25.8h,  v1.h[3]
501*c0909341SAndroid Build Coastguard Worker        smlal2          v21.4s,  v27.8h,  v1.h[4]
502*c0909341SAndroid Build Coastguard Worker        smlal2          v21.4s,  v29.8h,  v1.h[5]
503*c0909341SAndroid Build Coastguard Worker        smlal2          v21.4s,  v17.8h,  v1.h[6]
        // Round, shift right by 11 and saturate to unsigned 16 bit, then
        // saturate again down to 8-bit pixels.
504*c0909341SAndroid Build Coastguard Worker        sqrshrun        v18.4h,  v18.4s,  #11
505*c0909341SAndroid Build Coastguard Worker        sqrshrun2       v18.8h,  v19.4s,  #11
506*c0909341SAndroid Build Coastguard Worker        sqrshrun        v19.4h,  v20.4s,  #11
507*c0909341SAndroid Build Coastguard Worker        sqrshrun2       v19.8h,  v21.4s,  #11
508*c0909341SAndroid Build Coastguard Worker        st1             {v6.8h, v7.8h}, [x15], #32 // store the new horizontal row
509*c0909341SAndroid Build Coastguard Worker        sqxtun          v18.8b,  v18.8h
510*c0909341SAndroid Build Coastguard Worker        sqxtun2         v18.16b, v19.8h
511*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #16
512*c0909341SAndroid Build Coastguard Worker
513*c0909341SAndroid Build Coastguard Worker        st1             {v18.16b}, [x0], #16
514*c0909341SAndroid Build Coastguard Worker
        // The flags tested here still come from the subs above; the store in
        // between does not modify them.
515*c0909341SAndroid Build Coastguard Worker        b.le            0f
        // Slide the window: the old v4 becomes v2 and 16 fresh pixels are
        // widened into v3/v4.
516*c0909341SAndroid Build Coastguard Worker        mov             v2.16b,  v4.16b
517*c0909341SAndroid Build Coastguard Worker        ld1             {v4.16b}, [x3], #16
518*c0909341SAndroid Build Coastguard Worker        tst             w7,  #2 // LR_HAVE_RIGHT
519*c0909341SAndroid Build Coastguard Worker        uxtl            v3.8h,   v4.8b
520*c0909341SAndroid Build Coastguard Worker        uxtl2           v4.8h,   v4.16b
521*c0909341SAndroid Build Coastguard Worker        b.ne            4b // If we don't need to pad, just keep filtering.
522*c0909341SAndroid Build Coastguard Worker        b               3b // If we need to pad, check how many pixels we have left.
523*c0909341SAndroid Build Coastguard Worker
524*c0909341SAndroid Build Coastguard Worker0:
        // Reload with every row pointer shifted down by one register (see the
        // comment at the top); x15 receives the second saved copy of x10.
525*c0909341SAndroid Build Coastguard Worker        ldp             x3,  x4,  [sp, #64]
526*c0909341SAndroid Build Coastguard Worker        ldp             x15, x0,  [sp, #48]
527*c0909341SAndroid Build Coastguard Worker        ldp             x13, x14, [sp, #32]
528*c0909341SAndroid Build Coastguard Worker        ldp             x11, x12, [sp, #16]
529*c0909341SAndroid Build Coastguard Worker        ldp             x9,  x10, [sp], #80
530*c0909341SAndroid Build Coastguard Worker
        // Advance source and destination to the next row.
531*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1
532*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1
533*c0909341SAndroid Build Coastguard Worker
534*c0909341SAndroid Build Coastguard Worker        ret
535*c0909341SAndroid Build Coastguard Workerendfunc
536*c0909341SAndroid Build Coastguard Worker
537*c0909341SAndroid Build Coastguard Worker// void dav1d_wiener_filter5_8bpc_neon(pixel *p, const ptrdiff_t stride,
538*c0909341SAndroid Build Coastguard Worker//                                     const pixel (*left)[4], const pixel *lpf,
539*c0909341SAndroid Build Coastguard Worker//                                     const int w, int h,
540*c0909341SAndroid Build Coastguard Worker//                                     const int16_t filter[2][8],
541*c0909341SAndroid Build Coastguard Worker//                                     const enum LrEdgeFlags edges);
//
// Exported entry point of the 5-tap Wiener filter for 8 bpc. Following the
// prototype above and AAPCS64, the arguments arrive as x0 = p, x1 = stride,
// x2 = left, x3 = lpf, w4 = w, w5 = h, x6 = filter and w7 = edges.
//
// The function reserves 384*2*4 bytes of stack for rows of 16-bit
// intermediates, 384*2 bytes apart, and addresses them through x11-x15
// (t4-t0). It drives the _h (horizontal only), _hv (horizontal + vertical)
// and _v (vertical only) helpers row by row. x16 alternately holds the saved
// left pointer and the saved lpf pointer for the rows below the unit.
// NOTE(review): wiener_filter5_hv_8bpc_neon and most of
// wiener_filter5_v_8bpc_neon lie outside this chunk; the row pointer
// rotation they perform is assumed, not verified here.
542*c0909341SAndroid Build Coastguard Workerfunction wiener_filter5_8bpc_neon, export=1
543*c0909341SAndroid Build Coastguard Worker        AARCH64_SIGN_LINK_REGISTER
544*c0909341SAndroid Build Coastguard Worker        stp             x29, x30, [sp, #-16]!
545*c0909341SAndroid Build Coastguard Worker        mov             x29, sp
546*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h},  [x6]
        // The flags set here are only consumed by the b.eq L(no_top_5) further
        // down. NOTE(review): this relies on the sub_sp macro (defined
        // elsewhere) leaving the flags untouched.
547*c0909341SAndroid Build Coastguard Worker        tst             w7,  #4               // LR_HAVE_TOP
548*c0909341SAndroid Build Coastguard Worker        sub_sp          384*2*4
549*c0909341SAndroid Build Coastguard Worker
        // v30 = (1 << 14) - (1 << 2) and v31 = 8 << 8 in every 16-bit lane;
        // the horizontal helper subtracts v30 before and adds v31 after its
        // shift right by 3.
550*c0909341SAndroid Build Coastguard Worker        mov             w17, #(1 << 14) - (1 << 2)
551*c0909341SAndroid Build Coastguard Worker        dup             v30.8h,  w17
552*c0909341SAndroid Build Coastguard Worker        movi            v31.8h,  #8, lsl #8
553*c0909341SAndroid Build Coastguard Worker
554*c0909341SAndroid Build Coastguard Worker        // x11 - t4
555*c0909341SAndroid Build Coastguard Worker        // x12 - t3
556*c0909341SAndroid Build Coastguard Worker        // x13 - t2
557*c0909341SAndroid Build Coastguard Worker        // x14 - t1
558*c0909341SAndroid Build Coastguard Worker        // x15 - t0
559*c0909341SAndroid Build Coastguard Worker        mov             x14, sp               // t1
560*c0909341SAndroid Build Coastguard Worker        b.eq            L(no_top_5)
561*c0909341SAndroid Build Coastguard Worker
        // LR_HAVE_TOP: filter two rows from lpf first, with left = NULL.
562*c0909341SAndroid Build Coastguard Worker        mov             x16, x2               // backup left
563*c0909341SAndroid Build Coastguard Worker        mov             x2,  #0
564*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_h_8bpc_neon
565*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1          // lpf += stride
566*c0909341SAndroid Build Coastguard Worker        mov             x11, x14              // t4
567*c0909341SAndroid Build Coastguard Worker        add             x14, x14, #384*2      // t1 += 384*2
568*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_h_8bpc_neon
569*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1,  lsl #2
570*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1          // lpf += stride*5
571*c0909341SAndroid Build Coastguard Worker        mov             x12, x14              // t3
572*c0909341SAndroid Build Coastguard Worker        add             x14, x14, #384*2      // t1 += 384*2
573*c0909341SAndroid Build Coastguard Worker        mov             x2,  x16              // left
574*c0909341SAndroid Build Coastguard Worker        mov             x16, x3               // backup lpf
575*c0909341SAndroid Build Coastguard Worker        mov             x3,  x0               // lpf = p
576*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_h_8bpc_neon
        // The mov between subs and b.eq does not affect the flags.
577*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1          // h--
578*c0909341SAndroid Build Coastguard Worker        mov             x13, x14              // t2
579*c0909341SAndroid Build Coastguard Worker        b.eq            L(v1_5)
580*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1          // src += stride
581*c0909341SAndroid Build Coastguard Worker        add             x14, x14, #384*2      // t1 += 384*2
582*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_h_8bpc_neon
583*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1          // h--
584*c0909341SAndroid Build Coastguard Worker        b.eq            L(v2_5)
585*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1          // src += stride
586*c0909341SAndroid Build Coastguard Worker
587*c0909341SAndroid Build Coastguard WorkerL(main_5):
588*c0909341SAndroid Build Coastguard Worker        mov             x15, x11              // t0 = t4
589*c0909341SAndroid Build Coastguard WorkerL(main_loop_5):
        // One call per remaining row of the unit.
590*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_hv_8bpc_neon
591*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1          // h--
592*c0909341SAndroid Build Coastguard Worker        b.ne            L(main_loop_5)
593*c0909341SAndroid Build Coastguard Worker        tst             w7,  #8 // LR_HAVE_BOTTOM
594*c0909341SAndroid Build Coastguard Worker        b.eq            L(v2_5)
595*c0909341SAndroid Build Coastguard Worker
        // LR_HAVE_BOTTOM: run two more rows, sourced from the saved lpf
        // pointer, with left = NULL.
596*c0909341SAndroid Build Coastguard Worker        mov             x3,  x16              // restore lpf
597*c0909341SAndroid Build Coastguard Worker        mov             x2,  #0               // left = NULL
598*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_hv_8bpc_neon
599*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_hv_8bpc_neon
600*c0909341SAndroid Build Coastguard WorkerL(end_5):
601*c0909341SAndroid Build Coastguard Worker
        // Resetting sp from the frame pointer releases the scratch rows.
602*c0909341SAndroid Build Coastguard Worker        mov             sp,  x29
603*c0909341SAndroid Build Coastguard Worker        ldp             x29, x30, [sp], #16
604*c0909341SAndroid Build Coastguard Worker        AARCH64_VALIDATE_LINK_REGISTER
605*c0909341SAndroid Build Coastguard Worker        ret
606*c0909341SAndroid Build Coastguard Worker
607*c0909341SAndroid Build Coastguard WorkerL(no_top_5):
        // !LR_HAVE_TOP: x16 ends up at the same lpf + stride*6 position that
        // the LR_HAVE_TOP path saves, and t4, t3 and t2 all alias the first
        // filtered row.
608*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1,  lsl #2
609*c0909341SAndroid Build Coastguard Worker        add             x16, x3,  x1,  lsl #1 // lpf += stride*6, backup
610*c0909341SAndroid Build Coastguard Worker        mov             x3,  x0               // lpf = p
611*c0909341SAndroid Build Coastguard Worker
612*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_h_8bpc_neon
613*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1          // h--
614*c0909341SAndroid Build Coastguard Worker        mov             x11, x14              // t4
615*c0909341SAndroid Build Coastguard Worker        mov             x12, x14              // t3
616*c0909341SAndroid Build Coastguard Worker        mov             x13, x14              // t2
617*c0909341SAndroid Build Coastguard Worker        b.eq            L(v1_5)
618*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1          // src += stride
619*c0909341SAndroid Build Coastguard Worker        add             x14, x14, #384*2      // t1 += 384*2
620*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_h_8bpc_neon
621*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1          // h--
622*c0909341SAndroid Build Coastguard Worker        b.eq            L(v2_5)
623*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1          // src += stride
624*c0909341SAndroid Build Coastguard Worker        add             x15, x14, #384*2      // t0 = t1 + 384*2
625*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_hv_8bpc_neon
626*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1          // h--
627*c0909341SAndroid Build Coastguard Worker        b.eq            L(v2_5)
        // NOTE(review): the offset is relative to whatever x15 holds after
        // the hv call above, which is not visible in this chunk.
628*c0909341SAndroid Build Coastguard Worker        add             x15, x15, #384*2*3    // t0 += 384*2*3
629*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_hv_8bpc_neon
630*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1          // h--
631*c0909341SAndroid Build Coastguard Worker        b.ne            L(main_5)
632*c0909341SAndroid Build Coastguard WorkerL(v2_5):
        // Two vertical-only passes remain: run one, step the output pointer by
        // one stride and shift t3/t2/t1 into t4/t3/t2, then fall through to
        // the final pass.
633*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_v_8bpc_neon
634*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1
635*c0909341SAndroid Build Coastguard Worker        mov             x11, x12
636*c0909341SAndroid Build Coastguard Worker        mov             x12, x13
637*c0909341SAndroid Build Coastguard Worker        mov             x13, x14
638*c0909341SAndroid Build Coastguard WorkerL(v1_5):
        // A single vertical-only pass remains.
639*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_v_8bpc_neon
640*c0909341SAndroid Build Coastguard Worker        b               L(end_5)
641*c0909341SAndroid Build Coastguard Workerendfunc
642*c0909341SAndroid Build Coastguard Worker
643*c0909341SAndroid Build Coastguard Worker
644*c0909341SAndroid Build Coastguard Workerfunction wiener_filter5_h_8bpc_neon
        // Horizontal-only 5-tap pass over one row: reads 8-bit pixels from x3
        // (and optionally from x2) and stores 16-bit intermediates, 16 per
        // loop iteration, to the row at x14.
        //
        // Registers as used by the code below:
        //   x2       - left edge pixels or NULL; only read if LR_HAVE_LEFT
        //   x3       - source pixel pointer; saved and restored
        //   w4       - width; saved and restored
        //   w7       - edge flags (bit 0 LR_HAVE_LEFT, bit 1 LR_HAVE_RIGHT)
        //   x14      - destination row; saved and restored
        //   v0       - filter coefficients, lanes h[3]-h[5] are used
        //   v30, v31 - constants subtracted before / added after the >> 3
        //
        // Clobbers v2-v4, v6, v7, v16-v19, v22, v23 and the flags, plus x6, x17
        // and v25-v28 when the right edge is padded. x2 has advanced by 4 on
        // return if the left != NULL path was taken.
        //
        // The 32-byte frame keeps sp 16-byte aligned; [sp, #24] is unused.
645*c0909341SAndroid Build Coastguard Worker        stp             x3,  x4,  [sp, #-32]!
646*c0909341SAndroid Build Coastguard Worker        str             x14,      [sp, #16]
647*c0909341SAndroid Build Coastguard Worker
648*c0909341SAndroid Build Coastguard Worker        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
649*c0909341SAndroid Build Coastguard Worker        tst             w7,  #1 // LR_HAVE_LEFT
650*c0909341SAndroid Build Coastguard Worker        b.eq            1f
651*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT
652*c0909341SAndroid Build Coastguard Worker        cbnz            x2,  0f
653*c0909341SAndroid Build Coastguard Worker        // left == NULL
        // The 2 pixels to the left are read straight from memory before x3.
654*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #2
655*c0909341SAndroid Build Coastguard Worker        ld1             {v3.16b}, [x3], #16
656*c0909341SAndroid Build Coastguard Worker        b               2f
657*c0909341SAndroid Build Coastguard Worker
658*c0909341SAndroid Build Coastguard Worker0:
659*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT, left != NULL
        // Four left pixels go into the top 32-bit lane of v2; the ext below
        // keeps only the last 2 of them in front of the first 14 row pixels.
660*c0909341SAndroid Build Coastguard Worker        ld1             {v3.16b},  [x3], #16
661*c0909341SAndroid Build Coastguard Worker        ld1             {v2.s}[3], [x2], #4
662*c0909341SAndroid Build Coastguard Worker        // Move x3 back to account for the last 2 bytes we loaded earlier,
663*c0909341SAndroid Build Coastguard Worker        // which we'll shift out.
664*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #2
665*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v2.16b,  v3.16b, #14
666*c0909341SAndroid Build Coastguard Worker        b               2f
667*c0909341SAndroid Build Coastguard Worker
668*c0909341SAndroid Build Coastguard Worker1:
669*c0909341SAndroid Build Coastguard Worker        ld1             {v3.16b}, [x3], #16
670*c0909341SAndroid Build Coastguard Worker        // !LR_HAVE_LEFT, fill v2 with the leftmost byte
671*c0909341SAndroid Build Coastguard Worker        // and shift v3 to have 2x the first byte at the front.
672*c0909341SAndroid Build Coastguard Worker        dup             v2.16b,  v3.b[0]
673*c0909341SAndroid Build Coastguard Worker        // Move x3 back to account for the last 2 bytes we loaded before,
674*c0909341SAndroid Build Coastguard Worker        // which we shifted out.
675*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #2
676*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v2.16b,  v3.16b, #14
677*c0909341SAndroid Build Coastguard Worker
678*c0909341SAndroid Build Coastguard Worker2:
        // Load 8 more pixels and widen everything to 16 bit, so that v2, v3
        // and v4 together hold 24 consecutive pixels as halfwords.
679*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8b}, [x3], #8
680*c0909341SAndroid Build Coastguard Worker        uxtl            v2.8h,   v3.8b
681*c0909341SAndroid Build Coastguard Worker        uxtl2           v3.8h,   v3.16b
682*c0909341SAndroid Build Coastguard Worker        uxtl            v4.8h,   v4.8b
683*c0909341SAndroid Build Coastguard Worker
684*c0909341SAndroid Build Coastguard Worker        tst             w7,  #2 // LR_HAVE_RIGHT
685*c0909341SAndroid Build Coastguard Worker        b.ne            4f
686*c0909341SAndroid Build Coastguard Worker
687*c0909341SAndroid Build Coastguard Worker3:      // !LR_HAVE_RIGHT
688*c0909341SAndroid Build Coastguard Worker
689*c0909341SAndroid Build Coastguard Worker        // Check whether we need to pad the right edge
690*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #18
691*c0909341SAndroid Build Coastguard Worker        b.ge            4f   // If w >= 18, all used input pixels are valid
692*c0909341SAndroid Build Coastguard Worker
693*c0909341SAndroid Build Coastguard Worker        // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
694*c0909341SAndroid Build Coastguard Worker        // this ends up called again; it's not strictly needed in those
695*c0909341SAndroid Build Coastguard Worker        // cases (we pad enough here), but keeping the code as simple as possible.
696*c0909341SAndroid Build Coastguard Worker
697*c0909341SAndroid Build Coastguard Worker        // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
698*c0909341SAndroid Build Coastguard Worker        // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
699*c0909341SAndroid Build Coastguard Worker        sub             w17, w4,  #23
700*c0909341SAndroid Build Coastguard Worker        // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
701*c0909341SAndroid Build Coastguard Worker        // buffer pointer.
        // NOTE(review): right_ext_mask is defined outside this chunk; the
        // address arithmetic below assumes it can be indexed backwards by
        // 2*(w+2) bytes.
702*c0909341SAndroid Build Coastguard Worker        movrel          x6,  right_ext_mask, -4
703*c0909341SAndroid Build Coastguard Worker        ldr             b28, [x3,  w17, sxtw]
704*c0909341SAndroid Build Coastguard Worker        sub             x6,  x6,  w4,  uxtw #1
        // The byte load cleared the rest of v28, so h[0] already is the
        // padding pixel zero-extended to 16 bit.
705*c0909341SAndroid Build Coastguard Worker        dup             v28.8h,  v28.h[0]
706*c0909341SAndroid Build Coastguard Worker        ld1             {v25.16b, v26.16b, v27.16b}, [x6]
707*c0909341SAndroid Build Coastguard Worker
        // Copy the padding pixel into every lane whose mask bits are set.
708*c0909341SAndroid Build Coastguard Worker        bit             v2.16b,  v28.16b, v25.16b
709*c0909341SAndroid Build Coastguard Worker        bit             v3.16b,  v28.16b, v26.16b
710*c0909341SAndroid Build Coastguard Worker        bit             v4.16b,  v28.16b, v27.16b
711*c0909341SAndroid Build Coastguard Worker
712*c0909341SAndroid Build Coastguard Worker4:      // Loop horizontally
713*c0909341SAndroid Build Coastguard Worker        // Interleaving the mul/mla chains actually hurts performance
714*c0909341SAndroid Build Coastguard Worker        // significantly on Cortex A53, thus keeping mul/mla tightly
715*c0909341SAndroid Build Coastguard Worker        // chained like this.
        // With pN denoting the input shifted left by N pixels, the filter is
        // symmetric: v6 = p2*v0.h[3] + (p1+p3)*v0.h[4] + (p0+p4)*v0.h[5]
        // for output pixels 0-7; v22 keeps p2 << 7 for the step further down.
716*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v2.16b,  v3.16b, #2
717*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v2.16b,  v3.16b, #6
718*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v2.16b,  v3.16b, #8
719*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v2.16b,  v3.16b, #4
720*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v16.8h
721*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v2.8h
722*c0909341SAndroid Build Coastguard Worker        shl             v22.8h,  v17.8h,  #7
723*c0909341SAndroid Build Coastguard Worker        mul             v6.8h,   v17.8h,  v0.h[3]
724*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v18.8h,  v0.h[4]
725*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v19.8h,  v0.h[5]
726*c0909341SAndroid Build Coastguard Worker
        // Same computation for output pixels 8-15, into v7/v23.
727*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v3.16b,  v4.16b, #2
728*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v3.16b,  v4.16b, #6
729*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v3.16b,  v4.16b, #8
730*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v3.16b,  v4.16b, #4
731*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v16.8h
732*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v3.8h
733*c0909341SAndroid Build Coastguard Worker        shl             v23.8h,  v17.8h,  #7
734*c0909341SAndroid Build Coastguard Worker        mul             v7.8h,   v17.8h,  v0.h[3]
735*c0909341SAndroid Build Coastguard Worker        mla             v7.8h,   v18.8h,  v0.h[4]
736*c0909341SAndroid Build Coastguard Worker        mla             v7.8h,   v19.8h,  v0.h[5]
737*c0909341SAndroid Build Coastguard Worker
        // Finish as ((sum + (p2 << 7) - v30) >> 3) + v31, using a saturating
        // add for the first step.
738*c0909341SAndroid Build Coastguard Worker        sub             v22.8h,  v22.8h,  v30.8h
739*c0909341SAndroid Build Coastguard Worker        sub             v23.8h,  v23.8h,  v30.8h
740*c0909341SAndroid Build Coastguard Worker        sqadd           v6.8h,   v6.8h,   v22.8h
741*c0909341SAndroid Build Coastguard Worker        sqadd           v7.8h,   v7.8h,   v23.8h
742*c0909341SAndroid Build Coastguard Worker        sshr            v6.8h,   v6.8h,   #3
743*c0909341SAndroid Build Coastguard Worker        sshr            v7.8h,   v7.8h,   #3
744*c0909341SAndroid Build Coastguard Worker        add             v6.8h,   v6.8h,   v31.8h
745*c0909341SAndroid Build Coastguard Worker        add             v7.8h,   v7.8h,   v31.8h
746*c0909341SAndroid Build Coastguard Worker
747*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #16
748*c0909341SAndroid Build Coastguard Worker
749*c0909341SAndroid Build Coastguard Worker        st1             {v6.8h, v7.8h}, [x14], #32
750*c0909341SAndroid Build Coastguard Worker
        // The flags tested here still come from the subs above; the store in
        // between does not modify them.
751*c0909341SAndroid Build Coastguard Worker        b.le            0f
        // Slide the window: the old v4 becomes v2 and 16 fresh pixels are
        // widened into v3/v4.
752*c0909341SAndroid Build Coastguard Worker        mov             v2.16b,  v4.16b
753*c0909341SAndroid Build Coastguard Worker        ld1             {v4.16b}, [x3], #16
754*c0909341SAndroid Build Coastguard Worker        tst             w7,  #2 // LR_HAVE_RIGHT
755*c0909341SAndroid Build Coastguard Worker        uxtl            v3.8h,   v4.8b
756*c0909341SAndroid Build Coastguard Worker        uxtl2           v4.8h,   v4.16b
757*c0909341SAndroid Build Coastguard Worker        b.ne            4b // If we don't need to pad, just keep filtering.
758*c0909341SAndroid Build Coastguard Worker        b               3b // If we need to pad, check how many pixels we have left.
759*c0909341SAndroid Build Coastguard Worker
760*c0909341SAndroid Build Coastguard Worker0:
        // Restore the row pointer, source pointer and width for the caller.
761*c0909341SAndroid Build Coastguard Worker        ldr             x14,      [sp, #16]
762*c0909341SAndroid Build Coastguard Worker        ldp             x3,  x4,  [sp], #32
763*c0909341SAndroid Build Coastguard Worker        ret
764*c0909341SAndroid Build Coastguard Workerendfunc
765*c0909341SAndroid Build Coastguard Worker
// wiener_filter5_v_8bpc_neon: internal (non-exported) vertical pass that
// combines four rows of 16-bit values into one row of 8-bit output.
//
// Inputs, as used by the code below (the caller that sets them up is not
// part of this function):
//   x11, x12, x13, x14  row pointers; 16 halfwords (32 bytes) are read from
//                       each of them per loop iteration.
//   x0                  output pointer; 16 bytes are written per iteration.
//   w4                  remaining width; the loop runs while it stays > 0
//                       after subtracting 16, so output is written in whole
//                       16-byte groups.
//   v1.h[3..5]          multipliers - presumably the vertical filter taps
//                       (TODO confirm against the caller).
//
// Per element:
//   sum = [x13]*v1.h[3] + ([x12]+[x14])*v1.h[4] + ([x11]+[x14])*v1.h[5]
// Note that the row at x14 is added into both pairs. The 32-bit sum is
// narrowed with a rounding right shift by 11 (unsigned saturating) to
// 16 bits and then saturated to an unsigned byte.
//
// x11-x14, x0 and x4 are saved on entry and reloaded before returning, so
// the caller sees them unchanged. Overwrites v2-v5, v16-v25 and the flags.
766*c0909341SAndroid Build Coastguard Workerfunction wiener_filter5_v_8bpc_neon
767*c0909341SAndroid Build Coastguard Worker        stp             x11, x12, [sp, #-48]! // Allocate 48 bytes and save the row pointers
768*c0909341SAndroid Build Coastguard Worker        stp             x13, x14, [sp, #16]
769*c0909341SAndroid Build Coastguard Worker        stp             x0,  x4,  [sp, #32] // Save dst and width; both are modified by the loop
770*c0909341SAndroid Build Coastguard Worker1:
771*c0909341SAndroid Build Coastguard Worker        ld1             {v18.8h, v19.8h}, [x12], #32
772*c0909341SAndroid Build Coastguard Worker        ld1             {v22.8h, v23.8h}, [x14], #32
773*c0909341SAndroid Build Coastguard Worker        ld1             {v16.8h, v17.8h}, [x11], #32
774*c0909341SAndroid Build Coastguard Worker
775*c0909341SAndroid Build Coastguard Worker        add             v24.8h,  v22.8h,  v18.8h // [x14] + [x12], elements 0-7
776*c0909341SAndroid Build Coastguard Worker        ld1             {v20.8h, v21.8h}, [x13], #32 // Row multiplied by v1.h[3]
777*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v22.8h,  v16.8h // [x14] + [x11], elements 0-7
778*c0909341SAndroid Build Coastguard Worker        add             v25.8h,  v23.8h,  v19.8h // [x14] + [x12], elements 8-15
779*c0909341SAndroid Build Coastguard Worker
        // Widening multiply-accumulate into 32 bit; v2/v3 hold elements
        // 0-7 and v4/v5 hold elements 8-15.
780*c0909341SAndroid Build Coastguard Worker        smull           v2.4s,   v20.4h,  v1.h[3]
781*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v24.4h,  v1.h[4]
782*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v16.4h,  v1.h[5]
783*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v23.8h,  v17.8h // [x14] + [x11], elements 8-15
784*c0909341SAndroid Build Coastguard Worker        smull2          v3.4s,   v20.8h,  v1.h[3]
785*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v24.8h,  v1.h[4]
786*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v16.8h,  v1.h[5]
787*c0909341SAndroid Build Coastguard Worker        smull           v4.4s,   v21.4h,  v1.h[3]
788*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v25.4h,  v1.h[4]
789*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v17.4h,  v1.h[5]
790*c0909341SAndroid Build Coastguard Worker        smull2          v5.4s,   v21.8h,  v1.h[3]
791*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v25.8h,  v1.h[4]
792*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v17.8h,  v1.h[5]
        // Rounding shift right by 11, narrowing 32 -> 16 bit with unsigned
        // saturation.
793*c0909341SAndroid Build Coastguard Worker        sqrshrun        v2.4h,   v2.4s,   #11
794*c0909341SAndroid Build Coastguard Worker        sqrshrun2       v2.8h,   v3.4s,   #11
795*c0909341SAndroid Build Coastguard Worker        sqrshrun        v3.4h,   v4.4s,   #11
796*c0909341SAndroid Build Coastguard Worker        sqrshrun2       v3.8h,   v5.4s,   #11
        // Saturating narrow 16 -> 8 bit; v2 then holds all 16 output bytes.
797*c0909341SAndroid Build Coastguard Worker        sqxtun          v2.8b,   v2.8h
798*c0909341SAndroid Build Coastguard Worker        sqxtun2         v2.16b,  v3.8h
799*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #16
800*c0909341SAndroid Build Coastguard Worker        st1             {v2.16b}, [x0], #16 // The store leaves the flags from subs intact
801*c0909341SAndroid Build Coastguard Worker        b.gt            1b
802*c0909341SAndroid Build Coastguard Worker
        // Undo all pointer/width updates made by the loop.
803*c0909341SAndroid Build Coastguard Worker        ldp             x0,  x4,  [sp, #32]
804*c0909341SAndroid Build Coastguard Worker        ldp             x13, x14, [sp, #16]
805*c0909341SAndroid Build Coastguard Worker        ldp             x11, x12, [sp], #48
806*c0909341SAndroid Build Coastguard Worker
807*c0909341SAndroid Build Coastguard Worker        ret
808*c0909341SAndroid Build Coastguard Workerendfunc
809*c0909341SAndroid Build Coastguard Worker
// wiener_filter5_hv_8bpc_neon: internal (non-exported) combined pass. For one
// row of 8-bit input it computes the horizontally filtered 16-bit row, stores
// that row to [x15], and in the same loop combines it with four previously
// stored rows (x11-x14) into one row of 8-bit output at [x0].
//
// Inputs, as used by the code below (the caller is not part of this function):
//   x0         output pointer, 16 bytes written per iteration.
//   x1         added to both x0 and x3 on exit - presumably the pixel stride
//              (TODO confirm against the caller).
//   x2         pointer to left-edge pixels, or NULL; only looked at when
//              LR_HAVE_LEFT is set in w7.
//   x3         source pixel pointer.
//   w4         width; consumed in steps of 16.
//   w7         edge flags: bit 0 = LR_HAVE_LEFT, bit 1 = LR_HAVE_RIGHT.
//   x11-x14    pointers to rows of 16-bit values, read 32 bytes per iteration.
//   x15        pointer that receives the new horizontally filtered row.
//   v0.h[3..5] multipliers for the horizontal part, v1.h[3..5] for the
//              vertical part - presumably the filter taps (TODO confirm).
//   v30, v31   constant vectors set up by the caller; v30 is subtracted from
//              the (pixel << 7) term and v31 is added after the >> 3.
//
// On return: x3 and x0 are their entry values plus x1, x4 is restored, and
// x11-x15 are rotated as described in the comment below. x2 is advanced by 4
// in the "left != NULL" path and is not restored. Overwrites x6 and x17 (only
// in the right-padding path), v2-v4, v6, v7, v16-v28 and the flags.
810*c0909341SAndroid Build Coastguard Workerfunction wiener_filter5_hv_8bpc_neon
811*c0909341SAndroid Build Coastguard Worker        // Backing up/restoring registers shifted, so that x11 gets the value
812*c0909341SAndroid Build Coastguard Worker        // of x12, etc, and x15==x11, afterwards.
813*c0909341SAndroid Build Coastguard Worker        stp             x12, x13, [sp, #-64]! // Reloaded as x11, x12
814*c0909341SAndroid Build Coastguard Worker        stp             x14, x15, [sp, #16] // Reloaded as x13, x14
815*c0909341SAndroid Build Coastguard Worker        stp             x12, x0,  [sp, #32] // Reloaded as x15, x0
816*c0909341SAndroid Build Coastguard Worker        stp             x3,  x4,  [sp, #48]
817*c0909341SAndroid Build Coastguard Worker
818*c0909341SAndroid Build Coastguard Worker        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
819*c0909341SAndroid Build Coastguard Worker        tst             w7,  #1 // LR_HAVE_LEFT
820*c0909341SAndroid Build Coastguard Worker        b.eq            1f
821*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT
822*c0909341SAndroid Build Coastguard Worker        cbnz            x2,  0f
823*c0909341SAndroid Build Coastguard Worker        // left == NULL
824*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #2 // The 2 left pixels are read directly from src[-2]
825*c0909341SAndroid Build Coastguard Worker        ld1             {v3.16b}, [x3], #16
826*c0909341SAndroid Build Coastguard Worker        b               2f
827*c0909341SAndroid Build Coastguard Worker
828*c0909341SAndroid Build Coastguard Worker0:
829*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT, left != NULL
830*c0909341SAndroid Build Coastguard Worker        ld1             {v3.16b},  [x3], #16
831*c0909341SAndroid Build Coastguard Worker        ld1             {v2.s}[3], [x2], #4 // 4 left bytes into the top lane; the ext below keeps the last 2
832*c0909341SAndroid Build Coastguard Worker        // Move x3 back to account for the last 2 bytes we loaded earlier,
833*c0909341SAndroid Build Coastguard Worker        // which we'll shift out.
834*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #2
835*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v2.16b,  v3.16b, #14
836*c0909341SAndroid Build Coastguard Worker        b               2f
837*c0909341SAndroid Build Coastguard Worker1:
838*c0909341SAndroid Build Coastguard Worker        ld1             {v3.16b}, [x3], #16
839*c0909341SAndroid Build Coastguard Worker        // !LR_HAVE_LEFT, fill v2 with the leftmost byte
840*c0909341SAndroid Build Coastguard Worker        // and shift v3 to have 2x the first byte at the front.
841*c0909341SAndroid Build Coastguard Worker        dup             v2.16b,  v3.b[0]
842*c0909341SAndroid Build Coastguard Worker        // Move x3 back to account for the last 2 bytes we loaded before,
843*c0909341SAndroid Build Coastguard Worker        // which we shifted out.
844*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #2
845*c0909341SAndroid Build Coastguard Worker        ext             v3.16b, v2.16b, v3.16b, #14
846*c0909341SAndroid Build Coastguard Worker
847*c0909341SAndroid Build Coastguard Worker2:
        // Load 8 more bytes and widen everything to 16 bit, so that v2, v3
        // and v4 together hold 24 consecutive pixels as halfwords.
848*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8b}, [x3], #8
849*c0909341SAndroid Build Coastguard Worker        uxtl            v2.8h,  v3.8b
850*c0909341SAndroid Build Coastguard Worker        uxtl2           v3.8h,  v3.16b
851*c0909341SAndroid Build Coastguard Worker        uxtl            v4.8h,  v4.8b
852*c0909341SAndroid Build Coastguard Worker
853*c0909341SAndroid Build Coastguard Worker        tst             w7,  #2 // LR_HAVE_RIGHT
854*c0909341SAndroid Build Coastguard Worker        b.ne            4f
855*c0909341SAndroid Build Coastguard Worker
856*c0909341SAndroid Build Coastguard Worker3:      // !LR_HAVE_RIGHT
857*c0909341SAndroid Build Coastguard Worker
858*c0909341SAndroid Build Coastguard Worker        // Check whether we need to pad the right edge
859*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #18
860*c0909341SAndroid Build Coastguard Worker        b.ge            4f   // If w >= 18, all used input pixels are valid
861*c0909341SAndroid Build Coastguard Worker
862*c0909341SAndroid Build Coastguard Worker        // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
863*c0909341SAndroid Build Coastguard Worker        // this ends up called again; it's not strictly needed in those
864*c0909341SAndroid Build Coastguard Worker        // cases (we pad enough here), but keeping the code as simple as possible.
865*c0909341SAndroid Build Coastguard Worker
866*c0909341SAndroid Build Coastguard Worker        // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
867*c0909341SAndroid Build Coastguard Worker        // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
868*c0909341SAndroid Build Coastguard Worker        sub             w17, w4,  #23
869*c0909341SAndroid Build Coastguard Worker        // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
870*c0909341SAndroid Build Coastguard Worker        // buffer pointer.
871*c0909341SAndroid Build Coastguard Worker        movrel          x6,  right_ext_mask, -4
872*c0909341SAndroid Build Coastguard Worker        ldr             b28, [x3,  w17, sxtw] // Scalar byte load; the rest of v28 is zeroed
873*c0909341SAndroid Build Coastguard Worker        sub             x6,  x6,  w4,  uxtw #1 // Step back 2 bytes per valid pixel (halfword lanes)
874*c0909341SAndroid Build Coastguard Worker        dup             v28.8h,  v28.h[0] // Broadcast the zero-extended padding pixel
875*c0909341SAndroid Build Coastguard Worker        ld1             {v25.16b, v26.16b, v27.16b}, [x6]
876*c0909341SAndroid Build Coastguard Worker
        // Replace the lanes selected by the mask with the padding pixel.
877*c0909341SAndroid Build Coastguard Worker        bit             v2.16b,  v28.16b, v25.16b
878*c0909341SAndroid Build Coastguard Worker        bit             v3.16b,  v28.16b, v26.16b
879*c0909341SAndroid Build Coastguard Worker        bit             v4.16b,  v28.16b, v27.16b
880*c0909341SAndroid Build Coastguard Worker
881*c0909341SAndroid Build Coastguard Worker4:      // Loop horizontally
882*c0909341SAndroid Build Coastguard Worker
        // Horizontal part, outputs 0-7, from the window v2:v3. With p[] being
        // the halfword pixels starting at v2.h[0], output i is
        //   p[i+2]*v0.h[3] + (p[i+1]+p[i+3])*v0.h[4] + (p[i]+p[i+4])*v0.h[5]
        // accumulated in 16 bit into v6; v22 keeps p[i+2] << 7.
883*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v2.16b,  v3.16b, #2
884*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v2.16b,  v3.16b, #6
885*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v2.16b,  v3.16b, #8
886*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v2.16b,  v3.16b, #4
887*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v16.8h
888*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v2.8h
889*c0909341SAndroid Build Coastguard Worker        shl             v22.8h,  v17.8h,  #7
890*c0909341SAndroid Build Coastguard Worker        mul             v6.8h,   v17.8h,  v0.h[3]
891*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v18.8h,  v0.h[4]
892*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v19.8h,  v0.h[5]
893*c0909341SAndroid Build Coastguard Worker
        // Same computation for outputs 8-15, from the window v3:v4, into
        // v7 and v23.
894*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v3.16b,  v4.16b, #2
895*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v3.16b,  v4.16b, #6
896*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v3.16b,  v4.16b, #8
897*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v3.16b,  v4.16b, #4
898*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v16.8h
899*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v3.8h
900*c0909341SAndroid Build Coastguard Worker        shl             v23.8h,  v17.8h,  #7
901*c0909341SAndroid Build Coastguard Worker        mul             v7.8h,   v17.8h,  v0.h[3]
902*c0909341SAndroid Build Coastguard Worker        mla             v7.8h,   v18.8h,  v0.h[4]
903*c0909341SAndroid Build Coastguard Worker        mla             v7.8h,   v19.8h,  v0.h[5]
904*c0909341SAndroid Build Coastguard Worker
        // Finish the horizontal result as ((sum + ((p << 7) - v30)) >> 3) + v31
        // with a saturating add, interleaved with the loads of the four
        // stored rows used by the vertical part.
905*c0909341SAndroid Build Coastguard Worker        ld1             {v18.8h, v19.8h}, [x12], #32
906*c0909341SAndroid Build Coastguard Worker
907*c0909341SAndroid Build Coastguard Worker        sub             v22.8h,  v22.8h,  v30.8h
908*c0909341SAndroid Build Coastguard Worker        sub             v23.8h,  v23.8h,  v30.8h
909*c0909341SAndroid Build Coastguard Worker        ld1             {v24.8h, v25.8h}, [x14], #32
910*c0909341SAndroid Build Coastguard Worker        sqadd           v6.8h,   v6.8h,   v22.8h
911*c0909341SAndroid Build Coastguard Worker        sqadd           v7.8h,   v7.8h,   v23.8h
912*c0909341SAndroid Build Coastguard Worker        ld1             {v16.8h, v17.8h}, [x11], #32
913*c0909341SAndroid Build Coastguard Worker        sshr            v6.8h,   v6.8h,   #3
914*c0909341SAndroid Build Coastguard Worker        sshr            v7.8h,   v7.8h,   #3
915*c0909341SAndroid Build Coastguard Worker        ld1             {v20.8h, v21.8h}, [x13], #32 // Row multiplied by v1.h[3]
916*c0909341SAndroid Build Coastguard Worker        add             v6.8h,   v6.8h,   v31.8h
917*c0909341SAndroid Build Coastguard Worker        add             v7.8h,   v7.8h,   v31.8h
918*c0909341SAndroid Build Coastguard Worker
        // Vertical part: v6/v7 (the row just computed) takes the place of
        // the newest row:
        //   [x13]*v1.h[3] + ([x14]+[x12])*v1.h[4] + ([x11]+new row)*v1.h[5]
919*c0909341SAndroid Build Coastguard Worker        add             v24.8h,  v24.8h,  v18.8h // [x14] + [x12], elements 0-7
920*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v6.8h // [x11] + new row, elements 0-7
921*c0909341SAndroid Build Coastguard Worker
922*c0909341SAndroid Build Coastguard Worker        smull           v18.4s,  v20.4h,  v1.h[3]
923*c0909341SAndroid Build Coastguard Worker        smlal           v18.4s,  v24.4h,  v1.h[4]
924*c0909341SAndroid Build Coastguard Worker        smlal           v18.4s,  v16.4h,  v1.h[5]
925*c0909341SAndroid Build Coastguard Worker        add             v25.8h,  v25.8h,  v19.8h // [x14] + [x12], elements 8-15; must precede the overwrite of v19
926*c0909341SAndroid Build Coastguard Worker        smull2          v19.4s,  v20.8h,  v1.h[3]
927*c0909341SAndroid Build Coastguard Worker        smlal2          v19.4s,  v24.8h,  v1.h[4]
928*c0909341SAndroid Build Coastguard Worker        smlal2          v19.4s,  v16.8h,  v1.h[5]
929*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v7.8h // [x11] + new row, elements 8-15
930*c0909341SAndroid Build Coastguard Worker        smull           v20.4s,  v21.4h,  v1.h[3]
931*c0909341SAndroid Build Coastguard Worker        smlal           v20.4s,  v25.4h,  v1.h[4]
932*c0909341SAndroid Build Coastguard Worker        smlal           v20.4s,  v17.4h,  v1.h[5]
933*c0909341SAndroid Build Coastguard Worker        smull2          v21.4s,  v21.8h,  v1.h[3]
934*c0909341SAndroid Build Coastguard Worker        smlal2          v21.4s,  v25.8h,  v1.h[4]
935*c0909341SAndroid Build Coastguard Worker        smlal2          v21.4s,  v17.8h,  v1.h[5]
        // Rounding shift right by 11, narrowing 32 -> 16 bit with unsigned
        // saturation, followed by a saturating narrow to 8 bit.
936*c0909341SAndroid Build Coastguard Worker        sqrshrun        v18.4h,  v18.4s,  #11
937*c0909341SAndroid Build Coastguard Worker        sqrshrun2       v18.8h,  v19.4s,  #11
938*c0909341SAndroid Build Coastguard Worker        sqrshrun        v19.4h,  v20.4s,  #11
939*c0909341SAndroid Build Coastguard Worker        sqrshrun2       v19.8h,  v21.4s,  #11
940*c0909341SAndroid Build Coastguard Worker        st1             {v6.8h, v7.8h}, [x15], #32 // Store the new horizontally filtered row
941*c0909341SAndroid Build Coastguard Worker        sqxtun          v18.8b,  v18.8h
942*c0909341SAndroid Build Coastguard Worker        sqxtun2         v18.16b, v19.8h
943*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #16
944*c0909341SAndroid Build Coastguard Worker
945*c0909341SAndroid Build Coastguard Worker        st1             {v18.16b}, [x0], #16 // The store leaves the flags from subs intact
946*c0909341SAndroid Build Coastguard Worker
947*c0909341SAndroid Build Coastguard Worker        b.le            0f
        // Slide the window: the old v4 becomes v2, and 16 new pixels are
        // widened into v3/v4. None of the instructions between the tst and
        // the b.ne below modify the flags.
948*c0909341SAndroid Build Coastguard Worker        mov             v2.16b,  v4.16b
949*c0909341SAndroid Build Coastguard Worker        ld1             {v4.16b}, [x3], #16
950*c0909341SAndroid Build Coastguard Worker        tst             w7,  #2 // LR_HAVE_RIGHT
951*c0909341SAndroid Build Coastguard Worker        uxtl            v3.8h,   v4.8b
952*c0909341SAndroid Build Coastguard Worker        uxtl2           v4.8h,   v4.16b
953*c0909341SAndroid Build Coastguard Worker        b.ne            4b // If we don't need to pad, just keep filtering.
954*c0909341SAndroid Build Coastguard Worker        b               3b // If we need to pad, check how many pixels we have left.
955*c0909341SAndroid Build Coastguard Worker
956*c0909341SAndroid Build Coastguard Worker0:
        // Reload with the rotation described at the top of the function:
        // x11..x14 get the entry values of x12..x15, and x15 gets the entry
        // value of x12 (equal to the new x11).
957*c0909341SAndroid Build Coastguard Worker        ldp             x3,  x4,  [sp, #48]
958*c0909341SAndroid Build Coastguard Worker        ldp             x15, x0,  [sp, #32]
959*c0909341SAndroid Build Coastguard Worker        ldp             x13, x14, [sp, #16]
960*c0909341SAndroid Build Coastguard Worker        ldp             x11, x12, [sp], #64
961*c0909341SAndroid Build Coastguard Worker
        // Advance src and dst by x1 from their entry values.
962*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1
963*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1
964*c0909341SAndroid Build Coastguard Worker
965*c0909341SAndroid Build Coastguard Worker        ret
966*c0909341SAndroid Build Coastguard Workerendfunc
967*c0909341SAndroid Build Coastguard Worker
968*c0909341SAndroid Build Coastguard Worker#include "looprestoration_tmpl.S"
969*c0909341SAndroid Build Coastguard Worker
970*c0909341SAndroid Build Coastguard Worker// void dav1d_sgr_box3_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
971*c0909341SAndroid Build Coastguard Worker//                                     const pixel (*left)[4],
972*c0909341SAndroid Build Coastguard Worker//                                     const pixel *src, const int w,
973*c0909341SAndroid Build Coastguard Worker//                                     const enum LrEdgeFlags edges);
//
// Horizontal 3-tap box sums for one row of 8-bit pixels. Each loop
// iteration handles 8 output positions and writes
//   - 8 x 16-bit sums of 3 adjacent pixels to sum (x1), and
//   - 8 x 32-bit sums of the 3 corresponding squared pixels to sumsq (x0).
// The window starts 2 pixels left of src, and w is increased by 2 before
// the loop, which then runs in steps of 8 until that count is used up, so
// output is written in whole groups of 8.
//
// Registers: x0 = sumsq, x1 = sum, x2 = left (may be NULL), x3 = src,
// w4 = w, w5 = edges (bit 0 = LR_HAVE_LEFT, bit 1 = LR_HAVE_RIGHT).
// Overwrites x13, v0-v3, v16, v17, v20, v21, v26, v27, v29, v30 and the
// flags; x0, x1, x3 and w4 are modified and not restored.
974*c0909341SAndroid Build Coastguard Workerfunction sgr_box3_row_h_8bpc_neon, export=1
975*c0909341SAndroid Build Coastguard Worker        add             w4,  w4,  #2 // w += 2
976*c0909341SAndroid Build Coastguard Worker
977*c0909341SAndroid Build Coastguard Worker        tst             w5,  #1 // LR_HAVE_LEFT
978*c0909341SAndroid Build Coastguard Worker        b.eq            1f
979*c0909341SAndroid Build Coastguard Worker        cbnz            x2,  0f
980*c0909341SAndroid Build Coastguard Worker
981*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT && left == NULL
982*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #2 // The 2 left pixels are read directly from src[-2]
983*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b}, [x3], #16
984*c0909341SAndroid Build Coastguard Worker        b               2f
985*c0909341SAndroid Build Coastguard Worker
986*c0909341SAndroid Build Coastguard Worker0:
987*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT, left != NULL
988*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b},  [x3], #16
989*c0909341SAndroid Build Coastguard Worker        ld1             {v1.s}[3], [x2] // 4 left bytes into the top lane; the ext below keeps the last 2
990*c0909341SAndroid Build Coastguard Worker        // Move x3 back to account for the last 2 bytes we loaded earlier,
991*c0909341SAndroid Build Coastguard Worker        // which we'll shift out.
992*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #2
993*c0909341SAndroid Build Coastguard Worker        ext             v0.16b, v1.16b, v0.16b, #14
994*c0909341SAndroid Build Coastguard Worker        b               2f
995*c0909341SAndroid Build Coastguard Worker
996*c0909341SAndroid Build Coastguard Worker1:
997*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b}, [x3], #16
998*c0909341SAndroid Build Coastguard Worker        // !LR_HAVE_LEFT, fill v1 with the leftmost byte
999*c0909341SAndroid Build Coastguard Worker        // and shift v0 to have 2x the first byte at the front.
1000*c0909341SAndroid Build Coastguard Worker        dup             v1.16b, v0.b[0]
1001*c0909341SAndroid Build Coastguard Worker        // Move x3 back to account for the last 2 bytes we loaded before,
1002*c0909341SAndroid Build Coastguard Worker        // which we shifted out.
1003*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #2
1004*c0909341SAndroid Build Coastguard Worker        ext             v0.16b, v1.16b, v0.16b, #14
1005*c0909341SAndroid Build Coastguard Worker
1006*c0909341SAndroid Build Coastguard Worker2:
        // Precalculate the squares of all 16 bytes in v0: v1 holds those
        // of bytes 0-7 and v2 those of bytes 8-15, as 16-bit values.
1007*c0909341SAndroid Build Coastguard Worker        umull           v1.8h,   v0.8b,   v0.8b
1008*c0909341SAndroid Build Coastguard Worker        umull2          v2.8h,   v0.16b,  v0.16b
1009*c0909341SAndroid Build Coastguard Worker
1010*c0909341SAndroid Build Coastguard Worker        tst             w5,  #2 // LR_HAVE_RIGHT
1011*c0909341SAndroid Build Coastguard Worker        b.ne            4f
1012*c0909341SAndroid Build Coastguard Worker        // If we'll need to pad the right edge, load that byte to pad with
1013*c0909341SAndroid Build Coastguard Worker        // here since we can find it pretty easily from here.
        // x3 is 14 bytes past the original src in all three paths above and
        // w4 is w + 2, so x3[w4 - 17] is src[w - 1].
1014*c0909341SAndroid Build Coastguard Worker        sub             w13, w4, #(2 + 16 - 2 + 1)
1015*c0909341SAndroid Build Coastguard Worker        ldr             b30, [x3,  w13, sxtw]
1016*c0909341SAndroid Build Coastguard Worker        // Fill v30 with the right padding pixel
1017*c0909341SAndroid Build Coastguard Worker        dup             v30.16b, v30.b[0]
1018*c0909341SAndroid Build Coastguard Worker3:      // !LR_HAVE_RIGHT
1019*c0909341SAndroid Build Coastguard Worker
1020*c0909341SAndroid Build Coastguard Worker        // Check whether we need to pad the right edge
1021*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #10
1022*c0909341SAndroid Build Coastguard Worker        b.ge            4f   // If w >= 10, all used input pixels are valid
1023*c0909341SAndroid Build Coastguard Worker
1024*c0909341SAndroid Build Coastguard Worker        // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called
1025*c0909341SAndroid Build Coastguard Worker        // again; it's not strictly needed in those cases (we pad enough here),
1026*c0909341SAndroid Build Coastguard Worker        // but keeping the code as simple as possible.
1027*c0909341SAndroid Build Coastguard Worker
1028*c0909341SAndroid Build Coastguard Worker        // Insert padding in v0.b[w] onwards
1029*c0909341SAndroid Build Coastguard Worker        movrel          x13, right_ext_mask
1030*c0909341SAndroid Build Coastguard Worker        sub             x13, x13, w4,  uxtw // Start the mask load w4 bytes before the label
1031*c0909341SAndroid Build Coastguard Worker        ld1             {v29.16b}, [x13]
1032*c0909341SAndroid Build Coastguard Worker
1033*c0909341SAndroid Build Coastguard Worker        bit             v0.16b,  v30.16b, v29.16b // Take the padding byte where the mask is set
1034*c0909341SAndroid Build Coastguard Worker
1035*c0909341SAndroid Build Coastguard Worker        // Update the precalculated squares
1036*c0909341SAndroid Build Coastguard Worker        umull           v1.8h,   v0.8b,   v0.8b
1037*c0909341SAndroid Build Coastguard Worker        umull2          v2.8h,   v0.16b,  v0.16b
1038*c0909341SAndroid Build Coastguard Worker
1039*c0909341SAndroid Build Coastguard Worker4:      // Loop horizontally
        // v3 = b[i] + b[i+1] + b[i+2] for i = 0..7, widened to 16 bit.
1040*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v0.16b,  v0.16b, #1
1041*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v0.16b,  v0.16b, #2
1042*c0909341SAndroid Build Coastguard Worker        uaddl           v3.8h,   v0.8b,   v16.8b
1043*c0909341SAndroid Build Coastguard Worker        ext             v20.16b, v1.16b,  v2.16b, #2 // Squares shifted by one element
1044*c0909341SAndroid Build Coastguard Worker        uaddw           v3.8h,   v3.8h,   v17.8b
1045*c0909341SAndroid Build Coastguard Worker
1046*c0909341SAndroid Build Coastguard Worker        ext             v21.16b, v1.16b,  v2.16b, #4 // Squares shifted by two elements
1047*c0909341SAndroid Build Coastguard Worker
        // v26/v27 = sq[i] + sq[i+1] + sq[i+2] for i = 0..7, widened to 32 bit.
1048*c0909341SAndroid Build Coastguard Worker        uaddl           v26.4s,  v1.4h,   v20.4h
1049*c0909341SAndroid Build Coastguard Worker        uaddl2          v27.4s,  v1.8h,   v20.8h
1050*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v21.4h
1051*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v21.8h
1052*c0909341SAndroid Build Coastguard Worker
1053*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #8
1054*c0909341SAndroid Build Coastguard Worker
        // The stores leave the flags from subs intact for the b.le below.
1055*c0909341SAndroid Build Coastguard Worker        st1             {v3.8h},         [x1],  #16
1056*c0909341SAndroid Build Coastguard Worker        st1             {v26.4s,v27.4s}, [x0],  #32
1057*c0909341SAndroid Build Coastguard Worker
1058*c0909341SAndroid Build Coastguard Worker        b.le            9f
        // Slide the window by 8 pixels: the upper half of v0 moves down and
        // 8 new bytes are appended; the squares move the same way. None of
        // the instructions between the tst and the b.ne modify the flags.
1059*c0909341SAndroid Build Coastguard Worker        tst             w5,  #2 // LR_HAVE_RIGHT
1060*c0909341SAndroid Build Coastguard Worker        ld1             {v3.8b},  [x3],  #8
1061*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v2.16b
1062*c0909341SAndroid Build Coastguard Worker        ext             v0.16b,  v0.16b,  v3.16b, #8
1063*c0909341SAndroid Build Coastguard Worker        umull           v2.8h,   v3.8b,   v3.8b
1064*c0909341SAndroid Build Coastguard Worker
1065*c0909341SAndroid Build Coastguard Worker        b.ne            4b // If we don't need to pad, just keep summing.
1066*c0909341SAndroid Build Coastguard Worker        b               3b // If we need to pad, check how many pixels we have left.
1067*c0909341SAndroid Build Coastguard Worker
1068*c0909341SAndroid Build Coastguard Worker9:
1069*c0909341SAndroid Build Coastguard Worker        ret
1070*c0909341SAndroid Build Coastguard Workerendfunc
1071*c0909341SAndroid Build Coastguard Worker
1072*c0909341SAndroid Build Coastguard Worker// void dav1d_sgr_box5_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
1073*c0909341SAndroid Build Coastguard Worker//                                     const pixel (*left)[4],
1074*c0909341SAndroid Build Coastguard Worker//                                     const pixel *src, const int w,
1075*c0909341SAndroid Build Coastguard Worker//                                     const enum LrEdgeFlags edges);
1076*c0909341SAndroid Build Coastguard Workerfunction sgr_box5_row_h_8bpc_neon, export=1
1077*c0909341SAndroid Build Coastguard Worker        add             w4,  w4,  #2 // w += 2
1078*c0909341SAndroid Build Coastguard Worker
1079*c0909341SAndroid Build Coastguard Worker        tst             w5,  #1 // LR_HAVE_LEFT
1080*c0909341SAndroid Build Coastguard Worker        b.eq            1f
1081*c0909341SAndroid Build Coastguard Worker        cbnz            x2,  0f
1082*c0909341SAndroid Build Coastguard Worker
1083*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT && left == NULL
1084*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #3
1085*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b}, [x3], #16
1086*c0909341SAndroid Build Coastguard Worker        b               2f
1087*c0909341SAndroid Build Coastguard Worker
1088*c0909341SAndroid Build Coastguard Worker0:
1089*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT, left != NULL
1090*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b},  [x3], #16
1091*c0909341SAndroid Build Coastguard Worker        ld1             {v1.s}[3], [x2], #4
1092*c0909341SAndroid Build Coastguard Worker        // Move x3 back to account for the last 3 bytes we loaded earlier,
1093*c0909341SAndroid Build Coastguard Worker        // which we'll shift out.
1094*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #3
1095*c0909341SAndroid Build Coastguard Worker        ext             v0.16b, v1.16b, v0.16b, #13
1096*c0909341SAndroid Build Coastguard Worker        b               2f
1097*c0909341SAndroid Build Coastguard Worker
1098*c0909341SAndroid Build Coastguard Worker1:
1099*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b}, [x3], #16
1100*c0909341SAndroid Build Coastguard Worker        // !LR_HAVE_LEFT, fill v1 with the leftmost byte
1101*c0909341SAndroid Build Coastguard Worker        // and shift v0 to have 3x the first byte at the front.
1102*c0909341SAndroid Build Coastguard Worker        dup             v1.16b, v0.b[0]
1103*c0909341SAndroid Build Coastguard Worker        // Move x3 back to account for the last 3 bytes we loaded before,
1104*c0909341SAndroid Build Coastguard Worker        // which we shifted out.
1105*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #3
1106*c0909341SAndroid Build Coastguard Worker        ext             v0.16b, v1.16b, v0.16b, #13
1107*c0909341SAndroid Build Coastguard Worker
1108*c0909341SAndroid Build Coastguard Worker2:
1109*c0909341SAndroid Build Coastguard Worker        umull           v1.8h,   v0.8b,   v0.8b
1110*c0909341SAndroid Build Coastguard Worker        umull2          v2.8h,   v0.16b,  v0.16b
1111*c0909341SAndroid Build Coastguard Worker
1112*c0909341SAndroid Build Coastguard Worker        tst             w5,  #2 // LR_HAVE_RIGHT
1113*c0909341SAndroid Build Coastguard Worker        b.ne            4f
1114*c0909341SAndroid Build Coastguard Worker        // If we'll need to pad the right edge, load that byte to pad with
1115*c0909341SAndroid Build Coastguard Worker        // here since we can find it pretty easily from here.
1116*c0909341SAndroid Build Coastguard Worker        sub             w13, w4, #(2 + 16 - 3 + 1)
1117*c0909341SAndroid Build Coastguard Worker        ldr             b30, [x3,  w13, sxtw]
1118*c0909341SAndroid Build Coastguard Worker        // Fill v30 with the right padding pixel
1119*c0909341SAndroid Build Coastguard Worker        dup             v30.16b, v30.b[0]
1120*c0909341SAndroid Build Coastguard Worker3:      // !LR_HAVE_RIGHT
1121*c0909341SAndroid Build Coastguard Worker
1122*c0909341SAndroid Build Coastguard Worker        // Check whether we need to pad the right edge
1123*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #11
1124*c0909341SAndroid Build Coastguard Worker        b.ge            4f   // If w >= 11, all used input pixels are valid
1125*c0909341SAndroid Build Coastguard Worker
1126*c0909341SAndroid Build Coastguard Worker        // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
1127*c0909341SAndroid Build Coastguard Worker        // this ends up called again; it's not strictly needed in those
1128*c0909341SAndroid Build Coastguard Worker        // cases (we pad enough here), but keeping the code as simple as possible.
1129*c0909341SAndroid Build Coastguard Worker
1130*c0909341SAndroid Build Coastguard Worker        // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
1131*c0909341SAndroid Build Coastguard Worker        // buffer pointer.
1132*c0909341SAndroid Build Coastguard Worker        movrel          x13, right_ext_mask, -1
1133*c0909341SAndroid Build Coastguard Worker        sub             x13, x13, w4,  uxtw
1134*c0909341SAndroid Build Coastguard Worker        ld1             {v29.16b}, [x13]
1135*c0909341SAndroid Build Coastguard Worker
1136*c0909341SAndroid Build Coastguard Worker        bit             v0.16b,  v30.16b, v29.16b
1137*c0909341SAndroid Build Coastguard Worker
1138*c0909341SAndroid Build Coastguard Worker        // Update the precalculated squares
1139*c0909341SAndroid Build Coastguard Worker        umull           v1.8h,   v0.8b,   v0.8b
1140*c0909341SAndroid Build Coastguard Worker        umull2          v2.8h,   v0.16b,  v0.16b
1141*c0909341SAndroid Build Coastguard Worker
1142*c0909341SAndroid Build Coastguard Worker4:      // Loop horizontally
1143*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v0.16b,  v0.16b, #1
1144*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v0.16b,  v0.16b, #2
1145*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v0.16b,  v0.16b, #3
1146*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v0.16b,  v0.16b, #4
1147*c0909341SAndroid Build Coastguard Worker        uaddl           v3.8h,   v0.8b,   v16.8b
1148*c0909341SAndroid Build Coastguard Worker        uaddl           v24.8h,  v17.8b,  v18.8b
1149*c0909341SAndroid Build Coastguard Worker        uaddw           v3.8h,   v3.8h,   v19.8b
1150*c0909341SAndroid Build Coastguard Worker        add             v3.8h,   v3.8h,   v24.8h
1151*c0909341SAndroid Build Coastguard Worker
1152*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v1.16b,  v2.16b, #2
1153*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v1.16b,  v2.16b, #4
1154*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v1.16b,  v2.16b, #6
1155*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v1.16b,  v2.16b, #8
1156*c0909341SAndroid Build Coastguard Worker
1157*c0909341SAndroid Build Coastguard Worker        uaddl           v26.4s,  v1.4h,   v16.4h
1158*c0909341SAndroid Build Coastguard Worker        uaddl2          v27.4s,  v1.8h,   v16.8h
1159*c0909341SAndroid Build Coastguard Worker        uaddl           v16.4s,  v17.4h,  v18.4h
1160*c0909341SAndroid Build Coastguard Worker        uaddl2          v17.4s,  v17.8h,  v18.8h
1161*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v19.4h
1162*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v19.8h
1163*c0909341SAndroid Build Coastguard Worker        add             v26.4s,  v26.4s,  v16.4s
1164*c0909341SAndroid Build Coastguard Worker        add             v27.4s,  v27.4s,  v17.4s
1165*c0909341SAndroid Build Coastguard Worker
1166*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #8
1167*c0909341SAndroid Build Coastguard Worker
1168*c0909341SAndroid Build Coastguard Worker        st1             {v3.8h},         [x1],  #16
1169*c0909341SAndroid Build Coastguard Worker        st1             {v26.4s,v27.4s}, [x0],  #32
1170*c0909341SAndroid Build Coastguard Worker
1171*c0909341SAndroid Build Coastguard Worker        b.le            9f
1172*c0909341SAndroid Build Coastguard Worker        tst             w5,  #2 // LR_HAVE_RIGHT
1173*c0909341SAndroid Build Coastguard Worker        ld1             {v3.8b},  [x3],  #8
1174*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v2.16b
1175*c0909341SAndroid Build Coastguard Worker        ext             v0.16b,  v0.16b,  v3.16b, #8
1176*c0909341SAndroid Build Coastguard Worker        umull           v2.8h,   v3.8b,   v3.8b
1177*c0909341SAndroid Build Coastguard Worker
1178*c0909341SAndroid Build Coastguard Worker        b.ne            4b // If we don't need to pad, just keep summing.
1179*c0909341SAndroid Build Coastguard Worker        b               3b // If we need to pad, check how many pixels we have left.
1180*c0909341SAndroid Build Coastguard Worker
1181*c0909341SAndroid Build Coastguard Worker9:
1182*c0909341SAndroid Build Coastguard Worker        ret
1183*c0909341SAndroid Build Coastguard Workerendfunc
1184*c0909341SAndroid Build Coastguard Worker
1185*c0909341SAndroid Build Coastguard Worker// void dav1d_sgr_box35_row_h_8bpc_neon(int32_t *sumsq3, int16_t *sum3,
1186*c0909341SAndroid Build Coastguard Worker//                                      int32_t *sumsq5, int16_t *sum5,
1187*c0909341SAndroid Build Coastguard Worker//                                      const pixel (*left)[4],
1188*c0909341SAndroid Build Coastguard Worker//                                      const pixel *src, const int w,
1189*c0909341SAndroid Build Coastguard Worker//                                      const enum LrEdgeFlags edges);
//
// Horizontal pass over one row of 8 bpc pixels that produces both the 3-tap
// and the 5-tap box sums, and the matching sums of squares, in one sweep.
//
// Register use (arguments arrive in x0-x7 in prototype order):
//   x0 = sumsq3 (32-bit sums of squares, 3 taps)   x1 = sum3 (16-bit sums)
//   x2 = sumsq5 (32-bit sums of squares, 5 taps)   x3 = sum5 (16-bit sums)
//   x4 = left (may be NULL)   x5 = src   w6 = w   w7 = edges
//   v0     = the 16 pixels currently being processed
//   v1, v2 = squares of the low/high 8 bytes of v0, as 16-bit lanes
//   v30    = right padding pixel (only set up when LR_HAVE_RIGHT is clear)
// Scratch: x13, v3, v16-v19, v24, v26, v27, v29.
//
// v0 starts 3 pixels to the left of src. Output i of the 5-tap sums covers
// v0 bytes i..i+4 and output i of the 3-tap sums covers bytes i+1..i+3.
// Each iteration writes 8 elements to each of the four output buffers, and
// the loop runs until the (w + 2) counter in w6 drops to zero or below, so
// the outputs are written in whole groups of 8 elements.
1190*c0909341SAndroid Build Coastguard Workerfunction sgr_box35_row_h_8bpc_neon, export=1
1191*c0909341SAndroid Build Coastguard Worker        add             w6,  w6,  #2 // w += 2
1192*c0909341SAndroid Build Coastguard Worker
1193*c0909341SAndroid Build Coastguard Worker        tst             w7,  #1 // LR_HAVE_LEFT
1194*c0909341SAndroid Build Coastguard Worker        b.eq            1f
1195*c0909341SAndroid Build Coastguard Worker        cbnz            x4,  0f
1196*c0909341SAndroid Build Coastguard Worker
1197*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT && left == NULL
        // The 3 left pixels are read directly from the bytes preceding src.
1198*c0909341SAndroid Build Coastguard Worker        sub             x5,  x5,  #3
1199*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b},  [x5], #16
1200*c0909341SAndroid Build Coastguard Worker        b               2f
1201*c0909341SAndroid Build Coastguard Worker
1202*c0909341SAndroid Build Coastguard Worker0:
1203*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT, left != NULL
        // The 4 bytes at left go into the top lane of v1; the ext below keeps
        // the last 3 of them in front of the first 13 bytes loaded from src.
1204*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b},  [x5], #16
1205*c0909341SAndroid Build Coastguard Worker        ld1             {v1.s}[3], [x4], #4
1206*c0909341SAndroid Build Coastguard Worker        // Move x5 back to account for the last 3 bytes we loaded earlier,
1207*c0909341SAndroid Build Coastguard Worker        // which we'll shift out.
1208*c0909341SAndroid Build Coastguard Worker        sub             x5,  x5,  #3
1209*c0909341SAndroid Build Coastguard Worker        ext             v0.16b, v1.16b, v0.16b, #13
1210*c0909341SAndroid Build Coastguard Worker        b               2f
1211*c0909341SAndroid Build Coastguard Worker
1212*c0909341SAndroid Build Coastguard Worker1:
1213*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b}, [x5], #16
1214*c0909341SAndroid Build Coastguard Worker        // !LR_HAVE_LEFT, fill v1 with the leftmost byte
1215*c0909341SAndroid Build Coastguard Worker        // and shift v0 to have 3x the first byte at the front.
1216*c0909341SAndroid Build Coastguard Worker        dup             v1.16b, v0.b[0]
1217*c0909341SAndroid Build Coastguard Worker        // Move x5 back to account for the last 3 bytes we loaded before,
1218*c0909341SAndroid Build Coastguard Worker        // which the ext below shifts out.
1219*c0909341SAndroid Build Coastguard Worker        sub             x5,  x5,  #3
1220*c0909341SAndroid Build Coastguard Worker        ext             v0.16b, v1.16b, v0.16b, #13
1221*c0909341SAndroid Build Coastguard Worker
1222*c0909341SAndroid Build Coastguard Worker2:
        // All three entry paths arrive here with v0 holding the pixels from
        // 3 left of src onwards and x5 pointing at src + 13.
        // Precalculate the squares of all 16 pixels as 16-bit values.
1223*c0909341SAndroid Build Coastguard Worker        umull           v1.8h,   v0.8b,   v0.8b
1224*c0909341SAndroid Build Coastguard Worker        umull2          v2.8h,   v0.16b,  v0.16b
1225*c0909341SAndroid Build Coastguard Worker
1226*c0909341SAndroid Build Coastguard Worker        tst             w7,  #2 // LR_HAVE_RIGHT
1227*c0909341SAndroid Build Coastguard Worker        b.ne            4f
1228*c0909341SAndroid Build Coastguard Worker        // If we'll need to pad the right edge, load that byte to pad with
1229*c0909341SAndroid Build Coastguard Worker        // here since we can find it pretty easily from here.
        // With x5 = src + 13 and w6 = w + 2, x5 + w6 - 16 addresses
        // src[w - 1], i.e. the last valid pixel of the row.
1230*c0909341SAndroid Build Coastguard Worker        sub             w13, w6, #(2 + 16 - 3 + 1)
1231*c0909341SAndroid Build Coastguard Worker        ldr             b30, [x5,  w13, sxtw]
1232*c0909341SAndroid Build Coastguard Worker        // Fill v30 with the right padding pixel
1233*c0909341SAndroid Build Coastguard Worker        dup             v30.16b, v30.b[0]
1234*c0909341SAndroid Build Coastguard Worker3:      // !LR_HAVE_RIGHT
1235*c0909341SAndroid Build Coastguard Worker
1236*c0909341SAndroid Build Coastguard Worker        // Check whether we need to pad the right edge
1237*c0909341SAndroid Build Coastguard Worker        cmp             w6,  #11
1238*c0909341SAndroid Build Coastguard Worker        b.ge            4f   // If w >= 11, all used input pixels are valid
1239*c0909341SAndroid Build Coastguard Worker
1240*c0909341SAndroid Build Coastguard Worker        // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
1241*c0909341SAndroid Build Coastguard Worker        // this ends up called again; it's not strictly needed in those
1242*c0909341SAndroid Build Coastguard Worker        // cases (we pad enough here), but keeping the code as simple as possible.
        // (Here "w" is the remaining count held in w6, not the original
        // argument.)
1243*c0909341SAndroid Build Coastguard Worker
1244*c0909341SAndroid Build Coastguard Worker        // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
1245*c0909341SAndroid Build Coastguard Worker        // buffer pointer.
        // NOTE(review): right_ext_mask is defined outside this chunk;
        // presumably the bytes before it are zero and the bytes from it
        // onwards are all-ones, so the mask selects lanes w+1 and up.
1246*c0909341SAndroid Build Coastguard Worker        movrel          x13, right_ext_mask, -1
1247*c0909341SAndroid Build Coastguard Worker        sub             x13, x13, w6,  uxtw
1248*c0909341SAndroid Build Coastguard Worker        ld1             {v29.16b}, [x13]
1249*c0909341SAndroid Build Coastguard Worker
        // Copy the padding pixel from v30 into v0 wherever the mask is set.
1250*c0909341SAndroid Build Coastguard Worker        bit             v0.16b,  v30.16b, v29.16b
1251*c0909341SAndroid Build Coastguard Worker
1252*c0909341SAndroid Build Coastguard Worker        // Update the precalculated squares
1253*c0909341SAndroid Build Coastguard Worker        umull           v1.8h,   v0.8b,   v0.8b
1254*c0909341SAndroid Build Coastguard Worker        umull2          v2.8h,   v0.16b,  v0.16b
1255*c0909341SAndroid Build Coastguard Worker
1256*c0909341SAndroid Build Coastguard Worker4:      // Loop horizontally
        // Pixel sums: v3 = taps 1..3 (the 3-tap sum), v24 = taps 0 and 4
        // (the extra taps needed for the 5-tap sum).
1257*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v0.16b,  v0.16b, #1
1258*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v0.16b,  v0.16b, #2
1259*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v0.16b,  v0.16b, #4
1260*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v0.16b,  v0.16b, #3
1261*c0909341SAndroid Build Coastguard Worker        uaddl           v3.8h,   v16.8b,  v17.8b
1262*c0909341SAndroid Build Coastguard Worker        uaddl           v24.8h,  v0.8b,   v19.8b
1263*c0909341SAndroid Build Coastguard Worker        uaddw           v3.8h,   v3.8h,   v18.8b
1264*c0909341SAndroid Build Coastguard Worker
        // Same tap offsets on the 16-bit squares (2 bytes per element).
1265*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v1.16b,  v2.16b, #2
1266*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v1.16b,  v2.16b, #4
1267*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v1.16b,  v2.16b, #8
1268*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v1.16b,  v2.16b, #6
1269*c0909341SAndroid Build Coastguard Worker
        // Store the 3-tap sum to sum3 first, then extend v3 in place to the
        // 5-tap sum (stored to sum5 further down).
1270*c0909341SAndroid Build Coastguard Worker        st1             {v3.8h},         [x1], #16
1271*c0909341SAndroid Build Coastguard Worker        add             v3.8h,   v3.8h,   v24.8h
1272*c0909341SAndroid Build Coastguard Worker
        // Squares: v26/v27 = taps 1..3 widened to 32 bit, v16/v17 = taps 0
        // and 4.
1273*c0909341SAndroid Build Coastguard Worker        uaddl           v26.4s,  v16.4h,  v17.4h
1274*c0909341SAndroid Build Coastguard Worker        uaddl2          v27.4s,  v16.8h,  v17.8h
1275*c0909341SAndroid Build Coastguard Worker        uaddl           v16.4s,  v1.4h,   v19.4h
1276*c0909341SAndroid Build Coastguard Worker        uaddl2          v17.4s,  v1.8h,   v19.8h
1277*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v18.4h
1278*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v18.8h
1279*c0909341SAndroid Build Coastguard Worker
        // Store the 3-tap sums of squares to sumsq3, then extend to 5 taps.
1280*c0909341SAndroid Build Coastguard Worker        st1             {v26.4s,v27.4s}, [x0], #32
1281*c0909341SAndroid Build Coastguard Worker        add             v26.4s,  v26.4s,  v16.4s
1282*c0909341SAndroid Build Coastguard Worker        add             v27.4s,  v27.4s,  v17.4s
1283*c0909341SAndroid Build Coastguard Worker
        // 8 outputs done; the flags set here drive the b.le exit check below.
1284*c0909341SAndroid Build Coastguard Worker        subs            w6,  w6,  #8
1285*c0909341SAndroid Build Coastguard Worker
        // Store the 5-tap results to sum5 and sumsq5.
1286*c0909341SAndroid Build Coastguard Worker        st1             {v3.8h},         [x3], #16
1287*c0909341SAndroid Build Coastguard Worker        st1             {v26.4s,v27.4s}, [x2], #32
1288*c0909341SAndroid Build Coastguard Worker
1289*c0909341SAndroid Build Coastguard Worker        b.le            9f
        // The flags from this tst are consumed by the b.ne below; the vector
        // instructions in between do not modify them.
1290*c0909341SAndroid Build Coastguard Worker        tst             w7,  #2 // LR_HAVE_RIGHT
        // Slide the window by 8 pixels: the upper half of v0 becomes the
        // lower half, 8 new bytes fill the upper half, and the squares in
        // v1/v2 follow along.
1291*c0909341SAndroid Build Coastguard Worker        ld1             {v3.8b},  [x5],  #8
1292*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v2.16b
1293*c0909341SAndroid Build Coastguard Worker        ext             v0.16b,  v0.16b,  v3.16b, #8
1294*c0909341SAndroid Build Coastguard Worker        umull           v2.8h,   v3.8b,   v3.8b
1295*c0909341SAndroid Build Coastguard Worker
1296*c0909341SAndroid Build Coastguard Worker        b.ne            4b // If we don't need to pad, just keep summing.
1297*c0909341SAndroid Build Coastguard Worker        b               3b // If we need to pad, check how many pixels we have left.
1298*c0909341SAndroid Build Coastguard Worker
1299*c0909341SAndroid Build Coastguard Worker9:
1300*c0909341SAndroid Build Coastguard Worker        ret
1301*c0909341SAndroid Build Coastguard Workerendfunc
1302*c0909341SAndroid Build Coastguard Worker
// Expand the sgr_funcs macro with argument 8. The macro is defined outside
// this chunk; presumably this instantiates the remaining SGR functions for
// the 8 bpc case -- confirm against the macro definition.
1303*c0909341SAndroid Build Coastguard Workersgr_funcs 8
1304