xref: /aosp_15_r20/external/libdav1d/src/arm/64/looprestoration16.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
27*c0909341SAndroid Build Coastguard Worker
28*c0909341SAndroid Build Coastguard Worker#include "src/arm/asm.S"
29*c0909341SAndroid Build Coastguard Worker#include "util.S"
30*c0909341SAndroid Build Coastguard Worker
// Byte-mask table used to pad a row past its right edge.
//
// Layout: 48 bytes of 0x00 starting at right_ext_mask_buf, immediately
// followed by 48 bytes of 0xff starting at right_ext_mask.
//
// wiener_filter7_h_16bpc_neon loads 48 bytes (three q registers) starting at
// right_ext_mask - 6 - 2*w on its padding path (taken only when w < 19).
// The bytes read from below the right_ext_mask label are zero and the bytes
// read from it onwards are 0xff, so a following BIT replaces exactly the
// lanes h[w+3] onwards with the padding pixel and leaves the earlier lanes
// untouched. The two halves must stay contiguous and in this order.
const right_ext_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
right_ext_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst

// void dav1d_wiener_filter7_16bpc_neon(pixel *p, const ptrdiff_t p_stride,
//                                      const pixel (*left)[4], const pixel *lpf,
//                                      const int w, int h,
//                                      const int16_t filter[2][8],
//                                      const enum LrEdgeFlags edges,
//                                      const int bitdepth_max);
//
// Driver for the 7-tap Wiener filter on 16 bpc pixels. It sets up the shared
// constants, reserves a ring of intermediate row buffers on the stack and
// then sequences the per-row helpers (_h: horizontal only, _hv: horizontal +
// vertical, _v: vertical only) according to the available edges.
//
// Arguments as consumed by this code:
//   x0 = p            x1 = p_stride     x2 = left        x3 = lpf
//   w4 = w            w5 = h            x6 = filter      w7 = edges
//   [sp] = bitdepth_max (read before the frame is pushed)
//
// Edge flag bits tested in this file:
//   #1 LR_HAVE_LEFT, #2 LR_HAVE_RIGHT, #4 LR_HAVE_TOP, #8 LR_HAVE_BOTTOM
//
// Constants kept live for the helpers:
//   v0.h[0-3] = filter[0][0..3] (horizontal), v0.h[4-7] = filter[1][0..3]
//               (vertical)
//   v27.4s    = -round_bits_v
//   v28.8h    = bitdepth_max
//   v29.4s    = -round_bits_h
//   v30.4s    = 1 << (bitdepth + 6)
//   v31.8h    = 1 << 13
//
// Stack: x29/x30 and d8/d9 are saved in a 32 byte frame; below that,
// 384*2*6 bytes of scratch hold the intermediate rows (each row pointer
// advances in steps of 384*2 bytes). x29 is used to restore sp on exit.
function wiener_filter7_16bpc_neon, export=1
        ldr             w8,  [sp]
        AARCH64_SIGN_LINK_REGISTER
        stp             x29, x30, [sp, #-32]!
        stp             d8,  d9,  [sp, #16]
        mov             x29, sp
        ld1             {v0.8h, v1.8h},  [x6]
        // The flags set here are consumed by the b.eq L(no_top_7) further
        // down. None of the instructions in between set flags (sub_sp is a
        // macro defined outside this file - assumed to preserve flags).
        tst             w7,  #4               // LR_HAVE_TOP
        sub_sp          384*2*6

        // Derive the bitdepth dependent shifts from clz(bitdepth_max).
        dup             v28.8h,  w8           // bitdepth_max
        clz             w8,  w8
        movi            v30.4s,  #1
        sub             w10, w8,  #38         // -(bitdepth + 6)
        sub             w11, w8,  #11         // round_bits_v
        sub             w8,  w8,  #25         // -round_bits_h
        neg             w10, w10              // bitdepth + 6
        neg             w11, w11              // -round_bits_v
        dup             v2.4s,   w10
        dup             v29.4s,  w8           // -round_bits_h
        dup             v27.4s,  w11          // -round_bits_v
        movi            v31.8h,  #0x20, lsl #8  // 1 << 13 = 8192
        ushl            v30.4s,  v30.4s,  v2.4s // 1 << (bitdepth + 6)

        zip1            v0.2d,   v0.2d,   v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1

        // Row buffer pointers, oldest (t6) to newest (t0):
        // x9  - t6
        // x10 - t5
        // x11 - t4
        // x12 - t3
        // x13 - t2
        // x14 - t1
        // x15 - t0
        mov             x14, sp               // t1
        b.eq            L(no_top_7)

        // LR_HAVE_TOP: filter the two rows above the block from lpf first.
        // left is not applicable to lpf rows, so pass NULL while keeping the
        // real pointer in x16.
        mov             x16, x2               // backup left
        mov             x2,  #0
        bl              wiener_filter7_h_16bpc_neon
        add             x3,  x3,  x1          // lpf += stride
        mov             x9,  x14              // t6
        mov             x10, x14              // t5
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_16bpc_neon
        add             x3,  x3,  x1,  lsl #2
        add             x3,  x3,  x1          // lpf += stride*5
        mov             x11, x14              // t4
        add             x14, x14, #384*2      // t1 += 384*2
        // From here on x16 holds the lpf pointer for the bottom rows and
        // x3 walks the rows of p.
        mov             x2,  x16              // left
        mov             x16, x3               // backup lpf
        mov             x3,  x0               // lpf = p
        bl              wiener_filter7_h_16bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x12, x14              // t3
        mov             x13, x14              // t2
        b.eq            L(v1_7)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_16bpc_neon
        mov             x13, x14              // t2
        subs            w5,  w5,  #1          // h--
        b.eq            L(v2_7)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v3_7)
        add             x3,  x3,  x1          // src += stride

L(main_7):
        add             x15, x14, #384*2      // t0 = t1 + 384*2
L(main_loop_7):
        // One call per remaining row of p. The helper is defined outside the
        // visible part of this file; it presumably advances x3 and rotates
        // t6..t0 itself - confirm against its definition.
        bl              wiener_filter7_hv_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.ne            L(main_loop_7)
        tst             w7,  #8 // LR_HAVE_BOTTOM
        b.eq            L(v3_7)

        // LR_HAVE_BOTTOM: feed two more rows from the saved lpf pointer,
        // then a single vertical-only pass remains.
        mov             x3,  x16              // restore lpf
        mov             x2,  #0               // left = NULL
        bl              wiener_filter7_hv_16bpc_neon
        bl              wiener_filter7_hv_16bpc_neon
L(v1_7):
        // L(vN_7): N vertical-only passes remain before returning.
        bl              wiener_filter7_v_16bpc_neon

        mov             sp,  x29
        ldp             d8,  d9,  [sp, #16]
        ldp             x29, x30, [sp], #32
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(no_top_7):
        // No rows above: only compute the bottom lpf pointer (lpf +
        // stride*6), then start directly on p. The first filtered row stands
        // in for all of t6..t2.
        add             x3,  x3,  x1,  lsl #2
        add             x16, x3,  x1,  lsl #1 // lpf += stride*6, backup
        mov             x3,  x0               // lpf = p

        bl              wiener_filter7_h_16bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x9,  x14              // t6
        mov             x10, x14              // t5
        mov             x11, x14              // t4
        mov             x12, x14              // t3
        mov             x13, x14              // t2
        b.eq            L(v1_7)
        add             x3,  x3,  x1          // src += p_stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_16bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x13, x14              // t2
        b.eq            L(v2_7)
        add             x3,  x3,  x1          // src += p_stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v3_7)
        add             x3,  x3,  x1          // src += p_stride
        add             x15, x14, #384*2      // t0 = t1 + 384*2
        bl              wiener_filter7_hv_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v3_7)
        // NOTE(review): this skips t0 ahead by four rows, presumably to
        // reach buffers not yet aliased by t6..t2 after the helper's
        // rotation - confirm against wiener_filter7_hv_16bpc_neon.
        add             x15, x15, #384*2*4    // t0 += 384*2*4
        bl              wiener_filter7_hv_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.ne            L(main_7)
L(v3_7):
        bl              wiener_filter7_v_16bpc_neon
L(v2_7):
        bl              wiener_filter7_v_16bpc_neon
        b               L(v1_7)
endfunc


// Horizontal 7-tap pass over one row, writing 16 bit intermediates.
//
// In:
//   x2  = left (4 pixels per row) or NULL
//   x3  = source row
//   w4  = width in pixels
//   w7  = edge flags (#1 LR_HAVE_LEFT, #2 LR_HAVE_RIGHT)
//   x14 = destination row buffer
//   v0.h[0-3] = horizontal coefficients
//   v29.4s = -round_bits_h, v30.4s = 1 << (bitdepth + 6), v31.8h = 1 << 13
// Out:
//   [x14] receives 16 results per loop iteration, so a multiple of 16
//   results is written regardless of w.
//   x3, x4 and x14 are restored from the stack before returning.
//   x2 is advanced by 8 bytes when LR_HAVE_LEFT is set and left != NULL.
// Clobbers:
//   x6 and x17 (padding path only), v2-v4, v6, v7, v16-v21, v23-v26, flags.
function wiener_filter7_h_16bpc_neon
        stp             x3,  x4,  [sp, #-32]!
        str             x14,      [sp, #16]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             w7,  #1 // LR_HAVE_LEFT
        b.eq            1f
        // LR_HAVE_LEFT
        cbnz            x2,  0f
        // left == NULL: the 3 pixels left of the row are read straight from
        // memory in front of src.
        sub             x3,  x3,  #6
        ld1             {v2.8h, v3.8h}, [x3], #32
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL: the 4 left pixels go into the upper
        // half of v4, and the ext pair below prepends the last 3 of them.
        ld1             {v2.8h, v3.8h}, [x3], #32
        ld1             {v4.d}[1], [x2], #8
        // Move x3 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #6
        ext             v3.16b,  v2.16b,  v3.16b,  #10
        ext             v2.16b,  v4.16b,  v2.16b,  #10
        b               2f

1:
        ld1             {v2.8h, v3.8h}, [x3], #32
        // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
        // and shift v2/v3 to have 3x the first pixel at the front.
        dup             v4.8h,  v2.h[0]
        // Move x3 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #6
        ext             v3.16b,  v2.16b,  v3.16b,  #10
        ext             v2.16b,  v4.16b,  v2.16b,  #10

2:
        // v2/v3/v4 now hold 24 consecutive input pixels, starting 3 pixels
        // to the left of the first output position.
        ld1             {v4.8h}, [x3], #16

        tst             w7,  #2 // LR_HAVE_RIGHT
        b.ne            4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #19
        b.ge            4f   // If w >= 19, all used input pixels are valid

        // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
        // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
        sub             w17, w4,  #22
        // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel          x6,  right_ext_mask, -6
        ldr             h26, [x3,  w17, sxtw #1]
        sub             x6,  x6,  w4,  uxtw #1
        dup             v26.8h,  v26.h[0]
        // 48 mask bytes: 0x00 for the lanes to keep, 0xff for the lanes to
        // overwrite with the padding pixel.
        ld1             {v23.16b, v24.16b, v25.16b}, [x6]

        bit             v2.16b,  v26.16b, v23.16b
        bit             v3.16b,  v26.16b, v24.16b
        bit             v4.16b,  v26.16b, v25.16b

4:      // Loop horizontally
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, thus keeping mul/mla tightly
        // chained like this.
        //
        // The taps are applied symmetrically: inputs at offsets 0/6, 1/5
        // and 2/4 are summed first and multiplied by v0.h[0], v0.h[1] and
        // v0.h[2]; the centre input (offset 3) is multiplied by v0.h[3].
        ext             v17.16b, v2.16b,  v3.16b, #4
        ext             v19.16b, v2.16b,  v3.16b, #8
        ext             v16.16b, v2.16b,  v3.16b, #2
        ext             v20.16b, v2.16b,  v3.16b, #10
        ext             v21.16b, v2.16b,  v3.16b, #12
        ext             v18.16b, v2.16b,  v3.16b, #6
        add             v19.8h,  v19.8h,  v17.8h
        add             v20.8h,  v20.8h,  v16.8h
        add             v21.8h,  v21.8h,  v2.8h
        smull           v6.4s,   v18.4h,  v0.h[3]
        smlal           v6.4s,   v19.4h,  v0.h[2]
        smlal           v6.4s,   v20.4h,  v0.h[1]
        smlal           v6.4s,   v21.4h,  v0.h[0]
        smull2          v7.4s,   v18.8h,  v0.h[3]
        smlal2          v7.4s,   v19.8h,  v0.h[2]
        smlal2          v7.4s,   v20.8h,  v0.h[1]
        smlal2          v7.4s,   v21.8h,  v0.h[0]

        // Same computation for the second group of 8 outputs (v3/v4).
        ext             v17.16b, v3.16b,  v4.16b, #4
        ext             v19.16b, v3.16b,  v4.16b, #8
        ext             v16.16b, v3.16b,  v4.16b, #2
        ext             v20.16b, v3.16b,  v4.16b, #10
        ext             v21.16b, v3.16b,  v4.16b, #12
        ext             v18.16b, v3.16b,  v4.16b, #6

        add             v19.8h,  v19.8h,  v17.8h
        add             v20.8h,  v20.8h,  v16.8h
        add             v21.8h,  v21.8h,  v3.8h
        smull           v16.4s,  v18.4h,  v0.h[3]
        smlal           v16.4s,  v19.4h,  v0.h[2]
        smlal           v16.4s,  v20.4h,  v0.h[1]
        smlal           v16.4s,  v21.4h,  v0.h[0]
        smull2          v17.4s,  v18.8h,  v0.h[3]
        smlal2          v17.4s,  v19.8h,  v0.h[2]
        smlal2          v17.4s,  v20.8h,  v0.h[1]
        smlal2          v17.4s,  v21.8h,  v0.h[0]

        // Add the 1 << (bitdepth + 6) offset, apply a rounding right shift
        // by round_bits_h (v29 holds the negated amount), narrow with
        // unsigned saturation, clamp to 0x7fff and finally subtract 8192.
        mvni            v24.8h,  #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
        add             v6.4s,   v6.4s,   v30.4s
        add             v7.4s,   v7.4s,   v30.4s
        add             v16.4s,  v16.4s,  v30.4s
        add             v17.4s,  v17.4s,  v30.4s
        srshl           v6.4s,   v6.4s,   v29.4s
        srshl           v7.4s,   v7.4s,   v29.4s
        srshl           v16.4s,  v16.4s,  v29.4s
        srshl           v17.4s,  v17.4s,  v29.4s
        sqxtun          v6.4h,   v6.4s
        sqxtun2         v6.8h,   v7.4s
        sqxtun          v7.4h,   v16.4s
        sqxtun2         v7.8h,   v17.4s
        umin            v6.8h,   v6.8h,   v24.8h
        umin            v7.8h,   v7.8h,   v24.8h
        sub             v6.8h,   v6.8h,   v31.8h
        sub             v7.8h,   v7.8h,   v31.8h

        // The flags from this subs are consumed by b.le below; the st1 in
        // between does not affect them.
        subs            w4,  w4,  #16

        st1             {v6.8h, v7.8h}, [x14], #32

        b.le            0f
        // Slide the window: the last 8 loaded pixels become the first 8 of
        // the next iteration, and 16 fresh pixels are loaded behind them.
        mov             v2.16b,  v4.16b
        tst             w7,  #2 // LR_HAVE_RIGHT
        ld1             {v3.8h, v4.8h}, [x3], #32
        b.ne            4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

0:
        ldr             x14,      [sp, #16]
        ldp             x3,  x4,  [sp], #32
        ret
endfunc

328*c0909341SAndroid Build Coastguard Workerfunction wiener_filter7_v_16bpc_neon
329*c0909341SAndroid Build Coastguard Worker        // Backing up/restoring registers shifted, so that x9 gets the value
330*c0909341SAndroid Build Coastguard Worker        // of x10, etc, afterwards.
331*c0909341SAndroid Build Coastguard Worker        stp             x10, x11, [sp, #-64]!
332*c0909341SAndroid Build Coastguard Worker        stp             x12, x13, [sp, #16]
333*c0909341SAndroid Build Coastguard Worker        stp             x14, x14, [sp, #32]
334*c0909341SAndroid Build Coastguard Worker        stp             x0,  x4,  [sp, #48]
335*c0909341SAndroid Build Coastguard Worker1:
336*c0909341SAndroid Build Coastguard Worker        ld1             {v16.8h, v17.8h}, [x9],  #32
337*c0909341SAndroid Build Coastguard Worker        ld1             {v18.8h, v19.8h}, [x10], #32
338*c0909341SAndroid Build Coastguard Worker        ld1             {v20.8h, v21.8h}, [x11], #32
339*c0909341SAndroid Build Coastguard Worker        ld1             {v22.8h, v23.8h}, [x12], #32
340*c0909341SAndroid Build Coastguard Worker        ld1             {v24.8h, v25.8h}, [x13], #32
341*c0909341SAndroid Build Coastguard Worker        ld1             {v6.8h,  v7.8h},  [x14], #32
342*c0909341SAndroid Build Coastguard Worker
343*c0909341SAndroid Build Coastguard Worker        smull           v2.4s,   v16.4h,  v0.h[4]
344*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v18.4h,  v0.h[5]
345*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v20.4h,  v0.h[6]
346*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v22.4h,  v0.h[7]
347*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v24.4h,  v0.h[6]
348*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v6.4h,   v0.h[5]
349*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v6.4h,   v0.h[4]
350*c0909341SAndroid Build Coastguard Worker        smull2          v3.4s,   v16.8h,  v0.h[4]
351*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v18.8h,  v0.h[5]
352*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v20.8h,  v0.h[6]
353*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v22.8h,  v0.h[7]
354*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v24.8h,  v0.h[6]
355*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v6.8h,   v0.h[5]
356*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v6.8h,   v0.h[4]
357*c0909341SAndroid Build Coastguard Worker        smull           v4.4s,   v17.4h,  v0.h[4]
358*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v19.4h,  v0.h[5]
359*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v21.4h,  v0.h[6]
360*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v23.4h,  v0.h[7]
361*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v25.4h,  v0.h[6]
362*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v7.4h,   v0.h[5]
363*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v7.4h,   v0.h[4]
364*c0909341SAndroid Build Coastguard Worker        smull2          v5.4s,   v17.8h,  v0.h[4]
365*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v19.8h,  v0.h[5]
366*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v21.8h,  v0.h[6]
367*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v23.8h,  v0.h[7]
368*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v25.8h,  v0.h[6]
369*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v7.8h,   v0.h[5]
370*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v7.8h,   v0.h[4]
371*c0909341SAndroid Build Coastguard Worker        srshl           v2.4s,   v2.4s,   v27.4s  // -round_bits_v
372*c0909341SAndroid Build Coastguard Worker        srshl           v3.4s,   v3.4s,   v27.4s
373*c0909341SAndroid Build Coastguard Worker        srshl           v4.4s,   v4.4s,   v27.4s
374*c0909341SAndroid Build Coastguard Worker        srshl           v5.4s,   v5.4s,   v27.4s
375*c0909341SAndroid Build Coastguard Worker        sqxtun          v2.4h,   v2.4s
376*c0909341SAndroid Build Coastguard Worker        sqxtun2         v2.8h,   v3.4s
377*c0909341SAndroid Build Coastguard Worker        sqxtun          v3.4h,   v4.4s
378*c0909341SAndroid Build Coastguard Worker        sqxtun2         v3.8h,   v5.4s
379*c0909341SAndroid Build Coastguard Worker        umin            v2.8h,   v2.8h,   v28.8h  // bitdepth_max
380*c0909341SAndroid Build Coastguard Worker        umin            v3.8h,   v3.8h,   v28.8h
381*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #16
382*c0909341SAndroid Build Coastguard Worker        st1             {v2.8h, v3.8h}, [x0], #32
383*c0909341SAndroid Build Coastguard Worker        b.gt            1b
384*c0909341SAndroid Build Coastguard Worker
385*c0909341SAndroid Build Coastguard Worker        ldp             x0,  x4,  [sp, #48]
386*c0909341SAndroid Build Coastguard Worker        ldp             x13, x14, [sp, #32]
387*c0909341SAndroid Build Coastguard Worker        ldp             x11, x12, [sp, #16]
388*c0909341SAndroid Build Coastguard Worker        ldp             x9,  x10, [sp], #64
389*c0909341SAndroid Build Coastguard Worker
390*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1
391*c0909341SAndroid Build Coastguard Worker        ret
392*c0909341SAndroid Build Coastguard Workerendfunc
393*c0909341SAndroid Build Coastguard Worker
394*c0909341SAndroid Build Coastguard Workerfunction wiener_filter7_hv_16bpc_neon
        // Internal helper: filters one input row horizontally (7 taps, using
        // v0.h[0-3]) and, in the same pass, emits one output row of the
        // vertical 7-tap filter (using v0.h[4-7]) from six previously
        // filtered rows plus the row produced here.
        //
        // Register usage, as read from the code below:
        //   x0      - dst row pointer; returned advanced by x1
        //   x1      - stride in bytes; added to x0 and x3 before returning
        //   x2      - left edge pixels or NULL; only read when LR_HAVE_LEFT
        //             is set, and post-incremented by 8 when non-NULL
        //   x3      - src row pointer; returned advanced by x1
        //   w4      - width in pixels; restored before returning
        //   w7      - edge flags (bit 0: LR_HAVE_LEFT, bit 1: LR_HAVE_RIGHT)
        //   x9-x14  - six older intermediate rows, oldest first
        //   x15     - intermediate row receiving the new horizontal output
        //   v27-v31 - rounding/clamping constants; presumably set up by the
        //             exported caller in the same way as the 5-tap entry
        //             point does (TODO confirm against the 7-tap entry point)
        // Clobbers x6, x17, v1-v9, v16-v26 and the condition flags.
        // NOTE(review): v8/v9 are written in full here; the caller is
        // expected to have saved d8/d9 - confirm in the 7-tap entry point.
        //
395*c0909341SAndroid Build Coastguard Worker        // Backing up/restoring registers shifted, so that x9 gets the value
396*c0909341SAndroid Build Coastguard Worker        // of x10, etc, and x15==x9, afterwards.
397*c0909341SAndroid Build Coastguard Worker        stp             x10, x11, [sp, #-80]!
398*c0909341SAndroid Build Coastguard Worker        stp             x12, x13, [sp, #16]
399*c0909341SAndroid Build Coastguard Worker        stp             x14, x15, [sp, #32]
400*c0909341SAndroid Build Coastguard Worker        stp             x10, x0,  [sp, #48]
401*c0909341SAndroid Build Coastguard Worker        stp             x3,  x4,  [sp, #64]
402*c0909341SAndroid Build Coastguard Worker
403*c0909341SAndroid Build Coastguard Worker        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
404*c0909341SAndroid Build Coastguard Worker        tst             w7,  #1 // LR_HAVE_LEFT
405*c0909341SAndroid Build Coastguard Worker        b.eq            1f
406*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT
407*c0909341SAndroid Build Coastguard Worker        cbnz            x2,  0f
408*c0909341SAndroid Build Coastguard Worker        // left == NULL
        // The 3 pixels left of the row are read straight from the source
        // buffer, so start 3 pixels (6 bytes) before x3.
409*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #6
410*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x3], #32
411*c0909341SAndroid Build Coastguard Worker        b               2f
412*c0909341SAndroid Build Coastguard Worker
413*c0909341SAndroid Build Coastguard Worker0:
414*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT, left != NULL
415*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x3], #32
        // Load 4 left pixels into the upper half of v4; the ext with byte
        // offset 10 below keeps only the last 3 of them.
416*c0909341SAndroid Build Coastguard Worker        ld1             {v4.d}[1], [x2], #8
417*c0909341SAndroid Build Coastguard Worker        // Move x3 back to account for the last 3 pixels we loaded earlier,
418*c0909341SAndroid Build Coastguard Worker        // which we'll shift out.
419*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #6
420*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v2.16b,  v3.16b,  #10
421*c0909341SAndroid Build Coastguard Worker        ext             v2.16b,  v4.16b,  v2.16b,  #10
422*c0909341SAndroid Build Coastguard Worker        b               2f
423*c0909341SAndroid Build Coastguard Worker1:
424*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x3], #32
425*c0909341SAndroid Build Coastguard Worker        // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
426*c0909341SAndroid Build Coastguard Worker        // and shift v2/v3 so that v2 has 3x the first pixel at the front.
427*c0909341SAndroid Build Coastguard Worker        dup             v4.8h,  v2.h[0]
428*c0909341SAndroid Build Coastguard Worker        // Move x3 back to account for the last 3 pixels we loaded before,
429*c0909341SAndroid Build Coastguard Worker        // which we shifted out.
430*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #6
431*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v2.16b,  v3.16b,  #10
432*c0909341SAndroid Build Coastguard Worker        ext             v2.16b,  v4.16b,  v2.16b,  #10
433*c0909341SAndroid Build Coastguard Worker
434*c0909341SAndroid Build Coastguard Worker2:
        // v2/v3/v4 now hold 24 consecutive pixels, starting 3 pixels to the
        // left of the first output pixel.
435*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h}, [x3], #16
436*c0909341SAndroid Build Coastguard Worker
437*c0909341SAndroid Build Coastguard Worker        tst             w7,  #2 // LR_HAVE_RIGHT
438*c0909341SAndroid Build Coastguard Worker        b.ne            4f
439*c0909341SAndroid Build Coastguard Worker
440*c0909341SAndroid Build Coastguard Worker3:      // !LR_HAVE_RIGHT
441*c0909341SAndroid Build Coastguard Worker
442*c0909341SAndroid Build Coastguard Worker        // Check whether we need to pad the right edge
443*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #19
444*c0909341SAndroid Build Coastguard Worker        b.ge            4f   // If w >= 19, all used input pixels are valid
445*c0909341SAndroid Build Coastguard Worker
446*c0909341SAndroid Build Coastguard Worker        // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
447*c0909341SAndroid Build Coastguard Worker        // this ends up called again; it's not strictly needed in those
448*c0909341SAndroid Build Coastguard Worker        // cases (we pad enough here), but keeping the code as simple as possible.
449*c0909341SAndroid Build Coastguard Worker
450*c0909341SAndroid Build Coastguard Worker        // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
451*c0909341SAndroid Build Coastguard Worker        // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
452*c0909341SAndroid Build Coastguard Worker        sub             w17, w4,  #22
453*c0909341SAndroid Build Coastguard Worker        // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
454*c0909341SAndroid Build Coastguard Worker        // buffer pointer.
        // right_ext_mask is defined outside this function; presumably a
        // run of zero bytes followed by 0xff bytes, so that the load at
        // (right_ext_mask - 6 - 2*w) selects lanes w+3 and up - TODO confirm.
455*c0909341SAndroid Build Coastguard Worker        movrel          x6,  right_ext_mask, -6
456*c0909341SAndroid Build Coastguard Worker        ldr             h26, [x3,  w17, sxtw #1]
457*c0909341SAndroid Build Coastguard Worker        sub             x6,  x6,  w4,  uxtw #1
458*c0909341SAndroid Build Coastguard Worker        dup             v26.8h,  v26.h[0]
459*c0909341SAndroid Build Coastguard Worker        ld1             {v23.16b, v24.16b, v25.16b}, [x6]
460*c0909341SAndroid Build Coastguard Worker
        // Overwrite the lanes selected by the masks with the padding pixel.
461*c0909341SAndroid Build Coastguard Worker        bit             v2.16b,  v26.16b, v23.16b
462*c0909341SAndroid Build Coastguard Worker        bit             v3.16b,  v26.16b, v24.16b
463*c0909341SAndroid Build Coastguard Worker        bit             v4.16b,  v26.16b, v25.16b
464*c0909341SAndroid Build Coastguard Worker
465*c0909341SAndroid Build Coastguard Worker4:      // Loop horizontally
        // Horizontal filter for the first 8 output pixels. Each ext yields
        // the input shifted by (byte offset / 2) pixels. The taps are
        // applied symmetrically, so mirrored neighbours are summed first:
        //   v18 = px[3]          * v0.h[3]
        //   v19 = px[2] + px[4]  * v0.h[2]
        //   v20 = px[1] + px[5]  * v0.h[1]
        //   v21 = px[0] + px[6]  * v0.h[0]
466*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v2.16b,  v3.16b, #4
467*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v2.16b,  v3.16b, #8
468*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v2.16b,  v3.16b, #2
469*c0909341SAndroid Build Coastguard Worker        ext             v20.16b, v2.16b,  v3.16b, #10
470*c0909341SAndroid Build Coastguard Worker        ext             v21.16b, v2.16b,  v3.16b, #12
471*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v2.16b,  v3.16b, #6
472*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v17.8h
473*c0909341SAndroid Build Coastguard Worker        add             v20.8h,  v20.8h,  v16.8h
474*c0909341SAndroid Build Coastguard Worker        add             v21.8h,  v21.8h,  v2.8h
475*c0909341SAndroid Build Coastguard Worker        smull           v6.4s,   v18.4h,  v0.h[3]
476*c0909341SAndroid Build Coastguard Worker        smlal           v6.4s,   v19.4h,  v0.h[2]
477*c0909341SAndroid Build Coastguard Worker        smlal           v6.4s,   v20.4h,  v0.h[1]
478*c0909341SAndroid Build Coastguard Worker        smlal           v6.4s,   v21.4h,  v0.h[0]
479*c0909341SAndroid Build Coastguard Worker        smull2          v7.4s,   v18.8h,  v0.h[3]
480*c0909341SAndroid Build Coastguard Worker        smlal2          v7.4s,   v19.8h,  v0.h[2]
481*c0909341SAndroid Build Coastguard Worker        smlal2          v7.4s,   v20.8h,  v0.h[1]
482*c0909341SAndroid Build Coastguard Worker        smlal2          v7.4s,   v21.8h,  v0.h[0]
483*c0909341SAndroid Build Coastguard Worker
        // Same again for output pixels 8-15, sourced from v3/v4; the 32-bit
        // sums go to v24/v25.
484*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v3.16b,  v4.16b, #4
485*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v3.16b,  v4.16b, #8
486*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v3.16b,  v4.16b, #2
487*c0909341SAndroid Build Coastguard Worker        ext             v20.16b, v3.16b,  v4.16b, #10
488*c0909341SAndroid Build Coastguard Worker        ext             v21.16b, v3.16b,  v4.16b, #12
489*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v3.16b,  v4.16b, #6
490*c0909341SAndroid Build Coastguard Worker
491*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v17.8h
492*c0909341SAndroid Build Coastguard Worker        add             v20.8h,  v20.8h,  v16.8h
493*c0909341SAndroid Build Coastguard Worker        add             v21.8h,  v21.8h,  v3.8h
494*c0909341SAndroid Build Coastguard Worker        smull           v24.4s,  v18.4h,  v0.h[3]
495*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v19.4h,  v0.h[2]
496*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v20.4h,  v0.h[1]
497*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v21.4h,  v0.h[0]
498*c0909341SAndroid Build Coastguard Worker        smull2          v25.4s,  v18.8h,  v0.h[3]
499*c0909341SAndroid Build Coastguard Worker        smlal2          v25.4s,  v19.8h,  v0.h[2]
500*c0909341SAndroid Build Coastguard Worker        smlal2          v25.4s,  v20.8h,  v0.h[1]
501*c0909341SAndroid Build Coastguard Worker        smlal2          v25.4s,  v21.8h,  v0.h[0]
502*c0909341SAndroid Build Coastguard Worker
        // Finish the horizontal result: add the offset in v30, shift by v29
        // (srshl with a negative count is a rounding right shift), narrow
        // with unsigned saturation, clamp to 0x7fff and subtract v31.
        // The loads of the six older rows (x9-x14) are interleaved with
        // this arithmetic. The x9 load happens before the store to x15
        // further down, so x15 may alias x9.
503*c0909341SAndroid Build Coastguard Worker        ld1             {v16.8h, v17.8h}, [x9],  #32
504*c0909341SAndroid Build Coastguard Worker
505*c0909341SAndroid Build Coastguard Worker        mvni            v26.8h,  #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
506*c0909341SAndroid Build Coastguard Worker        add             v6.4s,   v6.4s,   v30.4s
507*c0909341SAndroid Build Coastguard Worker        add             v7.4s,   v7.4s,   v30.4s
508*c0909341SAndroid Build Coastguard Worker        add             v24.4s,  v24.4s,  v30.4s
509*c0909341SAndroid Build Coastguard Worker        add             v25.4s,  v25.4s,  v30.4s
510*c0909341SAndroid Build Coastguard Worker        ld1             {v18.8h, v19.8h}, [x10], #32
511*c0909341SAndroid Build Coastguard Worker        srshl           v6.4s,   v6.4s,   v29.4s
512*c0909341SAndroid Build Coastguard Worker        srshl           v7.4s,   v7.4s,   v29.4s
513*c0909341SAndroid Build Coastguard Worker        srshl           v24.4s,  v24.4s,  v29.4s
514*c0909341SAndroid Build Coastguard Worker        srshl           v25.4s,  v25.4s,  v29.4s
515*c0909341SAndroid Build Coastguard Worker        ld1             {v20.8h, v21.8h}, [x11], #32
516*c0909341SAndroid Build Coastguard Worker        sqxtun          v6.4h,   v6.4s
517*c0909341SAndroid Build Coastguard Worker        sqxtun2         v6.8h,   v7.4s
518*c0909341SAndroid Build Coastguard Worker        sqxtun          v7.4h,   v24.4s
519*c0909341SAndroid Build Coastguard Worker        sqxtun2         v7.8h,   v25.4s
520*c0909341SAndroid Build Coastguard Worker        ld1             {v22.8h, v23.8h}, [x12], #32
521*c0909341SAndroid Build Coastguard Worker        umin            v6.8h,   v6.8h,   v26.8h
522*c0909341SAndroid Build Coastguard Worker        umin            v7.8h,   v7.8h,   v26.8h
523*c0909341SAndroid Build Coastguard Worker        ld1             {v24.8h, v25.8h}, [x13], #32
524*c0909341SAndroid Build Coastguard Worker        sub             v6.8h,   v6.8h,   v31.8h
525*c0909341SAndroid Build Coastguard Worker        sub             v7.8h,   v7.8h,   v31.8h
526*c0909341SAndroid Build Coastguard Worker
527*c0909341SAndroid Build Coastguard Worker        ld1             {v8.8h,  v9.8h},  [x14], #32
528*c0909341SAndroid Build Coastguard Worker
        // Vertical filter over seven rows: the six loaded rows followed by
        // the row just computed (v6/v7), using taps
        // v0.h[4], h[5], h[6], h[7], h[6], h[5], h[4] in that order.
        // Accumulators: v1/v5 for pixels 0-7, v26/v16 for pixels 8-15.
529*c0909341SAndroid Build Coastguard Worker        smull           v1.4s,   v16.4h,  v0.h[4]
530*c0909341SAndroid Build Coastguard Worker        smlal           v1.4s,   v18.4h,  v0.h[5]
531*c0909341SAndroid Build Coastguard Worker        smlal           v1.4s,   v20.4h,  v0.h[6]
532*c0909341SAndroid Build Coastguard Worker        smlal           v1.4s,   v22.4h,  v0.h[7]
533*c0909341SAndroid Build Coastguard Worker        smlal           v1.4s,   v24.4h,  v0.h[6]
534*c0909341SAndroid Build Coastguard Worker        smlal           v1.4s,   v8.4h,   v0.h[5]
535*c0909341SAndroid Build Coastguard Worker        smlal           v1.4s,   v6.4h,   v0.h[4]
536*c0909341SAndroid Build Coastguard Worker        smull2          v5.4s,   v16.8h,  v0.h[4]
537*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v18.8h,  v0.h[5]
538*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v20.8h,  v0.h[6]
539*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v22.8h,  v0.h[7]
540*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v24.8h,  v0.h[6]
541*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v8.8h,   v0.h[5]
542*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v6.8h,   v0.h[4]
543*c0909341SAndroid Build Coastguard Worker        smull           v26.4s,  v17.4h,  v0.h[4]
544*c0909341SAndroid Build Coastguard Worker        smlal           v26.4s,  v19.4h,  v0.h[5]
545*c0909341SAndroid Build Coastguard Worker        smlal           v26.4s,  v21.4h,  v0.h[6]
546*c0909341SAndroid Build Coastguard Worker        smlal           v26.4s,  v23.4h,  v0.h[7]
547*c0909341SAndroid Build Coastguard Worker        smlal           v26.4s,  v25.4h,  v0.h[6]
548*c0909341SAndroid Build Coastguard Worker        smlal           v26.4s,  v9.4h,   v0.h[5]
549*c0909341SAndroid Build Coastguard Worker        smlal           v26.4s,  v7.4h,   v0.h[4]
        // v16 is reused as an accumulator here; its row data (first row,
        // pixels 0-7) was already consumed by the v1/v5 sums above.
550*c0909341SAndroid Build Coastguard Worker        smull2          v16.4s,  v17.8h,  v0.h[4]
551*c0909341SAndroid Build Coastguard Worker        smlal2          v16.4s,  v19.8h,  v0.h[5]
552*c0909341SAndroid Build Coastguard Worker        smlal2          v16.4s,  v21.8h,  v0.h[6]
553*c0909341SAndroid Build Coastguard Worker        smlal2          v16.4s,  v23.8h,  v0.h[7]
554*c0909341SAndroid Build Coastguard Worker        smlal2          v16.4s,  v25.8h,  v0.h[6]
555*c0909341SAndroid Build Coastguard Worker        smlal2          v16.4s,  v9.8h,   v0.h[5]
556*c0909341SAndroid Build Coastguard Worker        smlal2          v16.4s,  v7.8h,   v0.h[4]
557*c0909341SAndroid Build Coastguard Worker        srshl           v1.4s,   v1.4s,   v27.4s  // -round_bits_v
558*c0909341SAndroid Build Coastguard Worker        srshl           v5.4s,   v5.4s,   v27.4s
559*c0909341SAndroid Build Coastguard Worker        srshl           v26.4s,  v26.4s,  v27.4s
560*c0909341SAndroid Build Coastguard Worker        srshl           v16.4s,  v16.4s,  v27.4s
561*c0909341SAndroid Build Coastguard Worker        sqxtun          v18.4h,  v1.4s
562*c0909341SAndroid Build Coastguard Worker        sqxtun2         v18.8h,  v5.4s
563*c0909341SAndroid Build Coastguard Worker        sqxtun          v19.4h,  v26.4s
564*c0909341SAndroid Build Coastguard Worker        sqxtun2         v19.8h,  v16.4s
        // Store the horizontally filtered row to the intermediate row at x15.
565*c0909341SAndroid Build Coastguard Worker        st1             {v6.8h, v7.8h}, [x15], #32
566*c0909341SAndroid Build Coastguard Worker        umin            v18.8h,  v18.8h,  v28.8h  // bitdepth_max
567*c0909341SAndroid Build Coastguard Worker        umin            v19.8h,  v19.8h,  v28.8h
        // 16 pixels done. The flags set here are consumed by the b.le below;
        // the st1 in between does not modify them.
568*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #16
569*c0909341SAndroid Build Coastguard Worker
        // Store the 16 final output pixels to dst.
570*c0909341SAndroid Build Coastguard Worker        st1             {v18.8h, v19.8h}, [x0], #32
571*c0909341SAndroid Build Coastguard Worker
572*c0909341SAndroid Build Coastguard Worker        b.le            0f
        // More pixels remain: the last 8 loaded pixels become the first 8
        // of the next window, and 16 new ones are loaded. The tst result
        // survives the ld1 and selects the branch target.
573*c0909341SAndroid Build Coastguard Worker        mov             v2.16b,  v4.16b
574*c0909341SAndroid Build Coastguard Worker        tst             w7,  #2 // LR_HAVE_RIGHT
575*c0909341SAndroid Build Coastguard Worker        ld1             {v3.8h, v4.8h}, [x3], #32
576*c0909341SAndroid Build Coastguard Worker        b.ne            4b // If we don't need to pad, just keep filtering.
577*c0909341SAndroid Build Coastguard Worker        b               3b // If we need to pad, check how many pixels we have left.
578*c0909341SAndroid Build Coastguard Worker
579*c0909341SAndroid Build Coastguard Worker0:
        // Shifted restore: x9-x13 take the saved x10-x14, x14 takes the
        // saved x15 (the row written above) and x15 takes the saved x10,
        // so x15 == x9. x0, x3 and x4 get their entry values back.
580*c0909341SAndroid Build Coastguard Worker        ldp             x3,  x4,  [sp, #64]
581*c0909341SAndroid Build Coastguard Worker        ldp             x15, x0,  [sp, #48]
582*c0909341SAndroid Build Coastguard Worker        ldp             x13, x14, [sp, #32]
583*c0909341SAndroid Build Coastguard Worker        ldp             x11, x12, [sp, #16]
584*c0909341SAndroid Build Coastguard Worker        ldp             x9,  x10, [sp], #80
585*c0909341SAndroid Build Coastguard Worker
        // Advance src and dst to the next row.
586*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1
587*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1
588*c0909341SAndroid Build Coastguard Worker
589*c0909341SAndroid Build Coastguard Worker        ret
590*c0909341SAndroid Build Coastguard Workerendfunc
591*c0909341SAndroid Build Coastguard Worker
592*c0909341SAndroid Build Coastguard Worker// void dav1d_wiener_filter5_16bpc_neon(pixel *p, const ptrdiff_t p_stride,
593*c0909341SAndroid Build Coastguard Worker//                                      const pixel (*left)[4], const pixel *lpf,
594*c0909341SAndroid Build Coastguard Worker//                                      const int w, int h,
595*c0909341SAndroid Build Coastguard Worker//                                      const int16_t filter[2][8],
596*c0909341SAndroid Build Coastguard Worker//                                      const enum LrEdgeFlags edges,
597*c0909341SAndroid Build Coastguard Worker//                                      const int bitdepth_max);
598*c0909341SAndroid Build Coastguard Workerfunction wiener_filter5_16bpc_neon, export=1
        // Exported entry point of the 5-tap Wiener filter (C prototype in
        // the comment directly above). Register mapping as used below:
        //   x0 = p, x1 = p_stride, x2 = left, x3 = lpf, w4 = w, w5 = h,
        //   x6 = filter, w7 = edges, [sp] = bitdepth_max
        // Sets up the shared constants (v0, v27-v31), allocates the
        // intermediate row buffers on the stack, then drives the _h, _hv
        // and _v helpers row by row.
        //
        // bitdepth_max is the ninth argument, so it lives on the stack; it
        // has to be fetched before the frame push below moves sp.
599*c0909341SAndroid Build Coastguard Worker        ldr             w8,  [sp]
600*c0909341SAndroid Build Coastguard Worker        AARCH64_SIGN_LINK_REGISTER
601*c0909341SAndroid Build Coastguard Worker        stp             x29, x30, [sp, #-32]!
602*c0909341SAndroid Build Coastguard Worker        stp             d8,  d9,  [sp, #16]
        // x29 remembers sp so the epilogue can drop the scratch area with a
        // single mov.
603*c0909341SAndroid Build Coastguard Worker        mov             x29, sp
604*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h},  [x6]
        // The flags set by this tst are only consumed by the
        // b.eq L(no_top_5) further down. No flag-setting instruction is
        // visible in between (sub_sp is a macro defined elsewhere -
        // presumably flag-preserving, TODO confirm).
605*c0909341SAndroid Build Coastguard Worker        tst             w7,  #4               // LR_HAVE_TOP
        // Scratch area of 384*2*4 bytes; the code below steps through it in
        // units of 384*2 bytes, one intermediate row each.
606*c0909341SAndroid Build Coastguard Worker        sub_sp          384*2*4
607*c0909341SAndroid Build Coastguard Worker
        // Derive the rounding shifts from clz(bitdepth_max).
608*c0909341SAndroid Build Coastguard Worker        dup             v28.8h,  w8           // bitdepth_max
609*c0909341SAndroid Build Coastguard Worker        clz             w8,  w8
610*c0909341SAndroid Build Coastguard Worker        movi            v30.4s,  #1
611*c0909341SAndroid Build Coastguard Worker        sub             w10, w8,  #38         // -(bitdepth + 6)
612*c0909341SAndroid Build Coastguard Worker        sub             w11, w8,  #11         // round_bits_v
613*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  #25         // -round_bits_h
614*c0909341SAndroid Build Coastguard Worker        neg             w10, w10              // bitdepth + 6
615*c0909341SAndroid Build Coastguard Worker        neg             w11, w11              // -round_bits_v
616*c0909341SAndroid Build Coastguard Worker        dup             v2.4s,   w10
617*c0909341SAndroid Build Coastguard Worker        dup             v29.4s,  w8           // -round_bits_h
618*c0909341SAndroid Build Coastguard Worker        dup             v27.4s,  w11          // -round_bits_v
619*c0909341SAndroid Build Coastguard Worker        movi            v31.8h,  #0x20, lsl #8  // 1 << 13 = 8192
620*c0909341SAndroid Build Coastguard Worker        ushl            v30.4s,  v30.4s,  v2.4s // 1 << (bitdepth + 6)
621*c0909341SAndroid Build Coastguard Worker
622*c0909341SAndroid Build Coastguard Worker        zip1            v0.2d,   v0.2d,   v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1
623*c0909341SAndroid Build Coastguard Worker
        // Intermediate row pointers (t0-t4):
624*c0909341SAndroid Build Coastguard Worker        // x11 - t4
625*c0909341SAndroid Build Coastguard Worker        // x12 - t3
626*c0909341SAndroid Build Coastguard Worker        // x13 - t2
627*c0909341SAndroid Build Coastguard Worker        // x14 - t1
628*c0909341SAndroid Build Coastguard Worker        // x15 - t0
629*c0909341SAndroid Build Coastguard Worker        mov             x14, sp               // t1
630*c0909341SAndroid Build Coastguard Worker        b.eq            L(no_top_5)
631*c0909341SAndroid Build Coastguard Worker
        // LR_HAVE_TOP: horizontally filter two rows from lpf with
        // left == NULL; their outputs become t4 and t3.
632*c0909341SAndroid Build Coastguard Worker        mov             x16, x2               // backup left
633*c0909341SAndroid Build Coastguard Worker        mov             x2,  #0
634*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_h_16bpc_neon
635*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1          // lpf += stride
636*c0909341SAndroid Build Coastguard Worker        mov             x11, x14              // t4
637*c0909341SAndroid Build Coastguard Worker        add             x14, x14, #384*2      // t1 += 384*2
638*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_h_16bpc_neon
639*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1,  lsl #2
640*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1          // lpf += stride*5
641*c0909341SAndroid Build Coastguard Worker        mov             x12, x14              // t3
642*c0909341SAndroid Build Coastguard Worker        add             x14, x14, #384*2      // t1 += 384*2
        // Switch to the picture rows: x2 gets the real left pointer back,
        // and x16 is reused to hold the lpf position needed later for the
        // LR_HAVE_BOTTOM rows.
643*c0909341SAndroid Build Coastguard Worker        mov             x2,  x16              // left
644*c0909341SAndroid Build Coastguard Worker        mov             x16, x3               // backup lpf
645*c0909341SAndroid Build Coastguard Worker        mov             x3,  x0               // lpf = p
646*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_h_16bpc_neon
647*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1          // h--
648*c0909341SAndroid Build Coastguard Worker        mov             x13, x14              // t2
        // Only one picture row: finish with a single vertical pass.
649*c0909341SAndroid Build Coastguard Worker        b.eq            L(v1_5)
650*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1          // src += stride
651*c0909341SAndroid Build Coastguard Worker        add             x14, x14, #384*2      // t1 += 384*2
652*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_h_16bpc_neon
653*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1          // h--
        // Only two picture rows: finish with two vertical passes.
654*c0909341SAndroid Build Coastguard Worker        b.eq            L(v2_5)
655*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1          // src += stride
656*c0909341SAndroid Build Coastguard Worker
657*c0909341SAndroid Build Coastguard WorkerL(main_5):
658*c0909341SAndroid Build Coastguard Worker        mov             x15, x11              // t0 = t4
659*c0909341SAndroid Build Coastguard WorkerL(main_loop_5):
        // One combined horizontal+vertical pass per remaining picture row.
        // x0, x3 and the row pointers are not touched here; presumably the
        // hv helper advances/rotates them itself, as the 7-tap hv helper
        // does - TODO confirm.
660*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_hv_16bpc_neon
661*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1          // h--
662*c0909341SAndroid Build Coastguard Worker        b.ne            L(main_loop_5)
663*c0909341SAndroid Build Coastguard Worker        tst             w7,  #8 // LR_HAVE_BOTTOM
        // Without bottom rows, flush with two vertical-only passes.
664*c0909341SAndroid Build Coastguard Worker        b.eq            L(v2_5)
665*c0909341SAndroid Build Coastguard Worker
        // LR_HAVE_BOTTOM: run two more hv passes sourced from the saved lpf
        // position, with left == NULL.
666*c0909341SAndroid Build Coastguard Worker        mov             x3,  x16              // restore lpf
667*c0909341SAndroid Build Coastguard Worker        mov             x2,  #0               // left = NULL
668*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_hv_16bpc_neon
669*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_hv_16bpc_neon
670*c0909341SAndroid Build Coastguard WorkerL(end_5):
671*c0909341SAndroid Build Coastguard Worker
        // Common exit: drop the scratch area by restoring sp from x29, then
        // restore d8/d9 and the frame record.
672*c0909341SAndroid Build Coastguard Worker        mov             sp,  x29
673*c0909341SAndroid Build Coastguard Worker        ldp             d8,  d9,  [sp, #16]
674*c0909341SAndroid Build Coastguard Worker        ldp             x29, x30, [sp], #32
675*c0909341SAndroid Build Coastguard Worker        AARCH64_VALIDATE_LINK_REGISTER
676*c0909341SAndroid Build Coastguard Worker        ret
677*c0909341SAndroid Build Coastguard Worker
678*c0909341SAndroid Build Coastguard WorkerL(no_top_5):
        // !LR_HAVE_TOP: no rows above the picture are filtered. Remember
        // lpf + stride*6 in x16 for the LR_HAVE_BOTTOM rows and start
        // directly on the first picture row.
679*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1,  lsl #2
680*c0909341SAndroid Build Coastguard Worker        add             x16, x3,  x1,  lsl #1 // lpf += stride*6, backup
681*c0909341SAndroid Build Coastguard Worker        mov             x3,  x0               // lpf = p
682*c0909341SAndroid Build Coastguard Worker
683*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_h_16bpc_neon
684*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1          // h--
        // t4, t3 and t2 all point at the first filtered row, which thereby
        // stands in for the missing rows above.
685*c0909341SAndroid Build Coastguard Worker        mov             x11, x14              // t4
686*c0909341SAndroid Build Coastguard Worker        mov             x12, x14              // t3
687*c0909341SAndroid Build Coastguard Worker        mov             x13, x14              // t2
688*c0909341SAndroid Build Coastguard Worker        b.eq            L(v1_5)
689*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1          // src += stride
690*c0909341SAndroid Build Coastguard Worker        add             x14, x14, #384*2      // t1 += 384*2
691*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_h_16bpc_neon
692*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1          // h--
693*c0909341SAndroid Build Coastguard Worker        b.eq            L(v2_5)
694*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1          // src += stride
        // The first two hv passes are unrolled here: t0 is pointed at
        // buffer rows not yet in use before joining the main loop.
        // NOTE(review): the exact row reached by "t0 += 384*2*3" depends on
        // how the hv helper leaves x15 - confirm against that helper.
695*c0909341SAndroid Build Coastguard Worker        add             x15, x14, #384*2      // t0 = t1 + 384*2
696*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_hv_16bpc_neon
697*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1          // h--
698*c0909341SAndroid Build Coastguard Worker        b.eq            L(v2_5)
699*c0909341SAndroid Build Coastguard Worker        add             x15, x15, #384*2*3    // t0 += 384*2*3
700*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_hv_16bpc_neon
701*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1          // h--
702*c0909341SAndroid Build Coastguard Worker        b.ne            L(main_5)
703*c0909341SAndroid Build Coastguard WorkerL(v2_5):
        // Two output rows left: run the vertical-only filter, then advance
        // dst and shift the row pointers down by one (t4 = t3, t3 = t2,
        // t2 = t1) before the final pass.
704*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_v_16bpc_neon
705*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1
706*c0909341SAndroid Build Coastguard Worker        mov             x11, x12
707*c0909341SAndroid Build Coastguard Worker        mov             x12, x13
708*c0909341SAndroid Build Coastguard Worker        mov             x13, x14
709*c0909341SAndroid Build Coastguard WorkerL(v1_5):
        // One output row left: a single vertical-only pass, then exit.
710*c0909341SAndroid Build Coastguard Worker        bl              wiener_filter5_v_16bpc_neon
711*c0909341SAndroid Build Coastguard Worker        b               L(end_5)
712*c0909341SAndroid Build Coastguard Workerendfunc
713*c0909341SAndroid Build Coastguard Worker
714*c0909341SAndroid Build Coastguard Worker
// wiener_filter5_h_16bpc_neon: horizontal pass of the 5-tap Wiener filter
// over one row of 16-bit pixels. Internal helper with a custom register
// interface (not a C-callable function).
//
// Registers read:
//   x2  - left edge pointer, may be NULL. When non-NULL, 8 bytes are loaded
//         from it and it is post-incremented by 8; it is NOT restored.
//   x3  - source row pointer
//   w4  - width in pixels; 16 outputs are stored per loop iteration
//   w7  - edge flags (bit 0 = LR_HAVE_LEFT, bit 1 = LR_HAVE_RIGHT)
//   x14 - destination for the filtered row
//   v0  - filter coefficients; lanes h[1], h[2] and h[3] are used here
//   v30 - value added to each 32-bit sum before the rounding shift
//   v29 - per-lane shift operand for srshl
//   v31 - value subtracted from each 16-bit result after clamping
//         (v29-v31 are set up by the caller; presumably -round_bits_h, the
//         rounding offset and the intermediate bias -- TODO confirm)
//
// With p[] being the row with two pixels of left context prepended, each
// output is computed from
//   h[3]*p[x+2] + h[2]*(p[x+1] + p[x+3]) + h[1]*(p[x] + p[x+4]).
//
// x3, x4 and x14 are saved on entry and restored before returning.
// Clobbers: x6, w17, v2-v4, v6, v7, v16-v19, v23-v26, flags.
715*c0909341SAndroid Build Coastguard Workerfunction wiener_filter5_h_16bpc_neon
716*c0909341SAndroid Build Coastguard Worker        stp             x3,  x4,  [sp, #-32]!
717*c0909341SAndroid Build Coastguard Worker        str             x14,      [sp, #16]
718*c0909341SAndroid Build Coastguard Worker
719*c0909341SAndroid Build Coastguard Worker        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
720*c0909341SAndroid Build Coastguard Worker        tst             w7,  #1 // LR_HAVE_LEFT
721*c0909341SAndroid Build Coastguard Worker        b.eq            1f
722*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT
723*c0909341SAndroid Build Coastguard Worker        cbnz            x2,  0f
724*c0909341SAndroid Build Coastguard Worker        // left == NULL
725*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #4 // Start 2 pixels (4 bytes) before src
726*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x3], #32
727*c0909341SAndroid Build Coastguard Worker        b               2f
728*c0909341SAndroid Build Coastguard Worker
729*c0909341SAndroid Build Coastguard Worker0:
730*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT, left != NULL
731*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x3], #32
732*c0909341SAndroid Build Coastguard Worker        ld1             {v4.d}[1], [x2], #8 // 4 left pixels into the upper half of v4
733*c0909341SAndroid Build Coastguard Worker        // Move x3 back to account for the last 2 pixels we loaded earlier,
734*c0909341SAndroid Build Coastguard Worker        // which we'll shift out.
735*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #4
736*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v2.16b,  v3.16b,  #12
737*c0909341SAndroid Build Coastguard Worker        ext             v2.16b,  v4.16b,  v2.16b,  #12 // Last 2 left pixels go in front of v2
738*c0909341SAndroid Build Coastguard Worker        b               2f
739*c0909341SAndroid Build Coastguard Worker
740*c0909341SAndroid Build Coastguard Worker1:
741*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x3], #32
742*c0909341SAndroid Build Coastguard Worker        // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
743*c0909341SAndroid Build Coastguard Worker        // and shift v2/v3 to have 2x the first pixel at the front.
744*c0909341SAndroid Build Coastguard Worker        dup             v4.8h,  v2.h[0]
745*c0909341SAndroid Build Coastguard Worker        // Move x3 back to account for the last 2 pixels we loaded before,
746*c0909341SAndroid Build Coastguard Worker        // which we shifted out.
747*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #4
748*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v2.16b,  v3.16b,  #12
749*c0909341SAndroid Build Coastguard Worker        ext             v2.16b,  v4.16b,  v2.16b,  #12
750*c0909341SAndroid Build Coastguard Worker
751*c0909341SAndroid Build Coastguard Worker2:
752*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h}, [x3], #16
753*c0909341SAndroid Build Coastguard Worker
754*c0909341SAndroid Build Coastguard Worker        tst             w7,  #2 // LR_HAVE_RIGHT
755*c0909341SAndroid Build Coastguard Worker        b.ne            4f
756*c0909341SAndroid Build Coastguard Worker
757*c0909341SAndroid Build Coastguard Worker3:      // !LR_HAVE_RIGHT
758*c0909341SAndroid Build Coastguard Worker
759*c0909341SAndroid Build Coastguard Worker        // Check whether we need to pad the right edge
760*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #18
761*c0909341SAndroid Build Coastguard Worker        b.ge            4f   // If w >= 18, all used input pixels are valid
762*c0909341SAndroid Build Coastguard Worker
763*c0909341SAndroid Build Coastguard Worker        // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
764*c0909341SAndroid Build Coastguard Worker        // this ends up called again; it's not strictly needed in those
765*c0909341SAndroid Build Coastguard Worker        // cases (we pad enough here), but keeping the code as simple as possible.
766*c0909341SAndroid Build Coastguard Worker
767*c0909341SAndroid Build Coastguard Worker        // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
768*c0909341SAndroid Build Coastguard Worker        // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
769*c0909341SAndroid Build Coastguard Worker        sub             w17, w4,  #23
770*c0909341SAndroid Build Coastguard Worker        // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
771*c0909341SAndroid Build Coastguard Worker        // buffer pointer.
772*c0909341SAndroid Build Coastguard Worker        movrel          x6,  right_ext_mask, -4
773*c0909341SAndroid Build Coastguard Worker        ldr             h26, [x3,  w17, sxtw #1]
774*c0909341SAndroid Build Coastguard Worker        sub             x6,  x6,  w4,  uxtw #1
775*c0909341SAndroid Build Coastguard Worker        dup             v26.8h,  v26.h[0]
776*c0909341SAndroid Build Coastguard Worker        ld1             {v23.16b, v24.16b, v25.16b}, [x6]
777*c0909341SAndroid Build Coastguard Worker
778*c0909341SAndroid Build Coastguard Worker        bit             v2.16b,  v26.16b, v23.16b // Take the padding pixel where the mask bits are set
779*c0909341SAndroid Build Coastguard Worker        bit             v3.16b,  v26.16b, v24.16b
780*c0909341SAndroid Build Coastguard Worker        bit             v4.16b,  v26.16b, v25.16b
781*c0909341SAndroid Build Coastguard Worker
782*c0909341SAndroid Build Coastguard Worker4:      // Loop horizontally
783*c0909341SAndroid Build Coastguard Worker        // Interleaving the mul/mla chains actually hurts performance
784*c0909341SAndroid Build Coastguard Worker        // significantly on Cortex A53, thus keeping mul/mla tightly
785*c0909341SAndroid Build Coastguard Worker        // chained like this.
786*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v2.16b,  v3.16b, #2 // p[x+1]
787*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v2.16b,  v3.16b, #6 // p[x+3]
788*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v2.16b,  v3.16b, #8 // p[x+4]
789*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v2.16b,  v3.16b, #4 // p[x+2], the centre tap
790*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v16.8h // p[x+1] + p[x+3]
791*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v2.8h // p[x] + p[x+4]
792*c0909341SAndroid Build Coastguard Worker        smull           v6.4s,   v17.4h,  v0.h[3]
793*c0909341SAndroid Build Coastguard Worker        smlal           v6.4s,   v18.4h,  v0.h[2]
794*c0909341SAndroid Build Coastguard Worker        smlal           v6.4s,   v19.4h,  v0.h[1]
795*c0909341SAndroid Build Coastguard Worker        smull2          v7.4s,   v17.8h,  v0.h[3]
796*c0909341SAndroid Build Coastguard Worker        smlal2          v7.4s,   v18.8h,  v0.h[2]
797*c0909341SAndroid Build Coastguard Worker        smlal2          v7.4s,   v19.8h,  v0.h[1]
798*c0909341SAndroid Build Coastguard Worker
799*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v3.16b,  v4.16b, #2 // Same taps for the second 8 outputs
800*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v3.16b,  v4.16b, #6
801*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v3.16b,  v4.16b, #8
802*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v3.16b,  v4.16b, #4
803*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v16.8h
804*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v3.8h
805*c0909341SAndroid Build Coastguard Worker        smull           v16.4s,  v17.4h,  v0.h[3]
806*c0909341SAndroid Build Coastguard Worker        smlal           v16.4s,  v18.4h,  v0.h[2]
807*c0909341SAndroid Build Coastguard Worker        smlal           v16.4s,  v19.4h,  v0.h[1]
808*c0909341SAndroid Build Coastguard Worker        smull2          v17.4s,  v17.8h,  v0.h[3]
809*c0909341SAndroid Build Coastguard Worker        smlal2          v17.4s,  v18.8h,  v0.h[2]
810*c0909341SAndroid Build Coastguard Worker        smlal2          v17.4s,  v19.8h,  v0.h[1]
811*c0909341SAndroid Build Coastguard Worker
812*c0909341SAndroid Build Coastguard Worker        mvni            v24.8h,  #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
813*c0909341SAndroid Build Coastguard Worker        add             v6.4s,   v6.4s,   v30.4s
814*c0909341SAndroid Build Coastguard Worker        add             v7.4s,   v7.4s,   v30.4s
815*c0909341SAndroid Build Coastguard Worker        add             v16.4s,  v16.4s,  v30.4s
816*c0909341SAndroid Build Coastguard Worker        add             v17.4s,  v17.4s,  v30.4s
817*c0909341SAndroid Build Coastguard Worker        srshl           v6.4s,   v6.4s,   v29.4s // Rounding shift by the per-lane amount in v29
818*c0909341SAndroid Build Coastguard Worker        srshl           v7.4s,   v7.4s,   v29.4s
819*c0909341SAndroid Build Coastguard Worker        srshl           v16.4s,  v16.4s,  v29.4s
820*c0909341SAndroid Build Coastguard Worker        srshl           v17.4s,  v17.4s,  v29.4s
821*c0909341SAndroid Build Coastguard Worker        sqxtun          v6.4h,   v6.4s // Saturating narrow to unsigned 16 bit
822*c0909341SAndroid Build Coastguard Worker        sqxtun2         v6.8h,   v7.4s
823*c0909341SAndroid Build Coastguard Worker        sqxtun          v7.4h,   v16.4s
824*c0909341SAndroid Build Coastguard Worker        sqxtun2         v7.8h,   v17.4s
825*c0909341SAndroid Build Coastguard Worker        umin            v6.8h,   v6.8h,   v24.8h // Clamp to 0x7fff
826*c0909341SAndroid Build Coastguard Worker        umin            v7.8h,   v7.8h,   v24.8h
827*c0909341SAndroid Build Coastguard Worker        sub             v6.8h,   v6.8h,   v31.8h
828*c0909341SAndroid Build Coastguard Worker        sub             v7.8h,   v7.8h,   v31.8h
829*c0909341SAndroid Build Coastguard Worker
830*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #16 // Flags are consumed by the b.le after the store
831*c0909341SAndroid Build Coastguard Worker
832*c0909341SAndroid Build Coastguard Worker        st1             {v6.8h, v7.8h}, [x14], #32
833*c0909341SAndroid Build Coastguard Worker
834*c0909341SAndroid Build Coastguard Worker        b.le            0f
835*c0909341SAndroid Build Coastguard Worker        mov             v2.16b,  v4.16b // Slide the window: the last vector becomes the first
836*c0909341SAndroid Build Coastguard Worker        tst             w7,  #2 // LR_HAVE_RIGHT
837*c0909341SAndroid Build Coastguard Worker        ld1             {v3.8h, v4.8h}, [x3], #32
838*c0909341SAndroid Build Coastguard Worker        b.ne            4b // If we don't need to pad, just keep filtering.
839*c0909341SAndroid Build Coastguard Worker        b               3b // If we need to pad, check how many pixels we have left.
840*c0909341SAndroid Build Coastguard Worker
841*c0909341SAndroid Build Coastguard Worker0:
842*c0909341SAndroid Build Coastguard Worker        ldr             x14,      [sp, #16]
843*c0909341SAndroid Build Coastguard Worker        ldp             x3,  x4,  [sp], #32
844*c0909341SAndroid Build Coastguard Worker        ret
845*c0909341SAndroid Build Coastguard Workerendfunc
846*c0909341SAndroid Build Coastguard Worker
// wiener_filter5_v_16bpc_neon: vertical pass of the 5-tap Wiener filter,
// producing one output row of 16-bit pixels. Internal helper with a custom
// register interface (not a C-callable function).
//
// Registers read:
//   x11-x14 - pointers to four input rows of signed 16-bit elements
//             (presumably rows produced by the horizontal pass -- TODO
//             confirm against the caller)
//   x0  - destination row
//   w4  - width in pixels; 16 outputs are stored per loop iteration
//   v0  - filter coefficients; lanes h[5], h[6] and h[7] are used here
//   v27 - per-lane shift operand for srshl (-round_bits_v)
//   v28 - bitdepth_max, the upper clamp
//
// The row at x14 feeds both of the last two taps (h[6] and h[5]), so each
// output is
//   h[5]*r11 + h[6]*r12 + h[7]*r13 + h[6]*r14 + h[5]*r14.
//
// x0, x4 and x11-x14 are saved on entry and restored unchanged.
// Clobbers: v2-v5, v16-v23, flags.
847*c0909341SAndroid Build Coastguard Workerfunction wiener_filter5_v_16bpc_neon
848*c0909341SAndroid Build Coastguard Worker        stp             x11, x12, [sp, #-48]!
849*c0909341SAndroid Build Coastguard Worker        stp             x13, x14, [sp, #16]
850*c0909341SAndroid Build Coastguard Worker        stp             x0,  x4,  [sp, #32]
851*c0909341SAndroid Build Coastguard Worker1:
852*c0909341SAndroid Build Coastguard Worker        ld1             {v16.8h, v17.8h}, [x11], #32
853*c0909341SAndroid Build Coastguard Worker        ld1             {v18.8h, v19.8h}, [x12], #32
854*c0909341SAndroid Build Coastguard Worker        ld1             {v20.8h, v21.8h}, [x13], #32
855*c0909341SAndroid Build Coastguard Worker        ld1             {v22.8h, v23.8h}, [x14], #32
856*c0909341SAndroid Build Coastguard Worker
857*c0909341SAndroid Build Coastguard Worker        smull           v2.4s,   v16.4h,  v0.h[5]
858*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v18.4h,  v0.h[6]
859*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v20.4h,  v0.h[7]
860*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v22.4h,  v0.h[6]
861*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v22.4h,  v0.h[5] // The x14 row is reused for the last tap
862*c0909341SAndroid Build Coastguard Worker        smull2          v3.4s,   v16.8h,  v0.h[5]
863*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v18.8h,  v0.h[6]
864*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v20.8h,  v0.h[7]
865*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v22.8h,  v0.h[6]
866*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v22.8h,  v0.h[5]
867*c0909341SAndroid Build Coastguard Worker        smull           v4.4s,   v17.4h,  v0.h[5]
868*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v19.4h,  v0.h[6]
869*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v21.4h,  v0.h[7]
870*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v23.4h,  v0.h[6]
871*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v23.4h,  v0.h[5]
872*c0909341SAndroid Build Coastguard Worker        smull2          v5.4s,   v17.8h,  v0.h[5]
873*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v19.8h,  v0.h[6]
874*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v21.8h,  v0.h[7]
875*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v23.8h,  v0.h[6]
876*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v23.8h,  v0.h[5]
877*c0909341SAndroid Build Coastguard Worker        srshl           v2.4s,   v2.4s,   v27.4s  // -round_bits_v
878*c0909341SAndroid Build Coastguard Worker        srshl           v3.4s,   v3.4s,   v27.4s
879*c0909341SAndroid Build Coastguard Worker        srshl           v4.4s,   v4.4s,   v27.4s
880*c0909341SAndroid Build Coastguard Worker        srshl           v5.4s,   v5.4s,   v27.4s
881*c0909341SAndroid Build Coastguard Worker        sqxtun          v2.4h,   v2.4s // Saturating narrow to unsigned 16 bit
882*c0909341SAndroid Build Coastguard Worker        sqxtun2         v2.8h,   v3.4s
883*c0909341SAndroid Build Coastguard Worker        sqxtun          v3.4h,   v4.4s
884*c0909341SAndroid Build Coastguard Worker        sqxtun2         v3.8h,   v5.4s
885*c0909341SAndroid Build Coastguard Worker        umin            v2.8h,   v2.8h,   v28.8h  // bitdepth_max
886*c0909341SAndroid Build Coastguard Worker        umin            v3.8h,   v3.8h,   v28.8h
887*c0909341SAndroid Build Coastguard Worker
888*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #16 // Flags are consumed by the b.gt after the store
889*c0909341SAndroid Build Coastguard Worker        st1             {v2.8h, v3.8h}, [x0], #32
890*c0909341SAndroid Build Coastguard Worker        b.gt            1b
891*c0909341SAndroid Build Coastguard Worker
892*c0909341SAndroid Build Coastguard Worker        ldp             x0,  x4,  [sp, #32]
893*c0909341SAndroid Build Coastguard Worker        ldp             x13, x14, [sp, #16]
894*c0909341SAndroid Build Coastguard Worker        ldp             x11, x12, [sp], #48
895*c0909341SAndroid Build Coastguard Worker
896*c0909341SAndroid Build Coastguard Worker        ret
897*c0909341SAndroid Build Coastguard Workerendfunc
898*c0909341SAndroid Build Coastguard Worker
// wiener_filter5_hv_16bpc_neon: combined horizontal + vertical pass of the
// 5-tap Wiener filter for 16-bit pixels. Each loop iteration horizontally
// filters 16 pixels of the source row, stores that intermediate result to
// [x15], and immediately uses it as the fifth row of the vertical filter
// together with the four rows at x11-x14. Internal helper with a custom
// register interface (not a C-callable function).
//
// Registers read:
//   x0  - destination row
//   x1  - added to both x0 and x3 on return (presumably the stride in
//         bytes -- TODO confirm)
//   x2  - left edge pointer, may be NULL. When non-NULL it is
//         post-incremented by 8 and NOT restored.
//   x3  - source row pointer
//   w4  - width in pixels; 16 outputs are stored per loop iteration
//   w7  - edge flags (bit 0 = LR_HAVE_LEFT, bit 1 = LR_HAVE_RIGHT)
//   x11-x14 - pointers to four rows of signed 16-bit elements
//   x15 - where the new horizontally filtered row is stored
//   v0  - filter coefficients; h[1]-h[3] horizontal, h[5]-h[7] vertical
//   v29, v30, v31 - horizontal shift operand, pre-shift addend and
//         post-clamp subtrahend, set up by the caller
//   v27 - vertical shift operand for srshl (-round_bits_v)
//   v28 - bitdepth_max, the upper clamp for the final output
//
// On return: x11, x12, x13, x14 hold the entry values of x12, x13, x14, x15;
// x15 holds the entry value of x12 (so x15 == x11); x0 and x3 are their entry
// values plus x1; x4 is restored.
// Clobbers: x6, w17, v1-v9, v16-v26, flags.
// NOTE(review): v8 and v9 are written here without being saved; their low
// 64 bits are callee-saved under AAPCS64, so the exported caller is
// presumably responsible for preserving d8/d9 -- confirm.
899*c0909341SAndroid Build Coastguard Workerfunction wiener_filter5_hv_16bpc_neon
900*c0909341SAndroid Build Coastguard Worker        // Backing up/restoring registers shifted, so that x11 gets the value
901*c0909341SAndroid Build Coastguard Worker        // of x12, etc, and x15==x11, afterwards.
902*c0909341SAndroid Build Coastguard Worker        stp             x12, x13, [sp, #-64]!
903*c0909341SAndroid Build Coastguard Worker        stp             x14, x15, [sp, #16]
904*c0909341SAndroid Build Coastguard Worker        stp             x12, x0,  [sp, #32]
905*c0909341SAndroid Build Coastguard Worker        stp             x3,  x4,  [sp, #48]
906*c0909341SAndroid Build Coastguard Worker
907*c0909341SAndroid Build Coastguard Worker        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
908*c0909341SAndroid Build Coastguard Worker        tst             w7,  #1 // LR_HAVE_LEFT
909*c0909341SAndroid Build Coastguard Worker        b.eq            1f
910*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT
911*c0909341SAndroid Build Coastguard Worker        cbnz            x2,  0f
912*c0909341SAndroid Build Coastguard Worker        // left == NULL
913*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #4 // Start 2 pixels (4 bytes) before src
914*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x3], #32
915*c0909341SAndroid Build Coastguard Worker        b               2f
916*c0909341SAndroid Build Coastguard Worker
917*c0909341SAndroid Build Coastguard Worker0:
918*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT, left != NULL
919*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x3], #32
920*c0909341SAndroid Build Coastguard Worker        ld1             {v4.d}[1], [x2], #8 // 4 left pixels into the upper half of v4
921*c0909341SAndroid Build Coastguard Worker        // Move x3 back to account for the last 2 pixels we loaded earlier,
922*c0909341SAndroid Build Coastguard Worker        // which we'll shift out.
923*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #4
924*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v2.16b,  v3.16b,  #12
925*c0909341SAndroid Build Coastguard Worker        ext             v2.16b,  v4.16b,  v2.16b,  #12 // Last 2 left pixels go in front of v2
926*c0909341SAndroid Build Coastguard Worker        b               2f
927*c0909341SAndroid Build Coastguard Worker1:
928*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x3], #32
929*c0909341SAndroid Build Coastguard Worker        // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
930*c0909341SAndroid Build Coastguard Worker        // and shift v2/v3 to have 2x the first pixel at the front.
931*c0909341SAndroid Build Coastguard Worker        dup             v4.8h,   v2.h[0]
932*c0909341SAndroid Build Coastguard Worker        // Move x3 back to account for the last 2 pixels we loaded before,
933*c0909341SAndroid Build Coastguard Worker        // which we shifted out.
934*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #4
935*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v2.16b,  v3.16b,  #12
936*c0909341SAndroid Build Coastguard Worker        ext             v2.16b,  v4.16b,  v2.16b,  #12
937*c0909341SAndroid Build Coastguard Worker
938*c0909341SAndroid Build Coastguard Worker2:
939*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h}, [x3], #16
940*c0909341SAndroid Build Coastguard Worker
941*c0909341SAndroid Build Coastguard Worker        tst             w7,  #2 // LR_HAVE_RIGHT
942*c0909341SAndroid Build Coastguard Worker        b.ne            4f
943*c0909341SAndroid Build Coastguard Worker
944*c0909341SAndroid Build Coastguard Worker3:      // !LR_HAVE_RIGHT
945*c0909341SAndroid Build Coastguard Worker
946*c0909341SAndroid Build Coastguard Worker        // Check whether we need to pad the right edge
947*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #18
948*c0909341SAndroid Build Coastguard Worker        b.ge            4f   // If w >= 18, all used input pixels are valid
949*c0909341SAndroid Build Coastguard Worker
950*c0909341SAndroid Build Coastguard Worker        // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
951*c0909341SAndroid Build Coastguard Worker        // this ends up called again; it's not strictly needed in those
952*c0909341SAndroid Build Coastguard Worker        // cases (we pad enough here), but keeping the code as simple as possible.
953*c0909341SAndroid Build Coastguard Worker
954*c0909341SAndroid Build Coastguard Worker        // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
955*c0909341SAndroid Build Coastguard Worker        // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
956*c0909341SAndroid Build Coastguard Worker        sub             w17, w4,  #23
957*c0909341SAndroid Build Coastguard Worker        // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
958*c0909341SAndroid Build Coastguard Worker        // buffer pointer.
959*c0909341SAndroid Build Coastguard Worker        movrel          x6,  right_ext_mask, -4
960*c0909341SAndroid Build Coastguard Worker        ldr             h26, [x3,  w17, sxtw #1]
961*c0909341SAndroid Build Coastguard Worker        sub             x6,  x6,  w4,  uxtw #1
962*c0909341SAndroid Build Coastguard Worker        dup             v26.8h,  v26.h[0]
963*c0909341SAndroid Build Coastguard Worker        ld1             {v23.16b, v24.16b, v25.16b}, [x6]
964*c0909341SAndroid Build Coastguard Worker
965*c0909341SAndroid Build Coastguard Worker        bit             v2.16b,  v26.16b, v23.16b // Take the padding pixel where the mask bits are set
966*c0909341SAndroid Build Coastguard Worker        bit             v3.16b,  v26.16b, v24.16b
967*c0909341SAndroid Build Coastguard Worker        bit             v4.16b,  v26.16b, v25.16b
968*c0909341SAndroid Build Coastguard Worker
969*c0909341SAndroid Build Coastguard Worker4:      // Loop horizontally
970*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v2.16b,  v3.16b, #2 // p[x+1]
971*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v2.16b,  v3.16b, #6 // p[x+3]
972*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v2.16b,  v3.16b, #8 // p[x+4]
973*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v2.16b,  v3.16b, #4 // p[x+2], the centre tap
974*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v16.8h // p[x+1] + p[x+3]
975*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v2.8h // p[x] + p[x+4]
976*c0909341SAndroid Build Coastguard Worker        smull           v6.4s,   v17.4h,  v0.h[3]
977*c0909341SAndroid Build Coastguard Worker        smlal           v6.4s,   v18.4h,  v0.h[2]
978*c0909341SAndroid Build Coastguard Worker        smlal           v6.4s,   v19.4h,  v0.h[1]
979*c0909341SAndroid Build Coastguard Worker        smull2          v7.4s,   v17.8h,  v0.h[3]
980*c0909341SAndroid Build Coastguard Worker        smlal2          v7.4s,   v18.8h,  v0.h[2]
981*c0909341SAndroid Build Coastguard Worker        smlal2          v7.4s,   v19.8h,  v0.h[1]
982*c0909341SAndroid Build Coastguard Worker
983*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v3.16b,  v4.16b, #2 // Same taps for the second 8 outputs
984*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v3.16b,  v4.16b, #6
985*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v3.16b,  v4.16b, #8
986*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v3.16b,  v4.16b, #4
987*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v16.8h
988*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v3.8h
989*c0909341SAndroid Build Coastguard Worker        smull           v24.4s,  v17.4h,  v0.h[3]
990*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v18.4h,  v0.h[2]
991*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v19.4h,  v0.h[1]
992*c0909341SAndroid Build Coastguard Worker        smull2          v25.4s,  v17.8h,  v0.h[3]
993*c0909341SAndroid Build Coastguard Worker        smlal2          v25.4s,  v18.8h,  v0.h[2]
994*c0909341SAndroid Build Coastguard Worker        smlal2          v25.4s,  v19.8h,  v0.h[1]
995*c0909341SAndroid Build Coastguard Worker
996*c0909341SAndroid Build Coastguard Worker        ld1             {v16.8h, v17.8h}, [x11], #32 // Row loads for the vertical pass are interleaved below
997*c0909341SAndroid Build Coastguard Worker        mvni            v26.8h,  #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
998*c0909341SAndroid Build Coastguard Worker        add             v6.4s,   v6.4s,   v30.4s
999*c0909341SAndroid Build Coastguard Worker        add             v7.4s,   v7.4s,   v30.4s
1000*c0909341SAndroid Build Coastguard Worker        add             v24.4s,  v24.4s,  v30.4s
1001*c0909341SAndroid Build Coastguard Worker        add             v25.4s,  v25.4s,  v30.4s
1002*c0909341SAndroid Build Coastguard Worker        ld1             {v18.8h, v19.8h}, [x12], #32
1003*c0909341SAndroid Build Coastguard Worker        srshl           v6.4s,   v6.4s,   v29.4s // Rounding shift by the per-lane amount in v29
1004*c0909341SAndroid Build Coastguard Worker        srshl           v7.4s,   v7.4s,   v29.4s
1005*c0909341SAndroid Build Coastguard Worker        srshl           v24.4s,  v24.4s,  v29.4s
1006*c0909341SAndroid Build Coastguard Worker        srshl           v25.4s,  v25.4s,  v29.4s
1007*c0909341SAndroid Build Coastguard Worker        ld1             {v20.8h, v21.8h}, [x13], #32
1008*c0909341SAndroid Build Coastguard Worker        sqxtun          v6.4h,   v6.4s // Saturating narrow to unsigned 16 bit
1009*c0909341SAndroid Build Coastguard Worker        sqxtun2         v6.8h,   v7.4s
1010*c0909341SAndroid Build Coastguard Worker        sqxtun          v7.4h,   v24.4s
1011*c0909341SAndroid Build Coastguard Worker        sqxtun2         v7.8h,   v25.4s
1012*c0909341SAndroid Build Coastguard Worker        ld1             {v22.8h, v23.8h}, [x14], #32
1013*c0909341SAndroid Build Coastguard Worker        umin            v6.8h,   v6.8h,   v26.8h // Clamp to 0x7fff
1014*c0909341SAndroid Build Coastguard Worker        umin            v7.8h,   v7.8h,   v26.8h
1015*c0909341SAndroid Build Coastguard Worker        sub             v6.8h,   v6.8h,   v31.8h
1016*c0909341SAndroid Build Coastguard Worker        sub             v7.8h,   v7.8h,   v31.8h
1017*c0909341SAndroid Build Coastguard Worker
1018*c0909341SAndroid Build Coastguard Worker        smull           v8.4s,   v16.4h,  v0.h[5]
1019*c0909341SAndroid Build Coastguard Worker        smlal           v8.4s,   v18.4h,  v0.h[6]
1020*c0909341SAndroid Build Coastguard Worker        smlal           v8.4s,   v20.4h,  v0.h[7]
1021*c0909341SAndroid Build Coastguard Worker        smlal           v8.4s,   v22.4h,  v0.h[6]
1022*c0909341SAndroid Build Coastguard Worker        smlal           v8.4s,   v6.4h,   v0.h[5] // Fifth tap uses the row just filtered horizontally
1023*c0909341SAndroid Build Coastguard Worker        smull2          v9.4s,   v16.8h,  v0.h[5]
1024*c0909341SAndroid Build Coastguard Worker        smlal2          v9.4s,   v18.8h,  v0.h[6]
1025*c0909341SAndroid Build Coastguard Worker        smlal2          v9.4s,   v20.8h,  v0.h[7]
1026*c0909341SAndroid Build Coastguard Worker        smlal2          v9.4s,   v22.8h,  v0.h[6]
1027*c0909341SAndroid Build Coastguard Worker        smlal2          v9.4s,   v6.8h,   v0.h[5]
1028*c0909341SAndroid Build Coastguard Worker        smull           v1.4s,   v17.4h,  v0.h[5]
1029*c0909341SAndroid Build Coastguard Worker        smlal           v1.4s,   v19.4h,  v0.h[6]
1030*c0909341SAndroid Build Coastguard Worker        smlal           v1.4s,   v21.4h,  v0.h[7]
1031*c0909341SAndroid Build Coastguard Worker        smlal           v1.4s,   v23.4h,  v0.h[6]
1032*c0909341SAndroid Build Coastguard Worker        smlal           v1.4s,   v7.4h,   v0.h[5]
1033*c0909341SAndroid Build Coastguard Worker        smull2          v5.4s,   v17.8h,  v0.h[5]
1034*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v19.8h,  v0.h[6]
1035*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v21.8h,  v0.h[7]
1036*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v23.8h,  v0.h[6]
1037*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v7.8h,   v0.h[5]
1038*c0909341SAndroid Build Coastguard Worker        srshl           v8.4s,   v8.4s,   v27.4s  // -round_bits_v
1039*c0909341SAndroid Build Coastguard Worker        srshl           v9.4s,   v9.4s,   v27.4s
1040*c0909341SAndroid Build Coastguard Worker        srshl           v1.4s,   v1.4s,   v27.4s
1041*c0909341SAndroid Build Coastguard Worker        srshl           v5.4s,   v5.4s,   v27.4s
1042*c0909341SAndroid Build Coastguard Worker        sqxtun          v8.4h,   v8.4s
1043*c0909341SAndroid Build Coastguard Worker        sqxtun2         v8.8h,   v9.4s
1044*c0909341SAndroid Build Coastguard Worker        sqxtun          v9.4h,   v1.4s
1045*c0909341SAndroid Build Coastguard Worker        sqxtun2         v9.8h,   v5.4s
1046*c0909341SAndroid Build Coastguard Worker        st1             {v6.8h, v7.8h}, [x15], #32 // Store the horizontally filtered row
1047*c0909341SAndroid Build Coastguard Worker        umin            v8.8h,   v8.8h,   v28.8h  // bitdepth_max
1048*c0909341SAndroid Build Coastguard Worker        umin            v9.8h,   v9.8h,   v28.8h
1049*c0909341SAndroid Build Coastguard Worker
1050*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #16 // Flags are consumed by the b.le after the store
1051*c0909341SAndroid Build Coastguard Worker
1052*c0909341SAndroid Build Coastguard Worker        st1             {v8.8h, v9.8h}, [x0], #32 // Store the final output pixels
1053*c0909341SAndroid Build Coastguard Worker
1054*c0909341SAndroid Build Coastguard Worker        b.le            0f
1055*c0909341SAndroid Build Coastguard Worker        mov             v2.16b,  v4.16b // Slide the window: the last vector becomes the first
1056*c0909341SAndroid Build Coastguard Worker        tst             w7,  #2 // LR_HAVE_RIGHT
1057*c0909341SAndroid Build Coastguard Worker        ld1             {v3.8h, v4.8h}, [x3], #32
1058*c0909341SAndroid Build Coastguard Worker        b.ne            4b // If we don't need to pad, just keep filtering.
1059*c0909341SAndroid Build Coastguard Worker        b               3b // If we need to pad, check how many pixels we have left.
1060*c0909341SAndroid Build Coastguard Worker
1061*c0909341SAndroid Build Coastguard Worker0:
1062*c0909341SAndroid Build Coastguard Worker        ldp             x3,  x4,  [sp, #48]
1063*c0909341SAndroid Build Coastguard Worker        ldp             x15, x0,  [sp, #32] // x15 = entry x12, x0 = entry x0
1064*c0909341SAndroid Build Coastguard Worker        ldp             x13, x14, [sp, #16] // x13 = entry x14, x14 = entry x15
1065*c0909341SAndroid Build Coastguard Worker        ldp             x11, x12, [sp], #64 // x11 = entry x12, x12 = entry x13
1066*c0909341SAndroid Build Coastguard Worker
1067*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  x1 // Advance src and dst by x1 for the next row
1068*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1
1069*c0909341SAndroid Build Coastguard Worker
1070*c0909341SAndroid Build Coastguard Worker        ret
1071*c0909341SAndroid Build Coastguard Workerendfunc
1072*c0909341SAndroid Build Coastguard Worker
1073*c0909341SAndroid Build Coastguard Worker#include "looprestoration_tmpl.S"
1074*c0909341SAndroid Build Coastguard Worker
1075*c0909341SAndroid Build Coastguard Worker// void dav1d_sgr_box3_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
1076*c0909341SAndroid Build Coastguard Worker//                                      const pixel (*left)[4],
1077*c0909341SAndroid Build Coastguard Worker//                                      const pixel *src, const int w,
1078*c0909341SAndroid Build Coastguard Worker//                                      const enum LrEdgeFlags edges);
1079*c0909341SAndroid Build Coastguard Workerfunction sgr_box3_row_h_16bpc_neon, export=1
1080*c0909341SAndroid Build Coastguard Worker        add             w4,  w4,  #2 // w += 2
1081*c0909341SAndroid Build Coastguard Worker
1082*c0909341SAndroid Build Coastguard Worker        tst             w5,  #1 // LR_HAVE_LEFT
1083*c0909341SAndroid Build Coastguard Worker        b.eq            1f
1084*c0909341SAndroid Build Coastguard Worker        cbnz            x2,  0f
1085*c0909341SAndroid Build Coastguard Worker
1086*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT && left == NULL
1087*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #4
1088*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x3], #32
1089*c0909341SAndroid Build Coastguard Worker        b               2f
1090*c0909341SAndroid Build Coastguard Worker
1091*c0909341SAndroid Build Coastguard Worker0:
1092*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT, left != NULL
1093*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x3], #32
1094*c0909341SAndroid Build Coastguard Worker        ld1             {v2.d}[1], [x2]
1095*c0909341SAndroid Build Coastguard Worker        // Move x3 back to account for the last 2 pixels we loaded earlier,
1096*c0909341SAndroid Build Coastguard Worker        // which we'll shift out.
1097*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #4
1098*c0909341SAndroid Build Coastguard Worker        ext             v1.16b, v0.16b, v1.16b, #12
1099*c0909341SAndroid Build Coastguard Worker        ext             v0.16b, v2.16b, v0.16b, #12
1100*c0909341SAndroid Build Coastguard Worker        b               2f
1101*c0909341SAndroid Build Coastguard Worker
1102*c0909341SAndroid Build Coastguard Worker1:
1103*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x3], #32
1104*c0909341SAndroid Build Coastguard Worker        // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
1105*c0909341SAndroid Build Coastguard Worker        // and shift v0/v1 to have 2x the first pixel at the front.
1106*c0909341SAndroid Build Coastguard Worker        dup             v2.8h, v0.h[0]
1107*c0909341SAndroid Build Coastguard Worker        // Move x3 back to account for the last 2 pixels we loaded before,
1108*c0909341SAndroid Build Coastguard Worker        // which we shifted out.
1109*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #4
1110*c0909341SAndroid Build Coastguard Worker        ext             v1.16b, v0.16b, v1.16b, #12
1111*c0909341SAndroid Build Coastguard Worker        ext             v0.16b, v2.16b, v0.16b, #12
1112*c0909341SAndroid Build Coastguard Worker
1113*c0909341SAndroid Build Coastguard Worker2:
1114*c0909341SAndroid Build Coastguard Worker        tst             w5,  #2 // LR_HAVE_RIGHT
1115*c0909341SAndroid Build Coastguard Worker        b.ne            4f
1116*c0909341SAndroid Build Coastguard Worker        // If we'll need to pad the right edge, load that pixel to pad with
1117*c0909341SAndroid Build Coastguard Worker        // here since we can find it pretty easily from here.
1118*c0909341SAndroid Build Coastguard Worker        sub             w13, w4, #(2 + 16 - 2 + 1)
1119*c0909341SAndroid Build Coastguard Worker        ldr             h30, [x3,  w13, sxtw #1]
1120*c0909341SAndroid Build Coastguard Worker        // Fill v30 with the right padding pixel
1121*c0909341SAndroid Build Coastguard Worker        dup             v30.8h,  v30.h[0]
1122*c0909341SAndroid Build Coastguard Worker3:      // !LR_HAVE_RIGHT
1123*c0909341SAndroid Build Coastguard Worker
1124*c0909341SAndroid Build Coastguard Worker        // Check whether we need to pad the right edge
1125*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #10
1126*c0909341SAndroid Build Coastguard Worker        b.ge            4f   // If w >= 10, all used input pixels are valid
1127*c0909341SAndroid Build Coastguard Worker
1128*c0909341SAndroid Build Coastguard Worker        // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called
1129*c0909341SAndroid Build Coastguard Worker        // again; it's not strictly needed in those cases (we pad enough here),
1130*c0909341SAndroid Build Coastguard Worker        // but keeping the code as simple as possible.
1131*c0909341SAndroid Build Coastguard Worker
1132*c0909341SAndroid Build Coastguard Worker        // Insert padding in v0.b[w] onwards
1133*c0909341SAndroid Build Coastguard Worker        movrel          x13, right_ext_mask
1134*c0909341SAndroid Build Coastguard Worker        sub             x13, x13, w4,  uxtw #1
1135*c0909341SAndroid Build Coastguard Worker        ld1             {v28.16b, v29.16b}, [x13]
1136*c0909341SAndroid Build Coastguard Worker
1137*c0909341SAndroid Build Coastguard Worker        bit             v0.16b,  v30.16b, v28.16b
1138*c0909341SAndroid Build Coastguard Worker        bit             v1.16b,  v30.16b, v29.16b
1139*c0909341SAndroid Build Coastguard Worker
1140*c0909341SAndroid Build Coastguard Worker4:      // Loop horizontally
1141*c0909341SAndroid Build Coastguard Worker        ext             v26.16b, v0.16b,  v1.16b,  #2
1142*c0909341SAndroid Build Coastguard Worker        ext             v27.16b, v0.16b,  v1.16b,  #4
1143*c0909341SAndroid Build Coastguard Worker
1144*c0909341SAndroid Build Coastguard Worker        add             v6.8h,   v0.8h,   v26.8h
1145*c0909341SAndroid Build Coastguard Worker        umull           v22.4s,  v0.4h,   v0.4h
1146*c0909341SAndroid Build Coastguard Worker        umlal           v22.4s,  v26.4h,  v26.4h
1147*c0909341SAndroid Build Coastguard Worker        umlal           v22.4s,  v27.4h,  v27.4h
1148*c0909341SAndroid Build Coastguard Worker        add             v6.8h,   v6.8h,   v27.8h
1149*c0909341SAndroid Build Coastguard Worker        umull2          v23.4s,  v0.8h,   v0.8h
1150*c0909341SAndroid Build Coastguard Worker        umlal2          v23.4s,  v26.8h,  v26.8h
1151*c0909341SAndroid Build Coastguard Worker        umlal2          v23.4s,  v27.8h,  v27.8h
1152*c0909341SAndroid Build Coastguard Worker
1153*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #8
1154*c0909341SAndroid Build Coastguard Worker
1155*c0909341SAndroid Build Coastguard Worker        st1             {v6.8h},         [x1],  #16
1156*c0909341SAndroid Build Coastguard Worker        st1             {v22.4s,v23.4s}, [x0],  #32
1157*c0909341SAndroid Build Coastguard Worker
1158*c0909341SAndroid Build Coastguard Worker        b.le            9f
1159*c0909341SAndroid Build Coastguard Worker        tst             w5,  #2 // LR_HAVE_RIGHT
1160*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v1.16b
1161*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h},  [x3],  #16
1162*c0909341SAndroid Build Coastguard Worker
1163*c0909341SAndroid Build Coastguard Worker        b.ne            4b // If we don't need to pad, just keep summing.
1164*c0909341SAndroid Build Coastguard Worker        b               3b // If we need to pad, check how many pixels we have left.
1165*c0909341SAndroid Build Coastguard Worker
1166*c0909341SAndroid Build Coastguard Worker9:
1167*c0909341SAndroid Build Coastguard Worker        ret
1168*c0909341SAndroid Build Coastguard Workerendfunc
1169*c0909341SAndroid Build Coastguard Worker
1170*c0909341SAndroid Build Coastguard Worker// void dav1d_sgr_box5_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
1171*c0909341SAndroid Build Coastguard Worker//                                      const pixel (*left)[4],
1172*c0909341SAndroid Build Coastguard Worker//                                      const pixel *src, const int w,
1173*c0909341SAndroid Build Coastguard Worker//                                      const enum LrEdgeFlags edges);
//
// Horizontal pass of the 5-tap box sums over one row of 16 bit pixels:
// each output element is the sum (stored to sum) and the sum of squares
// (stored to sumsq) of 5 consecutive input pixels.
//
// Arguments, in prototype order:
//   x0 = sumsq, x1 = sum, x2 = left, x3 = src, w4 = w, w5 = edges
// Every loop iteration stores 8 elements to both sum and sumsq, and the
// loop runs until w + 2 outputs are covered, so both buffers are written
// in whole groups of 8 elements.
// Registers written: x0-x3, w4, x13, v0-v2, v6, v22, v23, v26-v30, flags.
1174*c0909341SAndroid Build Coastguard Workerfunction sgr_box5_row_h_16bpc_neon, export=1
1175*c0909341SAndroid Build Coastguard Worker        add             w4,  w4,  #2 // w += 2
1176*c0909341SAndroid Build Coastguard Worker
1177*c0909341SAndroid Build Coastguard Worker        tst             w5,  #1 // LR_HAVE_LEFT
1178*c0909341SAndroid Build Coastguard Worker        b.eq            1f
1179*c0909341SAndroid Build Coastguard Worker        cbnz            x2,  0f
1180*c0909341SAndroid Build Coastguard Worker
1181*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT && left == NULL
        // The 3 pixels (6 bytes) in front of src are read straight from
        // memory together with the start of the row.
1182*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #6
1183*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x3], #32
1184*c0909341SAndroid Build Coastguard Worker        b               2f
1185*c0909341SAndroid Build Coastguard Worker
1186*c0909341SAndroid Build Coastguard Worker0:
1187*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT, left != NULL
        // The 4 pixels at left go into the upper half of v2; the ext by
        // 10 bytes below keeps only the last 3 of them in front of the row.
        // x2 is post-incremented but not read again in this function.
1188*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x3], #32
1189*c0909341SAndroid Build Coastguard Worker        ld1             {v2.d}[1], [x2], #8
1190*c0909341SAndroid Build Coastguard Worker        // Move x3 back to account for the last 3 pixels we loaded earlier,
1191*c0909341SAndroid Build Coastguard Worker        // which we'll shift out.
1192*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #6
1193*c0909341SAndroid Build Coastguard Worker        ext             v1.16b,  v0.16b,  v1.16b,  #10
1194*c0909341SAndroid Build Coastguard Worker        ext             v0.16b,  v2.16b,  v0.16b,  #10
1195*c0909341SAndroid Build Coastguard Worker        b               2f
1196*c0909341SAndroid Build Coastguard Worker
1197*c0909341SAndroid Build Coastguard Worker1:
1198*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x3], #32
1199*c0909341SAndroid Build Coastguard Worker        // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
1200*c0909341SAndroid Build Coastguard Worker        // and shift v0/v1 to have 3x the first pixel at the front.
1201*c0909341SAndroid Build Coastguard Worker        dup             v2.8h,  v0.h[0]
1202*c0909341SAndroid Build Coastguard Worker        // Move x3 back to account for the last 3 pixels we loaded before,
1203*c0909341SAndroid Build Coastguard Worker        // which we shifted out.
1204*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #6
1205*c0909341SAndroid Build Coastguard Worker        ext             v1.16b,  v0.16b,  v1.16b,  #10
1206*c0909341SAndroid Build Coastguard Worker        ext             v0.16b,  v2.16b,  v0.16b,  #10
1207*c0909341SAndroid Build Coastguard Worker
1208*c0909341SAndroid Build Coastguard Worker2:
        // On all three paths above x3 now points 13 pixels past the
        // original src, and v0/v1 hold 16 pixels starting 3 before src.
1209*c0909341SAndroid Build Coastguard Worker        tst             w5,  #2 // LR_HAVE_RIGHT
1210*c0909341SAndroid Build Coastguard Worker        b.ne            4f
1211*c0909341SAndroid Build Coastguard Worker        // If we'll need to pad the right edge, load that pixel to pad with
1212*c0909341SAndroid Build Coastguard Worker        // here since we can find it pretty easily from here.
        // w4 is w + 2 and x3 is src + 13 pixels, so the index w4 - 16
        // (scaled by 2 bytes) addresses src[w - 1].
1213*c0909341SAndroid Build Coastguard Worker        sub             w13, w4, #(2 + 16 - 3 + 1)
1214*c0909341SAndroid Build Coastguard Worker        ldr             h30, [x3,  w13, sxtw #1]
1215*c0909341SAndroid Build Coastguard Worker        // Fill v30 with the right padding pixel
1216*c0909341SAndroid Build Coastguard Worker        dup             v30.8h,  v30.h[0]
1217*c0909341SAndroid Build Coastguard Worker3:      // !LR_HAVE_RIGHT
1218*c0909341SAndroid Build Coastguard Worker
1219*c0909341SAndroid Build Coastguard Worker        // Check whether we need to pad the right edge
1220*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #11
1221*c0909341SAndroid Build Coastguard Worker        b.ge            4f   // If w >= 11, all used input pixels are valid
1222*c0909341SAndroid Build Coastguard Worker
1223*c0909341SAndroid Build Coastguard Worker        // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
1224*c0909341SAndroid Build Coastguard Worker        // this ends up called again; it's not strictly needed in those
1225*c0909341SAndroid Build Coastguard Worker        // cases (we pad enough here), but keeping the code as simple as possible.
1226*c0909341SAndroid Build Coastguard Worker
1227*c0909341SAndroid Build Coastguard Worker        // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
1228*c0909341SAndroid Build Coastguard Worker        // buffer pointer.
        // NOTE(review): movrel and right_ext_mask are defined outside this
        // chunk; the table is presumably zero bytes followed by 0xff bytes,
        // so that a lower start address selects more padding - confirm.
1229*c0909341SAndroid Build Coastguard Worker        movrel          x13, right_ext_mask, -1
1230*c0909341SAndroid Build Coastguard Worker        sub             x13, x13, w4,  uxtw #1
1231*c0909341SAndroid Build Coastguard Worker        ld1             {v28.16b, v29.16b}, [x13]
1232*c0909341SAndroid Build Coastguard Worker
        // bit copies the bits of v30 into v0/v1 wherever the mask bit is set
        // and leaves the other bits untouched.
1233*c0909341SAndroid Build Coastguard Worker        bit             v0.16b,  v30.16b, v28.16b
1234*c0909341SAndroid Build Coastguard Worker        bit             v1.16b,  v30.16b, v29.16b
1235*c0909341SAndroid Build Coastguard Worker
1236*c0909341SAndroid Build Coastguard Worker4:      // Loop horizontally
        // v0/v1 hold 16 consecutive pixels. v26/v27 are that window
        // shifted by 1 and 2 pixels; v6 accumulates the sums and v22/v23
        // the sums of squares (low/high 4 lanes) for 8 outputs.
1237*c0909341SAndroid Build Coastguard Worker        ext             v26.16b, v0.16b,  v1.16b,  #2
1238*c0909341SAndroid Build Coastguard Worker        ext             v27.16b, v0.16b,  v1.16b,  #4
1239*c0909341SAndroid Build Coastguard Worker
1240*c0909341SAndroid Build Coastguard Worker        add             v6.8h,   v0.8h,   v26.8h
1241*c0909341SAndroid Build Coastguard Worker        umull           v22.4s,  v0.4h,   v0.4h
1242*c0909341SAndroid Build Coastguard Worker        umlal           v22.4s,  v26.4h,  v26.4h
1243*c0909341SAndroid Build Coastguard Worker        umlal           v22.4s,  v27.4h,  v27.4h
1244*c0909341SAndroid Build Coastguard Worker        add             v6.8h,   v6.8h,   v27.8h
1245*c0909341SAndroid Build Coastguard Worker        umull2          v23.4s,  v0.8h,   v0.8h
1246*c0909341SAndroid Build Coastguard Worker        umlal2          v23.4s,  v26.8h,  v26.8h
1247*c0909341SAndroid Build Coastguard Worker        umlal2          v23.4s,  v27.8h,  v27.8h
1248*c0909341SAndroid Build Coastguard Worker
        // Reuse v26/v27 for the window shifted by 3 and 4 pixels, which
        // completes the 5 taps.
1249*c0909341SAndroid Build Coastguard Worker        ext             v26.16b, v0.16b,  v1.16b,  #6
1250*c0909341SAndroid Build Coastguard Worker        ext             v27.16b, v0.16b,  v1.16b,  #8
1251*c0909341SAndroid Build Coastguard Worker
1252*c0909341SAndroid Build Coastguard Worker        add             v6.8h,   v6.8h,   v26.8h
1253*c0909341SAndroid Build Coastguard Worker        umlal           v22.4s,  v26.4h,  v26.4h
1254*c0909341SAndroid Build Coastguard Worker        umlal           v22.4s,  v27.4h,  v27.4h
1255*c0909341SAndroid Build Coastguard Worker        add             v6.8h,   v6.8h,   v27.8h
1256*c0909341SAndroid Build Coastguard Worker        umlal2          v23.4s,  v26.8h,  v26.8h
1257*c0909341SAndroid Build Coastguard Worker        umlal2          v23.4s,  v27.8h,  v27.8h
1258*c0909341SAndroid Build Coastguard Worker
        // 8 outputs done. The flags set here are consumed by the b.le
        // after the two stores, which do not modify them.
1259*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #8
1260*c0909341SAndroid Build Coastguard Worker
1261*c0909341SAndroid Build Coastguard Worker        st1             {v6.8h},         [x1],  #16
1262*c0909341SAndroid Build Coastguard Worker        st1             {v22.4s,v23.4s}, [x0],  #32
1263*c0909341SAndroid Build Coastguard Worker
1264*c0909341SAndroid Build Coastguard Worker        b.le            9f
        // The result of this tst is consumed by the b.ne further down;
        // the mov and ld1 in between leave the flags unchanged.
1265*c0909341SAndroid Build Coastguard Worker        tst             w5,  #2 // LR_HAVE_RIGHT
        // Slide the window: the old upper 8 pixels become the lower ones
        // and the next 8 pixels are loaded into v1.
1266*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v1.16b
1267*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h}, [x3], #16
1268*c0909341SAndroid Build Coastguard Worker
1269*c0909341SAndroid Build Coastguard Worker        b.ne            4b // If we don't need to pad, just keep summing.
1270*c0909341SAndroid Build Coastguard Worker        b               3b // If we need to pad, check how many pixels we have left.
1271*c0909341SAndroid Build Coastguard Worker
1272*c0909341SAndroid Build Coastguard Worker9:
1273*c0909341SAndroid Build Coastguard Worker        ret
1274*c0909341SAndroid Build Coastguard Workerendfunc
1275*c0909341SAndroid Build Coastguard Worker
1276*c0909341SAndroid Build Coastguard Worker// void dav1d_sgr_box35_row_h_16bpc_neon(int32_t *sumsq3, int16_t *sum3,
1277*c0909341SAndroid Build Coastguard Worker//                                       int32_t *sumsq5, int16_t *sum5,
1278*c0909341SAndroid Build Coastguard Worker//                                       const pixel (*left)[4],
1279*c0909341SAndroid Build Coastguard Worker//                                       const pixel *src, const int w,
1280*c0909341SAndroid Build Coastguard Worker//                                       const enum LrEdgeFlags edges);
//
// Horizontal pass that produces the 3-tap and the 5-tap box sums of one
// row of 16 bit pixels in a single sweep. For every 5 pixel window the
// middle 3 pixels give the 3-tap sum / sum of squares (sum3 / sumsq3) and
// all 5 pixels give the 5-tap sum / sum of squares (sum5 / sumsq5).
//
// Arguments, in prototype order:
//   x0 = sumsq3, x1 = sum3, x2 = sumsq5, x3 = sum5,
//   x4 = left,   x5 = src,  w6 = w,      w7 = edges
// Every loop iteration stores 8 elements to each of the four outputs, and
// the loop runs until w + 2 outputs are covered, so all buffers are
// written in whole groups of 8 elements.
// Registers written: x0-x5, w6, x13, v0-v2, v16-v23, v28-v30, flags.
1281*c0909341SAndroid Build Coastguard Workerfunction sgr_box35_row_h_16bpc_neon, export=1
1282*c0909341SAndroid Build Coastguard Worker        add             w6,  w6,  #2 // w += 2
1283*c0909341SAndroid Build Coastguard Worker
1284*c0909341SAndroid Build Coastguard Worker        tst             w7,  #1 // LR_HAVE_LEFT
1285*c0909341SAndroid Build Coastguard Worker        b.eq            1f
1286*c0909341SAndroid Build Coastguard Worker        cbnz            x4,  0f
1287*c0909341SAndroid Build Coastguard Worker
1288*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT && left == NULL
        // The 3 pixels (6 bytes) in front of src are read straight from
        // memory together with the start of the row.
1289*c0909341SAndroid Build Coastguard Worker        sub             x5,  x5,  #6
1290*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x5], #32
1291*c0909341SAndroid Build Coastguard Worker        b               2f
1292*c0909341SAndroid Build Coastguard Worker
1293*c0909341SAndroid Build Coastguard Worker0:
1294*c0909341SAndroid Build Coastguard Worker        // LR_HAVE_LEFT, left != NULL
        // The 4 pixels at left go into the upper half of v2; the ext by
        // 10 bytes below keeps only the last 3 of them in front of the row.
        // x4 is post-incremented but not read again in this function.
1295*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x5], #32
1296*c0909341SAndroid Build Coastguard Worker        ld1             {v2.d}[1], [x4], #8
1297*c0909341SAndroid Build Coastguard Worker        // Move x5 back to account for the last 3 pixels we loaded earlier,
1298*c0909341SAndroid Build Coastguard Worker        // which we'll shift out.
1299*c0909341SAndroid Build Coastguard Worker        sub             x5,  x5,  #6
1300*c0909341SAndroid Build Coastguard Worker        ext             v1.16b,  v0.16b,  v1.16b,  #10
1301*c0909341SAndroid Build Coastguard Worker        ext             v0.16b,  v2.16b,  v0.16b,  #10
1302*c0909341SAndroid Build Coastguard Worker        b               2f
1303*c0909341SAndroid Build Coastguard Worker
1304*c0909341SAndroid Build Coastguard Worker1:
1305*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x5], #32
1306*c0909341SAndroid Build Coastguard Worker        // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
1307*c0909341SAndroid Build Coastguard Worker        // and shift v0/v1 to have 3x the first pixel at the front.
1308*c0909341SAndroid Build Coastguard Worker        dup             v2.8h,  v0.h[0]
1309*c0909341SAndroid Build Coastguard Worker        // Move x5 back to account for the last 3 pixels we loaded before,
1310*c0909341SAndroid Build Coastguard Worker        // which we shifted out.
1311*c0909341SAndroid Build Coastguard Worker        sub             x5,  x5,  #6
1312*c0909341SAndroid Build Coastguard Worker        ext             v1.16b,  v0.16b,  v1.16b,  #10
1313*c0909341SAndroid Build Coastguard Worker        ext             v0.16b,  v2.16b,  v0.16b,  #10
1314*c0909341SAndroid Build Coastguard Worker
1315*c0909341SAndroid Build Coastguard Worker2:
        // On all three paths above x5 now points 13 pixels past the
        // original src, and v0/v1 hold 16 pixels starting 3 before src.
1316*c0909341SAndroid Build Coastguard Worker        tst             w7,  #2 // LR_HAVE_RIGHT
1317*c0909341SAndroid Build Coastguard Worker        b.ne            4f
1318*c0909341SAndroid Build Coastguard Worker        // If we'll need to pad the right edge, load that pixel to pad with
1319*c0909341SAndroid Build Coastguard Worker        // here since we can find it pretty easily from here.
        // w6 is w + 2 and x5 is src + 13 pixels, so the index w6 - 16
        // (scaled by 2 bytes) addresses src[w - 1].
1320*c0909341SAndroid Build Coastguard Worker        sub             w13, w6, #(2 + 16 - 3 + 1)
1321*c0909341SAndroid Build Coastguard Worker        ldr             h30, [x5,  w13, sxtw #1]
1322*c0909341SAndroid Build Coastguard Worker        // Fill v30 with the right padding pixel
1323*c0909341SAndroid Build Coastguard Worker        dup             v30.8h,  v30.h[0]
1324*c0909341SAndroid Build Coastguard Worker3:      // !LR_HAVE_RIGHT
1325*c0909341SAndroid Build Coastguard Worker
1326*c0909341SAndroid Build Coastguard Worker        // Check whether we need to pad the right edge
1327*c0909341SAndroid Build Coastguard Worker        cmp             w6,  #11
1328*c0909341SAndroid Build Coastguard Worker        b.ge            4f   // If w >= 11, all used input pixels are valid
1329*c0909341SAndroid Build Coastguard Worker
1330*c0909341SAndroid Build Coastguard Worker        // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
1331*c0909341SAndroid Build Coastguard Worker        // this ends up called again; it's not strictly needed in those
1332*c0909341SAndroid Build Coastguard Worker        // cases (we pad enough here), but keeping the code as simple as possible.
1333*c0909341SAndroid Build Coastguard Worker
1334*c0909341SAndroid Build Coastguard Worker        // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
1335*c0909341SAndroid Build Coastguard Worker        // buffer pointer.
        // NOTE(review): movrel and right_ext_mask are defined outside this
        // chunk; the table is presumably zero bytes followed by 0xff bytes,
        // so that a lower start address selects more padding - confirm.
1336*c0909341SAndroid Build Coastguard Worker        movrel          x13, right_ext_mask, -1
1337*c0909341SAndroid Build Coastguard Worker        sub             x13, x13, w6,  uxtw #1
1338*c0909341SAndroid Build Coastguard Worker        ld1             {v28.16b, v29.16b}, [x13]
1339*c0909341SAndroid Build Coastguard Worker
        // bit copies the bits of v30 into v0/v1 wherever the mask bit is set
        // and leaves the other bits untouched.
1340*c0909341SAndroid Build Coastguard Worker        bit             v0.16b,  v30.16b, v28.16b
1341*c0909341SAndroid Build Coastguard Worker        bit             v1.16b,  v30.16b, v29.16b
1342*c0909341SAndroid Build Coastguard Worker
1343*c0909341SAndroid Build Coastguard Worker4:      // Loop horizontally
        // v0/v1 hold 16 consecutive pixels. v16/v17/v18 are that window
        // shifted by 1/2/3 pixels (the 3 taps of the inner box); v0 and
        // v19 (shifted by 4) are the two outer taps of the 5-tap box.
1344*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v0.16b,  v1.16b,  #2
1345*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v0.16b,  v1.16b,  #4
1346*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v0.16b,  v1.16b,  #8
1347*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v0.16b,  v1.16b,  #6
1348*c0909341SAndroid Build Coastguard Worker
        // v20 = 3-tap sum; v21 starts as the sum of the two outer taps and
        // is completed to the 5-tap sum further down.
1349*c0909341SAndroid Build Coastguard Worker        add             v20.8h,  v16.8h,  v17.8h
1350*c0909341SAndroid Build Coastguard Worker        add             v21.8h,  v0.8h,   v19.8h
1351*c0909341SAndroid Build Coastguard Worker        add             v20.8h,  v20.8h,  v18.8h
1352*c0909341SAndroid Build Coastguard Worker
        // v22/v23 = 3-tap sums of squares (low/high 4 lanes).
1353*c0909341SAndroid Build Coastguard Worker        umull           v22.4s,  v16.4h,  v16.4h
1354*c0909341SAndroid Build Coastguard Worker        umlal           v22.4s,  v17.4h,  v17.4h
1355*c0909341SAndroid Build Coastguard Worker        umlal           v22.4s,  v18.4h,  v18.4h
1356*c0909341SAndroid Build Coastguard Worker
1357*c0909341SAndroid Build Coastguard Worker        umull2          v23.4s,  v16.8h,  v16.8h
1358*c0909341SAndroid Build Coastguard Worker        umlal2          v23.4s,  v17.8h,  v17.8h
1359*c0909341SAndroid Build Coastguard Worker        umlal2          v23.4s,  v18.8h,  v18.8h
1360*c0909341SAndroid Build Coastguard Worker
        // Finish the 5-tap sum, then store the 3-tap results (sum3 via x1,
        // sumsq3 via x0) before v22/v23 are extended to 5 taps.
1361*c0909341SAndroid Build Coastguard Worker        add             v21.8h,  v21.8h,  v20.8h
1362*c0909341SAndroid Build Coastguard Worker        st1             {v20.8h},        [x1], #16
1363*c0909341SAndroid Build Coastguard Worker        st1             {v22.4s,v23.4s}, [x0], #32
1364*c0909341SAndroid Build Coastguard Worker
        // Add the squares of the two outer taps on top of the 3-tap values
        // to obtain the 5-tap sums of squares.
1365*c0909341SAndroid Build Coastguard Worker        umlal           v22.4s,  v0.4h,   v0.4h
1366*c0909341SAndroid Build Coastguard Worker        umlal           v22.4s,  v19.4h,  v19.4h
1367*c0909341SAndroid Build Coastguard Worker
1368*c0909341SAndroid Build Coastguard Worker        umlal2          v23.4s,  v0.8h,   v0.8h
1369*c0909341SAndroid Build Coastguard Worker        umlal2          v23.4s,  v19.8h,  v19.8h
1370*c0909341SAndroid Build Coastguard Worker
        // 8 outputs done. The flags set here are consumed by the b.le
        // after the two stores, which do not modify them.
1371*c0909341SAndroid Build Coastguard Worker        subs            w6,  w6,  #8
1372*c0909341SAndroid Build Coastguard Worker
        // Store the 5-tap results (sum5 via x3, sumsq5 via x2).
1373*c0909341SAndroid Build Coastguard Worker        st1             {v21.8h},        [x3], #16
1374*c0909341SAndroid Build Coastguard Worker        st1             {v22.4s,v23.4s}, [x2], #32
1375*c0909341SAndroid Build Coastguard Worker
1376*c0909341SAndroid Build Coastguard Worker        b.le            9f
        // The result of this tst is consumed by the b.ne further down;
        // the mov and ld1 in between leave the flags unchanged.
1377*c0909341SAndroid Build Coastguard Worker        tst             w7,  #2 // LR_HAVE_RIGHT
        // Slide the window: the old upper 8 pixels become the lower ones
        // and the next 8 pixels are loaded into v1.
1378*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v1.16b
1379*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h}, [x5], #16
1380*c0909341SAndroid Build Coastguard Worker
1381*c0909341SAndroid Build Coastguard Worker        b.ne            4b // If we don't need to pad, just keep summing.
1382*c0909341SAndroid Build Coastguard Worker        b               3b // If we need to pad, check how many pixels we have left.
1383*c0909341SAndroid Build Coastguard Worker
1384*c0909341SAndroid Build Coastguard Worker9:
1385*c0909341SAndroid Build Coastguard Worker        ret
1386*c0909341SAndroid Build Coastguard Workerendfunc
1387*c0909341SAndroid Build Coastguard Worker
// Invoke the sgr_funcs macro with argument 16. The macro is defined
// outside this chunk; NOTE(review): presumably it emits the shared SGR
// helper functions for the 16 bpc build - confirm against its definition.
1388*c0909341SAndroid Build Coastguard Workersgr_funcs 16
1389