xref: /aosp_15_r20/external/libdav1d/src/arm/64/mc16.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker/*
2*c0909341SAndroid Build Coastguard Worker * Copyright © 2018, VideoLAN and dav1d authors
3*c0909341SAndroid Build Coastguard Worker * Copyright © 2018, Janne Grunau
4*c0909341SAndroid Build Coastguard Worker * Copyright © 2020, Martin Storsjo
5*c0909341SAndroid Build Coastguard Worker * All rights reserved.
6*c0909341SAndroid Build Coastguard Worker *
7*c0909341SAndroid Build Coastguard Worker * Redistribution and use in source and binary forms, with or without
8*c0909341SAndroid Build Coastguard Worker * modification, are permitted provided that the following conditions are met:
9*c0909341SAndroid Build Coastguard Worker *
10*c0909341SAndroid Build Coastguard Worker * 1. Redistributions of source code must retain the above copyright notice, this
11*c0909341SAndroid Build Coastguard Worker *    list of conditions and the following disclaimer.
12*c0909341SAndroid Build Coastguard Worker *
13*c0909341SAndroid Build Coastguard Worker * 2. Redistributions in binary form must reproduce the above copyright notice,
14*c0909341SAndroid Build Coastguard Worker *    this list of conditions and the following disclaimer in the documentation
15*c0909341SAndroid Build Coastguard Worker *    and/or other materials provided with the distribution.
16*c0909341SAndroid Build Coastguard Worker *
17*c0909341SAndroid Build Coastguard Worker * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18*c0909341SAndroid Build Coastguard Worker * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19*c0909341SAndroid Build Coastguard Worker * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20*c0909341SAndroid Build Coastguard Worker * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
21*c0909341SAndroid Build Coastguard Worker * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22*c0909341SAndroid Build Coastguard Worker * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23*c0909341SAndroid Build Coastguard Worker * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24*c0909341SAndroid Build Coastguard Worker * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25*c0909341SAndroid Build Coastguard Worker * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26*c0909341SAndroid Build Coastguard Worker * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27*c0909341SAndroid Build Coastguard Worker */
28*c0909341SAndroid Build Coastguard Worker
29*c0909341SAndroid Build Coastguard Worker#include "src/arm/asm.S"
30*c0909341SAndroid Build Coastguard Worker#include "util.S"
31*c0909341SAndroid Build Coastguard Worker
// Bias constant shared by the compound-prediction code in this file.
// It appears as -2*PREP_BIAS in the avg setup, as
// PREP_BIAS >> intermediate_bits in the w_avg/mask setup, and as
// PREP_BIAS*64 in w_mask.
// NOTE(review): presumably the 16-bit intermediates read from x2/x3 were
// stored with this bias subtracted - confirm against the prep functions.
32*c0909341SAndroid Build Coastguard Worker#define PREP_BIAS 8192
33*c0909341SAndroid Build Coastguard Worker
// avg d0, d1, t0, t1, t2, t3
//
// Produces 16 output halfwords in d0/d1 from 16 halfwords read at [x2]
// and 16 halfwords read at [x3]. Both pointers are post-incremented by
// 32 bytes.
//
//   d0, d1 - destination vector registers (8 halfwords each)
//   t0-t3  - scratch vector registers; all four are overwritten
//
// Expects these constants to be preloaded (bidir_fn does so for avg):
//   v28.8h = -2*PREP_BIAS - (1 << intermediate_bits)
//   v29.8h = -(intermediate_bits+1)
//
// Per lane: saturating add of the two inputs, clamp from below at v28,
// subtract v28 (so the value is never negative afterwards), then shift
// by v29. sshl with a negative count is a truncating right shift; the
// 1 << intermediate_bits folded into v28 acts as the rounding term.
34*c0909341SAndroid Build Coastguard Worker.macro avg d0, d1, t0, t1, t2, t3
35*c0909341SAndroid Build Coastguard Worker        ld1             {\t0\().8h,\t1\().8h},  [x2],  32 // first source, 16 halfwords
36*c0909341SAndroid Build Coastguard Worker        ld1             {\t2\().8h,\t3\().8h},  [x3],  32 // second source, 16 halfwords
37*c0909341SAndroid Build Coastguard Worker        sqadd           \t0\().8h,  \t0\().8h,  \t2\().8h // saturating sum of both sources
38*c0909341SAndroid Build Coastguard Worker        sqadd           \t1\().8h,  \t1\().8h,  \t3\().8h
39*c0909341SAndroid Build Coastguard Worker        smax            \t0\().8h,  \t0\().8h,  v28.8h // -2*PREP_BIAS - (1 << intermediate_bits)
40*c0909341SAndroid Build Coastguard Worker        smax            \t1\().8h,  \t1\().8h,  v28.8h // -2*PREP_BIAS - (1 << intermediate_bits)
41*c0909341SAndroid Build Coastguard Worker        sqsub           \t0\().8h,  \t0\().8h,  v28.8h // -2*PREP_BIAS - (1 << intermediate_bits)
42*c0909341SAndroid Build Coastguard Worker        sqsub           \t1\().8h,  \t1\().8h,  v28.8h // -2*PREP_BIAS - (1 << intermediate_bits)
43*c0909341SAndroid Build Coastguard Worker        sshl            \d0\().8h,  \t0\().8h,  v29.8h // -(intermediate_bits+1)
44*c0909341SAndroid Build Coastguard Worker        sshl            \d1\().8h,  \t1\().8h,  v29.8h // -(intermediate_bits+1)
45*c0909341SAndroid Build Coastguard Worker.endm
46*c0909341SAndroid Build Coastguard Worker
// w_avg d0, d1, t0, t1, t2, t3
//
// Weighted blend of 16 halfwords read at [x2] (into t0/t1) with 16
// halfwords read at [x3] (into t2/t3). Both pointers are post-incremented
// by 32 bytes and the 16 clamped results are left in d0/d1.
//
//   d0, d1 - destination vector registers
//   t0-t3  - scratch vector registers; t0/t1 are overwritten with
//            intermediate 32-bit values, t2/t3 keep the second source
//
// Expects these constants to be preloaded (bidir_fn does so for w_avg):
//   v27.4s = 32-bit multiplier in every lane (bidir_fn stores -w6)
//   v28.8h = PREP_BIAS >> intermediate_bits
//   v29.8h = -intermediate_bits
//   v30.8h = 0
//   v31.8h = bitdepth_max
//
// Per lane, in 32-bit precision:
//   x = src2 + (((src2 - src1) * v27) >> 4)
// The low 16 bits of x are then rounding-shifted right by
// intermediate_bits, biased by v28 and clamped to [0, bitdepth_max].
47*c0909341SAndroid Build Coastguard Worker.macro w_avg d0, d1, t0, t1, t2, t3
48*c0909341SAndroid Build Coastguard Worker        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
49*c0909341SAndroid Build Coastguard Worker        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
50*c0909341SAndroid Build Coastguard Worker        // This difference requires a 17 bit range, and all bits are
51*c0909341SAndroid Build Coastguard Worker        // significant for the following multiplication.
52*c0909341SAndroid Build Coastguard Worker        ssubl           \d0\().4s,  \t2\().4h,  \t0\().4h // lanes 0-3:   src2 - src1
53*c0909341SAndroid Build Coastguard Worker        ssubl2          \t0\().4s,  \t2\().8h,  \t0\().8h // lanes 4-7
54*c0909341SAndroid Build Coastguard Worker        ssubl           \d1\().4s,  \t3\().4h,  \t1\().4h // lanes 8-11
55*c0909341SAndroid Build Coastguard Worker        ssubl2          \t1\().4s,  \t3\().8h,  \t1\().8h // lanes 12-15
56*c0909341SAndroid Build Coastguard Worker        mul             \d0\().4s,  \d0\().4s,  v27.4s // scale the difference by the multiplier
57*c0909341SAndroid Build Coastguard Worker        mul             \t0\().4s,  \t0\().4s,  v27.4s
58*c0909341SAndroid Build Coastguard Worker        mul             \d1\().4s,  \d1\().4s,  v27.4s
59*c0909341SAndroid Build Coastguard Worker        mul             \t1\().4s,  \t1\().4s,  v27.4s
60*c0909341SAndroid Build Coastguard Worker        sshr            \d0\().4s,  \d0\().4s,  #4 // arithmetic >> 4, no rounding
61*c0909341SAndroid Build Coastguard Worker        sshr            \t0\().4s,  \t0\().4s,  #4
62*c0909341SAndroid Build Coastguard Worker        sshr            \d1\().4s,  \d1\().4s,  #4
63*c0909341SAndroid Build Coastguard Worker        sshr            \t1\().4s,  \t1\().4s,  #4
64*c0909341SAndroid Build Coastguard Worker        saddw           \d0\().4s,  \d0\().4s,  \t2\().4h // add src2 back, still 32-bit
65*c0909341SAndroid Build Coastguard Worker        saddw2          \t0\().4s,  \t0\().4s,  \t2\().8h
66*c0909341SAndroid Build Coastguard Worker        saddw           \d1\().4s,  \d1\().4s,  \t3\().4h
67*c0909341SAndroid Build Coastguard Worker        saddw2          \t1\().4s,  \t1\().4s,  \t3\().8h
68*c0909341SAndroid Build Coastguard Worker        uzp1            \d0\().8h,  \d0\().8h,  \t0\().8h // Same as xtn, xtn2
69*c0909341SAndroid Build Coastguard Worker        uzp1            \d1\().8h,  \d1\().8h,  \t1\().8h // Ditto
70*c0909341SAndroid Build Coastguard Worker        srshl           \d0\().8h,  \d0\().8h,  v29.8h // -intermediate_bits
71*c0909341SAndroid Build Coastguard Worker        srshl           \d1\().8h,  \d1\().8h,  v29.8h // -intermediate_bits
72*c0909341SAndroid Build Coastguard Worker        add             \d0\().8h,  \d0\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
73*c0909341SAndroid Build Coastguard Worker        add             \d1\().8h,  \d1\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
74*c0909341SAndroid Build Coastguard Worker        smin            \d0\().8h,  \d0\().8h,  v31.8h // bitdepth_max
75*c0909341SAndroid Build Coastguard Worker        smin            \d1\().8h,  \d1\().8h,  v31.8h // bitdepth_max
76*c0909341SAndroid Build Coastguard Worker        smax            \d0\().8h,  \d0\().8h,  v30.8h // 0
77*c0909341SAndroid Build Coastguard Worker        smax            \d1\().8h,  \d1\().8h,  v30.8h // 0
78*c0909341SAndroid Build Coastguard Worker.endm
79*c0909341SAndroid Build Coastguard Worker
// mask d0, d1, t0, t1, t2, t3
//
// Per-pixel blend of 16 halfwords read at [x2] (into t0/t1) with 16
// halfwords read at [x3] (into t2/t3), using 16 byte-sized multipliers
// read at [x6]. x2 and x3 are post-incremented by 32 bytes, x6 by 16
// bytes. The 16 clamped results are left in d0/d1.
//
//   d0, d1 - destination vector registers
//   t0-t3  - scratch vector registers; t0/t1 are overwritten with
//            intermediate 32-bit values, t2/t3 keep the second source
//
// Clobbers v24-v27 regardless of the macro arguments, so none of the
// arguments may name those registers.
//
// Expects these constants to be preloaded (bidir_fn does so for mask):
//   v28.8h = PREP_BIAS >> intermediate_bits
//   v29.8h = -intermediate_bits
//   v30.8h = 0
//   v31.8h = bitdepth_max
//
// Per lane, in 32-bit precision, with m the byte read from [x6]:
//   x = src2 + (((src2 - src1) * -m) >> 6)
// The low 16 bits of x are then rounding-shifted right by
// intermediate_bits, biased by v28 and clamped to [0, bitdepth_max].
80*c0909341SAndroid Build Coastguard Worker.macro mask d0, d1, t0, t1, t2, t3
81*c0909341SAndroid Build Coastguard Worker        ld1             {v27.16b}, [x6],  16 // 16 mask bytes
82*c0909341SAndroid Build Coastguard Worker        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
83*c0909341SAndroid Build Coastguard Worker        neg             v27.16b, v27.16b // -m, so the multiply below flips the sign of the difference
84*c0909341SAndroid Build Coastguard Worker        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
85*c0909341SAndroid Build Coastguard Worker        sxtl            v26.8h,  v27.8b // widen -m: bytes 0-7  -> halfwords
86*c0909341SAndroid Build Coastguard Worker        sxtl2           v27.8h,  v27.16b // widen -m: bytes 8-15 -> halfwords
87*c0909341SAndroid Build Coastguard Worker        sxtl            v24.4s,  v26.4h // v24 = -m for lanes 0-3
88*c0909341SAndroid Build Coastguard Worker        sxtl2           v25.4s,  v26.8h // v25 = -m for lanes 4-7
89*c0909341SAndroid Build Coastguard Worker        sxtl            v26.4s,  v27.4h // v26 = -m for lanes 8-11
90*c0909341SAndroid Build Coastguard Worker        sxtl2           v27.4s,  v27.8h // v27 = -m for lanes 12-15
91*c0909341SAndroid Build Coastguard Worker        ssubl           \d0\().4s,  \t2\().4h,  \t0\().4h // src2 - src1, widened to 32 bits
92*c0909341SAndroid Build Coastguard Worker        ssubl2          \t0\().4s,  \t2\().8h,  \t0\().8h
93*c0909341SAndroid Build Coastguard Worker        ssubl           \d1\().4s,  \t3\().4h,  \t1\().4h
94*c0909341SAndroid Build Coastguard Worker        ssubl2          \t1\().4s,  \t3\().8h,  \t1\().8h
95*c0909341SAndroid Build Coastguard Worker        mul             \d0\().4s,  \d0\().4s,  v24.4s // each group of 4 lanes uses its own -m vector
96*c0909341SAndroid Build Coastguard Worker        mul             \t0\().4s,  \t0\().4s,  v25.4s
97*c0909341SAndroid Build Coastguard Worker        mul             \d1\().4s,  \d1\().4s,  v26.4s
98*c0909341SAndroid Build Coastguard Worker        mul             \t1\().4s,  \t1\().4s,  v27.4s
99*c0909341SAndroid Build Coastguard Worker        sshr            \d0\().4s,  \d0\().4s,  #6 // arithmetic >> 6, no rounding
100*c0909341SAndroid Build Coastguard Worker        sshr            \t0\().4s,  \t0\().4s,  #6
101*c0909341SAndroid Build Coastguard Worker        sshr            \d1\().4s,  \d1\().4s,  #6
102*c0909341SAndroid Build Coastguard Worker        sshr            \t1\().4s,  \t1\().4s,  #6
103*c0909341SAndroid Build Coastguard Worker        saddw           \d0\().4s,  \d0\().4s,  \t2\().4h // add src2 back, still 32-bit
104*c0909341SAndroid Build Coastguard Worker        saddw2          \t0\().4s,  \t0\().4s,  \t2\().8h
105*c0909341SAndroid Build Coastguard Worker        saddw           \d1\().4s,  \d1\().4s,  \t3\().4h
106*c0909341SAndroid Build Coastguard Worker        saddw2          \t1\().4s,  \t1\().4s,  \t3\().8h
107*c0909341SAndroid Build Coastguard Worker        uzp1            \d0\().8h,  \d0\().8h,  \t0\().8h  // Same as xtn, xtn2
108*c0909341SAndroid Build Coastguard Worker        uzp1            \d1\().8h,  \d1\().8h,  \t1\().8h  // Ditto
109*c0909341SAndroid Build Coastguard Worker        srshl           \d0\().8h,  \d0\().8h,  v29.8h // -intermediate_bits
110*c0909341SAndroid Build Coastguard Worker        srshl           \d1\().8h,  \d1\().8h,  v29.8h // -intermediate_bits
111*c0909341SAndroid Build Coastguard Worker        add             \d0\().8h,  \d0\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
112*c0909341SAndroid Build Coastguard Worker        add             \d1\().8h,  \d1\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
113*c0909341SAndroid Build Coastguard Worker        smin            \d0\().8h,  \d0\().8h,  v31.8h // bitdepth_max
114*c0909341SAndroid Build Coastguard Worker        smin            \d1\().8h,  \d1\().8h,  v31.8h // bitdepth_max
115*c0909341SAndroid Build Coastguard Worker        smax            \d0\().8h,  \d0\().8h,  v30.8h // 0
116*c0909341SAndroid Build Coastguard Worker        smax            \d1\().8h,  \d1\().8h,  v30.8h // 0
117*c0909341SAndroid Build Coastguard Worker.endm
118*c0909341SAndroid Build Coastguard Worker
// bidir_fn type, bdmax
//
// Emits the exported function <type>_16bpc_neon and its jump table
// <type>_tbl, where <type> is one of the avg / w_avg / mask macros
// defined earlier in this file.
//
// Macro parameters:
//   type  - name of the 16-pixel worker macro expanded in every loop
//   bdmax - 32-bit register that holds bitdepth_max on entry
//
// Register use visible in this body:
//   x0      - destination pointer, advanced by the stores
//   x1      - destination stride in bytes (doubled for widths 4 and 8,
//             reduced by 128 for width 128)
//   x2, x3  - source pointers consumed by the worker macro
//   w4      - width; clz(w4) - 24 indexes the jump table, so 128 selects
//             entry 0 and 4 selects entry 5
//   w5      - rows remaining
//   w6      - w_avg only: multiplier, negated into v27.4s
//             (the mask worker macro reads its mask bytes through x6)
//   x7      - scratch, then table base / branch target, then a second
//             destination pointer in the 4, 8, 64 and 128 wide loops
//   w8, w9  - scratch (x8 is a fixed 128 byte step in the 128 wide loop)
//   v27-v31 - constants for the worker macro, see the setup below
//
// The worker macro is expanded once before the dispatch, so v4/v5 already
// hold the first 16 results when a loop is entered. Each loop stores,
// tests the row counter, and refills v4/v5 right before branching back.
//
// NOTE(review): function, endfunc, jumptable, endjumptable, movrel and
// AARCH64_VALID_JUMP_TARGET are not defined in this file; presumably
// they come from the included asm.S / util.S - confirm there.
119*c0909341SAndroid Build Coastguard Worker.macro bidir_fn type, bdmax
120*c0909341SAndroid Build Coastguard Workerfunction \type\()_16bpc_neon, export=1
121*c0909341SAndroid Build Coastguard Worker        clz             w4,  w4         // 24 for width 128 ... 29 for width 4
122*c0909341SAndroid Build Coastguard Worker.ifnc \type, avg
123*c0909341SAndroid Build Coastguard Worker        dup             v31.8h,  \bdmax // bitdepth_max
124*c0909341SAndroid Build Coastguard Worker        movi            v30.8h,  #0
125*c0909341SAndroid Build Coastguard Worker.endif
126*c0909341SAndroid Build Coastguard Worker        clz             w7,  \bdmax
127*c0909341SAndroid Build Coastguard Worker        sub             w7,  w7,  #18   // intermediate_bits = clz(bitdepth_max) - 18
128*c0909341SAndroid Build Coastguard Worker.ifc \type, avg
129*c0909341SAndroid Build Coastguard Worker        mov             w9,  #1
130*c0909341SAndroid Build Coastguard Worker        mov             w8,  #-2*PREP_BIAS
131*c0909341SAndroid Build Coastguard Worker        lsl             w9,  w9,  w7    // 1 << intermediate_bits
132*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  #1
133*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w9    // -2*PREP_BIAS - (1 << intermediate_bits)
134*c0909341SAndroid Build Coastguard Worker        neg             w7,  w7         // -(intermediate_bits+1)
135*c0909341SAndroid Build Coastguard Worker        dup             v28.8h,   w8    // -2*PREP_BIAS - (1 << intermediate_bits)
136*c0909341SAndroid Build Coastguard Worker        dup             v29.8h,   w7    // -(intermediate_bits+1)
137*c0909341SAndroid Build Coastguard Worker.else
138*c0909341SAndroid Build Coastguard Worker        mov             w8,  #PREP_BIAS
139*c0909341SAndroid Build Coastguard Worker        lsr             w8,  w8,  w7    // PREP_BIAS >> intermediate_bits
140*c0909341SAndroid Build Coastguard Worker        neg             w7,  w7         // -intermediate_bits
141*c0909341SAndroid Build Coastguard Worker        dup             v28.8h,  w8     // PREP_BIAS >> intermediate_bits
142*c0909341SAndroid Build Coastguard Worker        dup             v29.8h,  w7     // -intermediate_bits
143*c0909341SAndroid Build Coastguard Worker.endif
144*c0909341SAndroid Build Coastguard Worker.ifc \type, w_avg
145*c0909341SAndroid Build Coastguard Worker        dup             v27.4s,  w6     // multiplier taken from w6
146*c0909341SAndroid Build Coastguard Worker        neg             v27.4s,  v27.4s // negated, the worker multiplies (src2 - src1) by it
147*c0909341SAndroid Build Coastguard Worker.endif
148*c0909341SAndroid Build Coastguard Worker        movrel          x7,  \type\()_tbl
149*c0909341SAndroid Build Coastguard Worker        sub             w4,  w4,  #24   // table index = clz(width) - 24
150*c0909341SAndroid Build Coastguard Worker        \type           v4,  v5,  v0,  v1,  v2,  v3 // first 16 results, computed before the dispatch
151*c0909341SAndroid Build Coastguard Worker        ldrsw           x4,  [x7, x4, lsl #2] // signed 32-bit offset relative to the table
152*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  x4
153*c0909341SAndroid Build Coastguard Worker        br              x7
// Width 4: v4/v5 hold four rows of 4 pixels. x0 walks rows 0 and 2,
// x7 walks rows 1 and 3, both with a doubled stride.
154*c0909341SAndroid Build Coastguard Worker40:
155*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
156*c0909341SAndroid Build Coastguard Worker        add             x7,  x0,  x1
157*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
158*c0909341SAndroid Build Coastguard Worker4:
159*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #4
160*c0909341SAndroid Build Coastguard Worker        st1             {v4.8b},    [x0], x1
161*c0909341SAndroid Build Coastguard Worker        st1             {v4.d}[1],  [x7], x1
162*c0909341SAndroid Build Coastguard Worker        st1             {v5.8b},    [x0], x1
163*c0909341SAndroid Build Coastguard Worker        st1             {v5.d}[1],  [x7], x1
164*c0909341SAndroid Build Coastguard Worker        b.le            0f
165*c0909341SAndroid Build Coastguard Worker        \type           v4,  v5,  v0,  v1,  v2,  v3
166*c0909341SAndroid Build Coastguard Worker        b               4b
// Width 8: v4 and v5 are one row each. x0 takes even rows, x7 odd rows.
167*c0909341SAndroid Build Coastguard Worker80:
168*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
169*c0909341SAndroid Build Coastguard Worker        add             x7,  x0,  x1
170*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
171*c0909341SAndroid Build Coastguard Worker8:
172*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h},  [x0], x1
173*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
174*c0909341SAndroid Build Coastguard Worker        st1             {v5.8h},  [x7], x1
175*c0909341SAndroid Build Coastguard Worker        b.le            0f
176*c0909341SAndroid Build Coastguard Worker        \type           v4,  v5,  v0,  v1,  v2,  v3
177*c0909341SAndroid Build Coastguard Worker        b               8b
// Width 16: two rows per iteration (v4/v5, then v6/v7), both through x0.
178*c0909341SAndroid Build Coastguard Worker160:
179*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
180*c0909341SAndroid Build Coastguard Worker16:
181*c0909341SAndroid Build Coastguard Worker        \type           v6,  v7,  v0,  v1,  v2,  v3
182*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h}, [x0], x1
183*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
184*c0909341SAndroid Build Coastguard Worker        st1             {v6.8h, v7.8h}, [x0], x1
185*c0909341SAndroid Build Coastguard Worker        b.le            0f
186*c0909341SAndroid Build Coastguard Worker        \type           v4,  v5,  v0,  v1,  v2,  v3
187*c0909341SAndroid Build Coastguard Worker        b               16b
// Width 32: one row (v4-v7, 64 bytes) per iteration.
188*c0909341SAndroid Build Coastguard Worker320:
189*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
190*c0909341SAndroid Build Coastguard Worker32:
191*c0909341SAndroid Build Coastguard Worker        \type           v6,  v7,  v0,  v1,  v2,  v3
192*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1
193*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
194*c0909341SAndroid Build Coastguard Worker        b.le            0f
195*c0909341SAndroid Build Coastguard Worker        \type           v4,  v5,  v0,  v1,  v2,  v3
196*c0909341SAndroid Build Coastguard Worker        b               32b
// Width 64: one row per iteration; x0 writes the first 64 bytes and
// x7 (= x0 + 64) the second 64 bytes.
197*c0909341SAndroid Build Coastguard Worker640:
198*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
199*c0909341SAndroid Build Coastguard Worker        add             x7,  x0,  #64
200*c0909341SAndroid Build Coastguard Worker64:
201*c0909341SAndroid Build Coastguard Worker        \type           v6,  v7,  v0,  v1,  v2,  v3
202*c0909341SAndroid Build Coastguard Worker        \type           v16, v17, v0,  v1,  v2,  v3
203*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
204*c0909341SAndroid Build Coastguard Worker        \type           v18, v19, v0,  v1,  v2,  v3
205*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1
206*c0909341SAndroid Build Coastguard Worker        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
207*c0909341SAndroid Build Coastguard Worker        b.le            0f
208*c0909341SAndroid Build Coastguard Worker        \type           v4,  v5,  v0,  v1,  v2,  v3
209*c0909341SAndroid Build Coastguard Worker        b               64b
// Width 128: one row (256 bytes) per iteration as four 64 byte stores.
// x0 and x7 first step by x8 = 128 within the row, then by x1, which
// was reduced by 128 so that the net advance per row is one stride.
210*c0909341SAndroid Build Coastguard Worker1280:
211*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
212*c0909341SAndroid Build Coastguard Worker        add             x7,  x0,  #64
213*c0909341SAndroid Build Coastguard Worker        mov             x8,  #128
214*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  #128
215*c0909341SAndroid Build Coastguard Worker128:
216*c0909341SAndroid Build Coastguard Worker        \type           v6,  v7,  v0,  v1,  v2,  v3
217*c0909341SAndroid Build Coastguard Worker        \type           v16, v17, v0,  v1,  v2,  v3
218*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x8
219*c0909341SAndroid Build Coastguard Worker        \type           v18, v19, v0,  v1,  v2,  v3
220*c0909341SAndroid Build Coastguard Worker        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x8
221*c0909341SAndroid Build Coastguard Worker        \type           v4,  v5,  v0,  v1,  v2,  v3
222*c0909341SAndroid Build Coastguard Worker        \type           v6,  v7,  v0,  v1,  v2,  v3
223*c0909341SAndroid Build Coastguard Worker        \type           v16, v17, v0,  v1,  v2,  v3
224*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1
225*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
226*c0909341SAndroid Build Coastguard Worker        \type           v18, v19, v0,  v1,  v2,  v3
227*c0909341SAndroid Build Coastguard Worker        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
228*c0909341SAndroid Build Coastguard Worker        b.le            0f
229*c0909341SAndroid Build Coastguard Worker        \type           v4,  v5,  v0,  v1,  v2,  v3
230*c0909341SAndroid Build Coastguard Worker        b               128b
// Common exit for every width.
231*c0909341SAndroid Build Coastguard Worker0:
232*c0909341SAndroid Build Coastguard Worker        ret
233*c0909341SAndroid Build Coastguard Workerendfunc
234*c0909341SAndroid Build Coastguard Worker
// Table of 32-bit offsets relative to the table itself, ordered from
// width 128 (index 0) down to width 4 (index 5) to match clz(width) - 24.
235*c0909341SAndroid Build Coastguard Workerjumptable \type\()_tbl
236*c0909341SAndroid Build Coastguard Worker        .word 1280b - \type\()_tbl
237*c0909341SAndroid Build Coastguard Worker        .word 640b  - \type\()_tbl
238*c0909341SAndroid Build Coastguard Worker        .word 320b  - \type\()_tbl
239*c0909341SAndroid Build Coastguard Worker        .word 160b  - \type\()_tbl
240*c0909341SAndroid Build Coastguard Worker        .word 80b   - \type\()_tbl
241*c0909341SAndroid Build Coastguard Worker        .word 40b   - \type\()_tbl
242*c0909341SAndroid Build Coastguard Workerendjumptable
243*c0909341SAndroid Build Coastguard Worker.endm
244*c0909341SAndroid Build Coastguard Worker
// Instantiate avg_16bpc_neon, w_avg_16bpc_neon and mask_16bpc_neon.
// The second argument is the register bidir_fn reads bitdepth_max from:
// w6 for avg, w7 for w_avg and mask (where w6/x6 is used by bidir_fn and
// the mask macro for the multiplier and the mask pointer respectively).
245*c0909341SAndroid Build Coastguard Workerbidir_fn avg, w6
246*c0909341SAndroid Build Coastguard Workerbidir_fn w_avg, w7
247*c0909341SAndroid Build Coastguard Workerbidir_fn mask, w7
248*c0909341SAndroid Build Coastguard Worker
249*c0909341SAndroid Build Coastguard Worker
250*c0909341SAndroid Build Coastguard Worker.macro w_mask_fn type
251*c0909341SAndroid Build Coastguard Workerfunction w_mask_\type\()_16bpc_neon, export=1
252*c0909341SAndroid Build Coastguard Worker        ldr             w8,  [sp]
253*c0909341SAndroid Build Coastguard Worker        clz             w9,  w4
254*c0909341SAndroid Build Coastguard Worker        movrel          x10, w_mask_\type\()_tbl
255*c0909341SAndroid Build Coastguard Worker        dup             v31.8h,  w8   // bitdepth_max
256*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #24
257*c0909341SAndroid Build Coastguard Worker        clz             w8,  w8       // clz(bitdepth_max)
258*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x10,  x9,  lsl #2]
259*c0909341SAndroid Build Coastguard Worker        add             x10, x10, x9
260*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
261*c0909341SAndroid Build Coastguard Worker        mov             w9,  #PREP_BIAS*64
262*c0909341SAndroid Build Coastguard Worker        neg             w8,  w8       // -sh
263*c0909341SAndroid Build Coastguard Worker        mov             w11, #27615   // (64 + 1 - 38)<<mask_sh - 1 - mask_rnd
264*c0909341SAndroid Build Coastguard Worker        dup             v30.4s,  w9   // PREP_BIAS*64
265*c0909341SAndroid Build Coastguard Worker        dup             v29.4s,  w8   // -sh
266*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   w11
267*c0909341SAndroid Build Coastguard Worker.if \type == 444
268*c0909341SAndroid Build Coastguard Worker        movi            v1.16b,  #64
269*c0909341SAndroid Build Coastguard Worker.elseif \type == 422
270*c0909341SAndroid Build Coastguard Worker        dup             v2.8b,   w7
271*c0909341SAndroid Build Coastguard Worker        movi            v3.8b,   #129
272*c0909341SAndroid Build Coastguard Worker        sub             v3.8b,   v3.8b,   v2.8b
273*c0909341SAndroid Build Coastguard Worker.elseif \type == 420
274*c0909341SAndroid Build Coastguard Worker        dup             v2.8h,   w7
275*c0909341SAndroid Build Coastguard Worker        movi            v3.8h,   #1, lsl #8
276*c0909341SAndroid Build Coastguard Worker        sub             v3.8h,   v3.8h,   v2.8h
277*c0909341SAndroid Build Coastguard Worker.endif
278*c0909341SAndroid Build Coastguard Worker        add             x12,  x0,  x1
279*c0909341SAndroid Build Coastguard Worker        lsl             x1,   x1,  #1
280*c0909341SAndroid Build Coastguard Worker        br              x10
281*c0909341SAndroid Build Coastguard Worker40:
282*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
283*c0909341SAndroid Build Coastguard Worker4:
284*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
285*c0909341SAndroid Build Coastguard Worker        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
286*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #4
287*c0909341SAndroid Build Coastguard Worker        sabd            v20.8h,  v4.8h,   v6.8h   // abs(tmp1 - tmp2)
288*c0909341SAndroid Build Coastguard Worker        sabd            v21.8h,  v5.8h,   v7.8h
289*c0909341SAndroid Build Coastguard Worker        ssubl           v16.4s,  v6.4h,   v4.4h   // tmp2 - tmp1 (requires 17 bit)
290*c0909341SAndroid Build Coastguard Worker        ssubl2          v17.4s,  v6.8h,   v4.8h
291*c0909341SAndroid Build Coastguard Worker        ssubl           v18.4s,  v7.4h,   v5.4h
292*c0909341SAndroid Build Coastguard Worker        ssubl2          v19.4s,  v7.8h,   v5.8h
293*c0909341SAndroid Build Coastguard Worker        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
294*c0909341SAndroid Build Coastguard Worker        uqsub           v21.8h,  v0.8h,   v21.8h
295*c0909341SAndroid Build Coastguard Worker        sshll2          v7.4s,   v5.8h,   #6      // tmp1 << 6
296*c0909341SAndroid Build Coastguard Worker        sshll           v6.4s,   v5.4h,   #6
297*c0909341SAndroid Build Coastguard Worker        sshll2          v5.4s,   v4.8h,   #6
298*c0909341SAndroid Build Coastguard Worker        sshll           v4.4s,   v4.4h,   #6
299*c0909341SAndroid Build Coastguard Worker        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
300*c0909341SAndroid Build Coastguard Worker        ushr            v21.8h,  v21.8h,  #10
301*c0909341SAndroid Build Coastguard Worker        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
302*c0909341SAndroid Build Coastguard Worker        add             v5.4s,   v5.4s,   v30.4s
303*c0909341SAndroid Build Coastguard Worker        add             v6.4s,   v6.4s,   v30.4s
304*c0909341SAndroid Build Coastguard Worker        add             v7.4s,   v7.4s,   v30.4s
305*c0909341SAndroid Build Coastguard Worker        uxtl            v22.4s,  v20.4h
306*c0909341SAndroid Build Coastguard Worker        uxtl2           v23.4s,  v20.8h
307*c0909341SAndroid Build Coastguard Worker        uxtl            v24.4s,  v21.4h
308*c0909341SAndroid Build Coastguard Worker        uxtl2           v25.4s,  v21.8h
309*c0909341SAndroid Build Coastguard Worker        mla             v4.4s,   v16.4s,  v22.4s  // (tmp2-tmp1)*(64-m)
310*c0909341SAndroid Build Coastguard Worker        mla             v5.4s,   v17.4s,  v23.4s
311*c0909341SAndroid Build Coastguard Worker        mla             v6.4s,   v18.4s,  v24.4s
312*c0909341SAndroid Build Coastguard Worker        mla             v7.4s,   v19.4s,  v25.4s
313*c0909341SAndroid Build Coastguard Worker        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
314*c0909341SAndroid Build Coastguard Worker        srshl           v5.4s,   v5.4s,   v29.4s
315*c0909341SAndroid Build Coastguard Worker        srshl           v6.4s,   v6.4s,   v29.4s
316*c0909341SAndroid Build Coastguard Worker        srshl           v7.4s,   v7.4s,   v29.4s
317*c0909341SAndroid Build Coastguard Worker        sqxtun          v4.4h,   v4.4s            // iclip_pixel
318*c0909341SAndroid Build Coastguard Worker        sqxtun2         v4.8h,   v5.4s
319*c0909341SAndroid Build Coastguard Worker        sqxtun          v5.4h,   v6.4s
320*c0909341SAndroid Build Coastguard Worker        sqxtun2         v5.8h,   v7.4s
321*c0909341SAndroid Build Coastguard Worker        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
322*c0909341SAndroid Build Coastguard Worker        umin            v5.8h,   v5.8h,   v31.8h
323*c0909341SAndroid Build Coastguard Worker.if \type == 444
324*c0909341SAndroid Build Coastguard Worker        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
325*c0909341SAndroid Build Coastguard Worker        sub             v20.16b, v1.16b,  v20.16b // m
326*c0909341SAndroid Build Coastguard Worker        st1             {v20.16b}, [x6], #16
327*c0909341SAndroid Build Coastguard Worker.elseif \type == 422
328*c0909341SAndroid Build Coastguard Worker        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
329*c0909341SAndroid Build Coastguard Worker        xtn             v20.8b,  v20.8h
330*c0909341SAndroid Build Coastguard Worker        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
331*c0909341SAndroid Build Coastguard Worker        st1             {v20.8b}, [x6], #8
332*c0909341SAndroid Build Coastguard Worker.elseif \type == 420
333*c0909341SAndroid Build Coastguard Worker        trn1            v24.2d,  v20.2d,  v21.2d
334*c0909341SAndroid Build Coastguard Worker        trn2            v25.2d,  v20.2d,  v21.2d
335*c0909341SAndroid Build Coastguard Worker        add             v24.8h,  v24.8h,  v25.8h  // (64 - my1) + (64 - my2) (row wise addition)
336*c0909341SAndroid Build Coastguard Worker        addp            v20.8h,  v24.8h,  v24.8h  // (128 - m) + (128 - n) (column wise addition)
337*c0909341SAndroid Build Coastguard Worker        sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
338*c0909341SAndroid Build Coastguard Worker        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
339*c0909341SAndroid Build Coastguard Worker        str             s20,        [x6],  #4
340*c0909341SAndroid Build Coastguard Worker.endif
341*c0909341SAndroid Build Coastguard Worker        st1             {v4.8b},    [x0],  x1
342*c0909341SAndroid Build Coastguard Worker        st1             {v4.d}[1],  [x12], x1
343*c0909341SAndroid Build Coastguard Worker        st1             {v5.8b},    [x0],  x1
344*c0909341SAndroid Build Coastguard Worker        st1             {v5.d}[1],  [x12], x1
345*c0909341SAndroid Build Coastguard Worker        b.gt            4b
346*c0909341SAndroid Build Coastguard Worker        ret
347*c0909341SAndroid Build Coastguard Worker80:
348*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
349*c0909341SAndroid Build Coastguard Worker8:
350*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1
351*c0909341SAndroid Build Coastguard Worker        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2
352*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
353*c0909341SAndroid Build Coastguard Worker        sabd            v20.8h,  v4.8h,   v6.8h   // abs(tmp1 - tmp2)
354*c0909341SAndroid Build Coastguard Worker        sabd            v21.8h,  v5.8h,   v7.8h
355*c0909341SAndroid Build Coastguard Worker        ssubl           v16.4s,  v6.4h,   v4.4h   // tmp2 - tmp1 (requires 17 bit)
356*c0909341SAndroid Build Coastguard Worker        ssubl2          v17.4s,  v6.8h,   v4.8h
357*c0909341SAndroid Build Coastguard Worker        ssubl           v18.4s,  v7.4h,   v5.4h
358*c0909341SAndroid Build Coastguard Worker        ssubl2          v19.4s,  v7.8h,   v5.8h
359*c0909341SAndroid Build Coastguard Worker        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
360*c0909341SAndroid Build Coastguard Worker        uqsub           v21.8h,  v0.8h,   v21.8h
361*c0909341SAndroid Build Coastguard Worker        sshll2          v7.4s,   v5.8h,   #6      // tmp1 << 6
362*c0909341SAndroid Build Coastguard Worker        sshll           v6.4s,   v5.4h,   #6
363*c0909341SAndroid Build Coastguard Worker        sshll2          v5.4s,   v4.8h,   #6
364*c0909341SAndroid Build Coastguard Worker        sshll           v4.4s,   v4.4h,   #6
365*c0909341SAndroid Build Coastguard Worker        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
366*c0909341SAndroid Build Coastguard Worker        ushr            v21.8h,  v21.8h,  #10
367*c0909341SAndroid Build Coastguard Worker        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
368*c0909341SAndroid Build Coastguard Worker        add             v5.4s,   v5.4s,   v30.4s
369*c0909341SAndroid Build Coastguard Worker        add             v6.4s,   v6.4s,   v30.4s
370*c0909341SAndroid Build Coastguard Worker        add             v7.4s,   v7.4s,   v30.4s
371*c0909341SAndroid Build Coastguard Worker        uxtl            v22.4s,  v20.4h
372*c0909341SAndroid Build Coastguard Worker        uxtl2           v23.4s,  v20.8h
373*c0909341SAndroid Build Coastguard Worker        uxtl            v24.4s,  v21.4h
374*c0909341SAndroid Build Coastguard Worker        uxtl2           v25.4s,  v21.8h
375*c0909341SAndroid Build Coastguard Worker        mla             v4.4s,   v16.4s,  v22.4s  // (tmp2-tmp1)*(64-m)
376*c0909341SAndroid Build Coastguard Worker        mla             v5.4s,   v17.4s,  v23.4s
377*c0909341SAndroid Build Coastguard Worker        mla             v6.4s,   v18.4s,  v24.4s
378*c0909341SAndroid Build Coastguard Worker        mla             v7.4s,   v19.4s,  v25.4s
379*c0909341SAndroid Build Coastguard Worker        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
380*c0909341SAndroid Build Coastguard Worker        srshl           v5.4s,   v5.4s,   v29.4s
381*c0909341SAndroid Build Coastguard Worker        srshl           v6.4s,   v6.4s,   v29.4s
382*c0909341SAndroid Build Coastguard Worker        srshl           v7.4s,   v7.4s,   v29.4s
383*c0909341SAndroid Build Coastguard Worker        sqxtun          v4.4h,   v4.4s            // iclip_pixel
384*c0909341SAndroid Build Coastguard Worker        sqxtun2         v4.8h,   v5.4s
385*c0909341SAndroid Build Coastguard Worker        sqxtun          v5.4h,   v6.4s
386*c0909341SAndroid Build Coastguard Worker        sqxtun2         v5.8h,   v7.4s
387*c0909341SAndroid Build Coastguard Worker        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
388*c0909341SAndroid Build Coastguard Worker        umin            v5.8h,   v5.8h,   v31.8h
389*c0909341SAndroid Build Coastguard Worker.if \type == 444
390*c0909341SAndroid Build Coastguard Worker        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
391*c0909341SAndroid Build Coastguard Worker        sub             v20.16b, v1.16b,  v20.16b // m
392*c0909341SAndroid Build Coastguard Worker        st1             {v20.16b}, [x6], #16
393*c0909341SAndroid Build Coastguard Worker.elseif \type == 422
394*c0909341SAndroid Build Coastguard Worker        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
395*c0909341SAndroid Build Coastguard Worker        xtn             v20.8b,  v20.8h
396*c0909341SAndroid Build Coastguard Worker        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
397*c0909341SAndroid Build Coastguard Worker        st1             {v20.8b}, [x6], #8
398*c0909341SAndroid Build Coastguard Worker.elseif \type == 420
399*c0909341SAndroid Build Coastguard Worker        add             v20.8h,  v20.8h,  v21.8h  // (64 - my1) + (64 - my2) (row wise addition)
400*c0909341SAndroid Build Coastguard Worker        addp            v20.8h,  v20.8h,  v20.8h  // (128 - m) + (128 - n) (column wise addition)
401*c0909341SAndroid Build Coastguard Worker        sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
402*c0909341SAndroid Build Coastguard Worker        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
403*c0909341SAndroid Build Coastguard Worker        str             s20,     [x6],  #4
404*c0909341SAndroid Build Coastguard Worker.endif
405*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h}, [x0],  x1
406*c0909341SAndroid Build Coastguard Worker        st1             {v5.8h}, [x12], x1
407*c0909341SAndroid Build Coastguard Worker        b.gt            8b
408*c0909341SAndroid Build Coastguard Worker        ret
409*c0909341SAndroid Build Coastguard Worker1280:
410*c0909341SAndroid Build Coastguard Worker640:
411*c0909341SAndroid Build Coastguard Worker320:
412*c0909341SAndroid Build Coastguard Worker160:
413*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
414*c0909341SAndroid Build Coastguard Worker        mov             w11, w4
415*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  w4,  uxtw #1
416*c0909341SAndroid Build Coastguard Worker.if \type == 444
417*c0909341SAndroid Build Coastguard Worker        add             x10, x6,  w4,  uxtw
418*c0909341SAndroid Build Coastguard Worker.elseif \type == 422
419*c0909341SAndroid Build Coastguard Worker        add             x10, x6,  x11, lsr #1
420*c0909341SAndroid Build Coastguard Worker.endif
421*c0909341SAndroid Build Coastguard Worker        add             x9,  x3,  w4,  uxtw #1
422*c0909341SAndroid Build Coastguard Worker        add             x7,  x2,  w4,  uxtw #1
423*c0909341SAndroid Build Coastguard Worker161:
424*c0909341SAndroid Build Coastguard Worker        mov             w8,  w4
425*c0909341SAndroid Build Coastguard Worker16:
426*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h,   v5.8h},  [x2], #32 // tmp1
427*c0909341SAndroid Build Coastguard Worker        ld1             {v16.8h,  v17.8h}, [x3], #32 // tmp2
428*c0909341SAndroid Build Coastguard Worker        ld1             {v6.8h,   v7.8h},  [x7], #32
429*c0909341SAndroid Build Coastguard Worker        ld1             {v18.8h,  v19.8h}, [x9], #32
430*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #16
431*c0909341SAndroid Build Coastguard Worker        sabd            v20.8h,  v4.8h,   v16.8h  // abs(tmp1 - tmp2)
432*c0909341SAndroid Build Coastguard Worker        sabd            v21.8h,  v5.8h,   v17.8h
433*c0909341SAndroid Build Coastguard Worker        ssubl           v22.4s,  v16.4h,  v4.4h   // tmp2 - tmp1 (requires 17 bit)
434*c0909341SAndroid Build Coastguard Worker        ssubl2          v23.4s,  v16.8h,  v4.8h
435*c0909341SAndroid Build Coastguard Worker        ssubl           v24.4s,  v17.4h,  v5.4h
436*c0909341SAndroid Build Coastguard Worker        ssubl2          v25.4s,  v17.8h,  v5.8h
437*c0909341SAndroid Build Coastguard Worker        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
438*c0909341SAndroid Build Coastguard Worker        uqsub           v21.8h,  v0.8h,   v21.8h
439*c0909341SAndroid Build Coastguard Worker        sshll2          v27.4s,  v5.8h,   #6      // tmp1 << 6
440*c0909341SAndroid Build Coastguard Worker        sshll           v26.4s,  v5.4h,   #6
441*c0909341SAndroid Build Coastguard Worker        sshll2          v5.4s,   v4.8h,   #6
442*c0909341SAndroid Build Coastguard Worker        sshll           v4.4s,   v4.4h,   #6
443*c0909341SAndroid Build Coastguard Worker        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
444*c0909341SAndroid Build Coastguard Worker        ushr            v21.8h,  v21.8h,  #10
445*c0909341SAndroid Build Coastguard Worker        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
446*c0909341SAndroid Build Coastguard Worker        add             v5.4s,   v5.4s,   v30.4s
447*c0909341SAndroid Build Coastguard Worker        add             v26.4s,  v26.4s,  v30.4s
448*c0909341SAndroid Build Coastguard Worker        add             v27.4s,  v27.4s,  v30.4s
449*c0909341SAndroid Build Coastguard Worker        uxtl            v16.4s,  v20.4h
450*c0909341SAndroid Build Coastguard Worker        uxtl2           v17.4s,  v20.8h
451*c0909341SAndroid Build Coastguard Worker        uxtl            v28.4s,  v21.4h
452*c0909341SAndroid Build Coastguard Worker        mla             v4.4s,   v22.4s,  v16.4s  // (tmp2-tmp1)*(64-m)
453*c0909341SAndroid Build Coastguard Worker        uxtl2           v16.4s,  v21.8h
454*c0909341SAndroid Build Coastguard Worker        mla             v5.4s,   v23.4s,  v17.4s
455*c0909341SAndroid Build Coastguard Worker        mla             v26.4s,  v24.4s,  v28.4s
456*c0909341SAndroid Build Coastguard Worker        mla             v27.4s,  v25.4s,  v16.4s
457*c0909341SAndroid Build Coastguard Worker        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
458*c0909341SAndroid Build Coastguard Worker        srshl           v5.4s,   v5.4s,   v29.4s
459*c0909341SAndroid Build Coastguard Worker        srshl           v26.4s,  v26.4s,  v29.4s
460*c0909341SAndroid Build Coastguard Worker        srshl           v27.4s,  v27.4s,  v29.4s
461*c0909341SAndroid Build Coastguard Worker        sqxtun          v4.4h,   v4.4s            // iclip_pixel
462*c0909341SAndroid Build Coastguard Worker        sqxtun2         v4.8h,   v5.4s
463*c0909341SAndroid Build Coastguard Worker        sqxtun          v5.4h,   v26.4s
464*c0909341SAndroid Build Coastguard Worker        sqxtun2         v5.8h,   v27.4s
465*c0909341SAndroid Build Coastguard Worker
466*c0909341SAndroid Build Coastguard Worker        // Start of other half
467*c0909341SAndroid Build Coastguard Worker        sabd            v22.8h,  v6.8h,   v18.8h  // abs(tmp1 - tmp2)
468*c0909341SAndroid Build Coastguard Worker        sabd            v23.8h,  v7.8h,   v19.8h
469*c0909341SAndroid Build Coastguard Worker
470*c0909341SAndroid Build Coastguard Worker        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
471*c0909341SAndroid Build Coastguard Worker        umin            v5.8h,   v5.8h,   v31.8h
472*c0909341SAndroid Build Coastguard Worker
473*c0909341SAndroid Build Coastguard Worker        ssubl           v16.4s,  v18.4h,  v6.4h   // tmp2 - tmp1 (requires 17 bit)
474*c0909341SAndroid Build Coastguard Worker        ssubl2          v17.4s,  v18.8h,  v6.8h
475*c0909341SAndroid Build Coastguard Worker        ssubl           v18.4s,  v19.4h,  v7.4h
476*c0909341SAndroid Build Coastguard Worker        ssubl2          v19.4s,  v19.8h,  v7.8h
477*c0909341SAndroid Build Coastguard Worker        uqsub           v22.8h,  v0.8h,   v22.8h  // 27615 - abs()
478*c0909341SAndroid Build Coastguard Worker        uqsub           v23.8h,  v0.8h,   v23.8h
479*c0909341SAndroid Build Coastguard Worker        sshll           v24.4s,  v6.4h,   #6      // tmp1 << 6
480*c0909341SAndroid Build Coastguard Worker        sshll2          v25.4s,  v6.8h,   #6
481*c0909341SAndroid Build Coastguard Worker        sshll           v26.4s,  v7.4h,   #6
482*c0909341SAndroid Build Coastguard Worker        sshll2          v27.4s,  v7.8h,   #6
483*c0909341SAndroid Build Coastguard Worker        ushr            v22.8h,  v22.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
484*c0909341SAndroid Build Coastguard Worker        ushr            v23.8h,  v23.8h,  #10
485*c0909341SAndroid Build Coastguard Worker        add             v24.4s,  v24.4s,  v30.4s  // += PREP_BIAS*64
486*c0909341SAndroid Build Coastguard Worker        add             v25.4s,  v25.4s,  v30.4s
487*c0909341SAndroid Build Coastguard Worker        add             v26.4s,  v26.4s,  v30.4s
488*c0909341SAndroid Build Coastguard Worker        add             v27.4s,  v27.4s,  v30.4s
489*c0909341SAndroid Build Coastguard Worker        uxtl            v6.4s,   v22.4h
490*c0909341SAndroid Build Coastguard Worker        uxtl2           v7.4s,   v22.8h
491*c0909341SAndroid Build Coastguard Worker        uxtl            v28.4s,  v23.4h
492*c0909341SAndroid Build Coastguard Worker        mla             v24.4s,  v16.4s,  v6.4s   // (tmp2-tmp1)*(64-m)
493*c0909341SAndroid Build Coastguard Worker        uxtl2           v6.4s,   v23.8h
494*c0909341SAndroid Build Coastguard Worker        mla             v25.4s,  v17.4s,  v7.4s
495*c0909341SAndroid Build Coastguard Worker        mla             v26.4s,  v18.4s,  v28.4s
496*c0909341SAndroid Build Coastguard Worker        mla             v27.4s,  v19.4s,  v6.4s
497*c0909341SAndroid Build Coastguard Worker        srshl           v24.4s,  v24.4s,  v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
498*c0909341SAndroid Build Coastguard Worker        srshl           v25.4s,  v25.4s,  v29.4s
499*c0909341SAndroid Build Coastguard Worker        srshl           v26.4s,  v26.4s,  v29.4s
500*c0909341SAndroid Build Coastguard Worker        srshl           v27.4s,  v27.4s,  v29.4s
501*c0909341SAndroid Build Coastguard Worker        sqxtun          v6.4h,   v24.4s           // iclip_pixel
502*c0909341SAndroid Build Coastguard Worker        sqxtun2         v6.8h,   v25.4s
503*c0909341SAndroid Build Coastguard Worker        sqxtun          v7.4h,   v26.4s
504*c0909341SAndroid Build Coastguard Worker        sqxtun2         v7.8h,   v27.4s
505*c0909341SAndroid Build Coastguard Worker        umin            v6.8h,   v6.8h,   v31.8h  // iclip_pixel
506*c0909341SAndroid Build Coastguard Worker        umin            v7.8h,   v7.8h,   v31.8h
507*c0909341SAndroid Build Coastguard Worker.if \type == 444
508*c0909341SAndroid Build Coastguard Worker        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
509*c0909341SAndroid Build Coastguard Worker        uzp1            v21.16b, v22.16b, v23.16b
510*c0909341SAndroid Build Coastguard Worker        sub             v20.16b, v1.16b,  v20.16b // m
511*c0909341SAndroid Build Coastguard Worker        sub             v21.16b, v1.16b,  v21.16b
512*c0909341SAndroid Build Coastguard Worker        st1             {v20.16b}, [x6],  #16
513*c0909341SAndroid Build Coastguard Worker        st1             {v21.16b}, [x10], #16
514*c0909341SAndroid Build Coastguard Worker.elseif \type == 422
515*c0909341SAndroid Build Coastguard Worker        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
516*c0909341SAndroid Build Coastguard Worker        addp            v21.8h,  v22.8h,  v23.8h
517*c0909341SAndroid Build Coastguard Worker        xtn             v20.8b,  v20.8h
518*c0909341SAndroid Build Coastguard Worker        xtn             v21.8b,  v21.8h
519*c0909341SAndroid Build Coastguard Worker        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
520*c0909341SAndroid Build Coastguard Worker        uhsub           v21.8b,  v3.8b,   v21.8b
521*c0909341SAndroid Build Coastguard Worker        st1             {v20.8b}, [x6],  #8
522*c0909341SAndroid Build Coastguard Worker        st1             {v21.8b}, [x10], #8
523*c0909341SAndroid Build Coastguard Worker.elseif \type == 420
524*c0909341SAndroid Build Coastguard Worker        add             v20.8h,  v20.8h,  v22.8h  // (64 - my1) + (64 - my2) (row wise addition)
525*c0909341SAndroid Build Coastguard Worker        add             v21.8h,  v21.8h,  v23.8h
526*c0909341SAndroid Build Coastguard Worker        addp            v20.8h,  v20.8h,  v21.8h  // (128 - m) + (128 - n) (column wise addition)
527*c0909341SAndroid Build Coastguard Worker        sub             v20.8h,  v3.8h,   v20.8h  // (256 - sign) - ((128 - m) + (128 - n))
528*c0909341SAndroid Build Coastguard Worker        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
529*c0909341SAndroid Build Coastguard Worker        st1             {v20.8b}, [x6], #8
530*c0909341SAndroid Build Coastguard Worker.endif
531*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h}, [x0],  #32
532*c0909341SAndroid Build Coastguard Worker        st1             {v6.8h, v7.8h}, [x12], #32
533*c0909341SAndroid Build Coastguard Worker        b.gt            16b
534*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
535*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  w4,  uxtw #1
536*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  w4,  uxtw #1
537*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  w4,  uxtw #1
538*c0909341SAndroid Build Coastguard Worker        add             x9,  x9,  w4,  uxtw #1
539*c0909341SAndroid Build Coastguard Worker.if \type == 444
540*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  w4,  uxtw
541*c0909341SAndroid Build Coastguard Worker        add             x10, x10, w4,  uxtw
542*c0909341SAndroid Build Coastguard Worker.elseif \type == 422
543*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  x11, lsr #1
544*c0909341SAndroid Build Coastguard Worker        add             x10, x10, x11, lsr #1
545*c0909341SAndroid Build Coastguard Worker.endif
546*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1
547*c0909341SAndroid Build Coastguard Worker        add             x12, x12, x1
548*c0909341SAndroid Build Coastguard Worker        b.gt            161b
549*c0909341SAndroid Build Coastguard Worker        ret
550*c0909341SAndroid Build Coastguard Workerendfunc
551*c0909341SAndroid Build Coastguard Worker
// Jump table for the function generated by this macro expansion.
// Each .word holds the offset from the table label to one handler label.
// The backward references (1280b, 640b, ...) bind to the nearest preceding
// definition of each numeric label, i.e. the ones inside the function that
// ends just above. Labels 1280, 640, 320 and 160 are defined back to back
// with no instruction between them, so those four entries hold the same
// offset and share one handler.
// NOTE(review): the code that indexes this table is not part of the visible
// chunk; presumably the index is derived from clz of the width - confirm.
552*c0909341SAndroid Build Coastguard Workerjumptable w_mask_\type\()_tbl
553*c0909341SAndroid Build Coastguard Worker        .word 1280b - w_mask_\type\()_tbl
554*c0909341SAndroid Build Coastguard Worker        .word 640b  - w_mask_\type\()_tbl
555*c0909341SAndroid Build Coastguard Worker        .word 320b  - w_mask_\type\()_tbl
556*c0909341SAndroid Build Coastguard Worker        .word 160b  - w_mask_\type\()_tbl
557*c0909341SAndroid Build Coastguard Worker        .word 80b   - w_mask_\type\()_tbl
558*c0909341SAndroid Build Coastguard Worker        .word 40b   - w_mask_\type\()_tbl
559*c0909341SAndroid Build Coastguard Workerendjumptable
560*c0909341SAndroid Build Coastguard Worker.endm
561*c0909341SAndroid Build Coastguard Worker
// Expand w_mask_fn (presumably the macro closed by the .endm just above) once
// per type value. The macro body compares its type argument against 444, 422
// and 420 to pick the variant-specific code that writes through x6, and each
// expansion defines its own w_mask_<type>_tbl jump table.
562*c0909341SAndroid Build Coastguard Workerw_mask_fn 444
563*c0909341SAndroid Build Coastguard Workerw_mask_fn 422
564*c0909341SAndroid Build Coastguard Workerw_mask_fn 420
565*c0909341SAndroid Build Coastguard Worker
566*c0909341SAndroid Build Coastguard Worker
// blend_16bpc_neon
//
// Blends a contiguous buffer of 16-bit samples into a strided buffer of
// 16-bit samples, in place, using one mask byte per sample.
//
// Per sample, with a = sample read through x0/x8, b = sample read through x2
// and m = mask byte read through x5, the code computes
//     a += ((a - b) * -m + 32) >> 6
// which is (a * (64 - m) + b * m + 32) >> 6.
// sqrdmulh yields (2*x*y + 0x8000) >> 16, so multiplying by (-m << 9) gives
// the rounded shift by 6 in one instruction.
// NOTE(review): this relies on a - b fitting a signed 16-bit lane and on
// -m << 9 fitting in 16 bits (m no larger than 64) - confirm against callers.
//
// Registers as used by the code (roles inferred from usage; the C prototype
// is not visible in this file chunk):
//   x0  in/out  pointer to the first row to blend; rows are updated in place
//   x1  in      byte distance between consecutive rows
//   x2  in      pointer to the b samples, read contiguously
//   w3  in      only used for dispatch: clz(w3) - 26 indexes blend_tbl, so
//               32/16/8/4 select the 320/160/80/40 handlers (presumably the
//               block width - TODO confirm)
//   w4  in      row counter; each loop runs until it is no longer > 0
//   x5  in      pointer to the mask bytes, read contiguously
//
// The 40, 80 and 160 handlers process two rows per iteration (x0 and
// x8 = x0 + x1, both stepped by 2 * x1); the 320 handler processes one.
//
// Modifies: x0, x1, x2, x3, x4, x5, x6, x8, v0-v7, v16-v19, NZCV.
// Does not touch the stack.
567*c0909341SAndroid Build Coastguard Workerfunction blend_16bpc_neon, export=1
        // x6 = address of the jump table
568*c0909341SAndroid Build Coastguard Worker        movrel          x6,  blend_tbl
569*c0909341SAndroid Build Coastguard Worker        clz             w3,  w3                   // count leading zeros of w3
570*c0909341SAndroid Build Coastguard Worker        sub             w3,  w3,  #26             // table index = clz(w3) - 26
571*c0909341SAndroid Build Coastguard Worker        ldrsw           x3,  [x6,  x3,  lsl #2]   // sign-extended 32-bit entry at table + index * 4
572*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  x3              // handler address = table address + entry
573*c0909341SAndroid Build Coastguard Worker        add             x8,  x0,  x1              // x8 = row following the x0 row
574*c0909341SAndroid Build Coastguard Worker        br              x6                        // jump to the selected handler
        // Handler 40: two rows of 4 samples per iteration.
575*c0909341SAndroid Build Coastguard Worker40:
        // Landing point for the indirect branch above (macro defined elsewhere).
576*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
577*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // x0 and x8 each advance two rows per iteration
578*c0909341SAndroid Build Coastguard Worker4:
579*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8b},   [x5], #8       // 8 mask bytes: 4 for each of the two rows
580*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h},   [x2], #16      // b: 4 samples for each of the two rows
581*c0909341SAndroid Build Coastguard Worker        ldr             d0,        [x0]           // a: x0 row into the low half of v0
582*c0909341SAndroid Build Coastguard Worker        neg             v2.8b,   v2.8b            // -m
583*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2              // two rows consumed; flags feed the b.gt below
584*c0909341SAndroid Build Coastguard Worker        ld1             {v0.d}[1], [x8]           // a: x8 row into the high half of v0
585*c0909341SAndroid Build Coastguard Worker        sxtl            v2.8h,   v2.8b            // widen -m to 16 bits
586*c0909341SAndroid Build Coastguard Worker        shl             v2.8h,   v2.8h,   #9      // -m << 9
587*c0909341SAndroid Build Coastguard Worker        sub             v1.8h,   v0.8h,   v1.8h   // a - b
588*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
589*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v1.8h   // a += ((a-b)*-m + 32) >> 6
590*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b},   [x0], x1       // low 8 bytes back to the x0 row, x0 += x1
591*c0909341SAndroid Build Coastguard Worker        st1             {v0.d}[1], [x8], x1       // high 8 bytes back to the x8 row, x8 += x1
592*c0909341SAndroid Build Coastguard Worker        b.gt            4b                        // loop while w4 > 0 (vector ops leave NZCV alone)
593*c0909341SAndroid Build Coastguard Worker        ret
        // Handler 80: two rows of 8 samples per iteration.
594*c0909341SAndroid Build Coastguard Worker80:
595*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
596*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // x0 and x8 each advance two rows per iteration
597*c0909341SAndroid Build Coastguard Worker8:
598*c0909341SAndroid Build Coastguard Worker        ld1             {v4.16b},       [x5], #16 // 16 mask bytes: 8 for each of the two rows
599*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x2], #32 // b: v2 pairs with the x0 row, v3 with the x8 row
600*c0909341SAndroid Build Coastguard Worker        neg             v5.16b,  v4.16b           // -m
601*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h},   [x0]           // a: x0 row
602*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h},   [x8]           // a: x8 row
603*c0909341SAndroid Build Coastguard Worker        sxtl            v4.8h,   v5.8b            // widen -m: low 8 bytes, used with the x0 row
604*c0909341SAndroid Build Coastguard Worker        sxtl2           v5.8h,   v5.16b           // widen -m: high 8 bytes, used with the x8 row
605*c0909341SAndroid Build Coastguard Worker        shl             v4.8h,   v4.8h,   #9      // -m << 9
606*c0909341SAndroid Build Coastguard Worker        shl             v5.8h,   v5.8h,   #9
607*c0909341SAndroid Build Coastguard Worker        sub             v2.8h,   v0.8h,   v2.8h   // a - b
608*c0909341SAndroid Build Coastguard Worker        sub             v3.8h,   v1.8h,   v3.8h
609*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2              // two rows consumed; flags feed the b.gt below
610*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
611*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v3.8h,   v3.8h,   v5.8h
612*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v2.8h   // a += ((a-b)*-m + 32) >> 6
613*c0909341SAndroid Build Coastguard Worker        add             v1.8h,   v1.8h,   v3.8h
614*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h}, [x0], x1         // write back the x0 row, x0 += x1
615*c0909341SAndroid Build Coastguard Worker        st1             {v1.8h}, [x8], x1         // write back the x8 row, x8 += x1
616*c0909341SAndroid Build Coastguard Worker        b.gt            8b                        // loop while w4 > 0
617*c0909341SAndroid Build Coastguard Worker        ret
        // Handler 160: two rows of 16 samples per iteration.
618*c0909341SAndroid Build Coastguard Worker160:
619*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
620*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // x0 and x8 each advance two rows per iteration
621*c0909341SAndroid Build Coastguard Worker16:
622*c0909341SAndroid Build Coastguard Worker        ld1             {v16.16b, v17.16b},           [x5], #32 // 32 mask bytes: 16 for each of the two rows
623*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 // b: v4-v5 pair with the x0 row, v6-v7 with the x8 row
624*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2              // two rows consumed; flags feed the b.gt below
625*c0909341SAndroid Build Coastguard Worker        neg             v18.16b, v16.16b          // -m
626*c0909341SAndroid Build Coastguard Worker        neg             v19.16b, v17.16b
627*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x0]      // a: x0 row
628*c0909341SAndroid Build Coastguard Worker        sxtl            v16.8h,  v18.8b           // widen -m to 16 bits (v16-v17: x0 row)
629*c0909341SAndroid Build Coastguard Worker        sxtl2           v17.8h,  v18.16b
630*c0909341SAndroid Build Coastguard Worker        sxtl            v18.8h,  v19.8b           // widen -m to 16 bits (v18-v19: x8 row)
631*c0909341SAndroid Build Coastguard Worker        sxtl2           v19.8h,  v19.16b
632*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x8]      // a: x8 row
633*c0909341SAndroid Build Coastguard Worker        shl             v16.8h,  v16.8h,  #9      // -m << 9
634*c0909341SAndroid Build Coastguard Worker        shl             v17.8h,  v17.8h,  #9
635*c0909341SAndroid Build Coastguard Worker        shl             v18.8h,  v18.8h,  #9
636*c0909341SAndroid Build Coastguard Worker        shl             v19.8h,  v19.8h,  #9
637*c0909341SAndroid Build Coastguard Worker        sub             v4.8h,   v0.8h,   v4.8h   // a - b
638*c0909341SAndroid Build Coastguard Worker        sub             v5.8h,   v1.8h,   v5.8h
639*c0909341SAndroid Build Coastguard Worker        sub             v6.8h,   v2.8h,   v6.8h
640*c0909341SAndroid Build Coastguard Worker        sub             v7.8h,   v3.8h,   v7.8h
641*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
642*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v5.8h,   v5.8h,   v17.8h
643*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v6.8h,   v6.8h,   v18.8h
644*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v7.8h,   v7.8h,   v19.8h
645*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v4.8h   // a += ((a-b)*-m + 32) >> 6
646*c0909341SAndroid Build Coastguard Worker        add             v1.8h,   v1.8h,   v5.8h
647*c0909341SAndroid Build Coastguard Worker        add             v2.8h,   v2.8h,   v6.8h
648*c0909341SAndroid Build Coastguard Worker        add             v3.8h,   v3.8h,   v7.8h
649*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], x1  // write back the x0 row, x0 += x1
650*c0909341SAndroid Build Coastguard Worker        st1             {v2.8h, v3.8h}, [x8], x1  // write back the x8 row, x8 += x1
651*c0909341SAndroid Build Coastguard Worker        b.gt            16b                       // loop while w4 > 0
652*c0909341SAndroid Build Coastguard Worker        ret
        // Handler 320: one row of 32 samples per iteration. x1 is used
        // unscaled and x8 is not used on this path.
653*c0909341SAndroid Build Coastguard Worker320:
654*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
655*c0909341SAndroid Build Coastguard Worker32:
656*c0909341SAndroid Build Coastguard Worker        ld1             {v16.16b, v17.16b},           [x5], #32 // 32 mask bytes for one row
657*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 // b: 32 samples for one row
658*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #1              // one row consumed; flags feed the b.gt below
659*c0909341SAndroid Build Coastguard Worker        neg             v18.16b, v16.16b          // -m
660*c0909341SAndroid Build Coastguard Worker        neg             v19.16b, v17.16b
661*c0909341SAndroid Build Coastguard Worker        sxtl            v16.8h,  v18.8b           // widen -m to 16 bits
662*c0909341SAndroid Build Coastguard Worker        sxtl2           v17.8h,  v18.16b
663*c0909341SAndroid Build Coastguard Worker        sxtl            v18.8h,  v19.8b
664*c0909341SAndroid Build Coastguard Worker        sxtl2           v19.8h,  v19.16b
665*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0] // a: 32 samples of the x0 row
666*c0909341SAndroid Build Coastguard Worker        shl             v16.8h,  v16.8h,  #9      // -m << 9
667*c0909341SAndroid Build Coastguard Worker        shl             v17.8h,  v17.8h,  #9
668*c0909341SAndroid Build Coastguard Worker        shl             v18.8h,  v18.8h,  #9
669*c0909341SAndroid Build Coastguard Worker        shl             v19.8h,  v19.8h,  #9
670*c0909341SAndroid Build Coastguard Worker        sub             v4.8h,   v0.8h,   v4.8h   // a - b
671*c0909341SAndroid Build Coastguard Worker        sub             v5.8h,   v1.8h,   v5.8h
672*c0909341SAndroid Build Coastguard Worker        sub             v6.8h,   v2.8h,   v6.8h
673*c0909341SAndroid Build Coastguard Worker        sub             v7.8h,   v3.8h,   v7.8h
674*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
675*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v5.8h,   v5.8h,   v17.8h
676*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v6.8h,   v6.8h,   v18.8h
677*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v7.8h,   v7.8h,   v19.8h
678*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v4.8h   // a += ((a-b)*-m + 32) >> 6
679*c0909341SAndroid Build Coastguard Worker        add             v1.8h,   v1.8h,   v5.8h
680*c0909341SAndroid Build Coastguard Worker        add             v2.8h,   v2.8h,   v6.8h
681*c0909341SAndroid Build Coastguard Worker        add             v3.8h,   v3.8h,   v7.8h
682*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 // write back the row, x0 += x1
683*c0909341SAndroid Build Coastguard Worker        b.gt            32b                       // loop while w4 > 0
684*c0909341SAndroid Build Coastguard Worker        ret
685*c0909341SAndroid Build Coastguard Workerendfunc
686*c0909341SAndroid Build Coastguard Worker
// Dispatch table for blend_16bpc_neon.
// Each .word is the offset from blend_tbl to one of that function's handler
// labels (the backward references bind to the nearest preceding 320, 160, 80
// and 40 labels, which are the ones inside blend_16bpc_neon).
// The function reads the entry at index clz(w3) - 26 with ldrsw and adds it
// to the table address, so the entry order must stay
//   index 0 -> 320, 1 -> 160, 2 -> 80, 3 -> 40
// which matches w3 = 32, 16, 8 and 4 respectively.
687*c0909341SAndroid Build Coastguard Workerjumptable blend_tbl
688*c0909341SAndroid Build Coastguard Worker        .word 320b - blend_tbl
689*c0909341SAndroid Build Coastguard Worker        .word 160b - blend_tbl
690*c0909341SAndroid Build Coastguard Worker        .word 80b  - blend_tbl
691*c0909341SAndroid Build Coastguard Worker        .word 40b  - blend_tbl
692*c0909341SAndroid Build Coastguard Workerendjumptable
693*c0909341SAndroid Build Coastguard Worker
694*c0909341SAndroid Build Coastguard Workerfunction blend_h_16bpc_neon, export=1
695*c0909341SAndroid Build Coastguard Worker        movrel          x6,  blend_h_tbl
696*c0909341SAndroid Build Coastguard Worker        movrel          x5,  X(obmc_masks)
697*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  w4,  uxtw
698*c0909341SAndroid Build Coastguard Worker        sub             w4,  w4,  w4,  lsr #2
699*c0909341SAndroid Build Coastguard Worker        clz             w7,  w3
700*c0909341SAndroid Build Coastguard Worker        add             x8,  x0,  x1
701*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
702*c0909341SAndroid Build Coastguard Worker        sub             w7,  w7,  #24
703*c0909341SAndroid Build Coastguard Worker        ldrsw           x7,  [x6,  x7,  lsl #2]
704*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  x7
705*c0909341SAndroid Build Coastguard Worker        br              x6
706*c0909341SAndroid Build Coastguard Worker20:
707*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
708*c0909341SAndroid Build Coastguard Worker2:
709*c0909341SAndroid Build Coastguard Worker        ld2r            {v2.8b, v3.8b}, [x5], #2
710*c0909341SAndroid Build Coastguard Worker        ld1             {v1.4h},        [x2], #8
711*c0909341SAndroid Build Coastguard Worker        ext             v2.8b,   v2.8b,   v3.8b,   #6
712*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
713*c0909341SAndroid Build Coastguard Worker        neg             v2.8b,   v2.8b            // -m
714*c0909341SAndroid Build Coastguard Worker        ldr             s0,        [x0]
715*c0909341SAndroid Build Coastguard Worker        ld1             {v0.s}[1], [x8]
716*c0909341SAndroid Build Coastguard Worker        sxtl            v2.8h,   v2.8b
717*c0909341SAndroid Build Coastguard Worker        shl             v2.4h,   v2.4h,   #9      // -m << 9
718*c0909341SAndroid Build Coastguard Worker        sub             v1.4h,   v0.4h,   v1.4h   // a - b
719*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v1.4h,   v1.4h,   v2.4h   // ((a-b)*-m + 32) >> 6
720*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v1.4h
721*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0], [x0], x1
722*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[1], [x8], x1
723*c0909341SAndroid Build Coastguard Worker        b.gt            2b
724*c0909341SAndroid Build Coastguard Worker        ret
725*c0909341SAndroid Build Coastguard Worker40:
726*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
727*c0909341SAndroid Build Coastguard Worker4:
728*c0909341SAndroid Build Coastguard Worker        ld2r            {v2.8b, v3.8b}, [x5], #2
729*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h},        [x2], #16
730*c0909341SAndroid Build Coastguard Worker        ext             v2.8b,   v2.8b,   v3.8b,   #4
731*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
732*c0909341SAndroid Build Coastguard Worker        neg             v2.8b,   v2.8b            // -m
733*c0909341SAndroid Build Coastguard Worker        ldr             d0,          [x0]
734*c0909341SAndroid Build Coastguard Worker        ld1             {v0.d}[1],   [x8]
735*c0909341SAndroid Build Coastguard Worker        sxtl            v2.8h,   v2.8b
736*c0909341SAndroid Build Coastguard Worker        shl             v2.8h,   v2.8h,   #9      // -m << 9
737*c0909341SAndroid Build Coastguard Worker        sub             v1.8h,   v0.8h,   v1.8h   // a - b
738*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
739*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v1.8h
740*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b},   [x0], x1
741*c0909341SAndroid Build Coastguard Worker        st1             {v0.d}[1], [x8], x1
742*c0909341SAndroid Build Coastguard Worker        b.gt            4b
743*c0909341SAndroid Build Coastguard Worker        ret
744*c0909341SAndroid Build Coastguard Worker80:
745*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
746*c0909341SAndroid Build Coastguard Worker8:
747*c0909341SAndroid Build Coastguard Worker        ld2r            {v4.8b, v5.8b}, [x5], #2
748*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x2], #32
749*c0909341SAndroid Build Coastguard Worker        neg             v4.8b,   v4.8b            // -m
750*c0909341SAndroid Build Coastguard Worker        neg             v5.8b,   v5.8b
751*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h}, [x0]
752*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
753*c0909341SAndroid Build Coastguard Worker        sxtl            v4.8h,   v4.8b
754*c0909341SAndroid Build Coastguard Worker        sxtl            v5.8h,   v5.8b
755*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h}, [x8]
756*c0909341SAndroid Build Coastguard Worker        shl             v4.8h,   v4.8h,   #9      // -m << 9
757*c0909341SAndroid Build Coastguard Worker        shl             v5.8h,   v5.8h,   #9
758*c0909341SAndroid Build Coastguard Worker        sub             v2.8h,   v0.8h,   v2.8h   // a - b
759*c0909341SAndroid Build Coastguard Worker        sub             v3.8h,   v1.8h,   v3.8h
760*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
761*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v3.8h,   v3.8h,   v5.8h
762*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v2.8h
763*c0909341SAndroid Build Coastguard Worker        add             v1.8h,   v1.8h,   v3.8h
764*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h}, [x0], x1
765*c0909341SAndroid Build Coastguard Worker        st1             {v1.8h}, [x8], x1
766*c0909341SAndroid Build Coastguard Worker        b.gt            8b
767*c0909341SAndroid Build Coastguard Worker        ret
768*c0909341SAndroid Build Coastguard Worker160:
769*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
770*c0909341SAndroid Build Coastguard Worker16:
771*c0909341SAndroid Build Coastguard Worker        ld2r            {v16.8b, v17.8b}, [x5], #2
772*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
773*c0909341SAndroid Build Coastguard Worker        neg             v16.8b,  v16.8b           // -m
774*c0909341SAndroid Build Coastguard Worker        neg             v17.8b,  v17.8b
775*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h},  [x0]
776*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h},  [x8]
777*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
778*c0909341SAndroid Build Coastguard Worker        sxtl            v16.8h,  v16.8b
779*c0909341SAndroid Build Coastguard Worker        sxtl            v17.8h,  v17.8b
780*c0909341SAndroid Build Coastguard Worker        shl             v16.8h,  v16.8h,  #9      // -m << 9
781*c0909341SAndroid Build Coastguard Worker        shl             v17.8h,  v17.8h,  #9
782*c0909341SAndroid Build Coastguard Worker        sub             v4.8h,   v0.8h,   v4.8h   // a - b
783*c0909341SAndroid Build Coastguard Worker        sub             v5.8h,   v1.8h,   v5.8h
784*c0909341SAndroid Build Coastguard Worker        sub             v6.8h,   v2.8h,   v6.8h
785*c0909341SAndroid Build Coastguard Worker        sub             v7.8h,   v3.8h,   v7.8h
786*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
787*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v5.8h,   v5.8h,   v16.8h
788*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v6.8h,   v6.8h,   v17.8h
789*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v7.8h,   v7.8h,   v17.8h
790*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v4.8h
791*c0909341SAndroid Build Coastguard Worker        add             v1.8h,   v1.8h,   v5.8h
792*c0909341SAndroid Build Coastguard Worker        add             v2.8h,   v2.8h,   v6.8h
793*c0909341SAndroid Build Coastguard Worker        add             v3.8h,   v3.8h,   v7.8h
794*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], x1
795*c0909341SAndroid Build Coastguard Worker        st1             {v2.8h, v3.8h}, [x8], x1
796*c0909341SAndroid Build Coastguard Worker        b.gt            16b
797*c0909341SAndroid Build Coastguard Worker        ret
798*c0909341SAndroid Build Coastguard Worker1280:
799*c0909341SAndroid Build Coastguard Worker640:
800*c0909341SAndroid Build Coastguard Worker320:
801*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
802*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  w3,  uxtw #1
803*c0909341SAndroid Build Coastguard Worker        add             x7,  x2,  w3,  uxtw #1
804*c0909341SAndroid Build Coastguard Worker321:
805*c0909341SAndroid Build Coastguard Worker        ld2r            {v24.8b, v25.8b}, [x5], #2
806*c0909341SAndroid Build Coastguard Worker        mov             w6,  w3
807*c0909341SAndroid Build Coastguard Worker        neg             v24.8b,  v24.8b           // -m
808*c0909341SAndroid Build Coastguard Worker        neg             v25.8b,  v25.8b
809*c0909341SAndroid Build Coastguard Worker        sxtl            v24.8h,  v24.8b
810*c0909341SAndroid Build Coastguard Worker        sxtl            v25.8h,  v25.8b
811*c0909341SAndroid Build Coastguard Worker        shl             v24.8h,  v24.8h,  #9      // -m << 9
812*c0909341SAndroid Build Coastguard Worker        shl             v25.8h,  v25.8h,  #9
813*c0909341SAndroid Build Coastguard Worker32:
814*c0909341SAndroid Build Coastguard Worker        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
815*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0]
816*c0909341SAndroid Build Coastguard Worker        subs            w6,  w6,  #32
817*c0909341SAndroid Build Coastguard Worker        sub             v16.8h,  v0.8h,   v16.8h  // a - b
818*c0909341SAndroid Build Coastguard Worker        sub             v17.8h,  v1.8h,   v17.8h
819*c0909341SAndroid Build Coastguard Worker        sub             v18.8h,  v2.8h,   v18.8h
820*c0909341SAndroid Build Coastguard Worker        sub             v19.8h,  v3.8h,   v19.8h
821*c0909341SAndroid Build Coastguard Worker        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
822*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x8]
823*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v16.8h,  v16.8h,  v24.8h  // ((a-b)*-m + 32) >> 6
824*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v17.8h,  v17.8h,  v24.8h
825*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v18.8h,  v18.8h,  v24.8h
826*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v19.8h,  v19.8h,  v24.8h
827*c0909341SAndroid Build Coastguard Worker        sub             v20.8h,  v4.8h,   v20.8h  // a - b
828*c0909341SAndroid Build Coastguard Worker        sub             v21.8h,  v5.8h,   v21.8h
829*c0909341SAndroid Build Coastguard Worker        sub             v22.8h,  v6.8h,   v22.8h
830*c0909341SAndroid Build Coastguard Worker        sub             v23.8h,  v7.8h,   v23.8h
831*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v16.8h
832*c0909341SAndroid Build Coastguard Worker        add             v1.8h,   v1.8h,   v17.8h
833*c0909341SAndroid Build Coastguard Worker        add             v2.8h,   v2.8h,   v18.8h
834*c0909341SAndroid Build Coastguard Worker        add             v3.8h,   v3.8h,   v19.8h
835*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v20.8h,  v20.8h,  v25.8h  // ((a-b)*-m + 32) >> 6
836*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v21.8h,  v21.8h,  v25.8h
837*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v22.8h,  v22.8h,  v25.8h
838*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v23.8h,  v23.8h,  v25.8h
839*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], #64
840*c0909341SAndroid Build Coastguard Worker        add             v4.8h,   v4.8h,   v20.8h
841*c0909341SAndroid Build Coastguard Worker        add             v5.8h,   v5.8h,   v21.8h
842*c0909341SAndroid Build Coastguard Worker        add             v6.8h,   v6.8h,   v22.8h
843*c0909341SAndroid Build Coastguard Worker        add             v7.8h,   v7.8h,   v23.8h
844*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x8], #64
845*c0909341SAndroid Build Coastguard Worker        b.gt            32b
846*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
847*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1
848*c0909341SAndroid Build Coastguard Worker        add             x8,  x8,  x1
849*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  w3,  uxtw #1
850*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  w3,  uxtw #1
851*c0909341SAndroid Build Coastguard Worker        b.gt            321b
852*c0909341SAndroid Build Coastguard Worker        ret
853*c0909341SAndroid Build Coastguard Workerendfunc
854*c0909341SAndroid Build Coastguard Worker
// Dispatch table for the preceding blend_h function: each entry is the 32-bit
// offset from the table base to one per-width entry label.
// The dispatch code just before label 20 computes clz(w) - 24, loads the
// sign-extended word at that index and adds it to x6 (presumably this table's
// address - confirm against the function prologue), so the widest block comes
// first: w = 128 -> index 0 ... w = 2 -> index 6.
// Labels 1280, 640 and 320 are defined at the same address; those three widths
// share one loop.
// The numeric `Nb` references resolve to the closest preceding definition of
// each label, so this table must stay after that function and before any later
// redefinition of these numeric labels.
jumptable blend_h_tbl
        .word 1280b - blend_h_tbl
        .word 640b  - blend_h_tbl
        .word 320b  - blend_h_tbl
        .word 160b  - blend_h_tbl
        .word 80b   - blend_h_tbl
        .word 40b   - blend_h_tbl
        .word 20b   - blend_h_tbl
endjumptable
864*c0909341SAndroid Build Coastguard Worker
// blend_v_16bpc_neon: blend a temporary block into dst using per-column
// weights taken from the obmc_masks table, two rows per loop iteration.
//
// Register use, as read from the code below:
//   x0 = dst, first row of each row pair (read and written)
//   x1 = dst stride in bytes on entry; doubled below so that x0 and x8 each
//        advance two rows per iteration
//   x2 = tmp, the second blend input; consumed sequentially, w pixels per row
//   w3 = block width w; selects the mask row (obmc_masks + w) and the
//        per-width loop (jump table index clz(w) - 26)
//   w4 = block height h; decremented by 2 per iteration, loop runs while > 0
//   x5 = mask pointer, x6 = jump table base / branch target,
//   x8 = dst + stride (second row of each row pair)
//
// For every stored 16-bit pixel:  dst = a + (((a - b) * -m + 32) >> 6)
// with a = dst pixel, b = tmp pixel, m = mask byte of that column.
// sqrdmulh evaluates (2 * x * y + 0x8000) >> 16, so pre-shifting -m left by 9
// yields the rounding shift by 6 in a single instruction.
//
// Only the left 3/4 of each row is written back (1, 3, 6, 12 or 24 pixels for
// w = 2, 4, 8, 16 or 32); tmp is still advanced by the full w pixels per row.
function blend_v_16bpc_neon, export=1
        movrel          x6,  blend_v_tbl
        movrel          x5,  X(obmc_masks)
        add             x5,  x5,  w3,  uxtw       // mask row for this width starts at obmc_masks + w
        clz             w3,  w3
        add             x8,  x0,  x1              // x8 = second row of the pair
        lsl             x1,  x1,  #1              // step two rows at a time
        sub             w3,  w3,  #26             // w = 32 -> 0 ... w = 2 -> 4
        ldrsw           x3,  [x6,  x3,  lsl #2]   // signed offset relative to the table base
        add             x6,  x6,  x3
        br              x6
20:
        // w == 2: one pixel per row is blended and stored.
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v2.8b}, [x5]
        neg             v2.8b,   v2.8b            // -m
        sxtl            v2.8h,   v2.8b
        shl             v2.4h,   v2.4h,   #9      // -m << 9
2:
        ldr             s1,  [x2],  #4            // tmp row 0 (two pixels); x2 -> tmp row 1
        ldr             h0,  [x0]
        subs            w4,  w4,  #2
        ld1             {v1.h}[1], [x2]           // lane 1 = first pixel of tmp row 1
        ld1             {v0.h}[1], [x8]           // lane 1 = first pixel of dst row 1
        add             x2,  x2,  #4              // skip the rest of tmp row 1
        sub             v1.4h,   v0.4h,   v1.4h   // a - b
        sqrdmulh        v1.4h,   v1.4h,   v2.4h   // ((a-b)*-m + 32) >> 6
        add             v0.4h,   v0.4h,   v1.4h
        st1             {v0.h}[0], [x0],  x1
        st1             {v0.h}[1], [x8],  x1
        b.gt            2b
        ret
40:
        // w == 4: three pixels per row are stored (2 + 1).
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v2.2s}, [x5]             // 4 mask bytes, replicated for both rows
        sub             x1,  x1,  #4              // compensate for the #4 post-increment of the first store
        neg             v2.8b,   v2.8b            // -m
        sxtl            v2.8h,   v2.8b
        shl             v2.8h,   v2.8h,   #9      // -m << 9
4:
        ld1             {v1.8h},   [x2], #16      // tmp rows 0 and 1
        ldr             d0,        [x0]
        ld1             {v0.d}[1], [x8]           // low half = dst row 0, high half = dst row 1
        subs            w4,  w4,  #2
        sub             v1.8h,   v0.8h,   v1.8h   // a - b
        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
        add             v0.8h,   v0.8h,   v1.8h
        str             s0,        [x0], #4       // row 0, pixels 0-1
        st1             {v0.s}[2], [x8], #4       // row 1, pixels 0-1
        st1             {v0.h}[2], [x0], x1       // row 0, pixel 2
        st1             {v0.h}[6], [x8], x1       // row 1, pixel 2
        b.gt            4b
        ret
80:
        // w == 8: six pixels per row are stored (4 + 2).
        AARCH64_VALID_JUMP_TARGET
        ld1             {v4.8b}, [x5]
        sub             x1,  x1,  #8              // compensate for the #8 post-increment of the first store
        neg             v4.8b,   v4.8b            // -m
        sxtl            v4.8h,   v4.8b
        shl             v4.8h,   v4.8h,   #9      // -m << 9
8:
        ld1             {v2.8h, v3.8h}, [x2], #32 // v2 = tmp row 0, v3 = tmp row 1
        ld1             {v0.8h}, [x0]
        ld1             {v1.8h}, [x8]
        subs            w4,  w4,  #2
        sub             v2.8h,   v0.8h,   v2.8h   // a - b
        sub             v3.8h,   v1.8h,   v3.8h
        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
        sqrdmulh        v3.8h,   v3.8h,   v4.8h
        add             v0.8h,   v0.8h,   v2.8h
        add             v1.8h,   v1.8h,   v3.8h
        str             d0,        [x0], #8       // pixels 0-3
        str             d1,        [x8], #8
        st1             {v0.s}[2], [x0], x1       // pixels 4-5
        st1             {v1.s}[2], [x8], x1
        b.gt            8b
        ret
160:
        // w == 16: twelve pixels per row are stored (8 + 4), so the second
        // half of each row is only processed as a .4h vector.
        AARCH64_VALID_JUMP_TARGET
        ld1             {v16.16b}, [x5]
        sub             x1,  x1,  #16             // compensate for the #16 post-increment of the first store
        neg             v17.16b, v16.16b          // -m
        sxtl            v16.8h,  v17.8b
        sxtl2           v17.8h,  v17.16b
        shl             v16.8h,  v16.8h,  #9      // -m << 9
        shl             v17.4h,  v17.4h,  #9
16:
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64 // v4-v5 = tmp row 0, v6-v7 = tmp row 1
        ld1             {v0.8h, v1.8h}, [x0]
        subs            w4,  w4,  #2
        ld1             {v2.8h, v3.8h}, [x8]
        sub             v4.8h,   v0.8h,   v4.8h   // a - b
        sub             v5.4h,   v1.4h,   v5.4h
        sub             v6.8h,   v2.8h,   v6.8h
        sub             v7.4h,   v3.4h,   v7.4h
        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.4h,   v5.4h,   v17.4h
        sqrdmulh        v6.8h,   v6.8h,   v16.8h
        sqrdmulh        v7.4h,   v7.4h,   v17.4h
        add             v0.8h,   v0.8h,   v4.8h
        add             v1.4h,   v1.4h,   v5.4h
        add             v2.8h,   v2.8h,   v6.8h
        add             v3.4h,   v3.4h,   v7.4h
        st1             {v0.8h}, [x0], #16        // pixels 0-7
        st1             {v2.8h}, [x8], #16
        st1             {v1.4h}, [x0], x1         // pixels 8-11
        st1             {v3.4h}, [x8], x1
        b.gt            16b
        ret
320:
        // w == 32: 24 pixels per row are stored; only the first 24 of the 32
        // loaded mask bytes are used, and v19/v23 (last quarter of each tmp
        // row) are loaded but never read.
        AARCH64_VALID_JUMP_TARGET
        ld1             {v24.16b, v25.16b},  [x5]
        neg             v26.16b, v24.16b          // -m
        neg             v27.8b,  v25.8b
        sxtl            v24.8h,  v26.8b
        sxtl2           v25.8h,  v26.16b
        sxtl            v26.8h,  v27.8b
        shl             v24.8h,  v24.8h,  #9      // -m << 9
        shl             v25.8h,  v25.8h,  #9
        shl             v26.8h,  v26.8h,  #9
32:
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64 // tmp row 0
        ld1             {v0.8h, v1.8h, v2.8h}, [x0]
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64 // tmp row 1
        ld1             {v4.8h, v5.8h, v6.8h}, [x8]
        subs            w4,  w4,  #2
        sub             v16.8h,  v0.8h,   v16.8h  // a - b
        sub             v17.8h,  v1.8h,   v17.8h
        sub             v18.8h,  v2.8h,   v18.8h
        sub             v20.8h,  v4.8h,   v20.8h
        sub             v21.8h,  v5.8h,   v21.8h
        sub             v22.8h,  v6.8h,   v22.8h
        sqrdmulh        v16.8h,  v16.8h,  v24.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v17.8h,  v17.8h,  v25.8h
        sqrdmulh        v18.8h,  v18.8h,  v26.8h
        sqrdmulh        v20.8h,  v20.8h,  v24.8h
        sqrdmulh        v21.8h,  v21.8h,  v25.8h
        sqrdmulh        v22.8h,  v22.8h,  v26.8h
        add             v0.8h,   v0.8h,   v16.8h
        add             v1.8h,   v1.8h,   v17.8h
        add             v2.8h,   v2.8h,   v18.8h
        add             v4.8h,   v4.8h,   v20.8h
        add             v5.8h,   v5.8h,   v21.8h
        add             v6.8h,   v6.8h,   v22.8h
        st1             {v0.8h, v1.8h, v2.8h}, [x0], x1
        st1             {v4.8h, v5.8h, v6.8h}, [x8], x1
        b.gt            32b
        ret
endfunc
1013*c0909341SAndroid Build Coastguard Worker
// Dispatch table for blend_v_16bpc_neon: each entry is the 32-bit offset from
// the table base to one per-width entry label. The function indexes it with
// clz(w) - 26, so the widest block comes first: w = 32 -> index 0 ...
// w = 2 -> index 4.
// The numeric `Nb` references resolve to the closest preceding definition of
// each label, so this table must stay after blend_v_16bpc_neon and before any
// later redefinition of these numeric labels.
jumptable blend_v_tbl
        .word 320b - blend_v_tbl
        .word 160b - blend_v_tbl
        .word 80b  - blend_v_tbl
        .word 40b  - blend_v_tbl
        .word 20b  - blend_v_tbl
endjumptable
1021*c0909341SAndroid Build Coastguard Worker
1022*c0909341SAndroid Build Coastguard Worker
// This has the same signature as the put_8tap functions,
// and assumes that x9 is set to (clz(w)-24).
1025*c0909341SAndroid Build Coastguard Workerfunction put_16bpc_neon, export=1
1026*c0909341SAndroid Build Coastguard Worker        movrel          x10, put_16bpc_tbl
1027*c0909341SAndroid Build Coastguard Worker        ldrsw           x9, [x10, x9, lsl #2]
1028*c0909341SAndroid Build Coastguard Worker        add             x10, x10, x9
1029*c0909341SAndroid Build Coastguard Worker        br              x10
1030*c0909341SAndroid Build Coastguard Worker
1031*c0909341SAndroid Build Coastguard Worker20:
1032*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1033*c0909341SAndroid Build Coastguard Worker2:
1034*c0909341SAndroid Build Coastguard Worker        ld1r            {v0.4s},   [x2], x3
1035*c0909341SAndroid Build Coastguard Worker        ld1r            {v1.4s},   [x2], x3
1036*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
1037*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0], [x0], x1
1038*c0909341SAndroid Build Coastguard Worker        st1             {v1.s}[0], [x0], x1
1039*c0909341SAndroid Build Coastguard Worker        b.gt            2b
1040*c0909341SAndroid Build Coastguard Worker        ret
1041*c0909341SAndroid Build Coastguard Worker40:
1042*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1043*c0909341SAndroid Build Coastguard Worker4:
1044*c0909341SAndroid Build Coastguard Worker        ld1             {v0.4h}, [x2], x3
1045*c0909341SAndroid Build Coastguard Worker        ld1             {v1.4h}, [x2], x3
1046*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
1047*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h}, [x0], x1
1048*c0909341SAndroid Build Coastguard Worker        st1             {v1.4h}, [x0], x1
1049*c0909341SAndroid Build Coastguard Worker        b.gt            4b
1050*c0909341SAndroid Build Coastguard Worker        ret
1051*c0909341SAndroid Build Coastguard Worker80:
1052*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1053*c0909341SAndroid Build Coastguard Worker        add             x8,  x0,  x1
1054*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
1055*c0909341SAndroid Build Coastguard Worker        add             x9,  x2,  x3
1056*c0909341SAndroid Build Coastguard Worker        lsl             x3,  x3,  #1
1057*c0909341SAndroid Build Coastguard Worker8:
1058*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h}, [x2], x3
1059*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h}, [x9], x3
1060*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
1061*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h}, [x0], x1
1062*c0909341SAndroid Build Coastguard Worker        st1             {v1.8h}, [x8], x1
1063*c0909341SAndroid Build Coastguard Worker        b.gt            8b
1064*c0909341SAndroid Build Coastguard Worker        ret
1065*c0909341SAndroid Build Coastguard Worker160:
1066*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1067*c0909341SAndroid Build Coastguard Worker16:
1068*c0909341SAndroid Build Coastguard Worker        ldp             x6,  x7,  [x2]
1069*c0909341SAndroid Build Coastguard Worker        ldp             x8,  x9,  [x2, #16]
1070*c0909341SAndroid Build Coastguard Worker        stp             x6,  x7,  [x0]
1071*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1
1072*c0909341SAndroid Build Coastguard Worker        stp             x8,  x9,  [x0, #16]
1073*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  x3
1074*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1
1075*c0909341SAndroid Build Coastguard Worker        b.gt            16b
1076*c0909341SAndroid Build Coastguard Worker        ret
1077*c0909341SAndroid Build Coastguard Worker320:
1078*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1079*c0909341SAndroid Build Coastguard Worker32:
1080*c0909341SAndroid Build Coastguard Worker        ldp             x6,  x7,  [x2]
1081*c0909341SAndroid Build Coastguard Worker        ldp             x8,  x9,  [x2, #16]
1082*c0909341SAndroid Build Coastguard Worker        stp             x6,  x7,  [x0]
1083*c0909341SAndroid Build Coastguard Worker        ldp             x10, x11, [x2, #32]
1084*c0909341SAndroid Build Coastguard Worker        stp             x8,  x9,  [x0, #16]
1085*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1
1086*c0909341SAndroid Build Coastguard Worker        ldp             x12, x13, [x2, #48]
1087*c0909341SAndroid Build Coastguard Worker        stp             x10, x11, [x0, #32]
1088*c0909341SAndroid Build Coastguard Worker        stp             x12, x13, [x0, #48]
1089*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  x3
1090*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1
1091*c0909341SAndroid Build Coastguard Worker        b.gt            32b
1092*c0909341SAndroid Build Coastguard Worker        ret
1093*c0909341SAndroid Build Coastguard Worker640:
1094*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1095*c0909341SAndroid Build Coastguard Worker64:
1096*c0909341SAndroid Build Coastguard Worker        ldp             q0,  q1,  [x2]
1097*c0909341SAndroid Build Coastguard Worker        ldp             q2,  q3,  [x2, #32]
1098*c0909341SAndroid Build Coastguard Worker        stp             q0,  q1,  [x0]
1099*c0909341SAndroid Build Coastguard Worker        ldp             q4,  q5,  [x2, #64]
1100*c0909341SAndroid Build Coastguard Worker        stp             q2,  q3,  [x0, #32]
1101*c0909341SAndroid Build Coastguard Worker        ldp             q6,  q7,  [x2, #96]
1102*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1
1103*c0909341SAndroid Build Coastguard Worker        stp             q4,  q5,  [x0, #64]
1104*c0909341SAndroid Build Coastguard Worker        stp             q6,  q7,  [x0, #96]
1105*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  x3
1106*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1
1107*c0909341SAndroid Build Coastguard Worker        b.gt            64b
1108*c0909341SAndroid Build Coastguard Worker        ret
1109*c0909341SAndroid Build Coastguard Worker1280:
1110*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1111*c0909341SAndroid Build Coastguard Worker128:
1112*c0909341SAndroid Build Coastguard Worker        ldp             q0,  q1,  [x2]
1113*c0909341SAndroid Build Coastguard Worker        ldp             q2,  q3,  [x2, #32]
1114*c0909341SAndroid Build Coastguard Worker        stp             q0,  q1,  [x0]
1115*c0909341SAndroid Build Coastguard Worker        ldp             q4,  q5,  [x2, #64]
1116*c0909341SAndroid Build Coastguard Worker        stp             q2,  q3,  [x0, #32]
1117*c0909341SAndroid Build Coastguard Worker        ldp             q6,  q7,  [x2, #96]
1118*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1
1119*c0909341SAndroid Build Coastguard Worker        stp             q4,  q5,  [x0, #64]
1120*c0909341SAndroid Build Coastguard Worker        ldp             q16, q17, [x2, #128]
1121*c0909341SAndroid Build Coastguard Worker        stp             q6,  q7,  [x0, #96]
1122*c0909341SAndroid Build Coastguard Worker        ldp             q18, q19, [x2, #160]
1123*c0909341SAndroid Build Coastguard Worker        stp             q16, q17, [x0, #128]
1124*c0909341SAndroid Build Coastguard Worker        ldp             q20, q21, [x2, #192]
1125*c0909341SAndroid Build Coastguard Worker        stp             q18, q19, [x0, #160]
1126*c0909341SAndroid Build Coastguard Worker        ldp             q22, q23, [x2, #224]
1127*c0909341SAndroid Build Coastguard Worker        stp             q20, q21, [x0, #192]
1128*c0909341SAndroid Build Coastguard Worker        stp             q22, q23, [x0, #224]
1129*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  x3
1130*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1
1131*c0909341SAndroid Build Coastguard Worker        b.gt            128b
1132*c0909341SAndroid Build Coastguard Worker        ret
1133*c0909341SAndroid Build Coastguard Workerendfunc
1134*c0909341SAndroid Build Coastguard Worker
// Dispatch table for the function that ends directly above. Each entry is a
// 32-bit offset from the start of this table to one of that function's
// width-specific entry labels, listed from label 1280 down to label 20.
// The "b" suffix resolves to the nearest preceding definition of the numeric
// label, so the table has to stay immediately after the function whose labels
// it references.
// NOTE(review): the code that indexes this table is outside this excerpt;
// presumably the index is derived from clz(w) - confirm against the caller.
1135*c0909341SAndroid Build Coastguard Workerjumptable put_16bpc_tbl
1136*c0909341SAndroid Build Coastguard Worker        .word 1280b - put_16bpc_tbl
1137*c0909341SAndroid Build Coastguard Worker        .word 640b  - put_16bpc_tbl
1138*c0909341SAndroid Build Coastguard Worker        .word 320b  - put_16bpc_tbl
1139*c0909341SAndroid Build Coastguard Worker        .word 160b  - put_16bpc_tbl
1140*c0909341SAndroid Build Coastguard Worker        .word 80b   - put_16bpc_tbl
1141*c0909341SAndroid Build Coastguard Worker        .word 40b   - put_16bpc_tbl
1142*c0909341SAndroid Build Coastguard Worker        .word 20b   - put_16bpc_tbl
1143*c0909341SAndroid Build Coastguard Workerendjumptable
1144*c0909341SAndroid Build Coastguard Worker
1145*c0909341SAndroid Build Coastguard Worker
1146*c0909341SAndroid Build Coastguard Worker// This has got the same signature as the prep_8tap functions,
1147*c0909341SAndroid Build Coastguard Worker// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and
1148*c0909341SAndroid Build Coastguard Worker// x8 to w*2.
//
// Every path below computes, per 16-bit lane,
//     out = (src << intermediate_bits) - PREP_BIAS
// and writes the result through x0.
//
// Register usage as seen in the body:
//   x0  - output pointer, advanced by the number of bytes stored per iteration
//   x1  - source pointer
//   x2  - source stride in bytes (added to x1 once per row; doubled in the
//         4- and 8-wide paths, which walk two row pointers)
//   w4  - remaining row count; the loops run while it stays > 0
//   w7  - shift amount, broadcast into v31
//   x8  - added to x0 after each row in the 64- and 128-wide paths
//   x9  - on entry the index into prep_16bpc_tbl; reused as the second-row
//         source pointer in the 4- and 8-wide paths
//   x10 - table base, then the computed branch target
//   v30 - bias constant, v31 - per-lane shift amount
//
// Labels ending in 0 (40, 80, ...) are the jump-table entry points; the
// matching short labels (4, 8, ...) are the loop heads. The flags set by each
// "subs" are consumed by the "b.gt" at the end of the loop; none of the
// instructions in between modify the condition flags.
1149*c0909341SAndroid Build Coastguard Workerfunction prep_16bpc_neon
1150*c0909341SAndroid Build Coastguard Worker        movrel          x10, prep_16bpc_tbl
1151*c0909341SAndroid Build Coastguard Worker        ldrsw           x9, [x10, x9, lsl #2]   // signed 32-bit table entry for this width
1152*c0909341SAndroid Build Coastguard Worker        dup             v31.8h,  w7   // intermediate_bits
        // Only the high byte of PREP_BIAS is materialised here, so v30 equals
        // PREP_BIAS only if its low byte is zero.
        // NOTE(review): PREP_BIAS is defined outside this excerpt - confirm.
1153*c0909341SAndroid Build Coastguard Worker        movi            v30.8h,  #(PREP_BIAS >> 8), lsl #8
1154*c0909341SAndroid Build Coastguard Worker        add             x10, x10, x9   // table base + offset = entry label address
1155*c0909341SAndroid Build Coastguard Worker        br              x10
1156*c0909341SAndroid Build Coastguard Worker
        // 8 source bytes per row, two rows per iteration packed into one
        // 128-bit register.
1157*c0909341SAndroid Build Coastguard Worker40:
1158*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1159*c0909341SAndroid Build Coastguard Worker        add             x9,  x1,  x2   // x9 = pointer to the second row
1160*c0909341SAndroid Build Coastguard Worker        lsl             x2,  x2,  #1   // each pointer now steps two rows at a time
1161*c0909341SAndroid Build Coastguard Worker4:
1162*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b},   [x1], x2   // first row -> low 64 bits of v0
1163*c0909341SAndroid Build Coastguard Worker        ld1             {v0.d}[1], [x9], x2   // second row -> high 64 bits of v0
1164*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
1165*c0909341SAndroid Build Coastguard Worker        sshl            v0.8h,   v0.8h,   v31.8h
1166*c0909341SAndroid Build Coastguard Worker        sub             v0.8h,   v0.8h,   v30.8h
1167*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h}, [x0], #16
1168*c0909341SAndroid Build Coastguard Worker        b.gt            4b
1169*c0909341SAndroid Build Coastguard Worker        ret
        // 16 source bytes per row, two rows per iteration.
1170*c0909341SAndroid Build Coastguard Worker80:
1171*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1172*c0909341SAndroid Build Coastguard Worker        add             x9,  x1,  x2   // x9 = pointer to the second row
1173*c0909341SAndroid Build Coastguard Worker        lsl             x2,  x2,  #1   // each pointer now steps two rows at a time
1174*c0909341SAndroid Build Coastguard Worker8:
1175*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h}, [x1], x2
1176*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h}, [x9], x2
1177*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
1178*c0909341SAndroid Build Coastguard Worker        sshl            v0.8h,   v0.8h,   v31.8h
1179*c0909341SAndroid Build Coastguard Worker        sshl            v1.8h,   v1.8h,   v31.8h
1180*c0909341SAndroid Build Coastguard Worker        sub             v0.8h,   v0.8h,   v30.8h
1181*c0909341SAndroid Build Coastguard Worker        sub             v1.8h,   v1.8h,   v30.8h
1182*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], #32
1183*c0909341SAndroid Build Coastguard Worker        b.gt            8b
1184*c0909341SAndroid Build Coastguard Worker        ret
        // 32 source bytes per row, two rows per iteration read through the
        // single pointer x1 (the stride is not doubled here).
1185*c0909341SAndroid Build Coastguard Worker160:
1186*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1187*c0909341SAndroid Build Coastguard Worker16:
1188*c0909341SAndroid Build Coastguard Worker        ldp             q0,  q1,  [x1]   // first row
1189*c0909341SAndroid Build Coastguard Worker        add             x1,  x1,  x2
1190*c0909341SAndroid Build Coastguard Worker        sshl            v0.8h,   v0.8h,   v31.8h
1191*c0909341SAndroid Build Coastguard Worker        ldp             q2,  q3,  [x1]   // second row
1192*c0909341SAndroid Build Coastguard Worker        add             x1,  x1,  x2
1193*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
1194*c0909341SAndroid Build Coastguard Worker        sshl            v1.8h,   v1.8h,   v31.8h
1195*c0909341SAndroid Build Coastguard Worker        sshl            v2.8h,   v2.8h,   v31.8h
1196*c0909341SAndroid Build Coastguard Worker        sshl            v3.8h,   v3.8h,   v31.8h
1197*c0909341SAndroid Build Coastguard Worker        sub             v0.8h,   v0.8h,   v30.8h
1198*c0909341SAndroid Build Coastguard Worker        sub             v1.8h,   v1.8h,   v30.8h
1199*c0909341SAndroid Build Coastguard Worker        sub             v2.8h,   v2.8h,   v30.8h
1200*c0909341SAndroid Build Coastguard Worker        sub             v3.8h,   v3.8h,   v30.8h
1201*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
1202*c0909341SAndroid Build Coastguard Worker        b.gt            16b
1203*c0909341SAndroid Build Coastguard Worker        ret
        // 64 source bytes per row, one row per iteration.
1204*c0909341SAndroid Build Coastguard Worker320:
1205*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1206*c0909341SAndroid Build Coastguard Worker32:
1207*c0909341SAndroid Build Coastguard Worker        ldp             q0,  q1,  [x1]
1208*c0909341SAndroid Build Coastguard Worker        sshl            v0.8h,   v0.8h,   v31.8h
1209*c0909341SAndroid Build Coastguard Worker        ldp             q2,  q3,  [x1, #32]
1210*c0909341SAndroid Build Coastguard Worker        add             x1,  x1,  x2
1211*c0909341SAndroid Build Coastguard Worker        sshl            v1.8h,   v1.8h,   v31.8h
1212*c0909341SAndroid Build Coastguard Worker        sshl            v2.8h,   v2.8h,   v31.8h
1213*c0909341SAndroid Build Coastguard Worker        sshl            v3.8h,   v3.8h,   v31.8h
1214*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #1
1215*c0909341SAndroid Build Coastguard Worker        sub             v0.8h,   v0.8h,   v30.8h
1216*c0909341SAndroid Build Coastguard Worker        sub             v1.8h,   v1.8h,   v30.8h
1217*c0909341SAndroid Build Coastguard Worker        sub             v2.8h,   v2.8h,   v30.8h
1218*c0909341SAndroid Build Coastguard Worker        sub             v3.8h,   v3.8h,   v30.8h
1219*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
1220*c0909341SAndroid Build Coastguard Worker        b.gt            32b
1221*c0909341SAndroid Build Coastguard Worker        ret
        // 128 source bytes per row, one row per iteration. Stores use fixed
        // offsets from x0, which is then advanced by x8.
1222*c0909341SAndroid Build Coastguard Worker640:
1223*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1224*c0909341SAndroid Build Coastguard Worker64:
1225*c0909341SAndroid Build Coastguard Worker        ldp             q0,  q1,  [x1]
1226*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #1
1227*c0909341SAndroid Build Coastguard Worker        sshl            v0.8h,   v0.8h,   v31.8h
1228*c0909341SAndroid Build Coastguard Worker        ldp             q2,  q3,  [x1, #32]
1229*c0909341SAndroid Build Coastguard Worker        sshl            v1.8h,   v1.8h,   v31.8h
1230*c0909341SAndroid Build Coastguard Worker        ldp             q4,  q5,  [x1, #64]
1231*c0909341SAndroid Build Coastguard Worker        sshl            v2.8h,   v2.8h,   v31.8h
1232*c0909341SAndroid Build Coastguard Worker        sshl            v3.8h,   v3.8h,   v31.8h
1233*c0909341SAndroid Build Coastguard Worker        ldp             q6,  q7,  [x1, #96]
1234*c0909341SAndroid Build Coastguard Worker        add             x1,  x1,  x2
1235*c0909341SAndroid Build Coastguard Worker        sshl            v4.8h,   v4.8h,   v31.8h
1236*c0909341SAndroid Build Coastguard Worker        sshl            v5.8h,   v5.8h,   v31.8h
1237*c0909341SAndroid Build Coastguard Worker        sshl            v6.8h,   v6.8h,   v31.8h
1238*c0909341SAndroid Build Coastguard Worker        sshl            v7.8h,   v7.8h,   v31.8h
1239*c0909341SAndroid Build Coastguard Worker        sub             v0.8h,   v0.8h,   v30.8h
1240*c0909341SAndroid Build Coastguard Worker        sub             v1.8h,   v1.8h,   v30.8h
1241*c0909341SAndroid Build Coastguard Worker        sub             v2.8h,   v2.8h,   v30.8h
1242*c0909341SAndroid Build Coastguard Worker        sub             v3.8h,   v3.8h,   v30.8h
1243*c0909341SAndroid Build Coastguard Worker        stp             q0,  q1,  [x0]
1244*c0909341SAndroid Build Coastguard Worker        sub             v4.8h,   v4.8h,   v30.8h
1245*c0909341SAndroid Build Coastguard Worker        sub             v5.8h,   v5.8h,   v30.8h
1246*c0909341SAndroid Build Coastguard Worker        stp             q2,  q3,  [x0, #32]
1247*c0909341SAndroid Build Coastguard Worker        sub             v6.8h,   v6.8h,   v30.8h
1248*c0909341SAndroid Build Coastguard Worker        sub             v7.8h,   v7.8h,   v30.8h
1249*c0909341SAndroid Build Coastguard Worker        stp             q4,  q5,  [x0, #64]
1250*c0909341SAndroid Build Coastguard Worker        stp             q6,  q7,  [x0, #96]
1251*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x8   // next output row (x8 = w*2 per the header comment)
1252*c0909341SAndroid Build Coastguard Worker        b.gt            64b
1253*c0909341SAndroid Build Coastguard Worker        ret
        // 256 source bytes per row, one row per iteration, using v0-v7 and
        // v16-v23 as working registers.
1254*c0909341SAndroid Build Coastguard Worker1280:
1255*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1256*c0909341SAndroid Build Coastguard Worker128:
1257*c0909341SAndroid Build Coastguard Worker        ldp             q0,  q1,  [x1]
1258*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #1
1259*c0909341SAndroid Build Coastguard Worker        sshl            v0.8h,   v0.8h,   v31.8h
1260*c0909341SAndroid Build Coastguard Worker        ldp             q2,  q3,  [x1, #32]
1261*c0909341SAndroid Build Coastguard Worker        sshl            v1.8h,   v1.8h,   v31.8h
1262*c0909341SAndroid Build Coastguard Worker        ldp             q4,  q5,  [x1, #64]
1263*c0909341SAndroid Build Coastguard Worker        sshl            v2.8h,   v2.8h,   v31.8h
1264*c0909341SAndroid Build Coastguard Worker        sshl            v3.8h,   v3.8h,   v31.8h
1265*c0909341SAndroid Build Coastguard Worker        ldp             q6,  q7,  [x1, #96]
1266*c0909341SAndroid Build Coastguard Worker        sshl            v4.8h,   v4.8h,   v31.8h
1267*c0909341SAndroid Build Coastguard Worker        sshl            v5.8h,   v5.8h,   v31.8h
1268*c0909341SAndroid Build Coastguard Worker        ldp             q16, q17, [x1, #128]
1269*c0909341SAndroid Build Coastguard Worker        sshl            v6.8h,   v6.8h,   v31.8h
1270*c0909341SAndroid Build Coastguard Worker        sshl            v7.8h,   v7.8h,   v31.8h
1271*c0909341SAndroid Build Coastguard Worker        ldp             q18, q19, [x1, #160]
1272*c0909341SAndroid Build Coastguard Worker        sshl            v16.8h,  v16.8h,  v31.8h
1273*c0909341SAndroid Build Coastguard Worker        sshl            v17.8h,  v17.8h,  v31.8h
1274*c0909341SAndroid Build Coastguard Worker        ldp             q20, q21, [x1, #192]
1275*c0909341SAndroid Build Coastguard Worker        sshl            v18.8h,  v18.8h,  v31.8h
1276*c0909341SAndroid Build Coastguard Worker        sshl            v19.8h,  v19.8h,  v31.8h
1277*c0909341SAndroid Build Coastguard Worker        ldp             q22, q23, [x1, #224]
1278*c0909341SAndroid Build Coastguard Worker        add             x1,  x1,  x2
1279*c0909341SAndroid Build Coastguard Worker        sshl            v20.8h,  v20.8h,  v31.8h
1280*c0909341SAndroid Build Coastguard Worker        sshl            v21.8h,  v21.8h,  v31.8h
1281*c0909341SAndroid Build Coastguard Worker        sshl            v22.8h,  v22.8h,  v31.8h
1282*c0909341SAndroid Build Coastguard Worker        sshl            v23.8h,  v23.8h,  v31.8h
1283*c0909341SAndroid Build Coastguard Worker        sub             v0.8h,   v0.8h,   v30.8h
1284*c0909341SAndroid Build Coastguard Worker        sub             v1.8h,   v1.8h,   v30.8h
1285*c0909341SAndroid Build Coastguard Worker        sub             v2.8h,   v2.8h,   v30.8h
1286*c0909341SAndroid Build Coastguard Worker        sub             v3.8h,   v3.8h,   v30.8h
1287*c0909341SAndroid Build Coastguard Worker        stp             q0,  q1,  [x0]
1288*c0909341SAndroid Build Coastguard Worker        sub             v4.8h,   v4.8h,   v30.8h
1289*c0909341SAndroid Build Coastguard Worker        sub             v5.8h,   v5.8h,   v30.8h
1290*c0909341SAndroid Build Coastguard Worker        stp             q2,  q3,  [x0, #32]
1291*c0909341SAndroid Build Coastguard Worker        sub             v6.8h,   v6.8h,   v30.8h
1292*c0909341SAndroid Build Coastguard Worker        sub             v7.8h,   v7.8h,   v30.8h
1293*c0909341SAndroid Build Coastguard Worker        stp             q4,  q5,  [x0, #64]
1294*c0909341SAndroid Build Coastguard Worker        sub             v16.8h,  v16.8h,  v30.8h
1295*c0909341SAndroid Build Coastguard Worker        sub             v17.8h,  v17.8h,  v30.8h
1296*c0909341SAndroid Build Coastguard Worker        stp             q6,  q7,  [x0, #96]
1297*c0909341SAndroid Build Coastguard Worker        sub             v18.8h,  v18.8h,  v30.8h
1298*c0909341SAndroid Build Coastguard Worker        sub             v19.8h,  v19.8h,  v30.8h
1299*c0909341SAndroid Build Coastguard Worker        stp             q16, q17, [x0, #128]
1300*c0909341SAndroid Build Coastguard Worker        sub             v20.8h,  v20.8h,  v30.8h
1301*c0909341SAndroid Build Coastguard Worker        sub             v21.8h,  v21.8h,  v30.8h
1302*c0909341SAndroid Build Coastguard Worker        stp             q18, q19, [x0, #160]
1303*c0909341SAndroid Build Coastguard Worker        sub             v22.8h,  v22.8h,  v30.8h
1304*c0909341SAndroid Build Coastguard Worker        sub             v23.8h,  v23.8h,  v30.8h
1305*c0909341SAndroid Build Coastguard Worker        stp             q20, q21, [x0, #192]
1306*c0909341SAndroid Build Coastguard Worker        stp             q22, q23, [x0, #224]
1307*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x8   // next output row (x8 = w*2 per the header comment)
1308*c0909341SAndroid Build Coastguard Worker        b.gt            128b
1309*c0909341SAndroid Build Coastguard Worker        ret
1310*c0909341SAndroid Build Coastguard Workerendfunc
1311*c0909341SAndroid Build Coastguard Worker
// Dispatch table for prep_16bpc_neon. Each entry is a 32-bit offset from the
// start of this table to one of that function's entry labels; the function
// loads entry [x9] with ldrsw and adds it to the table address before
// branching. Entries run from label 1280 (index 0) down to label 40 (index 5).
// The "b" suffix resolves to the nearest preceding definition of the numeric
// label, so the table has to stay immediately after prep_16bpc_neon.
1312*c0909341SAndroid Build Coastguard Workerjumptable prep_16bpc_tbl
1313*c0909341SAndroid Build Coastguard Worker        .word 1280b - prep_16bpc_tbl
1314*c0909341SAndroid Build Coastguard Worker        .word 640b  - prep_16bpc_tbl
1315*c0909341SAndroid Build Coastguard Worker        .word 320b  - prep_16bpc_tbl
1316*c0909341SAndroid Build Coastguard Worker        .word 160b  - prep_16bpc_tbl
1317*c0909341SAndroid Build Coastguard Worker        .word 80b   - prep_16bpc_tbl
1318*c0909341SAndroid Build Coastguard Worker        .word 40b   - prep_16bpc_tbl
1319*c0909341SAndroid Build Coastguard Workerendjumptable
1320*c0909341SAndroid Build Coastguard Worker
1321*c0909341SAndroid Build Coastguard Worker
// load_slice: load one element into lane 0 of each destination register.
//   s0, s1 - source pointers; loads alternate s0, s1, s0, ... and each load
//            post-increments its pointer by strd
//   strd   - post-increment applied after every load
//   wd     - element-size suffix pasted after the register name (e.g. .s)
//   d0-d6  - destination registers; d0 and d1 are required, d2/d3 are loaded
//            as a pair when d2 is given, d4, d5 and d6 are each optional
// Being single-lane loads, these leave the other lanes of each destination
// unchanged.
1322*c0909341SAndroid Build Coastguard Worker.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
1323*c0909341SAndroid Build Coastguard Worker        ld1             {\d0\wd}[0], [\s0], \strd
1324*c0909341SAndroid Build Coastguard Worker        ld1             {\d1\wd}[0], [\s1], \strd
1325*c0909341SAndroid Build Coastguard Worker.ifnb \d2
1326*c0909341SAndroid Build Coastguard Worker        ld1             {\d2\wd}[0], [\s0], \strd
1327*c0909341SAndroid Build Coastguard Worker        ld1             {\d3\wd}[0], [\s1], \strd
1328*c0909341SAndroid Build Coastguard Worker.endif
1329*c0909341SAndroid Build Coastguard Worker.ifnb \d4
1330*c0909341SAndroid Build Coastguard Worker        ld1             {\d4\wd}[0], [\s0], \strd
1331*c0909341SAndroid Build Coastguard Worker.endif
1332*c0909341SAndroid Build Coastguard Worker.ifnb \d5
1333*c0909341SAndroid Build Coastguard Worker        ld1             {\d5\wd}[0], [\s1], \strd
1334*c0909341SAndroid Build Coastguard Worker.endif
1335*c0909341SAndroid Build Coastguard Worker.ifnb \d6
1336*c0909341SAndroid Build Coastguard Worker        ld1             {\d6\wd}[0], [\s0], \strd
1337*c0909341SAndroid Build Coastguard Worker.endif
1338*c0909341SAndroid Build Coastguard Worker.endm
// load_reg: load one whole vector per destination register.
//   s0, s1 - source pointers; loads alternate s0, s1, s0, ... and each load
//            post-increments its pointer by strd
//   strd   - post-increment applied after every load
//   wd     - arrangement suffix pasted after the register name (e.g. .4h, .8h)
//   d0-d6  - destination registers; d0 and d1 are required, d2/d3 are loaded
//            as a pair when d2 is given, d4, d5 and d6 are each optional
1339*c0909341SAndroid Build Coastguard Worker.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
1340*c0909341SAndroid Build Coastguard Worker        ld1             {\d0\wd}, [\s0], \strd
1341*c0909341SAndroid Build Coastguard Worker        ld1             {\d1\wd}, [\s1], \strd
1342*c0909341SAndroid Build Coastguard Worker.ifnb \d2
1343*c0909341SAndroid Build Coastguard Worker        ld1             {\d2\wd}, [\s0], \strd
1344*c0909341SAndroid Build Coastguard Worker        ld1             {\d3\wd}, [\s1], \strd
1345*c0909341SAndroid Build Coastguard Worker.endif
1346*c0909341SAndroid Build Coastguard Worker.ifnb \d4
1347*c0909341SAndroid Build Coastguard Worker        ld1             {\d4\wd}, [\s0], \strd
1348*c0909341SAndroid Build Coastguard Worker.endif
1349*c0909341SAndroid Build Coastguard Worker.ifnb \d5
1350*c0909341SAndroid Build Coastguard Worker        ld1             {\d5\wd}, [\s1], \strd
1351*c0909341SAndroid Build Coastguard Worker.endif
1352*c0909341SAndroid Build Coastguard Worker.ifnb \d6
1353*c0909341SAndroid Build Coastguard Worker        ld1             {\d6\wd}, [\s0], \strd
1354*c0909341SAndroid Build Coastguard Worker.endif
1355*c0909341SAndroid Build Coastguard Worker.endm
// load_regpair: load two vectors per ld1 into a register pair.
//   s0, s1 - source pointers; the pairs are loaded from s0, s1, s0 in that
//            order and each load post-increments its pointer by strd
//   strd   - post-increment applied after every load
//   wd     - arrangement suffix pasted after each register name
//   d0-d5  - destination registers taken as pairs (d0,d1), (d2,d3), (d4,d5);
//            the second and third pairs are optional
// The two-register form of ld1 requires each pair to be consecutively
// numbered registers.
1356*c0909341SAndroid Build Coastguard Worker.macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5
1357*c0909341SAndroid Build Coastguard Worker        ld1             {\d0\wd, \d1\wd}, [\s0], \strd
1358*c0909341SAndroid Build Coastguard Worker.ifnb \d2
1359*c0909341SAndroid Build Coastguard Worker        ld1             {\d2\wd, \d3\wd}, [\s1], \strd
1360*c0909341SAndroid Build Coastguard Worker.endif
1361*c0909341SAndroid Build Coastguard Worker.ifnb \d4
1362*c0909341SAndroid Build Coastguard Worker        ld1             {\d4\wd, \d5\wd}, [\s0], \strd
1363*c0909341SAndroid Build Coastguard Worker.endif
1364*c0909341SAndroid Build Coastguard Worker.endm
// load_s: load_slice with 32-bit (.s) elements, i.e. one 32-bit value into
// lane 0 of each destination register. Arguments are forwarded unchanged.
1365*c0909341SAndroid Build Coastguard Worker.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1366*c0909341SAndroid Build Coastguard Worker        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1367*c0909341SAndroid Build Coastguard Worker.endm
// load_4h: load_reg with the .4h arrangement, i.e. four halfwords (64 bits)
// per destination register. Arguments are forwarded unchanged.
1368*c0909341SAndroid Build Coastguard Worker.macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1369*c0909341SAndroid Build Coastguard Worker        load_reg        \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1370*c0909341SAndroid Build Coastguard Worker.endm
// load_8h: load_reg with the .8h arrangement, i.e. eight halfwords (128 bits)
// per destination register. Arguments are forwarded unchanged.
1371*c0909341SAndroid Build Coastguard Worker.macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1372*c0909341SAndroid Build Coastguard Worker        load_reg        \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1373*c0909341SAndroid Build Coastguard Worker.endm
// load_16h: load_regpair with the .8h arrangement, i.e. sixteen halfwords
// (two 128-bit registers) per load. Arguments are forwarded unchanged.
1374*c0909341SAndroid Build Coastguard Worker.macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5
1375*c0909341SAndroid Build Coastguard Worker        load_regpair    \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5
1376*c0909341SAndroid Build Coastguard Worker.endm
// interleave_1: combine each register with its successor using trn1.
//   wd    - arrangement suffix pasted after each register name
//   r0-r4 - registers; r0..r2 are required, r3 and r4 are used together when
//           r3 is given
// Each rN is replaced in place by trn1(rN, rN+1), i.e. the even-indexed
// elements of rN interleaved with those of rN+1. The statements run in
// ascending order so that every rN+1 is read before it is overwritten; the
// last register named (r2, or r4 in the long form) is only read.
1377*c0909341SAndroid Build Coastguard Worker.macro interleave_1 wd, r0, r1, r2, r3, r4
1378*c0909341SAndroid Build Coastguard Worker        trn1            \r0\wd, \r0\wd, \r1\wd
1379*c0909341SAndroid Build Coastguard Worker        trn1            \r1\wd, \r1\wd, \r2\wd
1380*c0909341SAndroid Build Coastguard Worker.ifnb \r3
1381*c0909341SAndroid Build Coastguard Worker        trn1            \r2\wd, \r2\wd, \r3\wd
1382*c0909341SAndroid Build Coastguard Worker        trn1            \r3\wd, \r3\wd, \r4\wd
1383*c0909341SAndroid Build Coastguard Worker.endif
1384*c0909341SAndroid Build Coastguard Worker.endm
// interleave_1_s: interleave_1 with the .2s arrangement, so each register ends
// up holding its own 32-bit lane 0 followed by lane 0 of the next register.
1385*c0909341SAndroid Build Coastguard Worker.macro interleave_1_s r0, r1, r2, r3, r4
1386*c0909341SAndroid Build Coastguard Worker        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
1387*c0909341SAndroid Build Coastguard Worker.endm
// umin_h: clamp registers in place to an upper bound (unsigned minimum).
//   c     - register holding the per-lane upper bound
//   wd    - arrangement suffix pasted after every register name
//   r0-r3 - registers to clamp; r0 is required, r1 is optional, r2 and r3 are
//           processed together when r2 is given
1388*c0909341SAndroid Build Coastguard Worker.macro umin_h c, wd, r0, r1, r2, r3
1389*c0909341SAndroid Build Coastguard Worker        umin            \r0\wd,  \r0\wd,  \c\wd
1390*c0909341SAndroid Build Coastguard Worker.ifnb \r1
1391*c0909341SAndroid Build Coastguard Worker        umin            \r1\wd,  \r1\wd,  \c\wd
1392*c0909341SAndroid Build Coastguard Worker.endif
1393*c0909341SAndroid Build Coastguard Worker.ifnb \r2
1394*c0909341SAndroid Build Coastguard Worker        umin            \r2\wd,  \r2\wd,  \c\wd
1395*c0909341SAndroid Build Coastguard Worker        umin            \r3\wd,  \r3\wd,  \c\wd
1396*c0909341SAndroid Build Coastguard Worker.endif
1397*c0909341SAndroid Build Coastguard Worker.endm
// sub_h: subtract a constant vector from registers in place.
//   c     - register holding the per-lane value to subtract
//   wd    - arrangement suffix pasted after every register name
//   r0-r3 - registers to update; r0 is required, r1 is optional, r2 and r3 are
//           processed together when r2 is given
1398*c0909341SAndroid Build Coastguard Worker.macro sub_h c, wd, r0, r1, r2, r3
1399*c0909341SAndroid Build Coastguard Worker        sub             \r0\wd,  \r0\wd,  \c\wd
1400*c0909341SAndroid Build Coastguard Worker.ifnb \r1
1401*c0909341SAndroid Build Coastguard Worker        sub             \r1\wd,  \r1\wd,  \c\wd
1402*c0909341SAndroid Build Coastguard Worker.endif
1403*c0909341SAndroid Build Coastguard Worker.ifnb \r2
1404*c0909341SAndroid Build Coastguard Worker        sub             \r2\wd,  \r2\wd,  \c\wd
1405*c0909341SAndroid Build Coastguard Worker        sub             \r3\wd,  \r3\wd,  \c\wd
1406*c0909341SAndroid Build Coastguard Worker.endif
1407*c0909341SAndroid Build Coastguard Worker.endm
// smull_smlal_4tap: 4-tap signed widening multiply-accumulate on the low four
// halfwords of each source.
//   d     - destination register name; receives four 32-bit sums
//   s0-s3 - source register names, multiplied by v0.h[0] .. v0.h[3]
// Computes d.4s = s0*v0.h[0] + s1*v0.h[1] + s2*v0.h[2] + s3*v0.h[3].
// The coefficients are read from v0, which the caller must have set up.
1408*c0909341SAndroid Build Coastguard Worker.macro smull_smlal_4tap d, s0, s1, s2, s3
1409*c0909341SAndroid Build Coastguard Worker        smull           \d\().4s,  \s0\().4h,  v0.h[0]
1410*c0909341SAndroid Build Coastguard Worker        smlal           \d\().4s,  \s1\().4h,  v0.h[1]
1411*c0909341SAndroid Build Coastguard Worker        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
1412*c0909341SAndroid Build Coastguard Worker        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
1413*c0909341SAndroid Build Coastguard Worker.endm
// smull2_smlal2_4tap: same as smull_smlal_4tap, but operating on the upper
// four halfwords of each 8h source (the "2" instruction forms).
//   d     - destination register name; receives four 32-bit sums
//   s0-s3 - source register names, multiplied by v0.h[0] .. v0.h[3]
1414*c0909341SAndroid Build Coastguard Worker.macro smull2_smlal2_4tap d, s0, s1, s2, s3
1415*c0909341SAndroid Build Coastguard Worker        smull2          \d\().4s,  \s0\().8h,  v0.h[0]
1416*c0909341SAndroid Build Coastguard Worker        smlal2          \d\().4s,  \s1\().8h,  v0.h[1]
1417*c0909341SAndroid Build Coastguard Worker        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
1418*c0909341SAndroid Build Coastguard Worker        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
1419*c0909341SAndroid Build Coastguard Worker.endm
// smull_smlal_6tap: 6-tap signed widening multiply-accumulate on the low four
// halfwords of each source.
//   d     - destination register name; receives four 32-bit sums
//   s0-s7 - source register names; only s1 .. s6 are used, multiplied by
//           v0.h[1] .. v0.h[6]. s0 and s7 are accepted but ignored.
// NOTE(review): the unused s0/s7 presumably let call sites pass the same
// argument list as the 8-tap variant - confirm against the callers.
1420*c0909341SAndroid Build Coastguard Worker.macro smull_smlal_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
1421*c0909341SAndroid Build Coastguard Worker        smull           \d\().4s,  \s1\().4h,  v0.h[1]
1422*c0909341SAndroid Build Coastguard Worker        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
1423*c0909341SAndroid Build Coastguard Worker        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
1424*c0909341SAndroid Build Coastguard Worker        smlal           \d\().4s,  \s4\().4h,  v0.h[4]
1425*c0909341SAndroid Build Coastguard Worker        smlal           \d\().4s,  \s5\().4h,  v0.h[5]
1426*c0909341SAndroid Build Coastguard Worker        smlal           \d\().4s,  \s6\().4h,  v0.h[6]
1427*c0909341SAndroid Build Coastguard Worker.endm
// smull2_smlal2_6tap: same as smull_smlal_6tap, but operating on the upper
// four halfwords of each 8h source (the "2" instruction forms).
//   d     - destination register name; receives four 32-bit sums
//   s0-s7 - source register names; only s1 .. s6 are used, multiplied by
//           v0.h[1] .. v0.h[6]. s0 and s7 are accepted but ignored.
1428*c0909341SAndroid Build Coastguard Worker.macro smull2_smlal2_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
1429*c0909341SAndroid Build Coastguard Worker        smull2          \d\().4s,  \s1\().8h,  v0.h[1]
1430*c0909341SAndroid Build Coastguard Worker        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
1431*c0909341SAndroid Build Coastguard Worker        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
1432*c0909341SAndroid Build Coastguard Worker        smlal2          \d\().4s,  \s4\().8h,  v0.h[4]
1433*c0909341SAndroid Build Coastguard Worker        smlal2          \d\().4s,  \s5\().8h,  v0.h[5]
1434*c0909341SAndroid Build Coastguard Worker        smlal2          \d\().4s,  \s6\().8h,  v0.h[6]
1435*c0909341SAndroid Build Coastguard Worker.endm
// smull_smlal_8tap: 8-tap signed widening multiply-accumulate on the low four
// halfwords of each source.
//   d     - destination register name; receives four 32-bit sums
//   s0-s7 - source register names, multiplied by v0.h[0] .. v0.h[7]
// The coefficients are read from v0, which the caller must have set up.
1436*c0909341SAndroid Build Coastguard Worker.macro smull_smlal_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
1437*c0909341SAndroid Build Coastguard Worker        smull           \d\().4s,  \s0\().4h,  v0.h[0]
1438*c0909341SAndroid Build Coastguard Worker        smlal           \d\().4s,  \s1\().4h,  v0.h[1]
1439*c0909341SAndroid Build Coastguard Worker        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
1440*c0909341SAndroid Build Coastguard Worker        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
1441*c0909341SAndroid Build Coastguard Worker        smlal           \d\().4s,  \s4\().4h,  v0.h[4]
1442*c0909341SAndroid Build Coastguard Worker        smlal           \d\().4s,  \s5\().4h,  v0.h[5]
1443*c0909341SAndroid Build Coastguard Worker        smlal           \d\().4s,  \s6\().4h,  v0.h[6]
1444*c0909341SAndroid Build Coastguard Worker        smlal           \d\().4s,  \s7\().4h,  v0.h[7]
1445*c0909341SAndroid Build Coastguard Worker.endm
// smull2_smlal2_8tap: same as smull_smlal_8tap, but operating on the upper
// four halfwords of each 8h source (the "2" instruction forms).
//   d     - destination register name; receives four 32-bit sums
//   s0-s7 - source register names, multiplied by v0.h[0] .. v0.h[7]
1446*c0909341SAndroid Build Coastguard Worker.macro smull2_smlal2_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
1447*c0909341SAndroid Build Coastguard Worker        smull2          \d\().4s,  \s0\().8h,  v0.h[0]
1448*c0909341SAndroid Build Coastguard Worker        smlal2          \d\().4s,  \s1\().8h,  v0.h[1]
1449*c0909341SAndroid Build Coastguard Worker        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
1450*c0909341SAndroid Build Coastguard Worker        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
1451*c0909341SAndroid Build Coastguard Worker        smlal2          \d\().4s,  \s4\().8h,  v0.h[4]
1452*c0909341SAndroid Build Coastguard Worker        smlal2          \d\().4s,  \s5\().8h,  v0.h[5]
1453*c0909341SAndroid Build Coastguard Worker        smlal2          \d\().4s,  \s6\().8h,  v0.h[6]
1454*c0909341SAndroid Build Coastguard Worker        smlal2          \d\().4s,  \s7\().8h,  v0.h[7]
1455*c0909341SAndroid Build Coastguard Worker.endm
// sqrshrun_h: rounding right shift of signed 32-bit lanes, narrowed to
// unsigned 16-bit with saturation.
//   shift - immediate shift amount
//   r0-r3 - register names holding 4s inputs; r0 is required, r1 is optional,
//           r2 and r3 are processed together when r2 is given
// Results are packed in place: r0.4h comes from r0, the upper half of r0 from
// r1, and likewise r2 receives the narrowed r2 (low half) and r3 (high half).
1456*c0909341SAndroid Build Coastguard Worker.macro sqrshrun_h shift, r0, r1, r2, r3
1457*c0909341SAndroid Build Coastguard Worker        sqrshrun        \r0\().4h, \r0\().4s,  #\shift
1458*c0909341SAndroid Build Coastguard Worker.ifnb \r1
1459*c0909341SAndroid Build Coastguard Worker        sqrshrun2       \r0\().8h, \r1\().4s,  #\shift
1460*c0909341SAndroid Build Coastguard Worker.endif
1461*c0909341SAndroid Build Coastguard Worker.ifnb \r2
1462*c0909341SAndroid Build Coastguard Worker        sqrshrun        \r2\().4h, \r2\().4s,  #\shift
1463*c0909341SAndroid Build Coastguard Worker        sqrshrun2       \r2\().8h, \r3\().4s,  #\shift
1464*c0909341SAndroid Build Coastguard Worker.endif
1465*c0909341SAndroid Build Coastguard Worker.endm
// xtn_h: truncating narrow of 32-bit lanes to 16 bits, two registers at a time.
//   r0, r1 - register names; the even-indexed halfwords of r0 then r1 (the low
//            16 bits of every 32-bit lane) are packed into r0.8h
//   r2, r3 - optional second pair, packed into r2.8h the same way
// A single uzp1 replaces the xtn + xtn2 pair; no saturation is applied.
1466*c0909341SAndroid Build Coastguard Worker.macro xtn_h r0, r1, r2, r3
1467*c0909341SAndroid Build Coastguard Worker        uzp1            \r0\().8h,  \r0\().8h,  \r1\().8h // Same as xtn, xtn2
1468*c0909341SAndroid Build Coastguard Worker.ifnb \r2
1469*c0909341SAndroid Build Coastguard Worker        uzp1            \r2\().8h,  \r2\().8h,  \r3\().8h // Ditto
1470*c0909341SAndroid Build Coastguard Worker.endif
1471*c0909341SAndroid Build Coastguard Worker.endm
// srshl_s: signed rounding shift of 32-bit lanes by a per-lane register amount.
//   shift - register name holding the signed shift amounts; negative values
//           shift right with rounding
//   r0-r3 - register names shifted in place; r0 and r1 are required, r2 and r3
//           are processed together when r2 is given
1472*c0909341SAndroid Build Coastguard Worker.macro srshl_s shift, r0, r1, r2, r3
1473*c0909341SAndroid Build Coastguard Worker        srshl           \r0\().4s,  \r0\().4s,  \shift\().4s
1474*c0909341SAndroid Build Coastguard Worker        srshl           \r1\().4s,  \r1\().4s,  \shift\().4s
1475*c0909341SAndroid Build Coastguard Worker.ifnb \r2
1476*c0909341SAndroid Build Coastguard Worker        srshl           \r2\().4s,  \r2\().4s,  \shift\().4s
1477*c0909341SAndroid Build Coastguard Worker        srshl           \r3\().4s,  \r3\().4s,  \shift\().4s
1478*c0909341SAndroid Build Coastguard Worker.endif
1479*c0909341SAndroid Build Coastguard Worker.endm
// st_s: store the 32-bit lanes of one register to alternating destinations.
//   strd  - post-increment applied to the pointer after every store
//   reg   - register name whose .s lanes are stored
//   lanes - number of lanes to store; values above 2 store all four
// Lanes 0 and 2 go through x0, lanes 1 and 3 through x9. Both pointers are
// used implicitly and are advanced by strd after each store.
1480*c0909341SAndroid Build Coastguard Worker.macro st_s strd, reg, lanes
1481*c0909341SAndroid Build Coastguard Worker        st1             {\reg\().s}[0], [x0], \strd
1482*c0909341SAndroid Build Coastguard Worker        st1             {\reg\().s}[1], [x9], \strd
1483*c0909341SAndroid Build Coastguard Worker.if \lanes > 2
1484*c0909341SAndroid Build Coastguard Worker        st1             {\reg\().s}[2], [x0], \strd
1485*c0909341SAndroid Build Coastguard Worker        st1             {\reg\().s}[3], [x9], \strd
1486*c0909341SAndroid Build Coastguard Worker.endif
1487*c0909341SAndroid Build Coastguard Worker.endm
// st_d: store each register as two 64-bit halves to alternating destinations.
//   strd   - post-increment applied to the pointer after every store
//   r0, r1 - register names; r1 is optional
// The low 64 bits of each register go through x0 and the high 64 bits through
// x9. Both pointers are used implicitly and are advanced by strd after each
// store.
1488*c0909341SAndroid Build Coastguard Worker.macro st_d strd, r0, r1
1489*c0909341SAndroid Build Coastguard Worker        st1             {\r0\().8b},   [x0], \strd
1490*c0909341SAndroid Build Coastguard Worker        st1             {\r0\().d}[1], [x9], \strd
1491*c0909341SAndroid Build Coastguard Worker.ifnb \r1
1492*c0909341SAndroid Build Coastguard Worker        st1             {\r1\().8b},   [x0], \strd
1493*c0909341SAndroid Build Coastguard Worker        st1             {\r1\().d}[1], [x9], \strd
1494*c0909341SAndroid Build Coastguard Worker.endif
1495*c0909341SAndroid Build Coastguard Worker.endm
// shift_store_4: scale four 4s accumulators down to 16 bits and store them as
// four 64-bit rows through x0/x9 (via st_d).
//   type  - "put" selects the first branch; any other value selects the second
//   strd  - post-increment passed on to st_d
//   r0-r3 - register names holding the 32-bit accumulators; the narrowed
//           results end up in r0 (from r0,r1) and r2 (from r2,r3)
// put:   rounding shift right by 6 with unsigned saturation, then clamp to v31.
//        NOTE(review): v31 is set up outside this excerpt; presumably it holds
//        the maximum pixel value - confirm against the callers.
// other: rounding shift by the per-lane amounts in v30, truncating narrow,
//        then subtract v29. The meaning of v30/v29 is taken from the inline
//        comments; both registers are set up outside this excerpt.
1496*c0909341SAndroid Build Coastguard Worker.macro shift_store_4 type, strd, r0, r1, r2, r3
1497*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1498*c0909341SAndroid Build Coastguard Worker        sqrshrun_h      6,   \r0, \r1, \r2, \r3
1499*c0909341SAndroid Build Coastguard Worker        umin_h          v31, .8h, \r0, \r2
1500*c0909341SAndroid Build Coastguard Worker.else
1501*c0909341SAndroid Build Coastguard Worker        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
1502*c0909341SAndroid Build Coastguard Worker        xtn_h           \r0, \r1, \r2, \r3
1503*c0909341SAndroid Build Coastguard Worker        sub_h           v29, .8h, \r0, \r2       // PREP_BIAS
1504*c0909341SAndroid Build Coastguard Worker.endif
1505*c0909341SAndroid Build Coastguard Worker        st_d            \strd, \r0, \r2
1506*c0909341SAndroid Build Coastguard Worker.endm
// st_reg: store two, four or eight whole registers as alternating rows.
// Registers at even positions (r0, r2, r4, r6) go to [x0], those at odd
// positions (r1, r3, r5, r7) go to [x9]; each pointer is post-incremented by
// strd after every store.
//   strd  - post-increment applied to x0 / x9.
//   wd    - arrangement suffix appended to every register name (e.g. .8h).
//   r0,r1 - always stored.
//   r2,r3 - stored when r2 is non-blank.
//   r4-r7 - all four are stored when r4 is non-blank.
1507*c0909341SAndroid Build Coastguard Worker.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
1508*c0909341SAndroid Build Coastguard Worker        st1             {\r0\wd}, [x0], \strd
1509*c0909341SAndroid Build Coastguard Worker        st1             {\r1\wd}, [x9], \strd
1510*c0909341SAndroid Build Coastguard Worker.ifnb \r2
1511*c0909341SAndroid Build Coastguard Worker        st1             {\r2\wd}, [x0], \strd
1512*c0909341SAndroid Build Coastguard Worker        st1             {\r3\wd}, [x9], \strd
1513*c0909341SAndroid Build Coastguard Worker.endif
1514*c0909341SAndroid Build Coastguard Worker.ifnb \r4
1515*c0909341SAndroid Build Coastguard Worker        st1             {\r4\wd}, [x0], \strd
1516*c0909341SAndroid Build Coastguard Worker        st1             {\r5\wd}, [x9], \strd
1517*c0909341SAndroid Build Coastguard Worker        st1             {\r6\wd}, [x0], \strd
1518*c0909341SAndroid Build Coastguard Worker        st1             {\r7\wd}, [x9], \strd
1519*c0909341SAndroid Build Coastguard Worker.endif
1520*c0909341SAndroid Build Coastguard Worker.endm
// st_8h: convenience wrapper around st_reg that stores each register as a
// full .8h vector (eight 16-bit elements), alternating between x0 and x9.
1521*c0909341SAndroid Build Coastguard Worker.macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7
1522*c0909341SAndroid Build Coastguard Worker        st_reg          \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
1523*c0909341SAndroid Build Coastguard Worker.endm
// shift_store_8: final scaling and store for 8-pixel-wide rows.
// Same arithmetic as shift_store_4 (put: sqrshrun_h by 6 and umin_h against
// v31; otherwise: srshl_s by v30, xtn_h and sub_h of v29), but the two .8h
// results in r0 and r2 are written with st_8h: r0 as one full row to [x0]
// and r2 as one full row to [x9].
// sqrshrun_h, umin_h, xtn_h and sub_h are helper macros defined elsewhere in
// the file.
1524*c0909341SAndroid Build Coastguard Worker.macro shift_store_8 type, strd, r0, r1, r2, r3
1525*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1526*c0909341SAndroid Build Coastguard Worker        sqrshrun_h      6,   \r0, \r1, \r2, \r3
1527*c0909341SAndroid Build Coastguard Worker        umin_h          v31, .8h, \r0, \r2
1528*c0909341SAndroid Build Coastguard Worker.else
1529*c0909341SAndroid Build Coastguard Worker        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
1530*c0909341SAndroid Build Coastguard Worker        xtn_h           \r0, \r1, \r2, \r3
1531*c0909341SAndroid Build Coastguard Worker        sub_h           v29, .8h, \r0, \r2       // PREP_BIAS
1532*c0909341SAndroid Build Coastguard Worker.endif
1533*c0909341SAndroid Build Coastguard Worker        st_8h           \strd, \r0, \r2
1534*c0909341SAndroid Build Coastguard Worker.endm
// shift_store_16: final scaling and store for one 16-pixel row segment.
// Unlike shift_store_4/8 this writes a single row through an explicit
// pointer: dst is post-incremented by strd.
//   type == put : sqrshrun_h by 6, then clamp against v31 (bitdepth_max).
//   otherwise   : srshl_s by v30, xtn_h, then subtract v29 (PREP_BIAS).
// After the narrowing step the two .8h halves live in r0 and r2. The second
// umin / sub reads r2 but writes r1, so that the row ends up in the register
// pair r0, r1 for the two-register st1. That st1 form needs consecutively
// numbered registers, so callers must pass r0 and r1 as adjacent registers.
1535*c0909341SAndroid Build Coastguard Worker.macro shift_store_16 type, strd, dst, r0, r1, r2, r3
1536*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1537*c0909341SAndroid Build Coastguard Worker        sqrshrun_h      6,   \r0, \r1, \r2, \r3
1538*c0909341SAndroid Build Coastguard Worker        umin            \r0\().8h, \r0\().8h, v31.8h
1539*c0909341SAndroid Build Coastguard Worker        umin            \r1\().8h, \r2\().8h, v31.8h
1540*c0909341SAndroid Build Coastguard Worker.else
1541*c0909341SAndroid Build Coastguard Worker        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
1542*c0909341SAndroid Build Coastguard Worker        xtn_h           \r0, \r1, \r2, \r3
1543*c0909341SAndroid Build Coastguard Worker        sub             \r0\().8h, \r0\().8h, v29.8h
1544*c0909341SAndroid Build Coastguard Worker        sub             \r1\().8h, \r2\().8h, v29.8h
1545*c0909341SAndroid Build Coastguard Worker.endif
1546*c0909341SAndroid Build Coastguard Worker        st1             {\r0\().8h, \r1\().8h}, [\dst], \strd
1547*c0909341SAndroid Build Coastguard Worker.endm
1547*c0909341SAndroid Build Coastguard Worker.endm
1548*c0909341SAndroid Build Coastguard Worker
// make_8tap_fn: emit one exported entry point named
// <op>_8tap_<type>_16bpc_neon.
//   op     - prefix of both the exported name and the branch target.
//   type   - filter-combination name embedded in the exported symbol.
//   type_h - horizontal filter-type selector, loaded into w9.
//   type_v - vertical filter-type selector, loaded into w10.
//   taps   - selects the shared body <op>_<taps>_neon to branch to.
// The stub only sets w9 / w10 and then branches (no link), so every other
// register reaches the shared body unchanged and its ret returns straight to
// the original caller.
1549*c0909341SAndroid Build Coastguard Worker.macro make_8tap_fn op, type, type_h, type_v, taps
1550*c0909341SAndroid Build Coastguard Workerfunction \op\()_8tap_\type\()_16bpc_neon, export=1
1551*c0909341SAndroid Build Coastguard Worker        mov             w9,  \type_h
1552*c0909341SAndroid Build Coastguard Worker        mov             w10, \type_v
1553*c0909341SAndroid Build Coastguard Worker        b               \op\()_\taps\()_neon
1554*c0909341SAndroid Build Coastguard Workerendfunc
1555*c0909341SAndroid Build Coastguard Worker.endm
1556*c0909341SAndroid Build Coastguard Worker
// Packed filter-type selectors (presumably the values handed to make_8tap_fn
// as type_h / type_v, which places them in w9 / w10).
// Layout, as consumed by filter_fn:
//   bits 7-13 : row offset (type * 15) used when the 8-tap field is selected
//   bits 0-6  : row offset (type * 15) used when the 4-tap field is selected
// filter_fn adds the subpel position to both fields at once and then picks
// one field as the row index into mc_subpel_filters. SHARP uses the same
// offset as REGULAR (3*15) in its low field.
1557*c0909341SAndroid Build Coastguard Worker// No spaces in these expressions, due to gas-preprocessor.
1558*c0909341SAndroid Build Coastguard Worker#define REGULAR ((0*15<<7)|3*15)
1559*c0909341SAndroid Build Coastguard Worker#define SMOOTH  ((1*15<<7)|4*15)
1560*c0909341SAndroid Build Coastguard Worker#define SHARP   ((2*15<<7)|3*15)
1561*c0909341SAndroid Build Coastguard Worker
1562*c0909341SAndroid Build Coastguard Worker.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2, taps
// Shared body of the 16 bpc subpel filter for one type (put / prep) and one
// tap count. The exported make_8tap_fn stubs branch here with the packed
// filter-type selectors in w9 (horizontal) and w10 (vertical).
// This function decides which path is needed (h only, v only, hv, or no
// filtering at all) and contains the horizontal-only implementation; the
// v and hv paths and the unfiltered function live outside this function.
// Vector register roles set up here: v31 = bitdepth_max, v30 = rounding bias
// (put) or negated shift amount (prep), v28 = PREP_BIAS (prep, h path).
1563*c0909341SAndroid Build Coastguard Workerfunction \type\()_\taps\()_neon
1564*c0909341SAndroid Build Coastguard Worker.ifc \bdmax, w8
// NOTE(review): bitdepth_max is fetched from the stack only when the macro is
// instantiated with bdmax = w8; presumably it is a stack-passed argument in
// that instantiation - confirm against the C prototype.
1565*c0909341SAndroid Build Coastguard Worker        ldr             w8,  [sp]
1566*c0909341SAndroid Build Coastguard Worker.endif
// Multiplying by 0x4081 copies the subpel position to bits 0, 7 and 14.
// Adding the packed selector then turns the two low 7-bit fields into
// filter-table row indices, while the copy at bit 14 keeps the plain
// position for the "is there a subpel offset" tests below.
1567*c0909341SAndroid Build Coastguard Worker        mov             w11,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
1568*c0909341SAndroid Build Coastguard Worker        mul             \mx,  \mx, w11
1569*c0909341SAndroid Build Coastguard Worker        mul             \my,  \my, w11
1570*c0909341SAndroid Build Coastguard Worker        add             \mx,  \mx, w9  // mx, 8tap_h, 4tap_h
1571*c0909341SAndroid Build Coastguard Worker        add             \my,  \my, w10 // my, 8tap_v, 4tap_v
1572*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
// prep: the output stride is derived from the width, d_strd = 2 * w
// (one 16-bit element per pixel).
1573*c0909341SAndroid Build Coastguard Worker        uxtw            \d_strd, \w
1574*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
1575*c0909341SAndroid Build Coastguard Worker.endif
1576*c0909341SAndroid Build Coastguard Worker
// w9 is reused from here on: clz(w) - 24 is the jump-table index (0 for
// w = 128 up to 6 for w = 2). The flags set by the tst on mx survive until
// the b.ne below because none of the instructions in between write flags.
1577*c0909341SAndroid Build Coastguard Worker        dup             v31.8h,  \bdmax        // bitdepth_max
1578*c0909341SAndroid Build Coastguard Worker        clz             \bdmax,  \bdmax
1579*c0909341SAndroid Build Coastguard Worker        clz             w9,  \w
1580*c0909341SAndroid Build Coastguard Worker        sub             \bdmax,  \bdmax,  #18  // intermediate_bits = clz(bitdepth_max) - 18
1581*c0909341SAndroid Build Coastguard Worker        mov             w12, #6
1582*c0909341SAndroid Build Coastguard Worker        tst             \mx, #(0x7f << 14)
1583*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #24
1584*c0909341SAndroid Build Coastguard Worker        add             w13, w12, \bdmax       // 6 + intermediate_bits
1585*c0909341SAndroid Build Coastguard Worker        sub             w12, w12, \bdmax       // 6 - intermediate_bits
// x11 = filter table base biased by -8 bytes; rows are 8 bytes each (indices
// are scaled by 8 below). NOTE(review): presumably the bias makes row index 1
// map to the first table row - confirm against the table definition.
1586*c0909341SAndroid Build Coastguard Worker        movrel          x11, X(mc_subpel_filters), -8
1587*c0909341SAndroid Build Coastguard Worker        b.ne            L(\type\()_\taps\()_h)
1588*c0909341SAndroid Build Coastguard Worker        tst             \my, #(0x7f << 14)
1589*c0909341SAndroid Build Coastguard Worker        b.ne            L(\type\()_\taps\()_v)
// Neither direction has a subpel offset: hand over to the unfiltered
// function (defined outside this chunk).
1590*c0909341SAndroid Build Coastguard Worker        b               \type\()_16bpc_neon
1591*c0909341SAndroid Build Coastguard Worker
// Horizontal filtering is needed. Pick the filter row: the low field (4-tap
// index) when w <= 4, otherwise the field in bits 7-13 (8-tap index).
1592*c0909341SAndroid Build Coastguard WorkerL(\type\()_\taps\()_h):
1593*c0909341SAndroid Build Coastguard Worker        cmp             \w,   #4
1594*c0909341SAndroid Build Coastguard Worker        ubfx            w10,  \mx, #7, #7
1595*c0909341SAndroid Build Coastguard Worker        and             \mx,  \mx, #0x7f
1596*c0909341SAndroid Build Coastguard Worker        b.le            4f
1597*c0909341SAndroid Build Coastguard Worker        mov             \mx,  w10
1598*c0909341SAndroid Build Coastguard Worker4:
// xmx = address of the selected 8-byte filter row. If my also carries a
// subpel offset, continue in the hv path (defined outside this chunk).
1599*c0909341SAndroid Build Coastguard Worker        tst             \my,  #(0x7f << 14)
1600*c0909341SAndroid Build Coastguard Worker        add             \xmx, x11, \mx, uxtw #3
1601*c0909341SAndroid Build Coastguard Worker        b.ne            L(\type\()_\taps\()_hv)
1602*c0909341SAndroid Build Coastguard Worker
// Horizontal only: load the width-specific entry offset from the jump table.
1603*c0909341SAndroid Build Coastguard Worker        movrel          x10, \type\()_\taps\()_h_tbl
1604*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x10, x9, lsl #2]
1605*c0909341SAndroid Build Coastguard Worker.ifc \type, put
// put: v30 becomes a rounding bias that is preloaded into every accumulator,
// which is why the loops below narrow with the non-rounding sqshrun #6.
// NOTE(review): 34 = 32 + 2 and 40 = 32 + 8; presumably the rounding terms of
// a two-stage shift folded into one constant - confirm against the C code.
1606*c0909341SAndroid Build Coastguard Worker        mov             w12,  #34              // rounding for 10-bit
1607*c0909341SAndroid Build Coastguard Worker        mov             w13,  #40              // rounding for 12-bit
1608*c0909341SAndroid Build Coastguard Worker        cmp             \bdmax, #2             // 10-bit: 4, 12-bit: 2
1609*c0909341SAndroid Build Coastguard Worker        csel            w12,  w12,  w13,  ne   // select rounding based on \bdmax
1610*c0909341SAndroid Build Coastguard Worker.else
// prep: v30 becomes the negated shift used by srshl, v28 holds PREP_BIAS.
1611*c0909341SAndroid Build Coastguard Worker        neg             w12,  w12              // -(6 - intermediate_bits)
1612*c0909341SAndroid Build Coastguard Worker        movi            v28.8h,  #(PREP_BIAS >> 8), lsl #8
1613*c0909341SAndroid Build Coastguard Worker.endif
1614*c0909341SAndroid Build Coastguard Worker        add             x10, x10, x9
1615*c0909341SAndroid Build Coastguard Worker        dup             v30.4s,  w12           // rounding or shift amount
1616*c0909341SAndroid Build Coastguard Worker        br              x10
1617*c0909341SAndroid Build Coastguard Worker
// Width 2. The code is only assembled for put; the label itself is always
// emitted so that the jump-table entry resolves for prep as well.
1618*c0909341SAndroid Build Coastguard Worker20:     // 2xN h
1619*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1620*c0909341SAndroid Build Coastguard Worker.ifc \type, put
// Load bytes 2..5 of the filter row (the four middle coefficients) and widen
// them to 16 bit. src is moved back by one 16-bit pixel. Two rows are
// processed per iteration: ds2 / sr2 address the second row and both strides
// are doubled.
1621*c0909341SAndroid Build Coastguard Worker        ldur            s0,  [\xmx, #2]
1622*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  #2
1623*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd
1624*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd
1625*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1
1626*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1
1627*c0909341SAndroid Build Coastguard Worker        sxtl            v0.8h,   v0.8b
1628*c0909341SAndroid Build Coastguard Worker2:
// The trn1/trn2 on .2s pair up pixels of both rows so that each smlal
// handles row 0 in lanes 0-1 and row 1 in lanes 2-3:
//   v3 = {r0 p0,p1 | r1 p0,p1}   v4 = {r0 p1,p2 | r1 p1,p2}
//   v6 = {r0 p2,p3 | r1 p2,p3}   v7 = {r0 p3,p4 | r1 p3,p4}
// The subs on h sets the flags consumed by the b.gt at the loop end.
1629*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h},  [\src], \s_strd
1630*c0909341SAndroid Build Coastguard Worker        ld1             {v6.8h},  [\sr2], \s_strd
1631*c0909341SAndroid Build Coastguard Worker        mov             v2.16b,  v30.16b
1632*c0909341SAndroid Build Coastguard Worker        ext             v5.16b,  v4.16b,  v4.16b,  #2
1633*c0909341SAndroid Build Coastguard Worker        ext             v7.16b,  v6.16b,  v6.16b,  #2
1634*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
1635*c0909341SAndroid Build Coastguard Worker        trn1            v3.2s,   v4.2s,   v6.2s
1636*c0909341SAndroid Build Coastguard Worker        trn2            v6.2s,   v4.2s,   v6.2s
1637*c0909341SAndroid Build Coastguard Worker        trn1            v4.2s,   v5.2s,   v7.2s
1638*c0909341SAndroid Build Coastguard Worker        trn2            v7.2s,   v5.2s,   v7.2s
1639*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v3.4h,   v0.h[0]
1640*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v4.4h,   v0.h[1]
1641*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v6.4h,   v0.h[2]
1642*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v7.4h,   v0.h[3]
1643*c0909341SAndroid Build Coastguard Worker        sqshrun         v2.4h,   v2.4s,   #6
1644*c0909341SAndroid Build Coastguard Worker        umin            v2.4h,   v2.4h,   v31.4h
1645*c0909341SAndroid Build Coastguard Worker        st1             {v2.s}[0], [\dst], \d_strd
1646*c0909341SAndroid Build Coastguard Worker        st1             {v2.s}[1], [\ds2], \d_strd
1647*c0909341SAndroid Build Coastguard Worker        b.gt            2b
1648*c0909341SAndroid Build Coastguard Worker        ret
1649*c0909341SAndroid Build Coastguard Worker.endif
1650*c0909341SAndroid Build Coastguard Worker
// Width 4, put and prep. Same setup as the 2xN case: four middle
// coefficients, src moved back by one pixel, two rows per iteration.
1651*c0909341SAndroid Build Coastguard Worker40:     // 4xN h
1652*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1653*c0909341SAndroid Build Coastguard Worker        ldur            s0,  [\xmx, #2]
1654*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  #2
1655*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd
1656*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd
1657*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1
1658*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1
1659*c0909341SAndroid Build Coastguard Worker        sxtl            v0.8h,   v0.8b
1660*c0909341SAndroid Build Coastguard Worker4:
// v16 / v20 hold eight pixels of row 0 / row 1; the ext instructions build
// the copies shifted by 1, 2 and 3 pixels. put starts each accumulator from
// the rounding bias in v30 (smlal), prep starts from zero (smull).
1661*c0909341SAndroid Build Coastguard Worker        ld1             {v16.8h}, [\src], \s_strd
1662*c0909341SAndroid Build Coastguard Worker        ld1             {v20.8h}, [\sr2], \s_strd
1663*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1664*c0909341SAndroid Build Coastguard Worker        mov             v2.16b,  v30.16b
1665*c0909341SAndroid Build Coastguard Worker        mov             v3.16b,  v30.16b
1666*c0909341SAndroid Build Coastguard Worker.endif
1667*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v16.16b, v16.16b, #2
1668*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v16.16b, v16.16b, #4
1669*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v16.16b, v16.16b, #6
1670*c0909341SAndroid Build Coastguard Worker        ext             v21.16b, v20.16b, v20.16b, #2
1671*c0909341SAndroid Build Coastguard Worker        ext             v22.16b, v20.16b, v20.16b, #4
1672*c0909341SAndroid Build Coastguard Worker        ext             v23.16b, v20.16b, v20.16b, #6
1673*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
1674*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1675*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v16.4h,  v0.h[0]
1676*c0909341SAndroid Build Coastguard Worker.else
1677*c0909341SAndroid Build Coastguard Worker        smull           v2.4s,   v16.4h,  v0.h[0]
1678*c0909341SAndroid Build Coastguard Worker.endif
1679*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v17.4h,  v0.h[1]
1680*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v18.4h,  v0.h[2]
1681*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v19.4h,  v0.h[3]
1682*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1683*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v20.4h,  v0.h[0]
1684*c0909341SAndroid Build Coastguard Worker.else
1685*c0909341SAndroid Build Coastguard Worker        smull           v3.4s,   v20.4h,  v0.h[0]
1686*c0909341SAndroid Build Coastguard Worker.endif
1687*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v21.4h,  v0.h[1]
1688*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v22.4h,  v0.h[2]
1689*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v23.4h,  v0.h[3]
1690*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1691*c0909341SAndroid Build Coastguard Worker        sqshrun         v16.4h,  v2.4s,   #6
1692*c0909341SAndroid Build Coastguard Worker        sqshrun2        v16.8h,  v3.4s,   #6
1693*c0909341SAndroid Build Coastguard Worker        umin            v16.8h,  v16.8h,  v31.8h
1694*c0909341SAndroid Build Coastguard Worker.else
1695*c0909341SAndroid Build Coastguard Worker        srshl           v16.4s,  v2.4s,   v30.4s // -(6-intermediate_bits)
1696*c0909341SAndroid Build Coastguard Worker        srshl           v20.4s,  v3.4s,   v30.4s // -(6-intermediate_bits)
1697*c0909341SAndroid Build Coastguard Worker        uzp1            v16.8h,  v16.8h,  v20.8h // Same as xtn, xtn2
1698*c0909341SAndroid Build Coastguard Worker        sub             v16.8h,  v16.8h,  v28.8h // PREP_BIAS
1699*c0909341SAndroid Build Coastguard Worker.endif
// Low half of v16 is row 0, high half is row 1.
1700*c0909341SAndroid Build Coastguard Worker        st1             {v16.8b},   [\dst], \d_strd
1701*c0909341SAndroid Build Coastguard Worker        st1             {v16.d}[1], [\ds2], \d_strd
1702*c0909341SAndroid Build Coastguard Worker        b.gt            4b
1703*c0909341SAndroid Build Coastguard Worker        ret
1704*c0909341SAndroid Build Coastguard Worker
// All widths from 8 up share one implementation that walks each row pair in
// blocks of eight pixels.
1705*c0909341SAndroid Build Coastguard Worker80:
1706*c0909341SAndroid Build Coastguard Worker160:
1707*c0909341SAndroid Build Coastguard Worker320:
1708*c0909341SAndroid Build Coastguard Worker640:
1709*c0909341SAndroid Build Coastguard Worker1280:   // 8xN, 16xN, 32xN, ... h
1710*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
// Load the full 8-coefficient row. src is moved back by two pixels for 6tap
// (coefficients 1..6 are used) or three pixels for 8tap (all coefficients).
1711*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b}, [\xmx]
1712*c0909341SAndroid Build Coastguard Worker.ifc \taps, 6tap
1713*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  #4
1714*c0909341SAndroid Build Coastguard Worker.else
1715*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  #6
1716*c0909341SAndroid Build Coastguard Worker.endif
1717*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd
1718*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd
1719*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1
1720*c0909341SAndroid Build Coastguard Worker        sxtl            v0.8h,   v0.8b
1721*c0909341SAndroid Build Coastguard Worker
// Turn the strides into "end of this row pair -> start of the next" deltas.
// Per row the source pointers advance by 32 bytes for the first block plus
// 16 bytes for each further block, i.e. 2*w + 16 bytes in total; the
// destination pointers advance by 2*w bytes. For prep, d_strd stays at 2*w,
// so adding it after a row skips exactly the row written through ds2.
1722*c0909341SAndroid Build Coastguard Worker        sub             \s_strd,  \s_strd,  \w, uxtw #1
1723*c0909341SAndroid Build Coastguard Worker        sub             \s_strd,  \s_strd,  #16
1724*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1725*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1
1726*c0909341SAndroid Build Coastguard Worker        sub             \d_strd,  \d_strd,  \w, uxtw #1
1727*c0909341SAndroid Build Coastguard Worker.endif
1728*c0909341SAndroid Build Coastguard Worker81:
// Start of a row pair: v16,v17 / v20,v21 hold 16 pixels of row 0 / row 1.
// mx is no longer needed as a filter index and is reused as the column
// counter.
1729*c0909341SAndroid Build Coastguard Worker        ld1             {v16.8h, v17.8h},  [\src], #32
1730*c0909341SAndroid Build Coastguard Worker        ld1             {v20.8h, v21.8h},  [\sr2], #32
1731*c0909341SAndroid Build Coastguard Worker        mov             \mx, \w
1732*c0909341SAndroid Build Coastguard Worker
1733*c0909341SAndroid Build Coastguard Worker8:
// One block of eight output pixels for both rows. Accumulators: v18/v19 for
// row 0 and v22/v23 for row 1 (low / high four pixels). put seeds them with
// the rounding bias from v30, prep starts with smull. v24 / v25 receive the
// shifted source windows for each further coefficient.
1734*c0909341SAndroid Build Coastguard Worker.ifc \taps, 6tap
1735*c0909341SAndroid Build Coastguard Worker    .ifc \type, put
1736*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v30.16b
1737*c0909341SAndroid Build Coastguard Worker        mov             v19.16b, v30.16b
1738*c0909341SAndroid Build Coastguard Worker        smlal           v18.4s,  v16.4h,  v0.h[1]
1739*c0909341SAndroid Build Coastguard Worker        smlal2          v19.4s,  v16.8h,  v0.h[1]
1740*c0909341SAndroid Build Coastguard Worker        mov             v22.16b, v30.16b
1741*c0909341SAndroid Build Coastguard Worker        mov             v23.16b, v30.16b
1742*c0909341SAndroid Build Coastguard Worker        smlal           v22.4s,  v20.4h,  v0.h[1]
1743*c0909341SAndroid Build Coastguard Worker        smlal2          v23.4s,  v20.8h,  v0.h[1]
1744*c0909341SAndroid Build Coastguard Worker    .else
1745*c0909341SAndroid Build Coastguard Worker        smull           v18.4s,  v16.4h,  v0.h[1]
1746*c0909341SAndroid Build Coastguard Worker        smull2          v19.4s,  v16.8h,  v0.h[1]
1747*c0909341SAndroid Build Coastguard Worker        smull           v22.4s,  v20.4h,  v0.h[1]
1748*c0909341SAndroid Build Coastguard Worker        smull2          v23.4s,  v20.8h,  v0.h[1]
1749*c0909341SAndroid Build Coastguard Worker    .endif
// 6tap: coefficient i (2..6) pairs with the window shifted by i-1 pixels,
// because coefficient 1 was applied to the unshifted data above.
1750*c0909341SAndroid Build Coastguard Worker    .irpc i, 23456
1751*c0909341SAndroid Build Coastguard Worker        ext             v24.16b, v16.16b, v17.16b, #(2*\i-2)
1752*c0909341SAndroid Build Coastguard Worker        ext             v25.16b, v20.16b, v21.16b, #(2*\i-2)
1753*c0909341SAndroid Build Coastguard Worker        smlal           v18.4s,  v24.4h,  v0.h[\i]
1754*c0909341SAndroid Build Coastguard Worker        smlal2          v19.4s,  v24.8h,  v0.h[\i]
1755*c0909341SAndroid Build Coastguard Worker        smlal           v22.4s,  v25.4h,  v0.h[\i]
1756*c0909341SAndroid Build Coastguard Worker        smlal2          v23.4s,  v25.8h,  v0.h[\i]
1757*c0909341SAndroid Build Coastguard Worker    .endr
1758*c0909341SAndroid Build Coastguard Worker.else   // 8tap
1759*c0909341SAndroid Build Coastguard Worker    .ifc \type, put
1760*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v30.16b
1761*c0909341SAndroid Build Coastguard Worker        mov             v19.16b, v30.16b
1762*c0909341SAndroid Build Coastguard Worker        smlal           v18.4s,  v16.4h,  v0.h[0]
1763*c0909341SAndroid Build Coastguard Worker        smlal2          v19.4s,  v16.8h,  v0.h[0]
1764*c0909341SAndroid Build Coastguard Worker        mov             v22.16b, v30.16b
1765*c0909341SAndroid Build Coastguard Worker        mov             v23.16b, v30.16b
1766*c0909341SAndroid Build Coastguard Worker        smlal           v22.4s,  v20.4h,  v0.h[0]
1767*c0909341SAndroid Build Coastguard Worker        smlal2          v23.4s,  v20.8h,  v0.h[0]
1768*c0909341SAndroid Build Coastguard Worker    .else
1769*c0909341SAndroid Build Coastguard Worker        smull           v18.4s,  v16.4h,  v0.h[0]
1770*c0909341SAndroid Build Coastguard Worker        smull2          v19.4s,  v16.8h,  v0.h[0]
1771*c0909341SAndroid Build Coastguard Worker        smull           v22.4s,  v20.4h,  v0.h[0]
1772*c0909341SAndroid Build Coastguard Worker        smull2          v23.4s,  v20.8h,  v0.h[0]
1773*c0909341SAndroid Build Coastguard Worker    .endif
// 8tap: coefficient i (1..7) pairs with the window shifted by i pixels.
1774*c0909341SAndroid Build Coastguard Worker    .irpc i, 1234567
1775*c0909341SAndroid Build Coastguard Worker        ext             v24.16b, v16.16b, v17.16b, #(2*\i)
1776*c0909341SAndroid Build Coastguard Worker        ext             v25.16b, v20.16b, v21.16b, #(2*\i)
1777*c0909341SAndroid Build Coastguard Worker        smlal           v18.4s,  v24.4h,  v0.h[\i]
1778*c0909341SAndroid Build Coastguard Worker        smlal2          v19.4s,  v24.8h,  v0.h[\i]
1779*c0909341SAndroid Build Coastguard Worker        smlal           v22.4s,  v25.4h,  v0.h[\i]
1780*c0909341SAndroid Build Coastguard Worker        smlal2          v23.4s,  v25.8h,  v0.h[\i]
1781*c0909341SAndroid Build Coastguard Worker    .endr
1782*c0909341SAndroid Build Coastguard Worker.endif
// Column counter update; its flags are consumed by the b.le after the
// stores (the SIMD and store instructions in between do not write flags).
1783*c0909341SAndroid Build Coastguard Worker        subs            \mx, \mx, #8
1784*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1785*c0909341SAndroid Build Coastguard Worker        sqshrun         v18.4h,  v18.4s,  #6
1786*c0909341SAndroid Build Coastguard Worker        sqshrun2        v18.8h,  v19.4s,  #6
1787*c0909341SAndroid Build Coastguard Worker        sqshrun         v22.4h,  v22.4s,  #6
1788*c0909341SAndroid Build Coastguard Worker        sqshrun2        v22.8h,  v23.4s,  #6
1789*c0909341SAndroid Build Coastguard Worker        umin            v18.8h,  v18.8h,  v31.8h
1790*c0909341SAndroid Build Coastguard Worker        umin            v22.8h,  v22.8h,  v31.8h
1791*c0909341SAndroid Build Coastguard Worker.else
1792*c0909341SAndroid Build Coastguard Worker        srshl           v18.4s,  v18.4s,  v30.4s // -(6-intermediate_bits)
1793*c0909341SAndroid Build Coastguard Worker        srshl           v19.4s,  v19.4s,  v30.4s // -(6-intermediate_bits)
1794*c0909341SAndroid Build Coastguard Worker        srshl           v22.4s,  v22.4s,  v30.4s // -(6-intermediate_bits)
1795*c0909341SAndroid Build Coastguard Worker        srshl           v23.4s,  v23.4s,  v30.4s // -(6-intermediate_bits)
1796*c0909341SAndroid Build Coastguard Worker        uzp1            v18.8h,  v18.8h,  v19.8h // Same as xtn, xtn2
1797*c0909341SAndroid Build Coastguard Worker        uzp1            v22.8h,  v22.8h,  v23.8h // Ditto
1798*c0909341SAndroid Build Coastguard Worker        sub             v18.8h,  v18.8h,  v28.8h // PREP_BIAS
1799*c0909341SAndroid Build Coastguard Worker        sub             v22.8h,  v22.8h,  v28.8h // PREP_BIAS
1800*c0909341SAndroid Build Coastguard Worker.endif
1801*c0909341SAndroid Build Coastguard Worker        st1             {v18.8h}, [\dst], #16
1802*c0909341SAndroid Build Coastguard Worker        st1             {v22.8h}, [\ds2], #16
1803*c0909341SAndroid Build Coastguard Worker        b.le            9f
1804*c0909341SAndroid Build Coastguard Worker
// More columns left: slide the 16-pixel window by eight pixels and fetch the
// next eight pixels of each row.
1805*c0909341SAndroid Build Coastguard Worker        mov             v16.16b, v17.16b
1806*c0909341SAndroid Build Coastguard Worker        mov             v20.16b, v21.16b
1807*c0909341SAndroid Build Coastguard Worker        ld1             {v17.8h}, [\src], #16
1808*c0909341SAndroid Build Coastguard Worker        ld1             {v21.8h}, [\sr2], #16
1809*c0909341SAndroid Build Coastguard Worker        b               8b
1810*c0909341SAndroid Build Coastguard Worker
1811*c0909341SAndroid Build Coastguard Worker9:
// Row pair finished: apply the precomputed deltas to reach the next pair.
1812*c0909341SAndroid Build Coastguard Worker        add             \dst,  \dst,  \d_strd
1813*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \ds2,  \d_strd
1814*c0909341SAndroid Build Coastguard Worker        add             \src,  \src,  \s_strd
1815*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \sr2,  \s_strd
1816*c0909341SAndroid Build Coastguard Worker
1817*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
1818*c0909341SAndroid Build Coastguard Worker        b.gt            81b
1819*c0909341SAndroid Build Coastguard Worker        ret
1820*c0909341SAndroid Build Coastguard Workerendfunc
1821*c0909341SAndroid Build Coastguard Worker
// Width dispatch table for the horizontal-only path. Each entry is the signed
// 32-bit offset from the table base to a width label; the dispatcher loads it
// with ldrsw, adds the table address and jumps there. The index is
// clz(w) - 24, so entry 0 is used for w = 128 and entry 6 for w = 2. The
// labels 80 through 1280 all mark the same code.
1822*c0909341SAndroid Build Coastguard Workerjumptable \type\()_\taps\()_h_tbl
1823*c0909341SAndroid Build Coastguard Worker        .word 1280b - \type\()_\taps\()_h_tbl
1824*c0909341SAndroid Build Coastguard Worker        .word 640b  - \type\()_\taps\()_h_tbl
1825*c0909341SAndroid Build Coastguard Worker        .word 320b  - \type\()_\taps\()_h_tbl
1826*c0909341SAndroid Build Coastguard Worker        .word 160b  - \type\()_\taps\()_h_tbl
1827*c0909341SAndroid Build Coastguard Worker        .word 80b   - \type\()_\taps\()_h_tbl
1828*c0909341SAndroid Build Coastguard Worker        .word 40b   - \type\()_\taps\()_h_tbl
1829*c0909341SAndroid Build Coastguard Worker        .word 20b   - \type\()_\taps\()_h_tbl
1830*c0909341SAndroid Build Coastguard Workerendjumptable
1831*c0909341SAndroid Build Coastguard Worker
1832*c0909341SAndroid Build Coastguard Worker
1833*c0909341SAndroid Build Coastguard Workerfunction L(\type\()_\taps\()_v)
1834*c0909341SAndroid Build Coastguard Worker        cmp             \h,  #4
1835*c0909341SAndroid Build Coastguard Worker        ubfx            w10, \my, #7, #7
1836*c0909341SAndroid Build Coastguard Worker        and             \my, \my, #0x7f
1837*c0909341SAndroid Build Coastguard Worker        b.le            4f
1838*c0909341SAndroid Build Coastguard Worker        mov             \my, w10
1839*c0909341SAndroid Build Coastguard Worker4:
1840*c0909341SAndroid Build Coastguard Worker        add             \xmy, x11, \my, uxtw #3
1841*c0909341SAndroid Build Coastguard Worker
1842*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1843*c0909341SAndroid Build Coastguard Worker        dup             v30.4s,  w12           // 6 - intermediate_bits
1844*c0909341SAndroid Build Coastguard Worker        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
1845*c0909341SAndroid Build Coastguard Worker.endif
1846*c0909341SAndroid Build Coastguard Worker        movrel          x10, \type\()_\taps\()_v_tbl
1847*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x10, x9, lsl #2]
1848*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1849*c0909341SAndroid Build Coastguard Worker        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
1850*c0909341SAndroid Build Coastguard Worker.endif
1851*c0909341SAndroid Build Coastguard Worker        add             x10, x10, x9
1852*c0909341SAndroid Build Coastguard Worker        br              x10
1853*c0909341SAndroid Build Coastguard Worker
1854*c0909341SAndroid Build Coastguard Worker20:     // 2xN v
1855*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1856*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1857*c0909341SAndroid Build Coastguard Worker        b.gt            28f
1858*c0909341SAndroid Build Coastguard Worker
1859*c0909341SAndroid Build Coastguard Worker        cmp             \h,  #2
1860*c0909341SAndroid Build Coastguard Worker        ldur            s0,  [\xmy, #2]
1861*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  \s_strd
1862*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd
1863*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd
1864*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1
1865*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1
1866*c0909341SAndroid Build Coastguard Worker        sxtl            v0.8h,   v0.8b
1867*c0909341SAndroid Build Coastguard Worker
1868*c0909341SAndroid Build Coastguard Worker        // 2x2 v
1869*c0909341SAndroid Build Coastguard Worker        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1870*c0909341SAndroid Build Coastguard Worker        interleave_1_s  v1,  v2,  v3,  v4,  v5
1871*c0909341SAndroid Build Coastguard Worker        b.gt            24f
1872*c0909341SAndroid Build Coastguard Worker        smull_smlal_4tap v6, v1,  v2,  v3,  v4
1873*c0909341SAndroid Build Coastguard Worker        sqrshrun_h      6,   v6
1874*c0909341SAndroid Build Coastguard Worker        umin_h          v31, .8h, v6
1875*c0909341SAndroid Build Coastguard Worker        st_s            \d_strd, v6, 2
1876*c0909341SAndroid Build Coastguard Worker        ret
1877*c0909341SAndroid Build Coastguard Worker
1878*c0909341SAndroid Build Coastguard Worker24:     // 2x4 v
1879*c0909341SAndroid Build Coastguard Worker        load_s          \sr2, \src, \s_strd, v6, v7
1880*c0909341SAndroid Build Coastguard Worker        interleave_1_s  v5,  v6,  v7
1881*c0909341SAndroid Build Coastguard Worker        smull_smlal_4tap v16, v1, v2, v3, v4
1882*c0909341SAndroid Build Coastguard Worker        smull_smlal_4tap v17, v3, v4, v5, v6
1883*c0909341SAndroid Build Coastguard Worker        sqrshrun_h      6,   v16, v17
1884*c0909341SAndroid Build Coastguard Worker        umin_h          v31, .8h, v16
1885*c0909341SAndroid Build Coastguard Worker        st_s            \d_strd, v16, 4
1886*c0909341SAndroid Build Coastguard Worker        ret
1887*c0909341SAndroid Build Coastguard Worker
1888*c0909341SAndroid Build Coastguard Worker28:     // 2x6, 2x8, 2x12, 2x16 v
1889*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b}, [\xmy]
1890*c0909341SAndroid Build Coastguard Worker        sub             \sr2,  \src,  \s_strd, lsl #1
1891*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd
1892*c0909341SAndroid Build Coastguard Worker        sub             \src,  \sr2,  \s_strd
1893*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1
1894*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1
1895*c0909341SAndroid Build Coastguard Worker        sxtl            v0.8h,   v0.8b
1896*c0909341SAndroid Build Coastguard Worker
1897*c0909341SAndroid Build Coastguard Worker        load_s          \src, \sr2, \s_strd, v1,  v2,  v3,  v4, v5, v6, v7
1898*c0909341SAndroid Build Coastguard Worker        interleave_1_s  v1,  v2,  v3,  v4,  v5
1899*c0909341SAndroid Build Coastguard Worker        interleave_1_s  v5,  v6,  v7
1900*c0909341SAndroid Build Coastguard Worker216:
1901*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #4
1902*c0909341SAndroid Build Coastguard Worker        load_s          \sr2, \src, \s_strd, v16, v17, v18, v19
1903*c0909341SAndroid Build Coastguard Worker        interleave_1_s  v7,  v16, v17, v18, v19
1904*c0909341SAndroid Build Coastguard Worker        smull_smlal_\taps v24, v1,  v2,  v3,  v4,  v5,  v6,  v7, v16
1905*c0909341SAndroid Build Coastguard Worker        smull_smlal_\taps v25, v3,  v4,  v5,  v6,  v7, v16, v17, v18
1906*c0909341SAndroid Build Coastguard Worker        sqrshrun_h      6,   v24, v25
1907*c0909341SAndroid Build Coastguard Worker        umin_h          v31, .8h, v24
1908*c0909341SAndroid Build Coastguard Worker        st_s            \d_strd, v24, 4
1909*c0909341SAndroid Build Coastguard Worker        b.le            0f
1910*c0909341SAndroid Build Coastguard Worker        cmp             \h,  #2
1911*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v5.16b
1912*c0909341SAndroid Build Coastguard Worker        mov             v2.16b,  v6.16b
1913*c0909341SAndroid Build Coastguard Worker        mov             v3.16b,  v7.16b
1914*c0909341SAndroid Build Coastguard Worker        mov             v4.16b,  v16.16b
1915*c0909341SAndroid Build Coastguard Worker        mov             v5.16b,  v17.16b
1916*c0909341SAndroid Build Coastguard Worker        mov             v6.16b,  v18.16b
1917*c0909341SAndroid Build Coastguard Worker        mov             v7.16b,  v19.16b
1918*c0909341SAndroid Build Coastguard Worker        b.eq            26f
1919*c0909341SAndroid Build Coastguard Worker        b               216b
1920*c0909341SAndroid Build Coastguard Worker26:
1921*c0909341SAndroid Build Coastguard Worker        load_s          \sr2, \src, \s_strd, v16, v17
1922*c0909341SAndroid Build Coastguard Worker        interleave_1_s  v7,  v16, v17
1923*c0909341SAndroid Build Coastguard Worker        smull_smlal_\taps v24, v1, v2,  v3,  v4,  v5,  v6,  v7, v16
1924*c0909341SAndroid Build Coastguard Worker        sqrshrun_h      6,   v24
1925*c0909341SAndroid Build Coastguard Worker        umin_h          v31, .4h, v24
1926*c0909341SAndroid Build Coastguard Worker        st_s            \d_strd, v24, 2
1927*c0909341SAndroid Build Coastguard Worker0:
1928*c0909341SAndroid Build Coastguard Worker        ret
1929*c0909341SAndroid Build Coastguard Worker.endif
1930*c0909341SAndroid Build Coastguard Worker
1931*c0909341SAndroid Build Coastguard Worker40:
1932*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1933*c0909341SAndroid Build Coastguard Worker        b.gt            480f
1934*c0909341SAndroid Build Coastguard Worker
1935*c0909341SAndroid Build Coastguard Worker        // 4x2, 4x4 v
1936*c0909341SAndroid Build Coastguard Worker        cmp             \h,  #2
1937*c0909341SAndroid Build Coastguard Worker        ldur            s0,  [\xmy, #2]
1938*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, \s_strd
1939*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd
1940*c0909341SAndroid Build Coastguard Worker        add             \sr2, \src, \s_strd
1941*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1
1942*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
1943*c0909341SAndroid Build Coastguard Worker        sxtl            v0.8h,   v0.8b
1944*c0909341SAndroid Build Coastguard Worker
1945*c0909341SAndroid Build Coastguard Worker        load_4h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1946*c0909341SAndroid Build Coastguard Worker        smull_smlal_4tap v6,  v1,  v2,  v3,  v4
1947*c0909341SAndroid Build Coastguard Worker        smull_smlal_4tap v7,  v2,  v3,  v4,  v5
1948*c0909341SAndroid Build Coastguard Worker        shift_store_4   \type, \d_strd, v6, v7
1949*c0909341SAndroid Build Coastguard Worker        b.le            0f
1950*c0909341SAndroid Build Coastguard Worker        load_4h         \sr2, \src, \s_strd, v6, v7
1951*c0909341SAndroid Build Coastguard Worker        smull_smlal_4tap v1,  v3,  v4,  v5,  v6
1952*c0909341SAndroid Build Coastguard Worker        smull_smlal_4tap v2,  v4,  v5,  v6,  v7
1953*c0909341SAndroid Build Coastguard Worker        shift_store_4   \type, \d_strd, v1, v2
1954*c0909341SAndroid Build Coastguard Worker0:
1955*c0909341SAndroid Build Coastguard Worker        ret
1956*c0909341SAndroid Build Coastguard Worker
1957*c0909341SAndroid Build Coastguard Worker480:    // 4x6, 4x8, 4x12, 4x16 v
1958*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b}, [\xmy]
1959*c0909341SAndroid Build Coastguard Worker        sub             \sr2, \src, \s_strd, lsl #1
1960*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd
1961*c0909341SAndroid Build Coastguard Worker        sub             \src, \sr2, \s_strd
1962*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1
1963*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
1964*c0909341SAndroid Build Coastguard Worker        sxtl            v0.8h,   v0.8b
1965*c0909341SAndroid Build Coastguard Worker
1966*c0909341SAndroid Build Coastguard Worker        load_4h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
1967*c0909341SAndroid Build Coastguard Worker
1968*c0909341SAndroid Build Coastguard Worker48:
1969*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #4
1970*c0909341SAndroid Build Coastguard Worker        load_4h         \sr2, \src, \s_strd, v23, v24, v25, v26
1971*c0909341SAndroid Build Coastguard Worker        smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
1972*c0909341SAndroid Build Coastguard Worker        smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
1973*c0909341SAndroid Build Coastguard Worker        smull_smlal_\taps v3, v18, v19, v20, v21, v22, v23, v24, v25
1974*c0909341SAndroid Build Coastguard Worker        smull_smlal_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26
1975*c0909341SAndroid Build Coastguard Worker        shift_store_4   \type, \d_strd, v1, v2, v3, v4
1976*c0909341SAndroid Build Coastguard Worker        b.le            0f
1977*c0909341SAndroid Build Coastguard Worker        cmp             \h,  #2
1978*c0909341SAndroid Build Coastguard Worker        mov             v16.8b,  v20.8b
1979*c0909341SAndroid Build Coastguard Worker        mov             v17.8b,  v21.8b
1980*c0909341SAndroid Build Coastguard Worker        mov             v18.8b,  v22.8b
1981*c0909341SAndroid Build Coastguard Worker        mov             v19.8b,  v23.8b
1982*c0909341SAndroid Build Coastguard Worker        mov             v20.8b,  v24.8b
1983*c0909341SAndroid Build Coastguard Worker        mov             v21.8b,  v25.8b
1984*c0909341SAndroid Build Coastguard Worker        mov             v22.8b,  v26.8b
1985*c0909341SAndroid Build Coastguard Worker        b.eq            46f
1986*c0909341SAndroid Build Coastguard Worker        b               48b
1987*c0909341SAndroid Build Coastguard Worker46:
1988*c0909341SAndroid Build Coastguard Worker        load_4h         \sr2, \src, \s_strd, v23, v24
1989*c0909341SAndroid Build Coastguard Worker        smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
1990*c0909341SAndroid Build Coastguard Worker        smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
1991*c0909341SAndroid Build Coastguard Worker        shift_store_4   \type, \d_strd, v1, v2
1992*c0909341SAndroid Build Coastguard Worker0:
1993*c0909341SAndroid Build Coastguard Worker        ret
1994*c0909341SAndroid Build Coastguard Worker
1995*c0909341SAndroid Build Coastguard Worker80:
1996*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1997*c0909341SAndroid Build Coastguard Worker        b.gt            880f
1998*c0909341SAndroid Build Coastguard Worker
1999*c0909341SAndroid Build Coastguard Worker        // 8x2, 8x4 v
2000*c0909341SAndroid Build Coastguard Worker        cmp             \h,  #2
2001*c0909341SAndroid Build Coastguard Worker        ldur            s0,  [\xmy, #2]
2002*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, \s_strd
2003*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd
2004*c0909341SAndroid Build Coastguard Worker        add             \sr2, \src, \s_strd
2005*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1
2006*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
2007*c0909341SAndroid Build Coastguard Worker        sxtl            v0.8h,   v0.8b
2008*c0909341SAndroid Build Coastguard Worker
2009*c0909341SAndroid Build Coastguard Worker        load_8h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
2010*c0909341SAndroid Build Coastguard Worker        smull_smlal_4tap   v16, v1,  v2,  v3,  v4
2011*c0909341SAndroid Build Coastguard Worker        smull2_smlal2_4tap v17, v1,  v2,  v3,  v4
2012*c0909341SAndroid Build Coastguard Worker        smull_smlal_4tap   v18, v2,  v3,  v4,  v5
2013*c0909341SAndroid Build Coastguard Worker        smull2_smlal2_4tap v19, v2,  v3,  v4,  v5
2014*c0909341SAndroid Build Coastguard Worker        shift_store_8   \type, \d_strd, v16, v17, v18, v19
2015*c0909341SAndroid Build Coastguard Worker        b.le            0f
2016*c0909341SAndroid Build Coastguard Worker        load_8h         \sr2, \src, \s_strd, v6, v7
2017*c0909341SAndroid Build Coastguard Worker        smull_smlal_4tap   v16, v3,  v4,  v5,  v6
2018*c0909341SAndroid Build Coastguard Worker        smull2_smlal2_4tap v17, v3,  v4,  v5,  v6
2019*c0909341SAndroid Build Coastguard Worker        smull_smlal_4tap   v18, v4,  v5,  v6,  v7
2020*c0909341SAndroid Build Coastguard Worker        smull2_smlal2_4tap v19, v4,  v5,  v6,  v7
2021*c0909341SAndroid Build Coastguard Worker        shift_store_8   \type, \d_strd, v16, v17, v18, v19
2022*c0909341SAndroid Build Coastguard Worker0:
2023*c0909341SAndroid Build Coastguard Worker        ret
2024*c0909341SAndroid Build Coastguard Worker
2025*c0909341SAndroid Build Coastguard Worker880:    // 8x6, 8x8, 8x16, 8x32 v
2026*c0909341SAndroid Build Coastguard Worker1680:   // 16x8, 16x16, ...
2027*c0909341SAndroid Build Coastguard Worker320:    // 32x8, 32x16, ...
2028*c0909341SAndroid Build Coastguard Worker640:
2029*c0909341SAndroid Build Coastguard Worker1280:
2030*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
2031*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b}, [\xmy]
2032*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, \s_strd
2033*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, \s_strd, lsl #1
2034*c0909341SAndroid Build Coastguard Worker        sxtl            v0.8h,   v0.8b
2035*c0909341SAndroid Build Coastguard Worker        mov             \my,  \h
2036*c0909341SAndroid Build Coastguard Worker168:
2037*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd
2038*c0909341SAndroid Build Coastguard Worker        add             \sr2, \src, \s_strd
2039*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1
2040*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
2041*c0909341SAndroid Build Coastguard Worker
2042*c0909341SAndroid Build Coastguard Worker        load_8h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
2043*c0909341SAndroid Build Coastguard Worker
2044*c0909341SAndroid Build Coastguard Worker88:
2045*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
2046*c0909341SAndroid Build Coastguard Worker        load_8h         \sr2, \src, \s_strd, v23, v24
2047*c0909341SAndroid Build Coastguard Worker        smull_smlal_\taps   v1, v16, v17, v18, v19, v20, v21, v22, v23
2048*c0909341SAndroid Build Coastguard Worker        smull2_smlal2_\taps v2, v16, v17, v18, v19, v20, v21, v22, v23
2049*c0909341SAndroid Build Coastguard Worker        smull_smlal_\taps   v3, v17, v18, v19, v20, v21, v22, v23, v24
2050*c0909341SAndroid Build Coastguard Worker        smull2_smlal2_\taps v4, v17, v18, v19, v20, v21, v22, v23, v24
2051*c0909341SAndroid Build Coastguard Worker        shift_store_8   \type, \d_strd, v1, v2, v3, v4
2052*c0909341SAndroid Build Coastguard Worker        b.le            9f
2053*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
2054*c0909341SAndroid Build Coastguard Worker        load_8h         \sr2, \src, \s_strd, v25, v26
2055*c0909341SAndroid Build Coastguard Worker        smull_smlal_\taps   v1, v18, v19, v20, v21, v22, v23, v24, v25
2056*c0909341SAndroid Build Coastguard Worker        smull2_smlal2_\taps v2, v18, v19, v20, v21, v22, v23, v24, v25
2057*c0909341SAndroid Build Coastguard Worker        smull_smlal_\taps   v3, v19, v20, v21, v22, v23, v24, v25, v26
2058*c0909341SAndroid Build Coastguard Worker        smull2_smlal2_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26
2059*c0909341SAndroid Build Coastguard Worker        shift_store_8   \type, \d_strd, v1, v2, v3, v4
2060*c0909341SAndroid Build Coastguard Worker        b.le            9f
2061*c0909341SAndroid Build Coastguard Worker        mov             v16.16b, v20.16b
2062*c0909341SAndroid Build Coastguard Worker        mov             v17.16b, v21.16b
2063*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v22.16b
2064*c0909341SAndroid Build Coastguard Worker        mov             v19.16b, v23.16b
2065*c0909341SAndroid Build Coastguard Worker        mov             v20.16b, v24.16b
2066*c0909341SAndroid Build Coastguard Worker        mov             v21.16b, v25.16b
2067*c0909341SAndroid Build Coastguard Worker        mov             v22.16b, v26.16b
2068*c0909341SAndroid Build Coastguard Worker        b               88b
2069*c0909341SAndroid Build Coastguard Worker9:
2070*c0909341SAndroid Build Coastguard Worker        subs            \w,  \w,  #8
2071*c0909341SAndroid Build Coastguard Worker        b.le            0f
2072*c0909341SAndroid Build Coastguard Worker        asr             \s_strd, \s_strd, #1
2073*c0909341SAndroid Build Coastguard Worker        asr             \d_strd, \d_strd, #1
2074*c0909341SAndroid Build Coastguard Worker        msub            \src, \s_strd, \xmy, \src
2075*c0909341SAndroid Build Coastguard Worker        msub            \dst, \d_strd, \xmy, \dst
2076*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, \s_strd, lsl #3
2077*c0909341SAndroid Build Coastguard Worker        mov             \h,  \my
2078*c0909341SAndroid Build Coastguard Worker        add             \src, \src, #16
2079*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, #16
2080*c0909341SAndroid Build Coastguard Worker        b               168b
2081*c0909341SAndroid Build Coastguard Worker0:
2082*c0909341SAndroid Build Coastguard Worker        ret
2083*c0909341SAndroid Build Coastguard Worker
2084*c0909341SAndroid Build Coastguard Worker160:
2085*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
2086*c0909341SAndroid Build Coastguard Worker        b.gt            1680b
2087*c0909341SAndroid Build Coastguard Worker
2088*c0909341SAndroid Build Coastguard Worker        // 16x2, 16x4 v
2089*c0909341SAndroid Build Coastguard Worker        ldur            s0,  [\xmy, #2]
2090*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, \s_strd
2091*c0909341SAndroid Build Coastguard Worker        sxtl            v0.8h,   v0.8b
2092*c0909341SAndroid Build Coastguard Worker
2093*c0909341SAndroid Build Coastguard Worker        load_16h        \src, \src, \s_strd, v16, v17, v18, v19, v20, v21
2094*c0909341SAndroid Build Coastguard Worker16:
2095*c0909341SAndroid Build Coastguard Worker        load_16h        \src, \src, \s_strd, v22, v23
2096*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #1
2097*c0909341SAndroid Build Coastguard Worker        smull_smlal_4tap   v1, v16, v18, v20, v22
2098*c0909341SAndroid Build Coastguard Worker        smull2_smlal2_4tap v2, v16, v18, v20, v22
2099*c0909341SAndroid Build Coastguard Worker        smull_smlal_4tap   v3, v17, v19, v21, v23
2100*c0909341SAndroid Build Coastguard Worker        smull2_smlal2_4tap v4, v17, v19, v21, v23
2101*c0909341SAndroid Build Coastguard Worker        shift_store_16  \type, \d_strd, x0, v1, v2, v3, v4
2102*c0909341SAndroid Build Coastguard Worker        b.le            0f
2103*c0909341SAndroid Build Coastguard Worker        mov             v16.16b, v18.16b
2104*c0909341SAndroid Build Coastguard Worker        mov             v17.16b, v19.16b
2105*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v20.16b
2106*c0909341SAndroid Build Coastguard Worker        mov             v19.16b, v21.16b
2107*c0909341SAndroid Build Coastguard Worker        mov             v20.16b, v22.16b
2108*c0909341SAndroid Build Coastguard Worker        mov             v21.16b, v23.16b
2109*c0909341SAndroid Build Coastguard Worker        b               16b
2110*c0909341SAndroid Build Coastguard Worker0:
2111*c0909341SAndroid Build Coastguard Worker        ret
2112*c0909341SAndroid Build Coastguard Workerendfunc
2113*c0909341SAndroid Build Coastguard Worker
// Dispatch table for the vertical-only (_v) filter code above.
// It holds seven 32-bit entries; each one is the distance from the start of
// this table to a numeric local label of the preceding function (the b
// suffix selects the nearest earlier definition of that label).
// Entries are ordered from the widest block down to the narrowest. Going by
// the comments at the labels, 320 = 32 px wide, 160 = 16 px, 80 = 8 px and
// 40 = 4 px; 1280, 640 and 20 presumably stand for 128, 64 and 2 px
// (TODO confirm, those labels carry no size comment in the visible code).
// Labels 1280, 640 and 320 are defined at the same address, so the first
// three entries resolve to the same target.
// NOTE(review): the code that indexes this table is outside this view; the
// sibling _hv function does ldrsw from its table with x9 scaled by 4, adds
// the table base and branches, so the same scheme is presumably used here.
2114*c0909341SAndroid Build Coastguard Workerjumptable \type\()_\taps\()_v_tbl
2115*c0909341SAndroid Build Coastguard Worker        .word 1280b - \type\()_\taps\()_v_tbl
2116*c0909341SAndroid Build Coastguard Worker        .word 640b  - \type\()_\taps\()_v_tbl
2117*c0909341SAndroid Build Coastguard Worker        .word 320b  - \type\()_\taps\()_v_tbl
2118*c0909341SAndroid Build Coastguard Worker        .word 160b  - \type\()_\taps\()_v_tbl
2119*c0909341SAndroid Build Coastguard Worker        .word 80b   - \type\()_\taps\()_v_tbl
2120*c0909341SAndroid Build Coastguard Worker        .word 40b   - \type\()_\taps\()_v_tbl
2121*c0909341SAndroid Build Coastguard Worker        .word 20b   - \type\()_\taps\()_v_tbl
2122*c0909341SAndroid Build Coastguard Workerendjumptable
2123*c0909341SAndroid Build Coastguard Worker
2124*c0909341SAndroid Build Coastguard Workerfunction L(\type\()_\taps\()_hv)
2125*c0909341SAndroid Build Coastguard Worker        cmp             \h,  #4
2126*c0909341SAndroid Build Coastguard Worker        ubfx            w10, \my, #7, #7
2127*c0909341SAndroid Build Coastguard Worker        and             \my, \my, #0x7f
2128*c0909341SAndroid Build Coastguard Worker        b.le            4f
2129*c0909341SAndroid Build Coastguard Worker        mov             \my,  w10
2130*c0909341SAndroid Build Coastguard Worker4:
2131*c0909341SAndroid Build Coastguard Worker        add             \xmy, x11, \my, uxtw #3
2132*c0909341SAndroid Build Coastguard Worker
2133*c0909341SAndroid Build Coastguard Worker        movrel          x10, \type\()_\taps\()_hv_tbl
2134*c0909341SAndroid Build Coastguard Worker        dup             v30.4s,  w12           // 6 - intermediate_bits
2135*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x10, x9, lsl #2]
2136*c0909341SAndroid Build Coastguard Worker        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
2137*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2138*c0909341SAndroid Build Coastguard Worker        dup             v29.4s,  w13           // 6 + intermediate_bits
2139*c0909341SAndroid Build Coastguard Worker.else
2140*c0909341SAndroid Build Coastguard Worker        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
2141*c0909341SAndroid Build Coastguard Worker.endif
2142*c0909341SAndroid Build Coastguard Worker        add             x10, x10, x9
2143*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2144*c0909341SAndroid Build Coastguard Worker        neg             v29.4s,  v29.4s        // -(6+intermediate_bits)
2145*c0909341SAndroid Build Coastguard Worker.endif
2146*c0909341SAndroid Build Coastguard Worker        br              x10
2147*c0909341SAndroid Build Coastguard Worker
2148*c0909341SAndroid Build Coastguard Worker20:
2149*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
2150*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2151*c0909341SAndroid Build Coastguard Worker        ldur            s0,  [\xmx, #2]
2152*c0909341SAndroid Build Coastguard Worker        b.gt            280f
2153*c0909341SAndroid Build Coastguard Worker        ldur            s1,  [\xmy, #2]
2154*c0909341SAndroid Build Coastguard Worker
2155*c0909341SAndroid Build Coastguard Worker        // 2x2, 2x4 hv
2156*c0909341SAndroid Build Coastguard Worker        sub             \sr2, \src, #2
2157*c0909341SAndroid Build Coastguard Worker        sub             \src, \sr2, \s_strd
2158*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd
2159*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1
2160*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
2161*c0909341SAndroid Build Coastguard Worker        sxtl            v0.8h,   v0.8b
2162*c0909341SAndroid Build Coastguard Worker        sxtl            v1.8h,   v1.8b
2163*c0909341SAndroid Build Coastguard Worker        mov             x15, x30
2164*c0909341SAndroid Build Coastguard Worker
2165*c0909341SAndroid Build Coastguard Worker        ld1             {v27.8h}, [\src], \s_strd
2166*c0909341SAndroid Build Coastguard Worker        ext             v28.16b, v27.16b, v27.16b, #2
2167*c0909341SAndroid Build Coastguard Worker        smull           v27.4s,  v27.4h,  v0.4h
2168*c0909341SAndroid Build Coastguard Worker        smull           v28.4s,  v28.4h,  v0.4h
2169*c0909341SAndroid Build Coastguard Worker        addp            v27.4s,  v27.4s,  v28.4s
2170*c0909341SAndroid Build Coastguard Worker        addp            v16.4s,  v27.4s,  v27.4s
2171*c0909341SAndroid Build Coastguard Worker        srshl           v16.2s,  v16.2s,  v30.2s // -(6-intermediate_bits)
2172*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_\taps\()_filter_2)
2173*c0909341SAndroid Build Coastguard Worker        // The intermediates from the horizontal pass fit in 16 bit without
2174*c0909341SAndroid Build Coastguard Worker        // any bias; we could just as well keep them as .4s, but narrowing
2175*c0909341SAndroid Build Coastguard Worker        // them to .4h gives a significant speedup on out of order cores
2176*c0909341SAndroid Build Coastguard Worker        // (at the cost of a smaller slowdown on in-order cores such as A53).
2177*c0909341SAndroid Build Coastguard Worker        xtn             v16.4h,  v16.4s
2178*c0909341SAndroid Build Coastguard Worker
2179*c0909341SAndroid Build Coastguard Worker        trn1            v16.2s,  v16.2s,  v24.2s
2180*c0909341SAndroid Build Coastguard Worker        mov             v17.8b,  v24.8b
2181*c0909341SAndroid Build Coastguard Worker
2182*c0909341SAndroid Build Coastguard Worker2:
2183*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_\taps\()_filter_2)
2184*c0909341SAndroid Build Coastguard Worker
2185*c0909341SAndroid Build Coastguard Worker        ext             v18.8b,  v17.8b,  v24.8b,  #4
2186*c0909341SAndroid Build Coastguard Worker        smull           v2.4s,   v16.4h,  v1.h[0]
2187*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v17.4h,  v1.h[1]
2188*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v18.4h,  v1.h[2]
2189*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v24.4h,  v1.h[3]
2190*c0909341SAndroid Build Coastguard Worker
2191*c0909341SAndroid Build Coastguard Worker        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2192*c0909341SAndroid Build Coastguard Worker        sqxtun          v2.4h,   v2.4s
2193*c0909341SAndroid Build Coastguard Worker        umin            v2.4h,   v2.4h,   v31.4h
2194*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
2195*c0909341SAndroid Build Coastguard Worker        st1             {v2.s}[0], [\dst], \d_strd
2196*c0909341SAndroid Build Coastguard Worker        st1             {v2.s}[1], [\ds2], \d_strd
2197*c0909341SAndroid Build Coastguard Worker        b.le            0f
2198*c0909341SAndroid Build Coastguard Worker        mov             v16.8b,  v18.8b
2199*c0909341SAndroid Build Coastguard Worker        mov             v17.8b,  v24.8b
2200*c0909341SAndroid Build Coastguard Worker        b               2b
2201*c0909341SAndroid Build Coastguard Worker
2202*c0909341SAndroid Build Coastguard Worker280:    // 2x8, 2x16, 2x32 hv
2203*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8b},  [\xmy]
2204*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, #2
2205*c0909341SAndroid Build Coastguard Worker        sub             \sr2, \src, \s_strd, lsl #1
2206*c0909341SAndroid Build Coastguard Worker        sub             \src, \sr2, \s_strd
2207*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd
2208*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1
2209*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
2210*c0909341SAndroid Build Coastguard Worker        sxtl            v0.8h,   v0.8b
2211*c0909341SAndroid Build Coastguard Worker        sxtl            v1.8h,   v1.8b
2212*c0909341SAndroid Build Coastguard Worker        mov             x15, x30
2213*c0909341SAndroid Build Coastguard Worker
2214*c0909341SAndroid Build Coastguard Worker        ld1             {v27.8h}, [\src], \s_strd
2215*c0909341SAndroid Build Coastguard Worker        ext             v28.16b, v27.16b, v27.16b, #2
2216*c0909341SAndroid Build Coastguard Worker        smull           v27.4s,  v27.4h,  v0.4h
2217*c0909341SAndroid Build Coastguard Worker        smull           v28.4s,  v28.4h,  v0.4h
2218*c0909341SAndroid Build Coastguard Worker        addp            v27.4s,  v27.4s,  v28.4s
2219*c0909341SAndroid Build Coastguard Worker        addp            v16.4s,  v27.4s,  v27.4s
2220*c0909341SAndroid Build Coastguard Worker        srshl           v16.2s,  v16.2s,  v30.2s // -(6-intermediate_bits)
2221*c0909341SAndroid Build Coastguard Worker        // The intermediates from the horizontal pass fit in 16 bit without
2222*c0909341SAndroid Build Coastguard Worker        // any bias; we could just as well keep them as .4s, but narrowing
2223*c0909341SAndroid Build Coastguard Worker        // them to .4h gives a significant speedup on out of order cores
2224*c0909341SAndroid Build Coastguard Worker        // (at the cost of a smaller slowdown on in-order cores such as A53).
2225*c0909341SAndroid Build Coastguard Worker
2226*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_\taps\()_filter_2)
2227*c0909341SAndroid Build Coastguard Worker        xtn             v16.4h,  v16.4s
2228*c0909341SAndroid Build Coastguard Worker        trn1            v16.2s,  v16.2s,  v24.2s
2229*c0909341SAndroid Build Coastguard Worker        mov             v17.8b,  v24.8b
2230*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_\taps\()_filter_2)
2231*c0909341SAndroid Build Coastguard Worker        ext             v18.8b,  v17.8b,  v24.8b,  #4
2232*c0909341SAndroid Build Coastguard Worker        mov             v19.8b,  v24.8b
2233*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_\taps\()_filter_2)
2234*c0909341SAndroid Build Coastguard Worker        ext             v20.8b,  v19.8b,  v24.8b,  #4
2235*c0909341SAndroid Build Coastguard Worker        mov             v21.8b,  v24.8b
2236*c0909341SAndroid Build Coastguard Worker
2237*c0909341SAndroid Build Coastguard Worker28:
2238*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_\taps\()_filter_2)
2239*c0909341SAndroid Build Coastguard Worker        ext             v22.8b,  v21.8b,  v24.8b,  #4
2240*c0909341SAndroid Build Coastguard Worker.ifc \taps, 6tap
2241*c0909341SAndroid Build Coastguard Worker        smull           v3.4s,   v17.4h,  v1.h[1]
2242*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v18.4h,  v1.h[2]
2243*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v19.4h,  v1.h[3]
2244*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v20.4h,  v1.h[4]
2245*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v21.4h,  v1.h[5]
2246*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v22.4h,  v1.h[6]
2247*c0909341SAndroid Build Coastguard Worker.else   // 8tap
2248*c0909341SAndroid Build Coastguard Worker        smull           v3.4s,   v16.4h,  v1.h[0]
2249*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v17.4h,  v1.h[1]
2250*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v18.4h,  v1.h[2]
2251*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v19.4h,  v1.h[3]
2252*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v20.4h,  v1.h[4]
2253*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v21.4h,  v1.h[5]
2254*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v22.4h,  v1.h[6]
2255*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v24.4h,  v1.h[7]
2256*c0909341SAndroid Build Coastguard Worker.endif
2257*c0909341SAndroid Build Coastguard Worker
2258*c0909341SAndroid Build Coastguard Worker        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2259*c0909341SAndroid Build Coastguard Worker        sqxtun          v3.4h,   v3.4s
2260*c0909341SAndroid Build Coastguard Worker        umin            v3.4h,   v3.4h,   v31.4h
2261*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
2262*c0909341SAndroid Build Coastguard Worker        st1             {v3.s}[0], [\dst], \d_strd
2263*c0909341SAndroid Build Coastguard Worker        st1             {v3.s}[1], [\ds2], \d_strd
2264*c0909341SAndroid Build Coastguard Worker        b.le            0f
2265*c0909341SAndroid Build Coastguard Worker        mov             v16.8b,  v18.8b
2266*c0909341SAndroid Build Coastguard Worker        mov             v17.8b,  v19.8b
2267*c0909341SAndroid Build Coastguard Worker        mov             v18.8b,  v20.8b
2268*c0909341SAndroid Build Coastguard Worker        mov             v19.8b,  v21.8b
2269*c0909341SAndroid Build Coastguard Worker        mov             v20.8b,  v22.8b
2270*c0909341SAndroid Build Coastguard Worker        mov             v21.8b,  v24.8b
2271*c0909341SAndroid Build Coastguard Worker        b               28b
2272*c0909341SAndroid Build Coastguard Worker
2273*c0909341SAndroid Build Coastguard Worker0:
2274*c0909341SAndroid Build Coastguard Worker        ret             x15
2275*c0909341SAndroid Build Coastguard Worker
2276*c0909341SAndroid Build Coastguard WorkerL(\type\()_\taps\()_filter_2): // Helper: 4-tap horizontal filter of 2 pixels from each of two rows; result in v24.4h
2277*c0909341SAndroid Build Coastguard Worker        ld1             {v25.8h},  [\sr2], \s_strd // row A: 8 pixels from sr2, then sr2 += s_strd
2278*c0909341SAndroid Build Coastguard Worker        ld1             {v27.8h},  [\src], \s_strd // row B: 8 pixels from src, then src += s_strd
2279*c0909341SAndroid Build Coastguard Worker        ext             v26.16b, v25.16b, v25.16b, #2 // row A rotated down by one 16 bit pixel
2280*c0909341SAndroid Build Coastguard Worker        ext             v28.16b, v27.16b, v27.16b, #2 // row B rotated down by one 16 bit pixel
2281*c0909341SAndroid Build Coastguard Worker        trn1            v24.2s,  v25.2s,  v27.2s // v24.4h = A[0], A[1], B[0], B[1]
2282*c0909341SAndroid Build Coastguard Worker        trn2            v27.2s,  v25.2s,  v27.2s // v27.4h = A[2], A[3], B[2], B[3]
2283*c0909341SAndroid Build Coastguard Worker        trn1            v25.2s,  v26.2s,  v28.2s // v25.4h = A[1], A[2], B[1], B[2]
2284*c0909341SAndroid Build Coastguard Worker        trn2            v28.2s,  v26.2s,  v28.2s // v28.4h = A[3], A[4], B[3], B[4]
2285*c0909341SAndroid Build Coastguard Worker        smull           v24.4s,  v24.4h,  v0.h[0] // tap 0; 32 bit accumulation starts here
2286*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v25.4h,  v0.h[1] // tap 1
2287*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v27.4h,  v0.h[2] // tap 2
2288*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v28.4h,  v0.h[3] // tap 3
2289*c0909341SAndroid Build Coastguard Worker        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2290*c0909341SAndroid Build Coastguard Worker        xtn             v24.4h,  v24.4s // narrow to 16 bit: lanes 0-1 = row A outputs, lanes 2-3 = row B outputs
2291*c0909341SAndroid Build Coastguard Worker        ret // returns through x30; v24-v28 are overwritten
2292*c0909341SAndroid Build Coastguard Worker.endif
2293*c0909341SAndroid Build Coastguard Worker
2294*c0909341SAndroid Build Coastguard Worker40: // 4 pixel wide hv (horizontal + vertical) entry point
2295*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
2296*c0909341SAndroid Build Coastguard Worker        ldur            s0,  [\xmx, #2] // 4 bytes at offset 2: the 4 horizontal taps used for this width
2297*c0909341SAndroid Build Coastguard Worker        b.gt            480f // NOTE(review): flags come from a compare outside this view; per the label comments this is taken for heights above 4
2298*c0909341SAndroid Build Coastguard Worker        ldur            s1,  [\xmy, #2] // 4 bytes at offset 2: the 4 vertical taps
2299*c0909341SAndroid Build Coastguard Worker        sub             \sr2, \src, #2 // sr2 = one 16 bit pixel left of src (row 0)
2300*c0909341SAndroid Build Coastguard Worker        sub             \src, \sr2, \s_strd // src = same column, one row above (row -1)
2301*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd // ds2 = second output row
2302*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1 // each source pointer now steps two rows at a time
2303*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1 // each destination pointer now steps two rows at a time
2304*c0909341SAndroid Build Coastguard Worker        sxtl            v0.8h,   v0.8b // widen horizontal taps from 8 to 16 bit
2305*c0909341SAndroid Build Coastguard Worker        sxtl            v1.8h,   v1.8b // widen vertical taps from 8 to 16 bit
2306*c0909341SAndroid Build Coastguard Worker        mov             x15, x30 // keep the return address; the bl calls below overwrite x30
2307*c0909341SAndroid Build Coastguard Worker
2308*c0909341SAndroid Build Coastguard Worker        // 4x2, 4x4 hv
2309*c0909341SAndroid Build Coastguard Worker        ld1             {v25.8h}, [\src], \s_strd // row -1; src moves on to row 1
2310*c0909341SAndroid Build Coastguard Worker        ext             v26.16b, v25.16b, v25.16b, #2 // input shifted by 1 pixel
2311*c0909341SAndroid Build Coastguard Worker        ext             v27.16b, v25.16b, v25.16b, #4 // input shifted by 2 pixels
2312*c0909341SAndroid Build Coastguard Worker        ext             v28.16b, v25.16b, v25.16b, #6 // input shifted by 3 pixels
2313*c0909341SAndroid Build Coastguard Worker        smull           v25.4s,  v25.4h,  v0.h[0] // 4-tap horizontal filter of row -1, 4 outputs
2314*c0909341SAndroid Build Coastguard Worker        smlal           v25.4s,  v26.4h,  v0.h[1]
2315*c0909341SAndroid Build Coastguard Worker        smlal           v25.4s,  v27.4h,  v0.h[2]
2316*c0909341SAndroid Build Coastguard Worker        smlal           v25.4s,  v28.4h,  v0.h[3]
2317*c0909341SAndroid Build Coastguard Worker        srshl           v16.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2318*c0909341SAndroid Build Coastguard Worker        // The intermediates from the horizontal pass fit in 16 bit without
2319*c0909341SAndroid Build Coastguard Worker        // any bias; we could just as well keep them as .4s, but narrowing
2320*c0909341SAndroid Build Coastguard Worker        // them to .4h gives a significant speedup on out of order cores
2321*c0909341SAndroid Build Coastguard Worker        // (at the cost of a smaller slowdown on in-order cores such as A53).
2322*c0909341SAndroid Build Coastguard Worker        xtn             v16.4h,  v16.4s // v16 = filtered row -1
2323*c0909341SAndroid Build Coastguard Worker
2324*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_\taps\()_filter_4) // v24 = filtered row from sr2, v25 = filtered row from src
2325*c0909341SAndroid Build Coastguard Worker        mov             v17.8b,  v24.8b // v17 = filtered row 0
2326*c0909341SAndroid Build Coastguard Worker        mov             v18.8b,  v25.8b // v18 = filtered row 1
2327*c0909341SAndroid Build Coastguard Worker
2328*c0909341SAndroid Build Coastguard Worker4: // loop: two output rows per iteration; v16-v18 hold the three previous filtered rows
2329*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_\taps\()_filter_4) // v24, v25 = next two filtered rows
2330*c0909341SAndroid Build Coastguard Worker        smull           v2.4s,   v16.4h,  v1.h[0] // first output row: vertical taps over v16, v17, v18, v24
2331*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v17.4h,  v1.h[1]
2332*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v18.4h,  v1.h[2]
2333*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v24.4h,  v1.h[3]
2334*c0909341SAndroid Build Coastguard Worker        smull           v3.4s,   v17.4h,  v1.h[0] // second output row: vertical taps over v17, v18, v24, v25
2335*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v18.4h,  v1.h[1]
2336*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v24.4h,  v1.h[2]
2337*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v25.4h,  v1.h[3]
2338*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2339*c0909341SAndroid Build Coastguard Worker        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2340*c0909341SAndroid Build Coastguard Worker        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2341*c0909341SAndroid Build Coastguard Worker        sqxtun          v2.4h,   v2.4s // saturating narrow to unsigned 16 bit; negatives become 0
2342*c0909341SAndroid Build Coastguard Worker        sqxtun2         v2.8h,   v3.4s // second row goes into the upper half of v2
2343*c0909341SAndroid Build Coastguard Worker        umin            v2.8h,   v2.8h,   v31.8h // upper clamp against v31 (NOTE(review): presumably the pixel maximum, set up outside this view)
2344*c0909341SAndroid Build Coastguard Worker.else
2345*c0909341SAndroid Build Coastguard Worker        rshrn           v2.4h,   v2.4s,   #6 // rounding shift right by 6 and narrow
2346*c0909341SAndroid Build Coastguard Worker        rshrn2          v2.8h,   v3.4s,   #6 // second row goes into the upper half of v2
2347*c0909341SAndroid Build Coastguard Worker        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
2348*c0909341SAndroid Build Coastguard Worker.endif
2349*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2 // two rows done; the stores below leave the flags intact for b.le
2350*c0909341SAndroid Build Coastguard Worker
2351*c0909341SAndroid Build Coastguard Worker        st1             {v2.8b},   [\dst], \d_strd // low 8 bytes: first output row (4 values)
2352*c0909341SAndroid Build Coastguard Worker        st1             {v2.d}[1], [\ds2], \d_strd // high 8 bytes: second output row (4 values)
2353*c0909341SAndroid Build Coastguard Worker        b.le            0f // no rows left: leave through the next 0: label (ret x15)
2354*c0909341SAndroid Build Coastguard Worker        mov             v16.8b,  v18.8b // slide the row window down by two rows
2355*c0909341SAndroid Build Coastguard Worker        mov             v17.8b,  v24.8b
2356*c0909341SAndroid Build Coastguard Worker        mov             v18.8b,  v25.8b
2357*c0909341SAndroid Build Coastguard Worker        b               4b
2358*c0909341SAndroid Build Coastguard Worker
2359*c0909341SAndroid Build Coastguard Worker480:    // 4x8, 4x16, 4x32 hv
2360*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8b},  [\xmy] // all 8 vertical taps (v0 already holds the 4 horizontal taps loaded at 40:)
2361*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, #2 // step one 16 bit pixel to the left
2362*c0909341SAndroid Build Coastguard Worker.ifc \taps, 6tap
2363*c0909341SAndroid Build Coastguard Worker        sub             \sr2, \src, \s_strd // 6tap: sr2 = row -1
2364*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, \s_strd, lsl #1 // 6tap: src = row -2
2365*c0909341SAndroid Build Coastguard Worker.else
2366*c0909341SAndroid Build Coastguard Worker        sub             \sr2, \src, \s_strd, lsl #1 // 8tap: sr2 = row -2
2367*c0909341SAndroid Build Coastguard Worker        sub             \src, \sr2, \s_strd // 8tap: src = row -3
2368*c0909341SAndroid Build Coastguard Worker.endif
2369*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd // ds2 = second output row
2370*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1 // each source pointer now steps two rows at a time
2371*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1 // each destination pointer now steps two rows at a time
2372*c0909341SAndroid Build Coastguard Worker        sxtl            v0.8h,   v0.8b // widen horizontal taps from 8 to 16 bit
2373*c0909341SAndroid Build Coastguard Worker        sxtl            v1.8h,   v1.8b // widen vertical taps from 8 to 16 bit
2374*c0909341SAndroid Build Coastguard Worker        mov             x15, x30 // keep the return address; the bl calls below overwrite x30
2375*c0909341SAndroid Build Coastguard Worker
2376*c0909341SAndroid Build Coastguard Worker        ld1             {v25.8h}, [\src], \s_strd // topmost row needed; src moves on by two rows
2377*c0909341SAndroid Build Coastguard Worker        ext             v26.16b, v25.16b, v25.16b, #2 // input shifted by 1 pixel
2378*c0909341SAndroid Build Coastguard Worker        ext             v27.16b, v25.16b, v25.16b, #4 // input shifted by 2 pixels
2379*c0909341SAndroid Build Coastguard Worker        ext             v28.16b, v25.16b, v25.16b, #6 // input shifted by 3 pixels
2380*c0909341SAndroid Build Coastguard Worker        smull           v25.4s,  v25.4h,  v0.h[0] // 4-tap horizontal filter of that row, 4 outputs
2381*c0909341SAndroid Build Coastguard Worker        smlal           v25.4s,  v26.4h,  v0.h[1]
2382*c0909341SAndroid Build Coastguard Worker        smlal           v25.4s,  v27.4h,  v0.h[2]
2383*c0909341SAndroid Build Coastguard Worker        smlal           v25.4s,  v28.4h,  v0.h[3]
2384*c0909341SAndroid Build Coastguard Worker        srshl           v16.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2385*c0909341SAndroid Build Coastguard Worker        // The intermediates from the horizontal pass fit in 16 bit without
2386*c0909341SAndroid Build Coastguard Worker        // any bias; we could just as well keep them as .4s, but narrowing
2387*c0909341SAndroid Build Coastguard Worker        // them to .4h gives a significant speedup on out of order cores
2388*c0909341SAndroid Build Coastguard Worker        // (at the cost of a smaller slowdown on in-order cores such as A53).
2389*c0909341SAndroid Build Coastguard Worker.ifc \taps, 6tap
2390*c0909341SAndroid Build Coastguard Worker        xtn             v18.4h,  v16.4s // 6tap: the row window starts at v18 (v16, v17 stay unused)
2391*c0909341SAndroid Build Coastguard Worker.else
2392*c0909341SAndroid Build Coastguard Worker        xtn             v16.4h,  v16.4s // 8tap: the row window starts at v16
2393*c0909341SAndroid Build Coastguard Worker
2394*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_\taps\()_filter_4) // 8tap only: two more filtered rows
2395*c0909341SAndroid Build Coastguard Worker        mov             v17.8b,  v24.8b
2396*c0909341SAndroid Build Coastguard Worker        mov             v18.8b,  v25.8b
2397*c0909341SAndroid Build Coastguard Worker.endif
2398*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_\taps\()_filter_4) // next two filtered rows into v19, v20
2399*c0909341SAndroid Build Coastguard Worker        mov             v19.8b,  v24.8b
2400*c0909341SAndroid Build Coastguard Worker        mov             v20.8b,  v25.8b
2401*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_\taps\()_filter_4) // next two filtered rows into v21, v22
2402*c0909341SAndroid Build Coastguard Worker        mov             v21.8b,  v24.8b
2403*c0909341SAndroid Build Coastguard Worker        mov             v22.8b,  v25.8b
2404*c0909341SAndroid Build Coastguard Worker
2405*c0909341SAndroid Build Coastguard Worker48: // loop: two output rows per iteration; v16-v22 (8tap) or v18-v22 (6tap) hold the previous filtered rows
2406*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_\taps\()_filter_4) // v24, v25 = two newest filtered rows
2407*c0909341SAndroid Build Coastguard Worker.ifc \taps, 6tap
2408*c0909341SAndroid Build Coastguard Worker        smull           v3.4s,   v18.4h,  v1.h[1] // first output row, using only taps 1..6 (NOTE(review): taps 0 and 7 presumably zero for 6tap filters)
2409*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v19.4h,  v1.h[2]
2410*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v20.4h,  v1.h[3]
2411*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v21.4h,  v1.h[4]
2412*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v22.4h,  v1.h[5]
2413*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v24.4h,  v1.h[6]
2414*c0909341SAndroid Build Coastguard Worker        smull           v4.4s,   v19.4h,  v1.h[1] // second output row, window shifted down by one row
2415*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v20.4h,  v1.h[2]
2416*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v21.4h,  v1.h[3]
2417*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v22.4h,  v1.h[4]
2418*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v24.4h,  v1.h[5]
2419*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v25.4h,  v1.h[6]
2420*c0909341SAndroid Build Coastguard Worker.else   // 8tap
2421*c0909341SAndroid Build Coastguard Worker        smull           v3.4s,   v16.4h,  v1.h[0] // first output row: taps 0..7 over v16-v22, v24
2422*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v17.4h,  v1.h[1]
2423*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v18.4h,  v1.h[2]
2424*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v19.4h,  v1.h[3]
2425*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v20.4h,  v1.h[4]
2426*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v21.4h,  v1.h[5]
2427*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v22.4h,  v1.h[6]
2428*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v24.4h,  v1.h[7]
2429*c0909341SAndroid Build Coastguard Worker        smull           v4.4s,   v17.4h,  v1.h[0] // second output row: taps 0..7 over v17-v22, v24, v25
2430*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v18.4h,  v1.h[1]
2431*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v19.4h,  v1.h[2]
2432*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v20.4h,  v1.h[3]
2433*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v21.4h,  v1.h[4]
2434*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v22.4h,  v1.h[5]
2435*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v24.4h,  v1.h[6]
2436*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v25.4h,  v1.h[7]
2437*c0909341SAndroid Build Coastguard Worker.endif
2438*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2439*c0909341SAndroid Build Coastguard Worker        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2440*c0909341SAndroid Build Coastguard Worker        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
2441*c0909341SAndroid Build Coastguard Worker        sqxtun          v3.4h,   v3.4s // saturating narrow to unsigned 16 bit; negatives become 0
2442*c0909341SAndroid Build Coastguard Worker        sqxtun2         v3.8h,   v4.4s // second row goes into the upper half of v3
2443*c0909341SAndroid Build Coastguard Worker        umin            v3.8h,   v3.8h,   v31.8h // upper clamp against v31 (NOTE(review): presumably the pixel maximum, set up outside this view)
2444*c0909341SAndroid Build Coastguard Worker.else
2445*c0909341SAndroid Build Coastguard Worker        rshrn           v3.4h,   v3.4s,   #6 // rounding shift right by 6 and narrow
2446*c0909341SAndroid Build Coastguard Worker        rshrn2          v3.8h,   v4.4s,   #6 // second row goes into the upper half of v3
2447*c0909341SAndroid Build Coastguard Worker        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
2448*c0909341SAndroid Build Coastguard Worker.endif
2449*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2 // two rows done; the stores below leave the flags intact for b.le
2450*c0909341SAndroid Build Coastguard Worker        st1             {v3.8b},   [\dst], \d_strd // low 8 bytes: first output row (4 values)
2451*c0909341SAndroid Build Coastguard Worker        st1             {v3.d}[1], [\ds2], \d_strd // high 8 bytes: second output row (4 values)
2452*c0909341SAndroid Build Coastguard Worker        b.le            0f // no rows left
2453*c0909341SAndroid Build Coastguard Worker.ifc \taps, 8tap
2454*c0909341SAndroid Build Coastguard Worker        mov             v16.8b,  v18.8b // 8tap only: v16, v17 are part of the window
2455*c0909341SAndroid Build Coastguard Worker        mov             v17.8b,  v19.8b
2456*c0909341SAndroid Build Coastguard Worker.endif
2457*c0909341SAndroid Build Coastguard Worker        mov             v18.8b,  v20.8b // slide the row window down by two rows
2458*c0909341SAndroid Build Coastguard Worker        mov             v19.8b,  v21.8b
2459*c0909341SAndroid Build Coastguard Worker        mov             v20.8b,  v22.8b
2460*c0909341SAndroid Build Coastguard Worker        mov             v21.8b,  v24.8b
2461*c0909341SAndroid Build Coastguard Worker        mov             v22.8b,  v25.8b
2462*c0909341SAndroid Build Coastguard Worker        b               48b
2463*c0909341SAndroid Build Coastguard Worker0: // common exit, also the target of the b.le 0f in the 4x2/4x4 loop above
2464*c0909341SAndroid Build Coastguard Worker        ret             x15 // return through the address saved from x30
2465*c0909341SAndroid Build Coastguard Worker
2466*c0909341SAndroid Build Coastguard WorkerL(\type\()_\taps\()_filter_4): // Helper: 4-tap horizontal filter of 4 pixels from each of two rows; results in v24.4h and v25.4h
2467*c0909341SAndroid Build Coastguard Worker        ld1             {v24.8h}, [\sr2], \s_strd // row A: 8 pixels from sr2, then sr2 += s_strd
2468*c0909341SAndroid Build Coastguard Worker        ld1             {v25.8h}, [\src], \s_strd // row B: 8 pixels from src, then src += s_strd
2469*c0909341SAndroid Build Coastguard Worker        ext             v26.16b, v24.16b, v24.16b, #2 // row A shifted by 1 pixel
2470*c0909341SAndroid Build Coastguard Worker        ext             v27.16b, v24.16b, v24.16b, #4 // row A shifted by 2 pixels
2471*c0909341SAndroid Build Coastguard Worker        ext             v28.16b, v24.16b, v24.16b, #6 // row A shifted by 3 pixels
2472*c0909341SAndroid Build Coastguard Worker        smull           v24.4s,  v24.4h,  v0.h[0] // row A: sum of 4 taps from v0.h[0..3]
2473*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v26.4h,  v0.h[1]
2474*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v27.4h,  v0.h[2]
2475*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v28.4h,  v0.h[3]
2476*c0909341SAndroid Build Coastguard Worker        ext             v26.16b, v25.16b, v25.16b, #2 // row B shifted by 1 pixel (v26-v28 reused as scratch)
2477*c0909341SAndroid Build Coastguard Worker        ext             v27.16b, v25.16b, v25.16b, #4 // row B shifted by 2 pixels
2478*c0909341SAndroid Build Coastguard Worker        ext             v28.16b, v25.16b, v25.16b, #6 // row B shifted by 3 pixels
2479*c0909341SAndroid Build Coastguard Worker        smull           v25.4s,  v25.4h,  v0.h[0] // row B: sum of 4 taps from v0.h[0..3]
2480*c0909341SAndroid Build Coastguard Worker        smlal           v25.4s,  v26.4h,  v0.h[1]
2481*c0909341SAndroid Build Coastguard Worker        smlal           v25.4s,  v27.4h,  v0.h[2]
2482*c0909341SAndroid Build Coastguard Worker        smlal           v25.4s,  v28.4h,  v0.h[3]
2483*c0909341SAndroid Build Coastguard Worker        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2484*c0909341SAndroid Build Coastguard Worker        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2485*c0909341SAndroid Build Coastguard Worker        xtn             v24.4h,  v24.4s // v24.4h = filtered row A
2486*c0909341SAndroid Build Coastguard Worker        xtn             v25.4h,  v25.4s // v25.4h = filtered row B
2487*c0909341SAndroid Build Coastguard Worker        ret // returns through x30; v24-v28 are overwritten
2488*c0909341SAndroid Build Coastguard Worker
2489*c0909341SAndroid Build Coastguard Worker80: // hv entry points for the wider blocks; processed below in strips of 8 columns
2490*c0909341SAndroid Build Coastguard Worker160:
2491*c0909341SAndroid Build Coastguard Worker320:
2492*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
2493*c0909341SAndroid Build Coastguard Worker        b.gt            880f // NOTE(review): flags come from a compare outside this view; per the label comments this is taken for heights of 8 and above
2494*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b},  [\xmx] // all 8 horizontal taps
2495*c0909341SAndroid Build Coastguard Worker        ldur            s1,  [\xmy, #2] // 4 bytes at offset 2: the 4 vertical taps used for these heights
2496*c0909341SAndroid Build Coastguard Worker.ifc \taps, 6tap
2497*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  #4 // 6tap: start two 16 bit pixels to the left
2498*c0909341SAndroid Build Coastguard Worker.else
2499*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  #6 // 8tap: start three 16 bit pixels to the left
2500*c0909341SAndroid Build Coastguard Worker.endif
2501*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  \s_strd // and one row above
2502*c0909341SAndroid Build Coastguard Worker        sxtl            v0.8h,   v0.8b // widen horizontal taps from 8 to 16 bit
2503*c0909341SAndroid Build Coastguard Worker        sxtl            v1.8h,   v1.8b // widen vertical taps from 8 to 16 bit
2504*c0909341SAndroid Build Coastguard Worker        mov             x15, x30 // keep the return address; the bl calls below overwrite x30
2505*c0909341SAndroid Build Coastguard Worker        mov             \my, \h // remember the height so each strip can restart from the top
2506*c0909341SAndroid Build Coastguard Worker
2507*c0909341SAndroid Build Coastguard Worker164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
2508*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd // ds2 = second output row
2509*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd // sr2 = second source row
2510*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1 // each destination pointer now steps two rows at a time
2511*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1 // each source pointer now steps two rows at a time
2512*c0909341SAndroid Build Coastguard Worker
2513*c0909341SAndroid Build Coastguard Worker        ld1             {v27.8h, v28.8h},  [\src], \s_strd // 16 pixels of the first source row; src moves on by two rows
2514*c0909341SAndroid Build Coastguard Worker.ifc \taps, 6tap
2515*c0909341SAndroid Build Coastguard Worker        smull           v24.4s,  v27.4h,  v0.h[1] // 6tap: taps 1..6 only; byte offsets are 2 lower than for 8tap because src starts one pixel later
2516*c0909341SAndroid Build Coastguard Worker        smull2          v25.4s,  v27.8h,  v0.h[1]
2517*c0909341SAndroid Build Coastguard Worker    .irpc i, 23456
2518*c0909341SAndroid Build Coastguard Worker        ext             v26.16b, v27.16b, v28.16b, #(2*\i-2)
2519*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v26.4h,  v0.h[\i]
2520*c0909341SAndroid Build Coastguard Worker        smlal2          v25.4s,  v26.8h,  v0.h[\i]
2521*c0909341SAndroid Build Coastguard Worker    .endr
2522*c0909341SAndroid Build Coastguard Worker.else
2523*c0909341SAndroid Build Coastguard Worker        smull           v24.4s,  v27.4h,  v0.h[0] // 8tap: taps 0..7; v24 = outputs 0-3, v25 = outputs 4-7
2524*c0909341SAndroid Build Coastguard Worker        smull2          v25.4s,  v27.8h,  v0.h[0]
2525*c0909341SAndroid Build Coastguard Worker    .irpc i, 1234567
2526*c0909341SAndroid Build Coastguard Worker        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
2527*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v26.4h,  v0.h[\i]
2528*c0909341SAndroid Build Coastguard Worker        smlal2          v25.4s,  v26.8h,  v0.h[\i]
2529*c0909341SAndroid Build Coastguard Worker    .endr
2530*c0909341SAndroid Build Coastguard Worker.endif
2531*c0909341SAndroid Build Coastguard Worker        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2532*c0909341SAndroid Build Coastguard Worker        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2533*c0909341SAndroid Build Coastguard Worker        // The intermediates from the horizontal pass fit in 16 bit without
2534*c0909341SAndroid Build Coastguard Worker        // any bias; we could just as well keep them as .4s, but narrowing
2535*c0909341SAndroid Build Coastguard Worker        // them to .4h gives a significant speedup on out of order cores
2536*c0909341SAndroid Build Coastguard Worker        // (at the cost of a smaller slowdown on in-order cores such as A53),
2537*c0909341SAndroid Build Coastguard Worker        // and conserves register space (no need to clobber v8-v15).
2538*c0909341SAndroid Build Coastguard Worker        uzp1            v16.8h,  v24.8h,  v25.8h // Same as xtn, xtn2
2539*c0909341SAndroid Build Coastguard Worker
2540*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_\taps\()_filter_8) // helper defined outside this view; its results are taken from v23 and v24 below
2541*c0909341SAndroid Build Coastguard Worker        mov             v17.16b, v23.16b // second filtered row
2542*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v24.16b // third filtered row
2543*c0909341SAndroid Build Coastguard Worker
2544*c0909341SAndroid Build Coastguard Worker8: // loop: two output rows of 8 pixels per iteration; v16-v18 hold the three previous filtered rows
2545*c0909341SAndroid Build Coastguard Worker        smull           v2.4s,   v16.4h,  v1.h[0] // start the first output row before the call (NOTE(review): relies on filter_8 leaving v2, v3 untouched)
2546*c0909341SAndroid Build Coastguard Worker        smull2          v3.4s,   v16.8h,  v1.h[0]
2547*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_\taps\()_filter_8) // v23, v24 = next two filtered rows
2548*c0909341SAndroid Build Coastguard Worker        smull           v4.4s,   v17.4h,  v1.h[0] // v2/v3 accumulate the first output row, v4/v5 the second
2549*c0909341SAndroid Build Coastguard Worker        smull2          v5.4s,   v17.8h,  v1.h[0]
2550*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v17.4h,  v1.h[1]
2551*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v17.8h,  v1.h[1]
2552*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v18.4h,  v1.h[1]
2553*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v18.8h,  v1.h[1]
2554*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v18.4h,  v1.h[2]
2555*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v18.8h,  v1.h[2]
2556*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v23.4h,  v1.h[2]
2557*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v23.8h,  v1.h[2]
2558*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v23.4h,  v1.h[3]
2559*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v23.8h,  v1.h[3]
2560*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v24.4h,  v1.h[3]
2561*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v24.8h,  v1.h[3]
2562*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2563*c0909341SAndroid Build Coastguard Worker        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2564*c0909341SAndroid Build Coastguard Worker        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2565*c0909341SAndroid Build Coastguard Worker        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
2566*c0909341SAndroid Build Coastguard Worker        srshl           v5.4s,   v5.4s,   v29.4s // -(6+intermediate_bits)
2567*c0909341SAndroid Build Coastguard Worker        sqxtun          v2.4h,   v2.4s // saturating narrow to unsigned 16 bit; v2 = first output row
2568*c0909341SAndroid Build Coastguard Worker        sqxtun2         v2.8h,   v3.4s
2569*c0909341SAndroid Build Coastguard Worker        sqxtun          v3.4h,   v4.4s // v3 = second output row
2570*c0909341SAndroid Build Coastguard Worker        sqxtun2         v3.8h,   v5.4s
2571*c0909341SAndroid Build Coastguard Worker        umin            v2.8h,   v2.8h,   v31.8h // upper clamp against v31 (NOTE(review): presumably the pixel maximum, set up outside this view)
2572*c0909341SAndroid Build Coastguard Worker        umin            v3.8h,   v3.8h,   v31.8h
2573*c0909341SAndroid Build Coastguard Worker.else
2574*c0909341SAndroid Build Coastguard Worker        rshrn           v2.4h,   v2.4s,   #6 // rounding shift right by 6 and narrow; v2 = first output row
2575*c0909341SAndroid Build Coastguard Worker        rshrn2          v2.8h,   v3.4s,   #6
2576*c0909341SAndroid Build Coastguard Worker        rshrn           v3.4h,   v4.4s,   #6 // v3 = second output row
2577*c0909341SAndroid Build Coastguard Worker        rshrn2          v3.8h,   v5.4s,   #6
2578*c0909341SAndroid Build Coastguard Worker        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
2579*c0909341SAndroid Build Coastguard Worker        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
2580*c0909341SAndroid Build Coastguard Worker.endif
2581*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2 // two rows done; the stores below leave the flags intact for b.le
2582*c0909341SAndroid Build Coastguard Worker        st1             {v2.8h}, [\dst], \d_strd // first output row, 8 values
2583*c0909341SAndroid Build Coastguard Worker        st1             {v3.8h}, [\ds2], \d_strd // second output row, 8 values
2584*c0909341SAndroid Build Coastguard Worker        b.le            9f // strip finished
2585*c0909341SAndroid Build Coastguard Worker        mov             v16.16b, v18.16b // slide the row window down by two rows
2586*c0909341SAndroid Build Coastguard Worker        mov             v17.16b, v23.16b
2587*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v24.16b
2588*c0909341SAndroid Build Coastguard Worker        b               8b
2589*c0909341SAndroid Build Coastguard Worker9: // end of one 8 column strip
2590*c0909341SAndroid Build Coastguard Worker        subs            \w,  \w,  #8 // 8 columns done
2591*c0909341SAndroid Build Coastguard Worker        b.le            0f // no columns left: leave through the next 0: label (beyond this view)
2592*c0909341SAndroid Build Coastguard Worker        asr             \s_strd,  \s_strd,  #1 // undo the stride doubling done at 164:
2593*c0909341SAndroid Build Coastguard Worker        asr             \d_strd,  \d_strd,  #1
2594*c0909341SAndroid Build Coastguard Worker        msub            \src,  \s_strd,  \xmy,  \src // src -= s_strd * xmy (NOTE(review): xmy presumably the 64 bit view of my, i.e. the saved height; confirm against the macro arguments)
2595*c0909341SAndroid Build Coastguard Worker        msub            \dst,  \d_strd,  \xmy,  \dst // dst -= d_strd * xmy, back to the top output row
2596*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  \s_strd,  lsl #2 // rewind 4 more source rows (NOTE(review): matches the priming loads if filter_8 post-increments src by s_strd; helper not visible here)
2597*c0909341SAndroid Build Coastguard Worker        mov             \h,  \my // restore the row counter
2598*c0909341SAndroid Build Coastguard Worker        add             \src,  \src,  #16 // 16 bytes = 8 pixels to the right
2599*c0909341SAndroid Build Coastguard Worker        add             \dst,  \dst,  #16 // 16 bytes to the right in the output
2600*c0909341SAndroid Build Coastguard Worker        b               164b // process the next strip
2601*c0909341SAndroid Build Coastguard Worker
2602*c0909341SAndroid Build Coastguard Worker880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
2603*c0909341SAndroid Build Coastguard Worker640:
2604*c0909341SAndroid Build Coastguard Worker1280:
2605*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
2606*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b},  [\xmx] // all 8 horizontal taps
2607*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8b},  [\xmy] // all 8 vertical taps
2608*c0909341SAndroid Build Coastguard Worker.ifc \taps, 6tap
2609*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  #4 // 6tap: start two 16 bit pixels to the left
2610*c0909341SAndroid Build Coastguard Worker.else
2611*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  #6 // 8tap: start three 16 bit pixels to the left
2612*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  \s_strd // 8tap: one extra row up (three rows in total with the line below)
2613*c0909341SAndroid Build Coastguard Worker.endif
2614*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  \s_strd, lsl #1 // both variants: two rows up
2615*c0909341SAndroid Build Coastguard Worker        sxtl            v0.8h,   v0.8b // widen horizontal taps from 8 to 16 bit
2616*c0909341SAndroid Build Coastguard Worker        sxtl            v1.8h,   v1.8b // widen vertical taps from 8 to 16 bit
2617*c0909341SAndroid Build Coastguard Worker        mov             x15, x30 // keep the return address; the bl calls below overwrite x30
2618*c0909341SAndroid Build Coastguard Worker        mov             \my, \h // remember the height in my
2619*c0909341SAndroid Build Coastguard Worker
2620*c0909341SAndroid Build Coastguard Worker168: // set-up for one strip of 8 columns
2621*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd // ds2 = second output row
2622*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd // sr2 = second source row
2623*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1 // each destination pointer now steps two rows at a time
2624*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1 // each source pointer now steps two rows at a time
2625*c0909341SAndroid Build Coastguard Worker
2626*c0909341SAndroid Build Coastguard Worker        ld1             {v27.8h, v28.8h},  [\src], \s_strd // 16 pixels of the first source row; src moves on by two rows
2627*c0909341SAndroid Build Coastguard Worker.ifc \taps, 6tap
2628*c0909341SAndroid Build Coastguard Worker        smull           v24.4s,  v27.4h,  v0.h[1] // 6tap: taps 1..6 only; byte offsets are 2 lower than for 8tap because src starts one pixel later
2629*c0909341SAndroid Build Coastguard Worker        smull2          v25.4s,  v27.8h,  v0.h[1]
2630*c0909341SAndroid Build Coastguard Worker    .irpc i, 23456
2631*c0909341SAndroid Build Coastguard Worker        ext             v26.16b, v27.16b, v28.16b, #(2*\i-2)
2632*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v26.4h,  v0.h[\i]
2633*c0909341SAndroid Build Coastguard Worker        smlal2          v25.4s,  v26.8h,  v0.h[\i]
2634*c0909341SAndroid Build Coastguard Worker    .endr
2635*c0909341SAndroid Build Coastguard Worker.else   // 8tap
2636*c0909341SAndroid Build Coastguard Worker        smull           v24.4s,  v27.4h,  v0.h[0] // taps 0..7; v24 = outputs 0-3, v25 = outputs 4-7
2637*c0909341SAndroid Build Coastguard Worker        smull2          v25.4s,  v27.8h,  v0.h[0]
2638*c0909341SAndroid Build Coastguard Worker    .irpc i, 1234567
2639*c0909341SAndroid Build Coastguard Worker        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
2640*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v26.4h,  v0.h[\i]
2641*c0909341SAndroid Build Coastguard Worker        smlal2          v25.4s,  v26.8h,  v0.h[\i]
2642*c0909341SAndroid Build Coastguard Worker    .endr
2643*c0909341SAndroid Build Coastguard Worker.endif
2644*c0909341SAndroid Build Coastguard Worker        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2645*c0909341SAndroid Build Coastguard Worker        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2646*c0909341SAndroid Build Coastguard Worker        // The intermediates from the horizontal pass fit in 16 bit without
2647*c0909341SAndroid Build Coastguard Worker        // any bias; we could just as well keep them as .4s, but narrowing
2648*c0909341SAndroid Build Coastguard Worker        // them to .4h gives a significant speedup on out of order cores
2649*c0909341SAndroid Build Coastguard Worker        // (at the cost of a smaller slowdown on in-order cores such as A53),
2650*c0909341SAndroid Build Coastguard Worker        // and conserves register space (no need to clobber v8-v15).
2651*c0909341SAndroid Build Coastguard Worker.ifc \taps, 6tap
2652*c0909341SAndroid Build Coastguard Worker        uzp1            v18.8h,  v24.8h,  v25.8h // Same as xtn, xtn2
2653*c0909341SAndroid Build Coastguard Worker.else
2654*c0909341SAndroid Build Coastguard Worker        uzp1            v16.8h,  v24.8h,  v25.8h // Same as xtn, xtn2
2655*c0909341SAndroid Build Coastguard Worker
2656*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_\taps\()_filter_8) // 8tap only: two more filtered rows (helper defined outside this view; results read from v23, v24)
2657*c0909341SAndroid Build Coastguard Worker        mov             v17.16b, v23.16b
2658*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v24.16b
2659*c0909341SAndroid Build Coastguard Worker.endif
2660*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_\taps\()_filter_8) // next two filtered rows into v19, v20
2661*c0909341SAndroid Build Coastguard Worker        mov             v19.16b, v23.16b
2662*c0909341SAndroid Build Coastguard Worker        mov             v20.16b, v24.16b
2663*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_\taps\()_filter_8) // next two filtered rows into v21, v22
2664*c0909341SAndroid Build Coastguard Worker        mov             v21.16b, v23.16b
2665*c0909341SAndroid Build Coastguard Worker        mov             v22.16b, v24.16b
2666*c0909341SAndroid Build Coastguard Worker
2667*c0909341SAndroid Build Coastguard Worker88:
2668*c0909341SAndroid Build Coastguard Worker.ifc \taps, 6tap
2669*c0909341SAndroid Build Coastguard Worker        smull           v2.4s,   v18.4h,  v1.h[1]
2670*c0909341SAndroid Build Coastguard Worker        smull2          v3.4s,   v18.8h,  v1.h[1]
2671*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_\taps\()_filter_8)
2672*c0909341SAndroid Build Coastguard Worker        smull           v4.4s,   v19.4h,  v1.h[1]
2673*c0909341SAndroid Build Coastguard Worker        smull2          v5.4s,   v19.8h,  v1.h[1]
2674*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v19.4h,  v1.h[2]
2675*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v19.8h,  v1.h[2]
2676*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v20.4h,  v1.h[2]
2677*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v20.8h,  v1.h[2]
2678*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v20.4h,  v1.h[3]
2679*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v20.8h,  v1.h[3]
2680*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v21.4h,  v1.h[3]
2681*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v21.8h,  v1.h[3]
2682*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v21.4h,  v1.h[4]
2683*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v21.8h,  v1.h[4]
2684*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v22.4h,  v1.h[4]
2685*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v22.8h,  v1.h[4]
2686*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v22.4h,  v1.h[5]
2687*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v22.8h,  v1.h[5]
2688*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v23.4h,  v1.h[5]
2689*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v23.8h,  v1.h[5]
2690*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v23.4h,  v1.h[6]
2691*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v23.8h,  v1.h[6]
2692*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v24.4h,  v1.h[6]
2693*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v24.8h,  v1.h[6]
2694*c0909341SAndroid Build Coastguard Worker.else   // 8tap
2695*c0909341SAndroid Build Coastguard Worker        smull           v2.4s,   v16.4h,  v1.h[0]
2696*c0909341SAndroid Build Coastguard Worker        smull2          v3.4s,   v16.8h,  v1.h[0]
2697*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_\taps\()_filter_8)
2698*c0909341SAndroid Build Coastguard Worker        smull           v4.4s,   v17.4h,  v1.h[0]
2699*c0909341SAndroid Build Coastguard Worker        smull2          v5.4s,   v17.8h,  v1.h[0]
2700*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v17.4h,  v1.h[1]
2701*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v17.8h,  v1.h[1]
2702*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v18.4h,  v1.h[1]
2703*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v18.8h,  v1.h[1]
2704*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v18.4h,  v1.h[2]
2705*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v18.8h,  v1.h[2]
2706*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v19.4h,  v1.h[2]
2707*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v19.8h,  v1.h[2]
2708*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v19.4h,  v1.h[3]
2709*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v19.8h,  v1.h[3]
2710*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v20.4h,  v1.h[3]
2711*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v20.8h,  v1.h[3]
2712*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v20.4h,  v1.h[4]
2713*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v20.8h,  v1.h[4]
2714*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v21.4h,  v1.h[4]
2715*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v21.8h,  v1.h[4]
2716*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v21.4h,  v1.h[5]
2717*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v21.8h,  v1.h[5]
2718*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v22.4h,  v1.h[5]
2719*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v22.8h,  v1.h[5]
2720*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v22.4h,  v1.h[6]
2721*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v22.8h,  v1.h[6]
2722*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v23.4h,  v1.h[6]
2723*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v23.8h,  v1.h[6]
2724*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v23.4h,  v1.h[7]
2725*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v23.8h,  v1.h[7]
2726*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v24.4h,  v1.h[7]
2727*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v24.8h,  v1.h[7]
2728*c0909341SAndroid Build Coastguard Worker.endif
2729*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2730*c0909341SAndroid Build Coastguard Worker        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2731*c0909341SAndroid Build Coastguard Worker        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2732*c0909341SAndroid Build Coastguard Worker        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
2733*c0909341SAndroid Build Coastguard Worker        srshl           v5.4s,   v5.4s,   v29.4s // -(6+intermediate_bits)
2734*c0909341SAndroid Build Coastguard Worker        sqxtun          v2.4h,   v2.4s
2735*c0909341SAndroid Build Coastguard Worker        sqxtun2         v2.8h,   v3.4s
2736*c0909341SAndroid Build Coastguard Worker        sqxtun          v3.4h,   v4.4s
2737*c0909341SAndroid Build Coastguard Worker        sqxtun2         v3.8h,   v5.4s
2738*c0909341SAndroid Build Coastguard Worker        umin            v2.8h,   v2.8h,   v31.8h
2739*c0909341SAndroid Build Coastguard Worker        umin            v3.8h,   v3.8h,   v31.8h
2740*c0909341SAndroid Build Coastguard Worker.else
2741*c0909341SAndroid Build Coastguard Worker        rshrn           v2.4h,   v2.4s,   #6
2742*c0909341SAndroid Build Coastguard Worker        rshrn2          v2.8h,   v3.4s,   #6
2743*c0909341SAndroid Build Coastguard Worker        rshrn           v3.4h,   v4.4s,   #6
2744*c0909341SAndroid Build Coastguard Worker        rshrn2          v3.8h,   v5.4s,   #6
2745*c0909341SAndroid Build Coastguard Worker        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
2746*c0909341SAndroid Build Coastguard Worker        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
2747*c0909341SAndroid Build Coastguard Worker.endif
2748*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
2749*c0909341SAndroid Build Coastguard Worker        st1             {v2.8h}, [\dst], \d_strd
2750*c0909341SAndroid Build Coastguard Worker        st1             {v3.8h}, [\ds2], \d_strd
2751*c0909341SAndroid Build Coastguard Worker        b.le            9f
2752*c0909341SAndroid Build Coastguard Worker.ifc \taps, 8tap
2753*c0909341SAndroid Build Coastguard Worker        mov             v16.16b, v18.16b
2754*c0909341SAndroid Build Coastguard Worker        mov             v17.16b, v19.16b
2755*c0909341SAndroid Build Coastguard Worker.endif
2756*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v20.16b
2757*c0909341SAndroid Build Coastguard Worker        mov             v19.16b, v21.16b
2758*c0909341SAndroid Build Coastguard Worker        mov             v20.16b, v22.16b
2759*c0909341SAndroid Build Coastguard Worker        mov             v21.16b, v23.16b
2760*c0909341SAndroid Build Coastguard Worker        mov             v22.16b, v24.16b
2761*c0909341SAndroid Build Coastguard Worker        b               88b
2762*c0909341SAndroid Build Coastguard Worker9:
2763*c0909341SAndroid Build Coastguard Worker        subs            \w,  \w,  #8
2764*c0909341SAndroid Build Coastguard Worker        b.le            0f
2765*c0909341SAndroid Build Coastguard Worker        asr             \s_strd,  \s_strd,  #1
2766*c0909341SAndroid Build Coastguard Worker        asr             \d_strd,  \d_strd,  #1
2767*c0909341SAndroid Build Coastguard Worker        msub            \src,  \s_strd,  \xmy,  \src
2768*c0909341SAndroid Build Coastguard Worker        msub            \dst,  \d_strd,  \xmy,  \dst
2769*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  \s_strd,  lsl #3
2770*c0909341SAndroid Build Coastguard Worker        mov             \h,  \my
2771*c0909341SAndroid Build Coastguard Worker        add             \src,  \src,  #16
2772*c0909341SAndroid Build Coastguard Worker        add             \dst,  \dst,  #16
2773*c0909341SAndroid Build Coastguard Worker.ifc \taps, 6tap
2774*c0909341SAndroid Build Coastguard Worker        add             \src,  \src,  \s_strd,  lsl #1
2775*c0909341SAndroid Build Coastguard Worker.endif
2776*c0909341SAndroid Build Coastguard Worker        b               168b
2777*c0909341SAndroid Build Coastguard Worker0:
2778*c0909341SAndroid Build Coastguard Worker        ret             x15
2779*c0909341SAndroid Build Coastguard Worker
// Local helper, called with bl and left with a plain ret.
// Horizontally filters two input rows, producing 8 samples for each.
//   In:       sr2, src  row pointers, each post-incremented by s_strd
//             v0        horizontal filter coefficients (.h lanes)
//             v30       per-lane shift counts for srshl
//   Out:      v23.8h    filtered row that was read from sr2
//             v24.8h    filtered row that was read from src
//   Clobbers: v4-v7, v25-v28 (and x30 through the bl at the call site)
2780*c0909341SAndroid Build Coastguard WorkerL(\type\()_\taps\()_filter_8):
        // 16 input samples per row: v4:v5 come from sr2, v6:v7 from src.
2781*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h},  [\sr2], \s_strd
2782*c0909341SAndroid Build Coastguard Worker        ld1             {v6.8h, v7.8h},  [\src], \s_strd
2783*c0909341SAndroid Build Coastguard Worker.ifc \taps, 6tap
        // 6-tap: coefficients h[1]..h[6] are applied to input samples x+0..x+5
        // (h[0] and h[7] are not used). Sums accumulate in 32-bit lanes:
        // v25/v26 for the sr2 row, v27/v28 for the src row.
2784*c0909341SAndroid Build Coastguard Worker        smull           v25.4s,  v4.4h,   v0.h[1]
2785*c0909341SAndroid Build Coastguard Worker        smull2          v26.4s,  v4.8h,   v0.h[1]
2786*c0909341SAndroid Build Coastguard Worker        smull           v27.4s,  v6.4h,   v0.h[1]
2787*c0909341SAndroid Build Coastguard Worker        smull2          v28.4s,  v6.8h,   v0.h[1]
2788*c0909341SAndroid Build Coastguard Worker.irpc i, 23456
2789*c0909341SAndroid Build Coastguard Worker        ext             v23.16b, v4.16b,  v5.16b,  #(2*\i-2)
2790*c0909341SAndroid Build Coastguard Worker        ext             v24.16b, v6.16b,  v7.16b,  #(2*\i-2)
2791*c0909341SAndroid Build Coastguard Worker        smlal           v25.4s,  v23.4h,  v0.h[\i]
2792*c0909341SAndroid Build Coastguard Worker        smlal2          v26.4s,  v23.8h,  v0.h[\i]
2793*c0909341SAndroid Build Coastguard Worker        smlal           v27.4s,  v24.4h,  v0.h[\i]
2794*c0909341SAndroid Build Coastguard Worker        smlal2          v28.4s,  v24.8h,  v0.h[\i]
2795*c0909341SAndroid Build Coastguard Worker.endr
2796*c0909341SAndroid Build Coastguard Worker.else   // 8tap
        // 8-tap: coefficients h[0]..h[7] are applied to input samples
        // x+0..x+7; v23/v24 serve as scratch for the shifted windows.
2797*c0909341SAndroid Build Coastguard Worker        smull           v25.4s,  v4.4h,   v0.h[0]
2798*c0909341SAndroid Build Coastguard Worker        smull2          v26.4s,  v4.8h,   v0.h[0]
2799*c0909341SAndroid Build Coastguard Worker        smull           v27.4s,  v6.4h,   v0.h[0]
2800*c0909341SAndroid Build Coastguard Worker        smull2          v28.4s,  v6.8h,   v0.h[0]
2801*c0909341SAndroid Build Coastguard Worker.irpc i, 1234567
2802*c0909341SAndroid Build Coastguard Worker        ext             v23.16b, v4.16b,  v5.16b,  #(2*\i)
2803*c0909341SAndroid Build Coastguard Worker        ext             v24.16b, v6.16b,  v7.16b,  #(2*\i)
2804*c0909341SAndroid Build Coastguard Worker        smlal           v25.4s,  v23.4h,  v0.h[\i]
2805*c0909341SAndroid Build Coastguard Worker        smlal2          v26.4s,  v23.8h,  v0.h[\i]
2806*c0909341SAndroid Build Coastguard Worker        smlal           v27.4s,  v24.4h,  v0.h[\i]
2807*c0909341SAndroid Build Coastguard Worker        smlal2          v28.4s,  v24.8h,  v0.h[\i]
2808*c0909341SAndroid Build Coastguard Worker.endr
2809*c0909341SAndroid Build Coastguard Worker.endif
        // Rounding shift of the 32-bit sums by the counts in v30, then keep the
        // low 16 bits of every sum (uzp1 picks the even .8h lanes).
        // NOTE(review): v30 is set up outside this chunk; the comments below
        // say it holds -(6-intermediate_bits), i.e. a right shift - confirm.
2810*c0909341SAndroid Build Coastguard Worker        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2811*c0909341SAndroid Build Coastguard Worker        srshl           v26.4s,  v26.4s,  v30.4s // -(6-intermediate_bits)
2812*c0909341SAndroid Build Coastguard Worker        srshl           v27.4s,  v27.4s,  v30.4s // -(6-intermediate_bits)
2813*c0909341SAndroid Build Coastguard Worker        srshl           v28.4s,  v28.4s,  v30.4s // -(6-intermediate_bits)
2814*c0909341SAndroid Build Coastguard Worker        uzp1            v23.8h,  v25.8h,  v26.8h // Same as xtn, xtn2
2815*c0909341SAndroid Build Coastguard Worker        uzp1            v24.8h,  v27.8h,  v28.8h // Ditto
2816*c0909341SAndroid Build Coastguard Worker        ret
2817*c0909341SAndroid Build Coastguard Workerendfunc
2818*c0909341SAndroid Build Coastguard Worker
// Jump table for the hv code of this type/taps instantiation. One 32-bit
// entry per width label, widest first (1280, 640, 320, 160, 80, 40, 20). Each
// entry is the distance from the table symbol to the nearest preceding
// definition of that numeric label (the b suffix is a backward reference).
// NOTE(review): the code that indexes this table is outside this chunk;
// presumably the index is clz(w) - 24 as in the bilin dispatch - confirm.
2819*c0909341SAndroid Build Coastguard Workerjumptable \type\()_\taps\()_hv_tbl
2820*c0909341SAndroid Build Coastguard Worker        .word 1280b - \type\()_\taps\()_hv_tbl
2821*c0909341SAndroid Build Coastguard Worker        .word 640b  - \type\()_\taps\()_hv_tbl
2822*c0909341SAndroid Build Coastguard Worker        .word 320b  - \type\()_\taps\()_hv_tbl
2823*c0909341SAndroid Build Coastguard Worker        .word 160b  - \type\()_\taps\()_hv_tbl
2824*c0909341SAndroid Build Coastguard Worker        .word 80b   - \type\()_\taps\()_hv_tbl
2825*c0909341SAndroid Build Coastguard Worker        .word 40b   - \type\()_\taps\()_hv_tbl
2826*c0909341SAndroid Build Coastguard Worker        .word 20b   - \type\()_\taps\()_hv_tbl
2827*c0909341SAndroid Build Coastguard Workerendjumptable
2828*c0909341SAndroid Build Coastguard Worker.endm
2829*c0909341SAndroid Build Coastguard Worker
2830*c0909341SAndroid Build Coastguard Worker
2831*c0909341SAndroid Build Coastguard Worker.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
// Bilinear filter entry point, instantiated once per type (put or prep).
// Register roles are the macro arguments (dst, d_strd, src, s_strd, w, h, mx,
// my, bdmax, ds2, sr2); the concrete registers are chosen where the macro is
// invoked.
//
// Set up here for every path:
//   v0 = 16 - mx, v1 = mx   horizontal weights, broadcast to all .8h lanes
//   v2 = 16 - my, v3 = my   vertical weights, broadcast to all .8h lanes
//   bdmax = intermediate_bits = clz(bitdepth_max) - 18
//   w9    = clz(w) - 24     jump table index
//   w11   = 4 - intermediate_bits
//   w12   = 4 + intermediate_bits
//
// Dispatch:
//   mx != 0              -> bilin_h below (which branches on to bilin_hv when
//                           my != 0)
//   mx == 0 and my != 0  -> bilin_v
//   mx == 0 and my == 0  -> branch to the type_16bpc_neon function
//
// The horizontal-only code in this function computes, in 16-bit lanes,
//   t = urshl(a * (16 - mx) + b * mx, -(4 - intermediate_bits))
// where b is the sample to the right of a. put then applies
// urshl(t, -intermediate_bits); prep subtracts the PREP_BIAS constant instead.
// Every loop iteration handles two rows: src -> dst and sr2 -> ds2.
2832*c0909341SAndroid Build Coastguard Workerfunction \type\()_bilin_16bpc_neon, export=1
        // When the bdmax argument is w8, bitdepth_max is read from [sp].
2833*c0909341SAndroid Build Coastguard Worker.ifc \bdmax, w8
2834*c0909341SAndroid Build Coastguard Worker        ldr             w8,  [sp]
2835*c0909341SAndroid Build Coastguard Worker.endif
        // Broadcast the filter weights; the two weights of a direction sum to 16.
2836*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   \mx
2837*c0909341SAndroid Build Coastguard Worker        dup             v3.8h,   \my
2838*c0909341SAndroid Build Coastguard Worker        mov             w10, #16
2839*c0909341SAndroid Build Coastguard Worker        sub             w9,  w10, \mx
2840*c0909341SAndroid Build Coastguard Worker        sub             w10, w10, \my
2841*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   w9
2842*c0909341SAndroid Build Coastguard Worker        dup             v2.8h,   w10
        // prep: d_strd is set to 2 * w (zero-extended width, 2 bytes per sample).
2843*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
2844*c0909341SAndroid Build Coastguard Worker        uxtw            \d_strd, \w
2845*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
2846*c0909341SAndroid Build Coastguard Worker.endif
2847*c0909341SAndroid Build Coastguard Worker
        // The bdmax register is reused in place for intermediate_bits, and w9
        // becomes the table index clz(w) - 24 (0 for w == 128 ... 6 for w == 2
        // when w is a power of two).
2848*c0909341SAndroid Build Coastguard Worker        clz             \bdmax,   \bdmax       // bitdepth_max
2849*c0909341SAndroid Build Coastguard Worker        clz             w9,  \w
2850*c0909341SAndroid Build Coastguard Worker        sub             \bdmax,   \bdmax,  #18 // intermediate_bits = clz(bitdepth_max) - 18
2851*c0909341SAndroid Build Coastguard Worker        mov             w11, #4
2852*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #24
2853*c0909341SAndroid Build Coastguard Worker        sub             w11, w11, \bdmax  // 4 - intermediate_bits
2854*c0909341SAndroid Build Coastguard Worker        add             w12, \bdmax, #4   // 4 + intermediate_bits
2855*c0909341SAndroid Build Coastguard Worker        cbnz            \mx, L(\type\()_bilin_h)
2856*c0909341SAndroid Build Coastguard Worker        cbnz            \my, L(\type\()_bilin_v)
2857*c0909341SAndroid Build Coastguard Worker        b               \type\()_16bpc_neon
2858*c0909341SAndroid Build Coastguard Worker
2859*c0909341SAndroid Build Coastguard WorkerL(\type\()_bilin_h):
2860*c0909341SAndroid Build Coastguard Worker        cbnz            \my, L(\type\()_bilin_hv)
2861*c0909341SAndroid Build Coastguard Worker
        // Horizontal only. Prepare the shift/bias vectors and jump to the
        // width-specific code through the bilin_h table (32-bit offsets
        // relative to the table address, indexed by w9).
        //   v31 = -(4 - intermediate_bits)   first urshl count
        //   v30 = -intermediate_bits         second urshl count (put only)
        //   v29 = (PREP_BIAS >> 8) << 8      bias subtracted for prep
2862*c0909341SAndroid Build Coastguard Worker        movrel          x10, \type\()_bilin_h_tbl
2863*c0909341SAndroid Build Coastguard Worker        dup             v31.8h,  w11      // 4 - intermediate_bits
2864*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x10, x9, lsl #2]
2865*c0909341SAndroid Build Coastguard Worker        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
2866*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2867*c0909341SAndroid Build Coastguard Worker        dup             v30.8h,  \bdmax   // intermediate_bits
2868*c0909341SAndroid Build Coastguard Worker.else
2869*c0909341SAndroid Build Coastguard Worker        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
2870*c0909341SAndroid Build Coastguard Worker.endif
2871*c0909341SAndroid Build Coastguard Worker        add             x10, x10, x9
2872*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2873*c0909341SAndroid Build Coastguard Worker        neg             v30.8h,  v30.8h   // -intermediate_bits
2874*c0909341SAndroid Build Coastguard Worker.endif
2875*c0909341SAndroid Build Coastguard Worker        br              x10
2876*c0909341SAndroid Build Coastguard Worker
        // The 2xN body is only assembled for put; for prep the 20 label falls
        // straight through to the 4xN code.
2877*c0909341SAndroid Build Coastguard Worker20:     // 2xN h
2878*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
2879*c0909341SAndroid Build Coastguard Worker.ifc \type, put
        // ds2/sr2 address the second row of each pair; the strides are
        // doubled so that each pointer steps over two rows.
2880*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd
2881*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd
2882*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1
2883*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1
2884*c0909341SAndroid Build Coastguard Worker2:
        // Load 4 samples per row. ext rotates by one sample; trn1 packs
        // samples 0-1 of both rows into v4 and samples 1-2 of both rows into
        // v5, so one mul/mla filters both rows. The flags set by subs are
        // consumed by the b.gt at the bottom; no instruction in between
        // writes the flags.
2885*c0909341SAndroid Build Coastguard Worker        ld1             {v4.4h},  [\src], \s_strd
2886*c0909341SAndroid Build Coastguard Worker        ld1             {v6.4h},  [\sr2], \s_strd
2887*c0909341SAndroid Build Coastguard Worker        ext             v5.8b,   v4.8b,   v4.8b,   #2
2888*c0909341SAndroid Build Coastguard Worker        ext             v7.8b,   v6.8b,   v6.8b,   #2
2889*c0909341SAndroid Build Coastguard Worker        trn1            v4.2s,   v4.2s,   v6.2s
2890*c0909341SAndroid Build Coastguard Worker        trn1            v5.2s,   v5.2s,   v7.2s
2891*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
2892*c0909341SAndroid Build Coastguard Worker        mul             v4.4h,   v4.4h,   v0.4h
2893*c0909341SAndroid Build Coastguard Worker        mla             v4.4h,   v5.4h,   v1.4h
2894*c0909341SAndroid Build Coastguard Worker        urshl           v4.4h,   v4.4h,   v31.4h
2895*c0909341SAndroid Build Coastguard Worker        urshl           v4.4h,   v4.4h,   v30.4h
        // Lane s[0] holds the two output samples of the first row, s[1]
        // those of the second row.
2896*c0909341SAndroid Build Coastguard Worker        st1             {v4.s}[0], [\dst], \d_strd
2897*c0909341SAndroid Build Coastguard Worker        st1             {v4.s}[1], [\ds2], \d_strd
2898*c0909341SAndroid Build Coastguard Worker        b.gt            2b
2899*c0909341SAndroid Build Coastguard Worker        ret
2900*c0909341SAndroid Build Coastguard Worker.endif
2901*c0909341SAndroid Build Coastguard Worker
2902*c0909341SAndroid Build Coastguard Worker40:     // 4xN h
2903*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
2904*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd
2905*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd
2906*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1
2907*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1
2908*c0909341SAndroid Build Coastguard Worker4:
        // Load 8 samples per row. After ext (rotate by one sample) and
        // trn1 .2d, v4 holds samples 0-3 of both rows and v5 holds samples
        // 1-4 of both rows (first row in the low half, second in the high).
2909*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h}, [\src], \s_strd
2910*c0909341SAndroid Build Coastguard Worker        ld1             {v6.8h}, [\sr2], \s_strd
2911*c0909341SAndroid Build Coastguard Worker        ext             v5.16b,  v4.16b,  v4.16b,  #2
2912*c0909341SAndroid Build Coastguard Worker        ext             v7.16b,  v6.16b,  v6.16b,  #2
2913*c0909341SAndroid Build Coastguard Worker        trn1            v4.2d,   v4.2d,   v6.2d
2914*c0909341SAndroid Build Coastguard Worker        trn1            v5.2d,   v5.2d,   v7.2d
2915*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
2916*c0909341SAndroid Build Coastguard Worker        mul             v4.8h,   v4.8h,   v0.8h
2917*c0909341SAndroid Build Coastguard Worker        mla             v4.8h,   v5.8h,   v1.8h
2918*c0909341SAndroid Build Coastguard Worker        urshl           v4.8h,   v4.8h,   v31.8h
2919*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2920*c0909341SAndroid Build Coastguard Worker        urshl           v4.8h,   v4.8h,   v30.8h
2921*c0909341SAndroid Build Coastguard Worker.else
2922*c0909341SAndroid Build Coastguard Worker        sub             v4.8h,   v4.8h,   v29.8h
2923*c0909341SAndroid Build Coastguard Worker.endif
        // Low 8 bytes go to the first row, the high d lane to the second row.
2924*c0909341SAndroid Build Coastguard Worker        st1             {v4.8b},   [\dst], \d_strd
2925*c0909341SAndroid Build Coastguard Worker        st1             {v4.d}[1], [\ds2], \d_strd
2926*c0909341SAndroid Build Coastguard Worker        b.gt            4b
2927*c0909341SAndroid Build Coastguard Worker        ret
2928*c0909341SAndroid Build Coastguard Worker
2929*c0909341SAndroid Build Coastguard Worker80:     // 8xN h
2930*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
2931*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd
2932*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd
2933*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1
2934*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1
2935*c0909341SAndroid Build Coastguard Worker8:
        // The ninth sample of each row (byte offset 16) is loaded on its own
        // before the post-incrementing ld1, so that ext can build the
        // shifted-by-one vectors (samples 1-8) in v5 and v7.
2936*c0909341SAndroid Build Coastguard Worker        ldr             h5,  [\src, #16]
2937*c0909341SAndroid Build Coastguard Worker        ldr             h7,  [\sr2, #16]
2938*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h}, [\src], \s_strd
2939*c0909341SAndroid Build Coastguard Worker        ld1             {v6.8h}, [\sr2], \s_strd
2940*c0909341SAndroid Build Coastguard Worker        ext             v5.16b,  v4.16b,  v5.16b,  #2
2941*c0909341SAndroid Build Coastguard Worker        ext             v7.16b,  v6.16b,  v7.16b,  #2
2942*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
2943*c0909341SAndroid Build Coastguard Worker        mul             v4.8h,   v4.8h,   v0.8h
2944*c0909341SAndroid Build Coastguard Worker        mla             v4.8h,   v5.8h,   v1.8h
2945*c0909341SAndroid Build Coastguard Worker        mul             v6.8h,   v6.8h,   v0.8h
2946*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v7.8h,   v1.8h
2947*c0909341SAndroid Build Coastguard Worker        urshl           v4.8h,   v4.8h,   v31.8h
2948*c0909341SAndroid Build Coastguard Worker        urshl           v6.8h,   v6.8h,   v31.8h
2949*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2950*c0909341SAndroid Build Coastguard Worker        urshl           v4.8h,   v4.8h,   v30.8h
2951*c0909341SAndroid Build Coastguard Worker        urshl           v6.8h,   v6.8h,   v30.8h
2952*c0909341SAndroid Build Coastguard Worker.else
2953*c0909341SAndroid Build Coastguard Worker        sub             v4.8h,   v4.8h,   v29.8h
2954*c0909341SAndroid Build Coastguard Worker        sub             v6.8h,   v6.8h,   v29.8h
2955*c0909341SAndroid Build Coastguard Worker.endif
2956*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h}, [\dst], \d_strd
2957*c0909341SAndroid Build Coastguard Worker        st1             {v6.8h}, [\ds2], \d_strd
2958*c0909341SAndroid Build Coastguard Worker        b.gt            8b
2959*c0909341SAndroid Build Coastguard Worker        ret
        // All widths of 16 and above share one routine: an outer loop (161)
        // over row pairs and an inner loop (16) over 16-sample column chunks.
2960*c0909341SAndroid Build Coastguard Worker160:
2961*c0909341SAndroid Build Coastguard Worker320:
2962*c0909341SAndroid Build Coastguard Worker640:
2963*c0909341SAndroid Build Coastguard Worker1280:   // 16xN, 32xN, ... h
2964*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
2965*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd
2966*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd
2967*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1
2968*c0909341SAndroid Build Coastguard Worker
        // Turn the strides into end-of-row adjustments. One pass over a row
        // advances src/sr2 by 16 + 2*w bytes and dst/ds2 by 2*w bytes, so
        //   s_strd = 2*s_strd - 2*w - 16
        //   d_strd = 2*d_strd - 2*w      (put)
        // prep keeps d_strd = 2*w from the entry code; added after the 2*w
        // bytes already written, it skips the row written through the other
        // pointer.
2969*c0909341SAndroid Build Coastguard Worker        sub             \s_strd,  \s_strd,  \w, uxtw #1
2970*c0909341SAndroid Build Coastguard Worker        sub             \s_strd,  \s_strd,  #16
2971*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2972*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1
2973*c0909341SAndroid Build Coastguard Worker        sub             \d_strd,  \d_strd,  \w, uxtw #1
2974*c0909341SAndroid Build Coastguard Worker.endif
2975*c0909341SAndroid Build Coastguard Worker161:
        // Start of a row pair: preload the first 8 samples of each row. The
        // mx register is free to act as the column counter because the mx
        // weight already lives in v1.
2976*c0909341SAndroid Build Coastguard Worker        ld1             {v16.8h},  [\src], #16
2977*c0909341SAndroid Build Coastguard Worker        ld1             {v21.8h},  [\sr2], #16
2978*c0909341SAndroid Build Coastguard Worker        mov             \mx, \w
2979*c0909341SAndroid Build Coastguard Worker
2980*c0909341SAndroid Build Coastguard Worker16:
        // v16:v17:v18 (first row) and v21:v22:v23 (second row) hold 24
        // consecutive samples; ext forms the shifted-by-one vectors for the 16
        // outputs of this chunk.
2981*c0909341SAndroid Build Coastguard Worker        ld1             {v17.8h, v18.8h},  [\src], #32
2982*c0909341SAndroid Build Coastguard Worker        ld1             {v22.8h, v23.8h},  [\sr2], #32
2983*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v16.16b, v17.16b, #2
2984*c0909341SAndroid Build Coastguard Worker        ext             v20.16b, v17.16b, v18.16b, #2
2985*c0909341SAndroid Build Coastguard Worker        ext             v24.16b, v21.16b, v22.16b, #2
2986*c0909341SAndroid Build Coastguard Worker        ext             v25.16b, v22.16b, v23.16b, #2
2987*c0909341SAndroid Build Coastguard Worker        mul             v16.8h,  v16.8h,  v0.8h
2988*c0909341SAndroid Build Coastguard Worker        mla             v16.8h,  v19.8h,  v1.8h
2989*c0909341SAndroid Build Coastguard Worker        mul             v17.8h,  v17.8h,  v0.8h
2990*c0909341SAndroid Build Coastguard Worker        mla             v17.8h,  v20.8h,  v1.8h
2991*c0909341SAndroid Build Coastguard Worker        mul             v21.8h,  v21.8h,  v0.8h
2992*c0909341SAndroid Build Coastguard Worker        mla             v21.8h,  v24.8h,  v1.8h
2993*c0909341SAndroid Build Coastguard Worker        mul             v22.8h,  v22.8h,  v0.8h
2994*c0909341SAndroid Build Coastguard Worker        mla             v22.8h,  v25.8h,  v1.8h
2995*c0909341SAndroid Build Coastguard Worker        urshl           v16.8h,  v16.8h,  v31.8h
2996*c0909341SAndroid Build Coastguard Worker        urshl           v17.8h,  v17.8h,  v31.8h
2997*c0909341SAndroid Build Coastguard Worker        urshl           v21.8h,  v21.8h,  v31.8h
2998*c0909341SAndroid Build Coastguard Worker        urshl           v22.8h,  v22.8h,  v31.8h
        // Column counter; its flags feed the b.le after the stores (none of
        // the vector instructions or stores in between write the flags).
2999*c0909341SAndroid Build Coastguard Worker        subs            \mx, \mx, #16
3000*c0909341SAndroid Build Coastguard Worker.ifc \type, put
3001*c0909341SAndroid Build Coastguard Worker        urshl           v16.8h,  v16.8h,  v30.8h
3002*c0909341SAndroid Build Coastguard Worker        urshl           v17.8h,  v17.8h,  v30.8h
3003*c0909341SAndroid Build Coastguard Worker        urshl           v21.8h,  v21.8h,  v30.8h
3004*c0909341SAndroid Build Coastguard Worker        urshl           v22.8h,  v22.8h,  v30.8h
3005*c0909341SAndroid Build Coastguard Worker.else
3006*c0909341SAndroid Build Coastguard Worker        sub             v16.8h,  v16.8h,  v29.8h
3007*c0909341SAndroid Build Coastguard Worker        sub             v17.8h,  v17.8h,  v29.8h
3008*c0909341SAndroid Build Coastguard Worker        sub             v21.8h,  v21.8h,  v29.8h
3009*c0909341SAndroid Build Coastguard Worker        sub             v22.8h,  v22.8h,  v29.8h
3010*c0909341SAndroid Build Coastguard Worker.endif
3011*c0909341SAndroid Build Coastguard Worker        st1             {v16.8h, v17.8h}, [\dst], #32
3012*c0909341SAndroid Build Coastguard Worker        st1             {v21.8h, v22.8h}, [\ds2], #32
3013*c0909341SAndroid Build Coastguard Worker        b.le            9f
3014*c0909341SAndroid Build Coastguard Worker
        // More columns remain: the last 8 samples loaded become the first 8
        // of the next chunk.
3015*c0909341SAndroid Build Coastguard Worker        mov             v16.16b, v18.16b
3016*c0909341SAndroid Build Coastguard Worker        mov             v21.16b, v23.16b
3017*c0909341SAndroid Build Coastguard Worker        b               16b
3018*c0909341SAndroid Build Coastguard Worker
3019*c0909341SAndroid Build Coastguard Worker9:
        // End of the row pair: apply the adjusted strides to reach the next
        // pair of rows.
3020*c0909341SAndroid Build Coastguard Worker        add             \dst,  \dst,  \d_strd
3021*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \ds2,  \d_strd
3022*c0909341SAndroid Build Coastguard Worker        add             \src,  \src,  \s_strd
3023*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \sr2,  \s_strd
3024*c0909341SAndroid Build Coastguard Worker
3025*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
3026*c0909341SAndroid Build Coastguard Worker        b.gt            161b
3027*c0909341SAndroid Build Coastguard Worker        ret
3028*c0909341SAndroid Build Coastguard Workerendfunc
3029*c0909341SAndroid Build Coastguard Worker
// Dispatch table for the horizontal-only bilinear handlers.
// Each .word is the distance from the start of this table to one of the
// numeric local labels defined earlier in the file (the `b` suffix selects the
// nearest preceding definition of that number). Entries run from the widest
// handler (1280) down to the narrowest (20).
// NOTE(review): the code that indexes this table lies outside this chunk;
// presumably it loads an entry, adds it to the table address and branches.
3030*c0909341SAndroid Build Coastguard Workerjumptable \type\()_bilin_h_tbl
3031*c0909341SAndroid Build Coastguard Worker        .word 1280b - \type\()_bilin_h_tbl
3032*c0909341SAndroid Build Coastguard Worker        .word 640b  - \type\()_bilin_h_tbl
3033*c0909341SAndroid Build Coastguard Worker        .word 320b  - \type\()_bilin_h_tbl
3034*c0909341SAndroid Build Coastguard Worker        .word 160b  - \type\()_bilin_h_tbl
3035*c0909341SAndroid Build Coastguard Worker        .word 80b   - \type\()_bilin_h_tbl
3036*c0909341SAndroid Build Coastguard Worker        .word 40b   - \type\()_bilin_h_tbl
3037*c0909341SAndroid Build Coastguard Worker        .word 20b   - \type\()_bilin_h_tbl
3038*c0909341SAndroid Build Coastguard Workerendjumptable
3039*c0909341SAndroid Build Coastguard Worker
3040*c0909341SAndroid Build Coastguard Worker
// L(\type\()_bilin_v): vertical-only bilinear filter on 16-bit samples.
// Every output row is row[y] * v2 + row[y+1] * v3, followed by:
//   put:  rounding shift right by 4 (urshr #4).
//   prep: rounding shift right by (4 - intermediate_bits) (urshl by the
//         negated count in v31), then v29 (PREP_BIAS) is subtracted.
// State expected on entry (set up outside this chunk, so treat as assumptions):
//   v2, v3 - the two vertical weights; the fixed >>4 in the put path suggests
//            they sum to 16 (presumably 16-my and my) - TODO confirm.
//   x9     - index into \type\()_bilin_v_tbl selecting the width handler.
//   w11    - 4 - intermediate_bits (only read when \type is prep).
// Every handler walks rows in pairs: \sr2/\ds2 are set one row below
// \src/\dst and both strides are doubled.
3041*c0909341SAndroid Build Coastguard Workerfunction L(\type\()_bilin_v)
        // NOTE(review): each handler below overwrites the flags (cmp/subs)
        // before its first conditional branch, so this compare looks unused
        // within this function - confirm before relying on it.
3042*c0909341SAndroid Build Coastguard Worker        cmp             \h,  #4
3043*c0909341SAndroid Build Coastguard Worker        movrel          x10, \type\()_bilin_v_tbl
3044*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
3045*c0909341SAndroid Build Coastguard Worker        dup             v31.8h,  w11      // 4 - intermediate_bits
3046*c0909341SAndroid Build Coastguard Worker.endif
        // Fetch the sign-extended 32-bit table entry: handler offset relative
        // to the table start.
3047*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x10, x9, lsl #2]
3048*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
        // This movi form encodes a single byte, so v29 gets PREP_BIAS with its
        // low 8 bits cleared (identical to PREP_BIAS if those bits are zero;
        // PREP_BIAS itself is defined outside this chunk).
3049*c0909341SAndroid Build Coastguard Worker        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
3050*c0909341SAndroid Build Coastguard Worker        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
3051*c0909341SAndroid Build Coastguard Worker.endif
3052*c0909341SAndroid Build Coastguard Worker        add             x10, x10, x9
3053*c0909341SAndroid Build Coastguard Worker        br              x10
3054*c0909341SAndroid Build Coastguard Worker
        // For prep the body below is not assembled, so label 20 falls straight
        // through into the 4xN handler at label 40.
3055*c0909341SAndroid Build Coastguard Worker20:     // 2xN v
3056*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3057*c0909341SAndroid Build Coastguard Worker.ifc \type, put
        // Flags from this compare survive until the b.gt below (the adds, lsls
        // and ld1r in between do not write flags).
3058*c0909341SAndroid Build Coastguard Worker        cmp             \h,  #2
3059*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd
3060*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd
3061*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1
3062*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1
3063*c0909341SAndroid Build Coastguard Worker
3064*c0909341SAndroid Build Coastguard Worker        // 2x2 v
        // Each ld1r fetches one 32-bit word (two 16-bit samples = one 2-wide
        // row) and replicates it across the vector.
3065*c0909341SAndroid Build Coastguard Worker        ld1r            {v16.4s}, [\src], \s_strd
3066*c0909341SAndroid Build Coastguard Worker        b.gt            24f
        // Label 22 handles exactly two output rows; it is also the tail of the
        // 4-row loop at label 24 when two rows remain.
3067*c0909341SAndroid Build Coastguard Worker22:
3068*c0909341SAndroid Build Coastguard Worker        ld1r            {v17.4s}, [\sr2], \s_strd
3069*c0909341SAndroid Build Coastguard Worker        ld1r            {v18.4s}, [\src], \s_strd
        // Pack rows so one multiply covers both outputs:
        // v16 = {row0, row1}, v17 = {row1, row2}.
3070*c0909341SAndroid Build Coastguard Worker        trn1            v16.2s,  v16.2s,  v17.2s
3071*c0909341SAndroid Build Coastguard Worker        trn1            v17.2s,  v17.2s,  v18.2s
3072*c0909341SAndroid Build Coastguard Worker        mul             v4.4h,   v16.4h,  v2.4h
3073*c0909341SAndroid Build Coastguard Worker        mla             v4.4h,   v17.4h,  v3.4h
3074*c0909341SAndroid Build Coastguard Worker        urshr           v4.8h,   v4.8h,   #4
        // Lane 0 (32 bits) is the first output row, lane 1 the second.
3075*c0909341SAndroid Build Coastguard Worker        str             s4,        [\dst]
3076*c0909341SAndroid Build Coastguard Worker        st1             {v4.s}[1], [\ds2]
3077*c0909341SAndroid Build Coastguard Worker        ret
3078*c0909341SAndroid Build Coastguard Worker24:     // 2x4, 2x6, 2x8, ... v
        // Four output rows per pass: five input rows end up in v16..v20.
3079*c0909341SAndroid Build Coastguard Worker        ld1r            {v17.4s}, [\sr2], \s_strd
3080*c0909341SAndroid Build Coastguard Worker        ld1r            {v18.4s}, [\src], \s_strd
3081*c0909341SAndroid Build Coastguard Worker        ld1r            {v19.4s}, [\sr2], \s_strd
3082*c0909341SAndroid Build Coastguard Worker        ld1r            {v20.4s}, [\src], \s_strd
3083*c0909341SAndroid Build Coastguard Worker        sub             \h,  \h,  #4
        // After the transposes: v16 = {row0, row1, row2, row3} and
        // v17 = {row1, row2, row3, row4}, one 32-bit lane per row.
3084*c0909341SAndroid Build Coastguard Worker        trn1            v16.2s,  v16.2s,  v17.2s
3085*c0909341SAndroid Build Coastguard Worker        trn1            v17.2s,  v17.2s,  v18.2s
3086*c0909341SAndroid Build Coastguard Worker        trn1            v18.2s,  v18.2s,  v19.2s
3087*c0909341SAndroid Build Coastguard Worker        trn1            v19.2s,  v19.2s,  v20.2s
3088*c0909341SAndroid Build Coastguard Worker        trn1            v16.2d,  v16.2d,  v18.2d
3089*c0909341SAndroid Build Coastguard Worker        trn1            v17.2d,  v17.2d,  v19.2d
3090*c0909341SAndroid Build Coastguard Worker        mul             v4.8h,   v16.8h,  v2.8h
3091*c0909341SAndroid Build Coastguard Worker        mla             v4.8h,   v17.8h,  v3.8h
        // Compare the remaining row count with 2; the result is consumed by
        // the b.lt / b.eq below (stores and the mov leave flags untouched).
3092*c0909341SAndroid Build Coastguard Worker        cmp             \h,  #2
3093*c0909341SAndroid Build Coastguard Worker        urshr           v4.8h,   v4.8h,   #4
3094*c0909341SAndroid Build Coastguard Worker        st1             {v4.s}[0], [\dst], \d_strd
3095*c0909341SAndroid Build Coastguard Worker        st1             {v4.s}[1], [\ds2], \d_strd
3096*c0909341SAndroid Build Coastguard Worker        st1             {v4.s}[2], [\dst], \d_strd
3097*c0909341SAndroid Build Coastguard Worker        st1             {v4.s}[3], [\ds2], \d_strd
        // Fewer than two rows left: done.
3098*c0909341SAndroid Build Coastguard Worker        b.lt            0f
        // The last loaded row becomes the top row of the next pass.
3099*c0909341SAndroid Build Coastguard Worker        mov             v16.8b,  v20.8b
        // Exactly two rows left: finish with the 2-row tail, else loop.
3100*c0909341SAndroid Build Coastguard Worker        b.eq            22b
3101*c0909341SAndroid Build Coastguard Worker        b               24b
3102*c0909341SAndroid Build Coastguard Worker0:
3103*c0909341SAndroid Build Coastguard Worker        ret
3104*c0909341SAndroid Build Coastguard Worker.endif
3105*c0909341SAndroid Build Coastguard Worker
3106*c0909341SAndroid Build Coastguard Worker40:     // 4xN v
3107*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3108*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd
3109*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd
3110*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1
3111*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1
        // A 4-wide row is 8 bytes, so two rows share one 128-bit register.
3112*c0909341SAndroid Build Coastguard Worker        ld1             {v16.4h}, [\src], \s_strd
3113*c0909341SAndroid Build Coastguard Worker4:
3114*c0909341SAndroid Build Coastguard Worker        ld1             {v17.4h}, [\sr2], \s_strd
3115*c0909341SAndroid Build Coastguard Worker        ld1             {v18.4h}, [\src], \s_strd
        // v16 = {row0, row1}, v17 = {row1, row2}.
3116*c0909341SAndroid Build Coastguard Worker        trn1            v16.2d,  v16.2d,  v17.2d
3117*c0909341SAndroid Build Coastguard Worker        trn1            v17.2d,  v17.2d,  v18.2d
3118*c0909341SAndroid Build Coastguard Worker        mul             v4.8h,   v16.8h,  v2.8h
3119*c0909341SAndroid Build Coastguard Worker        mla             v4.8h,   v17.8h,  v3.8h
        // Row counter; flags are consumed by the b.le after the stores.
3120*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
3121*c0909341SAndroid Build Coastguard Worker.ifc \type, put
3122*c0909341SAndroid Build Coastguard Worker        urshr           v4.8h,   v4.8h,   #4
3123*c0909341SAndroid Build Coastguard Worker.else
        // v31 holds a negative count, so urshl is a rounding right shift.
3124*c0909341SAndroid Build Coastguard Worker        urshl           v4.8h,   v4.8h,   v31.8h
3125*c0909341SAndroid Build Coastguard Worker        sub             v4.8h,   v4.8h,   v29.8h
3126*c0909341SAndroid Build Coastguard Worker.endif
        // Low 8 bytes = first output row, high 8 bytes = second output row.
3127*c0909341SAndroid Build Coastguard Worker        st1             {v4.8b},   [\dst], \d_strd
3128*c0909341SAndroid Build Coastguard Worker        st1             {v4.d}[1], [\ds2], \d_strd
3129*c0909341SAndroid Build Coastguard Worker        b.le            0f
        // Carry the last loaded row over as the next top row.
3130*c0909341SAndroid Build Coastguard Worker        mov             v16.8b,  v18.8b
3131*c0909341SAndroid Build Coastguard Worker        b               4b
3132*c0909341SAndroid Build Coastguard Worker0:
3133*c0909341SAndroid Build Coastguard Worker        ret
3134*c0909341SAndroid Build Coastguard Worker
3135*c0909341SAndroid Build Coastguard Worker80:     // 8xN v
3136*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3137*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd
3138*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd
3139*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1
3140*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1
        // One 8-wide row fills a whole 128-bit register.
3141*c0909341SAndroid Build Coastguard Worker        ld1             {v16.8h}, [\src], \s_strd
3142*c0909341SAndroid Build Coastguard Worker8:
3143*c0909341SAndroid Build Coastguard Worker        ld1             {v17.8h}, [\sr2], \s_strd
3144*c0909341SAndroid Build Coastguard Worker        ld1             {v18.8h}, [\src], \s_strd
        // v4 = first output row (rows 0,1), v5 = second output row (rows 1,2).
3145*c0909341SAndroid Build Coastguard Worker        mul             v4.8h,   v16.8h,  v2.8h
3146*c0909341SAndroid Build Coastguard Worker        mla             v4.8h,   v17.8h,  v3.8h
3147*c0909341SAndroid Build Coastguard Worker        mul             v5.8h,   v17.8h,  v2.8h
3148*c0909341SAndroid Build Coastguard Worker        mla             v5.8h,   v18.8h,  v3.8h
3149*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
3150*c0909341SAndroid Build Coastguard Worker.ifc \type, put
3151*c0909341SAndroid Build Coastguard Worker        urshr           v4.8h,   v4.8h,   #4
3152*c0909341SAndroid Build Coastguard Worker        urshr           v5.8h,   v5.8h,   #4
3153*c0909341SAndroid Build Coastguard Worker.else
3154*c0909341SAndroid Build Coastguard Worker        urshl           v4.8h,   v4.8h,   v31.8h
3155*c0909341SAndroid Build Coastguard Worker        urshl           v5.8h,   v5.8h,   v31.8h
3156*c0909341SAndroid Build Coastguard Worker        sub             v4.8h,   v4.8h,   v29.8h
3157*c0909341SAndroid Build Coastguard Worker        sub             v5.8h,   v5.8h,   v29.8h
3158*c0909341SAndroid Build Coastguard Worker.endif
3159*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h}, [\dst], \d_strd
3160*c0909341SAndroid Build Coastguard Worker        st1             {v5.8h}, [\ds2], \d_strd
3161*c0909341SAndroid Build Coastguard Worker        b.le            0f
3162*c0909341SAndroid Build Coastguard Worker        mov             v16.16b, v18.16b
3163*c0909341SAndroid Build Coastguard Worker        b               8b
3164*c0909341SAndroid Build Coastguard Worker0:
3165*c0909341SAndroid Build Coastguard Worker        ret
3166*c0909341SAndroid Build Coastguard Worker
        // Wide blocks are processed as 16-sample-wide column strips: the
        // inner loop (label 2) runs down the full height, then label 9 rewinds
        // the pointers and steps 16 samples (32 bytes) to the right.
3167*c0909341SAndroid Build Coastguard Worker160:    // 16xN, 32xN, ...
3168*c0909341SAndroid Build Coastguard Worker320:
3169*c0909341SAndroid Build Coastguard Worker640:
3170*c0909341SAndroid Build Coastguard Worker1280:
3171*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // Save the height; \my is reused as scratch from here on.
3172*c0909341SAndroid Build Coastguard Worker        mov             \my, \h
3173*c0909341SAndroid Build Coastguard Worker1:
3174*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd
3175*c0909341SAndroid Build Coastguard Worker        add             \sr2, \src, \s_strd
3176*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1
3177*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
3178*c0909341SAndroid Build Coastguard Worker
3179*c0909341SAndroid Build Coastguard Worker        ld1             {v16.8h, v17.8h}, [\src], \s_strd
3180*c0909341SAndroid Build Coastguard Worker2:
3181*c0909341SAndroid Build Coastguard Worker        ld1             {v18.8h, v19.8h}, [\sr2], \s_strd
3182*c0909341SAndroid Build Coastguard Worker        ld1             {v20.8h, v21.8h}, [\src], \s_strd
        // v4/v5 = first output row (rows 0,1); v6/v7 = second (rows 1,2).
3183*c0909341SAndroid Build Coastguard Worker        mul             v4.8h,   v16.8h,  v2.8h
3184*c0909341SAndroid Build Coastguard Worker        mla             v4.8h,   v18.8h,  v3.8h
3185*c0909341SAndroid Build Coastguard Worker        mul             v5.8h,   v17.8h,  v2.8h
3186*c0909341SAndroid Build Coastguard Worker        mla             v5.8h,   v19.8h,  v3.8h
3187*c0909341SAndroid Build Coastguard Worker        mul             v6.8h,   v18.8h,  v2.8h
3188*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v20.8h,  v3.8h
3189*c0909341SAndroid Build Coastguard Worker        mul             v7.8h,   v19.8h,  v2.8h
3190*c0909341SAndroid Build Coastguard Worker        mla             v7.8h,   v21.8h,  v3.8h
3191*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
3192*c0909341SAndroid Build Coastguard Worker.ifc \type, put
3193*c0909341SAndroid Build Coastguard Worker        urshr           v4.8h,   v4.8h,   #4
3194*c0909341SAndroid Build Coastguard Worker        urshr           v5.8h,   v5.8h,   #4
3195*c0909341SAndroid Build Coastguard Worker        urshr           v6.8h,   v6.8h,   #4
3196*c0909341SAndroid Build Coastguard Worker        urshr           v7.8h,   v7.8h,   #4
3197*c0909341SAndroid Build Coastguard Worker.else
3198*c0909341SAndroid Build Coastguard Worker        urshl           v4.8h,   v4.8h,   v31.8h
3199*c0909341SAndroid Build Coastguard Worker        urshl           v5.8h,   v5.8h,   v31.8h
3200*c0909341SAndroid Build Coastguard Worker        urshl           v6.8h,   v6.8h,   v31.8h
3201*c0909341SAndroid Build Coastguard Worker        urshl           v7.8h,   v7.8h,   v31.8h
3202*c0909341SAndroid Build Coastguard Worker        sub             v4.8h,   v4.8h,   v29.8h
3203*c0909341SAndroid Build Coastguard Worker        sub             v5.8h,   v5.8h,   v29.8h
3204*c0909341SAndroid Build Coastguard Worker        sub             v6.8h,   v6.8h,   v29.8h
3205*c0909341SAndroid Build Coastguard Worker        sub             v7.8h,   v7.8h,   v29.8h
3206*c0909341SAndroid Build Coastguard Worker.endif
3207*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h}, [\dst], \d_strd
3208*c0909341SAndroid Build Coastguard Worker        st1             {v6.8h, v7.8h}, [\ds2], \d_strd
3209*c0909341SAndroid Build Coastguard Worker        b.le            9f
3210*c0909341SAndroid Build Coastguard Worker        mov             v16.16b, v20.16b
3211*c0909341SAndroid Build Coastguard Worker        mov             v17.16b, v21.16b
3212*c0909341SAndroid Build Coastguard Worker        b               2b
3213*c0909341SAndroid Build Coastguard Worker9:
        // Strip finished: stop once the remaining width drops to zero.
3214*c0909341SAndroid Build Coastguard Worker        subs            \w,  \w,  #16
3215*c0909341SAndroid Build Coastguard Worker        b.le            0f
        // Undo the stride doubling, then rewind to the top of the block.
        // \dst advanced by h rows; \src by h+2 rows (the extra two come from
        // the initial load before label 2). \xmy is presumably the 64-bit
        // view of \my (the saved height) - TODO confirm against the macro
        // signature.
3216*c0909341SAndroid Build Coastguard Worker        asr             \s_strd, \s_strd, #1
3217*c0909341SAndroid Build Coastguard Worker        asr             \d_strd, \d_strd, #1
3218*c0909341SAndroid Build Coastguard Worker        msub            \src, \s_strd, \xmy, \src
3219*c0909341SAndroid Build Coastguard Worker        msub            \dst, \d_strd, \xmy, \dst
3220*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, \s_strd, lsl #1
3221*c0909341SAndroid Build Coastguard Worker        mov             \h,  \my
        // Step to the next strip: 16 samples * 2 bytes.
3222*c0909341SAndroid Build Coastguard Worker        add             \src, \src, #32
3223*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, #32
3224*c0909341SAndroid Build Coastguard Worker        b               1b
3225*c0909341SAndroid Build Coastguard Worker0:
3226*c0909341SAndroid Build Coastguard Worker        ret
3227*c0909341SAndroid Build Coastguard Workerendfunc
3228*c0909341SAndroid Build Coastguard Worker
// Dispatch table for L(\type\()_bilin_v). That function loads entry x9 as a
// sign-extended 32-bit word (ldrsw with the index scaled by 4), adds it to the
// table address and branches there, so each entry is "handler - table start".
// The `b` suffix resolves to the nearest preceding definition of each numeric
// label; entries run from the widest handler (1280) to the narrowest (20).
3229*c0909341SAndroid Build Coastguard Workerjumptable \type\()_bilin_v_tbl
3230*c0909341SAndroid Build Coastguard Worker        .word 1280b - \type\()_bilin_v_tbl
3231*c0909341SAndroid Build Coastguard Worker        .word 640b  - \type\()_bilin_v_tbl
3232*c0909341SAndroid Build Coastguard Worker        .word 320b  - \type\()_bilin_v_tbl
3233*c0909341SAndroid Build Coastguard Worker        .word 160b  - \type\()_bilin_v_tbl
3234*c0909341SAndroid Build Coastguard Worker        .word 80b   - \type\()_bilin_v_tbl
3235*c0909341SAndroid Build Coastguard Worker        .word 40b   - \type\()_bilin_v_tbl
3236*c0909341SAndroid Build Coastguard Worker        .word 20b   - \type\()_bilin_v_tbl
3237*c0909341SAndroid Build Coastguard Workerendjumptable
3238*c0909341SAndroid Build Coastguard Worker
// L(\type\()_bilin_hv): two-pass (horizontal, then vertical) bilinear filter
// on 16-bit samples.
// Horizontal pass, per row: px[x] * v0 + px[x+1] * v1, rounding shift right by
// (4 - intermediate_bits) (urshl by the negated count in v31), kept in 16 bits.
// Vertical pass: mid[y] * v2 + mid[y+1] * v3 accumulated in 32 bits, then
//   put:  rounding shift right by (4 + intermediate_bits) (urshl by v30) and
//         narrowing to 16 bits.
//   prep: rounding narrowing shift right by 4 (rshrn), then v29 (PREP_BIAS)
//         is subtracted.
// State expected on entry (set up outside this chunk, so treat as assumptions):
//   v0, v1 - horizontal weights; v2, v3 - vertical weights (presumably each
//            pair sums to 16) - TODO confirm.
//   x9     - index into \type\()_bilin_hv_tbl selecting the width handler.
//   w11    - 4 - intermediate_bits;  w12 - 4 + intermediate_bits (put only).
// Every handler walks rows in pairs: \sr2/\ds2 are set one row below
// \src/\dst and both strides are doubled.
3239*c0909341SAndroid Build Coastguard Workerfunction L(\type\()_bilin_hv)
3240*c0909341SAndroid Build Coastguard Worker        movrel          x10, \type\()_bilin_hv_tbl
3241*c0909341SAndroid Build Coastguard Worker        dup             v31.8h,  w11      // 4 - intermediate_bits
        // Fetch the sign-extended 32-bit table entry: handler offset relative
        // to the table start.
3242*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x10, x9, lsl #2]
3243*c0909341SAndroid Build Coastguard Worker        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
3244*c0909341SAndroid Build Coastguard Worker.ifc \type, put
3245*c0909341SAndroid Build Coastguard Worker        dup             v30.4s,  w12      // 4 + intermediate_bits
3246*c0909341SAndroid Build Coastguard Worker.else
        // This movi form encodes a single byte, so v29 gets PREP_BIAS with its
        // low 8 bits cleared (identical to PREP_BIAS if those bits are zero;
        // PREP_BIAS itself is defined outside this chunk).
3247*c0909341SAndroid Build Coastguard Worker        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
3248*c0909341SAndroid Build Coastguard Worker.endif
3249*c0909341SAndroid Build Coastguard Worker        add             x10, x10, x9
3250*c0909341SAndroid Build Coastguard Worker.ifc \type, put
3251*c0909341SAndroid Build Coastguard Worker        neg             v30.4s,  v30.4s   // -(4+intermediate_bits)
3252*c0909341SAndroid Build Coastguard Worker.endif
3253*c0909341SAndroid Build Coastguard Worker        br              x10
3254*c0909341SAndroid Build Coastguard Worker
        // For prep the body below is not assembled, so label 20 falls straight
        // through into the 4xN handler at label 40.
3255*c0909341SAndroid Build Coastguard Worker20:     // 2xN hv
3256*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3257*c0909341SAndroid Build Coastguard Worker.ifc \type, put
3258*c0909341SAndroid Build Coastguard Worker        add             \sr2, \src, \s_strd
3259*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd
3260*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1
3261*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
3262*c0909341SAndroid Build Coastguard Worker
        // Prime the loop: horizontally filter the first row into v16.
        // The ext rotates the row by one sample so lane i of v21 is px[i+1];
        // only the low two lanes are consumed further down.
3263*c0909341SAndroid Build Coastguard Worker        ld1             {v20.4h},  [\src], \s_strd
3264*c0909341SAndroid Build Coastguard Worker        ext             v21.8b,  v20.8b,  v20.8b,  #2
3265*c0909341SAndroid Build Coastguard Worker        mul             v16.4h,  v20.4h,  v0.4h
3266*c0909341SAndroid Build Coastguard Worker        mla             v16.4h,  v21.4h,  v1.4h
3267*c0909341SAndroid Build Coastguard Worker        urshl           v16.4h,  v16.4h,  v31.4h
3268*c0909341SAndroid Build Coastguard Worker
3269*c0909341SAndroid Build Coastguard Worker2:
3270*c0909341SAndroid Build Coastguard Worker        ld1             {v22.4h},  [\sr2], \s_strd
3271*c0909341SAndroid Build Coastguard Worker        ld1             {v24.4h},  [\src], \s_strd
3272*c0909341SAndroid Build Coastguard Worker        ext             v23.8b,  v22.8b,  v22.8b,  #2
3273*c0909341SAndroid Build Coastguard Worker        ext             v25.8b,  v24.8b,  v24.8b,  #2
        // Pack the first two samples of both new rows into one register so a
        // single mul/mla filters both: v22 = {row1, row2}, v23 = same, +1 px.
3274*c0909341SAndroid Build Coastguard Worker        trn1            v22.2s,  v22.2s,  v24.2s
3275*c0909341SAndroid Build Coastguard Worker        trn1            v23.2s,  v23.2s,  v25.2s
3276*c0909341SAndroid Build Coastguard Worker        mul             v17.4h,  v22.4h,  v0.4h
3277*c0909341SAndroid Build Coastguard Worker        mla             v17.4h,  v23.4h,  v1.4h
3278*c0909341SAndroid Build Coastguard Worker        urshl           v17.4h,  v17.4h,  v31.4h
3279*c0909341SAndroid Build Coastguard Worker
        // v16 = {mid row0, mid row1}; v17 already holds {mid row1, mid row2}.
3280*c0909341SAndroid Build Coastguard Worker        trn1            v16.2s,  v16.2s,  v17.2s
3281*c0909341SAndroid Build Coastguard Worker
        // Vertical pass with widening to 32 bits, then shift and narrow.
3282*c0909341SAndroid Build Coastguard Worker        umull           v4.4s,   v16.4h,  v2.4h
3283*c0909341SAndroid Build Coastguard Worker        umlal           v4.4s,   v17.4h,  v3.4h
3284*c0909341SAndroid Build Coastguard Worker        urshl           v4.4s,   v4.4s,   v30.4s
3285*c0909341SAndroid Build Coastguard Worker        xtn             v4.4h,   v4.4s
3286*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
3287*c0909341SAndroid Build Coastguard Worker        st1             {v4.s}[0], [\dst], \d_strd
3288*c0909341SAndroid Build Coastguard Worker        st1             {v4.s}[1], [\ds2], \d_strd
3289*c0909341SAndroid Build Coastguard Worker        b.le            0f
        // Move the most recent filtered row (upper half of v17) into the low
        // half of v16 to serve as the next iteration's top row.
3290*c0909341SAndroid Build Coastguard Worker        trn2            v16.2s,  v17.2s,  v17.2s
3291*c0909341SAndroid Build Coastguard Worker        b               2b
3292*c0909341SAndroid Build Coastguard Worker0:
3293*c0909341SAndroid Build Coastguard Worker        ret
3294*c0909341SAndroid Build Coastguard Worker.endif
3295*c0909341SAndroid Build Coastguard Worker
3296*c0909341SAndroid Build Coastguard Worker40:     // 4xN hv
3297*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3298*c0909341SAndroid Build Coastguard Worker        add             \sr2, \src, \s_strd
3299*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd
3300*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1
3301*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
3302*c0909341SAndroid Build Coastguard Worker
        // Prime the loop with the filtered first row. The load fetches 8
        // samples; the four results need samples 0..4 only.
3303*c0909341SAndroid Build Coastguard Worker        ld1             {v20.8h},  [\src], \s_strd
3304*c0909341SAndroid Build Coastguard Worker        ext             v21.16b, v20.16b, v20.16b, #2
3305*c0909341SAndroid Build Coastguard Worker        mul             v16.4h,  v20.4h,  v0.4h
3306*c0909341SAndroid Build Coastguard Worker        mla             v16.4h,  v21.4h,  v1.4h
3307*c0909341SAndroid Build Coastguard Worker        urshl           v16.4h,  v16.4h,  v31.4h
3308*c0909341SAndroid Build Coastguard Worker
3309*c0909341SAndroid Build Coastguard Worker4:
3310*c0909341SAndroid Build Coastguard Worker        ld1             {v22.8h},  [\sr2], \s_strd
3311*c0909341SAndroid Build Coastguard Worker        ld1             {v24.8h},  [\src], \s_strd
3312*c0909341SAndroid Build Coastguard Worker        ext             v23.16b, v22.16b, v22.16b, #2
3313*c0909341SAndroid Build Coastguard Worker        ext             v25.16b, v24.16b, v24.16b, #2
        // Combine the low halves of both rows: v22 = {row1, row2} and
        // v23 = the same rows shifted by one sample.
3314*c0909341SAndroid Build Coastguard Worker        trn1            v22.2d,  v22.2d,  v24.2d
3315*c0909341SAndroid Build Coastguard Worker        trn1            v23.2d,  v23.2d,  v25.2d
3316*c0909341SAndroid Build Coastguard Worker        mul             v17.8h,  v22.8h,  v0.8h
3317*c0909341SAndroid Build Coastguard Worker        mla             v17.8h,  v23.8h,  v1.8h
3318*c0909341SAndroid Build Coastguard Worker        urshl           v17.8h,  v17.8h,  v31.8h
3319*c0909341SAndroid Build Coastguard Worker
        // v16 = {mid row0, mid row1}; v17 = {mid row1, mid row2}.
3320*c0909341SAndroid Build Coastguard Worker        trn1            v16.2d,  v16.2d,  v17.2d
3321*c0909341SAndroid Build Coastguard Worker
        // v4 = first output row, v5 = second output row, in 32 bits.
3322*c0909341SAndroid Build Coastguard Worker        umull           v4.4s,   v16.4h,  v2.4h
3323*c0909341SAndroid Build Coastguard Worker        umlal           v4.4s,   v17.4h,  v3.4h
3324*c0909341SAndroid Build Coastguard Worker        umull2          v5.4s,   v16.8h,  v2.8h
3325*c0909341SAndroid Build Coastguard Worker        umlal2          v5.4s,   v17.8h,  v3.8h
3326*c0909341SAndroid Build Coastguard Worker.ifc \type, put
3327*c0909341SAndroid Build Coastguard Worker        urshl           v4.4s,   v4.4s,   v30.4s
3328*c0909341SAndroid Build Coastguard Worker        urshl           v5.4s,   v5.4s,   v30.4s
3329*c0909341SAndroid Build Coastguard Worker        uzp1            v4.8h,   v4.8h,   v5.8h  // Same as xtn, xtn2
3330*c0909341SAndroid Build Coastguard Worker.else
3331*c0909341SAndroid Build Coastguard Worker        rshrn           v4.4h,   v4.4s,   #4
3332*c0909341SAndroid Build Coastguard Worker        rshrn2          v4.8h,   v5.4s,   #4
3333*c0909341SAndroid Build Coastguard Worker        sub             v4.8h,   v4.8h,   v29.8h
3334*c0909341SAndroid Build Coastguard Worker.endif
3335*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
        // Low 8 bytes = first output row, high 8 bytes = second output row.
3336*c0909341SAndroid Build Coastguard Worker        st1             {v4.8b},   [\dst], \d_strd
3337*c0909341SAndroid Build Coastguard Worker        st1             {v4.d}[1], [\ds2], \d_strd
3338*c0909341SAndroid Build Coastguard Worker        b.le            0f
        // Upper half of v17 (latest filtered row) becomes the next top row.
3339*c0909341SAndroid Build Coastguard Worker        trn2            v16.2d,  v17.2d,  v17.2d
3340*c0909341SAndroid Build Coastguard Worker        b               4b
3341*c0909341SAndroid Build Coastguard Worker0:
3342*c0909341SAndroid Build Coastguard Worker        ret
3343*c0909341SAndroid Build Coastguard Worker
        // Blocks 8 or more samples wide are processed as 8-sample-wide column
        // strips: the inner loop (label 2) runs down the full height, then
        // label 9 rewinds the pointers and steps 8 samples (16 bytes) right.
3344*c0909341SAndroid Build Coastguard Worker80:     // 8xN, 16xN, ... hv
3345*c0909341SAndroid Build Coastguard Worker160:
3346*c0909341SAndroid Build Coastguard Worker320:
3347*c0909341SAndroid Build Coastguard Worker640:
3348*c0909341SAndroid Build Coastguard Worker1280:
3349*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // Save the height; \my is reused as scratch from here on.
3350*c0909341SAndroid Build Coastguard Worker        mov             \my, \h
3351*c0909341SAndroid Build Coastguard Worker
3352*c0909341SAndroid Build Coastguard Worker1:
3353*c0909341SAndroid Build Coastguard Worker        add             \sr2, \src, \s_strd
3354*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd
3355*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1
3356*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
3357*c0909341SAndroid Build Coastguard Worker
        // The 9th sample (byte offset 16) is loaded on its own, before the
        // post-incrementing ld1 moves \src, so that ext can build the row
        // shifted by one sample (px[1..8]) from v20 and that extra sample.
3358*c0909341SAndroid Build Coastguard Worker        ldr             h21, [\src, #16]
3359*c0909341SAndroid Build Coastguard Worker        ld1             {v20.8h},  [\src], \s_strd
3360*c0909341SAndroid Build Coastguard Worker        ext             v21.16b, v20.16b, v21.16b, #2
3361*c0909341SAndroid Build Coastguard Worker        mul             v16.8h,  v20.8h,  v0.8h
3362*c0909341SAndroid Build Coastguard Worker        mla             v16.8h,  v21.8h,  v1.8h
3363*c0909341SAndroid Build Coastguard Worker        urshl           v16.8h,  v16.8h,  v31.8h
3364*c0909341SAndroid Build Coastguard Worker
3365*c0909341SAndroid Build Coastguard Worker2:
3366*c0909341SAndroid Build Coastguard Worker        ldr             h23, [\sr2, #16]
3367*c0909341SAndroid Build Coastguard Worker        ld1             {v22.8h},  [\sr2], \s_strd
3368*c0909341SAndroid Build Coastguard Worker        ldr             h25, [\src, #16]
3369*c0909341SAndroid Build Coastguard Worker        ld1             {v24.8h},  [\src], \s_strd
3370*c0909341SAndroid Build Coastguard Worker        ext             v23.16b, v22.16b, v23.16b, #2
3371*c0909341SAndroid Build Coastguard Worker        ext             v25.16b, v24.16b, v25.16b, #2
        // Horizontal pass for the two new rows: v17 = mid row1, v18 = mid row2.
3372*c0909341SAndroid Build Coastguard Worker        mul             v17.8h,  v22.8h,  v0.8h
3373*c0909341SAndroid Build Coastguard Worker        mla             v17.8h,  v23.8h,  v1.8h
3374*c0909341SAndroid Build Coastguard Worker        mul             v18.8h,  v24.8h,  v0.8h
3375*c0909341SAndroid Build Coastguard Worker        mla             v18.8h,  v25.8h,  v1.8h
3376*c0909341SAndroid Build Coastguard Worker        urshl           v17.8h,  v17.8h,  v31.8h
3377*c0909341SAndroid Build Coastguard Worker        urshl           v18.8h,  v18.8h,  v31.8h
3378*c0909341SAndroid Build Coastguard Worker
        // Vertical pass in 32 bits: v4/v5 = first output row (mid rows 0,1),
        // v6/v7 = second output row (mid rows 1,2).
3379*c0909341SAndroid Build Coastguard Worker        umull           v4.4s,   v16.4h,  v2.4h
3380*c0909341SAndroid Build Coastguard Worker        umlal           v4.4s,   v17.4h,  v3.4h
3381*c0909341SAndroid Build Coastguard Worker        umull2          v5.4s,   v16.8h,  v2.8h
3382*c0909341SAndroid Build Coastguard Worker        umlal2          v5.4s,   v17.8h,  v3.8h
3383*c0909341SAndroid Build Coastguard Worker        umull           v6.4s,   v17.4h,  v2.4h
3384*c0909341SAndroid Build Coastguard Worker        umlal           v6.4s,   v18.4h,  v3.4h
3385*c0909341SAndroid Build Coastguard Worker        umull2          v7.4s,   v17.8h,  v2.8h
3386*c0909341SAndroid Build Coastguard Worker        umlal2          v7.4s,   v18.8h,  v3.8h
3387*c0909341SAndroid Build Coastguard Worker.ifc \type, put
3388*c0909341SAndroid Build Coastguard Worker        urshl           v4.4s,   v4.4s,   v30.4s
3389*c0909341SAndroid Build Coastguard Worker        urshl           v5.4s,   v5.4s,   v30.4s
3390*c0909341SAndroid Build Coastguard Worker        urshl           v6.4s,   v6.4s,   v30.4s
3391*c0909341SAndroid Build Coastguard Worker        urshl           v7.4s,   v7.4s,   v30.4s
3392*c0909341SAndroid Build Coastguard Worker        uzp1            v4.8h,   v4.8h,   v5.8h  // Same as xtn, xtn2
3393*c0909341SAndroid Build Coastguard Worker        uzp1            v5.8h,   v6.8h,   v7.8h  // Ditto
3394*c0909341SAndroid Build Coastguard Worker.else
3395*c0909341SAndroid Build Coastguard Worker        rshrn           v4.4h,   v4.4s,   #4
3396*c0909341SAndroid Build Coastguard Worker        rshrn2          v4.8h,   v5.4s,   #4
3397*c0909341SAndroid Build Coastguard Worker        rshrn           v5.4h,   v6.4s,   #4
3398*c0909341SAndroid Build Coastguard Worker        rshrn2          v5.8h,   v7.4s,   #4
3399*c0909341SAndroid Build Coastguard Worker        sub             v4.8h,   v4.8h,   v29.8h
3400*c0909341SAndroid Build Coastguard Worker        sub             v5.8h,   v5.8h,   v29.8h
3401*c0909341SAndroid Build Coastguard Worker.endif
3402*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
3403*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h}, [\dst], \d_strd
3404*c0909341SAndroid Build Coastguard Worker        st1             {v5.8h}, [\ds2], \d_strd
3405*c0909341SAndroid Build Coastguard Worker        b.le            9f
        // The last filtered row becomes the next iteration's top row.
3406*c0909341SAndroid Build Coastguard Worker        mov             v16.16b, v18.16b
3407*c0909341SAndroid Build Coastguard Worker        b               2b
3408*c0909341SAndroid Build Coastguard Worker9:
        // Strip finished: stop once the remaining width drops to zero.
3409*c0909341SAndroid Build Coastguard Worker        subs            \w,  \w,  #8
3410*c0909341SAndroid Build Coastguard Worker        b.le            0f
        // Undo the stride doubling, then rewind to the top of the block.
        // \dst advanced by h rows; \src by h+2 rows (the extra two come from
        // the initial load before label 2). \xmy is presumably the 64-bit
        // view of \my (the saved height) - TODO confirm against the macro
        // signature.
3411*c0909341SAndroid Build Coastguard Worker        asr             \s_strd,  \s_strd,  #1
3412*c0909341SAndroid Build Coastguard Worker        asr             \d_strd,  \d_strd,  #1
3413*c0909341SAndroid Build Coastguard Worker        msub            \src,  \s_strd,  \xmy,  \src
3414*c0909341SAndroid Build Coastguard Worker        msub            \dst,  \d_strd,  \xmy,  \dst
3415*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  \s_strd,  lsl #1
3416*c0909341SAndroid Build Coastguard Worker        mov             \h,  \my
        // Step to the next strip: 8 samples * 2 bytes.
3417*c0909341SAndroid Build Coastguard Worker        add             \src,  \src,  #16
3418*c0909341SAndroid Build Coastguard Worker        add             \dst,  \dst,  #16
3419*c0909341SAndroid Build Coastguard Worker        b               1b
3420*c0909341SAndroid Build Coastguard Worker0:
3421*c0909341SAndroid Build Coastguard Worker        ret
3422*c0909341SAndroid Build Coastguard Workerendfunc
3423*c0909341SAndroid Build Coastguard Worker
3424*c0909341SAndroid Build Coastguard Workerjumptable \type\()_bilin_hv_tbl
3425*c0909341SAndroid Build Coastguard Worker        .word 1280b - \type\()_bilin_hv_tbl
3426*c0909341SAndroid Build Coastguard Worker        .word 640b  - \type\()_bilin_hv_tbl
3427*c0909341SAndroid Build Coastguard Worker        .word 320b  - \type\()_bilin_hv_tbl
3428*c0909341SAndroid Build Coastguard Worker        .word 160b  - \type\()_bilin_hv_tbl
3429*c0909341SAndroid Build Coastguard Worker        .word 80b   - \type\()_bilin_hv_tbl
3430*c0909341SAndroid Build Coastguard Worker        .word 40b   - \type\()_bilin_hv_tbl
3431*c0909341SAndroid Build Coastguard Worker        .word 20b   - \type\()_bilin_hv_tbl
3432*c0909341SAndroid Build Coastguard Workerendjumptable
3433*c0909341SAndroid Build Coastguard Worker.endm
3434*c0909341SAndroid Build Coastguard Worker
// Instantiate the 16 bpc "put" subpel filter entry points.
// Every filter pairing below that involves SHARP is generated with the 8tap
// variant; pairings made only of REGULAR/SMOOTH use the 6tap variant.
// NOTE(review): make_8tap_fn, filter_fn and filter_bilin_fn are defined
// outside this excerpt; the trailing register list presumably binds the
// macro's dst/stride/src/w/h/mx/my/... parameters in order - confirm against
// the macro definitions before changing any register here.
3435*c0909341SAndroid Build Coastguard Workermake_8tap_fn    put,  regular_sharp,  REGULAR, SHARP,   8tap
3436*c0909341SAndroid Build Coastguard Workermake_8tap_fn    put,  smooth_sharp,   SMOOTH,  SHARP,   8tap
3437*c0909341SAndroid Build Coastguard Workermake_8tap_fn    put,  sharp,          SHARP,   SHARP,   8tap
3438*c0909341SAndroid Build Coastguard Workermake_8tap_fn    put,  sharp_regular,  SHARP,   REGULAR, 8tap
3439*c0909341SAndroid Build Coastguard Workermake_8tap_fn    put,  sharp_smooth,   SHARP,   SMOOTH,  8tap
3440*c0909341SAndroid Build Coastguard Workerfilter_fn       put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 8tap
3441*c0909341SAndroid Build Coastguard Worker
3442*c0909341SAndroid Build Coastguard Workermake_8tap_fn    put,  regular,        REGULAR, REGULAR, 6tap
3443*c0909341SAndroid Build Coastguard Workermake_8tap_fn    put,  regular_smooth, REGULAR, SMOOTH,  6tap
3444*c0909341SAndroid Build Coastguard Workermake_8tap_fn    put,  smooth,         SMOOTH,  SMOOTH,  6tap
3445*c0909341SAndroid Build Coastguard Workermake_8tap_fn    put,  smooth_regular, SMOOTH,  REGULAR, 6tap
3446*c0909341SAndroid Build Coastguard Workerfilter_fn       put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 6tap
// Bilinear "put" functions use the same register binding as the 8tap/6tap
// "put" bodies above.
3447*c0909341SAndroid Build Coastguard Workerfilter_bilin_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
3448*c0909341SAndroid Build Coastguard Worker
// Same set of instantiations for "prep". Relative to "put", the second slot
// of the register list is x8 and the remaining argument registers are shifted
// down by one (x1, x2, w3, ...).
// NOTE(review): presumably because prep takes no dst stride argument and x8
// is used as a locally computed stride - confirm against the macro body.
3449*c0909341SAndroid Build Coastguard Workermake_8tap_fn    prep,  regular_sharp,  REGULAR, SHARP,   8tap
3450*c0909341SAndroid Build Coastguard Workermake_8tap_fn    prep,  smooth_sharp,   SMOOTH,  SHARP,   8tap
3451*c0909341SAndroid Build Coastguard Workermake_8tap_fn    prep,  sharp,          SHARP,   SHARP,   8tap
3452*c0909341SAndroid Build Coastguard Workermake_8tap_fn    prep,  sharp_regular,  SHARP,   REGULAR, 8tap
3453*c0909341SAndroid Build Coastguard Workermake_8tap_fn    prep,  sharp_smooth,   SHARP,   SMOOTH,  8tap
3454*c0909341SAndroid Build Coastguard Workerfilter_fn       prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 8tap
3455*c0909341SAndroid Build Coastguard Worker
3456*c0909341SAndroid Build Coastguard Workermake_8tap_fn    prep,  regular,        REGULAR, REGULAR, 6tap
3457*c0909341SAndroid Build Coastguard Workermake_8tap_fn    prep,  regular_smooth, REGULAR, SMOOTH,  6tap
3458*c0909341SAndroid Build Coastguard Workermake_8tap_fn    prep,  smooth,         SMOOTH,  SMOOTH,  6tap
3459*c0909341SAndroid Build Coastguard Workermake_8tap_fn    prep,  smooth_regular, SMOOTH,  REGULAR, 6tap
3460*c0909341SAndroid Build Coastguard Workerfilter_fn       prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 6tap
3461*c0909341SAndroid Build Coastguard Workerfilter_bilin_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
3462*c0909341SAndroid Build Coastguard Worker
3463*c0909341SAndroid Build Coastguard Worker
// load_filter_row dst, src, inc
// Loads one 8-byte filter row and steps the filter position.
//   dst: 64-bit destination register (used with d0-d7 in this file).
//   src: 32-bit W register holding the filter position; the row index is
//        src >> 10 (arithmetic shift, so it may be negative).
//   inc: 32-bit W register added to src after the index has been taken.
// Expects x11 to hold the base address of a table of 8-byte rows.
// Clobbers w13. Does not modify the condition flags.
3464*c0909341SAndroid Build Coastguard Worker.macro load_filter_row dst, src, inc
// Row index is computed from the pre-increment value of src.
3465*c0909341SAndroid Build Coastguard Worker        asr             w13, \src, #10
3466*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \inc
// Address = x11 + sign_extend(w13) * 8.
3467*c0909341SAndroid Build Coastguard Worker        ldr             \dst, [x11, w13, sxtw #3]
3468*c0909341SAndroid Build Coastguard Worker.endm
3469*c0909341SAndroid Build Coastguard Worker
// warp_filter_horz_neon
// Internal helper (not exported): horizontally filters one row of 8 output
// pixels for the warp functions, using a separate 8-tap filter per pixel.
//
// In:
//   x2      source row pointer; 16 pixels (32 bytes) are read, then x2 is
//           advanced by x3.
//   x3      source stride in bytes.
//   w5      horizontal filter position for this row; advanced by w8 on exit.
//   w7      per-pixel step added to the filter position between columns.
//   w8      per-row step added to w5.
//   x11     filter table base used by load_filter_row.
//   v14.4s  shift amount for srshl, -(7 - intermediate_bits).
// Out:
//   v16.4s  filtered and shifted sums for output pixels 0-3.
//   v17.4s  filtered and shifted sums for output pixels 4-7.
// Clobbers: w12, w13, v0-v11, v16-v23. v8-v11 are callee-saved in AAPCS64;
// the callers in this file save d8-d15 before calling.
3470*c0909341SAndroid Build Coastguard Workerfunction warp_filter_horz_neon
// Working copy of the filter position, biased by 512 before the >> 10.
3471*c0909341SAndroid Build Coastguard Worker        add             w12, w5,  #512
3472*c0909341SAndroid Build Coastguard Worker
3473*c0909341SAndroid Build Coastguard Worker        ld1             {v16.8h, v17.8h}, [x2], x3
3474*c0909341SAndroid Build Coastguard Worker
// Load the eight per-pixel filter rows (8 signed bytes each) into d0-d7,
// interleaved with the sign extensions to 16 bit.
3475*c0909341SAndroid Build Coastguard Worker        load_filter_row d0, w12, w7
3476*c0909341SAndroid Build Coastguard Worker        load_filter_row d1, w12, w7
3477*c0909341SAndroid Build Coastguard Worker        load_filter_row d2, w12, w7
3478*c0909341SAndroid Build Coastguard Worker        sxtl            v0.8h,   v0.8b
3479*c0909341SAndroid Build Coastguard Worker        load_filter_row d3, w12, w7
3480*c0909341SAndroid Build Coastguard Worker        sxtl            v1.8h,   v1.8b
3481*c0909341SAndroid Build Coastguard Worker        load_filter_row d4, w12, w7
3482*c0909341SAndroid Build Coastguard Worker        sxtl            v2.8h,   v2.8b
3483*c0909341SAndroid Build Coastguard Worker        load_filter_row d5, w12, w7
3484*c0909341SAndroid Build Coastguard Worker        sxtl            v3.8h,   v3.8b
3485*c0909341SAndroid Build Coastguard Worker        load_filter_row d6, w12, w7
3486*c0909341SAndroid Build Coastguard Worker        sxtl            v4.8h,   v4.8b
3487*c0909341SAndroid Build Coastguard Worker        load_filter_row d7, w12, w7
3488*c0909341SAndroid Build Coastguard Worker        sxtl            v5.8h,   v5.8b
// For output pixel i, ext by 2*i bytes selects source pixels i..i+7, which
// are multiplied lane-wise by filter row i into two .4s product vectors.
// Destination registers are reused as soon as their previous contents have
// been consumed (e.g. v0/v1 hold products once filter rows 0/1 are used).
3489*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v16.16b, v17.16b, #2*1
3490*c0909341SAndroid Build Coastguard Worker        smull           v8.4s,   v16.4h,  v0.4h
3491*c0909341SAndroid Build Coastguard Worker        smull2          v9.4s,   v16.8h,  v0.8h
3492*c0909341SAndroid Build Coastguard Worker        sxtl            v6.8h,   v6.8b
3493*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v16.16b, v17.16b, #2*2
3494*c0909341SAndroid Build Coastguard Worker        smull           v10.4s,  v18.4h,  v1.4h
3495*c0909341SAndroid Build Coastguard Worker        smull2          v11.4s,  v18.8h,  v1.8h
3496*c0909341SAndroid Build Coastguard Worker        sxtl            v7.8h,   v7.8b
3497*c0909341SAndroid Build Coastguard Worker        ext             v20.16b, v16.16b, v17.16b, #2*3
3498*c0909341SAndroid Build Coastguard Worker        smull           v0.4s,   v19.4h,  v2.4h
3499*c0909341SAndroid Build Coastguard Worker        smull2          v1.4s,   v19.8h,  v2.8h
3500*c0909341SAndroid Build Coastguard Worker        ext             v21.16b, v16.16b, v17.16b, #2*4
// First reduction level: each addp folds the 8 products of one pixel into
// 4 partial sums.
3501*c0909341SAndroid Build Coastguard Worker        addp            v8.4s,   v8.4s,   v9.4s
3502*c0909341SAndroid Build Coastguard Worker        smull           v2.4s,   v20.4h,  v3.4h
3503*c0909341SAndroid Build Coastguard Worker        smull2          v3.4s,   v20.8h,  v3.8h
3504*c0909341SAndroid Build Coastguard Worker        ext             v22.16b, v16.16b, v17.16b, #2*5
3505*c0909341SAndroid Build Coastguard Worker        addp            v9.4s,   v10.4s,  v11.4s
3506*c0909341SAndroid Build Coastguard Worker        smull           v10.4s,  v21.4h,  v4.4h
3507*c0909341SAndroid Build Coastguard Worker        smull2          v11.4s,  v21.8h,  v4.8h
3508*c0909341SAndroid Build Coastguard Worker        ext             v23.16b, v16.16b, v17.16b, #2*6
3509*c0909341SAndroid Build Coastguard Worker        addp            v0.4s,   v0.4s,   v1.4s
3510*c0909341SAndroid Build Coastguard Worker        smull           v18.4s,  v22.4h,  v5.4h
3511*c0909341SAndroid Build Coastguard Worker        smull2          v19.4s,  v22.8h,  v5.8h
// v16 is overwritten with its own shifted version here; the unshifted row
// is no longer needed after this point.
3512*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v16.16b, v17.16b, #2*7
3513*c0909341SAndroid Build Coastguard Worker        addp            v1.4s,   v2.4s,   v3.4s
3514*c0909341SAndroid Build Coastguard Worker        addp            v2.4s,   v10.4s,  v11.4s
3515*c0909341SAndroid Build Coastguard Worker        smull           v20.4s,  v23.4h,  v6.4h
3516*c0909341SAndroid Build Coastguard Worker        smull2          v21.4s,  v23.8h,  v6.8h
3517*c0909341SAndroid Build Coastguard Worker        addp            v3.4s,   v18.4s,  v19.4s
3518*c0909341SAndroid Build Coastguard Worker        smull           v22.4s,  v16.4h,  v7.4h
3519*c0909341SAndroid Build Coastguard Worker        smull2          v23.4s,  v16.8h,  v7.8h
3520*c0909341SAndroid Build Coastguard Worker        addp            v4.4s,   v20.4s,  v21.4s
3521*c0909341SAndroid Build Coastguard Worker        addp            v5.4s,   v22.4s,  v23.4s
3522*c0909341SAndroid Build Coastguard Worker
// Second reduction level: two pixels per register, two partial sums each.
3523*c0909341SAndroid Build Coastguard Worker        addp            v8.4s,   v8.4s,   v9.4s
3524*c0909341SAndroid Build Coastguard Worker        addp            v0.4s,   v0.4s,   v1.4s
3525*c0909341SAndroid Build Coastguard Worker        addp            v2.4s,   v2.4s,   v3.4s
3526*c0909341SAndroid Build Coastguard Worker        addp            v4.4s,   v4.4s,   v5.4s
3527*c0909341SAndroid Build Coastguard Worker
// Final reduction level: one complete sum per lane, pixels 0-3 and 4-7.
3528*c0909341SAndroid Build Coastguard Worker        addp            v16.4s,  v8.4s,   v0.4s
3529*c0909341SAndroid Build Coastguard Worker        addp            v17.4s,  v2.4s,   v4.4s
3530*c0909341SAndroid Build Coastguard Worker
// Step the row's starting filter position for the next call.
3531*c0909341SAndroid Build Coastguard Worker        add             w5,  w5,  w8
3532*c0909341SAndroid Build Coastguard Worker
// srshl with a negative shift count performs a rounding right shift.
3533*c0909341SAndroid Build Coastguard Worker        srshl           v16.4s,  v16.4s,  v14.4s // -(7 - intermediate_bits)
3534*c0909341SAndroid Build Coastguard Worker        srshl           v17.4s,  v17.4s,  v14.4s // -(7 - intermediate_bits)
3535*c0909341SAndroid Build Coastguard Worker
3536*c0909341SAndroid Build Coastguard Worker        ret
3537*c0909341SAndroid Build Coastguard Workerendfunc
3538*c0909341SAndroid Build Coastguard Worker
3539*c0909341SAndroid Build Coastguard Worker// void dav1d_warp_affine_8x8_16bpc_neon(
3540*c0909341SAndroid Build Coastguard Worker//         pixel *dst, const ptrdiff_t dst_stride,
3541*c0909341SAndroid Build Coastguard Worker//         const pixel *src, const ptrdiff_t src_stride,
3542*c0909341SAndroid Build Coastguard Worker//         const int16_t *const abcd, int mx, int my,
3543*c0909341SAndroid Build Coastguard Worker//         const int bitdepth_max)
//
// .macro warp t
// Emits warp_affine_8x8\t\()_16bpc_neon. It is instantiated twice below:
//   t empty: output is shifted by 7 + intermediate_bits, saturated to
//            unsigned 16 bit and clamped to bitdepth_max.
//   t = "t": output is shifted by 7, has PREP_BIAS subtracted and is stored
//            as 16-bit values; the dst stride in x1 is doubled on entry.
//
// Register use inside the function, after setup:
//   x0/x1   dst pointer / dst stride     x2/x3   src pointer / src stride
//   w5      horizontal filter position   w6      vertical filter position
//   w7,w8,w9,w4  the four sign-extended 16-bit fields of the 64-bit word
//           loaded from abcd, lowest bits first. w7 and w8 are the column and
//           row steps of the horizontal position; w9 and w4 are the column
//           and row steps of the vertical position.
//   w10     remaining output rows (8)
//   x11     filter table base for load_filter_row
//   x15     saved return address, since bl overwrites x30
//   v13     -(7 + intermediate_bits), only when t is empty
//   v14     -(7 - intermediate_bits)
//   v15     bitdepth_max (t empty) or the PREP_BIAS constant (t set)
//   v24-v31 sliding window of 8 horizontally filtered rows (16 bit each)
3544*c0909341SAndroid Build Coastguard Worker.macro warp t
3545*c0909341SAndroid Build Coastguard Workerfunction warp_affine_8x8\t\()_16bpc_neon, export=1
// Save d8-d15: v8-v11 are clobbered by warp_filter_horz_neon and v13-v15
// hold constants here.
3546*c0909341SAndroid Build Coastguard Worker        stp             d8,  d9,  [sp, #-0x40]!
3547*c0909341SAndroid Build Coastguard Worker        stp             d10, d11, [sp, #0x10]
3548*c0909341SAndroid Build Coastguard Worker        stp             d12, d13, [sp, #0x20]
3549*c0909341SAndroid Build Coastguard Worker        stp             d14, d15, [sp, #0x30]
3550*c0909341SAndroid Build Coastguard Worker
3551*c0909341SAndroid Build Coastguard Worker.ifb \t
3552*c0909341SAndroid Build Coastguard Worker        dup             v15.8h,  w7        // bitdepth_max
3553*c0909341SAndroid Build Coastguard Worker.else
// This movi form can only encode a byte shifted left by 8, so v15 gets
// PREP_BIAS with its low 8 bits cleared.
3554*c0909341SAndroid Build Coastguard Worker        movi            v15.8h,  #(PREP_BIAS >> 8), lsl #8
3555*c0909341SAndroid Build Coastguard Worker.endif
// w7 (bitdepth_max) is consumed in both variants to derive the shifts.
3556*c0909341SAndroid Build Coastguard Worker        clz             w7,  w7
3557*c0909341SAndroid Build Coastguard Worker                                           // intermediate_bits = clz(bitdepth_max) - 18
3558*c0909341SAndroid Build Coastguard Worker.ifb \t
3559*c0909341SAndroid Build Coastguard Worker        sub             w8,  w7,  #11      // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
3560*c0909341SAndroid Build Coastguard Worker.endif
3561*c0909341SAndroid Build Coastguard Worker        sub             w7,  w7,  #25      // -(7 - intermediate_bits)
3562*c0909341SAndroid Build Coastguard Worker.ifb \t
3563*c0909341SAndroid Build Coastguard Worker        neg             w8,  w8            // -(7 + intermediate_bits)
3564*c0909341SAndroid Build Coastguard Worker.endif
3565*c0909341SAndroid Build Coastguard Worker        dup             v14.4s,  w7        // -(7 - intermediate_bits)
3566*c0909341SAndroid Build Coastguard Worker.ifb \t
3567*c0909341SAndroid Build Coastguard Worker        dup             v13.4s,  w8        // -(7 + intermediate_bits)
3568*c0909341SAndroid Build Coastguard Worker.endif
3569*c0909341SAndroid Build Coastguard Worker
// Load all four int16_t abcd values with one 64-bit load and sign-extend
// each 16-bit field into its own register. w7/w8 are free for reuse here
// because the shift amounts have already been copied to v14/v13.
3570*c0909341SAndroid Build Coastguard Worker        ldr             x4,  [x4]
3571*c0909341SAndroid Build Coastguard Worker        sbfx            x7,  x4, #0,  #16
3572*c0909341SAndroid Build Coastguard Worker        sbfx            x8,  x4, #16, #16
3573*c0909341SAndroid Build Coastguard Worker        sbfx            x9,  x4, #32, #16
3574*c0909341SAndroid Build Coastguard Worker        sbfx            x4,  x4, #48, #16
// Row counter: 8 output rows.
3575*c0909341SAndroid Build Coastguard Worker        mov             w10, #8
// Move src up by 3 rows and left by 6 bytes (3 pixels of 2 bytes each).
3576*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  x3, lsl #1
3577*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  x3
3578*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #6
// NOTE(review): movrel is defined outside this excerpt; presumably x11 =
// address of mc_warp_filter + 64*8, i.e. 64 eight-byte rows into the table,
// so that the signed row index used by load_filter_row may be negative.
3579*c0909341SAndroid Build Coastguard Worker        movrel          x11, X(mc_warp_filter), 64*8
// Keep the return address in x15; the bl calls below overwrite x30.
3580*c0909341SAndroid Build Coastguard Worker        mov             x15, x30
3581*c0909341SAndroid Build Coastguard Worker.ifnb \t
// NOTE(review): the stride is doubled for the t variant, presumably because
// it is passed in int16_t units rather than bytes - confirm with the caller.
3582*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
3583*c0909341SAndroid Build Coastguard Worker.endif
3584*c0909341SAndroid Build Coastguard Worker
// Prime the window with the first 7 horizontally filtered rows. uzp1 on .8h
// keeps the low 16 bits of every 32-bit result from v16/v17.
3585*c0909341SAndroid Build Coastguard Worker        bl              warp_filter_horz_neon
3586*c0909341SAndroid Build Coastguard Worker        uzp1            v24.8h,  v16.8h,  v17.8h // Same as xtn, xtn2
3587*c0909341SAndroid Build Coastguard Worker        bl              warp_filter_horz_neon
3588*c0909341SAndroid Build Coastguard Worker        uzp1            v25.8h,  v16.8h,  v17.8h // Ditto
3589*c0909341SAndroid Build Coastguard Worker        bl              warp_filter_horz_neon
3590*c0909341SAndroid Build Coastguard Worker        uzp1            v26.8h,  v16.8h,  v17.8h // Ditto
3591*c0909341SAndroid Build Coastguard Worker        bl              warp_filter_horz_neon
3592*c0909341SAndroid Build Coastguard Worker        uzp1            v27.8h,  v16.8h,  v17.8h // Ditto
3593*c0909341SAndroid Build Coastguard Worker        bl              warp_filter_horz_neon
3594*c0909341SAndroid Build Coastguard Worker        uzp1            v28.8h,  v16.8h,  v17.8h // Ditto
3595*c0909341SAndroid Build Coastguard Worker        bl              warp_filter_horz_neon
3596*c0909341SAndroid Build Coastguard Worker        uzp1            v29.8h,  v16.8h,  v17.8h // Ditto
3597*c0909341SAndroid Build Coastguard Worker        bl              warp_filter_horz_neon
3598*c0909341SAndroid Build Coastguard Worker        uzp1            v30.8h,  v16.8h,  v17.8h // Ditto
3599*c0909341SAndroid Build Coastguard Worker
// Per output row: filter one more source row into v31, apply the vertical
// 8-tap filter over v24-v31, store 8 pixels, then slide the window by one.
3600*c0909341SAndroid Build Coastguard Worker1:
// Working copy of the vertical filter position, biased by 512.
3601*c0909341SAndroid Build Coastguard Worker        add             w14, w6,  #512
3602*c0909341SAndroid Build Coastguard Worker        bl              warp_filter_horz_neon
3603*c0909341SAndroid Build Coastguard Worker        uzp1            v31.8h,  v16.8h,  v17.8h // Same as xtn, xtn2
3604*c0909341SAndroid Build Coastguard Worker
// One vertical filter row per output column, stepping the position by w9.
3605*c0909341SAndroid Build Coastguard Worker        load_filter_row d0, w14, w9
3606*c0909341SAndroid Build Coastguard Worker        load_filter_row d1, w14, w9
3607*c0909341SAndroid Build Coastguard Worker        load_filter_row d2, w14, w9
3608*c0909341SAndroid Build Coastguard Worker        load_filter_row d3, w14, w9
3609*c0909341SAndroid Build Coastguard Worker        load_filter_row d4, w14, w9
3610*c0909341SAndroid Build Coastguard Worker        load_filter_row d5, w14, w9
3611*c0909341SAndroid Build Coastguard Worker        load_filter_row d6, w14, w9
3612*c0909341SAndroid Build Coastguard Worker        load_filter_row d7, w14, w9
// NOTE(review): transpose_8x8b_xtl is defined outside this excerpt;
// presumably it transposes the 8x8 byte matrix and sign-extends (sxtl) to
// 16 bit, so that vK holds tap K for all 8 columns - confirm.
3613*c0909341SAndroid Build Coastguard Worker        transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl
3614*c0909341SAndroid Build Coastguard Worker
3615*c0909341SAndroid Build Coastguard Worker        // This ordering of smull/smlal/smull2/smlal2 is highly
3616*c0909341SAndroid Build Coastguard Worker        // beneficial for Cortex A53 here.
3617*c0909341SAndroid Build Coastguard Worker        smull           v16.4s,  v24.4h,  v0.4h
3618*c0909341SAndroid Build Coastguard Worker        smlal           v16.4s,  v25.4h,  v1.4h
3619*c0909341SAndroid Build Coastguard Worker        smlal           v16.4s,  v26.4h,  v2.4h
3620*c0909341SAndroid Build Coastguard Worker        smlal           v16.4s,  v27.4h,  v3.4h
3621*c0909341SAndroid Build Coastguard Worker        smlal           v16.4s,  v28.4h,  v4.4h
3622*c0909341SAndroid Build Coastguard Worker        smlal           v16.4s,  v29.4h,  v5.4h
3623*c0909341SAndroid Build Coastguard Worker        smlal           v16.4s,  v30.4h,  v6.4h
3624*c0909341SAndroid Build Coastguard Worker        smlal           v16.4s,  v31.4h,  v7.4h
3625*c0909341SAndroid Build Coastguard Worker        smull2          v17.4s,  v24.8h,  v0.8h
3626*c0909341SAndroid Build Coastguard Worker        smlal2          v17.4s,  v25.8h,  v1.8h
3627*c0909341SAndroid Build Coastguard Worker        smlal2          v17.4s,  v26.8h,  v2.8h
3628*c0909341SAndroid Build Coastguard Worker        smlal2          v17.4s,  v27.8h,  v3.8h
3629*c0909341SAndroid Build Coastguard Worker        smlal2          v17.4s,  v28.8h,  v4.8h
3630*c0909341SAndroid Build Coastguard Worker        smlal2          v17.4s,  v29.8h,  v5.8h
3631*c0909341SAndroid Build Coastguard Worker        smlal2          v17.4s,  v30.8h,  v6.8h
3632*c0909341SAndroid Build Coastguard Worker        smlal2          v17.4s,  v31.8h,  v7.8h
3633*c0909341SAndroid Build Coastguard Worker
// The window shift (v24 <- v25 <- ... <- v31) is interleaved with the output
// processing below.
3634*c0909341SAndroid Build Coastguard Worker        mov             v24.16b, v25.16b
3635*c0909341SAndroid Build Coastguard Worker        mov             v25.16b, v26.16b
3636*c0909341SAndroid Build Coastguard Worker.ifb \t
3637*c0909341SAndroid Build Coastguard Worker        srshl           v16.4s,  v16.4s,  v13.4s // -(7 + intermediate_bits)
3638*c0909341SAndroid Build Coastguard Worker        srshl           v17.4s,  v17.4s,  v13.4s // -(7 + intermediate_bits)
3639*c0909341SAndroid Build Coastguard Worker.else
3640*c0909341SAndroid Build Coastguard Worker        rshrn           v16.4h,  v16.4s,  #7
3641*c0909341SAndroid Build Coastguard Worker        rshrn2          v16.8h,  v17.4s,  #7
3642*c0909341SAndroid Build Coastguard Worker.endif
3643*c0909341SAndroid Build Coastguard Worker        mov             v26.16b, v27.16b
3644*c0909341SAndroid Build Coastguard Worker.ifb \t
// Saturating narrow to unsigned 16 bit; negative sums become 0.
3645*c0909341SAndroid Build Coastguard Worker        sqxtun          v16.4h,  v16.4s
3646*c0909341SAndroid Build Coastguard Worker        sqxtun2         v16.8h,  v17.4s
3647*c0909341SAndroid Build Coastguard Worker.else
3648*c0909341SAndroid Build Coastguard Worker        sub             v16.8h,  v16.8h,  v15.8h // PREP_BIAS
3649*c0909341SAndroid Build Coastguard Worker.endif
3650*c0909341SAndroid Build Coastguard Worker        mov             v27.16b, v28.16b
3651*c0909341SAndroid Build Coastguard Worker        mov             v28.16b, v29.16b
3652*c0909341SAndroid Build Coastguard Worker.ifb \t
3653*c0909341SAndroid Build Coastguard Worker        umin            v16.8h,  v16.8h,  v15.8h // bitdepth_max
3654*c0909341SAndroid Build Coastguard Worker.endif
3655*c0909341SAndroid Build Coastguard Worker        mov             v29.16b, v30.16b
3656*c0909341SAndroid Build Coastguard Worker        mov             v30.16b, v31.16b
// Flags set here are consumed by b.gt below; st1 and add leave them intact.
3657*c0909341SAndroid Build Coastguard Worker        subs            w10, w10, #1
3658*c0909341SAndroid Build Coastguard Worker        st1             {v16.8h}, [x0], x1
3659*c0909341SAndroid Build Coastguard Worker
// Step the vertical filter position for the next output row.
3660*c0909341SAndroid Build Coastguard Worker        add             w6,  w6,  w4
3661*c0909341SAndroid Build Coastguard Worker        b.gt            1b
3662*c0909341SAndroid Build Coastguard Worker
3663*c0909341SAndroid Build Coastguard Worker        ldp             d14, d15, [sp, #0x30]
3664*c0909341SAndroid Build Coastguard Worker        ldp             d12, d13, [sp, #0x20]
3665*c0909341SAndroid Build Coastguard Worker        ldp             d10, d11, [sp, #0x10]
3666*c0909341SAndroid Build Coastguard Worker        ldp             d8,  d9,  [sp], 0x40
3667*c0909341SAndroid Build Coastguard Worker
// Return through the address saved in x15.
3668*c0909341SAndroid Build Coastguard Worker        ret             x15
3669*c0909341SAndroid Build Coastguard Workerendfunc
3670*c0909341SAndroid Build Coastguard Worker.endm
3671*c0909341SAndroid Build Coastguard Worker
// Emit warp_affine_8x8_16bpc_neon and warp_affine_8x8t_16bpc_neon.
3672*c0909341SAndroid Build Coastguard Workerwarp
3673*c0909341SAndroid Build Coastguard Workerwarp t
3674*c0909341SAndroid Build Coastguard Worker
3675*c0909341SAndroid Build Coastguard Worker// void dav1d_emu_edge_16bpc_neon(
3676*c0909341SAndroid Build Coastguard Worker//         const intptr_t bw, const intptr_t bh,
3677*c0909341SAndroid Build Coastguard Worker//         const intptr_t iw, const intptr_t ih,
3678*c0909341SAndroid Build Coastguard Worker//         const intptr_t x, const intptr_t y,
3679*c0909341SAndroid Build Coastguard Worker//         pixel *dst, const ptrdiff_t dst_stride,
3680*c0909341SAndroid Build Coastguard Worker//         const pixel *ref, const ptrdiff_t ref_stride)
3681*c0909341SAndroid Build Coastguard Workerfunction emu_edge_16bpc_neon, export=1
3682*c0909341SAndroid Build Coastguard Worker        ldp             x8,  x9,  [sp]
3683*c0909341SAndroid Build Coastguard Worker
3684*c0909341SAndroid Build Coastguard Worker        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
3685*c0909341SAndroid Build Coastguard Worker        // ref += iclip(x, 0, iw - 1)
3686*c0909341SAndroid Build Coastguard Worker        sub             x12, x3,  #1           // ih - 1
3687*c0909341SAndroid Build Coastguard Worker        cmp             x5,  x3
3688*c0909341SAndroid Build Coastguard Worker        sub             x13, x2,  #1           // iw - 1
3689*c0909341SAndroid Build Coastguard Worker        csel            x12, x12, x5,  ge      // min(y, ih - 1)
3690*c0909341SAndroid Build Coastguard Worker        cmp             x4,  x2
3691*c0909341SAndroid Build Coastguard Worker        bic             x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
3692*c0909341SAndroid Build Coastguard Worker        csel            x13, x13, x4,  ge      // min(x, iw - 1)
3693*c0909341SAndroid Build Coastguard Worker        bic             x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
3694*c0909341SAndroid Build Coastguard Worker        madd            x8,  x12, x9,  x8      // ref += iclip() * stride
3695*c0909341SAndroid Build Coastguard Worker        add             x8,  x8,  x13, lsl #1  // ref += iclip()
3696*c0909341SAndroid Build Coastguard Worker
3697*c0909341SAndroid Build Coastguard Worker        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
3698*c0909341SAndroid Build Coastguard Worker        // top_ext = iclip(-y, 0, bh - 1)
3699*c0909341SAndroid Build Coastguard Worker        add             x10, x5,  x1           // y + bh
3700*c0909341SAndroid Build Coastguard Worker        neg             x5,  x5                // -y
3701*c0909341SAndroid Build Coastguard Worker        sub             x10, x10, x3           // y + bh - ih
3702*c0909341SAndroid Build Coastguard Worker        sub             x12, x1,  #1           // bh - 1
3703*c0909341SAndroid Build Coastguard Worker        cmp             x10, x1
3704*c0909341SAndroid Build Coastguard Worker        bic             x5,  x5,  x5,  asr #63 // max(-y, 0)
3705*c0909341SAndroid Build Coastguard Worker        csel            x10, x10, x12, lt      // min(y + bh - ih, bh-1)
3706*c0909341SAndroid Build Coastguard Worker        cmp             x5,  x1
3707*c0909341SAndroid Build Coastguard Worker        bic             x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
3708*c0909341SAndroid Build Coastguard Worker        csel            x5,  x5,  x12, lt      // min(max(-y, 0), bh-1)
3709*c0909341SAndroid Build Coastguard Worker
3710*c0909341SAndroid Build Coastguard Worker        // right_ext = iclip(x + bw - iw, 0, bw - 1)
3711*c0909341SAndroid Build Coastguard Worker        // left_ext = iclip(-x, 0, bw - 1)
3712*c0909341SAndroid Build Coastguard Worker        add             x11, x4,  x0           // x + bw
3713*c0909341SAndroid Build Coastguard Worker        neg             x4,  x4                // -x
3714*c0909341SAndroid Build Coastguard Worker        sub             x11, x11, x2           // x + bw - iw
3715*c0909341SAndroid Build Coastguard Worker        sub             x13, x0,  #1           // bw - 1
3716*c0909341SAndroid Build Coastguard Worker        cmp             x11, x0
3717*c0909341SAndroid Build Coastguard Worker        bic             x4,  x4,  x4,  asr #63 // max(-x, 0)
3718*c0909341SAndroid Build Coastguard Worker        csel            x11, x11, x13, lt      // min(x + bw - iw, bw-1)
3719*c0909341SAndroid Build Coastguard Worker        cmp             x4,  x0
3720*c0909341SAndroid Build Coastguard Worker        bic             x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
3721*c0909341SAndroid Build Coastguard Worker        csel            x4,  x4,  x13, lt      // min(max(-x, 0), bw - 1)
3722*c0909341SAndroid Build Coastguard Worker
3723*c0909341SAndroid Build Coastguard Worker        // center_h = bh - top_ext - bottom_ext
3724*c0909341SAndroid Build Coastguard Worker        // dst += top_ext * PXSTRIDE(dst_stride)
3725*c0909341SAndroid Build Coastguard Worker        // center_w = bw - left_ext - right_ext
3726*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  x5           // bh - top_ext
3727*c0909341SAndroid Build Coastguard Worker        madd            x6,  x5,  x7,  x6
3728*c0909341SAndroid Build Coastguard Worker        sub             x2,  x0,  x4           // bw - left_ext
3729*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  x10          // center_h = bh - top_ext - bottom_ext
3730*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  x11          // center_w = bw - left_ext - right_ext
3731*c0909341SAndroid Build Coastguard Worker
3732*c0909341SAndroid Build Coastguard Worker        mov             x14, x6                // backup of dst
3733*c0909341SAndroid Build Coastguard Worker
// v_loop need_left, need_right
// Emits the loop over the center_h rows of emu_edge that are taken from ref.
// need_left / need_right are assemble-time 0/1 flags selecting whether the
// left and right edge replication is emitted.
//
// Per row:
//   - need_left:  replicate the first ref pixel over the left_ext pixels,
//   - copy center_w pixels from ref to dst + left_ext,
//   - need_right: replicate the last center pixel over the right_ext pixels.
//
// Registers, as set up by the surrounding function:
//   x1 = center_h (row counter)   x2 = center_w    x4 = left_ext
//   x6 = dst row pointer          x7 = dst stride  x11 = right_ext
//   x8 = ref row pointer          x9 = ref stride
// Clobbers x3, x12, x13, v0-v3 and the flags; x1, x6 and x8 are advanced.
//
// The fill loops store 16 pixels per iteration and the copy loop moves 32
// pixels per iteration, so each stage can run past its nominal width. The
// overshoot of one stage is overwritten by the next one.
// NOTE(review): the last emitted stage can read from ref and write to dst
// beyond the block width; this presumably relies on padded buffers - confirm
// with the callers.
3734*c0909341SAndroid Build Coastguard Worker.macro v_loop need_left, need_right
3735*c0909341SAndroid Build Coastguard Worker0:
3736*c0909341SAndroid Build Coastguard Worker.if \need_left
// Broadcast the first pixel of the ref row into v0 and v1.
3737*c0909341SAndroid Build Coastguard Worker        ld1r            {v0.8h}, [x8]
3738*c0909341SAndroid Build Coastguard Worker        mov             x12, x6                // out = dst
3739*c0909341SAndroid Build Coastguard Worker        mov             x3,  x4
3740*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v0.16b
3741*c0909341SAndroid Build Coastguard Worker1:
3742*c0909341SAndroid Build Coastguard Worker        subs            x3,  x3,  #16
3743*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x12], #32
3744*c0909341SAndroid Build Coastguard Worker        b.gt            1b
3745*c0909341SAndroid Build Coastguard Worker.endif
// Center copy, 32 pixels (64 bytes) per iteration.
3746*c0909341SAndroid Build Coastguard Worker        mov             x13, x8
3747*c0909341SAndroid Build Coastguard Worker        add             x12, x6,  x4, lsl #1   // out = dst + left_ext
3748*c0909341SAndroid Build Coastguard Worker        mov             x3,  x2
3749*c0909341SAndroid Build Coastguard Worker1:
3750*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64
3751*c0909341SAndroid Build Coastguard Worker        subs            x3,  x3,  #32
3752*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64
3753*c0909341SAndroid Build Coastguard Worker        b.gt            1b
3754*c0909341SAndroid Build Coastguard Worker.if \need_right
// Broadcast the last center pixel of the ref row and fill the right edge.
3755*c0909341SAndroid Build Coastguard Worker        add             x3,  x8,  x2, lsl #1   // in + center_w
3756*c0909341SAndroid Build Coastguard Worker        sub             x3,  x3,  #2           // in + center_w - 1
3757*c0909341SAndroid Build Coastguard Worker        add             x12, x6,  x4, lsl #1   // dst + left_ext
3758*c0909341SAndroid Build Coastguard Worker        ld1r            {v0.8h}, [x3]
3759*c0909341SAndroid Build Coastguard Worker        add             x12, x12, x2, lsl #1   // out = dst + left_ext + center_w
3760*c0909341SAndroid Build Coastguard Worker        mov             x3,  x11
3761*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v0.16b
3762*c0909341SAndroid Build Coastguard Worker1:
3763*c0909341SAndroid Build Coastguard Worker        subs            x3,  x3,  #16
3764*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x12], #32
3765*c0909341SAndroid Build Coastguard Worker        b.gt            1b
3766*c0909341SAndroid Build Coastguard Worker.endif
3767*c0909341SAndroid Build Coastguard Worker
// Advance to the next row; the adds do not touch the flags set by subs.
3768*c0909341SAndroid Build Coastguard Worker        subs            x1,  x1,  #1           // center_h--
3769*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  x7
3770*c0909341SAndroid Build Coastguard Worker        add             x8,  x8,  x9
3771*c0909341SAndroid Build Coastguard Worker        b.gt            0b
3772*c0909341SAndroid Build Coastguard Worker.endm
3773*c0909341SAndroid Build Coastguard Worker
3774*c0909341SAndroid Build Coastguard Worker        cbz             x4,  2f
3775*c0909341SAndroid Build Coastguard Worker        // need_left
3776*c0909341SAndroid Build Coastguard Worker        cbz             x11, 3f
3777*c0909341SAndroid Build Coastguard Worker        // need_left + need_right
3778*c0909341SAndroid Build Coastguard Worker        v_loop          1,   1
3779*c0909341SAndroid Build Coastguard Worker        b               5f
3780*c0909341SAndroid Build Coastguard Worker
3781*c0909341SAndroid Build Coastguard Worker2:
3782*c0909341SAndroid Build Coastguard Worker        // !need_left
3783*c0909341SAndroid Build Coastguard Worker        cbz             x11, 4f
3784*c0909341SAndroid Build Coastguard Worker        // !need_left + need_right
3785*c0909341SAndroid Build Coastguard Worker        v_loop          0,   1
3786*c0909341SAndroid Build Coastguard Worker        b               5f
3787*c0909341SAndroid Build Coastguard Worker
3788*c0909341SAndroid Build Coastguard Worker3:
3789*c0909341SAndroid Build Coastguard Worker        // need_left + !need_right
3790*c0909341SAndroid Build Coastguard Worker        v_loop          1,   0
3791*c0909341SAndroid Build Coastguard Worker        b               5f
3792*c0909341SAndroid Build Coastguard Worker
3793*c0909341SAndroid Build Coastguard Worker4:
3794*c0909341SAndroid Build Coastguard Worker        // !need_left + !need_right
3795*c0909341SAndroid Build Coastguard Worker        v_loop          0,   0
3796*c0909341SAndroid Build Coastguard Worker
3797*c0909341SAndroid Build Coastguard Worker5:
3798*c0909341SAndroid Build Coastguard Worker
3799*c0909341SAndroid Build Coastguard Worker        cbz             x10, 3f
3800*c0909341SAndroid Build Coastguard Worker        // need_bottom
3801*c0909341SAndroid Build Coastguard Worker        sub             x8,  x6,  x7           // ref = dst - stride
3802*c0909341SAndroid Build Coastguard Worker        mov             x4,  x0
3803*c0909341SAndroid Build Coastguard Worker1:
3804*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64
3805*c0909341SAndroid Build Coastguard Worker        mov             x3,  x10
3806*c0909341SAndroid Build Coastguard Worker2:
3807*c0909341SAndroid Build Coastguard Worker        subs            x3,  x3,  #1
3808*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
3809*c0909341SAndroid Build Coastguard Worker        b.gt            2b
3810*c0909341SAndroid Build Coastguard Worker        msub            x6,  x7,  x10,  x6     // dst -= bottom_ext * stride
3811*c0909341SAndroid Build Coastguard Worker        subs            x4,  x4,  #32          // bw -= 32
3812*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  #64          // dst += 32
3813*c0909341SAndroid Build Coastguard Worker        b.gt            1b
3814*c0909341SAndroid Build Coastguard Worker
3815*c0909341SAndroid Build Coastguard Worker3:
3816*c0909341SAndroid Build Coastguard Worker        cbz             x5,  3f
3817*c0909341SAndroid Build Coastguard Worker        // need_top
3818*c0909341SAndroid Build Coastguard Worker        msub            x6,  x7,  x5,  x14     // dst = stored_dst - top_ext * stride
3819*c0909341SAndroid Build Coastguard Worker1:
3820*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64
3821*c0909341SAndroid Build Coastguard Worker        mov             x3,  x5
3822*c0909341SAndroid Build Coastguard Worker2:
3823*c0909341SAndroid Build Coastguard Worker        subs            x3,  x3,  #1
3824*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
3825*c0909341SAndroid Build Coastguard Worker        b.gt            2b
3826*c0909341SAndroid Build Coastguard Worker        msub            x6,  x7,  x5,  x6      // dst -= top_ext * stride
3827*c0909341SAndroid Build Coastguard Worker        subs            x0,  x0,  #32          // bw -= 32
3828*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  #64          // dst += 32
3829*c0909341SAndroid Build Coastguard Worker        b.gt            1b
3830*c0909341SAndroid Build Coastguard Worker
3831*c0909341SAndroid Build Coastguard Worker3:
3832*c0909341SAndroid Build Coastguard Worker        ret
3833*c0909341SAndroid Build Coastguard Workerendfunc
3834