xref: /aosp_15_r20/external/libdav1d/src/arm/32/mc16.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker/*
2*c0909341SAndroid Build Coastguard Worker * Copyright © 2018, VideoLAN and dav1d authors
3*c0909341SAndroid Build Coastguard Worker * Copyright © 2018, Janne Grunau
4*c0909341SAndroid Build Coastguard Worker * Copyright © 2020, Martin Storsjo
5*c0909341SAndroid Build Coastguard Worker * All rights reserved.
6*c0909341SAndroid Build Coastguard Worker *
7*c0909341SAndroid Build Coastguard Worker * Redistribution and use in source and binary forms, with or without
8*c0909341SAndroid Build Coastguard Worker * modification, are permitted provided that the following conditions are met:
9*c0909341SAndroid Build Coastguard Worker *
10*c0909341SAndroid Build Coastguard Worker * 1. Redistributions of source code must retain the above copyright notice, this
11*c0909341SAndroid Build Coastguard Worker *    list of conditions and the following disclaimer.
12*c0909341SAndroid Build Coastguard Worker *
13*c0909341SAndroid Build Coastguard Worker * 2. Redistributions in binary form must reproduce the above copyright notice,
14*c0909341SAndroid Build Coastguard Worker *    this list of conditions and the following disclaimer in the documentation
15*c0909341SAndroid Build Coastguard Worker *    and/or other materials provided with the distribution.
16*c0909341SAndroid Build Coastguard Worker *
17*c0909341SAndroid Build Coastguard Worker * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18*c0909341SAndroid Build Coastguard Worker * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19*c0909341SAndroid Build Coastguard Worker * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20*c0909341SAndroid Build Coastguard Worker * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
21*c0909341SAndroid Build Coastguard Worker * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22*c0909341SAndroid Build Coastguard Worker * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23*c0909341SAndroid Build Coastguard Worker * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24*c0909341SAndroid Build Coastguard Worker * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25*c0909341SAndroid Build Coastguard Worker * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26*c0909341SAndroid Build Coastguard Worker * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27*c0909341SAndroid Build Coastguard Worker */
28*c0909341SAndroid Build Coastguard Worker
29*c0909341SAndroid Build Coastguard Worker#include "src/arm/asm.S"
30*c0909341SAndroid Build Coastguard Worker#include "util.S"
31*c0909341SAndroid Build Coastguard Worker
// PREP_BIAS: bias constant of the 16-bit intermediate buffers handled in this
// file. avg folds -2*PREP_BIAS into its q12 constant, while w_avg and mask add
// PREP_BIAS >> intermediate_bits after their final shift (see bidir_fn).
// NOTE(review): presumably the prep functions that write those buffers
// subtract this bias; they are not visible here - confirm.
32*c0909341SAndroid Build Coastguard Worker#define PREP_BIAS 8192
33*c0909341SAndroid Build Coastguard Worker
// avg: unweighted average of two 16-bit intermediate buffers, 16 pixels per use.
//   \d0, \d1      q registers that receive 8 result pixels each
//   \d00 .. \d11  unused by this macro; accepted so that avg, w_avg and mask
//                 share one argument list (bidir_fn invokes all three the same way)
// Reads 32 bytes from [r2] and 32 bytes from [r3]; both pointers are
// post-incremented and carry a 128-bit alignment hint.
// Expects, as prepared by bidir_fn for type avg:
//   q12 = -2*PREP_BIAS - (1 << intermediate_bits)   in every 16-bit lane
//   q13 = -(intermediate_bits+1)                    in every 16-bit lane
// Per lane: (max(sat(tmp1 + tmp2), q12) - q12) >> (intermediate_bits+1),
// where the subtraction saturates as well.
// Clobbers q0-q3.
34*c0909341SAndroid Build Coastguard Worker.macro avg d0, d00, d01, d1, d10, d11
35*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0, q1}, [r2, :128]!
36*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2, q3}, [r3, :128]!
        // Saturating sum of the two sources.
37*c0909341SAndroid Build Coastguard Worker        vqadd.s16       q0,  q0,  q2
38*c0909341SAndroid Build Coastguard Worker        vqadd.s16       q1,  q1,  q3
        // Lower clamp: after this the subtraction below cannot yield a negative value.
39*c0909341SAndroid Build Coastguard Worker        vmax.s16        q0,  q0,  q12 // -2*PREP_BIAS - (1 << intermediate_bits)
40*c0909341SAndroid Build Coastguard Worker        vmax.s16        q1,  q1,  q12 // -2*PREP_BIAS - (1 << intermediate_bits)
        // Subtracting the negative constant adds 2*PREP_BIAS plus the rounding
        // term (1 << intermediate_bits), saturating at 0x7fff.
        // NOTE(review): for bitdepth_max of 1023 or 4095 (assumed - confirm),
        // 0x7fff >> (intermediate_bits+1) equals bitdepth_max, so this
        // saturation would double as the upper pixel clamp.
41*c0909341SAndroid Build Coastguard Worker        vqsub.s16       q0,  q0,  q12 // -2*PREP_BIAS - (1 << intermediate_bits)
42*c0909341SAndroid Build Coastguard Worker        vqsub.s16       q1,  q1,  q12 // -2*PREP_BIAS - (1 << intermediate_bits)
        // vshl with a negative per-lane count is a right shift.
43*c0909341SAndroid Build Coastguard Worker        vshl.s16        \d0, q0,  q13 // -(intermediate_bits+1)
44*c0909341SAndroid Build Coastguard Worker        vshl.s16        \d1, q1,  q13 // -(intermediate_bits+1)
45*c0909341SAndroid Build Coastguard Worker.endm
46*c0909341SAndroid Build Coastguard Worker
// w_avg: weighted blend of two 16-bit intermediate buffers, 16 pixels per use.
//   \d0, \d1      q registers that receive 8 result pixels each; they also
//                 serve as 32-bit scratch during the computation
//   \d00, \d01    the low and high d halves of \d0 (narrowing destinations)
//   \d10, \d11    the low and high d halves of \d1 (narrowing destinations)
// Reads 32 bytes from [r2] (tmp1) and 32 bytes from [r3] (tmp2); both pointers
// are post-incremented and carry a 128-bit alignment hint.
// Expects, as prepared by bidir_fn for type w_avg:
//   q4  = negated copy of r6 in every 32-bit lane
//         NOTE(review): r6 is presumably the blend weight - confirm with caller
//   q12 = PREP_BIAS >> intermediate_bits,  q13 = -intermediate_bits
//   q14 = 0,                               q15 = bitdepth_max
// Per pixel, in 32 bits: t = tmp2 + (((tmp2 - tmp1) * q4) >> 4); t is then
// truncated to 16 bits, shifted right by intermediate_bits with rounding,
// offset by q12 and clamped to [0, bitdepth_max].
// Clobbers q0-q3.
47*c0909341SAndroid Build Coastguard Worker.macro w_avg d0, d00, d01, d1, d10, d11
48*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0, q1}, [r2, :128]!
49*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2, q3}, [r3, :128]!
50*c0909341SAndroid Build Coastguard Worker        // This difference requires a 17 bit range, and all bits are
51*c0909341SAndroid Build Coastguard Worker        // significant for the following multiplication.
        // tmp2 - tmp1, widened to 32 bits. Pixels 0-3 go to \d0, 4-7 to q0,
        // 8-11 to \d1 and 12-15 to q1; q0/q1 are overwritten only after their
        // d halves have been consumed as inputs.
52*c0909341SAndroid Build Coastguard Worker        vsubl.s16       \d0, d4,  d0
53*c0909341SAndroid Build Coastguard Worker        vsubl.s16       q0,  d5,  d1
54*c0909341SAndroid Build Coastguard Worker        vsubl.s16       \d1, d6,  d2
55*c0909341SAndroid Build Coastguard Worker        vsubl.s16       q1,  d7,  d3
        // Scale the difference by the per-function factor held in q4.
56*c0909341SAndroid Build Coastguard Worker        vmul.s32        \d0, \d0, q4
57*c0909341SAndroid Build Coastguard Worker        vmul.s32        q0,  q0,  q4
58*c0909341SAndroid Build Coastguard Worker        vmul.s32        \d1, \d1, q4
59*c0909341SAndroid Build Coastguard Worker        vmul.s32        q1,  q1,  q4
        // Arithmetic shift right by 4 (no rounding at this stage).
60*c0909341SAndroid Build Coastguard Worker        vshr.s32        \d0, \d0, #4
61*c0909341SAndroid Build Coastguard Worker        vshr.s32        q0,  q0,  #4
62*c0909341SAndroid Build Coastguard Worker        vshr.s32        \d1, \d1, #4
63*c0909341SAndroid Build Coastguard Worker        vshr.s32        q1,  q1,  #4
        // Add tmp2 (still intact in q2/q3 = d4-d7), widening it on the fly.
64*c0909341SAndroid Build Coastguard Worker        vaddw.s16       \d0, \d0, d4
65*c0909341SAndroid Build Coastguard Worker        vaddw.s16       q0,  q0,  d5
66*c0909341SAndroid Build Coastguard Worker        vaddw.s16       \d1, \d1, d6
67*c0909341SAndroid Build Coastguard Worker        vaddw.s16       q1,  q1,  d7
        // Narrow back to 16 bits (plain truncation, no saturation) and pack the
        // four 32-bit vectors into the two output q registers.
68*c0909341SAndroid Build Coastguard Worker        vmovn.i32       \d00, \d0
69*c0909341SAndroid Build Coastguard Worker        vmovn.i32       \d01, q0
70*c0909341SAndroid Build Coastguard Worker        vmovn.i32       \d10, \d1
71*c0909341SAndroid Build Coastguard Worker        vmovn.i32       \d11, q1
        // Rounding right shift (negative count), re-add the scaled bias, then
        // clamp to the valid pixel range.
72*c0909341SAndroid Build Coastguard Worker        vrshl.s16       \d0, \d0, q13 // -intermediate_bits
73*c0909341SAndroid Build Coastguard Worker        vrshl.s16       \d1, \d1, q13 // -intermediate_bits
74*c0909341SAndroid Build Coastguard Worker        vadd.s16        \d0, \d0, q12 // PREP_BIAS >> intermediate_bits
75*c0909341SAndroid Build Coastguard Worker        vadd.s16        \d1, \d1, q12 // PREP_BIAS >> intermediate_bits
76*c0909341SAndroid Build Coastguard Worker        vmin.s16        \d0, \d0, q15 // bitdepth_max
77*c0909341SAndroid Build Coastguard Worker        vmin.s16        \d1, \d1, q15 // bitdepth_max
78*c0909341SAndroid Build Coastguard Worker        vmax.s16        \d0, \d0, q14 // 0
79*c0909341SAndroid Build Coastguard Worker        vmax.s16        \d1, \d1, q14 // 0
80*c0909341SAndroid Build Coastguard Worker.endm
81*c0909341SAndroid Build Coastguard Worker
// mask: per-pixel masked blend of two 16-bit intermediate buffers, 16 pixels
// per use. Same data flow as w_avg, except that the multiplier is a per-pixel
// byte read from [r6] and the product is shifted right by 6 instead of 4.
//   \d0, \d1      q registers that receive 8 result pixels each; they also
//                 serve as 32-bit scratch during the computation
//   \d00, \d01    the low and high d halves of \d0 (narrowing destinations)
//   \d10, \d11    the low and high d halves of \d1 (narrowing destinations)
// Reads 16 mask bytes from [r6], 32 bytes from [r2] (tmp1) and 32 bytes from
// [r3] (tmp2); all three pointers are post-incremented and carry a 128-bit
// alignment hint.
// Expects, as prepared by bidir_fn for type mask:
//   q12 = PREP_BIAS >> intermediate_bits,  q13 = -intermediate_bits
//   q14 = 0,                               q15 = bitdepth_max
// Per pixel, in 32 bits: t = tmp2 + (((tmp2 - tmp1) * -m) >> 6); t is then
// truncated to 16 bits, shifted right by intermediate_bits with rounding,
// offset by q12 and clamped to [0, bitdepth_max].
// Clobbers q0-q7 (bidir_fn saves and restores q4-q7 around the mask variant).
82*c0909341SAndroid Build Coastguard Worker.macro mask d0, d00, d01, d1, d10, d11
83*c0909341SAndroid Build Coastguard Worker        vld1.8          {q7},     [r6, :128]!
84*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0, q1}, [r2, :128]!
        // Negate the mask bytes, so the multiplications below use -m.
85*c0909341SAndroid Build Coastguard Worker        vneg.s8         q7,  q7
86*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2, q3}, [r3, :128]!
        // Sign-extend -m from 8 to 16 bits: pixels 0-7 in q6, 8-15 in q7.
87*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q6,  d14
88*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q7,  d15
        // Sign-extend again to 32 bits: q4 = pixels 0-3, q5 = 4-7, q6 = 8-11,
        // q7 = 12-15. q6 and q7 are overwritten only after their d halves
        // (d12/d13, then d14/d15) have been consumed as inputs.
89*c0909341SAndroid Build Coastguard Worker        vmovl.s16       q4,  d12
90*c0909341SAndroid Build Coastguard Worker        vmovl.s16       q5,  d13
91*c0909341SAndroid Build Coastguard Worker        vmovl.s16       q6,  d14
92*c0909341SAndroid Build Coastguard Worker        vmovl.s16       q7,  d15
        // tmp2 - tmp1, widened to 32 bits (pixels 0-3 in \d0, 4-7 in q0,
        // 8-11 in \d1, 12-15 in q1).
93*c0909341SAndroid Build Coastguard Worker        vsubl.s16       \d0, d4,  d0
94*c0909341SAndroid Build Coastguard Worker        vsubl.s16       q0,  d5,  d1
95*c0909341SAndroid Build Coastguard Worker        vsubl.s16       \d1, d6,  d2
96*c0909341SAndroid Build Coastguard Worker        vsubl.s16       q1,  d7,  d3
        // Multiply each difference by the matching -m lane.
97*c0909341SAndroid Build Coastguard Worker        vmul.s32        \d0, \d0, q4
98*c0909341SAndroid Build Coastguard Worker        vmul.s32        q0,  q0,  q5
99*c0909341SAndroid Build Coastguard Worker        vmul.s32        \d1, \d1, q6
100*c0909341SAndroid Build Coastguard Worker        vmul.s32        q1,  q1,  q7
        // Arithmetic shift right by 6 (no rounding at this stage).
101*c0909341SAndroid Build Coastguard Worker        vshr.s32        \d0, \d0, #6
102*c0909341SAndroid Build Coastguard Worker        vshr.s32        q0,  q0,  #6
103*c0909341SAndroid Build Coastguard Worker        vshr.s32        \d1, \d1, #6
104*c0909341SAndroid Build Coastguard Worker        vshr.s32        q1,  q1,  #6
        // Add tmp2 (still intact in q2/q3 = d4-d7), widening it on the fly.
105*c0909341SAndroid Build Coastguard Worker        vaddw.s16       \d0, \d0, d4
106*c0909341SAndroid Build Coastguard Worker        vaddw.s16       q0,  q0,  d5
107*c0909341SAndroid Build Coastguard Worker        vaddw.s16       \d1, \d1, d6
108*c0909341SAndroid Build Coastguard Worker        vaddw.s16       q1,  q1,  d7
        // Narrow back to 16 bits (plain truncation, no saturation) and pack the
        // four 32-bit vectors into the two output q registers.
109*c0909341SAndroid Build Coastguard Worker        vmovn.i32       \d00, \d0
110*c0909341SAndroid Build Coastguard Worker        vmovn.i32       \d01, q0
111*c0909341SAndroid Build Coastguard Worker        vmovn.i32       \d10, \d1
112*c0909341SAndroid Build Coastguard Worker        vmovn.i32       \d11, q1
        // Rounding right shift (negative count), re-add the scaled bias, then
        // clamp to the valid pixel range.
113*c0909341SAndroid Build Coastguard Worker        vrshl.s16       \d0, \d0, q13 // -intermediate_bits
114*c0909341SAndroid Build Coastguard Worker        vrshl.s16       \d1, \d1, q13 // -intermediate_bits
115*c0909341SAndroid Build Coastguard Worker        vadd.s16        \d0, \d0, q12 // PREP_BIAS >> intermediate_bits
116*c0909341SAndroid Build Coastguard Worker        vadd.s16        \d1, \d1, q12 // PREP_BIAS >> intermediate_bits
117*c0909341SAndroid Build Coastguard Worker        vmin.s16        \d0, \d0, q15 // bitdepth_max
118*c0909341SAndroid Build Coastguard Worker        vmin.s16        \d1, \d1, q15 // bitdepth_max
119*c0909341SAndroid Build Coastguard Worker        vmax.s16        \d0, \d0, q14 // 0
120*c0909341SAndroid Build Coastguard Worker        vmax.s16        \d1, \d1, q14 // 0
121*c0909341SAndroid Build Coastguard Worker.endm
122*c0909341SAndroid Build Coastguard Worker
// bidir_fn type, bdmax
// Emits the exported function <type>_16bpc_neon, built around the 16-pixel
// macro selected by the type argument (avg, w_avg or mask). The bdmax argument
// names the core register that holds bitdepth_max at the point where
// intermediate_bits is derived from it.
//
// Arguments as consumed by the code:
//   r0          destination pointer (all stores go through r0 or a copy in r7)
//   r1          destination stride in bytes (used as store post-increment)
//   r2, r3      the two 16-bit source buffers read by the per-type macro
//   [sp, #20]   -> r4: width; must be a power of two from 4 to 128, since
//               clz(width) - 24 indexes the six-entry jump table
//   [sp, #24]   -> r5: height, counted down by the store loops
//   [sp, #28]   -> r6: bitdepth_max (avg), the value broadcast into q4
//               (w_avg), or the mask pointer read by the mask macro (mask)
//   [sp, #32]   -> r7: bitdepth_max (w_avg and mask only)
// The stack offsets are relative to sp after the 20-byte push at entry.
//
// Vector constants set up before the loops:
//   avg:         q12 = -2*PREP_BIAS - (1 << intermediate_bits)
//                q13 = -(intermediate_bits+1)
//   w_avg, mask: q12 = PREP_BIAS >> intermediate_bits, q13 = -intermediate_bits
//                q14 = 0, q15 = bitdepth_max
//   w_avg only:  q4  = -r6 in every 32-bit lane
//
// The loops are software pipelined: the first 16 pixels are computed into
// q8/q9 before the width dispatch, and every loop computes the pixels for its
// next iteration only after the exit test.
// r12 and lr serve as scratch; r4-r7 and the q4-q7 registers that get
// overwritten are saved and restored.
123*c0909341SAndroid Build Coastguard Worker.macro bidir_fn type, bdmax
124*c0909341SAndroid Build Coastguard Workerfunction \type\()_16bpc_neon, export=1
125*c0909341SAndroid Build Coastguard Worker        push            {r4-r7,lr}
        // 5 registers pushed, so the first stack argument now sits at sp+20.
126*c0909341SAndroid Build Coastguard Worker        ldrd            r4,  r5,  [sp, #20]
127*c0909341SAndroid Build Coastguard Worker        ldr             r6,  [sp, #28]
        // clz(width) becomes the jump table index further down.
128*c0909341SAndroid Build Coastguard Worker        clz             r4,  r4
129*c0909341SAndroid Build Coastguard Worker.ifnc \type, avg
        // w_avg and mask clamp explicitly, so they need 0 and bitdepth_max vectors.
130*c0909341SAndroid Build Coastguard Worker        ldr             r7,  [sp, #32]
131*c0909341SAndroid Build Coastguard Worker        vmov.i16        q14, #0
132*c0909341SAndroid Build Coastguard Worker        vdup.16         q15, r7         // bitdepth_max
133*c0909341SAndroid Build Coastguard Worker.endif
        // Save the q4-q7 registers that this variant overwrites.
134*c0909341SAndroid Build Coastguard Worker.ifc \type, w_avg
135*c0909341SAndroid Build Coastguard Worker        vpush           {q4}
136*c0909341SAndroid Build Coastguard Worker.endif
137*c0909341SAndroid Build Coastguard Worker.ifc \type, mask
138*c0909341SAndroid Build Coastguard Worker        vpush           {q4-q7}
139*c0909341SAndroid Build Coastguard Worker.endif
140*c0909341SAndroid Build Coastguard Worker        clz             r7,  \bdmax
141*c0909341SAndroid Build Coastguard Worker        sub             r7,  r7,  #18   // intermediate_bits = clz(bitdepth_max) - 18
142*c0909341SAndroid Build Coastguard Worker.ifc \type, avg
        // Build the combined bias/rounding constant and the shift count for avg.
143*c0909341SAndroid Build Coastguard Worker        mov             lr,  #1
144*c0909341SAndroid Build Coastguard Worker        movw            r12, #2*PREP_BIAS
145*c0909341SAndroid Build Coastguard Worker        lsl             lr,  lr,  r7    // 1 << intermediate_bits
146*c0909341SAndroid Build Coastguard Worker        neg             r12, r12         // -2*PREP_BIAS
147*c0909341SAndroid Build Coastguard Worker        add             r7,  r7,  #1
148*c0909341SAndroid Build Coastguard Worker        sub             r12, r12, lr    // -2*PREP_BIAS - 1 << intermediate_bits
149*c0909341SAndroid Build Coastguard Worker        neg             r7,  r7         // -(intermediate_bits+1)
150*c0909341SAndroid Build Coastguard Worker        vdup.16         q12, r12         // -2*PREP_BIAS - 1 << intermediate_bits
151*c0909341SAndroid Build Coastguard Worker        vdup.16         q13, r7         // -(intermediate_bits+1)
152*c0909341SAndroid Build Coastguard Worker.else
        // w_avg and mask: scaled bias added after the final shift, and the shift count.
153*c0909341SAndroid Build Coastguard Worker        mov             r12, #PREP_BIAS
154*c0909341SAndroid Build Coastguard Worker        lsr             r12, r12, r7    // PREP_BIAS >> intermediate_bits
155*c0909341SAndroid Build Coastguard Worker        neg             r7,  r7         // -intermediate_bits
156*c0909341SAndroid Build Coastguard Worker        vdup.16         q12, r12         // PREP_BIAS >> intermediate_bits
157*c0909341SAndroid Build Coastguard Worker        vdup.16         q13, r7         // -intermediate_bits
158*c0909341SAndroid Build Coastguard Worker.endif
159*c0909341SAndroid Build Coastguard Worker.ifc \type, w_avg
        // q4 = -r6 per 32-bit lane; this is the multiplier used by the w_avg macro.
160*c0909341SAndroid Build Coastguard Worker        vdup.32         q4,  r6
161*c0909341SAndroid Build Coastguard Worker        vneg.s32        q4,  q4
162*c0909341SAndroid Build Coastguard Worker.endif
        // Dispatch on width: index = clz(width) - 24, i.e. 0 for 128 ... 5 for 4.
        // The first 16 output pixels are computed into q8/q9 in between.
163*c0909341SAndroid Build Coastguard Worker        adr             r7,  L(\type\()_tbl)
164*c0909341SAndroid Build Coastguard Worker        sub             r4,  r4,  #24
165*c0909341SAndroid Build Coastguard Worker        \type           q8,  d16, d17, q9,  d18, d19
166*c0909341SAndroid Build Coastguard Worker        ldr             r4,  [r7, r4, lsl #2]
167*c0909341SAndroid Build Coastguard Worker        add             r7,  r7,  r4
168*c0909341SAndroid Build Coastguard Worker        bx              r7
169*c0909341SAndroid Build Coastguard Worker
        // Table of offsets relative to the table itself, ordered from width 128
        // down to width 4; CONFIG_THUMB is added to each entry before the bx.
170*c0909341SAndroid Build Coastguard Worker        .align 2
171*c0909341SAndroid Build Coastguard WorkerL(\type\()_tbl):
172*c0909341SAndroid Build Coastguard Worker        .word 1280f - L(\type\()_tbl) + CONFIG_THUMB
173*c0909341SAndroid Build Coastguard Worker        .word 640f  - L(\type\()_tbl) + CONFIG_THUMB
174*c0909341SAndroid Build Coastguard Worker        .word 320f  - L(\type\()_tbl) + CONFIG_THUMB
175*c0909341SAndroid Build Coastguard Worker        .word 160f  - L(\type\()_tbl) + CONFIG_THUMB
176*c0909341SAndroid Build Coastguard Worker        .word 80f   - L(\type\()_tbl) + CONFIG_THUMB
177*c0909341SAndroid Build Coastguard Worker        .word 40f   - L(\type\()_tbl) + CONFIG_THUMB
178*c0909341SAndroid Build Coastguard Worker
        // Width 4: 16 pixels = 4 rows per iteration. r0 writes the even rows
        // and r7 the odd rows, both stepping by twice the stride.
179*c0909341SAndroid Build Coastguard Worker40:
180*c0909341SAndroid Build Coastguard Worker        add             r7,  r0,  r1
181*c0909341SAndroid Build Coastguard Worker        lsl             r1,  r1,  #1
182*c0909341SAndroid Build Coastguard Worker4:
183*c0909341SAndroid Build Coastguard Worker        subs            r5,  r5,  #4
184*c0909341SAndroid Build Coastguard Worker        vst1.16         {d16},  [r0, :64], r1
185*c0909341SAndroid Build Coastguard Worker        vst1.16         {d17},  [r7, :64], r1
186*c0909341SAndroid Build Coastguard Worker        vst1.16         {d18},  [r0, :64], r1
187*c0909341SAndroid Build Coastguard Worker        vst1.16         {d19},  [r7, :64], r1
188*c0909341SAndroid Build Coastguard Worker        ble             0f
189*c0909341SAndroid Build Coastguard Worker        \type           q8,  d16, d17, q9,  d18, d19
190*c0909341SAndroid Build Coastguard Worker        b               4b
        // Width 8: 16 pixels = 2 rows per iteration (r0 = even row, r7 = odd row).
191*c0909341SAndroid Build Coastguard Worker80:
192*c0909341SAndroid Build Coastguard Worker        add             r7,  r0,  r1
193*c0909341SAndroid Build Coastguard Worker        lsl             r1,  r1,  #1
194*c0909341SAndroid Build Coastguard Worker8:
195*c0909341SAndroid Build Coastguard Worker        vst1.16         {q8},  [r0, :128], r1
196*c0909341SAndroid Build Coastguard Worker        subs            r5,  r5,  #2
197*c0909341SAndroid Build Coastguard Worker        vst1.16         {q9},  [r7, :128], r1
198*c0909341SAndroid Build Coastguard Worker        ble             0f
199*c0909341SAndroid Build Coastguard Worker        \type           q8,  d16, d17, q9,  d18, d19
200*c0909341SAndroid Build Coastguard Worker        b               8b
        // Width 16: one row is 32 bytes (q8+q9). A second row is computed into
        // q10/q11 and both rows are stored through r0, 2 rows per iteration.
201*c0909341SAndroid Build Coastguard Worker160:
202*c0909341SAndroid Build Coastguard Worker16:
203*c0909341SAndroid Build Coastguard Worker        \type           q10, d20, d21, q11, d22, d23
204*c0909341SAndroid Build Coastguard Worker        vst1.16         {q8,  q9},  [r0, :128], r1
205*c0909341SAndroid Build Coastguard Worker        subs            r5,  r5,  #2
206*c0909341SAndroid Build Coastguard Worker        vst1.16         {q10, q11}, [r0, :128], r1
207*c0909341SAndroid Build Coastguard Worker        ble             0f
208*c0909341SAndroid Build Coastguard Worker        \type           q8,  d16, d17, q9,  d18, d19
209*c0909341SAndroid Build Coastguard Worker        b               16b
        // Width 32: one 64-byte row per iteration; r0 writes the first 32 bytes
        // and r7 (= r0 + 32) the second 32 bytes.
210*c0909341SAndroid Build Coastguard Worker320:
211*c0909341SAndroid Build Coastguard Worker        add             r7,  r0,  #32
212*c0909341SAndroid Build Coastguard Worker32:
213*c0909341SAndroid Build Coastguard Worker        \type           q10, d20, d21, q11, d22, d23
214*c0909341SAndroid Build Coastguard Worker        vst1.16         {q8,  q9},  [r0, :128], r1
215*c0909341SAndroid Build Coastguard Worker        subs            r5,  r5,  #1
216*c0909341SAndroid Build Coastguard Worker        vst1.16         {q10, q11}, [r7, :128], r1
217*c0909341SAndroid Build Coastguard Worker        ble             0f
218*c0909341SAndroid Build Coastguard Worker        \type           q8,  d16, d17, q9,  d18, d19
219*c0909341SAndroid Build Coastguard Worker        b               32b
        // Width 64: one 128-byte row per iteration. r0 and r7 each step by 64
        // within the row; r1 is reduced by 64 so that the final post-increment
        // of each pointer lands on the next row.
220*c0909341SAndroid Build Coastguard Worker640:
221*c0909341SAndroid Build Coastguard Worker        add             r7,  r0,  #32
222*c0909341SAndroid Build Coastguard Worker        mov             r12, #64
223*c0909341SAndroid Build Coastguard Worker        sub             r1,  r1,  #64
224*c0909341SAndroid Build Coastguard Worker64:
225*c0909341SAndroid Build Coastguard Worker        \type           q10, d20, d21, q11, d22, d23
226*c0909341SAndroid Build Coastguard Worker        vst1.16         {q8,  q9},  [r0, :128], r12
227*c0909341SAndroid Build Coastguard Worker        \type           q8,  d16, d17, q9,  d18, d19
228*c0909341SAndroid Build Coastguard Worker        vst1.16         {q10, q11}, [r7, :128], r12
229*c0909341SAndroid Build Coastguard Worker        \type           q10, d20, d21, q11, d22, d23
230*c0909341SAndroid Build Coastguard Worker        vst1.16         {q8,  q9},  [r0, :128], r1
231*c0909341SAndroid Build Coastguard Worker        subs            r5,  r5,  #1
232*c0909341SAndroid Build Coastguard Worker        vst1.16         {q10, q11}, [r7, :128], r1
233*c0909341SAndroid Build Coastguard Worker        ble             0f
234*c0909341SAndroid Build Coastguard Worker        \type           q8,  d16, d17, q9,  d18, d19
235*c0909341SAndroid Build Coastguard Worker        b               64b
        // Width 128: one 256-byte row per iteration. Each pointer steps by 64
        // three times within the row, so r1 is reduced by 192 for the final
        // post-increment to land on the next row.
236*c0909341SAndroid Build Coastguard Worker1280:
237*c0909341SAndroid Build Coastguard Worker        add             r7,  r0,  #32
238*c0909341SAndroid Build Coastguard Worker        mov             r12, #64
239*c0909341SAndroid Build Coastguard Worker        sub             r1,  r1,  #192
240*c0909341SAndroid Build Coastguard Worker128:
241*c0909341SAndroid Build Coastguard Worker        \type           q10, d20, d21, q11, d22, d23
242*c0909341SAndroid Build Coastguard Worker        vst1.16         {q8,  q9},  [r0, :128], r12
243*c0909341SAndroid Build Coastguard Worker        \type           q8,  d16, d17, q9,  d18, d19
244*c0909341SAndroid Build Coastguard Worker        vst1.16         {q10, q11}, [r7, :128], r12
245*c0909341SAndroid Build Coastguard Worker        \type           q10, d20, d21, q11, d22, d23
246*c0909341SAndroid Build Coastguard Worker        vst1.16         {q8,  q9},  [r0, :128], r12
247*c0909341SAndroid Build Coastguard Worker        \type           q8,  d16, d17, q9,  d18, d19
248*c0909341SAndroid Build Coastguard Worker        vst1.16         {q10, q11}, [r7, :128], r12
249*c0909341SAndroid Build Coastguard Worker        \type           q10, d20, d21, q11, d22, d23
250*c0909341SAndroid Build Coastguard Worker        vst1.16         {q8,  q9},  [r0, :128], r12
251*c0909341SAndroid Build Coastguard Worker        \type           q8,  d16, d17, q9,  d18, d19
252*c0909341SAndroid Build Coastguard Worker        vst1.16         {q10, q11}, [r7, :128], r12
253*c0909341SAndroid Build Coastguard Worker        \type           q10, d20, d21, q11, d22, d23
254*c0909341SAndroid Build Coastguard Worker        vst1.16         {q8,  q9},  [r0, :128], r1
255*c0909341SAndroid Build Coastguard Worker        subs            r5,  r5,  #1
256*c0909341SAndroid Build Coastguard Worker        vst1.16         {q10, q11}, [r7, :128], r1
257*c0909341SAndroid Build Coastguard Worker        ble             0f
258*c0909341SAndroid Build Coastguard Worker        \type           q8,  d16, d17, q9,  d18, d19
259*c0909341SAndroid Build Coastguard Worker        b               128b
        // Common exit: restore whatever the prologue saved and return via pc.
260*c0909341SAndroid Build Coastguard Worker0:
261*c0909341SAndroid Build Coastguard Worker.ifc \type, mask
262*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q7}
263*c0909341SAndroid Build Coastguard Worker.endif
264*c0909341SAndroid Build Coastguard Worker.ifc \type, w_avg
265*c0909341SAndroid Build Coastguard Worker        vpop            {q4}
266*c0909341SAndroid Build Coastguard Worker.endif
267*c0909341SAndroid Build Coastguard Worker        pop             {r4-r7,pc}
268*c0909341SAndroid Build Coastguard Workerendfunc
269*c0909341SAndroid Build Coastguard Worker.endm
270*c0909341SAndroid Build Coastguard Worker
// Instantiate avg_16bpc_neon, w_avg_16bpc_neon and mask_16bpc_neon. The second
// argument names the register holding bitdepth_max when bidir_fn derives
// intermediate_bits: r6 for avg (loaded from [sp, #28]), r7 for w_avg and mask
// (loaded from [sp, #32], because [sp, #28] carries another argument there).
271*c0909341SAndroid Build Coastguard Workerbidir_fn avg, r6
272*c0909341SAndroid Build Coastguard Workerbidir_fn w_avg, r7
273*c0909341SAndroid Build Coastguard Workerbidir_fn mask, r7
274*c0909341SAndroid Build Coastguard Worker
275*c0909341SAndroid Build Coastguard Worker
276*c0909341SAndroid Build Coastguard Worker.macro w_mask_fn type
277*c0909341SAndroid Build Coastguard Workerfunction w_mask_\type\()_16bpc_neon, export=1
278*c0909341SAndroid Build Coastguard Worker        push            {r4-r10,lr}
279*c0909341SAndroid Build Coastguard Worker        vpush           {q4-q7}
280*c0909341SAndroid Build Coastguard Worker        ldrd            r4,  r5,  [sp, #96]
281*c0909341SAndroid Build Coastguard Worker        ldrd            r6,  r7,  [sp, #104]
282*c0909341SAndroid Build Coastguard Worker        ldr             r8,  [sp, #112]
283*c0909341SAndroid Build Coastguard Worker        clz             r9,  r4
284*c0909341SAndroid Build Coastguard Worker        adr             lr,  L(w_mask_\type\()_tbl)
285*c0909341SAndroid Build Coastguard Worker        vdup.16         q15, r8       // bitdepth_max
286*c0909341SAndroid Build Coastguard Worker        sub             r9,  r9,  #24
287*c0909341SAndroid Build Coastguard Worker        clz             r8,  r8       // clz(bitdepth_max)
288*c0909341SAndroid Build Coastguard Worker        ldr             r9,  [lr,  r9,  lsl #2]
289*c0909341SAndroid Build Coastguard Worker        add             r9,  lr,  r9
290*c0909341SAndroid Build Coastguard Worker        sub             r8,  r8,  #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
291*c0909341SAndroid Build Coastguard Worker        mov             r10, #PREP_BIAS*64
292*c0909341SAndroid Build Coastguard Worker        neg             r8,  r8       // -sh
293*c0909341SAndroid Build Coastguard Worker        movw            r12, #27615   // (64 + 1 - 38)<<mask_sh - 1 - mask_rnd
294*c0909341SAndroid Build Coastguard Worker        vdup.32         q14, r8       // -sh
295*c0909341SAndroid Build Coastguard Worker        vdup.16         q0,  r12
296*c0909341SAndroid Build Coastguard Worker.if \type == 444
297*c0909341SAndroid Build Coastguard Worker        vmov.i8         q1,  #64
298*c0909341SAndroid Build Coastguard Worker.elseif \type == 422
299*c0909341SAndroid Build Coastguard Worker        vdup.8          d4,  r7
300*c0909341SAndroid Build Coastguard Worker        vmov.i8         d2,  #129
301*c0909341SAndroid Build Coastguard Worker        vsub.i16        d2,  d2,  d4
302*c0909341SAndroid Build Coastguard Worker.elseif \type == 420
303*c0909341SAndroid Build Coastguard Worker        vdup.16         q2,  r7
304*c0909341SAndroid Build Coastguard Worker        vmov.i16        q1,  #0x100
305*c0909341SAndroid Build Coastguard Worker        vsub.i16        q1,  q1,  q2
306*c0909341SAndroid Build Coastguard Worker.endif
307*c0909341SAndroid Build Coastguard Worker        add             r12, r0,  r1
308*c0909341SAndroid Build Coastguard Worker        lsl             r1,  r1,  #1
309*c0909341SAndroid Build Coastguard Worker        bx              r9
310*c0909341SAndroid Build Coastguard Worker
311*c0909341SAndroid Build Coastguard Worker        .align 2
312*c0909341SAndroid Build Coastguard WorkerL(w_mask_\type\()_tbl):
313*c0909341SAndroid Build Coastguard Worker        .word 1280f - L(w_mask_\type\()_tbl) + CONFIG_THUMB
314*c0909341SAndroid Build Coastguard Worker        .word 640f  - L(w_mask_\type\()_tbl) + CONFIG_THUMB
315*c0909341SAndroid Build Coastguard Worker        .word 320f  - L(w_mask_\type\()_tbl) + CONFIG_THUMB
316*c0909341SAndroid Build Coastguard Worker        .word 160f  - L(w_mask_\type\()_tbl) + CONFIG_THUMB
317*c0909341SAndroid Build Coastguard Worker        .word 8f    - L(w_mask_\type\()_tbl) + CONFIG_THUMB
318*c0909341SAndroid Build Coastguard Worker        .word 4f    - L(w_mask_\type\()_tbl) + CONFIG_THUMB
319*c0909341SAndroid Build Coastguard Worker
320*c0909341SAndroid Build Coastguard Worker4:
321*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2, q3}, [r2, :128]! // tmp1 (four rows at once)
322*c0909341SAndroid Build Coastguard Worker        vld1.16         {q4, q5}, [r3, :128]! // tmp2 (four rows at once)
323*c0909341SAndroid Build Coastguard Worker        subs            r5,  r5,  #4
324*c0909341SAndroid Build Coastguard Worker        vdup.32         q13, r10       // PREP_BIAS*64
325*c0909341SAndroid Build Coastguard Worker        vabd.s16        q6,  q2,  q4   // abs(tmp1 - tmp2)
326*c0909341SAndroid Build Coastguard Worker        vabd.s16        q7,  q3,  q5
327*c0909341SAndroid Build Coastguard Worker        vsubl.s16       q8,  d8,  d4   // tmp2 - tmp1 (requires 17 bit)
328*c0909341SAndroid Build Coastguard Worker        vsubl.s16       q9,  d9,  d5
329*c0909341SAndroid Build Coastguard Worker        vsubl.s16       q10, d10, d6
330*c0909341SAndroid Build Coastguard Worker        vsubl.s16       q11, d11, d7
331*c0909341SAndroid Build Coastguard Worker        vqsub.u16       q6,  q0,  q6   // 27615 - abs()
332*c0909341SAndroid Build Coastguard Worker        vqsub.u16       q7,  q0,  q7
333*c0909341SAndroid Build Coastguard Worker        vshll.s16       q5,  d7,  #6   // tmp1 << 6
334*c0909341SAndroid Build Coastguard Worker        vshll.s16       q4,  d6,  #6
335*c0909341SAndroid Build Coastguard Worker        vshll.s16       q3,  d5,  #6
336*c0909341SAndroid Build Coastguard Worker        vshll.s16       q2,  d4,  #6
337*c0909341SAndroid Build Coastguard Worker        vshr.u16        q6,  q6,  #10  // 64-m = (27615 - abs()) >> mask_sh
338*c0909341SAndroid Build Coastguard Worker        vshr.u16        q7,  q7,  #10
339*c0909341SAndroid Build Coastguard Worker        vadd.i32        q2,  q2,  q13  // += PREP_BIAS*64
340*c0909341SAndroid Build Coastguard Worker        vadd.i32        q3,  q3,  q13
341*c0909341SAndroid Build Coastguard Worker        vadd.i32        q4,  q4,  q13
342*c0909341SAndroid Build Coastguard Worker        vadd.i32        q5,  q5,  q13
343*c0909341SAndroid Build Coastguard Worker        vmovl.u16       q12, d12
344*c0909341SAndroid Build Coastguard Worker        vmovl.u16       q13, d13
345*c0909341SAndroid Build Coastguard Worker        vmla.i32        q2,  q8,  q12  // (tmp2-tmp1)*(64-m)
346*c0909341SAndroid Build Coastguard Worker        vmovl.u16       q12, d14
347*c0909341SAndroid Build Coastguard Worker        vmla.i32        q3,  q9,  q13
348*c0909341SAndroid Build Coastguard Worker        vmovl.u16       q13, d15
349*c0909341SAndroid Build Coastguard Worker        vmla.i32        q4,  q10, q12
350*c0909341SAndroid Build Coastguard Worker        vmla.i32        q5,  q11, q13
351*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q2,  q2,  q14  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
352*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q3,  q3,  q14
353*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q4,  q4,  q14
354*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q5,  q5,  q14
355*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d4,  q2        // iclip_pixel
356*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d5,  q3
357*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d6,  q4
358*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d7,  q5
359*c0909341SAndroid Build Coastguard Worker        vmin.u16        q2,  q2,  q15  // iclip_pixel
360*c0909341SAndroid Build Coastguard Worker        vmin.u16        q3,  q3,  q15  // iclip_pixel
361*c0909341SAndroid Build Coastguard Worker.if \type == 444
362*c0909341SAndroid Build Coastguard Worker        vmovn.i16       d12, q6        // 64 - m
363*c0909341SAndroid Build Coastguard Worker        vmovn.i16       d13, q7
364*c0909341SAndroid Build Coastguard Worker        vsub.i16        q6,  q1,  q6   // m
365*c0909341SAndroid Build Coastguard Worker        vst1.8          {q6}, [r6, :128]!
366*c0909341SAndroid Build Coastguard Worker.elseif \type == 422
367*c0909341SAndroid Build Coastguard Worker        vpadd.i16       d12, d12, d13  // (64 - m) + (64 - n) (column wise addition)
368*c0909341SAndroid Build Coastguard Worker        vpadd.i16       d13, d14, d15
369*c0909341SAndroid Build Coastguard Worker        vmovn.i16       d12, q6
370*c0909341SAndroid Build Coastguard Worker        vhsub.u8        d12, d2,  d12  // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
371*c0909341SAndroid Build Coastguard Worker        vst1.8          {d12}, [r6, :64]!
372*c0909341SAndroid Build Coastguard Worker.elseif \type == 420
373*c0909341SAndroid Build Coastguard Worker        vadd.i16        d12, d12, d13  // (64 - my1) + (64 - my2) (row wise addition)
374*c0909341SAndroid Build Coastguard Worker        vadd.i16        d13, d14, d15
375*c0909341SAndroid Build Coastguard Worker        vpadd.i16       d12, d12, d13  // (128 - m) + (128 - n) (column wise addition)
376*c0909341SAndroid Build Coastguard Worker        vsub.i16        d12, d2,  d12  // (256 - sign) - ((128 - m) + (128 - n))
377*c0909341SAndroid Build Coastguard Worker        vrshrn.i16      d12, q6,  #2   // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
378*c0909341SAndroid Build Coastguard Worker        vst1.32         {d12[0]}, [r6, :32]!
379*c0909341SAndroid Build Coastguard Worker.endif
380*c0909341SAndroid Build Coastguard Worker        vst1.16         {d4}, [r0,  :64], r1
381*c0909341SAndroid Build Coastguard Worker        vst1.16         {d5}, [r12, :64], r1
382*c0909341SAndroid Build Coastguard Worker        vst1.16         {d6}, [r0,  :64], r1
383*c0909341SAndroid Build Coastguard Worker        vst1.16         {d7}, [r12, :64], r1
384*c0909341SAndroid Build Coastguard Worker        bgt             4b
385*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q7}
386*c0909341SAndroid Build Coastguard Worker        pop             {r4-r10,pc}
387*c0909341SAndroid Build Coastguard Worker8:
388*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2, q3}, [r2, :128]! // tmp1
389*c0909341SAndroid Build Coastguard Worker        vld1.16         {q4, q5}, [r3, :128]! // tmp2
390*c0909341SAndroid Build Coastguard Worker        subs            r5,  r5,  #2
391*c0909341SAndroid Build Coastguard Worker        vdup.32         q13, r10       // PREP_BIAS*64
392*c0909341SAndroid Build Coastguard Worker        vabd.s16        q6,  q2,  q4   // abs(tmp1 - tmp2)
393*c0909341SAndroid Build Coastguard Worker        vabd.s16        q7,  q3,  q5
394*c0909341SAndroid Build Coastguard Worker        vsubl.s16       q8,  d8,  d4   // tmp2 - tmp1 (requires 17 bit)
395*c0909341SAndroid Build Coastguard Worker        vsubl.s16       q9,  d9,  d5
396*c0909341SAndroid Build Coastguard Worker        vsubl.s16       q10, d10, d6
397*c0909341SAndroid Build Coastguard Worker        vsubl.s16       q11, d11, d7
398*c0909341SAndroid Build Coastguard Worker        vqsub.u16       q6,  q0,  q6   // 27615 - abs()
399*c0909341SAndroid Build Coastguard Worker        vqsub.u16       q7,  q0,  q7
400*c0909341SAndroid Build Coastguard Worker        vshll.s16       q5,  d7,  #6   // tmp1 << 6
401*c0909341SAndroid Build Coastguard Worker        vshll.s16       q4,  d6,  #6
402*c0909341SAndroid Build Coastguard Worker        vshll.s16       q3,  d5,  #6
403*c0909341SAndroid Build Coastguard Worker        vshll.s16       q2,  d4,  #6
404*c0909341SAndroid Build Coastguard Worker        vshr.u16        q6,  q6,  #10  // 64-m = (27615 - abs()) >> mask_sh
405*c0909341SAndroid Build Coastguard Worker        vshr.u16        q7,  q7,  #10
406*c0909341SAndroid Build Coastguard Worker        vadd.i32        q2,  q2,  q13  // += PREP_BIAS*64
407*c0909341SAndroid Build Coastguard Worker        vadd.i32        q3,  q3,  q13
408*c0909341SAndroid Build Coastguard Worker        vadd.i32        q4,  q4,  q13
409*c0909341SAndroid Build Coastguard Worker        vadd.i32        q5,  q5,  q13
410*c0909341SAndroid Build Coastguard Worker        vmovl.u16       q12, d12
411*c0909341SAndroid Build Coastguard Worker        vmovl.u16       q13, d13
412*c0909341SAndroid Build Coastguard Worker        vmla.i32        q2,  q8,  q12  // (tmp2-tmp1)*(64-m)
413*c0909341SAndroid Build Coastguard Worker        vmovl.u16       q12, d14
414*c0909341SAndroid Build Coastguard Worker        vmla.i32        q3,  q9,  q13
415*c0909341SAndroid Build Coastguard Worker        vmovl.u16       q13, d15
416*c0909341SAndroid Build Coastguard Worker        vmla.i32        q4,  q10, q12
417*c0909341SAndroid Build Coastguard Worker        vmla.i32        q5,  q11, q13
418*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q2,  q2,  q14  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
419*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q3,  q3,  q14
420*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q4,  q4,  q14
421*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q5,  q5,  q14
422*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d4,  q2        // iclip_pixel
423*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d5,  q3
424*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d6,  q4
425*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d7,  q5
426*c0909341SAndroid Build Coastguard Worker        vmin.u16        q2,  q2,  q15  // iclip_pixel
427*c0909341SAndroid Build Coastguard Worker        vmin.u16        q3,  q3,  q15  // iclip_pixel
428*c0909341SAndroid Build Coastguard Worker.if \type == 444
429*c0909341SAndroid Build Coastguard Worker        vmovn.i16       d12, q6        // 64 - m
430*c0909341SAndroid Build Coastguard Worker        vmovn.i16       d13, q7
431*c0909341SAndroid Build Coastguard Worker        vsub.i16        q6,  q1,  q6   // m
432*c0909341SAndroid Build Coastguard Worker        vst1.8          {q6}, [r6, :128]!
433*c0909341SAndroid Build Coastguard Worker.elseif \type == 422
434*c0909341SAndroid Build Coastguard Worker        vpadd.i16       d12, d12, d13  // (64 - m) + (64 - n) (column wise addition)
435*c0909341SAndroid Build Coastguard Worker        vpadd.i16       d13, d14, d15
436*c0909341SAndroid Build Coastguard Worker        vmovn.i16       d12, q6
437*c0909341SAndroid Build Coastguard Worker        vhsub.u8        d12, d2,  d12  // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
438*c0909341SAndroid Build Coastguard Worker        vst1.8          {d12}, [r6, :64]!
439*c0909341SAndroid Build Coastguard Worker.elseif \type == 420
440*c0909341SAndroid Build Coastguard Worker        vadd.i16        q6,  q6,  q7   // (64 - my1) + (64 - my2) (row wise addition)
441*c0909341SAndroid Build Coastguard Worker        vpadd.i16       d12, d12, d13  // (128 - m) + (128 - n) (column wise addition)
442*c0909341SAndroid Build Coastguard Worker        vsub.i16        d12, d2,  d12  // (256 - sign) - ((128 - m) + (128 - n))
443*c0909341SAndroid Build Coastguard Worker        vrshrn.i16      d12, q6,  #2   // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
444*c0909341SAndroid Build Coastguard Worker        vst1.32         {d12[0]}, [r6, :32]!
445*c0909341SAndroid Build Coastguard Worker.endif
446*c0909341SAndroid Build Coastguard Worker        vst1.16         {q2}, [r0,  :128], r1
447*c0909341SAndroid Build Coastguard Worker        vst1.16         {q3}, [r12, :128], r1
448*c0909341SAndroid Build Coastguard Worker        bgt             8b
449*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q7}
450*c0909341SAndroid Build Coastguard Worker        pop             {r4-r10,pc}
451*c0909341SAndroid Build Coastguard Worker1280:
452*c0909341SAndroid Build Coastguard Worker640:
453*c0909341SAndroid Build Coastguard Worker320:
454*c0909341SAndroid Build Coastguard Worker160:
455*c0909341SAndroid Build Coastguard Worker        sub             r1,  r1,  r4,  lsl #1
456*c0909341SAndroid Build Coastguard Worker.if \type == 444
457*c0909341SAndroid Build Coastguard Worker        add             lr,  r6,  r4
458*c0909341SAndroid Build Coastguard Worker.elseif \type == 422
459*c0909341SAndroid Build Coastguard Worker        add             lr,  r6,  r4,  lsr #1
460*c0909341SAndroid Build Coastguard Worker.endif
461*c0909341SAndroid Build Coastguard Worker        add             r7,  r2,  r4,  lsl #1
462*c0909341SAndroid Build Coastguard Worker        add             r9,  r3,  r4,  lsl #1
463*c0909341SAndroid Build Coastguard Worker161:
464*c0909341SAndroid Build Coastguard Worker        mov             r8,  r4
465*c0909341SAndroid Build Coastguard Worker16:
466*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2}, [r2, :128]! // tmp1
467*c0909341SAndroid Build Coastguard Worker        vld1.16         {q4}, [r3, :128]! // tmp2
468*c0909341SAndroid Build Coastguard Worker        vld1.16         {q3}, [r7, :128]!
469*c0909341SAndroid Build Coastguard Worker        vld1.16         {q5}, [r9, :128]!
470*c0909341SAndroid Build Coastguard Worker        subs            r8,  r8,  #8
471*c0909341SAndroid Build Coastguard Worker        vdup.32         q13, r10       // PREP_BIAS*64
472*c0909341SAndroid Build Coastguard Worker        vabd.s16        q6,  q2,  q4   // abs(tmp1 - tmp2)
473*c0909341SAndroid Build Coastguard Worker        vabd.s16        q7,  q3,  q5
474*c0909341SAndroid Build Coastguard Worker        vsubl.s16       q8,  d8,  d4   // tmp2 - tmp1 (requires 17 bit)
475*c0909341SAndroid Build Coastguard Worker        vsubl.s16       q9,  d9,  d5
476*c0909341SAndroid Build Coastguard Worker        vsubl.s16       q10, d10, d6
477*c0909341SAndroid Build Coastguard Worker        vsubl.s16       q11, d11, d7
478*c0909341SAndroid Build Coastguard Worker        vqsub.u16       q6,  q0,  q6   // 27615 - abs()
479*c0909341SAndroid Build Coastguard Worker        vqsub.u16       q7,  q0,  q7
480*c0909341SAndroid Build Coastguard Worker        vshll.s16       q5,  d7,  #6   // tmp1 << 6
481*c0909341SAndroid Build Coastguard Worker        vshll.s16       q4,  d6,  #6
482*c0909341SAndroid Build Coastguard Worker        vshll.s16       q3,  d5,  #6
483*c0909341SAndroid Build Coastguard Worker        vshll.s16       q2,  d4,  #6
484*c0909341SAndroid Build Coastguard Worker        vshr.u16        q6,  q6,  #10  // 64-m = (27615 - abs()) >> mask_sh
485*c0909341SAndroid Build Coastguard Worker        vshr.u16        q7,  q7,  #10
486*c0909341SAndroid Build Coastguard Worker        vadd.i32        q2,  q2,  q13  // += PREP_BIAS*64
487*c0909341SAndroid Build Coastguard Worker        vadd.i32        q3,  q3,  q13
488*c0909341SAndroid Build Coastguard Worker        vadd.i32        q4,  q4,  q13
489*c0909341SAndroid Build Coastguard Worker        vadd.i32        q5,  q5,  q13
490*c0909341SAndroid Build Coastguard Worker        vmovl.u16       q12, d12
491*c0909341SAndroid Build Coastguard Worker        vmovl.u16       q13, d13
492*c0909341SAndroid Build Coastguard Worker        vmla.i32        q2,  q8,  q12  // (tmp2-tmp1)*(64-m)
493*c0909341SAndroid Build Coastguard Worker        vmovl.u16       q12, d14
494*c0909341SAndroid Build Coastguard Worker        vmla.i32        q3,  q9,  q13
495*c0909341SAndroid Build Coastguard Worker        vmovl.u16       q13, d15
496*c0909341SAndroid Build Coastguard Worker        vmla.i32        q4,  q10, q12
497*c0909341SAndroid Build Coastguard Worker        vmla.i32        q5,  q11, q13
498*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q2,  q2,  q14  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
499*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q3,  q3,  q14
500*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q4,  q4,  q14
501*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q5,  q5,  q14
502*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d4,  q2        // iclip_pixel
503*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d5,  q3
504*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d6,  q4
505*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d7,  q5
506*c0909341SAndroid Build Coastguard Worker        vmin.u16        q2,  q2,  q15  // iclip_pixel
507*c0909341SAndroid Build Coastguard Worker        vmin.u16        q3,  q3,  q15  // iclip_pixel
508*c0909341SAndroid Build Coastguard Worker.if \type == 444
509*c0909341SAndroid Build Coastguard Worker        vmovn.i16       d12, q6        // 64 - m
510*c0909341SAndroid Build Coastguard Worker        vmovn.i16       d13, q7
511*c0909341SAndroid Build Coastguard Worker        vsub.i16        q6,  q1,  q6   // m
512*c0909341SAndroid Build Coastguard Worker        vst1.8          {d12}, [r6, :64]!
513*c0909341SAndroid Build Coastguard Worker        vst1.8          {d13}, [lr, :64]!
514*c0909341SAndroid Build Coastguard Worker.elseif \type == 422
515*c0909341SAndroid Build Coastguard Worker        vpadd.i16       d12, d12, d13  // (64 - m) + (64 - n) (column wise addition)
516*c0909341SAndroid Build Coastguard Worker        vpadd.i16       d13, d14, d15
517*c0909341SAndroid Build Coastguard Worker        vmovn.i16       d12, q6
518*c0909341SAndroid Build Coastguard Worker        vhsub.u8        d12, d2,  d12  // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
519*c0909341SAndroid Build Coastguard Worker        vst1.32         {d12[0]}, [r6, :32]!
520*c0909341SAndroid Build Coastguard Worker        vst1.32         {d12[1]}, [lr, :32]!
521*c0909341SAndroid Build Coastguard Worker.elseif \type == 420
522*c0909341SAndroid Build Coastguard Worker        vadd.i16        q6,  q6,  q7   // (64 - my1) + (64 - my2) (row wise addition)
523*c0909341SAndroid Build Coastguard Worker        vpadd.i16       d12, d12, d13  // (128 - m) + (128 - n) (column wise addition)
524*c0909341SAndroid Build Coastguard Worker        vsub.i16        d12, d2,  d12  // (256 - sign) - ((128 - m) + (128 - n))
525*c0909341SAndroid Build Coastguard Worker        vrshrn.i16      d12, q6,  #2   // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
526*c0909341SAndroid Build Coastguard Worker        vst1.32         {d12[0]}, [r6, :32]!
527*c0909341SAndroid Build Coastguard Worker.endif
528*c0909341SAndroid Build Coastguard Worker        vst1.16         {q2}, [r0,  :128]!
529*c0909341SAndroid Build Coastguard Worker        vst1.16         {q3}, [r12, :128]!
530*c0909341SAndroid Build Coastguard Worker        bgt             16b
531*c0909341SAndroid Build Coastguard Worker        subs            r5,  r5,  #2
532*c0909341SAndroid Build Coastguard Worker        add             r2,  r2,  r4,  lsl #1
533*c0909341SAndroid Build Coastguard Worker        add             r3,  r3,  r4,  lsl #1
534*c0909341SAndroid Build Coastguard Worker        add             r7,  r7,  r4,  lsl #1
535*c0909341SAndroid Build Coastguard Worker        add             r9,  r9,  r4,  lsl #1
536*c0909341SAndroid Build Coastguard Worker.if \type == 444
537*c0909341SAndroid Build Coastguard Worker        add             r6,  r6,  r4
538*c0909341SAndroid Build Coastguard Worker        add             lr,  lr,  r4
539*c0909341SAndroid Build Coastguard Worker.elseif \type == 422
540*c0909341SAndroid Build Coastguard Worker        add             r6,  r6,  r4,  lsr #1
541*c0909341SAndroid Build Coastguard Worker        add             lr,  lr,  r4,  lsr #1
542*c0909341SAndroid Build Coastguard Worker.endif
543*c0909341SAndroid Build Coastguard Worker        add             r0,  r0,  r1
544*c0909341SAndroid Build Coastguard Worker        add             r12, r12, r1
545*c0909341SAndroid Build Coastguard Worker        bgt             161b
546*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q7}
547*c0909341SAndroid Build Coastguard Worker        pop             {r4-r10,pc}
548*c0909341SAndroid Build Coastguard Workerendfunc
549*c0909341SAndroid Build Coastguard Worker.endm
550*c0909341SAndroid Build Coastguard Worker
// Instantiate the w_mask_fn macro once for each \type value that its
// .if/.elseif chains handle. In the macro body the variants differ in how the
// mask is stored: 444 stores one mask byte per pixel, 422 one byte per
// horizontal pair of pixels, 420 one byte per 2x2 group of pixels.
551*c0909341SAndroid Build Coastguard Workerw_mask_fn 444
552*c0909341SAndroid Build Coastguard Workerw_mask_fn 422
553*c0909341SAndroid Build Coastguard Workerw_mask_fn 420
554*c0909341SAndroid Build Coastguard Worker
// blend_16bpc_neon: blend 16-bit pixels read through r2 into the pixels at r0,
// in place, using a per-pixel byte mask m read through r5:
//   dst = a + (((a - b) * -m + 32) >> 6)
// where a is the pixel at r0 and b the pixel at r2. Ignoring 16-bit
// wrap/saturation this equals (a * (64 - m) + b * m + 32) >> 6.
//
// Registers as used below:
//   r0  = dst pointer (loaded from and stored to)
//   r1  = dst stride in bytes (post-increment amount; doubled in the paths
//         that process two rows per iteration)
//   r2  = pointer to the b pixels, read contiguously
//   r3  = pixels per row; only used to select the code path (32, 16, 8 or 4)
//   r4  = row counter (first stack argument), counted down to zero
//   r5  = mask pointer (second stack argument), one byte per pixel, read
//         contiguously
//   r12 = second dst pointer (the next row, or the second half of the row in
//         the 320 path)
// NOTE(review): the matching C prototype is not visible in this part of the
// file; confirm the argument order against the caller.
555*c0909341SAndroid Build Coastguard Workerfunction blend_16bpc_neon, export=1
        // Three registers (12 bytes) are pushed, so the first two stack
        // arguments are found at sp+12 and sp+16.
556*c0909341SAndroid Build Coastguard Worker        push            {r4-r5,lr}
557*c0909341SAndroid Build Coastguard Worker        ldrd            r4,  r5,  [sp, #12]
        // Table index = clz(r3) - 26, so r3 = 32, 16, 8, 4 select entries
        // 0, 1, 2, 3. r3 is reused as the table base / branch target.
558*c0909341SAndroid Build Coastguard Worker        clz             lr,  r3
559*c0909341SAndroid Build Coastguard Worker        adr             r3,  L(blend_tbl)
560*c0909341SAndroid Build Coastguard Worker        sub             lr,  lr,  #26
561*c0909341SAndroid Build Coastguard Worker        ldr             lr,  [r3,  lr,  lsl #2]
562*c0909341SAndroid Build Coastguard Worker        add             r3,  r3,  lr
563*c0909341SAndroid Build Coastguard Worker        bx              r3
564*c0909341SAndroid Build Coastguard Worker
565*c0909341SAndroid Build Coastguard Worker        .align 2
        // Each entry is the offset of a code path relative to the table.
        // NOTE(review): CONFIG_THUMB is defined elsewhere; presumably it sets
        // the low address bit so that bx stays in Thumb state - confirm.
566*c0909341SAndroid Build Coastguard WorkerL(blend_tbl):
567*c0909341SAndroid Build Coastguard Worker        .word 320f - L(blend_tbl) + CONFIG_THUMB
568*c0909341SAndroid Build Coastguard Worker        .word 160f - L(blend_tbl) + CONFIG_THUMB
569*c0909341SAndroid Build Coastguard Worker        .word 80f  - L(blend_tbl) + CONFIG_THUMB
570*c0909341SAndroid Build Coastguard Worker        .word 40f  - L(blend_tbl) + CONFIG_THUMB
571*c0909341SAndroid Build Coastguard Worker
        // 4 pixels per row, two rows per iteration: r0 walks one row, r12 the
        // row below it, and r1 becomes twice the stride.
572*c0909341SAndroid Build Coastguard Worker40:
573*c0909341SAndroid Build Coastguard Worker        add             r12, r0,  r1
574*c0909341SAndroid Build Coastguard Worker        lsl             r1,  r1,  #1
575*c0909341SAndroid Build Coastguard Worker4:
        // d4 = 8 mask bytes (2 rows x 4), q1 = 8 b pixels (2 rows x 4),
        // d0/d1 = the two dst rows.
576*c0909341SAndroid Build Coastguard Worker        vld1.8          {d4}, [r5, :64]!
577*c0909341SAndroid Build Coastguard Worker        vld1.16         {q1}, [r2, :128]!
578*c0909341SAndroid Build Coastguard Worker        vld1.16         {d0}, [r0, :64]
579*c0909341SAndroid Build Coastguard Worker        vneg.s8         d4,  d4       // -m
580*c0909341SAndroid Build Coastguard Worker        subs            r4,  r4,  #2
581*c0909341SAndroid Build Coastguard Worker        vld1.16         {d1}, [r12, :64]
        // Sign-extend -m to 16 bit so it can be used as a multiplier.
582*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q2,  d4
583*c0909341SAndroid Build Coastguard Worker        vshl.i16        q2,  q2,  #9  // -m << 9
584*c0909341SAndroid Build Coastguard Worker        vsub.i16        q1,  q0,  q1  // a - b
        // vqrdmulh computes (2*x*y + 0x8000) >> 16; with y = -m << 9 this is
        // the rounded shift by 6 given in the comment.
585*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q1,  q1,  q2  // ((a-b)*-m + 32) >> 6
586*c0909341SAndroid Build Coastguard Worker        vadd.i16        q0,  q0,  q1
587*c0909341SAndroid Build Coastguard Worker        vst1.16         {d0}, [r0,  :64], r1
588*c0909341SAndroid Build Coastguard Worker        vst1.16         {d1}, [r12, :64], r1
        // Loop while rows remain; the flags still come from the subs above.
589*c0909341SAndroid Build Coastguard Worker        bgt             4b
        // Restore r4-r5 and return by popping the saved lr into pc.
590*c0909341SAndroid Build Coastguard Worker        pop             {r4-r5,pc}
        // 8 pixels per row, two rows per iteration (r0 / r12, r1 = 2 * stride).
591*c0909341SAndroid Build Coastguard Worker80:
592*c0909341SAndroid Build Coastguard Worker        add             r12, r0,  r1
593*c0909341SAndroid Build Coastguard Worker        lsl             r1,  r1,  #1
594*c0909341SAndroid Build Coastguard Worker8:
        // q8 = 16 mask bytes (2 rows x 8), q2/q3 = b pixels for the two rows,
        // q0/q1 = the two dst rows.
595*c0909341SAndroid Build Coastguard Worker        vld1.8          {q8},     [r5, :128]!
596*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2, q3}, [r2, :128]!
597*c0909341SAndroid Build Coastguard Worker        vneg.s8         q9,  q8       // -m
598*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0},     [r0,  :128]
599*c0909341SAndroid Build Coastguard Worker        vld1.16         {q1},     [r12, :128]
        // Widen -m: d18 (first row) into q8, d19 (second row) into q9.
600*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q8,  d18
601*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q9,  d19
602*c0909341SAndroid Build Coastguard Worker        vshl.i16        q8,  q8,  #9  // -m << 9
603*c0909341SAndroid Build Coastguard Worker        vshl.i16        q9,  q9,  #9
604*c0909341SAndroid Build Coastguard Worker        vsub.i16        q2,  q0,  q2  // a - b
605*c0909341SAndroid Build Coastguard Worker        vsub.i16        q3,  q1,  q3
606*c0909341SAndroid Build Coastguard Worker        subs            r4,  r4,  #2
607*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q2,  q2,  q8  // ((a-b)*-m + 32) >> 6
608*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q3,  q3,  q9
609*c0909341SAndroid Build Coastguard Worker        vadd.i16        q0,  q0,  q2
610*c0909341SAndroid Build Coastguard Worker        vadd.i16        q1,  q1,  q3
611*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0}, [r0,  :128], r1
612*c0909341SAndroid Build Coastguard Worker        vst1.16         {q1}, [r12, :128], r1
        // Loop while rows remain; the flags still come from the subs above.
613*c0909341SAndroid Build Coastguard Worker        bgt             8b
614*c0909341SAndroid Build Coastguard Worker        pop             {r4-r5,pc}
        // 16 pixels per row, two rows per iteration (r0 / r12, r1 = 2 * stride).
615*c0909341SAndroid Build Coastguard Worker160:
616*c0909341SAndroid Build Coastguard Worker        add             r12, r0,  r1
617*c0909341SAndroid Build Coastguard Worker        lsl             r1,  r1,  #1
618*c0909341SAndroid Build Coastguard Worker16:
        // q12/q13 = 32 mask bytes (2 rows x 16); q8/q9 and q10/q11 = b pixels
        // for the first and second row; q0/q1 and q2/q3 = the two dst rows.
619*c0909341SAndroid Build Coastguard Worker        vld1.8          {q12, q13}, [r5, :128]!
620*c0909341SAndroid Build Coastguard Worker        vld1.16         {q8,  q9},  [r2, :128]!
621*c0909341SAndroid Build Coastguard Worker        subs            r4,  r4,  #2
622*c0909341SAndroid Build Coastguard Worker        vneg.s8         q14, q12      // -m
623*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0,  q1},  [r0, :128]
624*c0909341SAndroid Build Coastguard Worker        vneg.s8         q15, q13
625*c0909341SAndroid Build Coastguard Worker        vld1.16         {q10, q11}, [r2, :128]!
        // Widen the 32 negated mask bytes (d28-d31) into q12-q15.
626*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q12, d28
627*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q13, d29
628*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q14, d30
629*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q15, d31
630*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2,  q3},  [r12, :128]
631*c0909341SAndroid Build Coastguard Worker        vshl.i16        q12, q12, #9  // -m << 9
632*c0909341SAndroid Build Coastguard Worker        vshl.i16        q13, q13, #9
633*c0909341SAndroid Build Coastguard Worker        vshl.i16        q14, q14, #9
634*c0909341SAndroid Build Coastguard Worker        vshl.i16        q15, q15, #9
635*c0909341SAndroid Build Coastguard Worker        vsub.i16        q8,  q0,  q8  // a - b
636*c0909341SAndroid Build Coastguard Worker        vsub.i16        q9,  q1,  q9
637*c0909341SAndroid Build Coastguard Worker        vsub.i16        q10, q2,  q10
638*c0909341SAndroid Build Coastguard Worker        vsub.i16        q11, q3,  q11
639*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q8,  q8,  q12 // ((a-b)*-m + 32) >> 6
640*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q9,  q9,  q13
641*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q10, q10, q14
642*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q11, q11, q15
643*c0909341SAndroid Build Coastguard Worker        vadd.i16        q0,  q0,  q8
644*c0909341SAndroid Build Coastguard Worker        vadd.i16        q1,  q1,  q9
645*c0909341SAndroid Build Coastguard Worker        vadd.i16        q2,  q2,  q10
        // The first row is stored before the last add of the second row.
646*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0, q1}, [r0,  :128], r1
647*c0909341SAndroid Build Coastguard Worker        vadd.i16        q3,  q3,  q11
648*c0909341SAndroid Build Coastguard Worker        vst1.16         {q2, q3}, [r12, :128], r1
        // Loop while rows remain; the flags still come from the subs above.
649*c0909341SAndroid Build Coastguard Worker        bgt             16b
650*c0909341SAndroid Build Coastguard Worker        pop             {r4-r5,pc}
        // 32 pixels per row, one row per iteration: r12 = r0 + 32 bytes points
        // at pixels 16..31 of the same row, and r1 stays the plain stride.
651*c0909341SAndroid Build Coastguard Worker320:
652*c0909341SAndroid Build Coastguard Worker        add             r12, r0,  #32
653*c0909341SAndroid Build Coastguard Worker32:
        // q12/q13 = 32 mask bytes for this row; q8/q9 and q10/q11 = b pixels
        // for the first and second half; q0/q1 and q2/q3 = the dst halves.
654*c0909341SAndroid Build Coastguard Worker        vld1.8          {q12, q13}, [r5, :128]!
655*c0909341SAndroid Build Coastguard Worker        vld1.16         {q8,  q9},  [r2, :128]!
656*c0909341SAndroid Build Coastguard Worker        subs            r4,  r4,  #1
657*c0909341SAndroid Build Coastguard Worker        vneg.s8         q14, q12      // -m
658*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0,  q1},  [r0, :128]
659*c0909341SAndroid Build Coastguard Worker        vneg.s8         q15, q13
660*c0909341SAndroid Build Coastguard Worker        vld1.16         {q10, q11}, [r2, :128]!
        // Widen the 32 negated mask bytes (d28-d31) into q12-q15.
661*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q12, d28
662*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q13, d29
663*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q14, d30
664*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q15, d31
665*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2,  q3},  [r12, :128]
666*c0909341SAndroid Build Coastguard Worker        vshl.i16        q12, q12, #9  // -m << 9
667*c0909341SAndroid Build Coastguard Worker        vshl.i16        q13, q13, #9
668*c0909341SAndroid Build Coastguard Worker        vshl.i16        q14, q14, #9
669*c0909341SAndroid Build Coastguard Worker        vshl.i16        q15, q15, #9
670*c0909341SAndroid Build Coastguard Worker        vsub.i16        q8,  q0,  q8  // a - b
671*c0909341SAndroid Build Coastguard Worker        vsub.i16        q9,  q1,  q9
672*c0909341SAndroid Build Coastguard Worker        vsub.i16        q10, q2,  q10
673*c0909341SAndroid Build Coastguard Worker        vsub.i16        q11, q3,  q11
674*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q8,  q8,  q12 // ((a-b)*-m + 32) >> 6
675*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q9,  q9,  q13
676*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q10, q10, q14
677*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q11, q11, q15
678*c0909341SAndroid Build Coastguard Worker        vadd.i16        q0,  q0,  q8
679*c0909341SAndroid Build Coastguard Worker        vadd.i16        q1,  q1,  q9
680*c0909341SAndroid Build Coastguard Worker        vadd.i16        q2,  q2,  q10
        // The first half is stored before the last add of the second half.
681*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0, q1}, [r0,  :128], r1
682*c0909341SAndroid Build Coastguard Worker        vadd.i16        q3,  q3,  q11
683*c0909341SAndroid Build Coastguard Worker        vst1.16         {q2, q3}, [r12, :128], r1
        // Loop while rows remain; the flags still come from the subs above.
684*c0909341SAndroid Build Coastguard Worker        bgt             32b
685*c0909341SAndroid Build Coastguard Worker        pop             {r4-r5,pc}
686*c0909341SAndroid Build Coastguard Workerendfunc
687*c0909341SAndroid Build Coastguard Worker
688*c0909341SAndroid Build Coastguard Workerfunction blend_h_16bpc_neon, export=1
689*c0909341SAndroid Build Coastguard Worker        push            {r4-r5,lr}
690*c0909341SAndroid Build Coastguard Worker        ldr             r4,  [sp, #12]
691*c0909341SAndroid Build Coastguard Worker        movrel          r5,  X(obmc_masks)
692*c0909341SAndroid Build Coastguard Worker        add             r5,  r5,  r4
693*c0909341SAndroid Build Coastguard Worker        sub             r4,  r4,  r4,  lsr #2
694*c0909341SAndroid Build Coastguard Worker        clz             lr,  r3
695*c0909341SAndroid Build Coastguard Worker        adr             r12, L(blend_h_tbl)
696*c0909341SAndroid Build Coastguard Worker        sub             lr,  lr,  #24
697*c0909341SAndroid Build Coastguard Worker        ldr             lr,  [r12, lr,  lsl #2]
698*c0909341SAndroid Build Coastguard Worker        add             r12, r12, lr
699*c0909341SAndroid Build Coastguard Worker        bx              r12
700*c0909341SAndroid Build Coastguard Worker
701*c0909341SAndroid Build Coastguard Worker        .align 2
702*c0909341SAndroid Build Coastguard WorkerL(blend_h_tbl):
703*c0909341SAndroid Build Coastguard Worker        .word 1280f - L(blend_h_tbl) + CONFIG_THUMB
704*c0909341SAndroid Build Coastguard Worker        .word 640f  - L(blend_h_tbl) + CONFIG_THUMB
705*c0909341SAndroid Build Coastguard Worker        .word 320f  - L(blend_h_tbl) + CONFIG_THUMB
706*c0909341SAndroid Build Coastguard Worker        .word 160f  - L(blend_h_tbl) + CONFIG_THUMB
707*c0909341SAndroid Build Coastguard Worker        .word 80f   - L(blend_h_tbl) + CONFIG_THUMB
708*c0909341SAndroid Build Coastguard Worker        .word 40f   - L(blend_h_tbl) + CONFIG_THUMB
709*c0909341SAndroid Build Coastguard Worker        .word 20f   - L(blend_h_tbl) + CONFIG_THUMB
710*c0909341SAndroid Build Coastguard Worker
711*c0909341SAndroid Build Coastguard Worker20:
712*c0909341SAndroid Build Coastguard Worker        add             r12, r0,  r1
713*c0909341SAndroid Build Coastguard Worker        lsl             r1,  r1,  #1
714*c0909341SAndroid Build Coastguard Worker2:
715*c0909341SAndroid Build Coastguard Worker        vld2.8          {d4[], d5[]}, [r5, :16]!
716*c0909341SAndroid Build Coastguard Worker        vld1.16         {d2},         [r2, :64]!
717*c0909341SAndroid Build Coastguard Worker        vext.8          d4,  d4,  d5,  #6
718*c0909341SAndroid Build Coastguard Worker        subs            r4,  r4,  #2
719*c0909341SAndroid Build Coastguard Worker        vneg.s8         d4,  d4       // -m
720*c0909341SAndroid Build Coastguard Worker        vld1.32         {d0[]},  [r0, :32]
721*c0909341SAndroid Build Coastguard Worker        vld1.32         {d0[1]}, [r12, :32]
722*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q2,  d4
723*c0909341SAndroid Build Coastguard Worker        vshl.i16        d4,  d4,  #9  // -m << 9
724*c0909341SAndroid Build Coastguard Worker        vsub.i16        d2,  d0,  d2  // a - b
725*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    d2,  d2,  d4  // ((a-b)*-m + 32) >> 6
726*c0909341SAndroid Build Coastguard Worker        vadd.i16        d0,  d0,  d2
727*c0909341SAndroid Build Coastguard Worker        vst1.32         {d0[0]}, [r0,  :32], r1
728*c0909341SAndroid Build Coastguard Worker        vst1.32         {d0[1]}, [r12, :32], r1
729*c0909341SAndroid Build Coastguard Worker        bgt             2b
730*c0909341SAndroid Build Coastguard Worker        pop             {r4-r5,pc}
731*c0909341SAndroid Build Coastguard Worker40:
732*c0909341SAndroid Build Coastguard Worker        add             r12, r0,  r1
733*c0909341SAndroid Build Coastguard Worker        lsl             r1,  r1,  #1
734*c0909341SAndroid Build Coastguard Worker4:
735*c0909341SAndroid Build Coastguard Worker        vld2.8          {d4[], d5[]}, [r5, :16]!
736*c0909341SAndroid Build Coastguard Worker        vld1.16         {q1},         [r2, :128]!
737*c0909341SAndroid Build Coastguard Worker        vext.8          d4,  d4,  d5,  #4
738*c0909341SAndroid Build Coastguard Worker        subs            r4,  r4,  #2
739*c0909341SAndroid Build Coastguard Worker        vneg.s8         d4,  d4       // -m
740*c0909341SAndroid Build Coastguard Worker        vld1.16         {d0}, [r0,  :64]
741*c0909341SAndroid Build Coastguard Worker        vld1.16         {d1}, [r12, :64]
742*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q2,  d4
743*c0909341SAndroid Build Coastguard Worker        vshl.i16        q2,  q2,  #9  // -m << 9
744*c0909341SAndroid Build Coastguard Worker        vsub.i16        q1,  q0,  q1  // a - b
745*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q1,  q1,  q2  // ((a-b)*-m + 32) >> 6
746*c0909341SAndroid Build Coastguard Worker        vadd.i16        q0,  q0,  q1
747*c0909341SAndroid Build Coastguard Worker        vst1.16         {d0}, [r0,  :64], r1
748*c0909341SAndroid Build Coastguard Worker        vst1.16         {d1}, [r12, :64], r1
749*c0909341SAndroid Build Coastguard Worker        bgt             4b
750*c0909341SAndroid Build Coastguard Worker        pop             {r4-r5,pc}
751*c0909341SAndroid Build Coastguard Worker80:
752*c0909341SAndroid Build Coastguard Worker        add             r12, r0,  r1
753*c0909341SAndroid Build Coastguard Worker        lsl             r1,  r1,  #1
754*c0909341SAndroid Build Coastguard Worker8:
755*c0909341SAndroid Build Coastguard Worker        vld2.8          {d16[], d17[]}, [r5, :16]!
756*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2, q3},       [r2, :128]!
757*c0909341SAndroid Build Coastguard Worker        vneg.s8         q9,  q8      // -m
758*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0}, [r0, :128]
759*c0909341SAndroid Build Coastguard Worker        subs            r4,  r4,  #2
760*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q8,  d18
761*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q9,  d19
762*c0909341SAndroid Build Coastguard Worker        vld1.16         {q1}, [r12, :128]
763*c0909341SAndroid Build Coastguard Worker        vshl.i16        q8,  q8,  #9  // -m << 9
764*c0909341SAndroid Build Coastguard Worker        vshl.i16        q9,  q9,  #9
765*c0909341SAndroid Build Coastguard Worker        vsub.i16        q2,  q0,  q2  // a - b
766*c0909341SAndroid Build Coastguard Worker        vsub.i16        q3,  q1,  q3
767*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q2,  q2,  q8  // ((a-b)*-m + 32) >> 6
768*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q3,  q3,  q9
769*c0909341SAndroid Build Coastguard Worker        vadd.i16        q0,  q0,  q2
770*c0909341SAndroid Build Coastguard Worker        vadd.i16        q1,  q1,  q3
771*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0}, [r0,  :128], r1
772*c0909341SAndroid Build Coastguard Worker        vst1.16         {q1}, [r12, :128], r1
773*c0909341SAndroid Build Coastguard Worker        bgt             8b
774*c0909341SAndroid Build Coastguard Worker        pop             {r4-r5,pc}
775*c0909341SAndroid Build Coastguard Worker160:
776*c0909341SAndroid Build Coastguard Worker        add             r12, r0,  r1
777*c0909341SAndroid Build Coastguard Worker        lsl             r1,  r1,  #1
778*c0909341SAndroid Build Coastguard Worker16:
779*c0909341SAndroid Build Coastguard Worker        vld2.8          {d24[], d25[]}, [r5, :16]!
780*c0909341SAndroid Build Coastguard Worker        vld1.16         {q8,  q9},  [r2, :128]!
781*c0909341SAndroid Build Coastguard Worker        subs            r4,  r4,  #2
782*c0909341SAndroid Build Coastguard Worker        vneg.s8         q13, q12      // -m
783*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0,  q1},  [r0, :128]
784*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q12, d26
785*c0909341SAndroid Build Coastguard Worker        vld1.16         {q10, q11}, [r2, :128]!
786*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q13, d27
787*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2,  q3},  [r12, :128]
788*c0909341SAndroid Build Coastguard Worker        vshl.i16        q12, q12, #9  // -m << 9
789*c0909341SAndroid Build Coastguard Worker        vshl.i16        q13, q13, #9
790*c0909341SAndroid Build Coastguard Worker        vsub.i16        q8,  q0,  q8  // a - b
791*c0909341SAndroid Build Coastguard Worker        vsub.i16        q9,  q1,  q9
792*c0909341SAndroid Build Coastguard Worker        vsub.i16        q10, q2,  q10
793*c0909341SAndroid Build Coastguard Worker        vsub.i16        q11, q3,  q11
794*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q8,  q8,  q12 // ((a-b)*-m + 32) >> 6
795*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q9,  q9,  q12
796*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q10, q10, q13
797*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q11, q11, q13
798*c0909341SAndroid Build Coastguard Worker        vadd.i16        q0,  q0,  q8
799*c0909341SAndroid Build Coastguard Worker        vadd.i16        q1,  q1,  q9
800*c0909341SAndroid Build Coastguard Worker        vadd.i16        q2,  q2,  q10
801*c0909341SAndroid Build Coastguard Worker        vadd.i16        q3,  q3,  q11
802*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0, q1}, [r0,  :128], r1
803*c0909341SAndroid Build Coastguard Worker        vst1.16         {q2, q3}, [r12, :128], r1
804*c0909341SAndroid Build Coastguard Worker        bgt             16b
805*c0909341SAndroid Build Coastguard Worker        pop             {r4-r5,pc}
806*c0909341SAndroid Build Coastguard Worker1280:
807*c0909341SAndroid Build Coastguard Worker640:
808*c0909341SAndroid Build Coastguard Worker320:
809*c0909341SAndroid Build Coastguard Worker        sub             r1,  r1,  r3,  lsl #1
810*c0909341SAndroid Build Coastguard Worker321:
811*c0909341SAndroid Build Coastguard Worker        vld1.8          {d24[]}, [r5]!
812*c0909341SAndroid Build Coastguard Worker        mov             r12, r3
813*c0909341SAndroid Build Coastguard Worker        vneg.s8         d24, d24      // -m
814*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q12, d24
815*c0909341SAndroid Build Coastguard Worker        vshl.i16        q12, q12, #9  // -m << 9
816*c0909341SAndroid Build Coastguard Worker32:
817*c0909341SAndroid Build Coastguard Worker        vld1.16         {q8,  q9},  [r2, :128]!
818*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0,  q1},  [r0, :128]!
819*c0909341SAndroid Build Coastguard Worker        subs            r12, r12, #32
820*c0909341SAndroid Build Coastguard Worker        vld1.16         {q10, q11}, [r2, :128]!
821*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2,  q3},  [r0, :128]
822*c0909341SAndroid Build Coastguard Worker        vsub.i16        q8,  q0,  q8  // a - b
823*c0909341SAndroid Build Coastguard Worker        vsub.i16        q9,  q1,  q9
824*c0909341SAndroid Build Coastguard Worker        vsub.i16        q10, q2,  q10
825*c0909341SAndroid Build Coastguard Worker        vsub.i16        q11, q3,  q11
826*c0909341SAndroid Build Coastguard Worker        sub             r0,  r0,  #32
827*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q8,  q8,  q12 // ((a-b)*-m + 32) >> 6
828*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q9,  q9,  q12
829*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q10, q10, q12
830*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q11, q11, q12
831*c0909341SAndroid Build Coastguard Worker        vadd.i16        q0,  q0,  q8
832*c0909341SAndroid Build Coastguard Worker        vadd.i16        q1,  q1,  q9
833*c0909341SAndroid Build Coastguard Worker        vadd.i16        q2,  q2,  q10
834*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0, q1}, [r0, :128]!
835*c0909341SAndroid Build Coastguard Worker        vadd.i16        q3,  q3,  q11
836*c0909341SAndroid Build Coastguard Worker        vst1.16         {q2, q3}, [r0, :128]!
837*c0909341SAndroid Build Coastguard Worker        bgt             32b
838*c0909341SAndroid Build Coastguard Worker        subs            r4,  r4,  #1
839*c0909341SAndroid Build Coastguard Worker        add             r0,  r0,  r1
840*c0909341SAndroid Build Coastguard Worker        bgt             321b
841*c0909341SAndroid Build Coastguard Worker        pop             {r4-r5,pc}
842*c0909341SAndroid Build Coastguard Workerendfunc
843*c0909341SAndroid Build Coastguard Worker
// blend_v_16bpc_neon: blend a block of 16-bit pixels in place with a second
// source, weighting each column with a byte taken from obmc_masks.
//
// Register use, as established by the code below:
//   r0  - destination pixels "a" (loaded, blended and stored back in place)
//   r1  - destination stride in bytes
//   r2  - second source "b", read sequentially, w pixels per row
//   r3  - width w; selects the mask entries (obmc_masks + w) and the code
//         path (clz(w) - 26 indexes the jump table, so w is 32, 16, 8, 4 or 2)
//   first stack argument - loaded into r4 and used as the row counter
//         (presumably the block height h - confirm against the C prototype)
//
// Per pixel the result is a + (((a - b) * -m + 32) >> 6), where m is the mask
// byte of that column.  The rounding shift is obtained from vqrdmulh by
// pre-scaling -m with << 9.
// Only the first 3/4 of each row is stored (1, 3, 6, 12 or 24 pixels for
// w = 2, 4, 8, 16, 32), while r2 still advances by w pixels per row.
// All paths except w = 32 handle two rows per iteration (r4 -= 2).
844*c0909341SAndroid Build Coastguard Workerfunction blend_v_16bpc_neon, export=1
845*c0909341SAndroid Build Coastguard Worker        push            {r4,lr}
846*c0909341SAndroid Build Coastguard Worker        ldr             r4,  [sp, #8]  // first stack argument (8 bytes were just pushed)
847*c0909341SAndroid Build Coastguard Worker        movrel          lr,  X(obmc_masks)
848*c0909341SAndroid Build Coastguard Worker        add             lr,  lr,  r3  // lr = obmc_masks + w
849*c0909341SAndroid Build Coastguard Worker        clz             r12, r3
850*c0909341SAndroid Build Coastguard Worker        adr             r3,  L(blend_v_tbl)
851*c0909341SAndroid Build Coastguard Worker        sub             r12, r12, #26  // clz(32) = 26, so w = 32 maps to index 0
852*c0909341SAndroid Build Coastguard Worker        ldr             r12, [r3,  r12, lsl #2]
853*c0909341SAndroid Build Coastguard Worker        add             r3,  r3,  r12  // table base + stored offset = branch target
854*c0909341SAndroid Build Coastguard Worker        bx              r3
855*c0909341SAndroid Build Coastguard Worker
856*c0909341SAndroid Build Coastguard Worker        .align 2
        // Each entry is the offset of a width-specific path from the table
        // itself, plus CONFIG_THUMB; entries are ordered from w = 32 to w = 2.
857*c0909341SAndroid Build Coastguard WorkerL(blend_v_tbl):
858*c0909341SAndroid Build Coastguard Worker        .word 320f - L(blend_v_tbl) + CONFIG_THUMB  // w = 32
859*c0909341SAndroid Build Coastguard Worker        .word 160f - L(blend_v_tbl) + CONFIG_THUMB  // w = 16
860*c0909341SAndroid Build Coastguard Worker        .word 80f  - L(blend_v_tbl) + CONFIG_THUMB  // w = 8
861*c0909341SAndroid Build Coastguard Worker        .word 40f  - L(blend_v_tbl) + CONFIG_THUMB  // w = 4
862*c0909341SAndroid Build Coastguard Worker        .word 20f  - L(blend_v_tbl) + CONFIG_THUMB  // w = 2
863*c0909341SAndroid Build Coastguard Worker
        // w = 2: one pixel is blended per row, two rows per iteration.
        // r0 walks the even rows, r12 the odd rows, both stepping by 2*stride.
864*c0909341SAndroid Build Coastguard Worker20:
865*c0909341SAndroid Build Coastguard Worker        add             r12, r0,  r1
866*c0909341SAndroid Build Coastguard Worker        lsl             r1,  r1,  #1
867*c0909341SAndroid Build Coastguard Worker        vld1.8          {d4[]}, [lr]  // single mask byte replicated to all lanes
868*c0909341SAndroid Build Coastguard Worker        vneg.s8         d4,  d4       // -m
869*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q2,  d4
870*c0909341SAndroid Build Coastguard Worker        vshl.i16        d4,  d4,  #9  // -m << 9
871*c0909341SAndroid Build Coastguard Worker2:
        // d2 lane 0 = b of the first row, lane 1 = b of the second row (the
        // 32-bit replicated load covers the first row, lane 1 is then
        // overwritten from the second row).  d0 lanes 0/1 hold the matching
        // destination pixels.  r2 advances by 4 + 4 bytes, i.e. two full rows.
872*c0909341SAndroid Build Coastguard Worker        vld1.32         {d2[]},  [r2, :32]!
873*c0909341SAndroid Build Coastguard Worker        vld1.16         {d0[]},  [r0, :16]
874*c0909341SAndroid Build Coastguard Worker        subs            r4,  r4,  #2
875*c0909341SAndroid Build Coastguard Worker        vld1.16         {d2[1]}, [r2,  :16]
876*c0909341SAndroid Build Coastguard Worker        vld1.16         {d0[1]}, [r12, :16]
877*c0909341SAndroid Build Coastguard Worker        add             r2,  r2,  #4
878*c0909341SAndroid Build Coastguard Worker        vsub.i16        d2,  d0,  d2  // a - b
879*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    d2,  d2,  d4  // ((a-b)*-m + 32) >> 6
880*c0909341SAndroid Build Coastguard Worker        vadd.i16        d0,  d0,  d2
881*c0909341SAndroid Build Coastguard Worker        vst1.16         {d0[0]}, [r0,  :16], r1
882*c0909341SAndroid Build Coastguard Worker        vst1.16         {d0[1]}, [r12, :16], r1
883*c0909341SAndroid Build Coastguard Worker        bgt             2b
884*c0909341SAndroid Build Coastguard Worker        pop             {r4,pc}
        // w = 4: three pixels are stored per row, two rows per iteration.
        // The four mask bytes are replicated into both halves of d4, so after
        // widening q2 holds the same four weights for each of the two rows.
        // r1 becomes 2*stride - 4 because the first store of each row
        // post-increments its pointer by 4 bytes.
885*c0909341SAndroid Build Coastguard Worker40:
886*c0909341SAndroid Build Coastguard Worker        vld1.32         {d4[]}, [lr, :32]
887*c0909341SAndroid Build Coastguard Worker        add             r12, r0,  r1
888*c0909341SAndroid Build Coastguard Worker        vneg.s8         d4,  d4       // -m
889*c0909341SAndroid Build Coastguard Worker        lsl             r1,  r1,  #1
890*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q2,  d4
891*c0909341SAndroid Build Coastguard Worker        sub             r1,  r1,  #4
892*c0909341SAndroid Build Coastguard Worker        vshl.i16        q2,  q2,  #9  // -m << 9
893*c0909341SAndroid Build Coastguard Worker4:
        // q1 = b for two rows; d0 / d1 = destination rows from r0 / r12.
894*c0909341SAndroid Build Coastguard Worker        vld1.16         {q1}, [r2, :128]!
895*c0909341SAndroid Build Coastguard Worker        vld1.16         {d0}, [r0,  :64]
896*c0909341SAndroid Build Coastguard Worker        vld1.16         {d1}, [r12, :64]
897*c0909341SAndroid Build Coastguard Worker        subs            r4,  r4,  #2
898*c0909341SAndroid Build Coastguard Worker        vsub.i16        q1,  q0,  q1  // a - b
899*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q1,  q1,  q2  // ((a-b)*-m + 32) >> 6
900*c0909341SAndroid Build Coastguard Worker        vadd.i16        q0,  q0,  q1
        // Store pixels 0-1 as one 32-bit lane, then pixel 2 as a 16-bit lane.
901*c0909341SAndroid Build Coastguard Worker        vst1.32         {d0[0]}, [r0,  :32]!
902*c0909341SAndroid Build Coastguard Worker        vst1.32         {d1[0]}, [r12, :32]!
903*c0909341SAndroid Build Coastguard Worker        vst1.16         {d0[2]}, [r0,  :16], r1
904*c0909341SAndroid Build Coastguard Worker        vst1.16         {d1[2]}, [r12, :16], r1
905*c0909341SAndroid Build Coastguard Worker        bgt             4b
906*c0909341SAndroid Build Coastguard Worker        pop             {r4,pc}
        // w = 8: six pixels are stored per row, two rows per iteration.
        // q8 holds the eight widened weights and is shared by both rows.
        // r1 becomes 2*stride - 8 because the first store of each row
        // post-increments its pointer by 8 bytes.
907*c0909341SAndroid Build Coastguard Worker80:
908*c0909341SAndroid Build Coastguard Worker        vld1.8          {d16}, [lr, :64]
909*c0909341SAndroid Build Coastguard Worker        add             r12, r0,  r1
910*c0909341SAndroid Build Coastguard Worker        vneg.s8         d16, d16      // -m
911*c0909341SAndroid Build Coastguard Worker        lsl             r1,  r1,  #1
912*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q8,  d16
913*c0909341SAndroid Build Coastguard Worker        sub             r1,  r1,  #8
914*c0909341SAndroid Build Coastguard Worker        vshl.i16        q8,  q8,  #9  // -m << 9
915*c0909341SAndroid Build Coastguard Worker8:
        // q2 / q3 = b for the two rows; q0 / q1 = destination rows.
916*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2, q3}, [r2,  :128]!
917*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0},     [r0,  :128]
918*c0909341SAndroid Build Coastguard Worker        vld1.16         {q1},     [r12, :128]
919*c0909341SAndroid Build Coastguard Worker        subs            r4,  r4,  #2
920*c0909341SAndroid Build Coastguard Worker        vsub.i16        q2,  q0,  q2  // a - b
921*c0909341SAndroid Build Coastguard Worker        vsub.i16        q3,  q1,  q3
922*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q2,  q2,  q8  // ((a-b)*-m + 32) >> 6
923*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q3,  q3,  q8
924*c0909341SAndroid Build Coastguard Worker        vadd.i16        q0,  q0,  q2
925*c0909341SAndroid Build Coastguard Worker        vadd.i16        q1,  q1,  q3
        // Store pixels 0-3 as a d register, then pixels 4-5 as a 32-bit lane.
926*c0909341SAndroid Build Coastguard Worker        vst1.16         {d0},    [r0,  :64]!
927*c0909341SAndroid Build Coastguard Worker        vst1.16         {d2},    [r12, :64]!
928*c0909341SAndroid Build Coastguard Worker        vst1.32         {d1[0]}, [r0,  :32], r1
929*c0909341SAndroid Build Coastguard Worker        vst1.32         {d3[0]}, [r12, :32], r1
930*c0909341SAndroid Build Coastguard Worker        bgt             8b
931*c0909341SAndroid Build Coastguard Worker        pop             {r4,pc}
        // w = 16: twelve pixels are loaded, blended and stored per row, two
        // rows per iteration.  After widening, q12 holds the weights of
        // columns 0-7 and d26 those of columns 8-11; only d26 of q13 is
        // shifted because the remaining columns are never used.
932*c0909341SAndroid Build Coastguard Worker160:
933*c0909341SAndroid Build Coastguard Worker        vld1.8          {q12}, [lr, :128]
934*c0909341SAndroid Build Coastguard Worker        add             r12, r0,  r1
935*c0909341SAndroid Build Coastguard Worker        vneg.s8         q13, q12      // -m
936*c0909341SAndroid Build Coastguard Worker        lsl             r1,  r1,  #1
937*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q12, d26
938*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q13, d27
939*c0909341SAndroid Build Coastguard Worker        vshl.i16        q12, q12, #9  // -m << 9
940*c0909341SAndroid Build Coastguard Worker        vshl.i16        d26, d26, #9
941*c0909341SAndroid Build Coastguard Worker16:
        // q8-q9 = b of the first row, q10-q11 = b of the second row (all 16
        // pixels are loaded so r2 advances by full rows; only q8/d18 and
        // q10/d22 are used).  d0-d2 and d4-d6 are the destination rows.
942*c0909341SAndroid Build Coastguard Worker        vld1.16         {q8,  q9},      [r2,  :128]!
943*c0909341SAndroid Build Coastguard Worker        vld1.16         {d0,  d1,  d2}, [r0,  :64]
944*c0909341SAndroid Build Coastguard Worker        subs            r4,  r4,  #2
945*c0909341SAndroid Build Coastguard Worker        vld1.16         {q10, q11},     [r2,  :128]!
946*c0909341SAndroid Build Coastguard Worker        vsub.i16        q8,  q0,  q8  // a - b
947*c0909341SAndroid Build Coastguard Worker        vld1.16         {d4,  d5,  d6}, [r12, :64]
948*c0909341SAndroid Build Coastguard Worker        vsub.i16        d18, d2,  d18
949*c0909341SAndroid Build Coastguard Worker        vsub.i16        q10, q2,  q10
950*c0909341SAndroid Build Coastguard Worker        vsub.i16        d22, d6,  d22
951*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q8,  q8,  q12  // ((a-b)*-m + 32) >> 6
952*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    d18, d18, d26
953*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q10, q10, q12
954*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    d22, d22, d26
955*c0909341SAndroid Build Coastguard Worker        vadd.i16        q0,  q0,  q8
956*c0909341SAndroid Build Coastguard Worker        vadd.i16        d2,  d2,  d18
957*c0909341SAndroid Build Coastguard Worker        vadd.i16        q2,  q2,  q10
958*c0909341SAndroid Build Coastguard Worker        vst1.16         {d0,  d1,  d2}, [r0,  :64], r1
959*c0909341SAndroid Build Coastguard Worker        vadd.i16        d6,  d6,  d22
960*c0909341SAndroid Build Coastguard Worker        vst1.16         {d4,  d5,  d6}, [r12, :64], r1
961*c0909341SAndroid Build Coastguard Worker        bgt             16b
962*c0909341SAndroid Build Coastguard Worker        pop             {r4,pc}
        // w = 32: 24 pixels are blended per row, one row per iteration.
        // 24 mask bytes are loaded; after negation and widening q12, q13 and
        // q14 hold the weights of columns 0-7, 8-15 and 16-23.
        // r1 becomes stride - 32 because the first store of each row
        // post-increments r0 by 32 bytes.
963*c0909341SAndroid Build Coastguard Worker320:
964*c0909341SAndroid Build Coastguard Worker        vld1.8          {d24, d25, d26}, [lr, :64]
965*c0909341SAndroid Build Coastguard Worker        vneg.s8         q14, q12      // -m
966*c0909341SAndroid Build Coastguard Worker        vneg.s8         d30, d26
967*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q12, d28
968*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q13, d29
969*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q14, d30
970*c0909341SAndroid Build Coastguard Worker        sub             r1,  r1,  #32
971*c0909341SAndroid Build Coastguard Worker        vshl.i16        q12, q12, #9  // -m << 9
972*c0909341SAndroid Build Coastguard Worker        vshl.i16        q13, q13, #9
973*c0909341SAndroid Build Coastguard Worker        vshl.i16        q14, q14, #9
974*c0909341SAndroid Build Coastguard Worker32:
        // b: 32 bytes loaded with post-increment, 16 more without; the
        // "add r2, r2, #32" further down completes the 64-byte row step.
        // a: r0 is advanced by the first load and rewound by 32 before the
        // stores.
975*c0909341SAndroid Build Coastguard Worker        vld1.16         {q8,  q9},  [r2, :128]!
976*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0,  q1},  [r0, :128]!
977*c0909341SAndroid Build Coastguard Worker        subs            r4,  r4,  #1
978*c0909341SAndroid Build Coastguard Worker        vld1.16         {q10},      [r2, :128]
979*c0909341SAndroid Build Coastguard Worker        vsub.i16        q8,  q0,  q8  // a - b
980*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2},       [r0, :128]
981*c0909341SAndroid Build Coastguard Worker        sub             r0,  r0,  #32
982*c0909341SAndroid Build Coastguard Worker        vsub.i16        q9,  q1,  q9
983*c0909341SAndroid Build Coastguard Worker        vsub.i16        q10, q2,  q10
984*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q8,  q8,  q12  // ((a-b)*-m + 32) >> 6
985*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q9,  q9,  q13
986*c0909341SAndroid Build Coastguard Worker        vqrdmulh.s16    q10, q10, q14
987*c0909341SAndroid Build Coastguard Worker        vadd.i16        q0,  q0,  q8
988*c0909341SAndroid Build Coastguard Worker        vadd.i16        q1,  q1,  q9
989*c0909341SAndroid Build Coastguard Worker        vadd.i16        q2,  q2,  q10
990*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0, q1}, [r0, :128]!
991*c0909341SAndroid Build Coastguard Worker        add             r2,  r2,  #32
992*c0909341SAndroid Build Coastguard Worker        vst1.16         {q2},     [r0, :128], r1
993*c0909341SAndroid Build Coastguard Worker        bgt             32b
994*c0909341SAndroid Build Coastguard Worker        pop             {r4,pc}
995*c0909341SAndroid Build Coastguard Workerendfunc
996*c0909341SAndroid Build Coastguard Worker
997*c0909341SAndroid Build Coastguard Worker// This has got the same signature as the put_8tap functions,
998*c0909341SAndroid Build Coastguard Worker// and assumes that r9 is set to (clz(w)-24).
// put_neon: straight row-by-row copy of a block of 16-bit pixels.
//
// Register use, as established by the code below:
//   r0 - destination pointer, r1 - destination stride in bytes
//   r2 - source pointer,      r3 - source stride in bytes
//   r5 - row counter (decremented by the number of rows copied per iteration)
//   r9 - jump table index on entry (see the comment above: clz(w)-24, so
//        index 0 is w = 128 and index 6 is w = 2); clobbered afterwards
//   r8, r10 - scratch
// Nothing is pushed here, yet every path ends with "pop {r4-r11,pc}": the
// code relies on whoever branches here having already pushed {r4-r11,lr}.
// Source loads carry no alignment qualifier; destination stores are
// qualified with the natural alignment of the row width (:32, :64 or :128).
// The w = 2, 4 and 8 paths copy two rows per iteration, the wider ones one.
999*c0909341SAndroid Build Coastguard Workerfunction put_neon
1000*c0909341SAndroid Build Coastguard Worker        adr             r10, L(put_tbl)
1001*c0909341SAndroid Build Coastguard Worker        ldr             r9,  [r10, r9, lsl #2]
1002*c0909341SAndroid Build Coastguard Worker        add             r10, r10, r9  // table base + stored offset = branch target
1003*c0909341SAndroid Build Coastguard Worker        bx              r10
1004*c0909341SAndroid Build Coastguard Worker
1005*c0909341SAndroid Build Coastguard Worker        .align 2
        // Each entry is the offset of a width-specific path from the table
        // itself, plus CONFIG_THUMB.
1006*c0909341SAndroid Build Coastguard WorkerL(put_tbl):
1007*c0909341SAndroid Build Coastguard Worker        .word 1280f - L(put_tbl) + CONFIG_THUMB  // w = 128
1008*c0909341SAndroid Build Coastguard Worker        .word 640f  - L(put_tbl) + CONFIG_THUMB  // w = 64
1009*c0909341SAndroid Build Coastguard Worker        .word 320f  - L(put_tbl) + CONFIG_THUMB  // w = 32
1010*c0909341SAndroid Build Coastguard Worker        .word 16f   - L(put_tbl) + CONFIG_THUMB  // w = 16
1011*c0909341SAndroid Build Coastguard Worker        .word 80f   - L(put_tbl) + CONFIG_THUMB  // w = 8
1012*c0909341SAndroid Build Coastguard Worker        .word 4f    - L(put_tbl) + CONFIG_THUMB  // w = 4
1013*c0909341SAndroid Build Coastguard Worker        .word 2f    - L(put_tbl) + CONFIG_THUMB  // w = 2
1014*c0909341SAndroid Build Coastguard Worker
        // w = 2: 4 bytes per row.  Each load replicates the row into both
        // 32-bit lanes of the register, so storing lane 0 of d0 and lane 1
        // of d1 writes the first and second row respectively.
1015*c0909341SAndroid Build Coastguard Worker2:
1016*c0909341SAndroid Build Coastguard Worker        vld1.32         {d0[]}, [r2], r3
1017*c0909341SAndroid Build Coastguard Worker        vld1.32         {d1[]}, [r2], r3
1018*c0909341SAndroid Build Coastguard Worker        subs            r5,  r5,  #2
1019*c0909341SAndroid Build Coastguard Worker        vst1.32         {d0[0]}, [r0, :32], r1
1020*c0909341SAndroid Build Coastguard Worker        vst1.32         {d1[1]}, [r0, :32], r1
1021*c0909341SAndroid Build Coastguard Worker        bgt             2b
1022*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
        // w = 4: 8 bytes per row, two consecutive rows per iteration.
1023*c0909341SAndroid Build Coastguard Worker4:
1024*c0909341SAndroid Build Coastguard Worker        vld1.16         {d0}, [r2], r3
1025*c0909341SAndroid Build Coastguard Worker        vld1.16         {d1}, [r2], r3
1026*c0909341SAndroid Build Coastguard Worker        subs            r5,  r5,  #2
1027*c0909341SAndroid Build Coastguard Worker        vst1.16         {d0}, [r0, :64], r1
1028*c0909341SAndroid Build Coastguard Worker        vst1.16         {d1}, [r0, :64], r1
1029*c0909341SAndroid Build Coastguard Worker        bgt             4b
1030*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
        // w = 8: 16 bytes per row.  r8 / r9 point one row below r0 / r2 and
        // both strides are doubled, so even and odd rows are walked by
        // separate pointer pairs.
1031*c0909341SAndroid Build Coastguard Worker80:
1032*c0909341SAndroid Build Coastguard Worker        add             r8,  r0,  r1
1033*c0909341SAndroid Build Coastguard Worker        lsl             r1,  r1,  #1
1034*c0909341SAndroid Build Coastguard Worker        add             r9,  r2,  r3
1035*c0909341SAndroid Build Coastguard Worker        lsl             r3,  r3,  #1
1036*c0909341SAndroid Build Coastguard Worker8:
1037*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0}, [r2], r3
1038*c0909341SAndroid Build Coastguard Worker        vld1.16         {q1}, [r9], r3
1039*c0909341SAndroid Build Coastguard Worker        subs            r5,  r5,  #2
1040*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0}, [r0, :128], r1
1041*c0909341SAndroid Build Coastguard Worker        vst1.16         {q1}, [r8, :128], r1
1042*c0909341SAndroid Build Coastguard Worker        bgt             8b
1043*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
        // w = 16: 32 bytes per row, one row per iteration.
1044*c0909341SAndroid Build Coastguard Worker16:
1045*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0,  q1},  [r2], r3
1046*c0909341SAndroid Build Coastguard Worker        subs            r5,  r5,  #1
1047*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0,  q1},  [r0, :128], r1
1048*c0909341SAndroid Build Coastguard Worker        bgt             16b
1049*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
        // w = 32: 64 bytes per row in two 32-byte chunks.  The strides are
        // reduced by the 32 bytes already covered by the first chunk's
        // post-increment.
1050*c0909341SAndroid Build Coastguard Worker320:
1051*c0909341SAndroid Build Coastguard Worker        sub             r1,  r1,  #32
1052*c0909341SAndroid Build Coastguard Worker        sub             r3,  r3,  #32
1053*c0909341SAndroid Build Coastguard Worker32:
1054*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0,  q1},  [r2]!
1055*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0,  q1},  [r0, :128]!
1056*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2,  q3},  [r2], r3
1057*c0909341SAndroid Build Coastguard Worker        subs            r5,  r5,  #1
1058*c0909341SAndroid Build Coastguard Worker        vst1.16         {q2,  q3},  [r0, :128], r1
1059*c0909341SAndroid Build Coastguard Worker        bgt             32b
1060*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
        // w = 64: 128 bytes per row in four 32-byte chunks.  The strides are
        // reduced by the 96 bytes covered by the first three post-increments.
1061*c0909341SAndroid Build Coastguard Worker640:
1062*c0909341SAndroid Build Coastguard Worker        sub             r1,  r1,  #96
1063*c0909341SAndroid Build Coastguard Worker        sub             r3,  r3,  #96
1064*c0909341SAndroid Build Coastguard Worker64:
1065*c0909341SAndroid Build Coastguard Worker        vld1.16         {q8,  q9},  [r2]!
1066*c0909341SAndroid Build Coastguard Worker        vst1.16         {q8,  q9},  [r0, :128]!
1067*c0909341SAndroid Build Coastguard Worker        vld1.16         {q10, q11}, [r2]!
1068*c0909341SAndroid Build Coastguard Worker        vst1.16         {q10, q11}, [r0, :128]!
1069*c0909341SAndroid Build Coastguard Worker        vld1.16         {q12, q13}, [r2]!
1070*c0909341SAndroid Build Coastguard Worker        vst1.16         {q12, q13}, [r0, :128]!
1071*c0909341SAndroid Build Coastguard Worker        vld1.16         {q14, q15}, [r2], r3
1072*c0909341SAndroid Build Coastguard Worker        subs            r5,  r5,  #1
1073*c0909341SAndroid Build Coastguard Worker        vst1.16         {q14, q15}, [r0, :128], r1
1074*c0909341SAndroid Build Coastguard Worker        bgt             64b
1075*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
        // w = 128: 256 bytes per row in eight 32-byte chunks (q8-q15 are
        // reused for the second half of the row).  The strides are reduced by
        // the 224 bytes covered by the first seven post-increments.
1076*c0909341SAndroid Build Coastguard Worker1280:
1077*c0909341SAndroid Build Coastguard Worker        sub             r1,  r1,  #224
1078*c0909341SAndroid Build Coastguard Worker        sub             r3,  r3,  #224
1079*c0909341SAndroid Build Coastguard Worker128:
1080*c0909341SAndroid Build Coastguard Worker        vld1.16         {q8,  q9},  [r2]!
1081*c0909341SAndroid Build Coastguard Worker        vst1.16         {q8,  q9},  [r0, :128]!
1082*c0909341SAndroid Build Coastguard Worker        vld1.16         {q10, q11}, [r2]!
1083*c0909341SAndroid Build Coastguard Worker        vst1.16         {q10, q11}, [r0, :128]!
1084*c0909341SAndroid Build Coastguard Worker        vld1.16         {q12, q13}, [r2]!
1085*c0909341SAndroid Build Coastguard Worker        vst1.16         {q12, q13}, [r0, :128]!
1086*c0909341SAndroid Build Coastguard Worker        vld1.16         {q14, q15}, [r2]!
1087*c0909341SAndroid Build Coastguard Worker        vst1.16         {q14, q15}, [r0, :128]!
1088*c0909341SAndroid Build Coastguard Worker        vld1.16         {q8,  q9},  [r2]!
1089*c0909341SAndroid Build Coastguard Worker        vst1.16         {q8,  q9},  [r0, :128]!
1090*c0909341SAndroid Build Coastguard Worker        vld1.16         {q10, q11}, [r2]!
1091*c0909341SAndroid Build Coastguard Worker        vst1.16         {q10, q11}, [r0, :128]!
1092*c0909341SAndroid Build Coastguard Worker        vld1.16         {q12, q13}, [r2]!
1093*c0909341SAndroid Build Coastguard Worker        vst1.16         {q12, q13}, [r0, :128]!
1094*c0909341SAndroid Build Coastguard Worker        vld1.16         {q14, q15}, [r2], r3
1095*c0909341SAndroid Build Coastguard Worker        subs            r5,  r5,  #1
1096*c0909341SAndroid Build Coastguard Worker        vst1.16         {q14, q15}, [r0, :128], r1
1097*c0909341SAndroid Build Coastguard Worker        bgt             128b
1098*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
1099*c0909341SAndroid Build Coastguard Workerendfunc
1100*c0909341SAndroid Build Coastguard Worker
1101*c0909341SAndroid Build Coastguard Worker// This has got the same signature as the prep_8tap functions,
1102*c0909341SAndroid Build Coastguard Worker// and assumes that r9 is set to (clz(w)-24), r7 to intermediate_bits and
1103*c0909341SAndroid Build Coastguard Worker// r8 to w*2.
1104*c0909341SAndroid Build Coastguard Workerfunction prep_neon
1105*c0909341SAndroid Build Coastguard Worker        adr             r10, L(prep_tbl)
1106*c0909341SAndroid Build Coastguard Worker        ldr             r9,  [r10, r9, lsl #2]
1107*c0909341SAndroid Build Coastguard Worker        vdup.16         q15, r7   // intermediate_bits
1108*c0909341SAndroid Build Coastguard Worker        vmov.i16        q14, #PREP_BIAS
1109*c0909341SAndroid Build Coastguard Worker        add             r10, r10, r9
1110*c0909341SAndroid Build Coastguard Worker        bx              r10
1111*c0909341SAndroid Build Coastguard Worker
1112*c0909341SAndroid Build Coastguard Worker        .align 2
1113*c0909341SAndroid Build Coastguard WorkerL(prep_tbl):
1114*c0909341SAndroid Build Coastguard Worker        .word 1280f - L(prep_tbl) + CONFIG_THUMB
1115*c0909341SAndroid Build Coastguard Worker        .word 640f  - L(prep_tbl) + CONFIG_THUMB
1116*c0909341SAndroid Build Coastguard Worker        .word 320f  - L(prep_tbl) + CONFIG_THUMB
1117*c0909341SAndroid Build Coastguard Worker        .word 16f   - L(prep_tbl) + CONFIG_THUMB
1118*c0909341SAndroid Build Coastguard Worker        .word 80f   - L(prep_tbl) + CONFIG_THUMB
1119*c0909341SAndroid Build Coastguard Worker        .word 40f   - L(prep_tbl) + CONFIG_THUMB
1120*c0909341SAndroid Build Coastguard Worker
1121*c0909341SAndroid Build Coastguard Worker40:
1122*c0909341SAndroid Build Coastguard Worker        add             r9,  r1,  r2
1123*c0909341SAndroid Build Coastguard Worker        lsl             r2,  r2,  #1
1124*c0909341SAndroid Build Coastguard Worker4:
1125*c0909341SAndroid Build Coastguard Worker        vld1.16         {d0}, [r1], r2
1126*c0909341SAndroid Build Coastguard Worker        vld1.16         {d1}, [r9], r2
1127*c0909341SAndroid Build Coastguard Worker        subs            r4,  r4,  #2
1128*c0909341SAndroid Build Coastguard Worker        vshl.s16        q0,  q0,  q15
1129*c0909341SAndroid Build Coastguard Worker        vsub.i16        q0,  q0,  q14
1130*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0}, [r0, :128]!
1131*c0909341SAndroid Build Coastguard Worker        bgt             4b
1132*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
1133*c0909341SAndroid Build Coastguard Worker80:
1134*c0909341SAndroid Build Coastguard Worker        add             r9,  r1,  r2
1135*c0909341SAndroid Build Coastguard Worker        lsl             r2,  r2,  #1
1136*c0909341SAndroid Build Coastguard Worker8:
1137*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0}, [r1], r2
1138*c0909341SAndroid Build Coastguard Worker        vld1.16         {q1}, [r9], r2
1139*c0909341SAndroid Build Coastguard Worker        subs            r4,  r4,  #2
1140*c0909341SAndroid Build Coastguard Worker        vshl.s16        q0,  q0,  q15
1141*c0909341SAndroid Build Coastguard Worker        vshl.s16        q1,  q1,  q15
1142*c0909341SAndroid Build Coastguard Worker        vsub.i16        q0,  q0,  q14
1143*c0909341SAndroid Build Coastguard Worker        vsub.i16        q1,  q1,  q14
1144*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0, q1}, [r0, :128]!
1145*c0909341SAndroid Build Coastguard Worker        bgt             8b
1146*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
1147*c0909341SAndroid Build Coastguard Worker16:
1148*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0, q1}, [r1], r2
1149*c0909341SAndroid Build Coastguard Worker        vshl.s16        q0,  q0,  q15
1150*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2, q3}, [r1], r2
1151*c0909341SAndroid Build Coastguard Worker        subs            r4,  r4,  #2
1152*c0909341SAndroid Build Coastguard Worker        vshl.s16        q1,  q1,  q15
1153*c0909341SAndroid Build Coastguard Worker        vshl.s16        q2,  q2,  q15
1154*c0909341SAndroid Build Coastguard Worker        vshl.s16        q3,  q3,  q15
1155*c0909341SAndroid Build Coastguard Worker        vsub.i16        q0,  q0,  q14
1156*c0909341SAndroid Build Coastguard Worker        vsub.i16        q1,  q1,  q14
1157*c0909341SAndroid Build Coastguard Worker        vsub.i16        q2,  q2,  q14
1158*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0, q1}, [r0, :128]!
1159*c0909341SAndroid Build Coastguard Worker        vsub.i16        q3,  q3,  q14
1160*c0909341SAndroid Build Coastguard Worker        vst1.16         {q2, q3}, [r0, :128]!
1161*c0909341SAndroid Build Coastguard Worker        bgt             16b
1162*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
1163*c0909341SAndroid Build Coastguard Worker320:
1164*c0909341SAndroid Build Coastguard Worker        sub             r2,  r2,  #32
1165*c0909341SAndroid Build Coastguard Worker32:
1166*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0, q1}, [r1]!
1167*c0909341SAndroid Build Coastguard Worker        subs            r4,  r4,  #1
1168*c0909341SAndroid Build Coastguard Worker        vshl.s16        q0,  q0,  q15
1169*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2, q3}, [r1], r2
1170*c0909341SAndroid Build Coastguard Worker        vshl.s16        q1,  q1,  q15
1171*c0909341SAndroid Build Coastguard Worker        vshl.s16        q2,  q2,  q15
1172*c0909341SAndroid Build Coastguard Worker        vshl.s16        q3,  q3,  q15
1173*c0909341SAndroid Build Coastguard Worker        vsub.i16        q0,  q0,  q14
1174*c0909341SAndroid Build Coastguard Worker        vsub.i16        q1,  q1,  q14
1175*c0909341SAndroid Build Coastguard Worker        vsub.i16        q2,  q2,  q14
1176*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0, q1}, [r0, :128]!
1177*c0909341SAndroid Build Coastguard Worker        vsub.i16        q3,  q3,  q14
1178*c0909341SAndroid Build Coastguard Worker        vst1.16         {q2, q3}, [r0, :128]!
1179*c0909341SAndroid Build Coastguard Worker        bgt             32b
1180*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
1181*c0909341SAndroid Build Coastguard Worker640:
1182*c0909341SAndroid Build Coastguard Worker        sub             r2,  r2,  #96
1183*c0909341SAndroid Build Coastguard Worker64:
1184*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0,  q1},  [r1]!
1185*c0909341SAndroid Build Coastguard Worker        subs            r4,  r4,  #1
1186*c0909341SAndroid Build Coastguard Worker        vshl.s16        q0,  q0,  q15
1187*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2,  q3},  [r1]!
1188*c0909341SAndroid Build Coastguard Worker        vshl.s16        q1,  q1,  q15
1189*c0909341SAndroid Build Coastguard Worker        vld1.16         {q8,  q9},  [r1]!
1190*c0909341SAndroid Build Coastguard Worker        vshl.s16        q2,  q2,  q15
1191*c0909341SAndroid Build Coastguard Worker        vld1.16         {q10, q11}, [r1], r2
1192*c0909341SAndroid Build Coastguard Worker        vshl.s16        q3,  q3,  q15
1193*c0909341SAndroid Build Coastguard Worker        vshl.s16        q8,  q8,  q15
1194*c0909341SAndroid Build Coastguard Worker        vshl.s16        q9,  q9,  q15
1195*c0909341SAndroid Build Coastguard Worker        vshl.s16        q10, q10, q15
1196*c0909341SAndroid Build Coastguard Worker        vshl.s16        q11, q11, q15
1197*c0909341SAndroid Build Coastguard Worker        vsub.i16        q0,  q0,  q14
1198*c0909341SAndroid Build Coastguard Worker        vsub.i16        q1,  q1,  q14
1199*c0909341SAndroid Build Coastguard Worker        vsub.i16        q2,  q2,  q14
1200*c0909341SAndroid Build Coastguard Worker        vsub.i16        q3,  q3,  q14
1201*c0909341SAndroid Build Coastguard Worker        vsub.i16        q8,  q8,  q14
1202*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0,  q1},  [r0, :128]!
1203*c0909341SAndroid Build Coastguard Worker        vsub.i16        q9,  q9,  q14
1204*c0909341SAndroid Build Coastguard Worker        vst1.16         {q2,  q3},  [r0, :128]!
1205*c0909341SAndroid Build Coastguard Worker        vsub.i16        q10, q10, q14
1206*c0909341SAndroid Build Coastguard Worker        vst1.16         {q8,  q9},  [r0, :128]!
1207*c0909341SAndroid Build Coastguard Worker        vsub.i16        q11, q11, q14
1208*c0909341SAndroid Build Coastguard Worker        vst1.16         {q10, q11}, [r0, :128]!
1209*c0909341SAndroid Build Coastguard Worker        bgt             64b
1210*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
1211*c0909341SAndroid Build Coastguard Worker1280:
1212*c0909341SAndroid Build Coastguard Worker        sub             r2,  r2,  #224
1213*c0909341SAndroid Build Coastguard Worker128:
1214*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0,  q1},  [r1]!
1215*c0909341SAndroid Build Coastguard Worker        subs            r4,  r4,  #1
1216*c0909341SAndroid Build Coastguard Worker        vshl.s16        q0,  q0,  q15
1217*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2,  q3},  [r1]!
1218*c0909341SAndroid Build Coastguard Worker        vshl.s16        q1,  q1,  q15
1219*c0909341SAndroid Build Coastguard Worker        vld1.16         {q8,  q9},  [r1]!
1220*c0909341SAndroid Build Coastguard Worker        vshl.s16        q2,  q2,  q15
1221*c0909341SAndroid Build Coastguard Worker        vld1.16         {q10, q11}, [r1]!
1222*c0909341SAndroid Build Coastguard Worker        vshl.s16        q3,  q3,  q15
1223*c0909341SAndroid Build Coastguard Worker        vshl.s16        q8,  q8,  q15
1224*c0909341SAndroid Build Coastguard Worker        vshl.s16        q9,  q9,  q15
1225*c0909341SAndroid Build Coastguard Worker        vshl.s16        q10, q10, q15
1226*c0909341SAndroid Build Coastguard Worker        vshl.s16        q11, q11, q15
1227*c0909341SAndroid Build Coastguard Worker        vsub.i16        q0,  q0,  q14
1228*c0909341SAndroid Build Coastguard Worker        vsub.i16        q1,  q1,  q14
1229*c0909341SAndroid Build Coastguard Worker        vsub.i16        q2,  q2,  q14
1230*c0909341SAndroid Build Coastguard Worker        vsub.i16        q3,  q3,  q14
1231*c0909341SAndroid Build Coastguard Worker        vsub.i16        q8,  q8,  q14
1232*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0,  q1},  [r0, :128]!
1233*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0,  q1},  [r1]!
1234*c0909341SAndroid Build Coastguard Worker        vsub.i16        q9,  q9,  q14
1235*c0909341SAndroid Build Coastguard Worker        vsub.i16        q10, q10, q14
1236*c0909341SAndroid Build Coastguard Worker        vst1.16         {q2,  q3},  [r0, :128]!
1237*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2,  q3},  [r1]!
1238*c0909341SAndroid Build Coastguard Worker        vsub.i16        q11, q11, q14
1239*c0909341SAndroid Build Coastguard Worker        vshl.s16        q0,  q0,  q15
1240*c0909341SAndroid Build Coastguard Worker        vst1.16         {q8,  q9},  [r0, :128]!
1241*c0909341SAndroid Build Coastguard Worker        vld1.16         {q8,  q9},  [r1]!
1242*c0909341SAndroid Build Coastguard Worker        vshl.s16        q1,  q1,  q15
1243*c0909341SAndroid Build Coastguard Worker        vshl.s16        q2,  q2,  q15
1244*c0909341SAndroid Build Coastguard Worker        vst1.16         {q10, q11}, [r0, :128]!
1245*c0909341SAndroid Build Coastguard Worker        vld1.16         {q10, q11}, [r1], r2
1246*c0909341SAndroid Build Coastguard Worker        vshl.s16        q3,  q3,  q15
1247*c0909341SAndroid Build Coastguard Worker        vshl.s16        q8,  q8,  q15
1248*c0909341SAndroid Build Coastguard Worker        vshl.s16        q9,  q9,  q15
1249*c0909341SAndroid Build Coastguard Worker        vshl.s16        q10, q10, q15
1250*c0909341SAndroid Build Coastguard Worker        vshl.s16        q11, q11, q15
1251*c0909341SAndroid Build Coastguard Worker        vsub.i16        q0,  q0,  q14
1252*c0909341SAndroid Build Coastguard Worker        vsub.i16        q1,  q1,  q14
1253*c0909341SAndroid Build Coastguard Worker        vsub.i16        q2,  q2,  q14
1254*c0909341SAndroid Build Coastguard Worker        vsub.i16        q3,  q3,  q14
1255*c0909341SAndroid Build Coastguard Worker        vsub.i16        q8,  q8,  q14
1256*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0,  q1},  [r0, :128]!
1257*c0909341SAndroid Build Coastguard Worker        vsub.i16        q9,  q9,  q14
1258*c0909341SAndroid Build Coastguard Worker        vst1.16         {q2,  q3},  [r0, :128]!
1259*c0909341SAndroid Build Coastguard Worker        vsub.i16        q10, q10, q14
1260*c0909341SAndroid Build Coastguard Worker        vst1.16         {q8,  q9},  [r0, :128]!
1261*c0909341SAndroid Build Coastguard Worker        vsub.i16        q11, q11, q14
1262*c0909341SAndroid Build Coastguard Worker        vst1.16         {q10, q11}, [r0, :128]!
1263*c0909341SAndroid Build Coastguard Worker        bgt             128b
1264*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
1265*c0909341SAndroid Build Coastguard Workerendfunc
1266*c0909341SAndroid Build Coastguard Worker
// load_slice: load one wd-bit element per destination and replicate it to
// every lane of that destination (the "[]" all-lanes form of vld1).
//   s0, s1   source pointers; loads alternate s0, s1, s0, s1, s0, s1, s0
//   strd     register post-added to the pointer used by each load
//   wd       element width in bits (becomes the vld1 size suffix)
//   d0, d1   always loaded (d0 from s0, d1 from s1)
//   d2, d3   loaded as a pair when d2 is given; d3 must then be given too
//   d4..d6   each loaded only if given (d4 from s0, d5 from s1, d6 from s0)
1267*c0909341SAndroid Build Coastguard Worker.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
1268*c0909341SAndroid Build Coastguard Worker        vld1.\wd        {\d0[]}, [\s0], \strd
1269*c0909341SAndroid Build Coastguard Worker        vld1.\wd        {\d1[]}, [\s1], \strd
1270*c0909341SAndroid Build Coastguard Worker.ifnb \d2
1271*c0909341SAndroid Build Coastguard Worker        vld1.\wd        {\d2[]}, [\s0], \strd
1272*c0909341SAndroid Build Coastguard Worker        vld1.\wd        {\d3[]}, [\s1], \strd
1273*c0909341SAndroid Build Coastguard Worker.endif
1274*c0909341SAndroid Build Coastguard Worker.ifnb \d4
1275*c0909341SAndroid Build Coastguard Worker        vld1.\wd        {\d4[]}, [\s0], \strd
1276*c0909341SAndroid Build Coastguard Worker.endif
1277*c0909341SAndroid Build Coastguard Worker.ifnb \d5
1278*c0909341SAndroid Build Coastguard Worker        vld1.\wd        {\d5[]}, [\s1], \strd
1279*c0909341SAndroid Build Coastguard Worker.endif
1280*c0909341SAndroid Build Coastguard Worker.ifnb \d6
1281*c0909341SAndroid Build Coastguard Worker        vld1.\wd        {\d6[]}, [\s0], \strd
1282*c0909341SAndroid Build Coastguard Worker.endif
1283*c0909341SAndroid Build Coastguard Worker.endm
// load_reg: load whole registers (16-bit element layout) from two
// alternating source pointers, post-adding strd to the pointer after each
// load. Same argument pattern as load_slice without the width argument:
//   d0, d1   always loaded (d0 from s0, d1 from s1)
//   d2, d3   loaded as a pair when d2 is given; d3 must then be given too
//   d4..d6   each loaded only if given (d4 from s0, d5 from s1, d6 from s0)
1284*c0909341SAndroid Build Coastguard Worker.macro load_reg s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1285*c0909341SAndroid Build Coastguard Worker        vld1.16         {\d0}, [\s0], \strd
1286*c0909341SAndroid Build Coastguard Worker        vld1.16         {\d1}, [\s1], \strd
1287*c0909341SAndroid Build Coastguard Worker.ifnb \d2
1288*c0909341SAndroid Build Coastguard Worker        vld1.16         {\d2}, [\s0], \strd
1289*c0909341SAndroid Build Coastguard Worker        vld1.16         {\d3}, [\s1], \strd
1290*c0909341SAndroid Build Coastguard Worker.endif
1291*c0909341SAndroid Build Coastguard Worker.ifnb \d4
1292*c0909341SAndroid Build Coastguard Worker        vld1.16         {\d4}, [\s0], \strd
1293*c0909341SAndroid Build Coastguard Worker.endif
1294*c0909341SAndroid Build Coastguard Worker.ifnb \d5
1295*c0909341SAndroid Build Coastguard Worker        vld1.16         {\d5}, [\s1], \strd
1296*c0909341SAndroid Build Coastguard Worker.endif
1297*c0909341SAndroid Build Coastguard Worker.ifnb \d6
1298*c0909341SAndroid Build Coastguard Worker        vld1.16         {\d6}, [\s0], \strd
1299*c0909341SAndroid Build Coastguard Worker.endif
1300*c0909341SAndroid Build Coastguard Worker.endm
// load_regpair: load up to three two-register lists, alternating between
// the source pointers and post-adding strd to the pointer after each load.
//   {d0, d1} from s0            always
//   {d2, d3} from s1            only if d2 is given
//   {d4, d5} from s0            only if d4 is given
// Each pair is used as a single vld1 register list.
1301*c0909341SAndroid Build Coastguard Worker.macro load_regpair s0, s1, strd, d0, d1, d2, d3, d4, d5
1302*c0909341SAndroid Build Coastguard Worker        vld1.16         {\d0, \d1}, [\s0], \strd
1303*c0909341SAndroid Build Coastguard Worker.ifnb \d2
1304*c0909341SAndroid Build Coastguard Worker        vld1.16         {\d2, \d3}, [\s1], \strd
1305*c0909341SAndroid Build Coastguard Worker.endif
1306*c0909341SAndroid Build Coastguard Worker.ifnb \d4
1307*c0909341SAndroid Build Coastguard Worker        vld1.16         {\d4, \d5}, [\s0], \strd
1308*c0909341SAndroid Build Coastguard Worker.endif
1309*c0909341SAndroid Build Coastguard Worker.endm
// load_32: load_slice with the element width fixed to 32 bits; all other
// arguments are forwarded unchanged.
1310*c0909341SAndroid Build Coastguard Worker.macro load_32 s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1311*c0909341SAndroid Build Coastguard Worker        load_slice      \s0, \s1, \strd, 32, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1312*c0909341SAndroid Build Coastguard Worker.endm
// load_16s16: alias for load_regpair; every argument is forwarded as is.
// NOTE(review): the name suggests "16 elements of 16 bits" per load, which
// matches a two-q-register list - confirm against the call sites.
1313*c0909341SAndroid Build Coastguard Worker.macro load_16s16 s0, s1, strd, d0, d1, d2, d3, d4, d5
1314*c0909341SAndroid Build Coastguard Worker        load_regpair    \s0, \s1, \strd, \d0, \d1, \d2, \d3, \d4, \d5
1315*c0909341SAndroid Build Coastguard Worker.endm
// interleave_1_32: chain registers together by one 32-bit slot.
// For each register rN that is rewritten, the low 4 bytes are dropped, the
// remaining bytes move down, and the top 4 bytes are filled with the low
// 4 bytes of the following register (vext.8 with a byte offset of 4).
//   r0, r1   always rewritten (r2 is read as the filler for r1)
//   r2, r3   rewritten only if r3 is given; r4 is then read as the filler
//            for r3 and is never written
// Updates run in order, so each step reads the still-unmodified next register.
1316*c0909341SAndroid Build Coastguard Worker.macro interleave_1_32 r0, r1, r2, r3, r4
1317*c0909341SAndroid Build Coastguard Worker        vext.8          \r0, \r0, \r1, #4
1318*c0909341SAndroid Build Coastguard Worker        vext.8          \r1, \r1, \r2, #4
1319*c0909341SAndroid Build Coastguard Worker.ifnb \r3
1320*c0909341SAndroid Build Coastguard Worker        vext.8          \r2, \r2, \r3, #4
1321*c0909341SAndroid Build Coastguard Worker        vext.8          \r3, \r3, \r4, #4
1322*c0909341SAndroid Build Coastguard Worker.endif
1323*c0909341SAndroid Build Coastguard Worker.endm
// vmin_u16: in-place unsigned 16-bit minimum of each listed register with c
// (i.e. clamp every lane to at most the matching lane of c).
//   r0       always processed
//   r1       processed only if given
//   r2, r3   processed as a pair when r2 is given; r3 must then be given too
1324*c0909341SAndroid Build Coastguard Worker.macro vmin_u16 c, r0, r1, r2, r3
1325*c0909341SAndroid Build Coastguard Worker        vmin.u16        \r0, \r0, \c
1326*c0909341SAndroid Build Coastguard Worker.ifnb \r1
1327*c0909341SAndroid Build Coastguard Worker        vmin.u16        \r1, \r1, \c
1328*c0909341SAndroid Build Coastguard Worker.endif
1329*c0909341SAndroid Build Coastguard Worker.ifnb \r2
1330*c0909341SAndroid Build Coastguard Worker        vmin.u16        \r2, \r2, \c
1331*c0909341SAndroid Build Coastguard Worker        vmin.u16        \r3, \r3, \c
1332*c0909341SAndroid Build Coastguard Worker.endif
1333*c0909341SAndroid Build Coastguard Worker.endm
// vsub_i16: in-place 16-bit lane-wise subtraction of c from each listed
// register (rN = rN - c, wrapping).
//   r0       always processed
//   r1       processed only if given
//   r2, r3   processed as a pair when r2 is given; r3 must then be given too
1334*c0909341SAndroid Build Coastguard Worker.macro vsub_i16 c, r0, r1, r2, r3
1335*c0909341SAndroid Build Coastguard Worker        vsub.i16        \r0, \r0, \c
1336*c0909341SAndroid Build Coastguard Worker.ifnb \r1
1337*c0909341SAndroid Build Coastguard Worker        vsub.i16        \r1, \r1, \c
1338*c0909341SAndroid Build Coastguard Worker.endif
1339*c0909341SAndroid Build Coastguard Worker.ifnb \r2
1340*c0909341SAndroid Build Coastguard Worker        vsub.i16        \r2, \r2, \c
1341*c0909341SAndroid Build Coastguard Worker        vsub.i16        \r3, \r3, \c
1342*c0909341SAndroid Build Coastguard Worker.endif
1343*c0909341SAndroid Build Coastguard Worker.endm
// vmull_vmlal_4: 4-term widening multiply-accumulate (signed 16 -> 32 bit).
//   d        destination accumulator (macro argument, overwritten)
//   s0..s3   16-bit source vectors, one per term
// Computes d = s0*c0 + s1*c1 + s2*c2 + s3*c3, where c0..c3 are the four
// 16-bit lanes of the real NEON register d0. That register is hard-coded:
// the caller must have placed the coefficients there beforehand.
1344*c0909341SAndroid Build Coastguard Worker.macro vmull_vmlal_4 d, s0, s1, s2, s3
1345*c0909341SAndroid Build Coastguard Worker        vmull.s16       \d,  \s0, d0[0]
1346*c0909341SAndroid Build Coastguard Worker        vmlal.s16       \d,  \s1, d0[1]
1347*c0909341SAndroid Build Coastguard Worker        vmlal.s16       \d,  \s2, d0[2]
1348*c0909341SAndroid Build Coastguard Worker        vmlal.s16       \d,  \s3, d0[3]
1349*c0909341SAndroid Build Coastguard Worker.endm
// vmull_vmlal_8: 8-term widening multiply-accumulate (signed 16 -> 32 bit).
//   d        destination accumulator (macro argument, overwritten)
//   s0..s7   16-bit source vectors, one per term
// Computes d = sum(sN * cN) for N = 0..7. Coefficients c0..c3 are the lanes
// of the real NEON register d0 and c4..c7 the lanes of d1; both registers
// are hard-coded, so the caller must have loaded all eight coefficients
// into d0/d1 beforehand.
1350*c0909341SAndroid Build Coastguard Worker.macro vmull_vmlal_8 d, s0, s1, s2, s3, s4, s5, s6, s7
1351*c0909341SAndroid Build Coastguard Worker        vmull.s16       \d,  \s0, d0[0]
1352*c0909341SAndroid Build Coastguard Worker        vmlal.s16       \d,  \s1, d0[1]
1353*c0909341SAndroid Build Coastguard Worker        vmlal.s16       \d,  \s2, d0[2]
1354*c0909341SAndroid Build Coastguard Worker        vmlal.s16       \d,  \s3, d0[3]
1355*c0909341SAndroid Build Coastguard Worker        vmlal.s16       \d,  \s4, d1[0]
1356*c0909341SAndroid Build Coastguard Worker        vmlal.s16       \d,  \s5, d1[1]
1357*c0909341SAndroid Build Coastguard Worker        vmlal.s16       \d,  \s6, d1[2]
1358*c0909341SAndroid Build Coastguard Worker        vmlal.s16       \d,  \s7, d1[3]
1359*c0909341SAndroid Build Coastguard Worker.endm
// vqrshrun_s32: narrow signed 32-bit lanes to unsigned 16-bit lanes with a
// rounding right shift and unsigned saturation.
//   shift    immediate shift amount (the macro adds the leading "#")
//   qN, dN   source / destination pairs: dN = narrow(qN >> shift)
// The first pair is always processed, the second only if q1 is given, and
// the third and fourth together when q2 is given (q3/d3 must then be given).
// Here q0, d0, ... are macro argument names, not fixed registers.
1360*c0909341SAndroid Build Coastguard Worker.macro vqrshrun_s32 shift, q0, d0, q1, d1, q2, d2, q3, d3
1361*c0909341SAndroid Build Coastguard Worker        vqrshrun.s32    \d0, \q0, #\shift
1362*c0909341SAndroid Build Coastguard Worker.ifnb \q1
1363*c0909341SAndroid Build Coastguard Worker        vqrshrun.s32    \d1, \q1, #\shift
1364*c0909341SAndroid Build Coastguard Worker.endif
1365*c0909341SAndroid Build Coastguard Worker.ifnb \q2
1366*c0909341SAndroid Build Coastguard Worker        vqrshrun.s32    \d2, \q2, #\shift
1367*c0909341SAndroid Build Coastguard Worker        vqrshrun.s32    \d3, \q3, #\shift
1368*c0909341SAndroid Build Coastguard Worker.endif
1369*c0909341SAndroid Build Coastguard Worker.endm
// vmovn_i32: narrow 32-bit lanes to 16-bit lanes by keeping the low half of
// each lane (no rounding, no saturation).
//   qN, dN   source / destination pairs: dN = low 16 bits of each lane of qN
// The first pair is always processed, the second only if q1 is given, and
// the third and fourth together when q2 is given (q3/d3 must then be given).
// Here q0, d0, ... are macro argument names, not fixed registers.
1370*c0909341SAndroid Build Coastguard Worker.macro vmovn_i32 q0, d0, q1, d1, q2, d2, q3, d3
1371*c0909341SAndroid Build Coastguard Worker        vmovn.i32       \d0, \q0
1372*c0909341SAndroid Build Coastguard Worker.ifnb \q1
1373*c0909341SAndroid Build Coastguard Worker        vmovn.i32       \d1, \q1
1374*c0909341SAndroid Build Coastguard Worker.endif
1375*c0909341SAndroid Build Coastguard Worker.ifnb \q2
1376*c0909341SAndroid Build Coastguard Worker        vmovn.i32       \d2, \q2
1377*c0909341SAndroid Build Coastguard Worker        vmovn.i32       \d3, \q3
1378*c0909341SAndroid Build Coastguard Worker.endif
1379*c0909341SAndroid Build Coastguard Worker.endm
// vrshl_s32: in-place rounding shift of signed 32-bit lanes by the per-lane
// signed counts held in the register given as shift. With vrshl a negative
// count is a rounding right shift, which is how finalize uses this macro.
//   r0, r1   always shifted (r1 is not optional here, unlike vmin_u16)
//   r2, r3   shifted as a pair when r2 is given; r3 must then be given too
1380*c0909341SAndroid Build Coastguard Worker.macro vrshl_s32 shift, r0, r1, r2, r3
1381*c0909341SAndroid Build Coastguard Worker        vrshl.s32       \r0, \r0, \shift
1382*c0909341SAndroid Build Coastguard Worker        vrshl.s32       \r1, \r1, \shift
1383*c0909341SAndroid Build Coastguard Worker.ifnb \r2
1384*c0909341SAndroid Build Coastguard Worker        vrshl.s32       \r2, \r2, \shift
1385*c0909341SAndroid Build Coastguard Worker        vrshl.s32       \r3, \r3, \shift
1386*c0909341SAndroid Build Coastguard Worker.endif
1387*c0909341SAndroid Build Coastguard Worker.endm
// vst1_32: store the two 32-bit lanes of a d register to two destinations.
//   strd     register post-added to each destination pointer after its store
//   r0       (macro argument) lane 0 -> [r0], lane 1 -> [r9]
//   r1       (macro argument, optional) same again with the advanced pointers
// The destination pointers are the real core registers r0 and r9, written
// here without a backslash; only the backslash forms are macro arguments.
// Both stores carry a :32 alignment qualifier, so both pointers must be
// 4-byte aligned.
// NOTE(review): r9 is presumably the second-row pointer set up by the
// caller - confirm at the call sites.
1388*c0909341SAndroid Build Coastguard Worker.macro vst1_32 strd, r0, r1
1389*c0909341SAndroid Build Coastguard Worker        vst1.32         {\r0[0]}, [r0, :32], \strd
1390*c0909341SAndroid Build Coastguard Worker        vst1.32         {\r0[1]}, [r9, :32], \strd
1391*c0909341SAndroid Build Coastguard Worker.ifnb \r1
1392*c0909341SAndroid Build Coastguard Worker        vst1.32         {\r1[0]}, [r0, :32], \strd
1393*c0909341SAndroid Build Coastguard Worker        vst1.32         {\r1[1]}, [r9, :32], \strd
1394*c0909341SAndroid Build Coastguard Worker.endif
1395*c0909341SAndroid Build Coastguard Worker.endm
// vst1_reg: store whole registers alternately to two destinations.
//   strd     register post-added to each destination pointer after its store
//   align    alignment qualifier placed inside the address (e.g. :64, :128)
//   r0..r7   (macro arguments) even-numbered ones go to [r0], odd-numbered
//            ones to [r9]
// r0/r1 are always stored, r2/r3 only if r2 is given, and r4..r7 as a group
// of four only if r4 is given.
// The destination pointers are the real core registers r0 and r9, written
// here without a backslash; only the backslash forms are macro arguments.
// NOTE(review): r9 is presumably the second-row pointer set up by the
// caller - confirm at the call sites.
1396*c0909341SAndroid Build Coastguard Worker.macro vst1_reg strd, align, r0, r1, r2, r3, r4, r5, r6, r7
1397*c0909341SAndroid Build Coastguard Worker        vst1.16         {\r0}, [r0, \align], \strd
1398*c0909341SAndroid Build Coastguard Worker        vst1.16         {\r1}, [r9, \align], \strd
1399*c0909341SAndroid Build Coastguard Worker.ifnb \r2
1400*c0909341SAndroid Build Coastguard Worker        vst1.16         {\r2}, [r0, \align], \strd
1401*c0909341SAndroid Build Coastguard Worker        vst1.16         {\r3}, [r9, \align], \strd
1402*c0909341SAndroid Build Coastguard Worker.endif
1403*c0909341SAndroid Build Coastguard Worker.ifnb \r4
1404*c0909341SAndroid Build Coastguard Worker        vst1.16         {\r4}, [r0, \align], \strd
1405*c0909341SAndroid Build Coastguard Worker        vst1.16         {\r5}, [r9, \align], \strd
1406*c0909341SAndroid Build Coastguard Worker        vst1.16         {\r6}, [r0, \align], \strd
1407*c0909341SAndroid Build Coastguard Worker        vst1.16         {\r7}, [r9, \align], \strd
1408*c0909341SAndroid Build Coastguard Worker.endif
1409*c0909341SAndroid Build Coastguard Worker.endm
// finalize: turn 32-bit accumulators into 16-bit output values.
//   type     "put" selects the first branch; any other value the second
//   q0..q3   32-bit accumulators (q2/q3 optional, given together)
//   d0..d3   16-bit narrowing destinations, dN receiving the result of qN
// After narrowing, only the registers passed as q0 and q1 are post-processed.
// NOTE(review): this presumably relies on d0..d3 being the halves of q0/q1,
// so that the narrowed data ends up in those two registers - confirm at the
// call sites.
//
// put:    narrow with a rounding right shift by 6 and unsigned saturation,
//         then clamp each lane to at most the matching lane of q15.
// other:  rounding shift by the per-lane counts in q14, truncating narrow,
//         then subtract q15.
// q14 and q15 are hard-coded real registers that the caller must have set
// up for the selected type.
1410*c0909341SAndroid Build Coastguard Worker.macro finalize type, q0, q1, d0, d1, q2, q3, d2, d3
1411*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1412*c0909341SAndroid Build Coastguard Worker        vqrshrun_s32    6,   \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3
1413*c0909341SAndroid Build Coastguard Worker        vmin_u16        q15, \q0, \q1
1414*c0909341SAndroid Build Coastguard Worker.else
1415*c0909341SAndroid Build Coastguard Worker        vrshl_s32       q14, \q0, \q1, \q2, \q3 // -(6-intermediate_bits)
1416*c0909341SAndroid Build Coastguard Worker        vmovn_i32       \q0, \d0, \q1, \d1, \q2, \d2, \q3, \d3
1417*c0909341SAndroid Build Coastguard Worker        vsub_i16        q15, \q0, \q1           // PREP_BIAS
1418*c0909341SAndroid Build Coastguard Worker.endif
1419*c0909341SAndroid Build Coastguard Worker.endm
// shift_store_4: finalize the accumulators, then store the results as
// individual d registers with a :64 alignment qualifier.
// Store order via vst1_reg: d0 -> [r0], d1 -> [r9], and, if d2 is given,
// d2 -> [r0], d3 -> [r9]; each pointer is post-incremented by strd.
// All arguments other than strd are forwarded to finalize unchanged.
1420*c0909341SAndroid Build Coastguard Worker.macro shift_store_4 type, strd, q0, q1, d0, d1, q2, q3, d2, d3
1421*c0909341SAndroid Build Coastguard Worker        finalize        \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3
1422*c0909341SAndroid Build Coastguard Worker        vst1_reg        \strd, :64, \d0, \d1, \d2, \d3
1423*c0909341SAndroid Build Coastguard Worker.endm
// shift_store_8: finalize the accumulators, then store the two resulting
// q registers with a :128 alignment qualifier: q0 -> [r0], q1 -> [r9], each
// pointer post-incremented by strd (via vst1_reg).
// All arguments other than strd are forwarded to finalize unchanged.
1424*c0909341SAndroid Build Coastguard Worker.macro shift_store_8 type, strd, q0, q1, d0, d1, q2, q3, d2, d3
1425*c0909341SAndroid Build Coastguard Worker        finalize        \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3
1426*c0909341SAndroid Build Coastguard Worker        vst1_reg        \strd, :128, \q0, \q1
1427*c0909341SAndroid Build Coastguard Worker.endm
// shift_store_16: finalize the accumulators, then store q0 and q1 back to
// back with a single vst1 to [r0] (the real core register), using a :128
// alignment qualifier and post-incrementing r0 by strd. Unlike the 4- and
// 8-wide variants, r9 is not used here.
// All arguments other than strd are forwarded to finalize unchanged.
1428*c0909341SAndroid Build Coastguard Worker.macro shift_store_16 type, strd, q0, q1, d0, d1, q2, q3, d2, d3
1429*c0909341SAndroid Build Coastguard Worker        finalize        \type, \q0, \q1, \d0, \d1, \q2, \q3, \d2, \d3
1430*c0909341SAndroid Build Coastguard Worker        vst1.16         {\q0, \q1}, [r0, :128], \strd
1431*c0909341SAndroid Build Coastguard Worker.endm
1432*c0909341SAndroid Build Coastguard Worker
// make_8tap_fn: emit one exported entry point named
// <op>_8tap_<type>_16bpc_neon that selects a filter combination and then
// jumps into the shared <op>_8tap_neon body.
//   op       function family prefix (also selects the shared body)
//   type     filter-combination name used in the symbol
//   type_h   constant loaded into r9
//   type_v   constant loaded into r10
// The entry point pushes r4-r11 and lr (nine words) and leaves them on the
// stack across the branch, so the shared body starts with that frame already
// in place; this matches its stack-argument loads, which begin at [sp, #36].
1433*c0909341SAndroid Build Coastguard Worker.macro make_8tap_fn op, type, type_h, type_v
1434*c0909341SAndroid Build Coastguard Workerfunction \op\()_8tap_\type\()_16bpc_neon, export=1
1435*c0909341SAndroid Build Coastguard Worker        push            {r4-r11,lr}
1436*c0909341SAndroid Build Coastguard Worker        movw            r9,  \type_h
1437*c0909341SAndroid Build Coastguard Worker        movw            r10, \type_v
1438*c0909341SAndroid Build Coastguard Worker        b               \op\()_8tap_neon
1439*c0909341SAndroid Build Coastguard Workerendfunc
1440*c0909341SAndroid Build Coastguard Worker.endm
1441*c0909341SAndroid Build Coastguard Worker
// Filter-type selectors passed to make_8tap_fn as type_h / type_v.
// Each constant packs two row offsets, both multiples of 15:
//   bits 7 and up : 0*15, 1*15 or 2*15
//   bits 0..6     : 3*15 or 4*15 (REGULAR and SHARP share the same value)
// filter_fn adds the constant to the subpel position replicated by 0x4081
// and labels the resulting fields "mx, 8tap_h, 4tap_h", so the upper field
// is the 8-tap selection and the lower field the 4-tap selection.
// NOTE(review): the factor 15 is presumably the number of entries per
// filter set in mc_subpel_filters - confirm against the table definition.
1442*c0909341SAndroid Build Coastguard Worker// No spaces in these expressions, due to gas-preprocessor.
1443*c0909341SAndroid Build Coastguard Worker#define REGULAR ((0*15<<7)|3*15)
1444*c0909341SAndroid Build Coastguard Worker#define SMOOTH  ((1*15<<7)|4*15)
1445*c0909341SAndroid Build Coastguard Worker#define SHARP   ((2*15<<7)|3*15)
1446*c0909341SAndroid Build Coastguard Worker
1447*c0909341SAndroid Build Coastguard Worker.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, my, bdmax, ds2, sr2
1448*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, regular,        REGULAR, REGULAR
1449*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, regular_smooth, REGULAR, SMOOTH
1450*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, regular_sharp,  REGULAR, SHARP
1451*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, smooth,         SMOOTH,  SMOOTH
1452*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, smooth_regular, SMOOTH,  REGULAR
1453*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, smooth_sharp,   SMOOTH,  SHARP
1454*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, sharp,          SHARP,   SHARP
1455*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, sharp_regular,  SHARP,   REGULAR
1456*c0909341SAndroid Build Coastguard Workermake_8tap_fn \type, sharp_smooth,   SHARP,   SMOOTH
1457*c0909341SAndroid Build Coastguard Worker
1458*c0909341SAndroid Build Coastguard Workerfunction \type\()_8tap_neon
1459*c0909341SAndroid Build Coastguard Worker        ldrd            r4,  r5,  [sp, #36]
1460*c0909341SAndroid Build Coastguard Worker        ldrd            r6,  r7,  [sp, #44]
1461*c0909341SAndroid Build Coastguard Worker.ifc \bdmax, r8
1462*c0909341SAndroid Build Coastguard Worker        ldr             r8,  [sp, #52]
1463*c0909341SAndroid Build Coastguard Worker.endif
1464*c0909341SAndroid Build Coastguard Worker        movw            r11, #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
1465*c0909341SAndroid Build Coastguard Worker        mul             \mx, \mx, r11
1466*c0909341SAndroid Build Coastguard Worker        mul             \my, \my, r11
1467*c0909341SAndroid Build Coastguard Worker        add             \mx, \mx, r9  // mx, 8tap_h, 4tap_h
1468*c0909341SAndroid Build Coastguard Worker        add             \my, \my, r10 // my, 8tap_v, 4tap_v
1469*c0909341SAndroid Build Coastguard Worker
1470*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1471*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \w, #1
1472*c0909341SAndroid Build Coastguard Worker.endif
1473*c0909341SAndroid Build Coastguard Worker
1474*c0909341SAndroid Build Coastguard Worker        vdup.16         q15, \bdmax            // bitdepth_max
1475*c0909341SAndroid Build Coastguard Worker        clz             \bdmax,  \bdmax
1476*c0909341SAndroid Build Coastguard Worker        clz             r9,  \w
1477*c0909341SAndroid Build Coastguard Worker        sub             \bdmax,  \bdmax,  #18  // intermediate_bits = clz(bitdepth_max) - 18
1478*c0909341SAndroid Build Coastguard Worker        tst             \mx, #(0x7f << 14)
1479*c0909341SAndroid Build Coastguard Worker        sub             r9,  r9,  #24
1480*c0909341SAndroid Build Coastguard Worker        add             lr,  \bdmax, #6        // 6 + intermediate_bits
1481*c0909341SAndroid Build Coastguard Worker        rsb             r12, \bdmax, #6        // 6 - intermediate_bits
1482*c0909341SAndroid Build Coastguard Worker        movrel          r11, X(mc_subpel_filters), -8
1483*c0909341SAndroid Build Coastguard Worker        bne             L(\type\()_8tap_h)
1484*c0909341SAndroid Build Coastguard Worker        tst             \my, #(0x7f << 14)
1485*c0909341SAndroid Build Coastguard Worker        bne             L(\type\()_8tap_v)
1486*c0909341SAndroid Build Coastguard Worker        b               \type\()_neon
1487*c0909341SAndroid Build Coastguard Worker
1488*c0909341SAndroid Build Coastguard WorkerL(\type\()_8tap_h):
1489*c0909341SAndroid Build Coastguard Worker        cmp             \w,  #4
1490*c0909341SAndroid Build Coastguard Worker        ubfx            r10, \mx, #7,  #7
1491*c0909341SAndroid Build Coastguard Worker        and             \mx, \mx, #0x7f
1492*c0909341SAndroid Build Coastguard Worker        it              gt
1493*c0909341SAndroid Build Coastguard Worker        movgt           \mx, r10
1494*c0909341SAndroid Build Coastguard Worker        tst             \my, #(0x7f << 14)
1495*c0909341SAndroid Build Coastguard Worker        add             \mx, r11, \mx, lsl #3
1496*c0909341SAndroid Build Coastguard Worker        bne             L(\type\()_8tap_hv)
1497*c0909341SAndroid Build Coastguard Worker
1498*c0909341SAndroid Build Coastguard Worker        adr             r10, L(\type\()_8tap_h_tbl)
1499*c0909341SAndroid Build Coastguard Worker        vdup.32         q14, r12           // 6 - intermediate_bits
1500*c0909341SAndroid Build Coastguard Worker        ldr             r9,  [r10, r9, lsl #2]
1501*c0909341SAndroid Build Coastguard Worker        vneg.s32        q14, q14           // -(6-intermediate_bits)
1502*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1503*c0909341SAndroid Build Coastguard Worker        vdup.16         q13, \bdmax        // intermediate_bits
1504*c0909341SAndroid Build Coastguard Worker.else
1505*c0909341SAndroid Build Coastguard Worker        vmov.i16        q13, #PREP_BIAS
1506*c0909341SAndroid Build Coastguard Worker.endif
1507*c0909341SAndroid Build Coastguard Worker        add             r10, r10, r9
1508*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1509*c0909341SAndroid Build Coastguard Worker        vneg.s16        q13, q13           // -intermediate_bits
1510*c0909341SAndroid Build Coastguard Worker.endif
1511*c0909341SAndroid Build Coastguard Worker        bx              r10
1512*c0909341SAndroid Build Coastguard Worker
1513*c0909341SAndroid Build Coastguard Worker        .align 2
1514*c0909341SAndroid Build Coastguard WorkerL(\type\()_8tap_h_tbl):
1515*c0909341SAndroid Build Coastguard Worker        .word 1280f - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
1516*c0909341SAndroid Build Coastguard Worker        .word 640f  - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
1517*c0909341SAndroid Build Coastguard Worker        .word 320f  - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
1518*c0909341SAndroid Build Coastguard Worker        .word 160f  - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
1519*c0909341SAndroid Build Coastguard Worker        .word 80f   - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
1520*c0909341SAndroid Build Coastguard Worker        .word 40f   - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
1521*c0909341SAndroid Build Coastguard Worker        .word 20f   - L(\type\()_8tap_h_tbl) + CONFIG_THUMB
1522*c0909341SAndroid Build Coastguard Worker
1523*c0909341SAndroid Build Coastguard Worker20:     // 2xN h
1524*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1525*c0909341SAndroid Build Coastguard Worker        add             \mx, \mx, #2
1526*c0909341SAndroid Build Coastguard Worker        vld1.32         {d0[]}, [\mx]
1527*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  #2
1528*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd
1529*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd
1530*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1
1531*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1
1532*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q0,  d0
1533*c0909341SAndroid Build Coastguard Worker2:
1534*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2}, [\src], \s_strd
1535*c0909341SAndroid Build Coastguard Worker        vld1.16         {q3}, [\sr2], \s_strd
1536*c0909341SAndroid Build Coastguard Worker        vext.8          d5,  d4,  d5,  #2
1537*c0909341SAndroid Build Coastguard Worker        vext.8          d7,  d6,  d7,  #2
1538*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
1539*c0909341SAndroid Build Coastguard Worker        vtrn.32         d4,  d6
1540*c0909341SAndroid Build Coastguard Worker        vtrn.32         d5,  d7
1541*c0909341SAndroid Build Coastguard Worker        vmull.s16       q1,  d4,  d0[0]
1542*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q1,  d5,  d0[1]
1543*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q1,  d6,  d0[2]
1544*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q1,  d7,  d0[3]
1545*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q1,  q1,  q14 // -(6-intermediate_bits)
1546*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d2,  q1
1547*c0909341SAndroid Build Coastguard Worker        vrshl.s16       d2,  d2,  d26 // -intermediate_bits
1548*c0909341SAndroid Build Coastguard Worker        vmin.u16        d2,  d2,  d30
1549*c0909341SAndroid Build Coastguard Worker        vst1.32         {d2[0]}, [\dst, :32], \d_strd
1550*c0909341SAndroid Build Coastguard Worker        vst1.32         {d2[1]}, [\ds2, :32], \d_strd
1551*c0909341SAndroid Build Coastguard Worker        bgt             2b
1552*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
1553*c0909341SAndroid Build Coastguard Worker.endif
1554*c0909341SAndroid Build Coastguard Worker
1555*c0909341SAndroid Build Coastguard Worker40:     // 4xN h: horizontal-only filtering of 4 pixel wide blocks, 4 taps, two rows per loop iteration
1556*c0909341SAndroid Build Coastguard Worker        add             \mx, \mx, #2 // step 2 bytes into the coefficient set at mx (the 8xN+ path below reads all 8 bytes)
1557*c0909341SAndroid Build Coastguard Worker        vld1.32         {d0[]}, [\mx] // load 4 coefficient bytes, replicated to both 32 bit lanes of d0
1558*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  #2 // start one 16 bit sample before src
1559*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd // ds2 = output row 1
1560*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd // sr2 = input row 1
1561*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1 // strides doubled: each pointer now steps over every other row
1562*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1
1563*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q0,  d0 // sign-extend coefficients to 16 bit: taps are d0[0]..d0[3]
1564*c0909341SAndroid Build Coastguard Worker4:
1565*c0909341SAndroid Build Coastguard Worker        vld1.16         {q8},  [\src], \s_strd // row 0: 8 samples into d16-d17, then advance src
1566*c0909341SAndroid Build Coastguard Worker        vld1.16         {q11}, [\sr2], \s_strd // row 1: 8 samples into d22-d23, then advance sr2
1567*c0909341SAndroid Build Coastguard Worker        vext.8          d18, d16, d17, #2 // row 0 window starting at sample 1
1568*c0909341SAndroid Build Coastguard Worker        vext.8          d19, d16, d17, #4 // row 0 window starting at sample 2
1569*c0909341SAndroid Build Coastguard Worker        vext.8          d20, d16, d17, #6 // row 0 window starting at sample 3
1570*c0909341SAndroid Build Coastguard Worker        vext.8          d24, d22, d23, #2 // row 1 window starting at sample 1
1571*c0909341SAndroid Build Coastguard Worker        vext.8          d25, d22, d23, #4 // row 1 window starting at sample 2
1572*c0909341SAndroid Build Coastguard Worker        vext.8          d21, d22, d23, #6 // row 1 window starting at sample 3
1573*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2 // two rows consumed; the flags are tested by the bgt at the end of the loop
1574*c0909341SAndroid Build Coastguard Worker        vmull.s16       q2,  d16, d0[0] // q2 = 32 bit sums for the 4 outputs of row 0
1575*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d18, d0[1]
1576*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d19, d0[2]
1577*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d20, d0[3]
1578*c0909341SAndroid Build Coastguard Worker        vmull.s16       q3,  d22, d0[0] // q3 = 32 bit sums for the 4 outputs of row 1
1579*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d24, d0[1]
1580*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d25, d0[2]
1581*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d21, d0[3]
1582*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q2,  q2,  q14 // -(6-intermediate_bits), i.e. a rounding right shift
1583*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q3,  q3,  q14 // -(6-intermediate_bits)
1584*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1585*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d4,  q2 // put: saturating narrow to unsigned 16 bit
1586*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d5,  q3
1587*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q2,  q2,  q13 // -intermediate_bits
1588*c0909341SAndroid Build Coastguard Worker        vmin.u16        q2,  q2,  q15 // clamp to the bound in q15 (set up outside this excerpt; presumably the max pixel value - confirm)
1589*c0909341SAndroid Build Coastguard Worker.else
1590*c0909341SAndroid Build Coastguard Worker        vmovn.s32       d4,  q2 // not put: plain narrow to 16 bit
1591*c0909341SAndroid Build Coastguard Worker        vmovn.s32       d5,  q3
1592*c0909341SAndroid Build Coastguard Worker        vsub.i16        q2,  q2,  q13 // PREP_BIAS
1593*c0909341SAndroid Build Coastguard Worker.endif
1594*c0909341SAndroid Build Coastguard Worker        vst1.16         {d4}, [\dst, :64], \d_strd // 4 samples to output row 0 (64 bit alignment hint), then advance dst
1595*c0909341SAndroid Build Coastguard Worker        vst1.16         {d5}, [\ds2, :64], \d_strd // 4 samples to output row 1, then advance ds2
1596*c0909341SAndroid Build Coastguard Worker        bgt             4b // rows remain (flags from the subs above)
1597*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc} // restore registers and return; the matching push is outside this excerpt
1598*c0909341SAndroid Build Coastguard Worker
1599*c0909341SAndroid Build Coastguard Worker80:
1600*c0909341SAndroid Build Coastguard Worker160:
1601*c0909341SAndroid Build Coastguard Worker320:
1602*c0909341SAndroid Build Coastguard Worker640:
1603*c0909341SAndroid Build Coastguard Worker1280:   // 8xN, 16xN, 32xN, ... h: horizontal-only 8 tap filtering, 8 columns x 2 rows per inner iteration
1604*c0909341SAndroid Build Coastguard Worker        vpush           {q4-q5} // q4 and q5 are used as scratch below and restored by the vpop before returning
1605*c0909341SAndroid Build Coastguard Worker        vld1.8          {d0}, [\mx, :64] // load all 8 coefficient bytes (64 bit alignment hint)
1606*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  #6 // start three 16 bit samples before src
1607*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd // ds2 = output row 1
1608*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd // sr2 = input row 1
1609*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1 // two rows are handled per outer iteration
1610*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q0,  d0 // sign-extend coefficients to 16 bit: taps in d0[0..3] and d1[0..3]
1611*c0909341SAndroid Build Coastguard Worker
1612*c0909341SAndroid Build Coastguard Worker        sub             \s_strd,  \s_strd,  \w, lsl #1 // one pass over a row advances src by 2*w + 16 bytes
1613*c0909341SAndroid Build Coastguard Worker        sub             \s_strd,  \s_strd,  #16 // (32 byte initial load + 16 bytes per further 8 columns), so take that off the row step
1614*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1615*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1 // put: likewise step two rows ...
1616*c0909341SAndroid Build Coastguard Worker        sub             \d_strd,  \d_strd,  \w, lsl #1 // ... minus the 2*w bytes already stepped over by the stores
1617*c0909341SAndroid Build Coastguard Worker.endif
1618*c0909341SAndroid Build Coastguard Worker81:
1619*c0909341SAndroid Build Coastguard Worker        vld1.16         {q8,  q9},  [\src]! // first 16 samples of row 0
1620*c0909341SAndroid Build Coastguard Worker        vld1.16         {q10, q11}, [\sr2]! // first 16 samples of row 1
1621*c0909341SAndroid Build Coastguard Worker        mov             \mx, \w // mx is reused as the remaining-columns counter (coefficients are already in q0)
1622*c0909341SAndroid Build Coastguard Worker
1623*c0909341SAndroid Build Coastguard Worker8:
1624*c0909341SAndroid Build Coastguard Worker        vmull.s16       q1,  d16, d0[0] // tap 0: q1/q2 = row 0 outputs 0-3/4-7
1625*c0909341SAndroid Build Coastguard Worker        vmull.s16       q2,  d17, d0[0]
1626*c0909341SAndroid Build Coastguard Worker        vmull.s16       q3,  d20, d0[0] // tap 0: q3/q4 = row 1 outputs 0-3/4-7
1627*c0909341SAndroid Build Coastguard Worker        vmull.s16       q4,  d21, d0[0]
1628*c0909341SAndroid Build Coastguard Worker.irpc i, 1234567
1629*c0909341SAndroid Build Coastguard Worker        vext.8          q12, q8,  q9,  #(2*\i) // row 0 window shifted by i samples
1630*c0909341SAndroid Build Coastguard Worker        vext.8          q5,  q10, q11, #(2*\i) // row 1 window shifted by i samples
1631*c0909341SAndroid Build Coastguard Worker.if \i < 4
1632*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q1,  d24, d0[\i] // taps 1-3 come from d0
1633*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d25, d0[\i]
1634*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d10, d0[\i]
1635*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q4,  d11, d0[\i]
1636*c0909341SAndroid Build Coastguard Worker.else
1637*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q1,  d24, d1[\i-4] // taps 4-7 come from d1
1638*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d25, d1[\i-4]
1639*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d10, d1[\i-4]
1640*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q4,  d11, d1[\i-4]
1641*c0909341SAndroid Build Coastguard Worker.endif
1642*c0909341SAndroid Build Coastguard Worker.endr
1643*c0909341SAndroid Build Coastguard Worker        subs            \mx, \mx, #8 // 8 columns done; the flags are tested by the ble after the stores
1644*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q1,  q1,  q14 // -(6-intermediate_bits), i.e. a rounding right shift
1645*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q2,  q2,  q14 // -(6-intermediate_bits)
1646*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q3,  q3,  q14 // -(6-intermediate_bits)
1647*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q4,  q4,  q14 // -(6-intermediate_bits)
1648*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1649*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d2,  q1 // put: saturating narrow to unsigned 16 bit; q1 = row 0, q2 = row 1 afterwards
1650*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d3,  q2
1651*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d4,  q3
1652*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d5,  q4
1653*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q1,  q1,  q13 // -intermediate_bits
1654*c0909341SAndroid Build Coastguard Worker        vrshl.s16       q2,  q2,  q13 // -intermediate_bits
1655*c0909341SAndroid Build Coastguard Worker        vmin.u16        q1,  q1,  q15 // clamp to the bound in q15 (set up outside this excerpt; presumably the max pixel value - confirm)
1656*c0909341SAndroid Build Coastguard Worker        vmin.u16        q2,  q2,  q15
1657*c0909341SAndroid Build Coastguard Worker.else
1658*c0909341SAndroid Build Coastguard Worker        vmovn.s32       d2,  q1 // not put: plain narrow to 16 bit; q1 = row 0, q2 = row 1 afterwards
1659*c0909341SAndroid Build Coastguard Worker        vmovn.s32       d3,  q2
1660*c0909341SAndroid Build Coastguard Worker        vmovn.s32       d4,  q3
1661*c0909341SAndroid Build Coastguard Worker        vmovn.s32       d5,  q4
1662*c0909341SAndroid Build Coastguard Worker        vsub.i16        q1,  q1,  q13 // PREP_BIAS
1663*c0909341SAndroid Build Coastguard Worker        vsub.i16        q2,  q2,  q13 // PREP_BIAS
1664*c0909341SAndroid Build Coastguard Worker.endif
1665*c0909341SAndroid Build Coastguard Worker        vst1.16         {q1}, [\dst, :128]! // 8 samples to output row 0 (128 bit alignment hint), advance 16 bytes
1666*c0909341SAndroid Build Coastguard Worker        vst1.16         {q2}, [\ds2, :128]! // 8 samples to output row 1, advance 16 bytes
1667*c0909341SAndroid Build Coastguard Worker        ble             9f // no columns left in this row pair (flags from the subs above)
1668*c0909341SAndroid Build Coastguard Worker
1669*c0909341SAndroid Build Coastguard Worker        vmov            q8,  q9 // slide the input window: the upper 8 samples become the lower 8
1670*c0909341SAndroid Build Coastguard Worker        vmov            q10, q11
1671*c0909341SAndroid Build Coastguard Worker        vld1.16         {q9},  [\src]! // fetch the next 8 samples of row 0
1672*c0909341SAndroid Build Coastguard Worker        vld1.16         {q11}, [\sr2]! // fetch the next 8 samples of row 1
1673*c0909341SAndroid Build Coastguard Worker        b               8b
1674*c0909341SAndroid Build Coastguard Worker
1675*c0909341SAndroid Build Coastguard Worker9:
1676*c0909341SAndroid Build Coastguard Worker        add             \dst,  \dst,  \d_strd // move all four pointers on to the next row pair; when type is not put,
1677*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \ds2,  \d_strd // d_strd is used exactly as it was set up outside this excerpt
1678*c0909341SAndroid Build Coastguard Worker        add             \src,  \src,  \s_strd
1679*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \sr2,  \s_strd
1680*c0909341SAndroid Build Coastguard Worker
1681*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2 // two rows done
1682*c0909341SAndroid Build Coastguard Worker        bgt             81b
1683*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q5} // undo the vpush at the top of this path
1684*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc} // restore registers and return; the matching push is outside this excerpt
1685*c0909341SAndroid Build Coastguard Worker
1686*c0909341SAndroid Build Coastguard Worker
1687*c0909341SAndroid Build Coastguard WorkerL(\type\()_8tap_v): // vertical-only entry: pick the coefficient set from my, then dispatch on the table index in r9
1688*c0909341SAndroid Build Coastguard Worker        cmp             \h,  #4 // these flags stay live through the bx and are consumed by the bgt at the 20/40/80/160 targets
1689*c0909341SAndroid Build Coastguard Worker        ubfx            r10, \my, #7,  #7 // r10 = bits 7..13 of my
1690*c0909341SAndroid Build Coastguard Worker        and             \my, \my, #0x7f // my = bits 0..6 of my
1691*c0909341SAndroid Build Coastguard Worker        it              gt // conditional-execution prefix for the movgt below
1692*c0909341SAndroid Build Coastguard Worker        movgt           \my, r10 // h > 4: use the index from bits 7..13 instead
1693*c0909341SAndroid Build Coastguard Worker        add             \my, r11, \my, lsl #3 // my = r11 + 8 * index; r11 is set up outside this excerpt (presumably the coefficient table base - confirm)
1694*c0909341SAndroid Build Coastguard Worker
1695*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1696*c0909341SAndroid Build Coastguard Worker        vdup.32         q14, r12        // 6 - intermediate_bits
1697*c0909341SAndroid Build Coastguard Worker        vmov.i16        q15, #PREP_BIAS
1698*c0909341SAndroid Build Coastguard Worker.endif
1699*c0909341SAndroid Build Coastguard Worker        adr             r10, L(\type\()_8tap_v_tbl) // r10 = address of the jump table that follows
1700*c0909341SAndroid Build Coastguard Worker        ldr             r9,  [r10, r9, lsl #2] // r9 = 32 bit table entry at index r9 (the index is computed outside this excerpt)
1701*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
1702*c0909341SAndroid Build Coastguard Worker        vneg.s32        q14, q14        // -(6-intermediate_bits)
1703*c0909341SAndroid Build Coastguard Worker.endif
1704*c0909341SAndroid Build Coastguard Worker        add             r10, r10, r9 // entries are offsets relative to the table start
1705*c0909341SAndroid Build Coastguard Worker        bx              r10 // jump to the selected per-width path
1706*c0909341SAndroid Build Coastguard Worker
1707*c0909341SAndroid Build Coastguard Worker        .align 2 // 4 byte alignment for the word sized entries
1708*c0909341SAndroid Build Coastguard WorkerL(\type\()_8tap_v_tbl): // table-relative offsets of the per-width paths, read by the ldr/add/bx sequence just above
1709*c0909341SAndroid Build Coastguard Worker        .word 1280f - L(\type\()_8tap_v_tbl) + CONFIG_THUMB // index 0; CONFIG_THUMB is defined elsewhere (presumably sets bit 0 so bx stays in Thumb state - confirm)
1710*c0909341SAndroid Build Coastguard Worker        .word 640f  - L(\type\()_8tap_v_tbl) + CONFIG_THUMB // index 1
1711*c0909341SAndroid Build Coastguard Worker        .word 320f  - L(\type\()_8tap_v_tbl) + CONFIG_THUMB // index 2: 32 wide
1712*c0909341SAndroid Build Coastguard Worker        .word 160f  - L(\type\()_8tap_v_tbl) + CONFIG_THUMB // index 3: 16 wide
1713*c0909341SAndroid Build Coastguard Worker        .word 80f   - L(\type\()_8tap_v_tbl) + CONFIG_THUMB // index 4: 8 wide
1714*c0909341SAndroid Build Coastguard Worker        .word 40f   - L(\type\()_8tap_v_tbl) + CONFIG_THUMB // index 5: 4 wide
1715*c0909341SAndroid Build Coastguard Worker        .word 20f   - L(\type\()_8tap_v_tbl) + CONFIG_THUMB // index 6: 2 wide
1716*c0909341SAndroid Build Coastguard Worker
1717*c0909341SAndroid Build Coastguard Worker20:     // 2xN v: vertical-only filtering of 2 pixel wide blocks; the body is only assembled for put
1718*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1719*c0909341SAndroid Build Coastguard Worker        bgt             28f // h > 4 (flags from the cmp in the dispatcher): take the 8 tap path
1720*c0909341SAndroid Build Coastguard Worker
1721*c0909341SAndroid Build Coastguard Worker        cmp             \h,  #2 // tested by the bgt 24f below; relies on the helper macros in between leaving the flags untouched
1722*c0909341SAndroid Build Coastguard Worker        add             \my, \my, #2 // step 2 bytes into the coefficient set at my
1723*c0909341SAndroid Build Coastguard Worker        vld1.32         {d0[]}, [\my] // load 4 coefficient bytes, replicated to both 32 bit lanes of d0
1724*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  \s_strd // start one row above src
1725*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd // ds2 = output row 1
1726*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd // sr2 = the input row after src
1727*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1 // strides doubled: each pointer now steps over every other row
1728*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1
1729*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q0,  d0 // sign-extend coefficients to 16 bit: taps are d0[0]..d0[3]
1730*c0909341SAndroid Build Coastguard Worker
1731*c0909341SAndroid Build Coastguard Worker        // 2x2 v (load_32, interleave_1_32, vmull_vmlal_4, vqrshrun_s32, vmin_u16 and vst1_32 are macros defined elsewhere in the file)
1732*c0909341SAndroid Build Coastguard Worker        load_32         \src, \sr2, \s_strd, d1, d2, d3, d4, d5 // five input rows into d1..d5
1733*c0909341SAndroid Build Coastguard Worker        interleave_1_32 d1,  d2,  d3,  d4,  d5 // presumably packs neighbouring rows into one register - confirm against the macro
1734*c0909341SAndroid Build Coastguard Worker        bgt             24f // h > 2: continue with the 2x4 case
1735*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_4   q8,  d1,  d2,  d3,  d4 // 4 tap sums for both output rows
1736*c0909341SAndroid Build Coastguard Worker        vqrshrun_s32    6,   q8,  d16
1737*c0909341SAndroid Build Coastguard Worker        vmin_u16        d30, d16 // d30 is the low half of q15, set up outside this excerpt
1738*c0909341SAndroid Build Coastguard Worker        vst1_32         \d_strd,  d16
1739*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc} // restore registers and return; the matching push is outside this excerpt
1740*c0909341SAndroid Build Coastguard Worker
1741*c0909341SAndroid Build Coastguard Worker24:     // 2x4 v
1742*c0909341SAndroid Build Coastguard Worker        load_32         \sr2, \src, \s_strd, d6, d7 // two more input rows; note the pointer order is swapped relative to the first load
1743*c0909341SAndroid Build Coastguard Worker        interleave_1_32 d5,  d6,  d7
1744*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_4   q8,  d1,  d2,  d3,  d4 // sums for output rows 0-1
1745*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_4   q9,  d3,  d4,  d5,  d6 // sums for output rows 2-3
1746*c0909341SAndroid Build Coastguard Worker        vqrshrun_s32    6,   q8,  d16, q9,  d17
1747*c0909341SAndroid Build Coastguard Worker        vmin_u16        q15, q8
1748*c0909341SAndroid Build Coastguard Worker        vst1_32         \d_strd,  d16, d17
1749*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
1750*c0909341SAndroid Build Coastguard Worker
1751*c0909341SAndroid Build Coastguard Worker28:     // 2x6, 2x8, 2x12, 2x16 v: 8 tap path, 4 output rows per loop iteration
1752*c0909341SAndroid Build Coastguard Worker        vld1.8          {d0}, [\my, :64] // load all 8 coefficient bytes (64 bit alignment hint)
1753*c0909341SAndroid Build Coastguard Worker        sub             \sr2,  \src,  \s_strd, lsl #1 // sr2 = two rows above src
1754*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd // ds2 = output row 1
1755*c0909341SAndroid Build Coastguard Worker        sub             \src,  \sr2,  \s_strd // src = three rows above its original position
1756*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1 // strides doubled: each pointer now steps over every other row
1757*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1
1758*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q0,  d0 // sign-extend coefficients to 16 bit: taps in d0[0..3] and d1[0..3]
1759*c0909341SAndroid Build Coastguard Worker
1760*c0909341SAndroid Build Coastguard Worker        load_32         \src, \sr2, \s_strd, d2, d3, d4, d5, d6, d7, d16 // preload seven input rows
1761*c0909341SAndroid Build Coastguard Worker        interleave_1_32 d2,  d3,  d4,  d5,  d6
1762*c0909341SAndroid Build Coastguard Worker        interleave_1_32 d6,  d7,  d16
1763*c0909341SAndroid Build Coastguard Worker216:
1764*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #4 // tested by the ble 0f below; relies on the helper macros in between leaving the flags untouched
1765*c0909341SAndroid Build Coastguard Worker        load_32         \sr2, \src, \s_strd, d17, d18, d19, d20 // four more input rows
1766*c0909341SAndroid Build Coastguard Worker        interleave_1_32 d16, d17, d18, d19, d20
1767*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_8   q13, d2,  d3,  d4,  d5,  d6,  d7,  d16, d17 // sums for output rows 0-1
1768*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_8   q1,  d4,  d5,  d6,  d7,  d16, d17, d18, d19 // sums for output rows 2-3 (overwrites d2-d3, which are consumed by now)
1769*c0909341SAndroid Build Coastguard Worker        vqrshrun_s32    6,   q13, d26, q1,  d27
1770*c0909341SAndroid Build Coastguard Worker        vmin_u16        q15, q13
1771*c0909341SAndroid Build Coastguard Worker        vst1_32         \d_strd,  d26, d27
1772*c0909341SAndroid Build Coastguard Worker        ble             0f // no rows left
1773*c0909341SAndroid Build Coastguard Worker        cmp             \h,  #2 // tested by the beq after the register shuffle
1774*c0909341SAndroid Build Coastguard Worker        vmov            q1,  q3 // slide the input window down by four registers: d2-d3 = d6-d7
1775*c0909341SAndroid Build Coastguard Worker        vmov            q2,  q8 // d4-d5 = d16-d17
1776*c0909341SAndroid Build Coastguard Worker        vmov            q3,  q9 // d6-d7 = d18-d19
1777*c0909341SAndroid Build Coastguard Worker        vmov            d16, d20
1778*c0909341SAndroid Build Coastguard Worker        beq             26f // exactly two rows left: finish with the short tail
1779*c0909341SAndroid Build Coastguard Worker        b               216b
1780*c0909341SAndroid Build Coastguard Worker26:
1781*c0909341SAndroid Build Coastguard Worker        load_32         \sr2, \src, \s_strd, d17, d18 // tail: two more input rows, one pair of output rows
1782*c0909341SAndroid Build Coastguard Worker        interleave_1_32 d16, d17, d18
1783*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_8   q13, d2,  d3,  d4,  d5,  d6,  d7,  d16, d17
1784*c0909341SAndroid Build Coastguard Worker        vqrshrun_s32    6,   q13, d26
1785*c0909341SAndroid Build Coastguard Worker        vmin_u16        d30, d26
1786*c0909341SAndroid Build Coastguard Worker        vst1_32         \d_strd,  d26
1787*c0909341SAndroid Build Coastguard Worker0:
1788*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc} // restore registers and return; the matching push is outside this excerpt
1789*c0909341SAndroid Build Coastguard Worker.endif
1790*c0909341SAndroid Build Coastguard Worker
1791*c0909341SAndroid Build Coastguard Worker40:     // 4xN v: vertical-only filtering of 4 pixel wide blocks
1792*c0909341SAndroid Build Coastguard Worker        bgt             480f // h > 4 (flags from the cmp in the dispatcher): take the 8 tap path
1793*c0909341SAndroid Build Coastguard Worker
1794*c0909341SAndroid Build Coastguard Worker        // 4x2, 4x4 v: 4 taps (load_reg, vmull_vmlal_4 and shift_store_4 are macros defined elsewhere in the file)
1795*c0909341SAndroid Build Coastguard Worker        cmp             \h,  #2 // tested by the ble 0f below; relies on the helper macros in between leaving the flags untouched
1796*c0909341SAndroid Build Coastguard Worker        add             \my, \my, #2 // step 2 bytes into the coefficient set at my
1797*c0909341SAndroid Build Coastguard Worker        vld1.32         {d0[]}, [\my] // load 4 coefficient bytes, replicated to both 32 bit lanes of d0
1798*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, \s_strd // start one row above src
1799*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd // ds2 = output row 1
1800*c0909341SAndroid Build Coastguard Worker        add             \sr2, \src, \s_strd // sr2 = the input row after src
1801*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1 // strides doubled: each pointer now steps over every other row
1802*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
1803*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q0,  d0 // sign-extend coefficients to 16 bit: taps are d0[0]..d0[3]
1804*c0909341SAndroid Build Coastguard Worker
1805*c0909341SAndroid Build Coastguard Worker        load_reg        \src, \sr2, \s_strd, d1, d2, d3, d4, d5 // five input rows, one d register per row
1806*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_4   q8,  d1,  d2,  d3,  d4 // sums for output row 0
1807*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_4   q9,  d2,  d3,  d4,  d5 // sums for output row 1
1808*c0909341SAndroid Build Coastguard Worker        shift_store_4   \type, \d_strd, q8, q9, d16, d17
1809*c0909341SAndroid Build Coastguard Worker        ble             0f // h <= 2: done
1810*c0909341SAndroid Build Coastguard Worker        load_reg        \sr2, \src, \s_strd, d6, d7 // two more input rows; note the pointer order is swapped relative to the first load
1811*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_4   q8,  d3,  d4,  d5,  d6 // sums for output row 2
1812*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_4   q9,  d4,  d5,  d6,  d7 // sums for output row 3
1813*c0909341SAndroid Build Coastguard Worker        shift_store_4   \type, \d_strd, q8, q9, d16, d17
1814*c0909341SAndroid Build Coastguard Worker0:
1815*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc} // restore registers and return; the matching push is outside this excerpt
1816*c0909341SAndroid Build Coastguard Worker
1817*c0909341SAndroid Build Coastguard Worker480:    // 4x6, 4x8, 4x12, 4x16 v: 8 tap path, 4 output rows per loop iteration
1818*c0909341SAndroid Build Coastguard Worker        vld1.8          {d0}, [\my, :64] // load all 8 coefficient bytes (64 bit alignment hint)
1819*c0909341SAndroid Build Coastguard Worker        sub             \sr2, \src, \s_strd, lsl #1 // sr2 = two rows above src
1820*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd // ds2 = output row 1
1821*c0909341SAndroid Build Coastguard Worker        sub             \src, \sr2, \s_strd // src = three rows above its original position
1822*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1 // strides doubled: each pointer now steps over every other row
1823*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
1824*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q0,  d0 // sign-extend coefficients to 16 bit: taps in d0[0..3] and d1[0..3]
1825*c0909341SAndroid Build Coastguard Worker
1826*c0909341SAndroid Build Coastguard Worker        load_reg        \src, \sr2, \s_strd, d16, d17, d18, d19, d20, d21, d22 // preload seven input rows (load_reg is a macro defined elsewhere)
1827*c0909341SAndroid Build Coastguard Worker
1828*c0909341SAndroid Build Coastguard Worker48:
1829*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #4 // tested by the ble 0f below; relies on the helper macros in between leaving the flags untouched
1830*c0909341SAndroid Build Coastguard Worker        load_reg        \sr2, \src, \s_strd, d23, d24, d25, d26 // four more input rows
1831*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_8   q1,  d16, d17, d18, d19, d20, d21, d22, d23 // sums for output row 0
1832*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_8   q2,  d17, d18, d19, d20, d21, d22, d23, d24 // sums for output row 1
1833*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_8   q3,  d18, d19, d20, d21, d22, d23, d24, d25 // sums for output row 2
1834*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_8   q8,  d19, d20, d21, d22, d23, d24, d25, d26 // sums for output row 3 (overwrites d16-d17, which are not inputs here)
1835*c0909341SAndroid Build Coastguard Worker        shift_store_4   \type, \d_strd, q1, q2, d2, d3, q3, q8, d4, d5
1836*c0909341SAndroid Build Coastguard Worker        ble             0f // no rows left
1837*c0909341SAndroid Build Coastguard Worker        cmp             \h,  #2 // tested by the beq after the register shuffle
1838*c0909341SAndroid Build Coastguard Worker        vmov            q8,  q10 // slide the input window down by four rows: d16-d17 = d20-d21
1839*c0909341SAndroid Build Coastguard Worker        vmov            q9,  q11 // d18-d19 = d22-d23
1840*c0909341SAndroid Build Coastguard Worker        vmov            q10, q12 // d20-d21 = d24-d25
1841*c0909341SAndroid Build Coastguard Worker        vmov            d22, d26
1842*c0909341SAndroid Build Coastguard Worker        beq             46f // exactly two rows left: finish with the short tail
1843*c0909341SAndroid Build Coastguard Worker        b               48b
1844*c0909341SAndroid Build Coastguard Worker46:
1845*c0909341SAndroid Build Coastguard Worker        load_reg        \sr2, \src, \s_strd, d23, d24 // tail: two more input rows, two output rows
1846*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_8   q1,  d16, d17, d18, d19, d20, d21, d22, d23
1847*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_8   q2,  d17, d18, d19, d20, d21, d22, d23, d24
1848*c0909341SAndroid Build Coastguard Worker        shift_store_4   \type, \d_strd, q1, q2, d2, d3
1849*c0909341SAndroid Build Coastguard Worker0:
1850*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc} // restore registers and return; the matching push is outside this excerpt
1851*c0909341SAndroid Build Coastguard Worker
1852*c0909341SAndroid Build Coastguard Worker80:     // 8xN v: vertical-only filtering of 8 pixel wide blocks
1853*c0909341SAndroid Build Coastguard Worker        bgt             880f // h > 4 (flags from the cmp in the dispatcher): take the 8 tap path
1854*c0909341SAndroid Build Coastguard Worker
1855*c0909341SAndroid Build Coastguard Worker        // 8x2, 8x4 v: 4 taps (load_reg, vmull_vmlal_4 and shift_store_8 are macros defined elsewhere in the file)
1856*c0909341SAndroid Build Coastguard Worker        cmp             \h,  #2 // tested by the ble 0f below; relies on the helper macros in between leaving the flags untouched
1857*c0909341SAndroid Build Coastguard Worker        add             \my, \my, #2 // step 2 bytes into the coefficient set at my
1858*c0909341SAndroid Build Coastguard Worker        vld1.32         {d0[]}, [\my] // load 4 coefficient bytes, replicated to both 32 bit lanes of d0
1859*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, \s_strd // start one row above src
1860*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd // ds2 = output row 1
1861*c0909341SAndroid Build Coastguard Worker        add             \sr2, \src, \s_strd // sr2 = the input row after src
1862*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1 // strides doubled: each pointer now steps over every other row
1863*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
1864*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q0,  d0 // sign-extend coefficients to 16 bit: taps are d0[0]..d0[3]
1865*c0909341SAndroid Build Coastguard Worker
1866*c0909341SAndroid Build Coastguard Worker        load_reg        \src, \sr2, \s_strd, q1, q2, q3, q8, q9 // five input rows, one q register (8 samples) per row
1867*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_4   q10, d2,  d4,  d6,  d16 // output row 0, low 4 columns
1868*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_4   q11, d3,  d5,  d7,  d17 // output row 0, high 4 columns
1869*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_4   q12, d4,  d6,  d16, d18 // output row 1, low 4 columns
1870*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_4   q13, d5,  d7,  d17, d19 // output row 1, high 4 columns
1871*c0909341SAndroid Build Coastguard Worker        shift_store_8   \type, \d_strd, q10, q11, d20, d21, q12, q13, d22, d23
1872*c0909341SAndroid Build Coastguard Worker        ble             0f // h <= 2: done
1873*c0909341SAndroid Build Coastguard Worker        load_reg        \sr2, \src, \s_strd, q10, q11 // two more input rows; note the pointer order is swapped relative to the first load
1874*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_4   q1,  d6,  d16, d18, d20 // output row 2, low 4 columns (q1 and q2 are free to reuse by now)
1875*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_4   q2,  d7,  d17, d19, d21 // output row 2, high 4 columns
1876*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_4   q12, d16, d18, d20, d22 // output row 3, low 4 columns
1877*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_4   q13, d17, d19, d21, d23 // output row 3, high 4 columns
1878*c0909341SAndroid Build Coastguard Worker        shift_store_8   \type, \d_strd, q1, q2, d2, d3, q12, q13, d4, d5
1879*c0909341SAndroid Build Coastguard Worker0:
1880*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc} // restore registers and return; the matching push is outside this excerpt
1881*c0909341SAndroid Build Coastguard Worker
1882*c0909341SAndroid Build Coastguard Worker880:    // 8x6, 8x8, 8x16, 8x32 v: 8 tap path, processed as 8 column wide strips over the full height
1883*c0909341SAndroid Build Coastguard Worker1680:   // 16x8, 16x16, ...
1884*c0909341SAndroid Build Coastguard Worker320:    // 32x8, 32x16, ...
1885*c0909341SAndroid Build Coastguard Worker640:
1886*c0909341SAndroid Build Coastguard Worker1280:
1887*c0909341SAndroid Build Coastguard Worker        vpush           {q4-q7} // q4-q7 are used below and restored by the vpop before returning
1888*c0909341SAndroid Build Coastguard Worker        vld1.8          {d0}, [\my, :64] // load all 8 coefficient bytes (64 bit alignment hint)
1889*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, \s_strd // together with the next line:
1890*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, \s_strd, lsl #1 // src = three rows above its original position
1891*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q0,  d0 // sign-extend coefficients to 16 bit: taps in d0[0..3] and d1[0..3]
1892*c0909341SAndroid Build Coastguard Worker        mov             \my, \h // my is reused to remember the full height for the later strips
1893*c0909341SAndroid Build Coastguard Worker168:
1894*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd // per strip setup: ds2 = output row 1
1895*c0909341SAndroid Build Coastguard Worker        add             \sr2, \src, \s_strd // sr2 = the input row after src
1896*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1 // strides doubled for the strip; undone by the asr at label 9
1897*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
1898*c0909341SAndroid Build Coastguard Worker
1899*c0909341SAndroid Build Coastguard Worker        load_reg        \src, \sr2, \s_strd, q5, q6, q7, q8, q9, q10, q11 // preload seven input rows, one q register per row (load_reg is a macro defined elsewhere)
1900*c0909341SAndroid Build Coastguard Worker
1901*c0909341SAndroid Build Coastguard Worker88:
1902*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2 // tested by the ble 9f below; relies on the helper macros in between leaving the flags untouched
1903*c0909341SAndroid Build Coastguard Worker        load_reg        \sr2, \src, \s_strd, q12, q13 // two more input rows
1904*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_8   q1,  d10, d12, d14, d16, d18, d20, d22, d24 // output row 0, low 4 columns
1905*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_8   q2,  d11, d13, d15, d17, d19, d21, d23, d25 // output row 0, high 4 columns
1906*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_8   q3,  d12, d14, d16, d18, d20, d22, d24, d26 // output row 1, low 4 columns
1907*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_8   q4,  d13, d15, d17, d19, d21, d23, d25, d27 // output row 1, high 4 columns
1908*c0909341SAndroid Build Coastguard Worker        shift_store_8   \type, \d_strd, q1, q2,  d2,  d3,  q3,  q4,  d4,  d5
1909*c0909341SAndroid Build Coastguard Worker        ble             9f // strip finished
1910*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2 // second half of the unrolled loop: two more output rows
1911*c0909341SAndroid Build Coastguard Worker        load_reg        \sr2, \src, \s_strd, q1,  q2 // two more input rows, held in q1 and q2
1912*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_8   q3,  d14, d16, d18, d20, d22, d24, d26, d2 // output row 2, low 4 columns
1913*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_8   q4,  d15, d17, d19, d21, d23, d25, d27, d3 // output row 2, high 4 columns
1914*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_8   q5,  d16, d18, d20, d22, d24, d26, d2,  d4 // output row 3, low 4 columns (q5 and q6 held rows that are consumed by now)
1915*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_8   q6,  d17, d19, d21, d23, d25, d27, d3,  d5 // output row 3, high 4 columns
1916*c0909341SAndroid Build Coastguard Worker        shift_store_8   \type, \d_strd, q3, q4,  d6,  d7,  q5,  q6,  d8,  d9
1917*c0909341SAndroid Build Coastguard Worker        ble             9f // strip finished
1918*c0909341SAndroid Build Coastguard Worker        vmov            q5,  q9 // slide the seven row window down by four rows
1919*c0909341SAndroid Build Coastguard Worker        vmov            q6,  q10
1920*c0909341SAndroid Build Coastguard Worker        vmov            q7,  q11
1921*c0909341SAndroid Build Coastguard Worker        vmov            q8,  q12
1922*c0909341SAndroid Build Coastguard Worker        vmov            q9,  q13
1923*c0909341SAndroid Build Coastguard Worker        vmov            q10, q1
1924*c0909341SAndroid Build Coastguard Worker        vmov            q11, q2
1925*c0909341SAndroid Build Coastguard Worker        b               88b
1926*c0909341SAndroid Build Coastguard Worker9:
1927*c0909341SAndroid Build Coastguard Worker        subs            \w,  \w,  #8 // one 8 column strip finished
1928*c0909341SAndroid Build Coastguard Worker        ble             0f // no columns left
1929*c0909341SAndroid Build Coastguard Worker        asr             \s_strd, \s_strd, #1 // undo the stride doubling done at label 168
1930*c0909341SAndroid Build Coastguard Worker        asr             \d_strd, \d_strd, #1
1931*c0909341SAndroid Build Coastguard Worker        mls             \src, \s_strd, \my, \src // src -= s_strd * my (my holds the original h)
1932*c0909341SAndroid Build Coastguard Worker        mls             \dst, \d_strd, \my, \dst // dst -= d_strd * my
1933*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, \s_strd, lsl #3 // src back a further 8 rows; NOTE(review): presumably the rows read ahead by the preload - confirm against load_reg
1934*c0909341SAndroid Build Coastguard Worker        mov             \h,  \my // restore the row counter for the next strip
1935*c0909341SAndroid Build Coastguard Worker        add             \src, \src, #16 // 16 bytes = 8 samples further along the row
1936*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, #16
1937*c0909341SAndroid Build Coastguard Worker        b               168b
1938*c0909341SAndroid Build Coastguard Worker0:
1939*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q7} // undo the vpush at the top of this path
1940*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc} // restore registers and return; the matching push is outside this excerpt
1941*c0909341SAndroid Build Coastguard Worker
1942*c0909341SAndroid Build Coastguard Worker160:    // 16xN v: vertical-only filtering of 16 pixel wide blocks
1943*c0909341SAndroid Build Coastguard Worker        bgt             1680b // h > 4 (flags from the cmp in the dispatcher): use the 8 tap strip path above
1944*c0909341SAndroid Build Coastguard Worker
1945*c0909341SAndroid Build Coastguard Worker        // 16x2, 16x4 v: 4 taps, one output row of 16 samples per loop iteration
1946*c0909341SAndroid Build Coastguard Worker        vpush           {q6-q7} // q6 and q7 are used below and restored by the vpop before returning
1947*c0909341SAndroid Build Coastguard Worker        add             \my, \my, #2 // step 2 bytes into the coefficient set at my
1948*c0909341SAndroid Build Coastguard Worker        vld1.32         {d0[]}, [\my] // load 4 coefficient bytes, replicated to both 32 bit lanes of d0
1949*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, \s_strd // start one row above src
1950*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q0,  d0 // sign-extend coefficients to 16 bit: taps are d0[0]..d0[3]
1951*c0909341SAndroid Build Coastguard Worker
1952*c0909341SAndroid Build Coastguard Worker        load_16s16      \src, \src, \s_strd, q6,  q7,  q8,  q9, q10, q11 // preload three input rows, two q registers per row (macro defined elsewhere; src is passed for both pointer arguments)
1953*c0909341SAndroid Build Coastguard Worker16:
1954*c0909341SAndroid Build Coastguard Worker        load_16s16      \src, \src, \s_strd, q12, q13 // next input row
1955*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #1 // tested by the ble 0f below; relies on the helper macros in between leaving the flags untouched
1956*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_4   q1,  d12, d16, d20, d24 // columns 0-3
1957*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_4   q2,  d13, d17, d21, d25 // columns 4-7
1958*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_4   q3,  d14, d18, d22, d26 // columns 8-11
1959*c0909341SAndroid Build Coastguard Worker        vmull_vmlal_4   q6,  d15, d19, d23, d27 // columns 12-15 (overwrites d12-d13, which are consumed by now)
1960*c0909341SAndroid Build Coastguard Worker        shift_store_16  \type, \d_strd, q1, q2, d2, d3, q3, q6, d4, d5
1961*c0909341SAndroid Build Coastguard Worker        ble             0f // no rows left
1962*c0909341SAndroid Build Coastguard Worker        vmov            q6,  q8 // slide the three row window down by one row
1963*c0909341SAndroid Build Coastguard Worker        vmov            q7,  q9
1964*c0909341SAndroid Build Coastguard Worker        vmov            q8,  q10
1965*c0909341SAndroid Build Coastguard Worker        vmov            q9,  q11
1966*c0909341SAndroid Build Coastguard Worker        vmov            q10, q12
1967*c0909341SAndroid Build Coastguard Worker        vmov            q11, q13
1968*c0909341SAndroid Build Coastguard Worker        b               16b
1969*c0909341SAndroid Build Coastguard Worker0:
1970*c0909341SAndroid Build Coastguard Worker        vpop            {q6-q7} // undo the vpush at the top of this path
1971*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc} // restore registers and return; the matching push is outside this excerpt
1972*c0909341SAndroid Build Coastguard Worker
1973*c0909341SAndroid Build Coastguard Worker
1974*c0909341SAndroid Build Coastguard WorkerL(\type\()_8tap_hv):
        // Entry of the combined horizontal+vertical filter. Picks the vertical
        // coefficient row, prepares the rounding shift vectors and jumps to
        // the routine selected by r9 through the offset table below.
        // r9, r11, r12 and lr are set up before this point (not visible here).
1975*c0909341SAndroid Build Coastguard Worker        cmp             \h,  #4            // nothing below sets flags, so each table target can still bgt on this
1976*c0909341SAndroid Build Coastguard Worker        ubfx            r10, \my, #7,  #7  // r10 = bits 7-13 of my
1977*c0909341SAndroid Build Coastguard Worker        and             \my, \my, #0x7f    // my  = bits 0-6 of my
1978*c0909341SAndroid Build Coastguard Worker        it              gt
1979*c0909341SAndroid Build Coastguard Worker        movgt           \my, r10           // h > 4: use the index from the upper field instead
1980*c0909341SAndroid Build Coastguard Worker4:                                         // numeric local label; no reference to it is visible in this part of the file
1981*c0909341SAndroid Build Coastguard Worker        add             \my, r11, \my, lsl #3 // my = r11 + 8*index (r11 presumably is the coefficient table base - TODO confirm)
1982*c0909341SAndroid Build Coastguard Worker
1983*c0909341SAndroid Build Coastguard Worker        adr             r10, L(\type\()_8tap_hv_tbl)
1984*c0909341SAndroid Build Coastguard Worker        neg             r12, r12           // -(6-intermediate_bits)
1985*c0909341SAndroid Build Coastguard Worker        ldr             r9,  [r10, r9, lsl #2] // r9 = table[r9], an offset relative to the table
1986*c0909341SAndroid Build Coastguard Worker        vdup.32         q14, r12           // -(6-intermediate_bits)
1987*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1988*c0909341SAndroid Build Coastguard Worker        neg             r8,  lr            // -(6+intermediate_bits)
1989*c0909341SAndroid Build Coastguard Worker.else
1990*c0909341SAndroid Build Coastguard Worker        vmov.i16        q13, #PREP_BIAS
1991*c0909341SAndroid Build Coastguard Worker.endif
1992*c0909341SAndroid Build Coastguard Worker        add             r10, r10, r9       // absolute address of the selected routine
1993*c0909341SAndroid Build Coastguard Worker.ifc \type, put
1994*c0909341SAndroid Build Coastguard Worker        vdup.32         q13, r8            // -(6+intermediate_bits)
1995*c0909341SAndroid Build Coastguard Worker.endif
1996*c0909341SAndroid Build Coastguard Worker        bx              r10
1997*c0909341SAndroid Build Coastguard Worker
        // Offsets of the per-size routines, relative to the table itself so
        // that the table is position independent. CONFIG_THUMB is added to
        // each entry, presumably to set the Thumb bit that bx expects when the
        // file is built as Thumb code - TODO confirm the value of the define.
1998*c0909341SAndroid Build Coastguard Worker        .align 2
1999*c0909341SAndroid Build Coastguard WorkerL(\type\()_8tap_hv_tbl):
2000*c0909341SAndroid Build Coastguard Worker        .word 1280f - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
2001*c0909341SAndroid Build Coastguard Worker        .word 640f  - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
2002*c0909341SAndroid Build Coastguard Worker        .word 320f  - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
2003*c0909341SAndroid Build Coastguard Worker        .word 160f  - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
2004*c0909341SAndroid Build Coastguard Worker        .word 80f   - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
2005*c0909341SAndroid Build Coastguard Worker        .word 40f   - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
2006*c0909341SAndroid Build Coastguard Worker        .word 20f   - L(\type\()_8tap_hv_tbl) + CONFIG_THUMB
2007*c0909341SAndroid Build Coastguard Worker
2008*c0909341SAndroid Build Coastguard Worker20:
2009*c0909341SAndroid Build Coastguard Worker.ifc \type, put                            // the 2 pixel wide code is assembled for put only (closed by the .endif after the filter_2 helper)
2010*c0909341SAndroid Build Coastguard Worker        add             \mx, \mx, #2       // skip the first two coefficient bytes
2011*c0909341SAndroid Build Coastguard Worker        vld1.32         {d0[]}, [\mx]      // 4 horizontal coefficient bytes
2012*c0909341SAndroid Build Coastguard Worker        bgt             280f               // flags still from the cmp h, #4 in the dispatcher
2013*c0909341SAndroid Build Coastguard Worker        add             \my, \my, #2
2014*c0909341SAndroid Build Coastguard Worker        vld1.32         {d2[]}, [\my]      // 4 vertical coefficient bytes
2015*c0909341SAndroid Build Coastguard Worker
2016*c0909341SAndroid Build Coastguard Worker        // 2x2, 2x4 hv
2017*c0909341SAndroid Build Coastguard Worker        sub             \sr2, \src, #2     // one 16-bit pixel to the left
2018*c0909341SAndroid Build Coastguard Worker        sub             \src, \sr2, \s_strd // and one row above
2019*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd // ds2 = second output row
2020*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1 // two rows are handled per iteration,
2021*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1 // so both strides are doubled
2022*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q0,  d0            // sign-extend the coefficients to 16 bit
2023*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q1,  d2
2024*c0909341SAndroid Build Coastguard Worker
        // Horizontal filter of the first (topmost) row: two 4-tap dot products
        // computed as lane-wise products followed by pairwise adds.
2025*c0909341SAndroid Build Coastguard Worker        vld1.16         {q11}, [\src], \s_strd
2026*c0909341SAndroid Build Coastguard Worker        vext.8          d24, d22, d23, #2  // same row shifted by one pixel
2027*c0909341SAndroid Build Coastguard Worker        vmull.s16       q11, d22, d0
2028*c0909341SAndroid Build Coastguard Worker        vmull.s16       q12, d24, d0
2029*c0909341SAndroid Build Coastguard Worker        vpadd.s32       d22, d22, d23
2030*c0909341SAndroid Build Coastguard Worker        vpadd.s32       d23, d24, d25
2031*c0909341SAndroid Build Coastguard Worker        vpadd.s32       d22, d22, d23      // d22 = the two output sums
2032*c0909341SAndroid Build Coastguard Worker        vrshl.s32       d16, d22, d28 // -(6-intermediate_bits)
2033*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d16, q8            // only the low two 16-bit lanes of d16 are meaningful
2034*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_8tap_filter_2) // d24 = next two filtered rows
2035*c0909341SAndroid Build Coastguard Worker
        // Build the row window: d16 = rows 0-1, d17 = rows 1-2 (two pixels
        // per row, counting from the topmost row read).
2036*c0909341SAndroid Build Coastguard Worker        vext.8          d16, d16, d16, #4
2037*c0909341SAndroid Build Coastguard Worker        vext.8          d16, d16, d24, #4
2038*c0909341SAndroid Build Coastguard Worker        vmov            d17, d24
2039*c0909341SAndroid Build Coastguard Worker
2040*c0909341SAndroid Build Coastguard Worker2:
2041*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_8tap_filter_2)
2042*c0909341SAndroid Build Coastguard Worker
2043*c0909341SAndroid Build Coastguard Worker        vext.8          d18, d17, d24, #4  // rows 2-3 of the window
        // 4-tap vertical filter; the low half of q2 is the first output row,
        // the high half the second one.
2044*c0909341SAndroid Build Coastguard Worker        vmull.s16       q2,  d16, d2[0]
2045*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d17, d2[1]
2046*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d18, d2[2]
2047*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d24, d2[3]
2048*c0909341SAndroid Build Coastguard Worker
2049*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q2,  q2,  q13 // -(6+intermediate_bits)
2050*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d4,  q2            // saturate to unsigned 16 bit
2051*c0909341SAndroid Build Coastguard Worker        vmin.u16        d4,  d4,  d30      // clamp against q15 (set up before this point, presumably bitdepth_max)
2052*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
2053*c0909341SAndroid Build Coastguard Worker        vst1.32         {d4[0]}, [\dst, :32], \d_strd // two pixels of the first row
2054*c0909341SAndroid Build Coastguard Worker        vst1.32         {d4[1]}, [\ds2, :32], \d_strd // two pixels of the second row
2055*c0909341SAndroid Build Coastguard Worker        ble             0f                 // shared exit after the 2x8+ variant below
2056*c0909341SAndroid Build Coastguard Worker        vmov            d16, d18           // slide the window down by two rows
2057*c0909341SAndroid Build Coastguard Worker        vmov            d17, d24
2058*c0909341SAndroid Build Coastguard Worker        b               2b
2059*c0909341SAndroid Build Coastguard Worker
2060*c0909341SAndroid Build Coastguard Worker280:    // 2x8, 2x16, 2x32 hv
        // Same scheme as the 2x2/2x4 code above, but with an 8-tap vertical
        // filter. d0 already holds the 4 horizontal coefficient bytes.
2061*c0909341SAndroid Build Coastguard Worker        vld1.8          {d2},  [\my, :64]  // all 8 vertical coefficient bytes
2062*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, #2     // one 16-bit pixel to the left
2063*c0909341SAndroid Build Coastguard Worker        sub             \sr2, \src, \s_strd, lsl #1 // sr2 = two rows above
2064*c0909341SAndroid Build Coastguard Worker        sub             \src, \sr2, \s_strd // src = three rows above
2065*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd
2066*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1 // two rows per iteration
2067*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
2068*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q0,  d0            // sign-extend the coefficients to 16 bit
2069*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q1,  d2
2070*c0909341SAndroid Build Coastguard Worker
        // Horizontal filter of the topmost row (two output pixels).
2071*c0909341SAndroid Build Coastguard Worker        vld1.16         {q11}, [\src], \s_strd
2072*c0909341SAndroid Build Coastguard Worker        vext.8          d24, d22, d23, #2
2073*c0909341SAndroid Build Coastguard Worker        vmull.s16       q11, d22, d0
2074*c0909341SAndroid Build Coastguard Worker        vmull.s16       q12, d24, d0
2075*c0909341SAndroid Build Coastguard Worker        vpadd.s32       d22, d22, d23
2076*c0909341SAndroid Build Coastguard Worker        vpadd.s32       d23, d24, d25
2077*c0909341SAndroid Build Coastguard Worker        vpadd.s32       d22, d22, d23
2078*c0909341SAndroid Build Coastguard Worker        vrshl.s32       d16, d22, d28 // -(6-intermediate_bits)
2079*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d16, q8            // only the low two 16-bit lanes of d16 are meaningful
2080*c0909341SAndroid Build Coastguard Worker
        // Prime the window: each of d16..d21 ends up holding two consecutive
        // filtered rows (d16 = rows 0-1, d17 = rows 1-2, ... d21 = rows 5-6).
2081*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_8tap_filter_2)
2082*c0909341SAndroid Build Coastguard Worker
2083*c0909341SAndroid Build Coastguard Worker        vext.8          d16, d16, d16, #4
2084*c0909341SAndroid Build Coastguard Worker        vext.8          d16, d16, d24, #4
2085*c0909341SAndroid Build Coastguard Worker        vmov            d17, d24
2086*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_8tap_filter_2)
2087*c0909341SAndroid Build Coastguard Worker        vext.8          d18, d17, d24, #4
2088*c0909341SAndroid Build Coastguard Worker        vmov            d19, d24
2089*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_8tap_filter_2)
2090*c0909341SAndroid Build Coastguard Worker        vext.8          d20, d19, d24, #4
2091*c0909341SAndroid Build Coastguard Worker        vmov            d21, d24
2092*c0909341SAndroid Build Coastguard Worker
2093*c0909341SAndroid Build Coastguard Worker28:
2094*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_8tap_filter_2)
2095*c0909341SAndroid Build Coastguard Worker        vext.8          d22, d21, d24, #4  // rows 6-7; d24 holds rows 7-8
        // 8-tap vertical filter; the low half of q3 is the first output row,
        // the high half the second one.
2096*c0909341SAndroid Build Coastguard Worker        vmull.s16       q3,  d16, d2[0]
2097*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d17, d2[1]
2098*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d18, d2[2]
2099*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d19, d2[3]
2100*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d20, d3[0]
2101*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d21, d3[1]
2102*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d22, d3[2]
2103*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d24, d3[3]
2104*c0909341SAndroid Build Coastguard Worker
2105*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q3,  q3,  q13 // -(6+intermediate_bits)
2106*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d6,  q3            // saturate to unsigned 16 bit
2107*c0909341SAndroid Build Coastguard Worker        vmin.u16        d6,  d6,  d30      // clamp against q15 (set up before this point, presumably bitdepth_max)
2108*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
2109*c0909341SAndroid Build Coastguard Worker        vst1.32         {d6[0]}, [\dst, :32], \d_strd
2110*c0909341SAndroid Build Coastguard Worker        vst1.32         {d6[1]}, [\ds2, :32], \d_strd
2111*c0909341SAndroid Build Coastguard Worker        ble             0f
        // Slide the window down by two rows.
2112*c0909341SAndroid Build Coastguard Worker        vmov            q8,  q9
2113*c0909341SAndroid Build Coastguard Worker        vmov            q9,  q10
2114*c0909341SAndroid Build Coastguard Worker        vmov            d20, d22
2115*c0909341SAndroid Build Coastguard Worker        vmov            d21, d24
2116*c0909341SAndroid Build Coastguard Worker        b               28b
2117*c0909341SAndroid Build Coastguard Worker0:                                         // exit shared by both 2 pixel wide variants
2118*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
2119*c0909341SAndroid Build Coastguard Worker
2120*c0909341SAndroid Build Coastguard WorkerL(\type\()_8tap_filter_2):
        // Helper: 4-tap horizontal filter of two rows, two output pixels each.
        // In:   sr2, src = row pointers (both advanced by s_strd),
        //       d0 = coefficients as 16 bit, q14 = rounding shift.
        // Out:  d24 = { sr2 row: pixel 0, pixel 1, src row: pixel 0, pixel 1 }.
        // Uses: q3, q11, q12 as scratch; returns through lr.
2121*c0909341SAndroid Build Coastguard Worker        vld1.16         {q11}, [\sr2], \s_strd
2122*c0909341SAndroid Build Coastguard Worker        vld1.16         {q12}, [\src], \s_strd
2123*c0909341SAndroid Build Coastguard Worker        vext.8          d23, d22, d23, #2  // sr2 row shifted by one pixel
2124*c0909341SAndroid Build Coastguard Worker        vext.8          d25, d24, d25, #2  // src row shifted by one pixel
2125*c0909341SAndroid Build Coastguard Worker        vtrn.32         q11, q12           // interleave so d22..d25 each carry one tap position for both rows
2126*c0909341SAndroid Build Coastguard Worker        vmull.s16       q3,  d22, d0[0]
2127*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d23, d0[1]
2128*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d24, d0[2]
2129*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d25, d0[3]
2130*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q3,  q3,  q14 // -(6-intermediate_bits)
2131*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d24, q3
2132*c0909341SAndroid Build Coastguard Worker        bx              lr
2133*c0909341SAndroid Build Coastguard Worker.endif                                     // end of the put-only 2 pixel wide code
2134*c0909341SAndroid Build Coastguard Worker
2135*c0909341SAndroid Build Coastguard Worker40:
        // 4 pixel wide hv filter (both put and prep). Short blocks continue
        // below with a 4-tap vertical filter; taller ones branch to 480.
2136*c0909341SAndroid Build Coastguard Worker        add             \mx, \mx, #2       // skip the first two coefficient bytes
2137*c0909341SAndroid Build Coastguard Worker        vld1.32         {d0[]}, [\mx]      // 4 horizontal coefficient bytes
2138*c0909341SAndroid Build Coastguard Worker        bgt             480f               // flags still from the cmp h, #4 in the dispatcher
2139*c0909341SAndroid Build Coastguard Worker        add             \my, \my, #2
2140*c0909341SAndroid Build Coastguard Worker        vld1.32         {d2[]}, [\my]      // 4 vertical coefficient bytes
2141*c0909341SAndroid Build Coastguard Worker        sub             \sr2, \src, #2     // one 16-bit pixel to the left
2142*c0909341SAndroid Build Coastguard Worker        sub             \src, \sr2, \s_strd // and one row above
2143*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd
2144*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1 // two rows per iteration
2145*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
2146*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q0,  d0            // sign-extend the coefficients to 16 bit
2147*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q1,  d2
2148*c0909341SAndroid Build Coastguard Worker
2149*c0909341SAndroid Build Coastguard Worker        // 4x2, 4x4 hv
        // Horizontal filter of the topmost row; the result goes to d17.
2150*c0909341SAndroid Build Coastguard Worker        vld1.16         {q11}, [\src], \s_strd
2151*c0909341SAndroid Build Coastguard Worker        vext.8          d24, d22, d23, #2  // row shifted by 1, 2 and 3 pixels
2152*c0909341SAndroid Build Coastguard Worker        vext.8          d25, d22, d23, #4
2153*c0909341SAndroid Build Coastguard Worker        vext.8          d23, d22, d23, #6
2154*c0909341SAndroid Build Coastguard Worker        vmull.s16       q10, d22, d0[0]
2155*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q10, d24, d0[1]
2156*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q10, d25, d0[2]
2157*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q10, d23, d0[3]
2158*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q10, q10, q14 // -(6-intermediate_bits)
2159*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d17, q10
2160*c0909341SAndroid Build Coastguard Worker
2161*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_8tap_filter_4) // d24, d25 = next two filtered rows
2162*c0909341SAndroid Build Coastguard Worker        vmov            q9,  q12           // window is now d17, d18, d19
2163*c0909341SAndroid Build Coastguard Worker
2164*c0909341SAndroid Build Coastguard Worker4:
2165*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_8tap_filter_4)
        // 4-tap vertical filter: q2 = first output row, q3 = second output
        // row (same taps applied one row further down).
2166*c0909341SAndroid Build Coastguard Worker        vmull.s16       q2,  d17, d2[0]
2167*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d18, d2[1]
2168*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d19, d2[2]
2169*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d24, d2[3]
2170*c0909341SAndroid Build Coastguard Worker        vmull.s16       q3,  d18, d2[0]
2171*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d19, d2[1]
2172*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d24, d2[2]
2173*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d25, d2[3]
2174*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2175*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q2,  q2,  q13 // -(6+intermediate_bits)
2176*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q3,  q3,  q13 // -(6+intermediate_bits)
2177*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d4,  q2            // saturate to unsigned 16 bit
2178*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d5,  q3
2179*c0909341SAndroid Build Coastguard Worker        vmin.u16        q2,  q2,  q15      // clamp against q15 (set up before this point, presumably bitdepth_max)
2180*c0909341SAndroid Build Coastguard Worker.else
2181*c0909341SAndroid Build Coastguard Worker        vrshrn.i32      d4,  q2,  #6       // prep: fixed rounding shift by 6
2182*c0909341SAndroid Build Coastguard Worker        vrshrn.i32      d5,  q3,  #6
2183*c0909341SAndroid Build Coastguard Worker        vsub.i16        q2,  q2,  q13 // PREP_BIAS
2184*c0909341SAndroid Build Coastguard Worker.endif
2185*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
2186*c0909341SAndroid Build Coastguard Worker
2187*c0909341SAndroid Build Coastguard Worker        vst1.16         {d4}, [\dst, :64], \d_strd
2188*c0909341SAndroid Build Coastguard Worker        vst1.16         {d5}, [\ds2, :64], \d_strd
2189*c0909341SAndroid Build Coastguard Worker        ble             0f
2190*c0909341SAndroid Build Coastguard Worker        vmov            d17, d19           // slide the window down by two rows
2191*c0909341SAndroid Build Coastguard Worker        vmov            q9,  q12
2192*c0909341SAndroid Build Coastguard Worker        b               4b
2193*c0909341SAndroid Build Coastguard Worker0:
2194*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
2195*c0909341SAndroid Build Coastguard Worker
2196*c0909341SAndroid Build Coastguard Worker480:    // 4x8, 4x16, 4x32 hv
        // 4 pixel wide hv filter with an 8-tap vertical filter. d0 already
        // holds the 4 horizontal coefficient bytes. The seven-row window
        // lives in d13..d19, one row of four 16-bit values per d register.
2197*c0909341SAndroid Build Coastguard Worker        vpush           {d13-d15}          // callee-saved under the AAPCS; restored at 0: below
2198*c0909341SAndroid Build Coastguard Worker        vld1.8          {d2},  [\my, :64]  // all 8 vertical coefficient bytes
2199*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, #2     // one 16-bit pixel to the left
2200*c0909341SAndroid Build Coastguard Worker        sub             \sr2, \src, \s_strd, lsl #1 // sr2 = two rows above
2201*c0909341SAndroid Build Coastguard Worker        sub             \src, \sr2, \s_strd // src = three rows above
2202*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd
2203*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1 // two rows per iteration
2204*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
2205*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q0,  d0            // sign-extend the coefficients to 16 bit
2206*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q1,  d2
2207*c0909341SAndroid Build Coastguard Worker
        // Horizontal filter of the topmost row; the result goes to d13.
2208*c0909341SAndroid Build Coastguard Worker        vld1.16         {q11}, [\src], \s_strd
2209*c0909341SAndroid Build Coastguard Worker        vext.8          d24, d22, d23, #2
2210*c0909341SAndroid Build Coastguard Worker        vext.8          d25, d22, d23, #4
2211*c0909341SAndroid Build Coastguard Worker        vext.8          d23, d22, d23, #6
2212*c0909341SAndroid Build Coastguard Worker        vmull.s16       q10, d22, d0[0]
2213*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q10, d24, d0[1]
2214*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q10, d25, d0[2]
2215*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q10, d23, d0[3]
2216*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q10, q10, q14 // -(6-intermediate_bits)
2217*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d13, q10
2218*c0909341SAndroid Build Coastguard Worker
        // Fill the remaining six rows of the window, two per helper call.
2219*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_8tap_filter_4)
2220*c0909341SAndroid Build Coastguard Worker        vmov            q7,  q12
2221*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_8tap_filter_4)
2222*c0909341SAndroid Build Coastguard Worker        vmov            q8,  q12
2223*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_8tap_filter_4)
2224*c0909341SAndroid Build Coastguard Worker        vmov            q9,  q12
2225*c0909341SAndroid Build Coastguard Worker
2226*c0909341SAndroid Build Coastguard Worker48:
2227*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_8tap_filter_4) // d24, d25 = the two newest rows
        // 8-tap vertical filter: q2 = first output row, q3 = second output
        // row (same taps applied one row further down).
2228*c0909341SAndroid Build Coastguard Worker        vmull.s16       q2,  d13, d2[0]
2229*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d14, d2[1]
2230*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d15, d2[2]
2231*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d16, d2[3]
2232*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d17, d3[0]
2233*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d18, d3[1]
2234*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d19, d3[2]
2235*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d24, d3[3]
2236*c0909341SAndroid Build Coastguard Worker        vmull.s16       q3,  d14, d2[0]
2237*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d15, d2[1]
2238*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d16, d2[2]
2239*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d17, d2[3]
2240*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d18, d3[0]
2241*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d19, d3[1]
2242*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d24, d3[2]
2243*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d25, d3[3]
2244*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2245*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q2,  q2,  q13 // -(6+intermediate_bits)
2246*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q3,  q3,  q13 // -(6+intermediate_bits)
2247*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d4,  q2            // saturate to unsigned 16 bit
2248*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d5,  q3
2249*c0909341SAndroid Build Coastguard Worker        vmin.u16        q2,  q2,  q15      // clamp against q15 (set up before this point, presumably bitdepth_max)
2250*c0909341SAndroid Build Coastguard Worker.else
2251*c0909341SAndroid Build Coastguard Worker        vrshrn.i32      d4,  q2,  #6       // prep: fixed rounding shift by 6
2252*c0909341SAndroid Build Coastguard Worker        vrshrn.i32      d5,  q3,  #6
2253*c0909341SAndroid Build Coastguard Worker        vsub.i16        q2,  q2,  q13 // PREP_BIAS
2254*c0909341SAndroid Build Coastguard Worker.endif
2255*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
2256*c0909341SAndroid Build Coastguard Worker        vst1.16         {d4}, [\dst, :64], \d_strd
2257*c0909341SAndroid Build Coastguard Worker        vst1.16         {d5}, [\ds2, :64], \d_strd
2258*c0909341SAndroid Build Coastguard Worker        ble             0f
        // Slide the window down by two rows.
2259*c0909341SAndroid Build Coastguard Worker        vmov            d13, d15
2260*c0909341SAndroid Build Coastguard Worker        vmov            q7,  q8
2261*c0909341SAndroid Build Coastguard Worker        vmov            q8,  q9
2262*c0909341SAndroid Build Coastguard Worker        vmov            q9,  q12
2263*c0909341SAndroid Build Coastguard Worker        b               48b
2264*c0909341SAndroid Build Coastguard Worker0:
2265*c0909341SAndroid Build Coastguard Worker        vpop            {d13-d15}          // matches the vpush at 480
2266*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
2267*c0909341SAndroid Build Coastguard Worker
2268*c0909341SAndroid Build Coastguard WorkerL(\type\()_8tap_filter_4):
        // Helper: 4-tap horizontal filter of two rows, four output pixels each.
        // In:   sr2, src = row pointers (both advanced by s_strd),
        //       d0 = coefficients as 16 bit, q14 = rounding shift.
        // Out:  d24 = filtered sr2 row, d25 = filtered src row.
        // Uses: q3, q10, q11 as scratch; returns through lr.
2269*c0909341SAndroid Build Coastguard Worker        vld1.16         {q10}, [\sr2], \s_strd
2270*c0909341SAndroid Build Coastguard Worker        vld1.16         {q11}, [\src], \s_strd
2271*c0909341SAndroid Build Coastguard Worker        vext.8          d24, d20, d21, #2  // sr2 row shifted by 1, 2 and 3 pixels
2272*c0909341SAndroid Build Coastguard Worker        vext.8          d25, d20, d21, #4
2273*c0909341SAndroid Build Coastguard Worker        vext.8          d21, d20, d21, #6
2274*c0909341SAndroid Build Coastguard Worker        vmull.s16       q3,  d20, d0[0]
2275*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d24, d0[1]
2276*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d25, d0[2]
2277*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d21, d0[3]
2278*c0909341SAndroid Build Coastguard Worker        vext.8          d24, d22, d23, #2  // src row shifted by 1, 2 and 3 pixels
2279*c0909341SAndroid Build Coastguard Worker        vext.8          d25, d22, d23, #4
2280*c0909341SAndroid Build Coastguard Worker        vext.8          d23, d22, d23, #6
2281*c0909341SAndroid Build Coastguard Worker        vmull.s16       q10, d22, d0[0]    // q10 is free again: the sr2 row was consumed above
2282*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q10, d24, d0[1]
2283*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q10, d25, d0[2]
2284*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q10, d23, d0[3]
2285*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q3,  q3,  q14 // -(6-intermediate_bits)
2286*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q10, q10, q14 // -(6-intermediate_bits)
2287*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d24, q3
2288*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d25, q10
2289*c0909341SAndroid Build Coastguard Worker        bx              lr
2290*c0909341SAndroid Build Coastguard Worker
2291*c0909341SAndroid Build Coastguard Worker80:
2292*c0909341SAndroid Build Coastguard Worker160:
2293*c0909341SAndroid Build Coastguard Worker320:
2294*c0909341SAndroid Build Coastguard Worker        bgt             880f
2295*c0909341SAndroid Build Coastguard Worker        add             \my, \my, #2
2296*c0909341SAndroid Build Coastguard Worker        vld1.8          {d0},  [\mx, :64]
2297*c0909341SAndroid Build Coastguard Worker        vld1.32         {d2[]}, [\my]
2298*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  #6
2299*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  \s_strd
2300*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q0,  d0
2301*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q1,  d2
2302*c0909341SAndroid Build Coastguard Worker        mov             \my, \h
2303*c0909341SAndroid Build Coastguard Worker
2304*c0909341SAndroid Build Coastguard Worker164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
2305*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd
2306*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd
2307*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
2308*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1
2309*c0909341SAndroid Build Coastguard Worker
2310*c0909341SAndroid Build Coastguard Worker        vld1.16         {q11, q12}, [\src], \s_strd
2311*c0909341SAndroid Build Coastguard Worker        vmull.s16       q2,  d22, d0[0]
2312*c0909341SAndroid Build Coastguard Worker        vmull.s16       q3,  d23, d0[0]
2313*c0909341SAndroid Build Coastguard Worker        vdup.32         q14, r12      // -(6-intermediate_bits)
2314*c0909341SAndroid Build Coastguard Worker.irpc i, 1234567
2315*c0909341SAndroid Build Coastguard Worker        vext.8          q10, q11, q12, #(2*\i)
2316*c0909341SAndroid Build Coastguard Worker.if \i < 4
2317*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d20, d0[\i]
2318*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d21, d0[\i]
2319*c0909341SAndroid Build Coastguard Worker.else
2320*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d20, d1[\i - 4]
2321*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d21, d1[\i - 4]
2322*c0909341SAndroid Build Coastguard Worker.endif
2323*c0909341SAndroid Build Coastguard Worker.endr
2324*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q2,  q2,  q14 // -(6-intermediate_bits)
2325*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q3,  q3,  q14 // -(6-intermediate_bits)
2326*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d16, q2
2327*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d17, q3
2328*c0909341SAndroid Build Coastguard Worker
2329*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_8tap_filter_8)
2330*c0909341SAndroid Build Coastguard Worker        vmov            q9,  q11
2331*c0909341SAndroid Build Coastguard Worker        vmov            q10, q12
2332*c0909341SAndroid Build Coastguard Worker
2333*c0909341SAndroid Build Coastguard Worker8:
2334*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_8tap_filter_8)
2335*c0909341SAndroid Build Coastguard Worker        vmull.s16       q2,  d16, d2[0]
2336*c0909341SAndroid Build Coastguard Worker        vmull.s16       q3,  d17, d2[0]
2337*c0909341SAndroid Build Coastguard Worker        vmull.s16       q13, d18, d2[0]
2338*c0909341SAndroid Build Coastguard Worker        vmull.s16       q14, d19, d2[0]
2339*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2340*c0909341SAndroid Build Coastguard Worker        vdup.32         q8,  r8      // -(6+intermediate_bits)
2341*c0909341SAndroid Build Coastguard Worker.endif
2342*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d18, d2[1]
2343*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d19, d2[1]
2344*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q13, d20, d2[1]
2345*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q14, d21, d2[1]
2346*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d20, d2[2]
2347*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d21, d2[2]
2348*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q13, d22, d2[2]
2349*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q14, d23, d2[2]
2350*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d22, d2[3]
2351*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d23, d2[3]
2352*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q13, d24, d2[3]
2353*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q14, d25, d2[3]
2354*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2355*c0909341SAndroid Build Coastguard Worker        vdup.16         q9,  \bdmax  // bitdepth_max
2356*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q2,  q2,  q8 // -(6+intermediate_bits)
2357*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q3,  q3,  q8 // -(6+intermediate_bits)
2358*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q13, q13, q8 // -(6+intermediate_bits)
2359*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q14, q14, q8 // -(6+intermediate_bits)
2360*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d4,  q2
2361*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d5,  q3
2362*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d6,  q13
2363*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d7,  q14
2364*c0909341SAndroid Build Coastguard Worker        vmin.u16        q2,  q2,  q15
2365*c0909341SAndroid Build Coastguard Worker        vmin.u16        q3,  q3,  q15
2366*c0909341SAndroid Build Coastguard Worker.else
2367*c0909341SAndroid Build Coastguard Worker        vmov.i16        q9,  #PREP_BIAS
2368*c0909341SAndroid Build Coastguard Worker        vrshrn.i32      d4,  q2,  #6
2369*c0909341SAndroid Build Coastguard Worker        vrshrn.i32      d5,  q3,  #6
2370*c0909341SAndroid Build Coastguard Worker        vrshrn.i32      d6,  q13, #6
2371*c0909341SAndroid Build Coastguard Worker        vrshrn.i32      d7,  q14, #6
2372*c0909341SAndroid Build Coastguard Worker        vsub.i16        q2,  q2,  q9 // PREP_BIAS
2373*c0909341SAndroid Build Coastguard Worker        vsub.i16        q3,  q3,  q9 // PREP_BIAS
2374*c0909341SAndroid Build Coastguard Worker.endif
2375*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
2376*c0909341SAndroid Build Coastguard Worker        vst1.16         {q2}, [\dst, :128], \d_strd
2377*c0909341SAndroid Build Coastguard Worker        vst1.16         {q3}, [\ds2, :128], \d_strd
2378*c0909341SAndroid Build Coastguard Worker        ble             9f
2379*c0909341SAndroid Build Coastguard Worker        vmov            q8,  q10
2380*c0909341SAndroid Build Coastguard Worker        vmov            q9,  q11
2381*c0909341SAndroid Build Coastguard Worker        vmov            q10, q12
2382*c0909341SAndroid Build Coastguard Worker        b               8b
2383*c0909341SAndroid Build Coastguard Worker9:
2384*c0909341SAndroid Build Coastguard Worker        subs            \w,  \w,  #8
2385*c0909341SAndroid Build Coastguard Worker        ble             0f
2386*c0909341SAndroid Build Coastguard Worker        asr             \s_strd,  \s_strd,  #1
2387*c0909341SAndroid Build Coastguard Worker        asr             \d_strd,  \d_strd,  #1
2388*c0909341SAndroid Build Coastguard Worker        mls             \src,  \s_strd,  \my,  \src
2389*c0909341SAndroid Build Coastguard Worker        mls             \dst,  \d_strd,  \my,  \dst
2390*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  \s_strd,  lsl #2
2391*c0909341SAndroid Build Coastguard Worker        mov             \h,  \my
2392*c0909341SAndroid Build Coastguard Worker        add             \src,  \src,  #16
2393*c0909341SAndroid Build Coastguard Worker        add             \dst,  \dst,  #16
2394*c0909341SAndroid Build Coastguard Worker        b               164b
2395*c0909341SAndroid Build Coastguard Worker0:
2396*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
2397*c0909341SAndroid Build Coastguard Worker
// 8-tap horizontal + 8-tap vertical (hv) filtering, done in columns of 8 pixels.
// Seven horizontally filtered rows are kept in q4-q10; every pass of the 88: loop
// filters two more rows (q11, q12) and stores two output rows of 8 pixels.
// Once a column is finished, src/dst are rewound and moved 16 bytes to the right
// until w is used up.
2398*c0909341SAndroid Build Coastguard Worker880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
2399*c0909341SAndroid Build Coastguard Worker640:
2400*c0909341SAndroid Build Coastguard Worker1280:
2401*c0909341SAndroid Build Coastguard Worker        vpush           {q4-q7}                    // saved here, restored by the vpop at 0:
2402*c0909341SAndroid Build Coastguard Worker        vld1.8          {d0}, [\mx, :64]           // 8 horizontal filter taps (8 bit each)
2403*c0909341SAndroid Build Coastguard Worker        vld1.8          {d2}, [\my, :64]           // 8 vertical filter taps (8 bit each)
2404*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  #6           // back up 3 pixels (2 bytes per pixel)
2405*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  \s_strd      // back up 1 row ...
2406*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  \s_strd, lsl #1 // ... plus 2 more rows (3 in total)
2407*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q0,  d0                    // widen horizontal taps to s16 (d0, d1)
2408*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q1,  d2                    // widen vertical taps to s16 (d2, d3)
2409*c0909341SAndroid Build Coastguard Worker        mov             \my, \h                    // keep the full height for the rewind at 9:
2410*c0909341SAndroid Build Coastguard Worker
2411*c0909341SAndroid Build Coastguard Worker168:    // start of one 8 pixel wide column
2412*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd        // ds2 = second output row
2413*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd      // sr2 = second input row
2414*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1       // each pointer now steps two rows at a time
2415*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1
2416*c0909341SAndroid Build Coastguard Worker
        // First row is filtered inline (the helper below always handles two rows).
2417*c0909341SAndroid Build Coastguard Worker        vld1.16         {q11, q12}, [\src], \s_strd // 16 pixels: 8 outputs plus tap context
2418*c0909341SAndroid Build Coastguard Worker        vmull.s16       q2,  d22, d0[0]            // tap 0, outputs 0-3
2419*c0909341SAndroid Build Coastguard Worker        vmull.s16       q3,  d23, d0[0]            // tap 0, outputs 4-7
2420*c0909341SAndroid Build Coastguard Worker        vdup.32         q14, r12      // -(6-intermediate_bits)
        // Taps 1..7: shift the 16 loaded pixels by one pixel per tap and accumulate.
        // Taps 0-3 live in d0, taps 4-7 in d1, hence the lane split below.
2421*c0909341SAndroid Build Coastguard Worker.irpc i, 1234567
2422*c0909341SAndroid Build Coastguard Worker        vext.8          q10, q11, q12, #(2*\i)
2423*c0909341SAndroid Build Coastguard Worker.if \i < 4
2424*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d20, d0[\i]
2425*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d21, d0[\i]
2426*c0909341SAndroid Build Coastguard Worker.else
2427*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d20, d1[\i - 4]
2428*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d21, d1[\i - 4]
2429*c0909341SAndroid Build Coastguard Worker.endif
2430*c0909341SAndroid Build Coastguard Worker.endr
2431*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q2,  q2,  q14 // -(6-intermediate_bits)
2432*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q3,  q3,  q14 // -(6-intermediate_bits)
2433*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d8,  q2                    // filtered row 0 -> q4
2434*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d9,  q3
2435*c0909341SAndroid Build Coastguard Worker
        // Prime the remaining six rows of the window, two per helper call.
2436*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_8tap_filter_8)
2437*c0909341SAndroid Build Coastguard Worker        vmov            q5,  q11                   // row 1
2438*c0909341SAndroid Build Coastguard Worker        vmov            q6,  q12                   // row 2
2439*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_8tap_filter_8)
2440*c0909341SAndroid Build Coastguard Worker        vmov            q7,  q11                   // row 3
2441*c0909341SAndroid Build Coastguard Worker        vmov            q8,  q12                   // row 4
2442*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_8tap_filter_8)
2443*c0909341SAndroid Build Coastguard Worker        vmov            q9,  q11                   // row 5
2444*c0909341SAndroid Build Coastguard Worker        vmov            q10, q12                   // row 6
2445*c0909341SAndroid Build Coastguard Worker
2446*c0909341SAndroid Build Coastguard Worker88:     // per iteration: filter two new rows, emit two output rows
2447*c0909341SAndroid Build Coastguard Worker        bl              L(\type\()_8tap_filter_8)  // q11, q12 = rows 7 and 8 of the window
        // q2:q3 accumulate the first output row (rows q4..q11),
        // q13:q14 accumulate the second output row (rows q5..q12).
2448*c0909341SAndroid Build Coastguard Worker        vmull.s16       q2,  d8,  d2[0]
2449*c0909341SAndroid Build Coastguard Worker        vmull.s16       q3,  d9,  d2[0]
2450*c0909341SAndroid Build Coastguard Worker        vmull.s16       q13, d10, d2[0]
2451*c0909341SAndroid Build Coastguard Worker        vmull.s16       q14, d11, d2[0]
2452*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2453*c0909341SAndroid Build Coastguard Worker        vdup.32         q4,  r8      // -(6+intermediate_bits); the row in q4 is no longer needed
2454*c0909341SAndroid Build Coastguard Worker.endif
2455*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d10, d2[1]
2456*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d11, d2[1]
2457*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q13, d12, d2[1]
2458*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q14, d13, d2[1]
2459*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d12, d2[2]
2460*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d13, d2[2]
2461*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q13, d14, d2[2]
2462*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q14, d15, d2[2]
2463*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d14, d2[3]
2464*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d15, d2[3]
2465*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q13, d16, d2[3]
2466*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q14, d17, d2[3]
2467*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d16, d3[0]
2468*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d17, d3[0]
2469*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q13, d18, d3[0]
2470*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q14, d19, d3[0]
2471*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d18, d3[1]
2472*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d19, d3[1]
2473*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q13, d20, d3[1]
2474*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q14, d21, d3[1]
2475*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d20, d3[2]
2476*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d21, d3[2]
2477*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q13, d22, d3[2]
2478*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q14, d23, d3[2]
2479*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d22, d3[3]
2480*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d23, d3[3]
2481*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q13, d24, d3[3]
2482*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q14, d25, d3[3]
2483*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2484*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q2,  q2,  q4 // -(6+intermediate_bits)
2485*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q3,  q3,  q4 // -(6+intermediate_bits)
2486*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q13, q13, q4 // -(6+intermediate_bits)
2487*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q14, q14, q4 // -(6+intermediate_bits)
2488*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d4,  q2                    // saturate to unsigned 16 bit
2489*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d5,  q3
2490*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d6,  q13
2491*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d7,  q14
2492*c0909341SAndroid Build Coastguard Worker        vmin.u16        q2,  q2,  q15              // clamp to q15 (set up before this block; presumably bitdepth_max)
2493*c0909341SAndroid Build Coastguard Worker        vmin.u16        q3,  q3,  q15
2494*c0909341SAndroid Build Coastguard Worker.else
2495*c0909341SAndroid Build Coastguard Worker        vmov.i16        q5,  #PREP_BIAS            // the row in q5 is no longer needed; reuse it for the bias
2496*c0909341SAndroid Build Coastguard Worker        vrshrn.i32      d4,  q2,  #6
2497*c0909341SAndroid Build Coastguard Worker        vrshrn.i32      d5,  q3,  #6
2498*c0909341SAndroid Build Coastguard Worker        vrshrn.i32      d6,  q13, #6
2499*c0909341SAndroid Build Coastguard Worker        vrshrn.i32      d7,  q14, #6
2500*c0909341SAndroid Build Coastguard Worker        vsub.i16        q2,  q2,  q5 // PREP_BIAS
2501*c0909341SAndroid Build Coastguard Worker        vsub.i16        q3,  q3,  q5 // PREP_BIAS
2502*c0909341SAndroid Build Coastguard Worker.endif
2503*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2               // two rows done; flags are consumed by the ble below
2504*c0909341SAndroid Build Coastguard Worker        vst1.16         {q2}, [\dst, :128], \d_strd
2505*c0909341SAndroid Build Coastguard Worker        vst1.16         {q3}, [\ds2, :128], \d_strd
2506*c0909341SAndroid Build Coastguard Worker        ble             9f
        // Slide the seven row window down by two rows.
2507*c0909341SAndroid Build Coastguard Worker        vmov            q4,  q6
2508*c0909341SAndroid Build Coastguard Worker        vmov            q5,  q7
2509*c0909341SAndroid Build Coastguard Worker        vmov            q6,  q8
2510*c0909341SAndroid Build Coastguard Worker        vmov            q7,  q9
2511*c0909341SAndroid Build Coastguard Worker        vmov            q8,  q10
2512*c0909341SAndroid Build Coastguard Worker        vmov            q9,  q11
2513*c0909341SAndroid Build Coastguard Worker        vmov            q10, q12
2514*c0909341SAndroid Build Coastguard Worker        b               88b
2515*c0909341SAndroid Build Coastguard Worker9:      // column finished
2516*c0909341SAndroid Build Coastguard Worker        subs            \w,  \w,  #8               // any 8 pixel columns left?
2517*c0909341SAndroid Build Coastguard Worker        ble             0f
2518*c0909341SAndroid Build Coastguard Worker        asr             \s_strd,  \s_strd,  #1     // undo the stride doubling done at 168:
2519*c0909341SAndroid Build Coastguard Worker        asr             \d_strd,  \d_strd,  #1
2520*c0909341SAndroid Build Coastguard Worker        mls             \src,  \s_strd,  \my,  \src // src -= s_strd * height (rows read by the loop)
2521*c0909341SAndroid Build Coastguard Worker        mls             \dst,  \d_strd,  \my,  \dst // dst -= d_strd * height
2522*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  \s_strd,  lsl #3 // minus the 8 rows stepped while priming (4 loads x 2 rows)
2523*c0909341SAndroid Build Coastguard Worker        mov             \h,  \my                   // restore the row counter
2524*c0909341SAndroid Build Coastguard Worker        add             \src,  \src,  #16          // move 8 pixels to the right
2525*c0909341SAndroid Build Coastguard Worker        add             \dst,  \dst,  #16
2526*c0909341SAndroid Build Coastguard Worker        b               168b
2527*c0909341SAndroid Build Coastguard Worker0:
2528*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q7}
2529*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}                // return
2530*c0909341SAndroid Build Coastguard Worker
// Helper (called with bl, returns with bx lr): applies the 8-tap horizontal
// filter to two rows of 8 pixels each.
// In:   sr2, src = row pointers, both post-incremented by s_strd
//       q0       = horizontal taps as s16 (d0 = taps 0-3, d1 = taps 4-7)
//       r12      = -(6-intermediate_bits), used as a rounding shift amount
// Out:  q11 = filtered row read from sr2, q12 = filtered row read from src
// Also overwrites q2, q3, q13 and q14.
2531*c0909341SAndroid Build Coastguard WorkerL(\type\()_8tap_filter_8):
2532*c0909341SAndroid Build Coastguard Worker        vld1.16         {q13, q14}, [\sr2], \s_strd // first row: 16 pixels
2533*c0909341SAndroid Build Coastguard Worker        vmull.s16       q2,  d26, d0[0]            // tap 0, outputs 0-3
2534*c0909341SAndroid Build Coastguard Worker        vmull.s16       q3,  d27, d0[0]            // tap 0, outputs 4-7
        // Taps 1..7: one pixel shift per tap, accumulated into q2/q3.
2535*c0909341SAndroid Build Coastguard Worker.irpc i, 1234567
2536*c0909341SAndroid Build Coastguard Worker        vext.8          q12, q13, q14, #(2*\i)
2537*c0909341SAndroid Build Coastguard Worker.if \i < 4
2538*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d24, d0[\i]
2539*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d25, d0[\i]
2540*c0909341SAndroid Build Coastguard Worker.else
2541*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q2,  d24, d1[\i - 4]
2542*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d25, d1[\i - 4]
2543*c0909341SAndroid Build Coastguard Worker.endif
2544*c0909341SAndroid Build Coastguard Worker.endr
2545*c0909341SAndroid Build Coastguard Worker        vdup.32         q12, r12      // -(6-intermediate_bits)
2546*c0909341SAndroid Build Coastguard Worker        vld1.16         {q13, q14}, [\src], \s_strd // second row is loaded before the first one is narrowed
2547*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q2,  q2,  q12 // -(6-intermediate_bits)
2548*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q3,  q3,  q12 // -(6-intermediate_bits)
2549*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d4,  q2                    // first row result packed into q2 (d4, d5)
2550*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d5,  q3
2551*c0909341SAndroid Build Coastguard Worker
        // Second row: same filter, accumulated in q3/q11 because q2 holds the first result.
2552*c0909341SAndroid Build Coastguard Worker        vmull.s16       q3,  d26, d0[0]
2553*c0909341SAndroid Build Coastguard Worker        vmull.s16       q11, d27, d0[0]
2554*c0909341SAndroid Build Coastguard Worker.irpc i, 1234567
2555*c0909341SAndroid Build Coastguard Worker        vext.8          q12, q13, q14, #(2*\i)
2556*c0909341SAndroid Build Coastguard Worker.if \i < 4
2557*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d24, d0[\i]
2558*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q11, d25, d0[\i]
2559*c0909341SAndroid Build Coastguard Worker.else
2560*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q3,  d24, d1[\i - 4]
2561*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q11, d25, d1[\i - 4]
2562*c0909341SAndroid Build Coastguard Worker.endif
2563*c0909341SAndroid Build Coastguard Worker.endr
2564*c0909341SAndroid Build Coastguard Worker        vdup.32         q13, r12      // -(6-intermediate_bits)
2565*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q3,  q3,  q13 // -(6-intermediate_bits)
2566*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q11, q11, q13 // -(6-intermediate_bits)
2567*c0909341SAndroid Build Coastguard Worker
2568*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d24, q3                    // second row result -> q12
2569*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d25, q11
2570*c0909341SAndroid Build Coastguard Worker        vmov            q11, q2                    // first row result -> q11
2571*c0909341SAndroid Build Coastguard Worker        bx              lr
2572*c0909341SAndroid Build Coastguard Workerendfunc
2573*c0909341SAndroid Build Coastguard Worker
// Entry point of the 16 bpc bilinear filter (instantiated per type: put or prep).
// Builds the weight vectors q0 = 16 - mx, q1 = mx, q2 = 16 - my, q3 = my,
// derives intermediate_bits from bitdepth_max and computes the width based
// jump table index in r9, then dispatches:
//   mx != 0           -> bilin_h (which itself forwards to bilin_hv if my != 0)
//   mx == 0, my != 0  -> bilin_v
//   mx == 0, my == 0  -> the (type)_neon routine defined elsewhere
2574*c0909341SAndroid Build Coastguard Workerfunction \type\()_bilin_16bpc_neon, export=1
2575*c0909341SAndroid Build Coastguard Worker        push            {r4-r11,lr}                // 9 registers = 36 bytes, so stack arguments start at sp + 36
2576*c0909341SAndroid Build Coastguard Worker        ldrd            r4,  r5,  [sp, #36]        // 1st and 2nd stack argument
2577*c0909341SAndroid Build Coastguard Worker        ldrd            r6,  r7,  [sp, #44]        // 3rd and 4th stack argument
2578*c0909341SAndroid Build Coastguard Worker.ifc \bdmax, r8
2579*c0909341SAndroid Build Coastguard Worker        ldr             r8,  [sp, #52]             // 5th stack argument, only when bdmax is mapped to r8
2580*c0909341SAndroid Build Coastguard Worker.endif
2581*c0909341SAndroid Build Coastguard Worker        vdup.16         q1,  \mx                   // q1 = mx (weight of the right neighbour)
2582*c0909341SAndroid Build Coastguard Worker        vdup.16         q3,  \my                   // q3 = my
2583*c0909341SAndroid Build Coastguard Worker        rsb             r9,  \mx, #16
2584*c0909341SAndroid Build Coastguard Worker        rsb             r10, \my, #16
2585*c0909341SAndroid Build Coastguard Worker        vdup.16         q0,  r9                    // q0 = 16 - mx
2586*c0909341SAndroid Build Coastguard Worker        vdup.16         q2,  r10                   // q2 = 16 - my
2587*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
2588*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \w, #1            // prep: destination stride is set to w * 2 bytes
2589*c0909341SAndroid Build Coastguard Worker.endif
2590*c0909341SAndroid Build Coastguard Worker        clz             \bdmax,   \bdmax       // bitdepth_max
2591*c0909341SAndroid Build Coastguard Worker        clz             r9,  \w
2592*c0909341SAndroid Build Coastguard Worker        sub             \bdmax,   \bdmax,  #18 // intermediate_bits = clz(bitdepth_max) - 18
2593*c0909341SAndroid Build Coastguard Worker        cmp             \mx, #0                // flags survive until the bne below (sub/rsb/add do not set them)
2594*c0909341SAndroid Build Coastguard Worker        sub             r9,  r9,  #24          // r9 = clz(w) - 24: jump table index, 0 for w = 128
2595*c0909341SAndroid Build Coastguard Worker        rsb             r11, \bdmax, #4        // 4 - intermediate_bits
2596*c0909341SAndroid Build Coastguard Worker        add             r12, \bdmax, #4        // 4 + intermediate_bits
2597*c0909341SAndroid Build Coastguard Worker        bne             L(\type\()_bilin_h)
2598*c0909341SAndroid Build Coastguard Worker        cmp             \my, #0
2599*c0909341SAndroid Build Coastguard Worker        bne             L(\type\()_bilin_v)
2600*c0909341SAndroid Build Coastguard Worker        b               \type\()_neon          // no fractional offset; presumably the unfiltered routine - confirm at its definition
2601*c0909341SAndroid Build Coastguard Worker
// Horizontal bilinear dispatch. Forwards to bilin_hv when my != 0; otherwise
// prepares the shift/bias vectors and jumps to the width specific handler
// through the offset table below (indexed by r9 = clz(w) - 24).
// On exit: q15 = -(4-intermediate_bits),
//          q14 = -intermediate_bits (put) or PREP_BIAS (prep).
2602*c0909341SAndroid Build Coastguard WorkerL(\type\()_bilin_h):
2603*c0909341SAndroid Build Coastguard Worker        cmp             \my, #0
2604*c0909341SAndroid Build Coastguard Worker        bne             L(\type\()_bilin_hv)
2605*c0909341SAndroid Build Coastguard Worker
2606*c0909341SAndroid Build Coastguard Worker        adr             r10, L(\type\()_bilin_h_tbl)
2607*c0909341SAndroid Build Coastguard Worker        vdup.16         q15, r11               // 4 - intermediate_bits
2608*c0909341SAndroid Build Coastguard Worker        ldr             r9,  [r10, r9, lsl #2] // table entry = handler offset relative to the table
2609*c0909341SAndroid Build Coastguard Worker        vneg.s16        q15, q15               // -(4-intermediate_bits)
2610*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2611*c0909341SAndroid Build Coastguard Worker        vdup.16         q14, \bdmax            // intermediate_bits
2612*c0909341SAndroid Build Coastguard Worker.else
2613*c0909341SAndroid Build Coastguard Worker        vmov.i16        q14, #PREP_BIAS
2614*c0909341SAndroid Build Coastguard Worker.endif
2615*c0909341SAndroid Build Coastguard Worker        add             r10, r10, r9           // absolute handler address
2616*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2617*c0909341SAndroid Build Coastguard Worker        vneg.s16        q14, q14               // -intermediate_bits
2618*c0909341SAndroid Build Coastguard Worker.endif
2619*c0909341SAndroid Build Coastguard Worker        bx              r10
2620*c0909341SAndroid Build Coastguard Worker
        // Offsets of the handlers, widest first (index 0 = w 128 ... index 6 = w 2).
        // NOTE(review): CONFIG_THUMB presumably sets the Thumb bit for the bx above - confirm.
2621*c0909341SAndroid Build Coastguard Worker        .align 2
2622*c0909341SAndroid Build Coastguard WorkerL(\type\()_bilin_h_tbl):
2623*c0909341SAndroid Build Coastguard Worker        .word 1280f - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
2624*c0909341SAndroid Build Coastguard Worker        .word 640f  - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
2625*c0909341SAndroid Build Coastguard Worker        .word 320f  - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
2626*c0909341SAndroid Build Coastguard Worker        .word 160f  - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
2627*c0909341SAndroid Build Coastguard Worker        .word 80f   - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
2628*c0909341SAndroid Build Coastguard Worker        .word 40f   - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
2629*c0909341SAndroid Build Coastguard Worker        .word 20f   - L(\type\()_bilin_h_tbl) + CONFIG_THUMB
2630*c0909341SAndroid Build Coastguard Worker
// Horizontal bilinear, 2 pixels wide, two rows per iteration.
// The body is only assembled for put; for other types the label is followed
// directly by whatever code comes next in the file.
// Expects q0 = 16 - mx, q1 = mx, q15 = -(4-intermediate_bits), q14 = -intermediate_bits.
2631*c0909341SAndroid Build Coastguard Worker20:     // 2xN h
2632*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2633*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd      // ds2/sr2 = second row pointers
2634*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd
2635*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1     // step two rows at a time
2636*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1
2637*c0909341SAndroid Build Coastguard Worker2:
2638*c0909341SAndroid Build Coastguard Worker        vld1.16         {d16}, [\src], \s_strd     // 4 pixels of the first row
2639*c0909341SAndroid Build Coastguard Worker        vld1.16         {d18}, [\sr2], \s_strd     // 4 pixels of the second row
2640*c0909341SAndroid Build Coastguard Worker        vext.8          d17, d16, d16, #2          // first row shifted left by one pixel
2641*c0909341SAndroid Build Coastguard Worker        vext.8          d19, d18, d18, #2          // second row shifted left by one pixel
2642*c0909341SAndroid Build Coastguard Worker        vtrn.32         d16, d18                   // d16 = 2 pixels of each row
2643*c0909341SAndroid Build Coastguard Worker        vtrn.32         d17, d19                   // d17 = their right neighbours
2644*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2               // flags are consumed by the bgt below
2645*c0909341SAndroid Build Coastguard Worker        vmul.i16        d16, d16, d0               // px * (16 - mx)
2646*c0909341SAndroid Build Coastguard Worker        vmla.i16        d16, d17, d2               //   + next px * mx
2647*c0909341SAndroid Build Coastguard Worker        vrshl.u16       d16, d16, d30              // rounding shift right by 4 - intermediate_bits
2648*c0909341SAndroid Build Coastguard Worker        vrshl.u16       d16, d16, d28              // rounding shift right by intermediate_bits
2649*c0909341SAndroid Build Coastguard Worker        vst1.32         {d16[0]}, [\dst, :32], \d_strd
2650*c0909341SAndroid Build Coastguard Worker        vst1.32         {d16[1]}, [\ds2, :32], \d_strd
2651*c0909341SAndroid Build Coastguard Worker        bgt             2b
2652*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}                // return
2653*c0909341SAndroid Build Coastguard Worker.endif
2654*c0909341SAndroid Build Coastguard Worker
// Horizontal bilinear, 4 pixels wide, two rows per iteration.
// Both rows are packed into one q register pair so a single multiply handles them.
// Expects q0 = 16 - mx, q1 = mx, q15 = -(4-intermediate_bits),
// q14 = -intermediate_bits (put) or PREP_BIAS (prep).
2655*c0909341SAndroid Build Coastguard Worker40:     // 4xN h
2656*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd      // ds2/sr2 = second row pointers
2657*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd
2658*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1     // step two rows at a time
2659*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1
2660*c0909341SAndroid Build Coastguard Worker4:
2661*c0909341SAndroid Build Coastguard Worker        vld1.16         {q8},  [\src], \s_strd     // 8 pixels of the first row
2662*c0909341SAndroid Build Coastguard Worker        vld1.16         {q10}, [\sr2], \s_strd     // 8 pixels of the second row
2663*c0909341SAndroid Build Coastguard Worker        vext.8          q9,  q8,  q8,  #2          // first row shifted left by one pixel
2664*c0909341SAndroid Build Coastguard Worker        vext.8          q11, q10, q10, #2          // second row shifted left by one pixel
2665*c0909341SAndroid Build Coastguard Worker        vmov            d17, d20                   // q8 = first row (low half), second row (high half)
2666*c0909341SAndroid Build Coastguard Worker        vmov            d19, d22                   // q9 = matching right neighbours
2667*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2               // flags are consumed by the bgt below
2668*c0909341SAndroid Build Coastguard Worker        vmul.i16        q8,  q8,  q0               // px * (16 - mx)
2669*c0909341SAndroid Build Coastguard Worker        vmla.i16        q8,  q9,  q1               //   + next px * mx
2670*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q8,  q8,  q15              // rounding shift right by 4 - intermediate_bits
2671*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2672*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q8,  q8,  q14              // rounding shift right by intermediate_bits
2673*c0909341SAndroid Build Coastguard Worker.else
2674*c0909341SAndroid Build Coastguard Worker        vsub.i16        q8,  q8,  q14              // subtract PREP_BIAS
2675*c0909341SAndroid Build Coastguard Worker.endif
2676*c0909341SAndroid Build Coastguard Worker        vst1.16         {d16}, [\dst, :64], \d_strd
2677*c0909341SAndroid Build Coastguard Worker        vst1.16         {d17}, [\ds2, :64], \d_strd
2678*c0909341SAndroid Build Coastguard Worker        bgt             4b
2679*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}                // return
2680*c0909341SAndroid Build Coastguard Worker
// Horizontal bilinear, 8 pixels wide, two rows per iteration.
// Expects q0 = 16 - mx, q1 = mx, q15 = -(4-intermediate_bits),
// q14 = -intermediate_bits (put) or PREP_BIAS (prep).
2681*c0909341SAndroid Build Coastguard Worker80:     // 8xN h
2682*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd      // ds2/sr2 = second row pointers
2683*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd
2684*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1     // step two rows at a time
2685*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1
2686*c0909341SAndroid Build Coastguard Worker8:
2687*c0909341SAndroid Build Coastguard Worker        vld1.16         {d16, d17, d18}, [\src], \s_strd // 12 pixels of the first row (9 are needed)
2688*c0909341SAndroid Build Coastguard Worker        vld1.16         {d20, d21, d22}, [\sr2], \s_strd // 12 pixels of the second row
2689*c0909341SAndroid Build Coastguard Worker        vext.8          q9,  q8,  q9,  #2          // shifted by one pixel; only the first 2 bytes of q9 (d18) are consumed
2690*c0909341SAndroid Build Coastguard Worker        vext.8          q11, q10, q11, #2
2691*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2               // flags are consumed by the bgt below
2692*c0909341SAndroid Build Coastguard Worker        vmul.i16        q8,  q8,  q0               // px * (16 - mx)
2693*c0909341SAndroid Build Coastguard Worker        vmla.i16        q8,  q9,  q1               //   + next px * mx
2694*c0909341SAndroid Build Coastguard Worker        vmul.i16        q10, q10, q0
2695*c0909341SAndroid Build Coastguard Worker        vmla.i16        q10, q11, q1
2696*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q8,  q8,  q15              // rounding shift right by 4 - intermediate_bits
2697*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q10, q10, q15
2698*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2699*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q8,  q8,  q14              // rounding shift right by intermediate_bits
2700*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q10, q10, q14
2701*c0909341SAndroid Build Coastguard Worker.else
2702*c0909341SAndroid Build Coastguard Worker        vsub.i16        q8,  q8,  q14              // subtract PREP_BIAS
2703*c0909341SAndroid Build Coastguard Worker        vsub.i16        q10, q10, q14
2704*c0909341SAndroid Build Coastguard Worker.endif
2705*c0909341SAndroid Build Coastguard Worker        vst1.16         {q8},  [\dst, :128], \d_strd
2706*c0909341SAndroid Build Coastguard Worker        vst1.16         {q10}, [\ds2, :128], \d_strd
2707*c0909341SAndroid Build Coastguard Worker        bgt             8b
2708*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}                // return
// Horizontal bilinear for widths of 16 and above, two rows per outer iteration.
// The inner loop (16:) produces 16 pixels per row and carries the last 8 loaded
// pixels over as the start of the next chunk. Pointers advance with writeback,
// so the strides are reduced up front to cover only the rest of a row pair.
// Expects q0 = 16 - mx, q1 = mx, q15 = -(4-intermediate_bits),
// q14 = -intermediate_bits (put) or PREP_BIAS (prep). mx is reused as the column counter.
2709*c0909341SAndroid Build Coastguard Worker160:
2710*c0909341SAndroid Build Coastguard Worker320:
2711*c0909341SAndroid Build Coastguard Worker640:
2712*c0909341SAndroid Build Coastguard Worker1280:   // 16xN, 32xN, ... h
2713*c0909341SAndroid Build Coastguard Worker        vpush           {q4-q7}                    // saved here, restored by the vpop before returning
2714*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd      // ds2/sr2 = second row pointers
2715*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd
2716*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1     // two rows per outer iteration
2717*c0909341SAndroid Build Coastguard Worker
2718*c0909341SAndroid Build Coastguard Worker        sub             \s_strd,  \s_strd,  \w, lsl #1 // minus the 2 * w bytes read by the inner loop
2719*c0909341SAndroid Build Coastguard Worker        sub             \s_strd,  \s_strd,  #16    // minus the 16 byte load at 161:
2720*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2721*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1
2722*c0909341SAndroid Build Coastguard Worker        sub             \d_strd,  \d_strd,  \w, lsl #1 // minus the 2 * w bytes written by the inner loop
2723*c0909341SAndroid Build Coastguard Worker.endif
2724*c0909341SAndroid Build Coastguard Worker161:    // start of one row pair
2725*c0909341SAndroid Build Coastguard Worker        vld1.16         {q4}, [\src]!              // first 8 pixels of each row
2726*c0909341SAndroid Build Coastguard Worker        vld1.16         {q9}, [\sr2]!
2727*c0909341SAndroid Build Coastguard Worker        mov             \mx, \w                    // column counter for this row pair
2728*c0909341SAndroid Build Coastguard Worker
2729*c0909341SAndroid Build Coastguard Worker16:     // 16 output pixels per row per iteration
2730*c0909341SAndroid Build Coastguard Worker        vld1.16         {q5,  q6},  [\src]!        // next 16 pixels; q6/q11 are carried into the next iteration
2731*c0909341SAndroid Build Coastguard Worker        vld1.16         {q10, q11}, [\sr2]!
2732*c0909341SAndroid Build Coastguard Worker        vext.8          q7,  q4,  q5,  #2          // right neighbours (one pixel shift)
2733*c0909341SAndroid Build Coastguard Worker        vext.8          q8,  q5,  q6,  #2
2734*c0909341SAndroid Build Coastguard Worker        vext.8          q12, q9,  q10, #2
2735*c0909341SAndroid Build Coastguard Worker        vext.8          q13, q10, q11, #2
2736*c0909341SAndroid Build Coastguard Worker        vmul.i16        q4,  q4,  q0               // px * (16 - mx)
2737*c0909341SAndroid Build Coastguard Worker        vmla.i16        q4,  q7,  q1               //   + next px * mx
2738*c0909341SAndroid Build Coastguard Worker        vmul.i16        q5,  q5,  q0
2739*c0909341SAndroid Build Coastguard Worker        vmla.i16        q5,  q8,  q1
2740*c0909341SAndroid Build Coastguard Worker        vmul.i16        q9,  q9,  q0
2741*c0909341SAndroid Build Coastguard Worker        vmla.i16        q9,  q12, q1
2742*c0909341SAndroid Build Coastguard Worker        vmul.i16        q10, q10, q0
2743*c0909341SAndroid Build Coastguard Worker        vmla.i16        q10, q13, q1
2744*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q4,  q4,  q15              // rounding shift right by 4 - intermediate_bits
2745*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q5,  q5,  q15
2746*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q9,  q9,  q15
2747*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q10, q10, q15
2748*c0909341SAndroid Build Coastguard Worker        subs            \mx, \mx, #16              // flags are consumed by the ble below
2749*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2750*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q4,  q4,  q14              // rounding shift right by intermediate_bits
2751*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q5,  q5,  q14
2752*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q9,  q9,  q14
2753*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q10, q10, q14
2754*c0909341SAndroid Build Coastguard Worker.else
2755*c0909341SAndroid Build Coastguard Worker        vsub.i16        q4,  q4,  q14              // subtract PREP_BIAS
2756*c0909341SAndroid Build Coastguard Worker        vsub.i16        q5,  q5,  q14
2757*c0909341SAndroid Build Coastguard Worker        vsub.i16        q9,  q9,  q14
2758*c0909341SAndroid Build Coastguard Worker        vsub.i16        q10, q10, q14
2759*c0909341SAndroid Build Coastguard Worker.endif
2760*c0909341SAndroid Build Coastguard Worker        vst1.16         {q4, q5},  [\dst, :128]!
2761*c0909341SAndroid Build Coastguard Worker        vst1.16         {q9, q10}, [\ds2, :128]!
2762*c0909341SAndroid Build Coastguard Worker        ble             9f
2763*c0909341SAndroid Build Coastguard Worker
2764*c0909341SAndroid Build Coastguard Worker        vmov            q4,  q6                    // last 8 loaded pixels start the next chunk
2765*c0909341SAndroid Build Coastguard Worker        vmov            q9,  q11
2766*c0909341SAndroid Build Coastguard Worker        b               16b
2767*c0909341SAndroid Build Coastguard Worker
2768*c0909341SAndroid Build Coastguard Worker9:      // row pair finished: skip to the next one
2769*c0909341SAndroid Build Coastguard Worker        add             \dst,  \dst,  \d_strd
2770*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \ds2,  \d_strd
2771*c0909341SAndroid Build Coastguard Worker        add             \src,  \src,  \s_strd
2772*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \sr2,  \s_strd
2773*c0909341SAndroid Build Coastguard Worker
2774*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
2775*c0909341SAndroid Build Coastguard Worker        bgt             161b
2776*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q7}
2777*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}                // return
2778*c0909341SAndroid Build Coastguard Worker
2779*c0909341SAndroid Build Coastguard Worker
// Vertical-only bilinear filter for 16-bit samples.
// Every output row is built from two vertically adjacent input rows a and b
// as a * q2 + b * q3 per 16-bit lane (d4/d6 are the low halves of q2/q3).
// put:  the sum is rounded and shifted right by 4.
// prep: the sum is rounded and shifted right by the count taken from r11
//       (4 - intermediate_bits per the existing comment), then PREP_BIAS is
//       subtracted.
// q2/q3 are only read in this section, so they are set up earlier.
// NOTE(review): presumably q2 = 16 - my and q3 = my in every lane - confirm
// against the macro prologue, which is not part of this section.
2780*c0909341SAndroid Build Coastguard WorkerL(\type\()_bilin_v):
        // NOTE(review): every dispatch target below sets the flags again before
        // its first conditional branch, so this compare looks unused - confirm.
2781*c0909341SAndroid Build Coastguard Worker        cmp             \h,  #4
2782*c0909341SAndroid Build Coastguard Worker        adr             r10, L(\type\()_bilin_v_tbl)
2783*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
2784*c0909341SAndroid Build Coastguard Worker        vdup.16         q15, r11      // 4 - intermediate_bits
2785*c0909341SAndroid Build Coastguard Worker.endif
        // r9 is the jump table index on entry (computed before this section);
        // it is replaced by the offset read from the table.
2786*c0909341SAndroid Build Coastguard Worker        ldr             r9,  [r10, r9, lsl #2]
2787*c0909341SAndroid Build Coastguard Worker.ifc \type, prep
2788*c0909341SAndroid Build Coastguard Worker        vmov.i16        q14, #PREP_BIAS
        // A negative count turns vrshl.u16 into a rounding right shift.
2789*c0909341SAndroid Build Coastguard Worker        vneg.s16        q15, q15      // -(4-intermediate_bits)
2790*c0909341SAndroid Build Coastguard Worker.endif
        // Table base + offset = address of the width-specific code.
2791*c0909341SAndroid Build Coastguard Worker        add             r10, r10, r9
2792*c0909341SAndroid Build Coastguard Worker        bx              r10
2793*c0909341SAndroid Build Coastguard Worker
        // Offsets of the width-specific code relative to the table itself,
        // widest first (index 0 = 1280f ... index 6 = 20f).
        // NOTE(review): CONFIG_THUMB is added to every entry, presumably to set
        // the Thumb bit for the bx above in Thumb builds - confirm.
2794*c0909341SAndroid Build Coastguard Worker        .align 2
2795*c0909341SAndroid Build Coastguard WorkerL(\type\()_bilin_v_tbl):
2796*c0909341SAndroid Build Coastguard Worker        .word 1280f - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
2797*c0909341SAndroid Build Coastguard Worker        .word 640f  - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
2798*c0909341SAndroid Build Coastguard Worker        .word 320f  - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
2799*c0909341SAndroid Build Coastguard Worker        .word 160f  - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
2800*c0909341SAndroid Build Coastguard Worker        .word 80f   - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
2801*c0909341SAndroid Build Coastguard Worker        .word 40f   - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
2802*c0909341SAndroid Build Coastguard Worker        .word 20f   - L(\type\()_bilin_v_tbl) + CONFIG_THUMB
2803*c0909341SAndroid Build Coastguard Worker
        // The 2xN code is only assembled for put. For prep nothing is emitted
        // between the labels 20 and 40, so the width 2 table entry leads to the
        // 4xN code.
2804*c0909341SAndroid Build Coastguard Worker20:     // 2xN v
2805*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2806*c0909341SAndroid Build Coastguard Worker        cmp             \h,  #2
        // ds2/sr2 address the odd rows; both strides are doubled so that each
        // pointer steps over two rows at a time.
2807*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd
2808*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd
2809*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1
2810*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1
2811*c0909341SAndroid Build Coastguard Worker
2812*c0909341SAndroid Build Coastguard Worker        // 2x2 v
        // One row is two 16-bit samples = 32 bits, replicated into both halves
        // of the d register by the all-lanes load.
2813*c0909341SAndroid Build Coastguard Worker        vld1.32         {d16[]}, [\src], \s_strd
2814*c0909341SAndroid Build Coastguard Worker        bgt             24f
        // Two-row tail, also reached from the loop at 24 when exactly two rows
        // remain: d16 = {row0, row1}, d17 = {row1, row2} after the vext pair.
2815*c0909341SAndroid Build Coastguard Worker22:
2816*c0909341SAndroid Build Coastguard Worker        vld1.32         {d17[]}, [\sr2], \s_strd
2817*c0909341SAndroid Build Coastguard Worker        vld1.32         {d18[]}, [\src], \s_strd
2818*c0909341SAndroid Build Coastguard Worker        vext.8          d16, d16, d17, #4
2819*c0909341SAndroid Build Coastguard Worker        vext.8          d17, d17, d18, #4
2820*c0909341SAndroid Build Coastguard Worker        vmul.i16        d16, d16, d4
2821*c0909341SAndroid Build Coastguard Worker        vmla.i16        d16, d17, d6
2822*c0909341SAndroid Build Coastguard Worker        vrshr.u16       d16, d16, #4
2823*c0909341SAndroid Build Coastguard Worker        vst1.32         {d16[0]}, [\dst, :32]
2824*c0909341SAndroid Build Coastguard Worker        vst1.32         {d16[1]}, [\ds2, :32]
2825*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
        // Four output rows per iteration. After the vext/vswp shuffle q8 holds
        // rows n..n+3 and q9 holds rows n+1..n+4 (two samples per row).
2826*c0909341SAndroid Build Coastguard Worker24:     // 2x4, 2x6, 2x8, ... v
2827*c0909341SAndroid Build Coastguard Worker        vld1.32         {d17[]}, [\sr2], \s_strd
2828*c0909341SAndroid Build Coastguard Worker        vld1.32         {d18[]}, [\src], \s_strd
2829*c0909341SAndroid Build Coastguard Worker        vld1.32         {d19[]}, [\sr2], \s_strd
2830*c0909341SAndroid Build Coastguard Worker        vld1.32         {d20[]}, [\src], \s_strd
2831*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #4
2832*c0909341SAndroid Build Coastguard Worker        vext.8          d16, d16, d17, #4
2833*c0909341SAndroid Build Coastguard Worker        vext.8          d17, d17, d18, #4
2834*c0909341SAndroid Build Coastguard Worker        vext.8          d18, d18, d19, #4
2835*c0909341SAndroid Build Coastguard Worker        vext.8          d19, d19, d20, #4
2836*c0909341SAndroid Build Coastguard Worker        vswp            d17, d18
2837*c0909341SAndroid Build Coastguard Worker        vmul.i16        q8,  q8,  q2
2838*c0909341SAndroid Build Coastguard Worker        vmla.i16        q8,  q9,  q3
        // h now holds the rows still to do. The flags of this compare are
        // consumed by the blt/beq below; the NEON instructions in between do
        // not modify them.
2839*c0909341SAndroid Build Coastguard Worker        cmp             \h,  #2
2840*c0909341SAndroid Build Coastguard Worker        vrshr.u16       q8,  q8,  #4
2841*c0909341SAndroid Build Coastguard Worker        vst1.32         {d16[0]}, [\dst, :32], \d_strd
2842*c0909341SAndroid Build Coastguard Worker        vst1.32         {d16[1]}, [\ds2, :32], \d_strd
2843*c0909341SAndroid Build Coastguard Worker        vst1.32         {d17[0]}, [\dst, :32], \d_strd
2844*c0909341SAndroid Build Coastguard Worker        vst1.32         {d17[1]}, [\ds2, :32], \d_strd
        // Fewer than 2 rows left: done. Otherwise the last loaded row (d20)
        // becomes the new top row; exactly 2 rows left go through the tail at
        // 22, more than 2 repeat this loop.
2845*c0909341SAndroid Build Coastguard Worker        blt             0f
2846*c0909341SAndroid Build Coastguard Worker        vmov            d16, d20
2847*c0909341SAndroid Build Coastguard Worker        beq             22b
2848*c0909341SAndroid Build Coastguard Worker        b               24b
2849*c0909341SAndroid Build Coastguard Worker0:
2850*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
2851*c0909341SAndroid Build Coastguard Worker.endif
2852*c0909341SAndroid Build Coastguard Worker
        // One row is four 16-bit samples = one d register. Two output rows per
        // iteration: q8 = {row0, row1}, q9 = {row1, row2}.
2853*c0909341SAndroid Build Coastguard Worker40:     // 4xN v
2854*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd
2855*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd
2856*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1
2857*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1
2858*c0909341SAndroid Build Coastguard Worker        vld1.16         {d16}, [\src], \s_strd
2859*c0909341SAndroid Build Coastguard Worker4:
2860*c0909341SAndroid Build Coastguard Worker        vld1.16         {d17}, [\sr2], \s_strd
2861*c0909341SAndroid Build Coastguard Worker        vld1.16         {d19}, [\src], \s_strd
        // Duplicate row1 so that it is both the lower row of the first output
        // and the upper row of the second output.
2862*c0909341SAndroid Build Coastguard Worker        vmov            d18, d17
2863*c0909341SAndroid Build Coastguard Worker        vmul.i16        q8,  q8,  q2
2864*c0909341SAndroid Build Coastguard Worker        vmla.i16        q8,  q9,  q3
2865*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
2866*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2867*c0909341SAndroid Build Coastguard Worker        vrshr.u16       q8,  q8,  #4
2868*c0909341SAndroid Build Coastguard Worker.else
2869*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q8,  q8,  q15
2870*c0909341SAndroid Build Coastguard Worker        vsub.i16        q8,  q8,  q14
2871*c0909341SAndroid Build Coastguard Worker.endif
2872*c0909341SAndroid Build Coastguard Worker        vst1.16         {d16}, [\dst, :64], \d_strd
2873*c0909341SAndroid Build Coastguard Worker        vst1.16         {d17}, [\ds2, :64], \d_strd
2874*c0909341SAndroid Build Coastguard Worker        ble             0f
        // The last loaded row becomes the top row of the next iteration.
2875*c0909341SAndroid Build Coastguard Worker        vmov            d16, d19
2876*c0909341SAndroid Build Coastguard Worker        b               4b
2877*c0909341SAndroid Build Coastguard Worker0:
2878*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
2879*c0909341SAndroid Build Coastguard Worker
        // One row is eight 16-bit samples = one q register. Two output rows per
        // iteration from the rows in q8, q9 and q10.
2880*c0909341SAndroid Build Coastguard Worker80:     // 8xN v
2881*c0909341SAndroid Build Coastguard Worker        add             \ds2,  \dst,  \d_strd
2882*c0909341SAndroid Build Coastguard Worker        add             \sr2,  \src,  \s_strd
2883*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd,  \s_strd,  #1
2884*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd,  \d_strd,  #1
2885*c0909341SAndroid Build Coastguard Worker        vld1.16         {q8},  [\src], \s_strd
2886*c0909341SAndroid Build Coastguard Worker8:
2887*c0909341SAndroid Build Coastguard Worker        vld1.16         {q9},  [\sr2], \s_strd
2888*c0909341SAndroid Build Coastguard Worker        vld1.16         {q10}, [\src], \s_strd
2889*c0909341SAndroid Build Coastguard Worker        vmul.i16        q8,  q8,  q2
2890*c0909341SAndroid Build Coastguard Worker        vmla.i16        q8,  q9,  q3
2891*c0909341SAndroid Build Coastguard Worker        vmul.i16        q9,  q9,  q2
2892*c0909341SAndroid Build Coastguard Worker        vmla.i16        q9,  q10, q3
2893*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
2894*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2895*c0909341SAndroid Build Coastguard Worker        vrshr.u16       q8,  q8,  #4
2896*c0909341SAndroid Build Coastguard Worker        vrshr.u16       q9,  q9,  #4
2897*c0909341SAndroid Build Coastguard Worker.else
2898*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q8,  q8,  q15
2899*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q9,  q9,  q15
2900*c0909341SAndroid Build Coastguard Worker        vsub.i16        q8,  q8,  q14
2901*c0909341SAndroid Build Coastguard Worker        vsub.i16        q9,  q9,  q14
2902*c0909341SAndroid Build Coastguard Worker.endif
2903*c0909341SAndroid Build Coastguard Worker        vst1.16         {q8}, [\dst, :128], \d_strd
2904*c0909341SAndroid Build Coastguard Worker        vst1.16         {q9}, [\ds2, :128], \d_strd
2905*c0909341SAndroid Build Coastguard Worker        ble             0f
2906*c0909341SAndroid Build Coastguard Worker        vmov            q8,  q10
2907*c0909341SAndroid Build Coastguard Worker        b               8b
2908*c0909341SAndroid Build Coastguard Worker0:
2909*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
2910*c0909341SAndroid Build Coastguard Worker
        // Widths of 16 and above are processed as column strips of 16 samples,
        // each strip running over the full height before moving right.
2911*c0909341SAndroid Build Coastguard Worker160:    // 16xN, 32xN, ...
2912*c0909341SAndroid Build Coastguard Worker320:
2913*c0909341SAndroid Build Coastguard Worker640:
2914*c0909341SAndroid Build Coastguard Worker1280:
        // The register passed as the my argument is reused to remember the
        // full height for the next strip.
2915*c0909341SAndroid Build Coastguard Worker        mov             \my, \h
2916*c0909341SAndroid Build Coastguard Worker1:
2917*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd
2918*c0909341SAndroid Build Coastguard Worker        add             \sr2, \src, \s_strd
2919*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1
2920*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
2921*c0909341SAndroid Build Coastguard Worker
2922*c0909341SAndroid Build Coastguard Worker        vld1.16         {q8,  q9},  [\src], \s_strd
        // Two output rows per iteration: q8/q9 = upper row, q10/q11 = middle
        // row, q12/q13 = lower row.
2923*c0909341SAndroid Build Coastguard Worker2:
2924*c0909341SAndroid Build Coastguard Worker        vld1.16         {q10, q11}, [\sr2], \s_strd
2925*c0909341SAndroid Build Coastguard Worker        vld1.16         {q12, q13}, [\src], \s_strd
2926*c0909341SAndroid Build Coastguard Worker        vmul.i16        q8,  q8,  q2
2927*c0909341SAndroid Build Coastguard Worker        vmla.i16        q8,  q10, q3
2928*c0909341SAndroid Build Coastguard Worker        vmul.i16        q9,  q9,  q2
2929*c0909341SAndroid Build Coastguard Worker        vmla.i16        q9,  q11, q3
2930*c0909341SAndroid Build Coastguard Worker        vmul.i16        q10, q10, q2
2931*c0909341SAndroid Build Coastguard Worker        vmla.i16        q10, q12, q3
2932*c0909341SAndroid Build Coastguard Worker        vmul.i16        q11, q11, q2
2933*c0909341SAndroid Build Coastguard Worker        vmla.i16        q11, q13, q3
2934*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
2935*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2936*c0909341SAndroid Build Coastguard Worker        vrshr.u16       q8,  q8,  #4
2937*c0909341SAndroid Build Coastguard Worker        vrshr.u16       q9,  q9,  #4
2938*c0909341SAndroid Build Coastguard Worker        vrshr.u16       q10, q10, #4
2939*c0909341SAndroid Build Coastguard Worker        vrshr.u16       q11, q11, #4
2940*c0909341SAndroid Build Coastguard Worker.else
2941*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q8,  q8,  q15
2942*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q9,  q9,  q15
2943*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q10, q10, q15
2944*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q11, q11, q15
2945*c0909341SAndroid Build Coastguard Worker        vsub.i16        q8,  q8,  q14
2946*c0909341SAndroid Build Coastguard Worker        vsub.i16        q9,  q9,  q14
2947*c0909341SAndroid Build Coastguard Worker        vsub.i16        q10, q10, q14
2948*c0909341SAndroid Build Coastguard Worker        vsub.i16        q11, q11, q14
2949*c0909341SAndroid Build Coastguard Worker.endif
2950*c0909341SAndroid Build Coastguard Worker        vst1.16         {q8,  q9},  [\dst, :128], \d_strd
2951*c0909341SAndroid Build Coastguard Worker        vst1.16         {q10, q11}, [\ds2, :128], \d_strd
2952*c0909341SAndroid Build Coastguard Worker        ble             9f
2953*c0909341SAndroid Build Coastguard Worker        vmov            q8,  q12
2954*c0909341SAndroid Build Coastguard Worker        vmov            q9,  q13
2955*c0909341SAndroid Build Coastguard Worker        b               2b
        // Strip finished. If more width remains: restore the single-row
        // strides, rewind src by h + 2 rows (one priming load plus two rows per
        // iteration) and dst by h rows, then move 16 samples (32 bytes) to the
        // right and restart at 1 with the saved height.
2956*c0909341SAndroid Build Coastguard Worker9:
2957*c0909341SAndroid Build Coastguard Worker        subs            \w,  \w,  #16
2958*c0909341SAndroid Build Coastguard Worker        ble             0f
2959*c0909341SAndroid Build Coastguard Worker        asr             \s_strd, \s_strd, #1
2960*c0909341SAndroid Build Coastguard Worker        asr             \d_strd, \d_strd, #1
2961*c0909341SAndroid Build Coastguard Worker        mls             \src, \s_strd, \my, \src
2962*c0909341SAndroid Build Coastguard Worker        mls             \dst, \d_strd, \my, \dst
2963*c0909341SAndroid Build Coastguard Worker        sub             \src, \src, \s_strd, lsl #1
2964*c0909341SAndroid Build Coastguard Worker        mov             \h,  \my
2965*c0909341SAndroid Build Coastguard Worker        add             \src, \src, #32
2966*c0909341SAndroid Build Coastguard Worker        add             \dst, \dst, #32
2967*c0909341SAndroid Build Coastguard Worker        b               1b
2968*c0909341SAndroid Build Coastguard Worker0:
2969*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
2970*c0909341SAndroid Build Coastguard Worker
// Combined horizontal + vertical bilinear filter for 16-bit samples.
// Horizontal pass: each input row is filtered as p[x] * q0 + p[x+1] * q1 in
// 16-bit lanes (d0/d2 are the low halves of q0/q1) and then rounded and
// shifted right through vrshl.u16 with the negated count held in q15.
// Vertical pass: two consecutive filtered rows a and b are combined as
// a * d4 + b * d6 with widening to 32 bits.
// put:  the 32-bit sum is rounded and shifted right through vrshl.u32 with
//       the negated count held in q14, then narrowed to 16 bits.
// prep: the 32-bit sum is rounded, shifted right by 4 and narrowed, then
//       PREP_BIAS (held in q14) is subtracted.
// q0-q3 are only read in this section, so they are set up earlier.
// NOTE(review): presumably q0/q1 are the horizontal and q2/q3 the vertical
// weights - confirm against the macro prologue, which is not visible here.
2971*c0909341SAndroid Build Coastguard WorkerL(\type\()_bilin_hv):
2972*c0909341SAndroid Build Coastguard Worker        adr             r10, L(\type\()_bilin_hv_tbl)
2973*c0909341SAndroid Build Coastguard Worker        vdup.16         q15, r11          // 4 - intermediate_bits
        // r9 is the jump table index on entry (computed before this section);
        // it is replaced by the offset read from the table.
2974*c0909341SAndroid Build Coastguard Worker        ldr             r9,  [r10, r9, lsl #2]
2975*c0909341SAndroid Build Coastguard Worker        vneg.s16        q15, q15          // -(4-intermediate_bits)
2976*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2977*c0909341SAndroid Build Coastguard Worker        vdup.32         q14, r12          // 4 + intermediate_bits
2978*c0909341SAndroid Build Coastguard Worker.else
2979*c0909341SAndroid Build Coastguard Worker        vmov.i16        q14, #PREP_BIAS
2980*c0909341SAndroid Build Coastguard Worker.endif
2981*c0909341SAndroid Build Coastguard Worker        add             r10, r10, r9
2982*c0909341SAndroid Build Coastguard Worker.ifc \type, put
2983*c0909341SAndroid Build Coastguard Worker        vneg.s32        q14, q14          // -(4+intermediate_bits)
2984*c0909341SAndroid Build Coastguard Worker.endif
2985*c0909341SAndroid Build Coastguard Worker        bx              r10
2986*c0909341SAndroid Build Coastguard Worker
        // Offsets of the width-specific code relative to the table itself,
        // widest first (index 0 = 1280f ... index 6 = 20f).
        // NOTE(review): CONFIG_THUMB is added to every entry, presumably to set
        // the Thumb bit for the bx above in Thumb builds - confirm.
2987*c0909341SAndroid Build Coastguard Worker        .align 2
2988*c0909341SAndroid Build Coastguard WorkerL(\type\()_bilin_hv_tbl):
2989*c0909341SAndroid Build Coastguard Worker        .word 1280f - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
2990*c0909341SAndroid Build Coastguard Worker        .word 640f  - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
2991*c0909341SAndroid Build Coastguard Worker        .word 320f  - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
2992*c0909341SAndroid Build Coastguard Worker        .word 160f  - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
2993*c0909341SAndroid Build Coastguard Worker        .word 80f   - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
2994*c0909341SAndroid Build Coastguard Worker        .word 40f   - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
2995*c0909341SAndroid Build Coastguard Worker        .word 20f   - L(\type\()_bilin_hv_tbl) + CONFIG_THUMB
2996*c0909341SAndroid Build Coastguard Worker
        // The 2xN code is only assembled for put. For prep nothing is emitted
        // between the labels 20 and 40, so the width 2 table entry leads to the
        // 4xN code.
2997*c0909341SAndroid Build Coastguard Worker20:     // 2xN hv
2998*c0909341SAndroid Build Coastguard Worker.ifc \type, put
        // ds2/sr2 address the odd rows; both strides are doubled so that each
        // pointer steps over two rows at a time.
2999*c0909341SAndroid Build Coastguard Worker        add             \sr2, \src, \s_strd
3000*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd
3001*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1
3002*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
3003*c0909341SAndroid Build Coastguard Worker
        // Priming row: filter it horizontally, then rotate the register by 32
        // bits so that the two valid results sit in the upper half, where the
        // vext in the loop picks up the previous row.
3004*c0909341SAndroid Build Coastguard Worker        vld1.16         {d20}, [\src], \s_strd
3005*c0909341SAndroid Build Coastguard Worker        vext.8          d21, d20, d20, #2
3006*c0909341SAndroid Build Coastguard Worker        vmul.i16        d16, d20, d0
3007*c0909341SAndroid Build Coastguard Worker        vmla.i16        d16, d21, d2
3008*c0909341SAndroid Build Coastguard Worker        vrshl.u16       d16, d16, d30
3009*c0909341SAndroid Build Coastguard Worker        vext.8          d16, d16, d16, #4
3010*c0909341SAndroid Build Coastguard Worker
3011*c0909341SAndroid Build Coastguard Worker2:
3012*c0909341SAndroid Build Coastguard Worker        vld1.16         {d20}, [\sr2], \s_strd
3013*c0909341SAndroid Build Coastguard Worker        vld1.16         {d22}, [\src], \s_strd
3014*c0909341SAndroid Build Coastguard Worker        vext.8          d21, d20, d20, #2
3015*c0909341SAndroid Build Coastguard Worker        vext.8          d23, d22, d22, #2
        // Pack both new rows into one register each: d20 = samples 0-1 of both
        // rows, d21 = samples 1-2 of both rows.
3016*c0909341SAndroid Build Coastguard Worker        vtrn.32         d20, d22
3017*c0909341SAndroid Build Coastguard Worker        vtrn.32         d21, d23
3018*c0909341SAndroid Build Coastguard Worker        vmul.i16        d18, d20, d0
3019*c0909341SAndroid Build Coastguard Worker        vmla.i16        d18, d21, d2
3020*c0909341SAndroid Build Coastguard Worker        vrshl.u16       d18, d18, d30
3021*c0909341SAndroid Build Coastguard Worker
        // d16 = {previous filtered row, first new row},
        // d18 = {first new row, second new row}.
3022*c0909341SAndroid Build Coastguard Worker        vext.8          d16, d16, d18, #4
3023*c0909341SAndroid Build Coastguard Worker
3024*c0909341SAndroid Build Coastguard Worker        vmull.u16       q8,  d16, d4
3025*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q8,  d18, d6
3026*c0909341SAndroid Build Coastguard Worker        vrshl.u32       q8,  q8,  q14
3027*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d16, q8
3028*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
3029*c0909341SAndroid Build Coastguard Worker        vst1.32         {d16[0]}, [\dst, :32], \d_strd
3030*c0909341SAndroid Build Coastguard Worker        vst1.32         {d16[1]}, [\ds2, :32], \d_strd
3031*c0909341SAndroid Build Coastguard Worker        ble             0f
        // Carry over: the upper half of d18 (second new row) is the previous
        // row of the next iteration.
3032*c0909341SAndroid Build Coastguard Worker        vmov            d16, d18
3033*c0909341SAndroid Build Coastguard Worker        b               2b
3034*c0909341SAndroid Build Coastguard Worker0:
3035*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
3036*c0909341SAndroid Build Coastguard Worker.endif
3037*c0909341SAndroid Build Coastguard Worker
        // Each load fetches 8 samples per row; the horizontal filter for 4
        // outputs uses samples 0-4 (d21 becomes samples 1-4 through vext).
3038*c0909341SAndroid Build Coastguard Worker40:     // 4xN hv
3039*c0909341SAndroid Build Coastguard Worker        add             \sr2, \src, \s_strd
3040*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd
3041*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1
3042*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
3043*c0909341SAndroid Build Coastguard Worker
        // Priming row, horizontally filtered into d16.
3044*c0909341SAndroid Build Coastguard Worker        vld1.16         {q10}, [\src], \s_strd
3045*c0909341SAndroid Build Coastguard Worker        vext.8          d21, d20, d21, #2
3046*c0909341SAndroid Build Coastguard Worker        vmul.i16        d16, d20, d0
3047*c0909341SAndroid Build Coastguard Worker        vmla.i16        d16, d21, d2
3048*c0909341SAndroid Build Coastguard Worker        vrshl.u16       d16, d16, d30
3049*c0909341SAndroid Build Coastguard Worker
3050*c0909341SAndroid Build Coastguard Worker4:
3051*c0909341SAndroid Build Coastguard Worker        vld1.16         {q10}, [\sr2], \s_strd
3052*c0909341SAndroid Build Coastguard Worker        vld1.16         {q11}, [\src], \s_strd
3053*c0909341SAndroid Build Coastguard Worker        vext.8          d21, d20, d21, #2
3054*c0909341SAndroid Build Coastguard Worker        vext.8          d23, d22, d23, #2
        // After the swap q10 = samples 0-3 of both new rows and q11 = samples
        // 1-4 of both new rows, so one multiply pair filters two rows.
3055*c0909341SAndroid Build Coastguard Worker        vswp            d21, d22
3056*c0909341SAndroid Build Coastguard Worker        vmul.i16        q9,  q10, q0
3057*c0909341SAndroid Build Coastguard Worker        vmla.i16        q9,  q11, q1
3058*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q9,  q9,  q15
3059*c0909341SAndroid Build Coastguard Worker
        // Vertical pass: d16 = previous row, d18 = first new row, d19 = second
        // new row (all horizontally filtered).
3060*c0909341SAndroid Build Coastguard Worker        vmull.u16       q10, d16, d4
3061*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q10, d18, d6
3062*c0909341SAndroid Build Coastguard Worker        vmull.u16       q11, d18, d4
3063*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q11, d19, d6
3064*c0909341SAndroid Build Coastguard Worker.ifc \type, put
3065*c0909341SAndroid Build Coastguard Worker        vrshl.u32       q10, q10, q14
3066*c0909341SAndroid Build Coastguard Worker        vrshl.u32       q11, q11, q14
3067*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d20, q10
3068*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d21, q11
3069*c0909341SAndroid Build Coastguard Worker.else
3070*c0909341SAndroid Build Coastguard Worker        vrshrn.i32      d20, q10, #4
3071*c0909341SAndroid Build Coastguard Worker        vrshrn.i32      d21, q11, #4
3072*c0909341SAndroid Build Coastguard Worker        vsub.i16        q10, q10, q14
3073*c0909341SAndroid Build Coastguard Worker.endif
3074*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
3075*c0909341SAndroid Build Coastguard Worker        vst1.16         {d20}, [\dst, :64], \d_strd
3076*c0909341SAndroid Build Coastguard Worker        vst1.16         {d21}, [\ds2, :64], \d_strd
3077*c0909341SAndroid Build Coastguard Worker        ble             0f
        // The second new row becomes the previous row of the next iteration.
3078*c0909341SAndroid Build Coastguard Worker        vmov            d16, d19
3079*c0909341SAndroid Build Coastguard Worker        b               4b
3080*c0909341SAndroid Build Coastguard Worker0:
3081*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
3082*c0909341SAndroid Build Coastguard Worker
        // Widths of 8 and above are processed as column strips of 8 samples,
        // each strip running over the full height before moving right.
3083*c0909341SAndroid Build Coastguard Worker80:     // 8xN, 16xN, ... hv
3084*c0909341SAndroid Build Coastguard Worker160:
3085*c0909341SAndroid Build Coastguard Worker320:
3086*c0909341SAndroid Build Coastguard Worker640:
3087*c0909341SAndroid Build Coastguard Worker1280:
        // The register passed as the my argument is reused to remember the
        // full height for the next strip.
3088*c0909341SAndroid Build Coastguard Worker        mov             \my, \h
3089*c0909341SAndroid Build Coastguard Worker
3090*c0909341SAndroid Build Coastguard Worker1:
3091*c0909341SAndroid Build Coastguard Worker        add             \sr2, \src, \s_strd
3092*c0909341SAndroid Build Coastguard Worker        add             \ds2, \dst, \d_strd
3093*c0909341SAndroid Build Coastguard Worker        lsl             \s_strd, \s_strd, #1
3094*c0909341SAndroid Build Coastguard Worker        lsl             \d_strd, \d_strd, #1
3095*c0909341SAndroid Build Coastguard Worker
        // Priming row. 12 samples are loaded; the vext by one sample only
        // consumes the first sample of d22, so 9 samples are actually used.
3096*c0909341SAndroid Build Coastguard Worker        vld1.16         {d20, d21, d22}, [\src], \s_strd
3097*c0909341SAndroid Build Coastguard Worker        vext.8          q11, q10, q11, #2
3098*c0909341SAndroid Build Coastguard Worker        vmul.i16        q8,  q10, q0
3099*c0909341SAndroid Build Coastguard Worker        vmla.i16        q8,  q11, q1
3100*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q8,  q8,  q15
3101*c0909341SAndroid Build Coastguard Worker
        // Two output rows per iteration. After the horizontal pass q8 is the
        // previous row, q9 the first new row and q10 the second new row.
3102*c0909341SAndroid Build Coastguard Worker2:
3103*c0909341SAndroid Build Coastguard Worker        vld1.16         {d20, d21, d22}, [\sr2], \s_strd
3104*c0909341SAndroid Build Coastguard Worker        vld1.16         {d24, d25, d26}, [\src], \s_strd
3105*c0909341SAndroid Build Coastguard Worker        vext.8          q11, q10, q11, #2
3106*c0909341SAndroid Build Coastguard Worker        vext.8          q13, q12, q13, #2
3107*c0909341SAndroid Build Coastguard Worker        vmul.i16        q9,  q10, q0
3108*c0909341SAndroid Build Coastguard Worker        vmla.i16        q9,  q11, q1
3109*c0909341SAndroid Build Coastguard Worker        vmul.i16        q10, q12, q0
3110*c0909341SAndroid Build Coastguard Worker        vmla.i16        q10, q13, q1
3111*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q9,  q9,  q15
3112*c0909341SAndroid Build Coastguard Worker        vrshl.u16       q10, q10, q15
3113*c0909341SAndroid Build Coastguard Worker
        // Vertical pass in 32 bits: q11/q12 = first output row (low/high
        // halves), q8/q9 = second output row. q8 and q9 are overwritten only
        // after their halves d16-d19 have been consumed as inputs.
3114*c0909341SAndroid Build Coastguard Worker        vmull.u16       q11, d16, d4
3115*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q11, d18, d6
3116*c0909341SAndroid Build Coastguard Worker        vmull.u16       q12, d17, d4
3117*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q12, d19, d6
3118*c0909341SAndroid Build Coastguard Worker        vmull.u16       q8,  d18, d4
3119*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q8,  d20, d6
3120*c0909341SAndroid Build Coastguard Worker        vmull.u16       q9,  d19, d4
3121*c0909341SAndroid Build Coastguard Worker        vmlal.u16       q9,  d21, d6
3122*c0909341SAndroid Build Coastguard Worker.ifc \type, put
3123*c0909341SAndroid Build Coastguard Worker        vrshl.u32       q11, q11, q14
3124*c0909341SAndroid Build Coastguard Worker        vrshl.u32       q12, q12, q14
3125*c0909341SAndroid Build Coastguard Worker        vrshl.u32       q8,  q8,  q14
3126*c0909341SAndroid Build Coastguard Worker        vrshl.u32       q9,  q9,  q14
3127*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d22, q11
3128*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d23, q12
3129*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d16, q8
3130*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d17, q9
3131*c0909341SAndroid Build Coastguard Worker.else
3132*c0909341SAndroid Build Coastguard Worker        vrshrn.i32      d22, q11, #4
3133*c0909341SAndroid Build Coastguard Worker        vrshrn.i32      d23, q12, #4
3134*c0909341SAndroid Build Coastguard Worker        vrshrn.i32      d16, q8,  #4
3135*c0909341SAndroid Build Coastguard Worker        vrshrn.i32      d17, q9,  #4
3136*c0909341SAndroid Build Coastguard Worker        vsub.i16        q11, q11, q14
3137*c0909341SAndroid Build Coastguard Worker        vsub.i16        q8,  q8,  q14
3138*c0909341SAndroid Build Coastguard Worker.endif
3139*c0909341SAndroid Build Coastguard Worker        subs            \h,  \h,  #2
3140*c0909341SAndroid Build Coastguard Worker        vst1.16         {q11}, [\dst, :128], \d_strd
3141*c0909341SAndroid Build Coastguard Worker        vst1.16         {q8},  [\ds2, :128], \d_strd
3142*c0909341SAndroid Build Coastguard Worker        ble             9f
        // The second new (horizontally filtered) row becomes the previous row
        // of the next iteration.
3143*c0909341SAndroid Build Coastguard Worker        vmov            q8,  q10
3144*c0909341SAndroid Build Coastguard Worker        b               2b
        // Strip finished. If more width remains: restore the single-row
        // strides, rewind src by h + 2 rows (one priming load plus two rows per
        // iteration) and dst by h rows, then move 8 samples (16 bytes) to the
        // right and restart at 1 with the saved height.
3145*c0909341SAndroid Build Coastguard Worker9:
3146*c0909341SAndroid Build Coastguard Worker        subs            \w,  \w,  #8
3147*c0909341SAndroid Build Coastguard Worker        ble             0f
3148*c0909341SAndroid Build Coastguard Worker        asr             \s_strd,  \s_strd,  #1
3149*c0909341SAndroid Build Coastguard Worker        asr             \d_strd,  \d_strd,  #1
3150*c0909341SAndroid Build Coastguard Worker        mls             \src,  \s_strd,  \my,  \src
3151*c0909341SAndroid Build Coastguard Worker        mls             \dst,  \d_strd,  \my,  \dst
3152*c0909341SAndroid Build Coastguard Worker        sub             \src,  \src,  \s_strd,  lsl #1
3153*c0909341SAndroid Build Coastguard Worker        mov             \h,  \my
3154*c0909341SAndroid Build Coastguard Worker        add             \src,  \src,  #16
3155*c0909341SAndroid Build Coastguard Worker        add             \dst,  \dst,  #16
3156*c0909341SAndroid Build Coastguard Worker        b               1b
3157*c0909341SAndroid Build Coastguard Worker0:
3158*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
3159*c0909341SAndroid Build Coastguard Workerendfunc
3160*c0909341SAndroid Build Coastguard Worker.endm
3161*c0909341SAndroid Build Coastguard Worker
// Instantiate the filter_fn macro once per variant. The first argument selects
// the variant (the macro body tests it with .ifc against put and prep); the
// remaining arguments are the core registers substituted for the macro
// parameters. The two variants use different register assignments: prep passes
// r8 as its second register and r1-r7 for the following ones.
// NOTE(review): the macro header is not visible from here, so the exact
// parameter each register maps to should be confirmed there.
3162*c0909341SAndroid Build Coastguard Workerfilter_fn put,  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
3163*c0909341SAndroid Build Coastguard Workerfilter_fn prep, r0, r8, r1, r2, r3, r4, r5, r6, r7, r9, r10
3164*c0909341SAndroid Build Coastguard Worker
// load_filter_ptr src
// Compute the address of a filter table entry: r12 = r11 + (src >> 10) * 8.
//   src: core register holding the position; it is not modified.
// Clobbers r12. r11 is used as the base address and must be set up by the
// code using this macro.
// NOTE(review): presumably src is a fixed-point position with 10 fractional
// bits and each table entry is 8 bytes - confirm against the table definition.
3165*c0909341SAndroid Build Coastguard Worker.macro load_filter_ptr src
3166*c0909341SAndroid Build Coastguard Worker        asr             r12, \src, #10              // r12 = src >> 10 (arithmetic, keeps the sign)
3167*c0909341SAndroid Build Coastguard Worker        add             r12, r11, r12, lsl #3       // r12 = r11 + r12 * 8
3168*c0909341SAndroid Build Coastguard Worker.endm
3169*c0909341SAndroid Build Coastguard Worker
// load_filter_coef dst, src, inc
// Load the 8 bytes at the address in r12 into dst and advance src by inc.
//   dst: NEON d register receiving the 8 bytes.
//   src: core register holding the position; updated to src + inc.
//   inc: core register holding the step.
// r12 must already hold the entry address (see load_filter_ptr); it is not
// recomputed here, so the loaded entry does not reflect the incremented src.
3170*c0909341SAndroid Build Coastguard Worker.macro load_filter_coef dst, src, inc
3171*c0909341SAndroid Build Coastguard Worker        add             \src, \src, \inc            // step the position for the next filter
3172*c0909341SAndroid Build Coastguard Worker        vld1.8          {\dst}, [r12, :64]          // 8 bytes from r12, address hinted as 64-bit aligned
3173*c0909341SAndroid Build Coastguard Worker.endm
3174*c0909341SAndroid Build Coastguard Worker
// load_filter_row dst, src, inc
//   Convenience wrapper: load_filter_ptr followed by load_filter_coef, i.e.
//   fetch the 8-byte filter selected by the current \src into \dst, then
//   advance \src by \inc. Clobbers r12.
3175*c0909341SAndroid Build Coastguard Worker.macro load_filter_row dst, src, inc
3176*c0909341SAndroid Build Coastguard Worker        load_filter_ptr \src
3177*c0909341SAndroid Build Coastguard Worker        load_filter_coef \dst, \src, \inc
3178*c0909341SAndroid Build Coastguard Worker.endm
3179*c0909341SAndroid Build Coastguard Worker
// Horizontal pass of the warp filter for one source row. Internal helper
// (declared without export=1) reached via bl from the warp macro below.
//
// Each of the 8 output pixels uses its own 8-tap filter: pixel k multiplies
// source pixels k..k+7 by the filter selected by the position r5 + k*r7, and
// the eight products are summed with a tree of pairwise adds.
//
// Inputs:
//   r2    source row pointer; post-incremented by r3 by the row load
//   r3    source stride in bytes
//   r5    horizontal filter position for output pixel 0 of this row
//   r7    position step per output pixel
//   r8    position step per row
//   r11   filter table base used by load_filter_ptr
//   [sp]  32-bit shift amount, -(7 - intermediate_bits)
// Outputs:
//   q4    output pixels 0-3 as 32-bit sums after the rounding shift
//   q5    output pixels 4-7, likewise
//   r5    advanced by r8 (the eight per-pixel r7 steps are undone first)
// Clobbers: q0-q3, q6, q7, r12. q4-q7 are not preserved here; the caller is
// expected to have saved them.
//
// The filter loads, widening moves and multiplies for neighbouring pixels
// are interleaved, so the instruction order does not follow pixel order.
3180*c0909341SAndroid Build Coastguard Workerfunction warp_filter_horz_neon
3181*c0909341SAndroid Build Coastguard Worker        load_filter_ptr r5                  // filter 0
3182*c0909341SAndroid Build Coastguard Worker        vld1.16         {q6,q7}, [r2], r3   // 16 source pixels; r2 moves to the next row
3183*c0909341SAndroid Build Coastguard Worker
3184*c0909341SAndroid Build Coastguard Worker        load_filter_coef d0, r5,  r7        // filter 0
3185*c0909341SAndroid Build Coastguard Worker        load_filter_row d2,  r5,  r7        // filter 1
3186*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q0,  d0             // filter 0
3187*c0909341SAndroid Build Coastguard Worker        vext.8          q3,  q6,  q7,  #2*1 // filter 1 pixels
3188*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q1,  d2             // filter 1
3189*c0909341SAndroid Build Coastguard Worker
3190*c0909341SAndroid Build Coastguard Worker        vmull.s16       q4,  d12, d0        // filter 0 output (0-3)
3191*c0909341SAndroid Build Coastguard Worker        vmull.s16       q5,  d13, d1        // filter 0 output (4-7)
3192*c0909341SAndroid Build Coastguard Worker
3193*c0909341SAndroid Build Coastguard Worker        load_filter_ptr r5                  // filter 2
3194*c0909341SAndroid Build Coastguard Worker
3195*c0909341SAndroid Build Coastguard Worker        vmull.s16       q2,  d6,  d2        // filter 1 output (0-3)
3196*c0909341SAndroid Build Coastguard Worker        vmull.s16       q3,  d7,  d3        // filter 1 output (4-7)
3197*c0909341SAndroid Build Coastguard Worker
3198*c0909341SAndroid Build Coastguard Worker        load_filter_coef d0, r5,  r7        // filter 2
3199*c0909341SAndroid Build Coastguard Worker
3200*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d8,  d8,  d9        // half pixel 0 (2x32)
3201*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d9,  d10, d11       // half pixel 0 (2x32)
3202*c0909341SAndroid Build Coastguard Worker
3203*c0909341SAndroid Build Coastguard Worker        load_filter_ptr r5                  // filter 3
3204*c0909341SAndroid Build Coastguard Worker
3205*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d4,  d4,  d5        // half pixel 1 (2x32)
3206*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d5,  d6,  d7        // half pixel 1 (2x32)
3207*c0909341SAndroid Build Coastguard Worker
3208*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q0,  d0             // filter 2
3209*c0909341SAndroid Build Coastguard Worker        vext.8          q3,  q6,  q7,  #2*2 // filter 2 pixels
3210*c0909341SAndroid Build Coastguard Worker
3211*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d8,  d8,  d9        // pixel 0 (2x32)
3212*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d9,  d4,  d5        // pixel 1 (2x32)
3213*c0909341SAndroid Build Coastguard Worker
3214*c0909341SAndroid Build Coastguard Worker        load_filter_coef d2, r5,  r7        // filter 3
3215*c0909341SAndroid Build Coastguard Worker
3216*c0909341SAndroid Build Coastguard Worker        vmull.s16       q2,  d6,  d0        // filter 2 output (0-3)
3217*c0909341SAndroid Build Coastguard Worker        vmull.s16       q3,  d7,  d1        // filter 2 output (4-7)
3218*c0909341SAndroid Build Coastguard Worker
3219*c0909341SAndroid Build Coastguard Worker        load_filter_ptr r5                  // filter 4
3220*c0909341SAndroid Build Coastguard Worker
3221*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d8,  d8,  d9        // pixel 0,1
3222*c0909341SAndroid Build Coastguard Worker
3223*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d9,  d4,  d5        // half pixel 2 (2x32)
3224*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d10, d6,  d7        // half pixel 2 (2x32)
3225*c0909341SAndroid Build Coastguard Worker
3226*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q1,  d2             // filter 3
3227*c0909341SAndroid Build Coastguard Worker        vext.8          q3,  q6,  q7,  #2*3 // filter 3 pixels
3228*c0909341SAndroid Build Coastguard Worker
3229*c0909341SAndroid Build Coastguard Worker        load_filter_coef d0, r5,  r7        // filter 4
3230*c0909341SAndroid Build Coastguard Worker
3231*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d9,  d9,  d10       // pixel 2 (2x32)
3232*c0909341SAndroid Build Coastguard Worker
3233*c0909341SAndroid Build Coastguard Worker        vmull.s16       q2,  d6,  d2        // filter 3 output (0-3)
3234*c0909341SAndroid Build Coastguard Worker        vmull.s16       q3,  d7,  d3        // filter 3 output (4-7)
3235*c0909341SAndroid Build Coastguard Worker
3236*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q0,  d0             // filter 4
3237*c0909341SAndroid Build Coastguard Worker        load_filter_ptr r5                  // filter 5
3238*c0909341SAndroid Build Coastguard Worker
3239*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d10, d4,  d5        // half pixel 3 (2x32)
3240*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d11, d6,  d7        // half pixel 3 (2x32)
3241*c0909341SAndroid Build Coastguard Worker
3242*c0909341SAndroid Build Coastguard Worker        vext.8          q3,  q6,  q7,  #2*4 // filter 4 pixels
3243*c0909341SAndroid Build Coastguard Worker        load_filter_coef d2, r5,  r7        // filter 5
3244*c0909341SAndroid Build Coastguard Worker
3245*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d10, d10, d11       // pixel 3 (2x32)
3246*c0909341SAndroid Build Coastguard Worker
3247*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d9,  d9,  d10       // pixel 2,3
3248*c0909341SAndroid Build Coastguard Worker
3249*c0909341SAndroid Build Coastguard Worker        vmull.s16       q2,  d6,  d0        // filter 4 output (0-3)
3250*c0909341SAndroid Build Coastguard Worker        vmull.s16       q3,  d7,  d1        // filter 4 output (4-7)
3251*c0909341SAndroid Build Coastguard Worker
3252*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q1,  d2             // filter 5
3253*c0909341SAndroid Build Coastguard Worker        load_filter_ptr r5                  // filter 6
3254*c0909341SAndroid Build Coastguard Worker
3255*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d10, d4,  d5        // half pixel 4 (2x32)
3256*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d11, d6,  d7        // half pixel 4 (2x32)
3257*c0909341SAndroid Build Coastguard Worker
3258*c0909341SAndroid Build Coastguard Worker        vext.8          q3,  q6,  q7,  #2*5 // filter 5 pixels
3259*c0909341SAndroid Build Coastguard Worker        load_filter_coef d0, r5,  r7        // filter 6
3260*c0909341SAndroid Build Coastguard Worker
3261*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d10, d10, d11       // pixel 4 (2x32)
3262*c0909341SAndroid Build Coastguard Worker
3263*c0909341SAndroid Build Coastguard Worker        vmull.s16       q2,  d6,  d2        // filter 5 output (0-3)
3264*c0909341SAndroid Build Coastguard Worker        vmull.s16       q3,  d7,  d3        // filter 5 output (4-7)
3265*c0909341SAndroid Build Coastguard Worker
3266*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q0,  d0             // filter 6
3267*c0909341SAndroid Build Coastguard Worker        load_filter_ptr r5                  // filter 7
3268*c0909341SAndroid Build Coastguard Worker
3269*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d4,  d4,  d5        // half pixel 5 (2x32)
3270*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d5,  d6,  d7        // half pixel 5 (2x32)
3271*c0909341SAndroid Build Coastguard Worker
3272*c0909341SAndroid Build Coastguard Worker        vext.8          q3,  q6,  q7,  #2*6 // filter 6 pixels
3273*c0909341SAndroid Build Coastguard Worker        load_filter_coef d2, r5,  r7        // filter 7
3274*c0909341SAndroid Build Coastguard Worker
3275*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d11, d4,  d5        // pixel 5 (2x32)
3276*c0909341SAndroid Build Coastguard Worker
3277*c0909341SAndroid Build Coastguard Worker        vmull.s16       q2,  d6,  d0        // filter 6 output (0-3)
3278*c0909341SAndroid Build Coastguard Worker        vmull.s16       q3,  d7,  d1        // filter 6 output (4-7)
3279*c0909341SAndroid Build Coastguard Worker
3280*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q1,  d2             // filter 7
3281*c0909341SAndroid Build Coastguard Worker
3282*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d10, d10, d11       // pixel 4,5
3283*c0909341SAndroid Build Coastguard Worker
3284*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d4,  d4,  d5        // half pixel 6 (2x32)
3285*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d5,  d6,  d7        // half pixel 6 (2x32)
3286*c0909341SAndroid Build Coastguard Worker
3287*c0909341SAndroid Build Coastguard Worker        vext.8          q3,  q6,  q7,  #2*7 // filter 7 pixels
3288*c0909341SAndroid Build Coastguard Worker
3289*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d11, d4,  d5        // pixel 6 (2x32)
3290*c0909341SAndroid Build Coastguard Worker
3291*c0909341SAndroid Build Coastguard Worker        vmull.s16       q2,  d6,  d2        // filter 7 output (0-3)
3292*c0909341SAndroid Build Coastguard Worker        vmull.s16       q3,  d7,  d3        // filter 7 output (4-7)
3293*c0909341SAndroid Build Coastguard Worker
        // The last vext above was the final read of the source pixels in
        // q6/q7, so q7 can now be reused for the broadcast shift amount.
3294*c0909341SAndroid Build Coastguard Worker        vld1.32         {d14[],d15[]}, [sp] // -(7 - intermediate_bits)
3295*c0909341SAndroid Build Coastguard Worker
3296*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d4,  d4,  d5        // half pixel 7 (2x32)
3297*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d5,  d6,  d7        // half pixel 7 (2x32)
3298*c0909341SAndroid Build Coastguard Worker
3299*c0909341SAndroid Build Coastguard Worker        sub             r5,  r5,  r7, lsl #3 // undo the eight per-pixel steps
3300*c0909341SAndroid Build Coastguard Worker
3301*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d4,  d4,  d5        // pixel 7 (2x32)
3302*c0909341SAndroid Build Coastguard Worker
3303*c0909341SAndroid Build Coastguard Worker        add             r5,  r5,  r8        // position for the next row
3304*c0909341SAndroid Build Coastguard Worker
3305*c0909341SAndroid Build Coastguard Worker        vpadd.i32       d11, d11, d4        // pixel 6,7
3306*c0909341SAndroid Build Coastguard Worker
        // vrshl with a negative per-lane count is a rounding right shift.
3307*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q4,  q4,  q7        // -(7 - intermediate_bits)
3308*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q5,  q5,  q7        // -(7 - intermediate_bits)
3309*c0909341SAndroid Build Coastguard Worker
3310*c0909341SAndroid Build Coastguard Worker        bx              lr
3311*c0909341SAndroid Build Coastguard Workerendfunc
3312*c0909341SAndroid Build Coastguard Worker
3313*c0909341SAndroid Build Coastguard Worker// void dav1d_warp_affine_8x8_16bpc_neon(
3314*c0909341SAndroid Build Coastguard Worker//         pixel *dst, const ptrdiff_t dst_stride,
3315*c0909341SAndroid Build Coastguard Worker//         const pixel *src, const ptrdiff_t src_stride,
3316*c0909341SAndroid Build Coastguard Worker//         const int16_t *const abcd, int mx, int my,
3317*c0909341SAndroid Build Coastguard Worker//         const int bitdepth_max)
//
// The warp macro emits one function per expansion, producing an 8x8 block.
//   \t blank:     warp_affine_8x8_16bpc_neon. The vertical sum is rounding-
//                 shifted by (7 + intermediate_bits), saturated to unsigned
//                 16 bit and clamped to bitdepth_max before being stored.
//   \t non-blank: warp_affine_8x8\t_16bpc_neon. The vertical sum is narrowed
//                 with a rounding shift by 7 and PREP_BIAS is subtracted;
//                 r1 is doubled on entry (presumably because the stride is
//                 passed in 16-bit elements for this variant - TODO confirm).
//
// Register use once setup is complete:
//   r0 / r1   dst pointer / dst stride (bytes added per stored row)
//   r2 / r3   src pointer / src stride in bytes
//   r5        horizontal filter position, r7 = step per pixel, r8 = step per row
//   r6        vertical filter position,   r9 = step per column, r4 = step per row
//   r10       remaining output rows (starts at 8)
//   r11       filter table base for load_filter_ptr
//   q8-q15    sliding window of eight horizontally filtered rows (16 bit)
//   [sp]      -(7 - intermediate_bits), read by warp_filter_horz_neon
//   [sp, #4]  -(7 + intermediate_bits), only written when \t is blank
3318*c0909341SAndroid Build Coastguard Worker.macro warp t
3319*c0909341SAndroid Build Coastguard Workerfunction warp_affine_8x8\t\()_16bpc_neon, export=1
3320*c0909341SAndroid Build Coastguard Worker        push            {r4-r11,lr}
3321*c0909341SAndroid Build Coastguard Worker        vpush           {q4-q7}
        // 9 core registers (36 bytes) + q4-q7 (64 bytes) were pushed, so the
        // stack-passed arguments start at sp + 100: abcd, mx, my, bitdepth_max.
3322*c0909341SAndroid Build Coastguard Worker        ldrd            r4,  r5,  [sp, #100]
3323*c0909341SAndroid Build Coastguard Worker        ldrd            r6,  r7,  [sp, #108]
3324*c0909341SAndroid Build Coastguard Worker        sub             sp,  sp,  #8  // two 32-bit scratch slots for the shift amounts
3325*c0909341SAndroid Build Coastguard Worker
3326*c0909341SAndroid Build Coastguard Worker        clz             r7,  r7
3327*c0909341SAndroid Build Coastguard Worker                                      // intermediate_bits = clz(bitdepth_max) - 18
3328*c0909341SAndroid Build Coastguard Worker.ifb \t
3329*c0909341SAndroid Build Coastguard Worker        sub             r8,  r7,  #11 // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
3330*c0909341SAndroid Build Coastguard Worker.endif
3331*c0909341SAndroid Build Coastguard Worker        sub             r7,  r7,  #25 // -(7 - intermediate_bits)
3332*c0909341SAndroid Build Coastguard Worker.ifb \t
3333*c0909341SAndroid Build Coastguard Worker        neg             r8,  r8       // -(7 + intermediate_bits)
3334*c0909341SAndroid Build Coastguard Worker.endif
3335*c0909341SAndroid Build Coastguard Worker        str             r7,  [sp]     // spill -(7 - intermediate_bits) on stack
3336*c0909341SAndroid Build Coastguard Worker.ifb \t
3337*c0909341SAndroid Build Coastguard Worker        str             r8,  [sp, #4] // spill -(7 + intermediate_bits) on stack
3338*c0909341SAndroid Build Coastguard Worker.endif
3339*c0909341SAndroid Build Coastguard Worker
        // Unpack the four signed 16-bit values at abcd from two 32-bit words
        // (low halfword = even index on a little-endian target).
3340*c0909341SAndroid Build Coastguard Worker        ldrd            r8,  r9,  [r4]
3341*c0909341SAndroid Build Coastguard Worker        sxth            r7,  r8       // low half of word 0: horizontal step per pixel
3342*c0909341SAndroid Build Coastguard Worker        asr             r8,  r8,  #16 // high half of word 0: horizontal step per row
3343*c0909341SAndroid Build Coastguard Worker        asr             r4,  r9,  #16 // high half of word 1: vertical step per row
3344*c0909341SAndroid Build Coastguard Worker        sxth            r9,  r9       // low half of word 1: vertical step per column
3345*c0909341SAndroid Build Coastguard Worker        mov             r10, #8       // output row counter
        // Move src up 3 rows and left 6 bytes (3 pixels of 16 bits).
3346*c0909341SAndroid Build Coastguard Worker        sub             r2,  r2,  r3, lsl #1
3347*c0909341SAndroid Build Coastguard Worker        sub             r2,  r2,  r3
3348*c0909341SAndroid Build Coastguard Worker        sub             r2,  r2,  #6
        // movrel is defined elsewhere; presumably r11 = &mc_warp_filter + 64*8,
        // i.e. the table base biased by 64 entries of 8 bytes - TODO confirm.
3349*c0909341SAndroid Build Coastguard Worker        movrel          r11, X(mc_warp_filter), 64*8
3350*c0909341SAndroid Build Coastguard Worker.ifnb \t
3351*c0909341SAndroid Build Coastguard Worker        lsl             r1,  r1,  #1
3352*c0909341SAndroid Build Coastguard Worker.endif
        // Add half of 1 << 10 so the ">> 10" in load_filter_ptr rounds to nearest.
3353*c0909341SAndroid Build Coastguard Worker        add             r5,  r5,  #512
3354*c0909341SAndroid Build Coastguard Worker        add             r6,  r6,  #512
3355*c0909341SAndroid Build Coastguard Worker
        // Prime the window: horizontally filter the first 7 source rows and
        // narrow each to 16 bit in q8..q14.
3356*c0909341SAndroid Build Coastguard Worker        bl              warp_filter_horz_neon
3357*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d16, q4
3358*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d17, q5
3359*c0909341SAndroid Build Coastguard Worker        bl              warp_filter_horz_neon
3360*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d18, q4
3361*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d19, q5
3362*c0909341SAndroid Build Coastguard Worker        bl              warp_filter_horz_neon
3363*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d20, q4
3364*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d21, q5
3365*c0909341SAndroid Build Coastguard Worker        bl              warp_filter_horz_neon
3366*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d22, q4
3367*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d23, q5
3368*c0909341SAndroid Build Coastguard Worker        bl              warp_filter_horz_neon
3369*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d24, q4
3370*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d25, q5
3371*c0909341SAndroid Build Coastguard Worker        bl              warp_filter_horz_neon
3372*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d26, q4
3373*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d27, q5
3374*c0909341SAndroid Build Coastguard Worker        bl              warp_filter_horz_neon
3375*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d28, q4
3376*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d29, q5
3377*c0909341SAndroid Build Coastguard Worker
        // Per output row: filter one more source row into q15, then apply
        // the vertical 8-tap filter across the eight rows in q8..q15.
3378*c0909341SAndroid Build Coastguard Worker1:
3379*c0909341SAndroid Build Coastguard Worker        bl              warp_filter_horz_neon
3380*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d30, q4
3381*c0909341SAndroid Build Coastguard Worker        vmovn.i32       d31, q5
3382*c0909341SAndroid Build Coastguard Worker
        // Fetch one 8-tap vertical filter per output column (r6 steps by r9).
3383*c0909341SAndroid Build Coastguard Worker        load_filter_row d8,  r6,  r9
3384*c0909341SAndroid Build Coastguard Worker        load_filter_row d9,  r6,  r9
3385*c0909341SAndroid Build Coastguard Worker        load_filter_row d10, r6,  r9
3386*c0909341SAndroid Build Coastguard Worker        load_filter_row d11, r6,  r9
3387*c0909341SAndroid Build Coastguard Worker        load_filter_row d12, r6,  r9
3388*c0909341SAndroid Build Coastguard Worker        load_filter_row d13, r6,  r9
3389*c0909341SAndroid Build Coastguard Worker        load_filter_row d14, r6,  r9
3390*c0909341SAndroid Build Coastguard Worker        load_filter_row d15, r6,  r9
        // transpose_8x8b is defined elsewhere; presumably an 8x8 byte
        // transpose, leaving tap i of all eight columns in d(8+i).
3391*c0909341SAndroid Build Coastguard Worker        transpose_8x8b  q4,  q5,  q6,  q7,  d8,  d9,  d10, d11, d12, d13, d14, d15
        // Widen taps 0-5 to 16 bit. Each vmovl reads its source D register
        // before overwriting the Q register that aliases it, so the in-place
        // expansion is safe in this order. Taps 6-7 are widened later, once
        // q2/q3 are free again.
3392*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q1,  d8
3393*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q2,  d9
3394*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q3,  d10
3395*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q4,  d11
3396*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q5,  d12
3397*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q6,  d13
3398*c0909341SAndroid Build Coastguard Worker
3399*c0909341SAndroid Build Coastguard Worker        sub             r6,  r6,  r9, lsl #3 // undo the eight per-column steps
3400*c0909341SAndroid Build Coastguard Worker
3401*c0909341SAndroid Build Coastguard Worker        // This ordering of vmull/vmlal is highly beneficial for
3402*c0909341SAndroid Build Coastguard Worker        // Cortex A8/A9/A53 here, but harmful for Cortex A7.
        // q0 accumulates columns 0-3, q1 accumulates columns 4-7.
3403*c0909341SAndroid Build Coastguard Worker        vmull.s16       q0,  d16, d2
3404*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q0,  d18, d4
3405*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q0,  d20, d6
3406*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q0,  d22, d8
3407*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q0,  d24, d10
3408*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q0,  d26, d12
3409*c0909341SAndroid Build Coastguard Worker        vmull.s16       q1,  d17, d3
3410*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q1,  d19, d5
3411*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q1,  d21, d7
3412*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q1,  d23, d9
3413*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q1,  d25, d11
3414*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q1,  d27, d13
3415*c0909341SAndroid Build Coastguard Worker
3416*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q2,  d14      // tap 6
3417*c0909341SAndroid Build Coastguard Worker        vmovl.s8        q3,  d15      // tap 7
3418*c0909341SAndroid Build Coastguard Worker
3419*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q0,  d28, d4
3420*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q0,  d30, d6
3421*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q1,  d29, d5
3422*c0909341SAndroid Build Coastguard Worker        vmlal.s16       q1,  d31, d7
3423*c0909341SAndroid Build Coastguard Worker
        // lr is free as scratch here: the return address was pushed on entry
        // and the function returns through "pop {..., pc}". The 8 bytes of
        // scratch shift bitdepth_max from sp + 112 to sp + 120.
3424*c0909341SAndroid Build Coastguard Worker.ifb \t
3425*c0909341SAndroid Build Coastguard Worker        ldr             lr,  [sp, #4]   // -(7 + intermediate_bits)
3426*c0909341SAndroid Build Coastguard Worker        ldr             r12, [sp, #120] // bitdepth_max
3427*c0909341SAndroid Build Coastguard Worker        vdup.32         q2,  lr         // -(7 + intermediate_bits)
3428*c0909341SAndroid Build Coastguard Worker        vdup.16         q3,  r12        // bitdepth_max
3429*c0909341SAndroid Build Coastguard Worker.endif
3430*c0909341SAndroid Build Coastguard Worker
        // The vmov chain (q8 <- q9 ... q14 <- q15) slides the row window down
        // by one; it is interleaved with the output post-processing.
3431*c0909341SAndroid Build Coastguard Worker        vmov            q8,  q9
3432*c0909341SAndroid Build Coastguard Worker        vmov            q9,  q10
3433*c0909341SAndroid Build Coastguard Worker.ifb \t
3434*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q0,  q0,  q2    // -(7 + intermediate_bits)
3435*c0909341SAndroid Build Coastguard Worker        vrshl.s32       q1,  q1,  q2    // -(7 + intermediate_bits)
3436*c0909341SAndroid Build Coastguard Worker.else
3437*c0909341SAndroid Build Coastguard Worker        vrshrn.s32      d0,  q0,  #7
3438*c0909341SAndroid Build Coastguard Worker        vrshrn.s32      d1,  q1,  #7
3439*c0909341SAndroid Build Coastguard Worker        vmov.i16        q3,  #PREP_BIAS
3440*c0909341SAndroid Build Coastguard Worker.endif
3441*c0909341SAndroid Build Coastguard Worker        vmov            q10, q11
3442*c0909341SAndroid Build Coastguard Worker.ifb \t
3443*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d0,  q0         // saturate to unsigned 16 bit (negatives -> 0)
3444*c0909341SAndroid Build Coastguard Worker        vqmovun.s32     d1,  q1
3445*c0909341SAndroid Build Coastguard Worker.else
3446*c0909341SAndroid Build Coastguard Worker        vsub.i16        q0,  q0,  q3    // PREP_BIAS
3447*c0909341SAndroid Build Coastguard Worker.endif
3448*c0909341SAndroid Build Coastguard Worker        vmov            q11, q12
3449*c0909341SAndroid Build Coastguard Worker        vmov            q12, q13
3450*c0909341SAndroid Build Coastguard Worker.ifb \t
3451*c0909341SAndroid Build Coastguard Worker        vmin.u16        q0,  q0,  q3    // bitdepth_max
3452*c0909341SAndroid Build Coastguard Worker.endif
3453*c0909341SAndroid Build Coastguard Worker        vmov            q13, q14
3454*c0909341SAndroid Build Coastguard Worker        vmov            q14, q15
3455*c0909341SAndroid Build Coastguard Worker        subs            r10, r10, #1
3456*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0}, [r0, :128], r1 // store 8 outputs (128-bit aligned), advance dst by r1
3457*c0909341SAndroid Build Coastguard Worker
3458*c0909341SAndroid Build Coastguard Worker        add             r6,  r6,  r4    // vertical position for the next row; flags from subs are kept
3459*c0909341SAndroid Build Coastguard Worker        bgt             1b
3460*c0909341SAndroid Build Coastguard Worker
3461*c0909341SAndroid Build Coastguard Worker        add             sp,  sp,  #8
3462*c0909341SAndroid Build Coastguard Worker        vpop            {q4-q7}
3463*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
3464*c0909341SAndroid Build Coastguard Workerendfunc
3465*c0909341SAndroid Build Coastguard Worker.endm
3466*c0909341SAndroid Build Coastguard Worker
// Expand the warp macro twice: with a blank argument (function name
// warp_affine_8x8_16bpc_neon) and with "t" (warp_affine_8x8t_16bpc_neon).
3467*c0909341SAndroid Build Coastguard Workerwarp
3468*c0909341SAndroid Build Coastguard Workerwarp t
3469*c0909341SAndroid Build Coastguard Worker
3470*c0909341SAndroid Build Coastguard Worker// void dav1d_emu_edge_16bpc_neon(
3471*c0909341SAndroid Build Coastguard Worker//         const intptr_t bw, const intptr_t bh,
3472*c0909341SAndroid Build Coastguard Worker//         const intptr_t iw, const intptr_t ih,
3473*c0909341SAndroid Build Coastguard Worker//         const intptr_t x, const intptr_t y,
3474*c0909341SAndroid Build Coastguard Worker//         pixel *dst, const ptrdiff_t dst_stride,
3475*c0909341SAndroid Build Coastguard Worker//         const pixel *ref, const ptrdiff_t ref_stride)
3476*c0909341SAndroid Build Coastguard Workerfunction emu_edge_16bpc_neon, export=1
3477*c0909341SAndroid Build Coastguard Worker        push            {r4-r11,lr}
3478*c0909341SAndroid Build Coastguard Worker        ldrd            r4,  r5,  [sp, #36]
3479*c0909341SAndroid Build Coastguard Worker        ldrd            r6,  r7,  [sp, #44]
3480*c0909341SAndroid Build Coastguard Worker        ldrd            r8,  r9,  [sp, #52]
3481*c0909341SAndroid Build Coastguard Worker
3482*c0909341SAndroid Build Coastguard Worker        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
3483*c0909341SAndroid Build Coastguard Worker        // ref += iclip(x, 0, iw - 1)
3484*c0909341SAndroid Build Coastguard Worker        sub             r12, r3,  #1           // ih - 1
3485*c0909341SAndroid Build Coastguard Worker        cmp             r5,  r3
3486*c0909341SAndroid Build Coastguard Worker        sub             lr,  r2,  #1           // iw - 1
3487*c0909341SAndroid Build Coastguard Worker        it              lt
3488*c0909341SAndroid Build Coastguard Worker        movlt           r12, r5                // min(y, ih - 1)
3489*c0909341SAndroid Build Coastguard Worker        cmp             r4,  r2
3490*c0909341SAndroid Build Coastguard Worker        bic             r12, r12, r12, asr #31 // max(min(y, ih - 1), 0)
3491*c0909341SAndroid Build Coastguard Worker        it              lt
3492*c0909341SAndroid Build Coastguard Worker        movlt           lr,  r4                // min(x, iw - 1)
3493*c0909341SAndroid Build Coastguard Worker        bic             lr,  lr,  lr,  asr #31 // max(min(x, iw - 1), 0)
3494*c0909341SAndroid Build Coastguard Worker        mla             r8,  r12, r9,  r8      // ref += iclip() * stride
3495*c0909341SAndroid Build Coastguard Worker        add             r8,  r8,  lr,  lsl #1  // ref += iclip()
3496*c0909341SAndroid Build Coastguard Worker
3497*c0909341SAndroid Build Coastguard Worker        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
3498*c0909341SAndroid Build Coastguard Worker        // top_ext = iclip(-y, 0, bh - 1)
3499*c0909341SAndroid Build Coastguard Worker        add             r10, r5,  r1           // y + bh
3500*c0909341SAndroid Build Coastguard Worker        neg             r5,  r5                // -y
3501*c0909341SAndroid Build Coastguard Worker        sub             r10, r10, r3           // y + bh - ih
3502*c0909341SAndroid Build Coastguard Worker        sub             r12, r1,  #1           // bh - 1
3503*c0909341SAndroid Build Coastguard Worker        cmp             r10, r1
3504*c0909341SAndroid Build Coastguard Worker        bic             r5,  r5,  r5,  asr #31 // max(-y, 0)
3505*c0909341SAndroid Build Coastguard Worker        it              ge
3506*c0909341SAndroid Build Coastguard Worker        movge           r10, r12               // min(y + bh - ih, bh-1)
3507*c0909341SAndroid Build Coastguard Worker        cmp             r5,  r1
3508*c0909341SAndroid Build Coastguard Worker        bic             r10, r10, r10, asr #31 // max(min(y + bh - ih, bh-1), 0)
3509*c0909341SAndroid Build Coastguard Worker        it              ge
3510*c0909341SAndroid Build Coastguard Worker        movge           r5,  r12               // min(max(-y, 0), bh-1)
3511*c0909341SAndroid Build Coastguard Worker
3512*c0909341SAndroid Build Coastguard Worker        // right_ext = iclip(x + bw - iw, 0, bw - 1)
3513*c0909341SAndroid Build Coastguard Worker        // left_ext = iclip(-x, 0, bw - 1)
3514*c0909341SAndroid Build Coastguard Worker        add             r11, r4,  r0           // x + bw
3515*c0909341SAndroid Build Coastguard Worker        neg             r4,  r4                // -x
3516*c0909341SAndroid Build Coastguard Worker        sub             r11, r11, r2           // x + bw - iw
3517*c0909341SAndroid Build Coastguard Worker        sub             lr,  r0,  #1           // bw - 1
3518*c0909341SAndroid Build Coastguard Worker        cmp             r11, r0
3519*c0909341SAndroid Build Coastguard Worker        bic             r4,  r4,  r4,  asr #31 // max(-x, 0)
3520*c0909341SAndroid Build Coastguard Worker        it              ge
3521*c0909341SAndroid Build Coastguard Worker        movge           r11, lr                // min(x + bw - iw, bw-1)
3522*c0909341SAndroid Build Coastguard Worker        cmp             r4,  r0
3523*c0909341SAndroid Build Coastguard Worker        bic             r11, r11, r11, asr #31 // max(min(x + bw - iw, bw-1), 0)
3524*c0909341SAndroid Build Coastguard Worker        it              ge
3525*c0909341SAndroid Build Coastguard Worker        movge           r4,  lr                // min(max(-x, 0), bw - 1)
3526*c0909341SAndroid Build Coastguard Worker
3527*c0909341SAndroid Build Coastguard Worker        // center_h = bh - top_ext - bottom_ext
3528*c0909341SAndroid Build Coastguard Worker        // dst += top_ext * PXSTRIDE(dst_stride)
3529*c0909341SAndroid Build Coastguard Worker        // center_w = bw - left_ext - right_ext
3530*c0909341SAndroid Build Coastguard Worker        sub             r1,  r1,  r5           // bh - top_ext
3531*c0909341SAndroid Build Coastguard Worker        mla             r6,  r5,  r7,  r6
3532*c0909341SAndroid Build Coastguard Worker        sub             r2,  r0,  r4           // bw - left_ext
3533*c0909341SAndroid Build Coastguard Worker        sub             r1,  r1,  r10          // center_h = bh - top_ext - bottom_ext
3534*c0909341SAndroid Build Coastguard Worker        sub             r2,  r2,  r11          // center_w = bw - left_ext - right_ext
3535*c0909341SAndroid Build Coastguard Worker
3536*c0909341SAndroid Build Coastguard Worker        mov             r0,  r6                // backup of dst
3537*c0909341SAndroid Build Coastguard Worker
// v_loop need_left, need_right
//
// Emits the per-row loop that builds the "center" rows of the output block:
// for each row it optionally fills the left extension with the first source
// pixel, copies the center pixels, and optionally fills the right extension
// with the last center pixel. need_left / need_right are assembly-time
// flags (0 or 1) selecting which of the optional parts are emitted.
//
// Register roles, as established by the existing comments and the code:
//   r1  = center_h, remaining row count (decremented once per row)
//   r2  = center_w, r4 = left_ext, r11 = right_ext; all counted in 16-bit
//         pixels (they are scaled with "lsl #1" when used as byte offsets)
//   r6  = dst, start of the current output row; advanced by r7 per row
//   r8  = in, start of the current source row; advanced by r9 per row
//         (r9 is presumably the source stride - confirm against the setup
//         code preceding this macro)
// Clobbers r3, r12, lr, q0-q3 and the condition flags. On exit r6 and r8
// point one row past the last row processed.
//
// Every inner loop tests its counter only after the first store, and works
// in fixed chunks of 16 or 32 pixels, so each part always runs at least once
// and may read/write past its nominal width, up to the next chunk multiple.
// Left-fill overshoot is overwritten by the center copy that follows it.
// NOTE(review): the remaining overshoot (source reads, center/right stores)
// relies on the caller's buffers being padded accordingly - confirm.
//
// The numeric labels 0 and 1 are reused; "1b" always refers to the nearest
// preceding "1:".
3538*c0909341SAndroid Build Coastguard Worker.macro v_loop need_left, need_right
3539*c0909341SAndroid Build Coastguard Worker0:
        // Start of one output row.
3540*c0909341SAndroid Build Coastguard Worker.if \need_left
        // Left extension: load the first source pixel of the row and replicate
        // it into every 16-bit lane of q0 (d0/d1), then duplicate into q1.
3541*c0909341SAndroid Build Coastguard Worker        vld1.16         {d0[], d1[]}, [r8]
3542*c0909341SAndroid Build Coastguard Worker        mov             r12, r6                // out = dst
3543*c0909341SAndroid Build Coastguard Worker        mov             r3,  r4
3544*c0909341SAndroid Build Coastguard Worker        vmov            q1,  q0
3545*c0909341SAndroid Build Coastguard Worker1:
        // Store 16 pixels (32 bytes) per iteration, using a 128-bit alignment
        // hint on the row start. The vst1 does not touch the flags, so bgt
        // still sees the result of the subs.
3546*c0909341SAndroid Build Coastguard Worker        subs            r3,  r3,  #16
3547*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0, q1}, [r12, :128]!
3548*c0909341SAndroid Build Coastguard Worker        bgt             1b
3549*c0909341SAndroid Build Coastguard Worker.endif
        // Center: copy center_w pixels from the source row to dst + left_ext,
        // 32 pixels (64 bytes) per iteration.
3550*c0909341SAndroid Build Coastguard Worker        mov             lr,  r8
3551*c0909341SAndroid Build Coastguard Worker        add             r12, r6,  r4,  lsl #1  // out = dst + left_ext
3552*c0909341SAndroid Build Coastguard Worker        mov             r3,  r2
3553*c0909341SAndroid Build Coastguard Worker1:
3554*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0, q1}, [lr]!
3555*c0909341SAndroid Build Coastguard Worker        subs            r3,  r3,  #32
3556*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2, q3}, [lr]!
3557*c0909341SAndroid Build Coastguard Worker.if \need_left
        // The destination is offset by left_ext pixels here, so no alignment
        // hint is given on these stores.
3558*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0, q1}, [r12]!
3559*c0909341SAndroid Build Coastguard Worker        vst1.16         {q2, q3}, [r12]!
3560*c0909341SAndroid Build Coastguard Worker.else
        // The dispatch code following this macro only selects the
        // need_left=0 variants when r4 (left_ext) is zero, so out == dst
        // and the 128-bit alignment hint can be used.
3561*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0, q1}, [r12, :128]!
3562*c0909341SAndroid Build Coastguard Worker        vst1.16         {q2, q3}, [r12, :128]!
3563*c0909341SAndroid Build Coastguard Worker.endif
3564*c0909341SAndroid Build Coastguard Worker        bgt             1b
3565*c0909341SAndroid Build Coastguard Worker.if \need_right
        // Right extension: replicate the last center pixel of the source row
        // (in[center_w - 1]) and store it starting at
        // dst + left_ext + center_w, 16 pixels per iteration (unaligned).
3566*c0909341SAndroid Build Coastguard Worker        add             r3,  r8,  r2,  lsl #1  // in + center_w
3567*c0909341SAndroid Build Coastguard Worker        sub             r3,  r3,  #2           // in + center_w - 1
3568*c0909341SAndroid Build Coastguard Worker        add             r12, r6,  r4,  lsl #1  // dst + left_ext
3569*c0909341SAndroid Build Coastguard Worker        vld1.16         {d0[], d1[]}, [r3]
3570*c0909341SAndroid Build Coastguard Worker        add             r12, r12, r2,  lsl #1  // out = dst + left_ext + center_w
3571*c0909341SAndroid Build Coastguard Worker        mov             r3,  r11
3572*c0909341SAndroid Build Coastguard Worker        vmov            q1,  q0
3573*c0909341SAndroid Build Coastguard Worker1:
3574*c0909341SAndroid Build Coastguard Worker        subs            r3,  r3,  #16
3575*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0, q1}, [r12]!
3576*c0909341SAndroid Build Coastguard Worker        bgt             1b
3577*c0909341SAndroid Build Coastguard Worker.endif
3578*c0909341SAndroid Build Coastguard Worker
        // Advance to the next row. The two adds do not set flags, so the
        // final bgt branches on the result of the center_h decrement.
3579*c0909341SAndroid Build Coastguard Worker        subs            r1,  r1,  #1           // center_h--
3580*c0909341SAndroid Build Coastguard Worker        add             r6,  r6,  r7
3581*c0909341SAndroid Build Coastguard Worker        add             r8,  r8,  r9
3582*c0909341SAndroid Build Coastguard Worker        bgt             0b
3583*c0909341SAndroid Build Coastguard Worker.endm
3584*c0909341SAndroid Build Coastguard Worker
3584*c0909341SAndroid Build Coastguard Worker
3585*c0909341SAndroid Build Coastguard Worker        cmp             r4,  #0
3586*c0909341SAndroid Build Coastguard Worker        beq             2f
3587*c0909341SAndroid Build Coastguard Worker        // need_left
3588*c0909341SAndroid Build Coastguard Worker        cmp             r11, #0
3589*c0909341SAndroid Build Coastguard Worker        beq             3f
3590*c0909341SAndroid Build Coastguard Worker        // need_left + need_right
3591*c0909341SAndroid Build Coastguard Worker        v_loop          1,   1
3592*c0909341SAndroid Build Coastguard Worker        b               5f
3593*c0909341SAndroid Build Coastguard Worker
3594*c0909341SAndroid Build Coastguard Worker2:
3595*c0909341SAndroid Build Coastguard Worker        // !need_left
3596*c0909341SAndroid Build Coastguard Worker        cmp             r11, #0
3597*c0909341SAndroid Build Coastguard Worker        beq             4f
3598*c0909341SAndroid Build Coastguard Worker        // !need_left + need_right
3599*c0909341SAndroid Build Coastguard Worker        v_loop          0,   1
3600*c0909341SAndroid Build Coastguard Worker        b               5f
3601*c0909341SAndroid Build Coastguard Worker
3602*c0909341SAndroid Build Coastguard Worker3:
3603*c0909341SAndroid Build Coastguard Worker        // need_left + !need_right
3604*c0909341SAndroid Build Coastguard Worker        v_loop          1,   0
3605*c0909341SAndroid Build Coastguard Worker        b               5f
3606*c0909341SAndroid Build Coastguard Worker
3607*c0909341SAndroid Build Coastguard Worker4:
3608*c0909341SAndroid Build Coastguard Worker        // !need_left + !need_right
3609*c0909341SAndroid Build Coastguard Worker        v_loop          0,   0
3610*c0909341SAndroid Build Coastguard Worker
3611*c0909341SAndroid Build Coastguard Worker5:
3612*c0909341SAndroid Build Coastguard Worker        cmp             r10, #0
3613*c0909341SAndroid Build Coastguard Worker        // Storing the original dst in r0 overwrote bw, recalculate it here
3614*c0909341SAndroid Build Coastguard Worker        add             r2,  r2,  r4           // center_w + left_ext
3615*c0909341SAndroid Build Coastguard Worker        add             r2,  r2,  r11          // bw = center_w + left_ext + right_ext
3616*c0909341SAndroid Build Coastguard Worker
3617*c0909341SAndroid Build Coastguard Worker        beq             3f
3618*c0909341SAndroid Build Coastguard Worker        // need_bottom
3619*c0909341SAndroid Build Coastguard Worker        sub             r8,  r6,  r7           // ref = dst - stride
3620*c0909341SAndroid Build Coastguard Worker        mov             r4,  r2
3621*c0909341SAndroid Build Coastguard Worker        sub             r12, r7,  #32
3622*c0909341SAndroid Build Coastguard Worker1:
3623*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0, q1}, [r8, :128]!
3624*c0909341SAndroid Build Coastguard Worker        mov             r3,  r10
3625*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2, q3}, [r8, :128]!
3626*c0909341SAndroid Build Coastguard Worker2:
3627*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0, q1}, [r6, :128]!
3628*c0909341SAndroid Build Coastguard Worker        subs            r3,  r3,  #1
3629*c0909341SAndroid Build Coastguard Worker        vst1.16         {q2, q3}, [r6, :128], r12
3630*c0909341SAndroid Build Coastguard Worker        bgt             2b
3631*c0909341SAndroid Build Coastguard Worker        mls             r6,  r7,  r10, r6      // dst -= bottom_ext * stride
3632*c0909341SAndroid Build Coastguard Worker        subs            r4,  r4,  #32          // bw -= 32
3633*c0909341SAndroid Build Coastguard Worker        add             r6,  r6,  #64          // dst += 32
3634*c0909341SAndroid Build Coastguard Worker        bgt             1b
3635*c0909341SAndroid Build Coastguard Worker
3636*c0909341SAndroid Build Coastguard Worker3:
3637*c0909341SAndroid Build Coastguard Worker        cmp             r5,  #0
3638*c0909341SAndroid Build Coastguard Worker        beq             3f
3639*c0909341SAndroid Build Coastguard Worker        // need_top
3640*c0909341SAndroid Build Coastguard Worker        mls             r6,  r7,  r5,  r0      // dst = stored_dst - top_ext * stride
3641*c0909341SAndroid Build Coastguard Worker        sub             r12, r7,  #32
3642*c0909341SAndroid Build Coastguard Worker1:
3643*c0909341SAndroid Build Coastguard Worker        vld1.16         {q0, q1}, [r0, :128]!
3644*c0909341SAndroid Build Coastguard Worker        mov             r3,  r5
3645*c0909341SAndroid Build Coastguard Worker        vld1.16         {q2, q3}, [r0, :128]!
3646*c0909341SAndroid Build Coastguard Worker2:
3647*c0909341SAndroid Build Coastguard Worker        vst1.16         {q0, q1}, [r6, :128]!
3648*c0909341SAndroid Build Coastguard Worker        subs            r3,  r3,  #1
3649*c0909341SAndroid Build Coastguard Worker        vst1.16         {q2, q3}, [r6, :128], r12
3650*c0909341SAndroid Build Coastguard Worker        bgt             2b
3651*c0909341SAndroid Build Coastguard Worker        mls             r6,  r7,  r5,  r6      // dst -= top_ext * stride
3652*c0909341SAndroid Build Coastguard Worker        subs            r2,  r2,  #32          // bw -= 32
3653*c0909341SAndroid Build Coastguard Worker        add             r6,  r6,  #64          // dst += 32
3654*c0909341SAndroid Build Coastguard Worker        bgt             1b
3655*c0909341SAndroid Build Coastguard Worker
3656*c0909341SAndroid Build Coastguard Worker3:
3657*c0909341SAndroid Build Coastguard Worker        pop             {r4-r11,pc}
3658*c0909341SAndroid Build Coastguard Workerendfunc
3659