xref: /aosp_15_r20/external/libdav1d/src/arm/64/ipred.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker/*
2*c0909341SAndroid Build Coastguard Worker * Copyright © 2018, VideoLAN and dav1d authors
3*c0909341SAndroid Build Coastguard Worker * Copyright © 2019, Martin Storsjo
4*c0909341SAndroid Build Coastguard Worker * All rights reserved.
5*c0909341SAndroid Build Coastguard Worker *
6*c0909341SAndroid Build Coastguard Worker * Redistribution and use in source and binary forms, with or without
7*c0909341SAndroid Build Coastguard Worker * modification, are permitted provided that the following conditions are met:
8*c0909341SAndroid Build Coastguard Worker *
9*c0909341SAndroid Build Coastguard Worker * 1. Redistributions of source code must retain the above copyright notice, this
10*c0909341SAndroid Build Coastguard Worker *    list of conditions and the following disclaimer.
11*c0909341SAndroid Build Coastguard Worker *
12*c0909341SAndroid Build Coastguard Worker * 2. Redistributions in binary form must reproduce the above copyright notice,
13*c0909341SAndroid Build Coastguard Worker *    this list of conditions and the following disclaimer in the documentation
14*c0909341SAndroid Build Coastguard Worker *    and/or other materials provided with the distribution.
15*c0909341SAndroid Build Coastguard Worker *
16*c0909341SAndroid Build Coastguard Worker * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17*c0909341SAndroid Build Coastguard Worker * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18*c0909341SAndroid Build Coastguard Worker * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19*c0909341SAndroid Build Coastguard Worker * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20*c0909341SAndroid Build Coastguard Worker * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21*c0909341SAndroid Build Coastguard Worker * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22*c0909341SAndroid Build Coastguard Worker * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23*c0909341SAndroid Build Coastguard Worker * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24*c0909341SAndroid Build Coastguard Worker * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25*c0909341SAndroid Build Coastguard Worker * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*c0909341SAndroid Build Coastguard Worker */
27*c0909341SAndroid Build Coastguard Worker
28*c0909341SAndroid Build Coastguard Worker#include "src/arm/asm.S"
29*c0909341SAndroid Build Coastguard Worker#include "util.S"
30*c0909341SAndroid Build Coastguard Worker
31*c0909341SAndroid Build Coastguard Worker// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
32*c0909341SAndroid Build Coastguard Worker//                             const pixel *const topleft,
33*c0909341SAndroid Build Coastguard Worker//                             const int width, const int height, const int a,
34*c0909341SAndroid Build Coastguard Worker//                             const int max_width, const int max_height);
//
// Fills a width x height block of bytes at dst with the constant value 128.
// Register use, following the argument order above:
//   x0 = dst, x1 = stride, w3 = width, w4 = height.
//   topleft (x2), a, max_width and max_height are not read.
// The code branches through ipred_dc_128_tbl to a width-specific store loop.
// Each loop pass writes four rows: x0 covers rows 0, 2, ... and x6 covers
// rows 1, 3, ..., both stepping by 2*stride.
// NOTE(review): the table has five entries, so width is presumably one of
// 64, 32, 16, 8, 4; and since every pass writes four rows, height is
// presumably a multiple of 4 - confirm against the callers.
35*c0909341SAndroid Build Coastguard Workerfunction ipred_dc_128_8bpc_neon, export=1
36*c0909341SAndroid Build Coastguard Worker        clz             w3,  w3  // w3 = leading zero bits of width
37*c0909341SAndroid Build Coastguard Worker        movrel          x5,  ipred_dc_128_tbl  // x5 = table base (movrel is a macro defined outside this chunk)
38*c0909341SAndroid Build Coastguard Worker        sub             w3,  w3,  #25  // table index: clz(64) = 25 -> 0, ..., clz(4) = 29 -> 4
39*c0909341SAndroid Build Coastguard Worker        ldrsw           x3,  [x5, w3, uxtw #2]  // sign-extended 32-bit entry at base + index*4
40*c0909341SAndroid Build Coastguard Worker        movi            v0.16b,  #128  // every byte of v0 = 128
41*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x3  // entries are (label - table), so this is the handler address
42*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1  // x6 = dst + stride, pointer to the second row
43*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1  // x1 = 2*stride, the step used by both row pointers
44*c0909341SAndroid Build Coastguard Worker        br              x5
45*c0909341SAndroid Build Coastguard Worker40:  // width 4 entry
46*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET  // macro defined elsewhere; presumably marks a legal target for the br above
47*c0909341SAndroid Build Coastguard Worker4:
48*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0],  [x0], x1  // 4 bytes to an even row, then x0 += 2*stride
49*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0],  [x6], x1  // 4 bytes to an odd row, then x6 += 2*stride
50*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4  // four rows are written per pass
51*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0],  [x0], x1
52*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0],  [x6], x1
53*c0909341SAndroid Build Coastguard Worker        b.gt            4b  // repeat while rows remain
54*c0909341SAndroid Build Coastguard Worker        ret
55*c0909341SAndroid Build Coastguard Worker80:  // width 8 entry
56*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
57*c0909341SAndroid Build Coastguard Worker8:
58*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b},  [x0], x1  // 8 bytes per row
59*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b},  [x6], x1
60*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
61*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b},  [x0], x1
62*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b},  [x6], x1
63*c0909341SAndroid Build Coastguard Worker        b.gt            8b
64*c0909341SAndroid Build Coastguard Worker        ret
65*c0909341SAndroid Build Coastguard Worker160:  // width 16 entry
66*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
67*c0909341SAndroid Build Coastguard Worker16:
68*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x0], x1  // 16 bytes per row
69*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x6], x1
70*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
71*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x0], x1
72*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x6], x1
73*c0909341SAndroid Build Coastguard Worker        b.gt            16b
74*c0909341SAndroid Build Coastguard Worker        ret
75*c0909341SAndroid Build Coastguard Worker320:  // width 32 entry
76*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
77*c0909341SAndroid Build Coastguard Worker        movi            v1.16b,  #128  // second register so one st1 covers 32 bytes
78*c0909341SAndroid Build Coastguard Worker32:
79*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b}, [x0], x1  // 32 bytes per row
80*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b}, [x6], x1
81*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
82*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b}, [x0], x1
83*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b}, [x6], x1
84*c0909341SAndroid Build Coastguard Worker        b.gt            32b
85*c0909341SAndroid Build Coastguard Worker        ret
86*c0909341SAndroid Build Coastguard Worker640:  // width 64 entry
87*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
88*c0909341SAndroid Build Coastguard Worker        movi            v1.16b,  #128  // v1-v3 join v0 so one st1 covers 64 bytes
89*c0909341SAndroid Build Coastguard Worker        movi            v2.16b,  #128
90*c0909341SAndroid Build Coastguard Worker        movi            v3.16b,  #128
91*c0909341SAndroid Build Coastguard Worker64:
92*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1  // 64 bytes per row
93*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
94*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
95*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
96*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
97*c0909341SAndroid Build Coastguard Worker        b.gt            64b
98*c0909341SAndroid Build Coastguard Worker        ret
99*c0909341SAndroid Build Coastguard Workerendfunc
100*c0909341SAndroid Build Coastguard Worker
// Width dispatch table read by ipred_dc_128_8bpc_neon.
// Each .word is the 32-bit distance from the table start to one of the
// function's numbered entry labels; the function adds it back to the table
// address before branching. The "b" suffix resolves to the nearest preceding
// definition of the label, i.e. the one inside ipred_dc_128_8bpc_neon.
// Entries are indexed by clz(width) - 25, so they run from width 64 to width 4.
// jumptable/endjumptable are macros defined outside this chunk.
101*c0909341SAndroid Build Coastguard Workerjumptable ipred_dc_128_tbl
102*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_dc_128_tbl  // index 0: width 64
103*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_dc_128_tbl  // index 1: width 32
104*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_dc_128_tbl  // index 2: width 16
105*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_dc_128_tbl  // index 3: width 8
106*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_dc_128_tbl  // index 4: width 4
107*c0909341SAndroid Build Coastguard Workerendjumptable
108*c0909341SAndroid Build Coastguard Worker
109*c0909341SAndroid Build Coastguard Worker// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
110*c0909341SAndroid Build Coastguard Worker//                        const pixel *const topleft,
111*c0909341SAndroid Build Coastguard Worker//                        const int width, const int height, const int a,
112*c0909341SAndroid Build Coastguard Worker//                        const int max_width, const int max_height);
//
// Copies the `width` bytes that start at topleft + 1 into every row of the
// width x height block at dst.
// Register use, following the argument order above:
//   x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height.
//   a, max_width and max_height are not read.
// The code branches through ipred_v_tbl to a width-specific entry that loads
// the source bytes once and then stores them four rows per loop pass
// (x0 = even rows, x6 = odd rows, both stepping by 2*stride).
// NOTE(review): width is presumably one of 64, 32, 16, 8, 4 (five table
// entries) and height presumably a multiple of 4 - confirm against callers.
113*c0909341SAndroid Build Coastguard Workerfunction ipred_v_8bpc_neon, export=1
114*c0909341SAndroid Build Coastguard Worker        clz             w3,  w3  // w3 = leading zero bits of width
115*c0909341SAndroid Build Coastguard Worker        movrel          x5,  ipred_v_tbl  // x5 = table base (movrel is a macro defined outside this chunk)
116*c0909341SAndroid Build Coastguard Worker        sub             w3,  w3,  #25  // table index: clz(64) = 25 -> 0, ..., clz(4) = 29 -> 4
117*c0909341SAndroid Build Coastguard Worker        ldrsw           x3,  [x5, w3, uxtw #2]  // sign-extended 32-bit entry at base + index*4
118*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #1  // x2 = topleft + 1, first source byte
119*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x3  // handler address = table base + entry
120*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1  // x6 = dst + stride, pointer to the second row
121*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1  // x1 = 2*stride, the step used by both row pointers
122*c0909341SAndroid Build Coastguard Worker        br              x5
123*c0909341SAndroid Build Coastguard Worker40:  // width 4 entry
124*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET  // macro defined elsewhere; presumably marks a legal target for the br above
125*c0909341SAndroid Build Coastguard Worker        ld1             {v0.s}[0],  [x2]  // load the 4 source bytes once
126*c0909341SAndroid Build Coastguard Worker4:
127*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0],  [x0], x1  // 4 bytes to an even row, then x0 += 2*stride
128*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0],  [x6], x1  // 4 bytes to an odd row, then x6 += 2*stride
129*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4  // four rows are written per pass
130*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0],  [x0], x1
131*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0],  [x6], x1
132*c0909341SAndroid Build Coastguard Worker        b.gt            4b  // repeat while rows remain
133*c0909341SAndroid Build Coastguard Worker        ret
134*c0909341SAndroid Build Coastguard Worker80:  // width 8 entry
135*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
136*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b},  [x2]  // load the 8 source bytes once
137*c0909341SAndroid Build Coastguard Worker8:
138*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b},  [x0], x1
139*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b},  [x6], x1
140*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
141*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b},  [x0], x1
142*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b},  [x6], x1
143*c0909341SAndroid Build Coastguard Worker        b.gt            8b
144*c0909341SAndroid Build Coastguard Worker        ret
145*c0909341SAndroid Build Coastguard Worker160:  // width 16 entry
146*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
147*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b}, [x2]  // load the 16 source bytes once
148*c0909341SAndroid Build Coastguard Worker16:
149*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x0], x1
150*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x6], x1
151*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
152*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x0], x1
153*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x6], x1
154*c0909341SAndroid Build Coastguard Worker        b.gt            16b
155*c0909341SAndroid Build Coastguard Worker        ret
156*c0909341SAndroid Build Coastguard Worker320:  // width 32 entry
157*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
158*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b}, [x2]  // load the 32 source bytes once
159*c0909341SAndroid Build Coastguard Worker32:
160*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b}, [x0], x1
161*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b}, [x6], x1
162*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
163*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b}, [x0], x1
164*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b}, [x6], x1
165*c0909341SAndroid Build Coastguard Worker        b.gt            32b
166*c0909341SAndroid Build Coastguard Worker        ret
167*c0909341SAndroid Build Coastguard Worker640:  // width 64 entry
168*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
169*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]  // load the 64 source bytes once
170*c0909341SAndroid Build Coastguard Worker64:
171*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
172*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
173*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
174*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
175*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
176*c0909341SAndroid Build Coastguard Worker        b.gt            64b
177*c0909341SAndroid Build Coastguard Worker        ret
178*c0909341SAndroid Build Coastguard Workerendfunc
179*c0909341SAndroid Build Coastguard Worker
// Width dispatch table read by ipred_v_8bpc_neon.
// Each .word is the 32-bit distance from the table start to one of the
// function's numbered entry labels; the "b" suffix resolves to the nearest
// preceding definition, i.e. the one inside ipred_v_8bpc_neon.
// Entries are indexed by clz(width) - 25, so they run from width 64 to width 4.
// jumptable/endjumptable are macros defined outside this chunk.
180*c0909341SAndroid Build Coastguard Workerjumptable ipred_v_tbl
181*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_v_tbl  // index 0: width 64
182*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_v_tbl  // index 1: width 32
183*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_v_tbl  // index 2: width 16
184*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_v_tbl  // index 3: width 8
185*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_v_tbl  // index 4: width 4
186*c0909341SAndroid Build Coastguard Workerendjumptable
187*c0909341SAndroid Build Coastguard Worker
188*c0909341SAndroid Build Coastguard Worker// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
189*c0909341SAndroid Build Coastguard Worker//                        const pixel *const topleft,
190*c0909341SAndroid Build Coastguard Worker//                        const int width, const int height, const int a,
191*c0909341SAndroid Build Coastguard Worker//                        const int max_width, const int max_height);
//
// Fills each row y of the width x height block at dst with the single byte
// topleft[-1 - y], repeated across the row.
// Register use, following the argument order above:
//   x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height.
//   a, max_width and max_height are not read (x5 and x7 are reused as scratch).
// Each loop pass reads four source bytes with ld4r, which broadcasts byte k
// of the group into register vk, then writes four rows. Because x2 walks
// downwards in memory, the highest-addressed byte (v3) belongs to the first
// row of the pass and v0 to the last.
// NOTE(review): width is presumably one of 64, 32, 16, 8, 4 (five table
// entries) and height presumably a multiple of 4 - confirm against callers.
192*c0909341SAndroid Build Coastguard Workerfunction ipred_h_8bpc_neon, export=1
193*c0909341SAndroid Build Coastguard Worker        clz             w3,  w3  // w3 = leading zero bits of width
194*c0909341SAndroid Build Coastguard Worker        movrel          x5,  ipred_h_tbl  // x5 = table base (movrel is a macro defined outside this chunk)
195*c0909341SAndroid Build Coastguard Worker        sub             w3,  w3,  #25  // table index: clz(64) = 25 -> 0, ..., clz(4) = 29 -> 4
196*c0909341SAndroid Build Coastguard Worker        ldrsw           x3,  [x5, w3, uxtw #2]  // sign-extended 32-bit entry at base + index*4
197*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #4  // x2 = topleft - 4: the first group is topleft[-4..-1]
198*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x3  // handler address = table base + entry
199*c0909341SAndroid Build Coastguard Worker        mov             x7,  #-4  // post-index step: move x2 back by one 4-byte group
200*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1  // x6 = dst + stride, pointer to the second row
201*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1  // x1 = 2*stride, the step used by both row pointers
202*c0909341SAndroid Build Coastguard Worker        br              x5
203*c0909341SAndroid Build Coastguard Worker40:  // width 4 entry
204*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET  // macro defined elsewhere; presumably marks a legal target for the br above
205*c0909341SAndroid Build Coastguard Worker4:
206*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7  // vk = byte k of the group, broadcast; then x2 -= 4
207*c0909341SAndroid Build Coastguard Worker        st1             {v3.s}[0],  [x0], x1  // first row of the pass: highest-addressed source byte
208*c0909341SAndroid Build Coastguard Worker        st1             {v2.s}[0],  [x6], x1  // second row
209*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4  // four rows are written per pass
210*c0909341SAndroid Build Coastguard Worker        st1             {v1.s}[0],  [x0], x1  // third row
211*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0],  [x6], x1  // fourth row: lowest-addressed source byte
212*c0909341SAndroid Build Coastguard Worker        b.gt            4b  // repeat while rows remain
213*c0909341SAndroid Build Coastguard Worker        ret
214*c0909341SAndroid Build Coastguard Worker80:  // width 8 entry
215*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
216*c0909341SAndroid Build Coastguard Worker8:
217*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
218*c0909341SAndroid Build Coastguard Worker        st1             {v3.8b},  [x0], x1
219*c0909341SAndroid Build Coastguard Worker        st1             {v2.8b},  [x6], x1
220*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
221*c0909341SAndroid Build Coastguard Worker        st1             {v1.8b},  [x0], x1
222*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b},  [x6], x1
223*c0909341SAndroid Build Coastguard Worker        b.gt            8b
224*c0909341SAndroid Build Coastguard Worker        ret
225*c0909341SAndroid Build Coastguard Worker160:  // width 16 entry
226*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
227*c0909341SAndroid Build Coastguard Worker16:
228*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
229*c0909341SAndroid Build Coastguard Worker        st1             {v3.16b}, [x0], x1
230*c0909341SAndroid Build Coastguard Worker        st1             {v2.16b}, [x6], x1
231*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
232*c0909341SAndroid Build Coastguard Worker        st1             {v1.16b}, [x0], x1
233*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x6], x1
234*c0909341SAndroid Build Coastguard Worker        b.gt            16b
235*c0909341SAndroid Build Coastguard Worker        ret
236*c0909341SAndroid Build Coastguard Worker320:  // width 32 entry
237*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
238*c0909341SAndroid Build Coastguard Worker32:
239*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
240*c0909341SAndroid Build Coastguard Worker        str             q3,  [x0, #16]  // bytes 16..31 of the row, written before x0 advances
241*c0909341SAndroid Build Coastguard Worker        str             q2,  [x6, #16]
242*c0909341SAndroid Build Coastguard Worker        st1             {v3.16b}, [x0], x1  // bytes 0..15, then x0 += 2*stride
243*c0909341SAndroid Build Coastguard Worker        st1             {v2.16b}, [x6], x1
244*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
245*c0909341SAndroid Build Coastguard Worker        str             q1,  [x0, #16]
246*c0909341SAndroid Build Coastguard Worker        str             q0,  [x6, #16]
247*c0909341SAndroid Build Coastguard Worker        st1             {v1.16b}, [x0], x1
248*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x6], x1
249*c0909341SAndroid Build Coastguard Worker        b.gt            32b
250*c0909341SAndroid Build Coastguard Worker        ret
251*c0909341SAndroid Build Coastguard Worker640:  // width 64 entry
252*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
253*c0909341SAndroid Build Coastguard Worker64:
254*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
255*c0909341SAndroid Build Coastguard Worker        str             q3,  [x0, #16]  // bytes 16..31 of the row
256*c0909341SAndroid Build Coastguard Worker        str             q2,  [x6, #16]
257*c0909341SAndroid Build Coastguard Worker        stp             q3,  q3,  [x0, #32]  // bytes 32..63 of the row
258*c0909341SAndroid Build Coastguard Worker        stp             q2,  q2,  [x6, #32]
259*c0909341SAndroid Build Coastguard Worker        st1             {v3.16b}, [x0], x1  // bytes 0..15, then x0 += 2*stride
260*c0909341SAndroid Build Coastguard Worker        st1             {v2.16b}, [x6], x1
261*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
262*c0909341SAndroid Build Coastguard Worker        str             q1,  [x0, #16]
263*c0909341SAndroid Build Coastguard Worker        str             q0,  [x6, #16]
264*c0909341SAndroid Build Coastguard Worker        stp             q1,  q1,  [x0, #32]
265*c0909341SAndroid Build Coastguard Worker        stp             q0,  q0,  [x6, #32]
266*c0909341SAndroid Build Coastguard Worker        st1             {v1.16b}, [x0], x1
267*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x6], x1
268*c0909341SAndroid Build Coastguard Worker        b.gt            64b
269*c0909341SAndroid Build Coastguard Worker        ret
270*c0909341SAndroid Build Coastguard Workerendfunc
271*c0909341SAndroid Build Coastguard Worker
// Width dispatch table read by ipred_h_8bpc_neon.
// Each .word is the 32-bit distance from the table start to one of the
// function's numbered entry labels; the "b" suffix resolves to the nearest
// preceding definition, i.e. the one inside ipred_h_8bpc_neon.
// Entries are indexed by clz(width) - 25, so they run from width 64 to width 4.
// jumptable/endjumptable are macros defined outside this chunk.
272*c0909341SAndroid Build Coastguard Workerjumptable ipred_h_tbl
273*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_h_tbl  // index 0: width 64
274*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_h_tbl  // index 1: width 32
275*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_h_tbl  // index 2: width 16
276*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_h_tbl  // index 3: width 8
277*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_h_tbl  // index 4: width 4
278*c0909341SAndroid Build Coastguard Workerendjumptable
279*c0909341SAndroid Build Coastguard Worker
280*c0909341SAndroid Build Coastguard Worker// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
281*c0909341SAndroid Build Coastguard Worker//                             const pixel *const topleft,
282*c0909341SAndroid Build Coastguard Worker//                             const int width, const int height, const int a,
283*c0909341SAndroid Build Coastguard Worker//                             const int max_width, const int max_height);
//
// Fills the width x height block at dst with one byte: the rounded mean of
// the `width` bytes that start at topleft + 1.
// Register use, following the argument order above:
//   x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height.
//   a, max_width and max_height are not read.
// Each width-specific entry sums the source bytes with uaddlv, divides with a
// rounding narrowing shift (rshrn by log2 of the byte count summed),
// broadcasts the result, and then stores four rows per loop pass
// (x0 = even rows, x6 = odd rows, both stepping by 2*stride).
// NOTE(review): width is presumably one of 64, 32, 16, 8, 4 (five table
// entries) and height presumably a multiple of 4 - confirm against callers.
284*c0909341SAndroid Build Coastguard Workerfunction ipred_dc_top_8bpc_neon, export=1
285*c0909341SAndroid Build Coastguard Worker        clz             w3,  w3  // w3 = leading zero bits of width
286*c0909341SAndroid Build Coastguard Worker        movrel          x5,  ipred_dc_top_tbl  // x5 = table base (movrel is a macro defined outside this chunk)
287*c0909341SAndroid Build Coastguard Worker        sub             w3,  w3,  #25  // table index: clz(64) = 25 -> 0, ..., clz(4) = 29 -> 4
288*c0909341SAndroid Build Coastguard Worker        ldrsw           x3,  [x5, w3, uxtw #2]  // sign-extended 32-bit entry at base + index*4
289*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #1  // x2 = topleft + 1, first source byte
290*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x3  // handler address = table base + entry
291*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1  // x6 = dst + stride, pointer to the second row
292*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1  // x1 = 2*stride, the step used by both row pointers
293*c0909341SAndroid Build Coastguard Worker        br              x5
294*c0909341SAndroid Build Coastguard Worker40:  // width 4 entry
295*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET  // macro defined elsewhere; presumably marks a legal target for the br above
296*c0909341SAndroid Build Coastguard Worker        ld1r            {v0.2s},  [x2]  // load 4 bytes and copy them into both 32-bit lanes
297*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.8b  // h0 = sum of 8 bytes = 2 * (sum of the 4 source bytes)
298*c0909341SAndroid Build Coastguard Worker        rshrn           v0.8b,   v0.8h,   #3  // (2*sum + 4) >> 3, the rounded mean of the 4 bytes
299*c0909341SAndroid Build Coastguard Worker        dup             v0.8b,   v0.b[0]  // broadcast the mean to every byte
300*c0909341SAndroid Build Coastguard Worker4:
301*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0],  [x0], x1  // 4 bytes to an even row, then x0 += 2*stride
302*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0],  [x6], x1  // 4 bytes to an odd row, then x6 += 2*stride
303*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4  // four rows are written per pass
304*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0],  [x0], x1
305*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0],  [x6], x1
306*c0909341SAndroid Build Coastguard Worker        b.gt            4b  // repeat while rows remain
307*c0909341SAndroid Build Coastguard Worker        ret
308*c0909341SAndroid Build Coastguard Worker80:  // width 8 entry
309*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
310*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b},  [x2]
311*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.8b  // h0 = sum of the 8 source bytes
312*c0909341SAndroid Build Coastguard Worker        rshrn           v0.8b,   v0.8h,   #3  // (sum + 4) >> 3
313*c0909341SAndroid Build Coastguard Worker        dup             v0.8b,   v0.b[0]
314*c0909341SAndroid Build Coastguard Worker8:
315*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b},  [x0], x1
316*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b},  [x6], x1
317*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
318*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b},  [x0], x1
319*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b},  [x6], x1
320*c0909341SAndroid Build Coastguard Worker        b.gt            8b
321*c0909341SAndroid Build Coastguard Worker        ret
322*c0909341SAndroid Build Coastguard Worker160:  // width 16 entry
323*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
324*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b}, [x2]
325*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.16b  // h0 = sum of the 16 source bytes
326*c0909341SAndroid Build Coastguard Worker        rshrn           v0.8b,   v0.8h,   #4  // (sum + 8) >> 4
327*c0909341SAndroid Build Coastguard Worker        dup             v0.16b,  v0.b[0]
328*c0909341SAndroid Build Coastguard Worker16:
329*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x0], x1
330*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x6], x1
331*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
332*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x0], x1
333*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x6], x1
334*c0909341SAndroid Build Coastguard Worker        b.gt            16b
335*c0909341SAndroid Build Coastguard Worker        ret
336*c0909341SAndroid Build Coastguard Worker320:  // width 32 entry
337*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
338*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b}, [x2]
339*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.16b  // per-register partial sums
340*c0909341SAndroid Build Coastguard Worker        uaddlv          h1,      v1.16b
341*c0909341SAndroid Build Coastguard Worker        add             v2.4h,   v0.4h,   v1.4h  // lane 0 = sum of all 32 source bytes
342*c0909341SAndroid Build Coastguard Worker        rshrn           v2.8b,   v2.8h,   #5  // (sum + 16) >> 5
343*c0909341SAndroid Build Coastguard Worker        dup             v0.16b,  v2.b[0]  // broadcast into both registers stored per row
344*c0909341SAndroid Build Coastguard Worker        dup             v1.16b,  v2.b[0]
345*c0909341SAndroid Build Coastguard Worker32:
346*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b}, [x0], x1
347*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b}, [x6], x1
348*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
349*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b}, [x0], x1
350*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b}, [x6], x1
351*c0909341SAndroid Build Coastguard Worker        b.gt            32b
352*c0909341SAndroid Build Coastguard Worker        ret
353*c0909341SAndroid Build Coastguard Worker640:  // width 64 entry
354*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
355*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
356*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.16b  // per-register partial sums
357*c0909341SAndroid Build Coastguard Worker        uaddlv          h1,      v1.16b
358*c0909341SAndroid Build Coastguard Worker        uaddlv          h2,      v2.16b
359*c0909341SAndroid Build Coastguard Worker        uaddlv          h3,      v3.16b
360*c0909341SAndroid Build Coastguard Worker        add             v4.4h,   v0.4h,   v1.4h  // pairwise reduction of the four partial sums
361*c0909341SAndroid Build Coastguard Worker        add             v5.4h,   v2.4h,   v3.4h
362*c0909341SAndroid Build Coastguard Worker        add             v4.4h,   v4.4h,   v5.4h  // lane 0 = sum of all 64 source bytes
363*c0909341SAndroid Build Coastguard Worker        rshrn           v4.8b,   v4.8h,   #6  // (sum + 32) >> 6
364*c0909341SAndroid Build Coastguard Worker        dup             v0.16b,  v4.b[0]  // broadcast into all four registers stored per row
365*c0909341SAndroid Build Coastguard Worker        dup             v1.16b,  v4.b[0]
366*c0909341SAndroid Build Coastguard Worker        dup             v2.16b,  v4.b[0]
367*c0909341SAndroid Build Coastguard Worker        dup             v3.16b,  v4.b[0]
368*c0909341SAndroid Build Coastguard Worker64:
369*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
370*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
371*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
372*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
373*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
374*c0909341SAndroid Build Coastguard Worker        b.gt            64b
375*c0909341SAndroid Build Coastguard Worker        ret
376*c0909341SAndroid Build Coastguard Workerendfunc
377*c0909341SAndroid Build Coastguard Worker
// Width dispatch table read by ipred_dc_top_8bpc_neon.
// Each .word is the 32-bit distance from the table start to one of the
// function's numbered entry labels; the "b" suffix resolves to the nearest
// preceding definition, i.e. the one inside ipred_dc_top_8bpc_neon.
// Entries are indexed by clz(width) - 25, so they run from width 64 to width 4.
// jumptable/endjumptable are macros defined outside this chunk.
378*c0909341SAndroid Build Coastguard Workerjumptable ipred_dc_top_tbl
379*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_dc_top_tbl  // index 0: width 64
380*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_dc_top_tbl  // index 1: width 32
381*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_dc_top_tbl  // index 2: width 16
382*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_dc_top_tbl  // index 3: width 8
383*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_dc_top_tbl  // index 4: width 4
384*c0909341SAndroid Build Coastguard Workerendjumptable
385*c0909341SAndroid Build Coastguard Worker
386*c0909341SAndroid Build Coastguard Worker// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
387*c0909341SAndroid Build Coastguard Worker//                              const pixel *const topleft,
388*c0909341SAndroid Build Coastguard Worker//                              const int width, const int height, const int a,
389*c0909341SAndroid Build Coastguard Worker//                              const int max_width, const int max_height);
//
// Fills a width x height block with one value: the rounded average of the
// `height` bytes stored immediately below `topleft` in memory
// (topleft - height .. topleft - 1).
//
// The function is a two-stage computed dispatch through ipred_dc_left_tbl:
//   1. L(ipred_dc_left_hN), chosen by height, sums the N edge bytes, rounds
//      and broadcasts the result into v0, then branches to x3.
//   2. L(ipred_dc_left_wN), chosen by width, stores four rows per iteration.
//
// Register roles after the prologue:
//   x0 = dst (rows 0, 2, ...)    x6 = dst + stride (rows 1, 3, ...)
//   x1 = 2 * stride              x2 = topleft - height
//   w4 = height, reused as the remaining-row counter
//   x5 = address of the height stage, x3 = address of the width stage
// The arguments a, max_width and max_height are never read.
390*c0909341SAndroid Build Coastguard Workerfunction ipred_dc_left_8bpc_neon, export=1
391*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  w4, uxtw       // x2 = topleft - height (first edge byte to read)
392*c0909341SAndroid Build Coastguard Worker        clz             w3,  w3                  // clz(width):  64 -> 25 ... 4 -> 29
393*c0909341SAndroid Build Coastguard Worker        clz             w7,  w4                  // clz(height): 64 -> 25 ... 4 -> 29
394*c0909341SAndroid Build Coastguard Worker        movrel          x5,  ipred_dc_left_tbl
395*c0909341SAndroid Build Coastguard Worker        sub             w3,  w3,  #20 // 25 leading bits, minus table offset 5
396*c0909341SAndroid Build Coastguard Worker        sub             w7,  w7,  #25            // index 0..4 into the h entries
397*c0909341SAndroid Build Coastguard Worker        ldrsw           x3,  [x5, w3, uxtw #2]   // signed 32-bit offset of the width stage
398*c0909341SAndroid Build Coastguard Worker        ldrsw           x7,  [x5, w7, uxtw #2]   // signed 32-bit offset of the height stage
399*c0909341SAndroid Build Coastguard Worker        add             x3,  x5,  x3             // x3 = &L(ipred_dc_left_wN)
400*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x7             // x5 = &L(ipred_dc_left_hN)
401*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1             // x6 = dst + stride (second row)
402*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1             // x0 and x6 each step two rows per store
403*c0909341SAndroid Build Coastguard Worker        br              x5                       // dispatch on height
404*c0909341SAndroid Build Coastguard Worker
405*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_left_h4):
406*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
407*c0909341SAndroid Build Coastguard Worker        ld1r            {v0.2s},  [x2]           // 4 edge bytes, replicated into both 32-bit lanes
408*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.8b           // h0 = 2 * sum (each byte is counted twice)
409*c0909341SAndroid Build Coastguard Worker        rshrn           v0.8b,   v0.8h,   #3     // (2*sum + 4) >> 3 == (sum + 2) >> 2
410*c0909341SAndroid Build Coastguard Worker        dup             v0.16b,  v0.b[0]         // broadcast the DC byte
411*c0909341SAndroid Build Coastguard Worker        br              x3                       // continue in the width-specific store loop
412*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_left_w4):
413*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
414*c0909341SAndroid Build Coastguard Worker1:
415*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0],  [x0], x1     // row n     (4 bytes)
416*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0],  [x6], x1     // row n + 1
417*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4             // four rows written per iteration
418*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0],  [x0], x1     // row n + 2
419*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0],  [x6], x1     // row n + 3
420*c0909341SAndroid Build Coastguard Worker        b.gt            1b
421*c0909341SAndroid Build Coastguard Worker        ret
422*c0909341SAndroid Build Coastguard Worker
423*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_left_h8):
424*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
425*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b},  [x2]           // 8 edge bytes
426*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.8b           // h0 = sum
427*c0909341SAndroid Build Coastguard Worker        rshrn           v0.8b,   v0.8h,   #3     // (sum + 4) >> 3
428*c0909341SAndroid Build Coastguard Worker        dup             v0.16b,  v0.b[0]         // broadcast the DC byte
429*c0909341SAndroid Build Coastguard Worker        br              x3
430*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_left_w8):
431*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
432*c0909341SAndroid Build Coastguard Worker1:
433*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b},  [x0], x1       // 8 bytes per row, four rows per iteration
434*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b},  [x6], x1
435*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
436*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b},  [x0], x1
437*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b},  [x6], x1
438*c0909341SAndroid Build Coastguard Worker        b.gt            1b
439*c0909341SAndroid Build Coastguard Worker        ret
440*c0909341SAndroid Build Coastguard Worker
441*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_left_h16):
442*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
443*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b}, [x2]           // 16 edge bytes
444*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.16b          // h0 = sum
445*c0909341SAndroid Build Coastguard Worker        rshrn           v0.8b,   v0.8h,   #4     // (sum + 8) >> 4
446*c0909341SAndroid Build Coastguard Worker        dup             v0.16b,  v0.b[0]         // broadcast the DC byte
447*c0909341SAndroid Build Coastguard Worker        br              x3
448*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_left_w16):
449*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
450*c0909341SAndroid Build Coastguard Worker1:
451*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x0], x1       // 16 bytes per row, four rows per iteration
452*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x6], x1
453*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
454*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x0], x1
455*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x6], x1
456*c0909341SAndroid Build Coastguard Worker        b.gt            1b
457*c0909341SAndroid Build Coastguard Worker        ret
458*c0909341SAndroid Build Coastguard Worker
459*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_left_h32):
460*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
461*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b}, [x2]   // 32 edge bytes
462*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.16b          // partial sum of bytes 0..15
463*c0909341SAndroid Build Coastguard Worker        uaddlv          h1,      v1.16b          // partial sum of bytes 16..31
464*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v1.4h  // lane 0 = total sum
465*c0909341SAndroid Build Coastguard Worker        rshrn           v0.8b,   v0.8h,   #5     // (sum + 16) >> 5
466*c0909341SAndroid Build Coastguard Worker        dup             v0.16b,  v0.b[0]         // broadcast the DC byte
467*c0909341SAndroid Build Coastguard Worker        br              x3
468*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_left_w32):
469*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
470*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v0.16b          // second register for the 32-byte stores
471*c0909341SAndroid Build Coastguard Worker1:
472*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b}, [x0], x1 // 32 bytes per row, four rows per iteration
473*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b}, [x6], x1
474*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
475*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b}, [x0], x1
476*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b}, [x6], x1
477*c0909341SAndroid Build Coastguard Worker        b.gt            1b
478*c0909341SAndroid Build Coastguard Worker        ret
479*c0909341SAndroid Build Coastguard Worker
480*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_left_h64):
481*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
482*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // 64 edge bytes
483*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.16b          // four 16-byte partial sums
484*c0909341SAndroid Build Coastguard Worker        uaddlv          h1,      v1.16b
485*c0909341SAndroid Build Coastguard Worker        uaddlv          h2,      v2.16b
486*c0909341SAndroid Build Coastguard Worker        uaddlv          h3,      v3.16b
487*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v1.4h  // pairwise reduction of the partial sums
488*c0909341SAndroid Build Coastguard Worker        add             v2.4h,   v2.4h,   v3.4h
489*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v2.4h  // lane 0 = total sum
490*c0909341SAndroid Build Coastguard Worker        rshrn           v0.8b,   v0.8h,   #6     // (sum + 32) >> 6
491*c0909341SAndroid Build Coastguard Worker        dup             v0.16b,  v0.b[0]         // broadcast the DC byte
492*c0909341SAndroid Build Coastguard Worker        br              x3
493*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_left_w64):
494*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
495*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v0.16b          // three more copies for the 64-byte stores
496*c0909341SAndroid Build Coastguard Worker        mov             v2.16b,  v0.16b
497*c0909341SAndroid Build Coastguard Worker        mov             v3.16b,  v0.16b
498*c0909341SAndroid Build Coastguard Worker1:
499*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 // 64 bytes per row, four rows per iteration
500*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
501*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
502*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
503*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
504*c0909341SAndroid Build Coastguard Worker        b.gt            1b
505*c0909341SAndroid Build Coastguard Worker        ret
506*c0909341SAndroid Build Coastguard Workerendfunc
507*c0909341SAndroid Build Coastguard Worker
// Dispatch table for ipred_dc_left_8bpc_neon. Every entry is a 32-bit offset
// from the start of the table to a label inside that function.
//   entries 0-4: height stages, indexed by clz(height) - 25 (64 -> 0 ... 4 -> 4)
//   entries 5-9: width stages,  indexed by clz(width)  - 20 (64 -> 5 ... 4 -> 9)
// The order must stay largest-to-smallest in each half to match that indexing.
508*c0909341SAndroid Build Coastguard Workerjumptable ipred_dc_left_tbl
509*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_left_h64) - ipred_dc_left_tbl
510*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_left_h32) - ipred_dc_left_tbl
511*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_left_h16) - ipred_dc_left_tbl
512*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_left_h8)  - ipred_dc_left_tbl
513*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_left_h4)  - ipred_dc_left_tbl
514*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_left_w64) - ipred_dc_left_tbl
515*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_left_w32) - ipred_dc_left_tbl
516*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_left_w16) - ipred_dc_left_tbl
517*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_left_w8)  - ipred_dc_left_tbl
518*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_left_w4)  - ipred_dc_left_tbl
519*c0909341SAndroid Build Coastguard Workerendjumptable
520*c0909341SAndroid Build Coastguard Worker
521*c0909341SAndroid Build Coastguard Worker// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride,
522*c0909341SAndroid Build Coastguard Worker//                         const pixel *const topleft,
523*c0909341SAndroid Build Coastguard Worker//                         const int width, const int height, const int a,
524*c0909341SAndroid Build Coastguard Worker//                         const int max_width, const int max_height);
//
// Fills a width x height block with the rounded average of `height` bytes
// read from topleft - height .. topleft - 1 plus `width` bytes read from
// topleft + 1 .. topleft + width.
//
// The function is a two-stage computed dispatch through ipred_dc_tbl:
//   1. L(ipred_dc_hN), chosen by height, sums the bytes below topleft into
//      v0, leaves x2 = topleft + 1, and branches to x3.
//   2. L(ipred_dc_wN), chosen by width, adds the bytes above plus the
//      rounding bias, divides by width + height, broadcasts the result and
//      stores four rows per iteration.
//
// Division by width + height is done in two steps. A right shift by
// ctz(width + height) removes the power-of-two factor. For non-square blocks
// the remaining factor is 3 or 5 and is removed by sqdmulh with a
// reciprocal: 0x5556 ~= 65536/3, 0x3334 ~= 65536/5. The constants are stored
// halved because sqdmulh doubles the product before taking the high half.
//
// Register roles after the prologue:
//   x0 = dst (rows 0, 2, ...)    x6 = dst + stride (rows 1, 3, ...)
//   x1 = 2 * stride              x2 = topleft - height
//   w4 = height, later reused as the remaining-row counter
//   v16 = (width + height) >> 1  v17 = -ctz(width + height)
//   x5 = address of the height stage, x3 = address of the width stage
// The arguments a, max_width and max_height are never read.
525*c0909341SAndroid Build Coastguard Workerfunction ipred_dc_8bpc_neon, export=1
526*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  w4, uxtw       // x2 = topleft - height (first byte to read)
527*c0909341SAndroid Build Coastguard Worker        add             w7,  w3,  w4             // width + height
528*c0909341SAndroid Build Coastguard Worker        clz             w3,  w3                  // clz(width):  64 -> 25 ... 4 -> 29
529*c0909341SAndroid Build Coastguard Worker        clz             w6,  w4                  // clz(height): 64 -> 25 ... 4 -> 29
530*c0909341SAndroid Build Coastguard Worker        dup             v16.8h, w7               // width + height
531*c0909341SAndroid Build Coastguard Worker        movrel          x5,  ipred_dc_tbl
532*c0909341SAndroid Build Coastguard Worker        rbit            w7,  w7                  // rbit(width + height)
533*c0909341SAndroid Build Coastguard Worker        sub             w3,  w3,  #20            // 25 leading bits, minus table offset 5
534*c0909341SAndroid Build Coastguard Worker        sub             w6,  w6,  #25            // index 0..4 into the h entries
535*c0909341SAndroid Build Coastguard Worker        clz             w7,  w7                  // ctz(width + height)
536*c0909341SAndroid Build Coastguard Worker        ldrsw           x3,  [x5, w3, uxtw #2]   // signed 32-bit offset of the width stage
537*c0909341SAndroid Build Coastguard Worker        ldrsw           x6,  [x5, w6, uxtw #2]   // signed 32-bit offset of the height stage
538*c0909341SAndroid Build Coastguard Worker        neg             w7,  w7                  // -ctz(width + height)
539*c0909341SAndroid Build Coastguard Worker        add             x3,  x5,  x3             // x3 = &L(ipred_dc_wN)
540*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x6             // x5 = &L(ipred_dc_hN)
541*c0909341SAndroid Build Coastguard Worker        ushr            v16.8h,  v16.8h,  #1     // (width + height) >> 1
542*c0909341SAndroid Build Coastguard Worker        dup             v17.8h,  w7              // -ctz(width + height)
543*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1             // x6 = dst + stride (second row)
544*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1             // x0 and x6 each step two rows per store
545*c0909341SAndroid Build Coastguard Worker        br              x5                       // dispatch on height
546*c0909341SAndroid Build Coastguard Worker
547*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_h4):
548*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
549*c0909341SAndroid Build Coastguard Worker        ld1             {v0.s}[0],  [x2], #4     // 4 bytes below topleft; x2 advances to topleft
550*c0909341SAndroid Build Coastguard Worker        ins             v0.s[1], wzr             // zero bytes 4..7 so they do not enter the sum
551*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.8b           // h0 = sum of the 4 bytes
552*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #1             // skip the topleft byte: x2 = topleft + 1
553*c0909341SAndroid Build Coastguard Worker        br              x3                       // continue in the width-specific stage
554*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_w4):
555*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
556*c0909341SAndroid Build Coastguard Worker        ld1             {v1.s}[0],  [x2]         // 4 bytes starting at topleft + 1
557*c0909341SAndroid Build Coastguard Worker        ins             v1.s[1], wzr             // zero bytes 4..7 so they do not enter the sum
558*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v16.4h // add rounding bias (width + height) >> 1
559*c0909341SAndroid Build Coastguard Worker        uaddlv          h1,      v1.8b           // h1 = sum of the 4 bytes
560*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #4                  // square block?
561*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v1.4h  // lane 0 = both sums + bias
562*c0909341SAndroid Build Coastguard Worker        ushl            v0.4h,   v0.4h,   v17.4h // negative count: >> ctz(width + height)
563*c0909341SAndroid Build Coastguard Worker        b.eq            1f                       // square: width + height is a power of two, done
564*c0909341SAndroid Build Coastguard Worker        // h = 8/16
565*c0909341SAndroid Build Coastguard Worker        mov             w16, #(0x3334/2)         // low half:  ~1/5, for h = 16 (4 + 16 = 4 * 5)
566*c0909341SAndroid Build Coastguard Worker        movk            w16, #(0x5556/2), lsl #16 // high half: ~1/3, for h = 8 (4 + 8 = 4 * 3)
567*c0909341SAndroid Build Coastguard Worker        add             w17, w4,  w4  // w17 = 2*h = 16 or 32
568*c0909341SAndroid Build Coastguard Worker        lsr             w16, w16, w17            // count is taken mod 32: 16 selects the high half, 32 keeps the low half
569*c0909341SAndroid Build Coastguard Worker        dup             v16.4h,  w16             // low 16 bits of w16 = chosen reciprocal
570*c0909341SAndroid Build Coastguard Worker        sqdmulh         v0.4h,   v0.4h,   v16.4h // (2 * x * recip) >> 16 = x / 3 or x / 5
571*c0909341SAndroid Build Coastguard Worker1:
572*c0909341SAndroid Build Coastguard Worker        dup             v0.8b,   v0.b[0]         // broadcast the DC byte
573*c0909341SAndroid Build Coastguard Worker2:
574*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0],  [x0], x1     // 4 bytes per row, four rows per iteration
575*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0],  [x6], x1
576*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
577*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0],  [x0], x1
578*c0909341SAndroid Build Coastguard Worker        st1             {v0.s}[0],  [x6], x1
579*c0909341SAndroid Build Coastguard Worker        b.gt            2b
580*c0909341SAndroid Build Coastguard Worker        ret
581*c0909341SAndroid Build Coastguard Worker
582*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_h8):
583*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
584*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b},  [x2], #8       // 8 bytes below topleft; x2 advances to topleft
585*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.8b           // h0 = sum
586*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #1             // skip the topleft byte: x2 = topleft + 1
587*c0909341SAndroid Build Coastguard Worker        br              x3
588*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_w8):
589*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
590*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8b},  [x2]           // 8 bytes starting at topleft + 1
591*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v16.4h // add rounding bias (width + height) >> 1
592*c0909341SAndroid Build Coastguard Worker        uaddlv          h1,      v1.8b           // h1 = sum
593*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #8                  // square block?
594*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v1.4h  // lane 0 = both sums + bias
595*c0909341SAndroid Build Coastguard Worker        ushl            v0.4h,   v0.4h,   v17.4h // negative count: >> ctz(width + height)
596*c0909341SAndroid Build Coastguard Worker        b.eq            1f                       // square: shift alone is the full division
597*c0909341SAndroid Build Coastguard Worker        // h = 4/16/32
598*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #32                 // only h = 32 leaves a factor of 5 (8 + 32 = 8 * 5)
599*c0909341SAndroid Build Coastguard Worker        mov             w16, #(0x3334/2)         // ~1/5
600*c0909341SAndroid Build Coastguard Worker        mov             w17, #(0x5556/2)         // ~1/3
601*c0909341SAndroid Build Coastguard Worker        csel            w16, w16, w17, eq        // h == 32 ? 1/5 : 1/3
602*c0909341SAndroid Build Coastguard Worker        dup             v16.4h,  w16
603*c0909341SAndroid Build Coastguard Worker        sqdmulh         v0.4h,   v0.4h,   v16.4h // (2 * x * recip) >> 16
604*c0909341SAndroid Build Coastguard Worker1:
605*c0909341SAndroid Build Coastguard Worker        dup             v0.8b,   v0.b[0]         // broadcast the DC byte
606*c0909341SAndroid Build Coastguard Worker2:
607*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b},  [x0], x1       // 8 bytes per row, four rows per iteration
608*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b},  [x6], x1
609*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
610*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b},  [x0], x1
611*c0909341SAndroid Build Coastguard Worker        st1             {v0.8b},  [x6], x1
612*c0909341SAndroid Build Coastguard Worker        b.gt            2b
613*c0909341SAndroid Build Coastguard Worker        ret
614*c0909341SAndroid Build Coastguard Worker
615*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_h16):
616*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
617*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b}, [x2], #16      // 16 bytes below topleft; x2 advances to topleft
618*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.16b          // h0 = sum
619*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #1             // skip the topleft byte: x2 = topleft + 1
620*c0909341SAndroid Build Coastguard Worker        br              x3
621*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_w16):
622*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
623*c0909341SAndroid Build Coastguard Worker        ld1             {v1.16b}, [x2]           // 16 bytes starting at topleft + 1
624*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v16.4h // add rounding bias (width + height) >> 1
625*c0909341SAndroid Build Coastguard Worker        uaddlv          h1,      v1.16b          // h1 = sum
626*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #16                 // square block?
627*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v1.4h  // lane 0 = both sums + bias
628*c0909341SAndroid Build Coastguard Worker        ushl            v0.4h,   v0.4h,   v17.4h // negative count: >> ctz(width + height)
629*c0909341SAndroid Build Coastguard Worker        b.eq            1f                       // square: shift alone is the full division
630*c0909341SAndroid Build Coastguard Worker        // h = 4/8/32/64
631*c0909341SAndroid Build Coastguard Worker        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
632*c0909341SAndroid Build Coastguard Worker        mov             w16, #(0x3334/2)         // ~1/5
633*c0909341SAndroid Build Coastguard Worker        mov             w17, #(0x5556/2)         // ~1/3
634*c0909341SAndroid Build Coastguard Worker        csel            w16, w16, w17, eq        // h is 4 or 64 (tst == 0) ? 1/5 : 1/3
635*c0909341SAndroid Build Coastguard Worker        dup             v16.4h,  w16
636*c0909341SAndroid Build Coastguard Worker        sqdmulh         v0.4h,   v0.4h,   v16.4h // (2 * x * recip) >> 16
637*c0909341SAndroid Build Coastguard Worker1:
638*c0909341SAndroid Build Coastguard Worker        dup             v0.16b,  v0.b[0]         // broadcast the DC byte
639*c0909341SAndroid Build Coastguard Worker2:
640*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x0], x1       // 16 bytes per row, four rows per iteration
641*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x6], x1
642*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
643*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x0], x1
644*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x6], x1
645*c0909341SAndroid Build Coastguard Worker        b.gt            2b
646*c0909341SAndroid Build Coastguard Worker        ret
647*c0909341SAndroid Build Coastguard Worker
648*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_h32):
649*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
650*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b}, [x2], #32 // 32 bytes below topleft; x2 advances to topleft
651*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.16b          // partial sum of bytes 0..15
652*c0909341SAndroid Build Coastguard Worker        uaddlv          h1,      v1.16b          // partial sum of bytes 16..31
653*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #1             // skip the topleft byte: x2 = topleft + 1
654*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v1.4h  // lane 0 = total sum
655*c0909341SAndroid Build Coastguard Worker        br              x3
656*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_w32):
657*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
658*c0909341SAndroid Build Coastguard Worker        ld1             {v1.16b, v2.16b}, [x2]   // 32 bytes starting at topleft + 1
659*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v16.4h // add rounding bias (width + height) >> 1
660*c0909341SAndroid Build Coastguard Worker        uaddlv          h1,      v1.16b          // partial sum of bytes 0..15
661*c0909341SAndroid Build Coastguard Worker        uaddlv          h2,      v2.16b          // partial sum of bytes 16..31
662*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #32                 // square block?
663*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v1.4h
664*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v2.4h  // lane 0 = both sums + bias
665*c0909341SAndroid Build Coastguard Worker        ushl            v4.4h,   v0.4h,   v17.4h // >> ctz(width + height); result kept in v4
666*c0909341SAndroid Build Coastguard Worker        b.eq            1f                       // square: shift alone is the full division
667*c0909341SAndroid Build Coastguard Worker        // h = 8/16/64
668*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #8                  // only h = 8 leaves a factor of 5 (32 + 8 = 8 * 5)
669*c0909341SAndroid Build Coastguard Worker        mov             w16, #(0x3334/2)         // ~1/5
670*c0909341SAndroid Build Coastguard Worker        mov             w17, #(0x5556/2)         // ~1/3
671*c0909341SAndroid Build Coastguard Worker        csel            w16, w16, w17, eq        // h == 8 ? 1/5 : 1/3
672*c0909341SAndroid Build Coastguard Worker        dup             v16.4h,  w16
673*c0909341SAndroid Build Coastguard Worker        sqdmulh         v4.4h,   v4.4h,   v16.4h // (2 * x * recip) >> 16
674*c0909341SAndroid Build Coastguard Worker1:
675*c0909341SAndroid Build Coastguard Worker        dup             v0.16b,  v4.b[0]         // broadcast the DC byte into both store registers
676*c0909341SAndroid Build Coastguard Worker        dup             v1.16b,  v4.b[0]
677*c0909341SAndroid Build Coastguard Worker2:
678*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b}, [x0], x1 // 32 bytes per row, four rows per iteration
679*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b}, [x6], x1
680*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
681*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b}, [x0], x1
682*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b}, [x6], x1
683*c0909341SAndroid Build Coastguard Worker        b.gt            2b
684*c0909341SAndroid Build Coastguard Worker        ret
685*c0909341SAndroid Build Coastguard Worker
686*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_h64):
687*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
688*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 // 64 bytes below topleft; x2 advances to topleft
689*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.16b          // four 16-byte partial sums
690*c0909341SAndroid Build Coastguard Worker        uaddlv          h1,      v1.16b
691*c0909341SAndroid Build Coastguard Worker        uaddlv          h2,      v2.16b
692*c0909341SAndroid Build Coastguard Worker        uaddlv          h3,      v3.16b
693*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v1.4h  // pairwise reduction of the partial sums
694*c0909341SAndroid Build Coastguard Worker        add             v2.4h,   v2.4h,   v3.4h
695*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #1             // skip the topleft byte: x2 = topleft + 1
696*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v2.4h  // lane 0 = total sum
697*c0909341SAndroid Build Coastguard Worker        br              x3
698*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_w64):
699*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
700*c0909341SAndroid Build Coastguard Worker        ld1             {v1.16b, v2.16b, v3.16b, v4.16b}, [x2] // 64 bytes starting at topleft + 1
701*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v16.4h // add rounding bias (width + height) >> 1
702*c0909341SAndroid Build Coastguard Worker        uaddlv          h1,      v1.16b          // four 16-byte partial sums
703*c0909341SAndroid Build Coastguard Worker        uaddlv          h2,      v2.16b
704*c0909341SAndroid Build Coastguard Worker        uaddlv          h3,      v3.16b
705*c0909341SAndroid Build Coastguard Worker        uaddlv          h4,      v4.16b
706*c0909341SAndroid Build Coastguard Worker        add             v1.4h,   v1.4h,   v2.4h  // pairwise reduction of the partial sums
707*c0909341SAndroid Build Coastguard Worker        add             v3.4h,   v3.4h,   v4.4h
708*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #64                 // square block?
709*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v1.4h
710*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v3.4h  // lane 0 = both sums + bias
711*c0909341SAndroid Build Coastguard Worker        ushl            v4.4h,   v0.4h,   v17.4h // >> ctz(width + height); result kept in v4
712*c0909341SAndroid Build Coastguard Worker        b.eq            1f                       // square: shift alone is the full division
713*c0909341SAndroid Build Coastguard Worker        // h = 16/32
714*c0909341SAndroid Build Coastguard Worker        mov             w16, #(0x5556/2)         // low half:  ~1/3, for h = 32 (64 + 32 = 32 * 3)
715*c0909341SAndroid Build Coastguard Worker        movk            w16, #(0x3334/2), lsl #16 // high half: ~1/5, for h = 16 (64 + 16 = 16 * 5)
716*c0909341SAndroid Build Coastguard Worker        lsr             w16, w16, w4             // count is taken mod 32: 16 selects the high half, 32 keeps the low half
717*c0909341SAndroid Build Coastguard Worker        dup             v16.4h,  w16             // low 16 bits of w16 = chosen reciprocal
718*c0909341SAndroid Build Coastguard Worker        sqdmulh         v4.4h,   v4.4h,   v16.4h // (2 * x * recip) >> 16
719*c0909341SAndroid Build Coastguard Worker1:
720*c0909341SAndroid Build Coastguard Worker        dup             v0.16b,  v4.b[0]         // broadcast the DC byte into all four store registers
721*c0909341SAndroid Build Coastguard Worker        dup             v1.16b,  v4.b[0]
722*c0909341SAndroid Build Coastguard Worker        dup             v2.16b,  v4.b[0]
723*c0909341SAndroid Build Coastguard Worker        dup             v3.16b,  v4.b[0]
724*c0909341SAndroid Build Coastguard Worker2:
725*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1 // 64 bytes per row, four rows per iteration
726*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
727*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
728*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
729*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
730*c0909341SAndroid Build Coastguard Worker        b.gt            2b
731*c0909341SAndroid Build Coastguard Worker        ret
732*c0909341SAndroid Build Coastguard Workerendfunc
733*c0909341SAndroid Build Coastguard Worker
// Dispatch table for ipred_dc_8bpc_neon. Every entry is a 32-bit offset from
// the start of the table to a label inside that function.
//   entries 0-4: height stages, indexed by clz(height) - 25 (64 -> 0 ... 4 -> 4)
//   entries 5-9: width stages,  indexed by clz(width)  - 20 (64 -> 5 ... 4 -> 9)
// The order must stay largest-to-smallest in each half to match that indexing.
734*c0909341SAndroid Build Coastguard Workerjumptable ipred_dc_tbl
735*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_h64) - ipred_dc_tbl
736*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_h32) - ipred_dc_tbl
737*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_h16) - ipred_dc_tbl
738*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_h8)  - ipred_dc_tbl
739*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_h4)  - ipred_dc_tbl
740*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_w64) - ipred_dc_tbl
741*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_w32) - ipred_dc_tbl
742*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_w16) - ipred_dc_tbl
743*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_w8)  - ipred_dc_tbl
744*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_w4)  - ipred_dc_tbl
745*c0909341SAndroid Build Coastguard Workerendjumptable
746*c0909341SAndroid Build Coastguard Worker
747*c0909341SAndroid Build Coastguard Worker// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
748*c0909341SAndroid Build Coastguard Worker//                            const pixel *const topleft,
749*c0909341SAndroid Build Coastguard Worker//                            const int width, const int height, const int a,
750*c0909341SAndroid Build Coastguard Worker//                            const int max_width, const int max_height);
751*c0909341SAndroid Build Coastguard Workerfunction ipred_paeth_8bpc_neon, export=1
752*c0909341SAndroid Build Coastguard Worker        clz             w9,  w3
753*c0909341SAndroid Build Coastguard Worker        movrel          x5,  ipred_paeth_tbl
754*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #25
755*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x5, w9, uxtw #2]
756*c0909341SAndroid Build Coastguard Worker        ld1r            {v4.16b},  [x2]
757*c0909341SAndroid Build Coastguard Worker        add             x8,  x2,  #1
758*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #4
759*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x9
760*c0909341SAndroid Build Coastguard Worker        mov             x7,  #-4
761*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1
762*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
763*c0909341SAndroid Build Coastguard Worker        br              x5
764*c0909341SAndroid Build Coastguard Worker40:
765*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
766*c0909341SAndroid Build Coastguard Worker        ld1r            {v5.4s},  [x8]
767*c0909341SAndroid Build Coastguard Worker        usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft
768*c0909341SAndroid Build Coastguard Worker4:
769*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
770*c0909341SAndroid Build Coastguard Worker        zip1            v0.2s,   v0.2s,   v1.2s
771*c0909341SAndroid Build Coastguard Worker        zip1            v2.2s,   v2.2s,   v3.2s
772*c0909341SAndroid Build Coastguard Worker        uaddw           v16.8h,  v6.8h,   v0.8b
773*c0909341SAndroid Build Coastguard Worker        uaddw           v17.8h,  v6.8h,   v2.8b
774*c0909341SAndroid Build Coastguard Worker        sqxtun          v16.8b,  v16.8h           // base
775*c0909341SAndroid Build Coastguard Worker        sqxtun2         v16.16b, v17.8h
776*c0909341SAndroid Build Coastguard Worker        zip1            v0.2d,   v0.2d,   v2.2d
777*c0909341SAndroid Build Coastguard Worker        uabd            v20.16b, v5.16b,  v16.16b // tdiff
778*c0909341SAndroid Build Coastguard Worker        uabd            v22.16b, v4.16b,  v16.16b // tldiff
779*c0909341SAndroid Build Coastguard Worker        uabd            v16.16b, v0.16b,  v16.16b // ldiff
780*c0909341SAndroid Build Coastguard Worker        umin            v18.16b, v20.16b, v22.16b // min(tdiff, tldiff)
781*c0909341SAndroid Build Coastguard Worker        cmhs            v20.16b, v22.16b, v20.16b // tldiff >= tdiff
782*c0909341SAndroid Build Coastguard Worker        cmhs            v16.16b, v18.16b, v16.16b // min(tdiff, tldiff) >= ldiff
783*c0909341SAndroid Build Coastguard Worker        bsl             v20.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
784*c0909341SAndroid Build Coastguard Worker        bit             v20.16b, v0.16b,  v16.16b // ldiff <= min ? left : ...
785*c0909341SAndroid Build Coastguard Worker        st1             {v20.s}[3], [x0], x1
786*c0909341SAndroid Build Coastguard Worker        st1             {v20.s}[2], [x6], x1
787*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
788*c0909341SAndroid Build Coastguard Worker        st1             {v20.s}[1], [x0], x1
789*c0909341SAndroid Build Coastguard Worker        st1             {v20.s}[0], [x6], x1
790*c0909341SAndroid Build Coastguard Worker        b.gt            4b
791*c0909341SAndroid Build Coastguard Worker        ret
792*c0909341SAndroid Build Coastguard Worker80:
793*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
794*c0909341SAndroid Build Coastguard Worker        ld1r            {v5.2d},  [x8]
795*c0909341SAndroid Build Coastguard Worker        usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft
796*c0909341SAndroid Build Coastguard Worker8:
797*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
798*c0909341SAndroid Build Coastguard Worker        uaddw           v16.8h,  v6.8h,   v0.8b
799*c0909341SAndroid Build Coastguard Worker        uaddw           v17.8h,  v6.8h,   v1.8b
800*c0909341SAndroid Build Coastguard Worker        uaddw           v18.8h,  v6.8h,   v2.8b
801*c0909341SAndroid Build Coastguard Worker        uaddw           v19.8h,  v6.8h,   v3.8b
802*c0909341SAndroid Build Coastguard Worker        sqxtun          v16.8b,  v16.8h           // base
803*c0909341SAndroid Build Coastguard Worker        sqxtun2         v16.16b, v17.8h
804*c0909341SAndroid Build Coastguard Worker        sqxtun          v18.8b,  v18.8h
805*c0909341SAndroid Build Coastguard Worker        sqxtun2         v18.16b, v19.8h
806*c0909341SAndroid Build Coastguard Worker        zip1            v2.2d,   v2.2d,   v3.2d
807*c0909341SAndroid Build Coastguard Worker        zip1            v0.2d,   v0.2d,   v1.2d
808*c0909341SAndroid Build Coastguard Worker        uabd            v21.16b, v5.16b,  v18.16b // tdiff
809*c0909341SAndroid Build Coastguard Worker        uabd            v20.16b, v5.16b,  v16.16b
810*c0909341SAndroid Build Coastguard Worker        uabd            v23.16b, v4.16b,  v18.16b // tldiff
811*c0909341SAndroid Build Coastguard Worker        uabd            v22.16b, v4.16b,  v16.16b
812*c0909341SAndroid Build Coastguard Worker        uabd            v17.16b, v2.16b,  v18.16b // ldiff
813*c0909341SAndroid Build Coastguard Worker        uabd            v16.16b, v0.16b,  v16.16b
814*c0909341SAndroid Build Coastguard Worker        umin            v19.16b, v21.16b, v23.16b // min(tdiff, tldiff)
815*c0909341SAndroid Build Coastguard Worker        umin            v18.16b, v20.16b, v22.16b
816*c0909341SAndroid Build Coastguard Worker        cmhs            v21.16b, v23.16b, v21.16b // tldiff >= tdiff
817*c0909341SAndroid Build Coastguard Worker        cmhs            v20.16b, v22.16b, v20.16b
818*c0909341SAndroid Build Coastguard Worker        cmhs            v17.16b, v19.16b, v17.16b // min(tdiff, tldiff) >= ldiff
819*c0909341SAndroid Build Coastguard Worker        cmhs            v16.16b, v18.16b, v16.16b
820*c0909341SAndroid Build Coastguard Worker        bsl             v21.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
821*c0909341SAndroid Build Coastguard Worker        bsl             v20.16b, v5.16b,  v4.16b
822*c0909341SAndroid Build Coastguard Worker        bit             v21.16b, v2.16b,  v17.16b // ldiff <= min ? left : ...
823*c0909341SAndroid Build Coastguard Worker        bit             v20.16b, v0.16b,  v16.16b
824*c0909341SAndroid Build Coastguard Worker        st1             {v21.d}[1], [x0], x1
825*c0909341SAndroid Build Coastguard Worker        st1             {v21.d}[0], [x6], x1
826*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
827*c0909341SAndroid Build Coastguard Worker        st1             {v20.d}[1], [x0], x1
828*c0909341SAndroid Build Coastguard Worker        st1             {v20.d}[0], [x6], x1
829*c0909341SAndroid Build Coastguard Worker        b.gt            8b
830*c0909341SAndroid Build Coastguard Worker        ret
831*c0909341SAndroid Build Coastguard Worker160:
832*c0909341SAndroid Build Coastguard Worker320:
833*c0909341SAndroid Build Coastguard Worker640:
834*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
835*c0909341SAndroid Build Coastguard Worker        ld1             {v5.16b},  [x8], #16
836*c0909341SAndroid Build Coastguard Worker        mov             w9,  w3
837*c0909341SAndroid Build Coastguard Worker        // Set up pointers for four rows in parallel; x0, x6, x5, x10
838*c0909341SAndroid Build Coastguard Worker        add             x5,  x0,  x1
839*c0909341SAndroid Build Coastguard Worker        add             x10, x6,  x1
840*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
841*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  w3, uxtw
842*c0909341SAndroid Build Coastguard Worker1:
843*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
844*c0909341SAndroid Build Coastguard Worker2:
845*c0909341SAndroid Build Coastguard Worker        usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft
846*c0909341SAndroid Build Coastguard Worker        usubl2          v7.8h,   v5.16b,  v4.16b
847*c0909341SAndroid Build Coastguard Worker        uaddw           v24.8h,  v6.8h,   v0.8b
848*c0909341SAndroid Build Coastguard Worker        uaddw           v25.8h,  v7.8h,   v0.8b
849*c0909341SAndroid Build Coastguard Worker        uaddw           v26.8h,  v6.8h,   v1.8b
850*c0909341SAndroid Build Coastguard Worker        uaddw           v27.8h,  v7.8h,   v1.8b
851*c0909341SAndroid Build Coastguard Worker        uaddw           v28.8h,  v6.8h,   v2.8b
852*c0909341SAndroid Build Coastguard Worker        uaddw           v29.8h,  v7.8h,   v2.8b
853*c0909341SAndroid Build Coastguard Worker        uaddw           v30.8h,  v6.8h,   v3.8b
854*c0909341SAndroid Build Coastguard Worker        uaddw           v31.8h,  v7.8h,   v3.8b
855*c0909341SAndroid Build Coastguard Worker        sqxtun          v17.8b,  v26.8h           // base
856*c0909341SAndroid Build Coastguard Worker        sqxtun2         v17.16b, v27.8h
857*c0909341SAndroid Build Coastguard Worker        sqxtun          v16.8b,  v24.8h
858*c0909341SAndroid Build Coastguard Worker        sqxtun2         v16.16b, v25.8h
859*c0909341SAndroid Build Coastguard Worker        sqxtun          v19.8b,  v30.8h
860*c0909341SAndroid Build Coastguard Worker        sqxtun2         v19.16b, v31.8h
861*c0909341SAndroid Build Coastguard Worker        sqxtun          v18.8b,  v28.8h
862*c0909341SAndroid Build Coastguard Worker        sqxtun2         v18.16b, v29.8h
863*c0909341SAndroid Build Coastguard Worker        uabd            v23.16b, v5.16b,  v19.16b // tdiff
864*c0909341SAndroid Build Coastguard Worker        uabd            v22.16b, v5.16b,  v18.16b
865*c0909341SAndroid Build Coastguard Worker        uabd            v21.16b, v5.16b,  v17.16b
866*c0909341SAndroid Build Coastguard Worker        uabd            v20.16b, v5.16b,  v16.16b
867*c0909341SAndroid Build Coastguard Worker        uabd            v27.16b, v4.16b,  v19.16b // tldiff
868*c0909341SAndroid Build Coastguard Worker        uabd            v26.16b, v4.16b,  v18.16b
869*c0909341SAndroid Build Coastguard Worker        uabd            v25.16b, v4.16b,  v17.16b
870*c0909341SAndroid Build Coastguard Worker        uabd            v24.16b, v4.16b,  v16.16b
871*c0909341SAndroid Build Coastguard Worker        uabd            v19.16b, v3.16b,  v19.16b // ldiff
872*c0909341SAndroid Build Coastguard Worker        uabd            v18.16b, v2.16b,  v18.16b
873*c0909341SAndroid Build Coastguard Worker        uabd            v17.16b, v1.16b,  v17.16b
874*c0909341SAndroid Build Coastguard Worker        uabd            v16.16b, v0.16b,  v16.16b
875*c0909341SAndroid Build Coastguard Worker        umin            v31.16b, v23.16b, v27.16b // min(tdiff, tldiff)
876*c0909341SAndroid Build Coastguard Worker        umin            v30.16b, v22.16b, v26.16b
877*c0909341SAndroid Build Coastguard Worker        umin            v29.16b, v21.16b, v25.16b
878*c0909341SAndroid Build Coastguard Worker        umin            v28.16b, v20.16b, v24.16b
879*c0909341SAndroid Build Coastguard Worker        cmhs            v23.16b, v27.16b, v23.16b // tldiff >= tdiff
880*c0909341SAndroid Build Coastguard Worker        cmhs            v22.16b, v26.16b, v22.16b
881*c0909341SAndroid Build Coastguard Worker        cmhs            v21.16b, v25.16b, v21.16b
882*c0909341SAndroid Build Coastguard Worker        cmhs            v20.16b, v24.16b, v20.16b
883*c0909341SAndroid Build Coastguard Worker        cmhs            v19.16b, v31.16b, v19.16b // min(tdiff, tldiff) >= ldiff
884*c0909341SAndroid Build Coastguard Worker        cmhs            v18.16b, v30.16b, v18.16b
885*c0909341SAndroid Build Coastguard Worker        cmhs            v17.16b, v29.16b, v17.16b
886*c0909341SAndroid Build Coastguard Worker        cmhs            v16.16b, v28.16b, v16.16b
887*c0909341SAndroid Build Coastguard Worker        bsl             v23.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
888*c0909341SAndroid Build Coastguard Worker        bsl             v22.16b, v5.16b,  v4.16b
889*c0909341SAndroid Build Coastguard Worker        bsl             v21.16b, v5.16b,  v4.16b
890*c0909341SAndroid Build Coastguard Worker        bsl             v20.16b, v5.16b,  v4.16b
891*c0909341SAndroid Build Coastguard Worker        bit             v23.16b, v3.16b,  v19.16b // ldiff <= min ? left : ...
892*c0909341SAndroid Build Coastguard Worker        bit             v22.16b, v2.16b,  v18.16b
893*c0909341SAndroid Build Coastguard Worker        bit             v21.16b, v1.16b,  v17.16b
894*c0909341SAndroid Build Coastguard Worker        bit             v20.16b, v0.16b,  v16.16b
895*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #16
896*c0909341SAndroid Build Coastguard Worker        st1             {v23.16b}, [x0],  #16
897*c0909341SAndroid Build Coastguard Worker        st1             {v22.16b}, [x6],  #16
898*c0909341SAndroid Build Coastguard Worker        st1             {v21.16b}, [x5],  #16
899*c0909341SAndroid Build Coastguard Worker        st1             {v20.16b}, [x10], #16
900*c0909341SAndroid Build Coastguard Worker        b.le            8f
901*c0909341SAndroid Build Coastguard Worker        ld1             {v5.16b},  [x8], #16
902*c0909341SAndroid Build Coastguard Worker        b               2b
903*c0909341SAndroid Build Coastguard Worker8:
904*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
905*c0909341SAndroid Build Coastguard Worker        b.le            9f
906*c0909341SAndroid Build Coastguard Worker        // End of horizontal loop, move pointers to next four rows
907*c0909341SAndroid Build Coastguard Worker        sub             x8,  x8,  w9, uxtw
908*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1
909*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  x1
910*c0909341SAndroid Build Coastguard Worker        // Load the top row as early as possible
911*c0909341SAndroid Build Coastguard Worker        ld1             {v5.16b},  [x8], #16
912*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x1
913*c0909341SAndroid Build Coastguard Worker        add             x10, x10, x1
914*c0909341SAndroid Build Coastguard Worker        mov             w3,  w9
915*c0909341SAndroid Build Coastguard Worker        b               1b
916*c0909341SAndroid Build Coastguard Worker9:
917*c0909341SAndroid Build Coastguard Worker        ret
918*c0909341SAndroid Build Coastguard Workerendfunc
919*c0909341SAndroid Build Coastguard Worker
920*c0909341SAndroid Build Coastguard Workerjumptable ipred_paeth_tbl
        // Each entry is the 32-bit offset of a width-specific entry label,
        // measured from the start of this table. Entries run from the widest
        // block (label 640) down to the narrowest (label 40); labels 160, 320
        // and 640 are aliases of the same code.
        // NOTE(review): presumably indexed by clz(width) - 25 like
        // ipred_smooth_tbl; the paeth dispatch code must agree with this
        // order - confirm before reordering.
921*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_paeth_tbl
922*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_paeth_tbl
923*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_paeth_tbl
924*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_paeth_tbl
925*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_paeth_tbl
926*c0909341SAndroid Build Coastguard Workerendjumptable
927*c0909341SAndroid Build Coastguard Worker
928*c0909341SAndroid Build Coastguard Worker// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
929*c0909341SAndroid Build Coastguard Worker//                             const pixel *const topleft,
930*c0909341SAndroid Build Coastguard Worker//                             const int width, const int height, const int a,
931*c0909341SAndroid Build Coastguard Worker//                             const int max_width, const int max_height);
//
// Smooth intra prediction for 8-bit pixels. Every output pixel is the average
// of a horizontal and a vertical blend:
//   hor = right*256  + (left[y] - right) * weights_hor[x]
//   ver = bottom*256 + (top[x] - bottom) * weights_ver[y]
//   dst[y][x] = (((hor + ver) >> 1) + 128) >> 8
// where, as read by the code below:
//   top[x]   = topleft[1 + x]
//   left[y]  = topleft[-(1 + y)]      (stored backwards, hence "flipped")
//   right    = topleft[width]         (last pixel of the top row)
//   bottom   = topleft[-height]       (last pixel of the left column)
//   weights_hor = sm_weights + width, weights_ver = sm_weights + height
//
// hor and ver are kept in 16-bit lanes. The difference (left - right) may be
// negative and wraps, but hor == right*(256 - w) + left*w with 0 <= w <= 255,
// so the modulo-2^16 result is the exact non-negative value (<= 255*256).
// The same holds for ver.
//
// Register use after the common prologue:
//   x0, x6   output pointers for alternating rows (x6 = x0 + stride)
//   x1       2*stride (the width >= 16 path additionally subtracts width)
//   x2       left-pixel pointer, walked downwards in memory via x7 (< 0)
//   w3, w4   width, height (w4 is the row loop counter)
//   x8       pointer to the top row
//   x10/x11  horizontal / vertical weight pointers
//   v4       bottom pixel broadcast, v5 right pixel broadcast
// The a, max_width and max_height arguments are never read; their registers
// (x5, x6, x7) are reused as scratch.
932*c0909341SAndroid Build Coastguard Workerfunction ipred_smooth_8bpc_neon, export=1
933*c0909341SAndroid Build Coastguard Worker        movrel          x10, X(sm_weights)
934*c0909341SAndroid Build Coastguard Worker        add             x11, x10, w4, uxtw
935*c0909341SAndroid Build Coastguard Worker        add             x10, x10, w3, uxtw
        // Table index = clz(width) - 25: width 64 -> 0, 32 -> 1, 16 -> 2,
        // 8 -> 3, 4 -> 4, matching the entry order of ipred_smooth_tbl.
936*c0909341SAndroid Build Coastguard Worker        clz             w9,  w3
937*c0909341SAndroid Build Coastguard Worker        movrel          x5,  ipred_smooth_tbl
938*c0909341SAndroid Build Coastguard Worker        sub             x12, x2,  w4, uxtw
939*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #25
        // Table entries are signed 32-bit offsets relative to the table.
940*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x5, w9, uxtw #2]
941*c0909341SAndroid Build Coastguard Worker        ld1r            {v4.16b},  [x12] // bottom
942*c0909341SAndroid Build Coastguard Worker        add             x8,  x2,  #1
943*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x9
        // Two row pointers, each stepping by 2*stride.
944*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1
945*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
946*c0909341SAndroid Build Coastguard Worker        br              x5
        // Width 4: four rows per iteration, two rows packed per 8-lane vector.
947*c0909341SAndroid Build Coastguard Worker40:
948*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // ld1r of a 32-bit element replicates the 4 top pixels / 4 weights
        // into both halves of the 64-bit vector, one copy per packed row.
949*c0909341SAndroid Build Coastguard Worker        ld1r            {v6.2s}, [x8]             // top
950*c0909341SAndroid Build Coastguard Worker        ld1r            {v7.2s}, [x10]            // weights_hor
        // Start at the 4 left pixels just below topleft and walk backwards.
951*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #4
952*c0909341SAndroid Build Coastguard Worker        mov             x7,  #-4
953*c0909341SAndroid Build Coastguard Worker        dup             v5.16b,  v6.b[3]          // right
954*c0909341SAndroid Build Coastguard Worker        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
955*c0909341SAndroid Build Coastguard Worker        uxtl            v7.8h,   v7.8b            // weights_hor
956*c0909341SAndroid Build Coastguard Worker4:
        // ld4r broadcasts 4 consecutive bytes into v0..v3 (v3 = highest
        // address = left pixel of the first of these four rows).
957*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
958*c0909341SAndroid Build Coastguard Worker        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
959*c0909341SAndroid Build Coastguard Worker        shll            v20.8h,  v5.8b,   #8      // right*256
960*c0909341SAndroid Build Coastguard Worker        shll            v21.8h,  v5.8b,   #8
        // Pack rows pairwise: v0 = {row 0, row 1}, v1 = {row 2, row 3}.
        // v1 must be formed first because it still needs the old v0.
961*c0909341SAndroid Build Coastguard Worker        zip1            v1.2s,   v1.2s,   v0.2s   // left, flipped
962*c0909341SAndroid Build Coastguard Worker        zip1            v0.2s,   v3.2s,   v2.2s
963*c0909341SAndroid Build Coastguard Worker        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
964*c0909341SAndroid Build Coastguard Worker        zip1            v18.2s,  v18.2s,  v19.2s
965*c0909341SAndroid Build Coastguard Worker        shll            v22.8h,  v4.8b,   #8      // bottom*256
966*c0909341SAndroid Build Coastguard Worker        shll            v23.8h,  v4.8b,   #8
967*c0909341SAndroid Build Coastguard Worker        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
968*c0909341SAndroid Build Coastguard Worker        usubl           v1.8h,   v1.8b,   v5.8b
969*c0909341SAndroid Build Coastguard Worker        uxtl            v16.8h,  v16.8b           // weights_ver
970*c0909341SAndroid Build Coastguard Worker        uxtl            v18.8h,  v18.8b
971*c0909341SAndroid Build Coastguard Worker        mla             v20.8h,  v0.8h,   v7.8h   // right*256  + (left-right)*weights_hor
972*c0909341SAndroid Build Coastguard Worker        mla             v21.8h,  v1.8h,   v7.8h
973*c0909341SAndroid Build Coastguard Worker        mla             v22.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
974*c0909341SAndroid Build Coastguard Worker        mla             v23.8h,  v6.8h,   v18.8h
        // Halving add averages hor and ver without needing a 17th bit.
975*c0909341SAndroid Build Coastguard Worker        uhadd           v20.8h,  v20.8h,  v22.8h
976*c0909341SAndroid Build Coastguard Worker        uhadd           v21.8h,  v21.8h,  v23.8h
        // Rounding shift by 8 and narrow back to 8-bit pixels.
977*c0909341SAndroid Build Coastguard Worker        rshrn           v20.8b,  v20.8h,  #8
978*c0909341SAndroid Build Coastguard Worker        rshrn           v21.8b,  v21.8h,  #8
        // Each 32-bit lane is one 4-pixel row; rows alternate between x0, x6.
979*c0909341SAndroid Build Coastguard Worker        st1             {v20.s}[0], [x0], x1
980*c0909341SAndroid Build Coastguard Worker        st1             {v20.s}[1], [x6], x1
981*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
982*c0909341SAndroid Build Coastguard Worker        st1             {v21.s}[0], [x0], x1
983*c0909341SAndroid Build Coastguard Worker        st1             {v21.s}[1], [x6], x1
984*c0909341SAndroid Build Coastguard Worker        b.gt            4b
985*c0909341SAndroid Build Coastguard Worker        ret
        // Width 8: four rows per iteration, one full 8-lane vector per row.
986*c0909341SAndroid Build Coastguard Worker80:
987*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
988*c0909341SAndroid Build Coastguard Worker        ld1             {v6.8b}, [x8]             // top
989*c0909341SAndroid Build Coastguard Worker        ld1             {v7.8b}, [x10]            // weights_hor
        // Left pixels are consumed 4 at a time, walking backwards.
990*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #4
991*c0909341SAndroid Build Coastguard Worker        mov             x7,  #-4
992*c0909341SAndroid Build Coastguard Worker        dup             v5.16b,  v6.b[7]          // right
993*c0909341SAndroid Build Coastguard Worker        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
994*c0909341SAndroid Build Coastguard Worker        uxtl            v7.8h,   v7.8b            // weights_hor
995*c0909341SAndroid Build Coastguard Worker8:
996*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
997*c0909341SAndroid Build Coastguard Worker        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
998*c0909341SAndroid Build Coastguard Worker        shll            v20.8h,  v5.8b,   #8      // right*256
999*c0909341SAndroid Build Coastguard Worker        shll            v21.8h,  v5.8b,   #8
1000*c0909341SAndroid Build Coastguard Worker        shll            v22.8h,  v5.8b,   #8
1001*c0909341SAndroid Build Coastguard Worker        shll            v23.8h,  v5.8b,   #8
1002*c0909341SAndroid Build Coastguard Worker        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
1003*c0909341SAndroid Build Coastguard Worker        usubl           v1.8h,   v1.8b,   v5.8b
1004*c0909341SAndroid Build Coastguard Worker        usubl           v2.8h,   v2.8b,   v5.8b
1005*c0909341SAndroid Build Coastguard Worker        usubl           v3.8h,   v3.8b,   v5.8b
1006*c0909341SAndroid Build Coastguard Worker        shll            v24.8h,  v4.8b,   #8      // bottom*256
1007*c0909341SAndroid Build Coastguard Worker        shll            v25.8h,  v4.8b,   #8
1008*c0909341SAndroid Build Coastguard Worker        shll            v26.8h,  v4.8b,   #8
1009*c0909341SAndroid Build Coastguard Worker        shll            v27.8h,  v4.8b,   #8
1010*c0909341SAndroid Build Coastguard Worker        uxtl            v16.8h,  v16.8b           // weights_ver
1011*c0909341SAndroid Build Coastguard Worker        uxtl            v17.8h,  v17.8b
1012*c0909341SAndroid Build Coastguard Worker        uxtl            v18.8h,  v18.8b
1013*c0909341SAndroid Build Coastguard Worker        uxtl            v19.8h,  v19.8b
        // The left registers are used in reverse (v3 first) because the left
        // column is stored backwards, while weights_ver runs forwards.
1014*c0909341SAndroid Build Coastguard Worker        mla             v20.8h,  v3.8h,   v7.8h   // right*256  + (left-right)*weights_hor
1015*c0909341SAndroid Build Coastguard Worker        mla             v21.8h,  v2.8h,   v7.8h   // (left flipped)
1016*c0909341SAndroid Build Coastguard Worker        mla             v22.8h,  v1.8h,   v7.8h
1017*c0909341SAndroid Build Coastguard Worker        mla             v23.8h,  v0.8h,   v7.8h
1018*c0909341SAndroid Build Coastguard Worker        mla             v24.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
1019*c0909341SAndroid Build Coastguard Worker        mla             v25.8h,  v6.8h,   v17.8h
1020*c0909341SAndroid Build Coastguard Worker        mla             v26.8h,  v6.8h,   v18.8h
1021*c0909341SAndroid Build Coastguard Worker        mla             v27.8h,  v6.8h,   v19.8h
        // (hor + ver) >> 1, then rounding shift by 8 down to 8-bit pixels.
1022*c0909341SAndroid Build Coastguard Worker        uhadd           v20.8h,  v20.8h,  v24.8h
1023*c0909341SAndroid Build Coastguard Worker        uhadd           v21.8h,  v21.8h,  v25.8h
1024*c0909341SAndroid Build Coastguard Worker        uhadd           v22.8h,  v22.8h,  v26.8h
1025*c0909341SAndroid Build Coastguard Worker        uhadd           v23.8h,  v23.8h,  v27.8h
1026*c0909341SAndroid Build Coastguard Worker        rshrn           v20.8b,  v20.8h,  #8
1027*c0909341SAndroid Build Coastguard Worker        rshrn           v21.8b,  v21.8h,  #8
1028*c0909341SAndroid Build Coastguard Worker        rshrn           v22.8b,  v22.8h,  #8
1029*c0909341SAndroid Build Coastguard Worker        rshrn           v23.8b,  v23.8h,  #8
        // Rows 0 and 2 go through x0, rows 1 and 3 through x6.
1030*c0909341SAndroid Build Coastguard Worker        st1             {v20.8b}, [x0], x1
1031*c0909341SAndroid Build Coastguard Worker        st1             {v21.8b}, [x6], x1
1032*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
1033*c0909341SAndroid Build Coastguard Worker        st1             {v22.8b}, [x0], x1
1034*c0909341SAndroid Build Coastguard Worker        st1             {v23.8b}, [x6], x1
1035*c0909341SAndroid Build Coastguard Worker        b.gt            8b
1036*c0909341SAndroid Build Coastguard Worker        ret
        // Widths 16, 32 and 64 share one path: two rows per outer iteration,
        // 16 columns per inner iteration.
1037*c0909341SAndroid Build Coastguard Worker160:
1038*c0909341SAndroid Build Coastguard Worker320:
1039*c0909341SAndroid Build Coastguard Worker640:
1040*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // x12 = &topleft[width], i.e. the last pixel of the top row.
1041*c0909341SAndroid Build Coastguard Worker        add             x12, x2,  w3, uxtw
        // Left pixels are consumed 2 at a time, walking backwards.
1042*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #2
1043*c0909341SAndroid Build Coastguard Worker        mov             x7,  #-2
1044*c0909341SAndroid Build Coastguard Worker        ld1r            {v5.16b}, [x12]           // right
        // The inner loop post-increments x0/x6 by width in total, so the
        // end-of-row step is 2*stride - width.
1045*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  w3, uxtw
        // Keep the width in w9; w3 is consumed as the column counter.
1046*c0909341SAndroid Build Coastguard Worker        mov             w9,  w3
1047*c0909341SAndroid Build Coastguard Worker
        // Outer loop: per-row-pair setup (v1 = first row's left, v0 = second).
1048*c0909341SAndroid Build Coastguard Worker1:
1049*c0909341SAndroid Build Coastguard Worker        ld2r            {v0.8b, v1.8b},   [x2],  x7 // left
1050*c0909341SAndroid Build Coastguard Worker        ld2r            {v16.8b, v17.8b}, [x11], #2 // weights_ver
1051*c0909341SAndroid Build Coastguard Worker        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
1052*c0909341SAndroid Build Coastguard Worker        usubl           v1.8h,   v1.8b,   v5.8b
1053*c0909341SAndroid Build Coastguard Worker        uxtl            v16.8h,  v16.8b           // weights_ver
1054*c0909341SAndroid Build Coastguard Worker        uxtl            v17.8h,  v17.8b
        // Inner loop: 16 columns of both rows. v20/v21 hold the low/high
        // 8 columns of the first row, v22/v23 those of the second row.
1055*c0909341SAndroid Build Coastguard Worker2:
1056*c0909341SAndroid Build Coastguard Worker        ld1             {v7.16b}, [x10],  #16     // weights_hor
1057*c0909341SAndroid Build Coastguard Worker        ld1             {v3.16b}, [x8],   #16     // top
1058*c0909341SAndroid Build Coastguard Worker        shll            v20.8h,  v5.8b,   #8      // right*256
1059*c0909341SAndroid Build Coastguard Worker        shll            v21.8h,  v5.8b,   #8
1060*c0909341SAndroid Build Coastguard Worker        shll            v22.8h,  v5.8b,   #8
1061*c0909341SAndroid Build Coastguard Worker        shll            v23.8h,  v5.8b,   #8
1062*c0909341SAndroid Build Coastguard Worker        uxtl            v6.8h,   v7.8b            // weights_hor
1063*c0909341SAndroid Build Coastguard Worker        uxtl2           v7.8h,   v7.16b
1064*c0909341SAndroid Build Coastguard Worker        usubl           v2.8h,   v3.8b,   v4.8b   // top-bottom
1065*c0909341SAndroid Build Coastguard Worker        usubl2          v3.8h,   v3.16b,  v4.16b
1066*c0909341SAndroid Build Coastguard Worker        mla             v20.8h,  v1.8h,   v6.8h   // right*256  + (left-right)*weights_hor
1067*c0909341SAndroid Build Coastguard Worker        mla             v21.8h,  v1.8h,   v7.8h   // (left flipped)
1068*c0909341SAndroid Build Coastguard Worker        mla             v22.8h,  v0.8h,   v6.8h
1069*c0909341SAndroid Build Coastguard Worker        mla             v23.8h,  v0.8h,   v7.8h
1070*c0909341SAndroid Build Coastguard Worker        shll            v24.8h,  v4.8b,   #8      // bottom*256
1071*c0909341SAndroid Build Coastguard Worker        shll            v25.8h,  v4.8b,   #8
1072*c0909341SAndroid Build Coastguard Worker        shll            v26.8h,  v4.8b,   #8
1073*c0909341SAndroid Build Coastguard Worker        shll            v27.8h,  v4.8b,   #8
1074*c0909341SAndroid Build Coastguard Worker        mla             v24.8h,  v2.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
1075*c0909341SAndroid Build Coastguard Worker        mla             v25.8h,  v3.8h,   v16.8h
1076*c0909341SAndroid Build Coastguard Worker        mla             v26.8h,  v2.8h,   v17.8h
1077*c0909341SAndroid Build Coastguard Worker        mla             v27.8h,  v3.8h,   v17.8h
        // (hor + ver) >> 1, then rounding shift by 8; the two 8-column halves
        // of each row are narrowed into one 16-byte vector.
1078*c0909341SAndroid Build Coastguard Worker        uhadd           v20.8h,  v20.8h,  v24.8h
1079*c0909341SAndroid Build Coastguard Worker        uhadd           v21.8h,  v21.8h,  v25.8h
1080*c0909341SAndroid Build Coastguard Worker        uhadd           v22.8h,  v22.8h,  v26.8h
1081*c0909341SAndroid Build Coastguard Worker        uhadd           v23.8h,  v23.8h,  v27.8h
1082*c0909341SAndroid Build Coastguard Worker        rshrn           v20.8b,  v20.8h,  #8
1083*c0909341SAndroid Build Coastguard Worker        rshrn2          v20.16b, v21.8h,  #8
1084*c0909341SAndroid Build Coastguard Worker        rshrn           v22.8b,  v22.8h,  #8
1085*c0909341SAndroid Build Coastguard Worker        rshrn2          v22.16b, v23.8h,  #8
        // The stores do not touch the flags, so b.gt below still tests the
        // remaining column count from this subs.
1086*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #16
1087*c0909341SAndroid Build Coastguard Worker        st1             {v20.16b}, [x0],  #16
1088*c0909341SAndroid Build Coastguard Worker        st1             {v22.16b}, [x6],  #16
1089*c0909341SAndroid Build Coastguard Worker        b.gt            2b
1090*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
1091*c0909341SAndroid Build Coastguard Worker        b.le            9f
        // More rows remain: rewind the top and weights_hor pointers to the
        // start of the row, step the output pointers down two rows and
        // restore the column counter.
1092*c0909341SAndroid Build Coastguard Worker        sub             x8,  x8,  w9, uxtw
1093*c0909341SAndroid Build Coastguard Worker        sub             x10, x10, w9, uxtw
1094*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1
1095*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  x1
1096*c0909341SAndroid Build Coastguard Worker        mov             w3,  w9
1097*c0909341SAndroid Build Coastguard Worker        b               1b
1098*c0909341SAndroid Build Coastguard Worker9:
1099*c0909341SAndroid Build Coastguard Worker        ret
1100*c0909341SAndroid Build Coastguard Workerendfunc
1101*c0909341SAndroid Build Coastguard Worker
1102*c0909341SAndroid Build Coastguard Workerjumptable ipred_smooth_tbl
        // Dispatch table for ipred_smooth_8bpc_neon. Each entry is the signed
        // 32-bit offset of a width-specific label relative to this table; the
        // function loads it with ldrsw and adds the table address.
        // The index is clz(width) - 25, so the order must stay widest first:
        // 64 -> 0, 32 -> 1, 16 -> 2, 8 -> 3, 4 -> 4.
        // Labels 160, 320 and 640 mark the same code.
1103*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_smooth_tbl
1104*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_smooth_tbl
1105*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_smooth_tbl
1106*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_smooth_tbl
1107*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_smooth_tbl
1108*c0909341SAndroid Build Coastguard Workerendjumptable
1109*c0909341SAndroid Build Coastguard Worker
1110*c0909341SAndroid Build Coastguard Worker// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
1111*c0909341SAndroid Build Coastguard Worker//                               const pixel *const topleft,
1112*c0909341SAndroid Build Coastguard Worker//                               const int width, const int height, const int a,
1113*c0909341SAndroid Build Coastguard Worker//                               const int max_width, const int max_height);
1114*c0909341SAndroid Build Coastguard Workerfunction ipred_smooth_v_8bpc_neon, export=1
1115*c0909341SAndroid Build Coastguard Worker        movrel          x7,  X(sm_weights)
1116*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  w4, uxtw
1117*c0909341SAndroid Build Coastguard Worker        clz             w9,  w3
1118*c0909341SAndroid Build Coastguard Worker        movrel          x5,  ipred_smooth_v_tbl
1119*c0909341SAndroid Build Coastguard Worker        sub             x8,  x2,  w4, uxtw
1120*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #25
1121*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x5, w9, uxtw #2]
1122*c0909341SAndroid Build Coastguard Worker        ld1r            {v4.16b},  [x8] // bottom
1123*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #1
1124*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x9
1125*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1
1126*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
1127*c0909341SAndroid Build Coastguard Worker        br              x5
1128*c0909341SAndroid Build Coastguard Worker40:
1129*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1130*c0909341SAndroid Build Coastguard Worker        ld1r            {v6.2s}, [x2]             // top
1131*c0909341SAndroid Build Coastguard Worker        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
1132*c0909341SAndroid Build Coastguard Worker4:
1133*c0909341SAndroid Build Coastguard Worker        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
1134*c0909341SAndroid Build Coastguard Worker        shll            v22.8h,  v4.8b,   #8      // bottom*256
1135*c0909341SAndroid Build Coastguard Worker        shll            v23.8h,  v4.8b,   #8
1136*c0909341SAndroid Build Coastguard Worker        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
1137*c0909341SAndroid Build Coastguard Worker        zip1            v18.2s,  v18.2s,  v19.2s
1138*c0909341SAndroid Build Coastguard Worker        uxtl            v16.8h,  v16.8b           // weights_ver
1139*c0909341SAndroid Build Coastguard Worker        uxtl            v18.8h,  v18.8b
1140*c0909341SAndroid Build Coastguard Worker        mla             v22.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
1141*c0909341SAndroid Build Coastguard Worker        mla             v23.8h,  v6.8h,   v18.8h
1142*c0909341SAndroid Build Coastguard Worker        rshrn           v22.8b,  v22.8h,  #8
1143*c0909341SAndroid Build Coastguard Worker        rshrn           v23.8b,  v23.8h,  #8
1144*c0909341SAndroid Build Coastguard Worker        st1             {v22.s}[0], [x0], x1
1145*c0909341SAndroid Build Coastguard Worker        st1             {v22.s}[1], [x6], x1
1146*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
1147*c0909341SAndroid Build Coastguard Worker        st1             {v23.s}[0], [x0], x1
1148*c0909341SAndroid Build Coastguard Worker        st1             {v23.s}[1], [x6], x1
1149*c0909341SAndroid Build Coastguard Worker        b.gt            4b
1150*c0909341SAndroid Build Coastguard Worker        ret
1151*c0909341SAndroid Build Coastguard Worker80:
1152*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1153*c0909341SAndroid Build Coastguard Worker        ld1             {v6.8b}, [x2]             // top
1154*c0909341SAndroid Build Coastguard Worker        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
1155*c0909341SAndroid Build Coastguard Worker8:
1156*c0909341SAndroid Build Coastguard Worker        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
1157*c0909341SAndroid Build Coastguard Worker        shll            v24.8h,  v4.8b,   #8      // bottom*256
1158*c0909341SAndroid Build Coastguard Worker        shll            v25.8h,  v4.8b,   #8
1159*c0909341SAndroid Build Coastguard Worker        shll            v26.8h,  v4.8b,   #8
1160*c0909341SAndroid Build Coastguard Worker        shll            v27.8h,  v4.8b,   #8
1161*c0909341SAndroid Build Coastguard Worker        uxtl            v16.8h,  v16.8b           // weights_ver
1162*c0909341SAndroid Build Coastguard Worker        uxtl            v17.8h,  v17.8b
1163*c0909341SAndroid Build Coastguard Worker        uxtl            v18.8h,  v18.8b
1164*c0909341SAndroid Build Coastguard Worker        uxtl            v19.8h,  v19.8b
1165*c0909341SAndroid Build Coastguard Worker        mla             v24.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
1166*c0909341SAndroid Build Coastguard Worker        mla             v25.8h,  v6.8h,   v17.8h
1167*c0909341SAndroid Build Coastguard Worker        mla             v26.8h,  v6.8h,   v18.8h
1168*c0909341SAndroid Build Coastguard Worker        mla             v27.8h,  v6.8h,   v19.8h
1169*c0909341SAndroid Build Coastguard Worker        rshrn           v24.8b,  v24.8h,  #8
1170*c0909341SAndroid Build Coastguard Worker        rshrn           v25.8b,  v25.8h,  #8
1171*c0909341SAndroid Build Coastguard Worker        rshrn           v26.8b,  v26.8h,  #8
1172*c0909341SAndroid Build Coastguard Worker        rshrn           v27.8b,  v27.8h,  #8
1173*c0909341SAndroid Build Coastguard Worker        st1             {v24.8b}, [x0], x1
1174*c0909341SAndroid Build Coastguard Worker        st1             {v25.8b}, [x6], x1
1175*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
1176*c0909341SAndroid Build Coastguard Worker        st1             {v26.8b}, [x0], x1
1177*c0909341SAndroid Build Coastguard Worker        st1             {v27.8b}, [x6], x1
1178*c0909341SAndroid Build Coastguard Worker        b.gt            8b
1179*c0909341SAndroid Build Coastguard Worker        ret
1180*c0909341SAndroid Build Coastguard Worker160:
1181*c0909341SAndroid Build Coastguard Worker320:
1182*c0909341SAndroid Build Coastguard Worker640:
1183*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1184*c0909341SAndroid Build Coastguard Worker        // Set up pointers for four rows in parallel; x0, x6, x5, x8
1185*c0909341SAndroid Build Coastguard Worker        add             x5,  x0,  x1
1186*c0909341SAndroid Build Coastguard Worker        add             x8,  x6,  x1
1187*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
1188*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  w3, uxtw
1189*c0909341SAndroid Build Coastguard Worker        mov             w9,  w3
1190*c0909341SAndroid Build Coastguard Worker
1191*c0909341SAndroid Build Coastguard Worker1:
1192*c0909341SAndroid Build Coastguard Worker        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
1193*c0909341SAndroid Build Coastguard Worker        uxtl            v16.8h,  v16.8b           // weights_ver
1194*c0909341SAndroid Build Coastguard Worker        uxtl            v17.8h,  v17.8b
1195*c0909341SAndroid Build Coastguard Worker        uxtl            v18.8h,  v18.8b
1196*c0909341SAndroid Build Coastguard Worker        uxtl            v19.8h,  v19.8b
1197*c0909341SAndroid Build Coastguard Worker2:
1198*c0909341SAndroid Build Coastguard Worker        ld1             {v3.16b}, [x2],   #16     // top
1199*c0909341SAndroid Build Coastguard Worker        shll            v20.8h,  v4.8b,   #8      // bottom*256
1200*c0909341SAndroid Build Coastguard Worker        shll            v21.8h,  v4.8b,   #8
1201*c0909341SAndroid Build Coastguard Worker        shll            v22.8h,  v4.8b,   #8
1202*c0909341SAndroid Build Coastguard Worker        shll            v23.8h,  v4.8b,   #8
1203*c0909341SAndroid Build Coastguard Worker        shll            v24.8h,  v4.8b,   #8
1204*c0909341SAndroid Build Coastguard Worker        shll            v25.8h,  v4.8b,   #8
1205*c0909341SAndroid Build Coastguard Worker        shll            v26.8h,  v4.8b,   #8
1206*c0909341SAndroid Build Coastguard Worker        shll            v27.8h,  v4.8b,   #8
1207*c0909341SAndroid Build Coastguard Worker        usubl           v2.8h,   v3.8b,   v4.8b   // top-bottom
1208*c0909341SAndroid Build Coastguard Worker        usubl2          v3.8h,   v3.16b,  v4.16b
1209*c0909341SAndroid Build Coastguard Worker        mla             v20.8h,  v2.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
1210*c0909341SAndroid Build Coastguard Worker        mla             v21.8h,  v3.8h,   v16.8h
1211*c0909341SAndroid Build Coastguard Worker        mla             v22.8h,  v2.8h,   v17.8h
1212*c0909341SAndroid Build Coastguard Worker        mla             v23.8h,  v3.8h,   v17.8h
1213*c0909341SAndroid Build Coastguard Worker        mla             v24.8h,  v2.8h,   v18.8h
1214*c0909341SAndroid Build Coastguard Worker        mla             v25.8h,  v3.8h,   v18.8h
1215*c0909341SAndroid Build Coastguard Worker        mla             v26.8h,  v2.8h,   v19.8h
1216*c0909341SAndroid Build Coastguard Worker        mla             v27.8h,  v3.8h,   v19.8h
1217*c0909341SAndroid Build Coastguard Worker        rshrn           v20.8b,  v20.8h,  #8
1218*c0909341SAndroid Build Coastguard Worker        rshrn2          v20.16b, v21.8h,  #8
1219*c0909341SAndroid Build Coastguard Worker        rshrn           v22.8b,  v22.8h,  #8
1220*c0909341SAndroid Build Coastguard Worker        rshrn2          v22.16b, v23.8h,  #8
1221*c0909341SAndroid Build Coastguard Worker        rshrn           v24.8b,  v24.8h,  #8
1222*c0909341SAndroid Build Coastguard Worker        rshrn2          v24.16b, v25.8h,  #8
1223*c0909341SAndroid Build Coastguard Worker        rshrn           v26.8b,  v26.8h,  #8
1224*c0909341SAndroid Build Coastguard Worker        rshrn2          v26.16b, v27.8h,  #8
1225*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #16
1226*c0909341SAndroid Build Coastguard Worker        st1             {v20.16b}, [x0],  #16
1227*c0909341SAndroid Build Coastguard Worker        st1             {v22.16b}, [x6],  #16
1228*c0909341SAndroid Build Coastguard Worker        st1             {v24.16b}, [x5],  #16
1229*c0909341SAndroid Build Coastguard Worker        st1             {v26.16b}, [x8],  #16
1230*c0909341SAndroid Build Coastguard Worker        b.gt            2b
1231*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
1232*c0909341SAndroid Build Coastguard Worker        b.le            9f
1233*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  w9, uxtw
1234*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1
1235*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  x1
1236*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x1
1237*c0909341SAndroid Build Coastguard Worker        add             x8,  x8,  x1
1238*c0909341SAndroid Build Coastguard Worker        mov             w3,  w9
1239*c0909341SAndroid Build Coastguard Worker        b               1b
1240*c0909341SAndroid Build Coastguard Worker9:
1241*c0909341SAndroid Build Coastguard Worker        ret
1242*c0909341SAndroid Build Coastguard Workerendfunc
1243*c0909341SAndroid Build Coastguard Worker
1244*c0909341SAndroid Build Coastguard Workerjumptable ipred_smooth_v_tbl
// Each entry is a 32-bit offset from the start of this table to one of the
// numbered local labels (640:, 320:, 160:, 80:, 40:) that precede the table,
// listed from the 640 label down to the 40 label.
// NOTE(review): the code that indexes this table lies outside this excerpt;
// presumably the index is derived from the block width in the same way as in
// ipred_smooth_h_8bpc_neon - confirm against the dispatch code.
1245*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_smooth_v_tbl
1246*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_smooth_v_tbl
1247*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_smooth_v_tbl
1248*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_smooth_v_tbl
1249*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_smooth_v_tbl
1250*c0909341SAndroid Build Coastguard Workerendjumptable
1251*c0909341SAndroid Build Coastguard Worker
1252*c0909341SAndroid Build Coastguard Worker// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
1253*c0909341SAndroid Build Coastguard Worker//                               const pixel *const topleft,
1254*c0909341SAndroid Build Coastguard Worker//                               const int width, const int height, const int a,
1255*c0909341SAndroid Build Coastguard Worker//                               const int max_width, const int max_height);
//
// Horizontal smooth intra prediction for 8-bit pixels. Every output pixel is
//     (right*256 + (left[y] - right)*weights_hor[x] + 128) >> 8
// where right = topleft[width], left[y] = topleft[-1 - y] and
// weights_hor = sm_weights + width. All arithmetic is done in 16-bit lanes.
//
// Register use on entry (AAPCS64 argument order of the prototype above):
//     x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height
// The remaining arguments are never read; x5-x7 are reused as scratch.
// Four rows are produced per loop iteration in every code path, and the row
// counter w4 is decremented by 4 each time.
1256*c0909341SAndroid Build Coastguard Workerfunction ipred_smooth_h_8bpc_neon, export=1
1257*c0909341SAndroid Build Coastguard Worker        movrel          x8,  X(sm_weights)
1258*c0909341SAndroid Build Coastguard Worker        add             x8,  x8,  w3, uxtw        // x8 = &sm_weights[width] (weights_hor)
1259*c0909341SAndroid Build Coastguard Worker        clz             w9,  w3
1260*c0909341SAndroid Build Coastguard Worker        movrel          x5,  ipred_smooth_h_tbl
1261*c0909341SAndroid Build Coastguard Worker        add             x12, x2,  w3, uxtw        // x12 = &topleft[width]
1262*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #25             // table index = clz(width) - 25: 64 -> 0 ... 4 -> 4
1263*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x5, w9, uxtw #2]    // signed 32-bit offset relative to the table
1264*c0909341SAndroid Build Coastguard Worker        ld1r            {v5.16b},  [x12] // right
1265*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x9              // absolute address of the width-specific code
1266*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1              // x6 = second row pointer (dst + stride)
1267*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // x1 = 2*stride; x0 and x6 each step two rows
1268*c0909341SAndroid Build Coastguard Worker        br              x5
        // width == 4: two rows share one 8-lane vector (4 pixels each).
1269*c0909341SAndroid Build Coastguard Worker40:
1270*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1271*c0909341SAndroid Build Coastguard Worker        ld1r            {v7.2s}, [x8]             // weights_hor
1272*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #4              // x2 = &topleft[-4]: first group of four left pixels
1273*c0909341SAndroid Build Coastguard Worker        mov             x7,  #-4                  // post-index step: walk the left edge downwards in memory
1274*c0909341SAndroid Build Coastguard Worker        uxtl            v7.8h,   v7.8b            // weights_hor
1275*c0909341SAndroid Build Coastguard Worker4:
        // ld4r broadcasts four consecutive left pixels, one per register.
        // v3 holds the pixel at the highest address, which belongs to the
        // first of the four rows, hence the flipped register order below.
1276*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
1277*c0909341SAndroid Build Coastguard Worker        shll            v20.8h,  v5.8b,   #8      // right*256
1278*c0909341SAndroid Build Coastguard Worker        shll            v21.8h,  v5.8b,   #8
1279*c0909341SAndroid Build Coastguard Worker        zip1            v1.2s,   v1.2s,   v0.2s   // left, flipped
1280*c0909341SAndroid Build Coastguard Worker        zip1            v0.2s,   v3.2s,   v2.2s
1281*c0909341SAndroid Build Coastguard Worker        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
1282*c0909341SAndroid Build Coastguard Worker        usubl           v1.8h,   v1.8b,   v5.8b
1283*c0909341SAndroid Build Coastguard Worker        mla             v20.8h,  v0.8h,   v7.8h   // right*256  + (left-right)*weights_hor
1284*c0909341SAndroid Build Coastguard Worker        mla             v21.8h,  v1.8h,   v7.8h
1285*c0909341SAndroid Build Coastguard Worker        rshrn           v20.8b,  v20.8h,  #8      // rounding shift: (x + 128) >> 8, narrowed to 8 bits
1286*c0909341SAndroid Build Coastguard Worker        rshrn           v21.8b,  v21.8h,  #8
1287*c0909341SAndroid Build Coastguard Worker        st1             {v20.s}[0], [x0], x1
1288*c0909341SAndroid Build Coastguard Worker        st1             {v20.s}[1], [x6], x1
1289*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4              // flags consumed by b.gt; the stores do not alter them
1290*c0909341SAndroid Build Coastguard Worker        st1             {v21.s}[0], [x0], x1
1291*c0909341SAndroid Build Coastguard Worker        st1             {v21.s}[1], [x6], x1
1292*c0909341SAndroid Build Coastguard Worker        b.gt            4b
1293*c0909341SAndroid Build Coastguard Worker        ret
        // width == 8: one 8-lane vector per row.
1294*c0909341SAndroid Build Coastguard Worker80:
1295*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1296*c0909341SAndroid Build Coastguard Worker        ld1             {v7.8b}, [x8]             // weights_hor
1297*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #4              // x2 = &topleft[-4]
1298*c0909341SAndroid Build Coastguard Worker        mov             x7,  #-4                  // post-index step for the left-pixel loads
1299*c0909341SAndroid Build Coastguard Worker        uxtl            v7.8h,   v7.8b            // weights_hor
1300*c0909341SAndroid Build Coastguard Worker8:
1301*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
1302*c0909341SAndroid Build Coastguard Worker        shll            v20.8h,  v5.8b,   #8      // right*256
1303*c0909341SAndroid Build Coastguard Worker        shll            v21.8h,  v5.8b,   #8
1304*c0909341SAndroid Build Coastguard Worker        shll            v22.8h,  v5.8b,   #8
1305*c0909341SAndroid Build Coastguard Worker        shll            v23.8h,  v5.8b,   #8
1306*c0909341SAndroid Build Coastguard Worker        usubl           v3.8h,   v3.8b,   v5.8b   // left-right
1307*c0909341SAndroid Build Coastguard Worker        usubl           v2.8h,   v2.8b,   v5.8b
1308*c0909341SAndroid Build Coastguard Worker        usubl           v1.8h,   v1.8b,   v5.8b
1309*c0909341SAndroid Build Coastguard Worker        usubl           v0.8h,   v0.8b,   v5.8b
1310*c0909341SAndroid Build Coastguard Worker        mla             v20.8h,  v3.8h,   v7.8h   // right*256  + (left-right)*weights_hor
1311*c0909341SAndroid Build Coastguard Worker        mla             v21.8h,  v2.8h,   v7.8h   // (left flipped)
1312*c0909341SAndroid Build Coastguard Worker        mla             v22.8h,  v1.8h,   v7.8h
1313*c0909341SAndroid Build Coastguard Worker        mla             v23.8h,  v0.8h,   v7.8h
1314*c0909341SAndroid Build Coastguard Worker        rshrn           v20.8b,  v20.8h,  #8      // rounding shift: (x + 128) >> 8, narrowed to 8 bits
1315*c0909341SAndroid Build Coastguard Worker        rshrn           v21.8b,  v21.8h,  #8
1316*c0909341SAndroid Build Coastguard Worker        rshrn           v22.8b,  v22.8h,  #8
1317*c0909341SAndroid Build Coastguard Worker        rshrn           v23.8b,  v23.8h,  #8
1318*c0909341SAndroid Build Coastguard Worker        st1             {v20.8b}, [x0], x1
1319*c0909341SAndroid Build Coastguard Worker        st1             {v21.8b}, [x6], x1
1320*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4              // flags consumed by b.gt; the stores do not alter them
1321*c0909341SAndroid Build Coastguard Worker        st1             {v22.8b}, [x0], x1
1322*c0909341SAndroid Build Coastguard Worker        st1             {v23.8b}, [x6], x1
1323*c0909341SAndroid Build Coastguard Worker        b.gt            8b
1324*c0909341SAndroid Build Coastguard Worker        ret
        // width == 16, 32 or 64: shared path, 16 pixels of four rows per
        // inner-loop iteration.
1325*c0909341SAndroid Build Coastguard Worker160:
1326*c0909341SAndroid Build Coastguard Worker320:
1327*c0909341SAndroid Build Coastguard Worker640:
1328*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1329*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #4              // x2 = &topleft[-4]
1330*c0909341SAndroid Build Coastguard Worker        mov             x7,  #-4                  // post-index step for the left-pixel loads
1331*c0909341SAndroid Build Coastguard Worker        // Set up pointers for four rows in parallel; x0, x6, x5, x10
1332*c0909341SAndroid Build Coastguard Worker        add             x5,  x0,  x1              // x1 is 2*stride here: x5 = row 2
1333*c0909341SAndroid Build Coastguard Worker        add             x10, x6,  x1              // x10 = row 3
1334*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // 4*stride
1335*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  w3, uxtw        // 4*stride - width: step from row end to 4 rows down
1336*c0909341SAndroid Build Coastguard Worker        mov             w9,  w3                   // keep width; w3 is the inner-loop counter
1337*c0909341SAndroid Build Coastguard Worker
        // Outer loop: once per group of four rows.
1338*c0909341SAndroid Build Coastguard Worker1:
1339*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},   [x2],  x7 // left
1340*c0909341SAndroid Build Coastguard Worker        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
1341*c0909341SAndroid Build Coastguard Worker        usubl           v1.8h,   v1.8b,   v5.8b
1342*c0909341SAndroid Build Coastguard Worker        usubl           v2.8h,   v2.8b,   v5.8b
1343*c0909341SAndroid Build Coastguard Worker        usubl           v3.8h,   v3.8b,   v5.8b
        // Inner loop: once per 16 output columns.
1344*c0909341SAndroid Build Coastguard Worker2:
1345*c0909341SAndroid Build Coastguard Worker        ld1             {v7.16b}, [x8],   #16     // weights_hor
1346*c0909341SAndroid Build Coastguard Worker        shll            v20.8h,  v5.8b,   #8      // right*256
1347*c0909341SAndroid Build Coastguard Worker        shll            v21.8h,  v5.8b,   #8
1348*c0909341SAndroid Build Coastguard Worker        shll            v22.8h,  v5.8b,   #8
1349*c0909341SAndroid Build Coastguard Worker        shll            v23.8h,  v5.8b,   #8
1350*c0909341SAndroid Build Coastguard Worker        shll            v24.8h,  v5.8b,   #8
1351*c0909341SAndroid Build Coastguard Worker        shll            v25.8h,  v5.8b,   #8
1352*c0909341SAndroid Build Coastguard Worker        shll            v26.8h,  v5.8b,   #8
1353*c0909341SAndroid Build Coastguard Worker        shll            v27.8h,  v5.8b,   #8
1354*c0909341SAndroid Build Coastguard Worker        uxtl            v6.8h,   v7.8b            // weights_hor
1355*c0909341SAndroid Build Coastguard Worker        uxtl2           v7.8h,   v7.16b
1356*c0909341SAndroid Build Coastguard Worker        mla             v20.8h,  v3.8h,   v6.8h   // right*256  + (left-right)*weights_hor
1357*c0909341SAndroid Build Coastguard Worker        mla             v21.8h,  v3.8h,   v7.8h   // (left flipped)
1358*c0909341SAndroid Build Coastguard Worker        mla             v22.8h,  v2.8h,   v6.8h
1359*c0909341SAndroid Build Coastguard Worker        mla             v23.8h,  v2.8h,   v7.8h
1360*c0909341SAndroid Build Coastguard Worker        mla             v24.8h,  v1.8h,   v6.8h
1361*c0909341SAndroid Build Coastguard Worker        mla             v25.8h,  v1.8h,   v7.8h
1362*c0909341SAndroid Build Coastguard Worker        mla             v26.8h,  v0.8h,   v6.8h
1363*c0909341SAndroid Build Coastguard Worker        mla             v27.8h,  v0.8h,   v7.8h
1364*c0909341SAndroid Build Coastguard Worker        rshrn           v20.8b,  v20.8h,  #8      // rounding shift: (x + 128) >> 8, narrowed to 8 bits
1365*c0909341SAndroid Build Coastguard Worker        rshrn2          v20.16b, v21.8h,  #8
1366*c0909341SAndroid Build Coastguard Worker        rshrn           v22.8b,  v22.8h,  #8
1367*c0909341SAndroid Build Coastguard Worker        rshrn2          v22.16b, v23.8h,  #8
1368*c0909341SAndroid Build Coastguard Worker        rshrn           v24.8b,  v24.8h,  #8
1369*c0909341SAndroid Build Coastguard Worker        rshrn2          v24.16b, v25.8h,  #8
1370*c0909341SAndroid Build Coastguard Worker        rshrn           v26.8b,  v26.8h,  #8
1371*c0909341SAndroid Build Coastguard Worker        rshrn2          v26.16b, v27.8h,  #8
1372*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #16             // columns left in this group of rows
1373*c0909341SAndroid Build Coastguard Worker        st1             {v20.16b}, [x0],  #16
1374*c0909341SAndroid Build Coastguard Worker        st1             {v22.16b}, [x6],  #16
1375*c0909341SAndroid Build Coastguard Worker        st1             {v24.16b}, [x5],  #16
1376*c0909341SAndroid Build Coastguard Worker        st1             {v26.16b}, [x10], #16
1377*c0909341SAndroid Build Coastguard Worker        b.gt            2b
1378*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4              // four rows finished
1379*c0909341SAndroid Build Coastguard Worker        b.le            9f
1380*c0909341SAndroid Build Coastguard Worker        sub             x8,  x8,  w9, uxtw        // rewind the weights pointer to column 0
1381*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1              // advance all four row pointers by 4*stride - width
1382*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  x1
1383*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x1
1384*c0909341SAndroid Build Coastguard Worker        add             x10, x10, x1
1385*c0909341SAndroid Build Coastguard Worker        mov             w3,  w9                   // reload the column counter
1386*c0909341SAndroid Build Coastguard Worker        b               1b
1387*c0909341SAndroid Build Coastguard Worker9:
1388*c0909341SAndroid Build Coastguard Worker        ret
1389*c0909341SAndroid Build Coastguard Workerendfunc
1390*c0909341SAndroid Build Coastguard Worker
1391*c0909341SAndroid Build Coastguard Workerjumptable ipred_smooth_h_tbl
// Dispatch table for ipred_smooth_h_8bpc_neon. Each entry is a 32-bit offset
// from the start of this table to a numbered local label of that function.
// The function indexes it with clz(width) - 25, so entry 0 is taken for
// width 64 and entry 4 for width 4; the loaded offset is added to the table
// address to form the branch target.
1392*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_smooth_h_tbl
1393*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_smooth_h_tbl
1394*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_smooth_h_tbl
1395*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_smooth_h_tbl
1396*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_smooth_h_tbl
1397*c0909341SAndroid Build Coastguard Workerendjumptable
1398*c0909341SAndroid Build Coastguard Worker
// Sliding-window byte mask: 32 bytes of 0x00 immediately followed by 32 bytes
// of 0xff, with the label padding_mask placed on the boundary. The edge
// functions in this file load 16 bytes from (padding_mask - n); the loaded
// vector is 0x00 in lanes below n and 0xff from lane n upwards, and is then
// used with BIT to overwrite the lanes from n upwards with a padding pixel.
1399*c0909341SAndroid Build Coastguard Workerconst padding_mask_buf
1400*c0909341SAndroid Build Coastguard Worker        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
1401*c0909341SAndroid Build Coastguard Worker        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
1402*c0909341SAndroid Build Coastguard Worker        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
1403*c0909341SAndroid Build Coastguard Worker        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
1404*c0909341SAndroid Build Coastguard Workerpadding_mask:
1405*c0909341SAndroid Build Coastguard Worker        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
1406*c0909341SAndroid Build Coastguard Worker        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
1407*c0909341SAndroid Build Coastguard Worker        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
1408*c0909341SAndroid Build Coastguard Worker        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
1409*c0909341SAndroid Build Coastguard Workerendconst
1410*c0909341SAndroid Build Coastguard Worker
1411*c0909341SAndroid Build Coastguard Worker// void ipred_z1_upsample_edge_8bpc_neon(pixel *out, const int hsz,
1412*c0909341SAndroid Build Coastguard Worker//                                       const pixel *const in, const int end);
//
// Doubles the resolution of an edge: the output alternates between an input
// pixel and a pixel interpolated with the 4-tap kernel (-1, 9, 9, -1)/16.
// Registers (AAPCS64 order of the prototype above): x0 = out, w1 = hsz,
// x2 = in, w3 = end. hsz is never read here: 16 input bytes are always
// loaded and 32 output bytes are always stored. Lanes from index `end`
// upwards are replaced by in[end] before filtering.
1413*c0909341SAndroid Build Coastguard Workerfunction ipred_z1_upsample_edge_8bpc_neon, export=1
1414*c0909341SAndroid Build Coastguard Worker        movrel          x4,  padding_mask
1415*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b},  [x2]           // in[]
1416*c0909341SAndroid Build Coastguard Worker        add             x5,  x2,  w3,  uxtw       // in[end]
1417*c0909341SAndroid Build Coastguard Worker        sub             x4,  x4,  w3,  uxtw       // mask window: 0x00 below lane `end`, 0xff from it upwards
1418*c0909341SAndroid Build Coastguard Worker
1419*c0909341SAndroid Build Coastguard Worker        ld1r            {v1.16b},  [x5]           // padding
1420*c0909341SAndroid Build Coastguard Worker        ld1             {v3.16b},  [x4]           // padding_mask
1421*c0909341SAndroid Build Coastguard Worker
1422*c0909341SAndroid Build Coastguard Worker        movi            v31.8h,  #9               // centre-tap coefficient
1423*c0909341SAndroid Build Coastguard Worker
1424*c0909341SAndroid Build Coastguard Worker        bit             v0.16b,  v1.16b,  v3.16b  // padded in[]
1425*c0909341SAndroid Build Coastguard Worker
        // Shifted copies of the padded input; the vacated top lanes are
        // filled with the padding pixel from v1.
1426*c0909341SAndroid Build Coastguard Worker        ext             v4.16b,  v0.16b,  v1.16b,  #1     // in[i+1]
1427*c0909341SAndroid Build Coastguard Worker        ext             v5.16b,  v0.16b,  v1.16b,  #2     // in[i+2]
1428*c0909341SAndroid Build Coastguard Worker        ext             v6.16b,  v0.16b,  v1.16b,  #3     // in[i+3]
1429*c0909341SAndroid Build Coastguard Worker
1430*c0909341SAndroid Build Coastguard Worker        uaddl           v16.8h,  v4.8b,   v5.8b   // in[i+1] + in[i+2]
1431*c0909341SAndroid Build Coastguard Worker        uaddl2          v17.8h,  v4.16b,  v5.16b
1432*c0909341SAndroid Build Coastguard Worker        uaddl           v18.8h,  v0.8b,   v6.8b   // in[i+0] + in[i+3]
1433*c0909341SAndroid Build Coastguard Worker        uaddl2          v19.8h,  v0.16b,  v6.16b
1434*c0909341SAndroid Build Coastguard Worker        mul             v16.8h,  v16.8h,  v31.8h  // 9*(in[i+1] + in[i+2])
1435*c0909341SAndroid Build Coastguard Worker        mul             v17.8h,  v17.8h,  v31.8h
1436*c0909341SAndroid Build Coastguard Worker        sub             v16.8h,  v16.8h,  v18.8h  // 9*(in[i+1] + in[i+2]) - (in[i+0] + in[i+3])
1437*c0909341SAndroid Build Coastguard Worker        sub             v17.8h,  v17.8h,  v19.8h
1438*c0909341SAndroid Build Coastguard Worker
        // Signed input, rounding shift by 4, saturated to the unsigned
        // 8-bit range: the filter result may be negative or exceed 255.
1439*c0909341SAndroid Build Coastguard Worker        sqrshrun        v16.8b,  v16.8h,  #4
1440*c0909341SAndroid Build Coastguard Worker        sqrshrun2       v16.16b, v17.8h,  #4
1441*c0909341SAndroid Build Coastguard Worker
        // Interleave: even output bytes are the v4 lanes (in[i+1]), odd
        // output bytes are the interpolated pixels.
1442*c0909341SAndroid Build Coastguard Worker        zip1            v0.16b,  v4.16b,  v16.16b
1443*c0909341SAndroid Build Coastguard Worker        zip2            v1.16b,  v4.16b,  v16.16b
1444*c0909341SAndroid Build Coastguard Worker
1445*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b, v1.16b}, [x0]    // 32 output bytes
1446*c0909341SAndroid Build Coastguard Worker
1447*c0909341SAndroid Build Coastguard Worker        ret
1448*c0909341SAndroid Build Coastguard Workerendfunc
1449*c0909341SAndroid Build Coastguard Worker
1450*c0909341SAndroid Build Coastguard Worker// void ipred_z2_upsample_edge_8bpc_neon(pixel *out, const int sz,
1451*c0909341SAndroid Build Coastguard Worker//                                       const pixel *const in);
//
// Doubles the resolution of an edge with the 4-tap kernel (-1, 9, 9, -1)/16:
// out[2*i] = in[i] and out[2*i + 1] is interpolated between in[i] and
// in[i+1]. The sample before in[0] is taken to be in[0], and lanes from
// index sz upwards are replaced by in[sz].
// Registers (AAPCS64 order of the prototype above): x0 = out, w1 = sz,
// x2 = in. 16 input bytes are always loaded; 16 output bytes plus the single
// byte out[16] are always stored.
1452*c0909341SAndroid Build Coastguard Workerfunction ipred_z2_upsample_edge_8bpc_neon, export=1
1453*c0909341SAndroid Build Coastguard Worker        // Here, sz is 4 or 8, and we produce 2*sz+1 output elements.
1454*c0909341SAndroid Build Coastguard Worker        movrel          x4,  padding_mask
1455*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b},  [x2]           // in[]
1456*c0909341SAndroid Build Coastguard Worker        add             x5,  x2,  w1,  uxtw       // in[sz]
1457*c0909341SAndroid Build Coastguard Worker        sub             x4,  x4,  w1,  uxtw       // mask window: 0x00 below lane sz, 0xff from it upwards
1458*c0909341SAndroid Build Coastguard Worker
1459*c0909341SAndroid Build Coastguard Worker        ld1r            {v2.16b},  [x2]           // in[0] for padding
1460*c0909341SAndroid Build Coastguard Worker        ld1r            {v1.16b},  [x5]           // padding
1461*c0909341SAndroid Build Coastguard Worker        ld1             {v3.16b},  [x4]           // padding_mask
1462*c0909341SAndroid Build Coastguard Worker
1463*c0909341SAndroid Build Coastguard Worker        movi            v31.8h,  #9               // centre-tap coefficient
1464*c0909341SAndroid Build Coastguard Worker
1465*c0909341SAndroid Build Coastguard Worker        bit             v0.16b,  v1.16b,  v3.16b  // padded in[]
1466*c0909341SAndroid Build Coastguard Worker
1467*c0909341SAndroid Build Coastguard Worker        ext             v4.16b,  v2.16b,  v0.16b,  #15    // in[i-1], lane 0 = in[0]
1468*c0909341SAndroid Build Coastguard Worker        ext             v5.16b,  v0.16b,  v1.16b,  #1     // in[i+1]
1469*c0909341SAndroid Build Coastguard Worker        ext             v6.16b,  v0.16b,  v1.16b,  #2     // in[i+2]
1470*c0909341SAndroid Build Coastguard Worker
        // Only the low 8 lanes are filtered; sz is at most 8.
1471*c0909341SAndroid Build Coastguard Worker        uaddl           v16.8h,  v0.8b,   v5.8b   // in[i+0] + in[i+1]
1472*c0909341SAndroid Build Coastguard Worker        uaddl           v18.8h,  v4.8b,   v6.8b   // in[i-1] + in[i+2]
1473*c0909341SAndroid Build Coastguard Worker        mul             v16.8h,  v16.8h,  v31.8h  // 9*(in[i+0] + in[i+1])
1474*c0909341SAndroid Build Coastguard Worker        sub             v16.8h,  v16.8h,  v18.8h  // 9*(in[i+0] + in[i+1]) - (in[i-1] + in[i+2])
1475*c0909341SAndroid Build Coastguard Worker
1476*c0909341SAndroid Build Coastguard Worker        sqrshrun        v16.8b,  v16.8h,  #4      // rounding shift by 4, saturated to 0..255
1477*c0909341SAndroid Build Coastguard Worker
1478*c0909341SAndroid Build Coastguard Worker        add             x5,  x0,  #16             // x5 = &out[16]
1479*c0909341SAndroid Build Coastguard Worker
1480*c0909341SAndroid Build Coastguard Worker        zip1            v2.16b,  v0.16b,  v16.16b // interleave in[i] with the interpolated pixels
1481*c0909341SAndroid Build Coastguard Worker
1482*c0909341SAndroid Build Coastguard Worker        st1             {v1.b}[0], [x5]           // out[16] = in[sz]
1483*c0909341SAndroid Build Coastguard Worker        // The single-byte store above is what provides out[16] in case sz=8;
        // it is executed unconditionally. The 16-byte store below covers
        // out[0..15].
1484*c0909341SAndroid Build Coastguard Worker        st1             {v2.16b}, [x0]
1485*c0909341SAndroid Build Coastguard Worker
1486*c0909341SAndroid Build Coastguard Worker        ret
1487*c0909341SAndroid Build Coastguard Workerendfunc
1488*c0909341SAndroid Build Coastguard Worker
// Edge filter coefficients, one 4-byte row per strength (row 0 = strength 1,
// row 1 = strength 2). ipred_z1_filter_edge_8bpc_neon addresses this as
// edge_filter + (strength - 1)*4 + 1 and loads two bytes, i.e. bytes 1 and 2
// of the selected row; it applies them as byte1*in[i] + byte2*in[i+1] +
// byte1*in[i+2] with a rounding shift right by 4. strength == 3 branches to
// a separate five-tap path in that function and does not use this table.
1489*c0909341SAndroid Build Coastguard Workerconst edge_filter
1490*c0909341SAndroid Build Coastguard Worker        .byte 0, 4, 8, 0
1491*c0909341SAndroid Build Coastguard Worker        .byte 0, 5, 6, 0
1492*c0909341SAndroid Build Coastguard Worker// Leaving out the coeffs for strength=3
1493*c0909341SAndroid Build Coastguard Worker//      .byte 2, 4, 4, 0
1494*c0909341SAndroid Build Coastguard Workerendconst
1495*c0909341SAndroid Build Coastguard Worker
1496*c0909341SAndroid Build Coastguard Worker// void ipred_z1_filter_edge_8bpc_neon(pixel *out, const int sz,
1497*c0909341SAndroid Build Coastguard Worker//                                     const pixel *const in, const int end,
1498*c0909341SAndroid Build Coastguard Worker//                                     const int strength);
1499*c0909341SAndroid Build Coastguard Workerfunction ipred_z1_filter_edge_8bpc_neon, export=1
1500*c0909341SAndroid Build Coastguard Worker        cmp             w4, #3
1501*c0909341SAndroid Build Coastguard Worker        b.eq            L(fivetap)                // if (strength == 3) goto fivetap
1502*c0909341SAndroid Build Coastguard Worker
1503*c0909341SAndroid Build Coastguard Worker        movrel          x5,  edge_filter, -3
1504*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  w4,  uxtw #2    // edge_filter + (strength - 1)*4 + 1
1505*c0909341SAndroid Build Coastguard Worker
1506*c0909341SAndroid Build Coastguard Worker        ld1             {v31.h}[0], [x5]          // kernel[1-2]
1507*c0909341SAndroid Build Coastguard Worker
1508*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b}, [x2], #16
1509*c0909341SAndroid Build Coastguard Worker
1510*c0909341SAndroid Build Coastguard Worker        dup             v30.16b, v31.b[0]
1511*c0909341SAndroid Build Coastguard Worker        dup             v31.16b, v31.b[1]
1512*c0909341SAndroid Build Coastguard Worker1:
1513*c0909341SAndroid Build Coastguard Worker        // in[end], is the last valid pixel. We produce 16 pixels out by
1514*c0909341SAndroid Build Coastguard Worker        // using 18 pixels in - the last pixel used is [17] of the ones
1515*c0909341SAndroid Build Coastguard Worker        // read/buffered.
1516*c0909341SAndroid Build Coastguard Worker        cmp             w3,  #17
1517*c0909341SAndroid Build Coastguard Worker        ld1             {v1.16b}, [x2], #16
1518*c0909341SAndroid Build Coastguard Worker        b.lt            2f
1519*c0909341SAndroid Build Coastguard Worker        ext             v2.16b,  v0.16b,  v1.16b,  #1
1520*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v0.16b,  v1.16b,  #2
1521*c0909341SAndroid Build Coastguard Worker        umull           v4.8h,   v0.8b,   v30.8b
1522*c0909341SAndroid Build Coastguard Worker        umlal           v4.8h,   v2.8b,   v31.8b
1523*c0909341SAndroid Build Coastguard Worker        umlal           v4.8h,   v3.8b,   v30.8b
1524*c0909341SAndroid Build Coastguard Worker        umull2          v5.8h,   v0.16b,  v30.16b
1525*c0909341SAndroid Build Coastguard Worker        umlal2          v5.8h,   v2.16b,  v31.16b
1526*c0909341SAndroid Build Coastguard Worker        umlal2          v5.8h,   v3.16b,  v30.16b
1527*c0909341SAndroid Build Coastguard Worker        subs            w1,  w1,  #16
1528*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v1.16b
1529*c0909341SAndroid Build Coastguard Worker        rshrn           v4.8b,   v4.8h,   #4
1530*c0909341SAndroid Build Coastguard Worker        rshrn2          v4.16b,  v5.8h,   #4
1531*c0909341SAndroid Build Coastguard Worker        sub             w3,  w3,  #16
1532*c0909341SAndroid Build Coastguard Worker        st1             {v4.16b}, [x0], #16
1533*c0909341SAndroid Build Coastguard Worker        b.gt            1b
1534*c0909341SAndroid Build Coastguard Worker        ret
1535*c0909341SAndroid Build Coastguard Worker2:
1536*c0909341SAndroid Build Coastguard Worker        // Right padding
1537*c0909341SAndroid Build Coastguard Worker
1538*c0909341SAndroid Build Coastguard Worker        // x2[w3-32] is the padding pixel (x2 points 32 bytes ahead)
1539*c0909341SAndroid Build Coastguard Worker        movrel          x5,  padding_mask
1540*c0909341SAndroid Build Coastguard Worker        sub             w6,  w3,  #32
1541*c0909341SAndroid Build Coastguard Worker        sub             x5,  x5,  w3,  uxtw
1542*c0909341SAndroid Build Coastguard Worker        add             x6,  x2,  w6,  sxtw
1543*c0909341SAndroid Build Coastguard Worker
1544*c0909341SAndroid Build Coastguard Worker        ld1             {v2.16b}, [x5]            // padding_mask
1545*c0909341SAndroid Build Coastguard Worker
1546*c0909341SAndroid Build Coastguard Worker        ld1r            {v1.16b}, [x6]
1547*c0909341SAndroid Build Coastguard Worker        bit             v0.16b,  v1.16b,  v2.16b  // Pad v0-v1
1548*c0909341SAndroid Build Coastguard Worker
1549*c0909341SAndroid Build Coastguard Worker        // Filter one block
1550*c0909341SAndroid Build Coastguard Worker        ext             v2.16b,  v0.16b,  v1.16b,  #1
1551*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v0.16b,  v1.16b,  #2
1552*c0909341SAndroid Build Coastguard Worker        umull           v4.8h,   v0.8b,   v30.8b
1553*c0909341SAndroid Build Coastguard Worker        umlal           v4.8h,   v2.8b,   v31.8b
1554*c0909341SAndroid Build Coastguard Worker        umlal           v4.8h,   v3.8b,   v30.8b
1555*c0909341SAndroid Build Coastguard Worker        umull2          v5.8h,   v0.16b,  v30.16b
1556*c0909341SAndroid Build Coastguard Worker        umlal2          v5.8h,   v2.16b,  v31.16b
1557*c0909341SAndroid Build Coastguard Worker        umlal2          v5.8h,   v3.16b,  v30.16b
1558*c0909341SAndroid Build Coastguard Worker        subs            w1,  w1,  #16
1559*c0909341SAndroid Build Coastguard Worker        rshrn           v4.8b,   v4.8h,   #4
1560*c0909341SAndroid Build Coastguard Worker        rshrn2          v4.16b,  v5.8h,   #4
1561*c0909341SAndroid Build Coastguard Worker        st1             {v4.16b}, [x0], #16
1562*c0909341SAndroid Build Coastguard Worker        b.le            9f
1563*c0909341SAndroid Build Coastguard Worker5:
1564*c0909341SAndroid Build Coastguard Worker        // After one block, any remaining output would only be filtering
1565*c0909341SAndroid Build Coastguard Worker        // padding - thus just store the padding.
1566*c0909341SAndroid Build Coastguard Worker        subs            w1,  w1,  #16
1567*c0909341SAndroid Build Coastguard Worker        st1             {v1.16b}, [x0], #16
1568*c0909341SAndroid Build Coastguard Worker        b.gt            5b
1569*c0909341SAndroid Build Coastguard Worker9:
1570*c0909341SAndroid Build Coastguard Worker        ret
1571*c0909341SAndroid Build Coastguard Worker
1572*c0909341SAndroid Build Coastguard WorkerL(fivetap):
1573*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #1              // topleft -= 1
1574*c0909341SAndroid Build Coastguard Worker        movi            v29.16b, #2
1575*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b}, [x2], #16
1576*c0909341SAndroid Build Coastguard Worker        movi            v30.16b, #4
1577*c0909341SAndroid Build Coastguard Worker        movi            v31.16b, #4
1578*c0909341SAndroid Build Coastguard Worker        ins             v0.b[0], v0.b[1]
1579*c0909341SAndroid Build Coastguard Worker1:
1580*c0909341SAndroid Build Coastguard Worker        // in[end+1], is the last valid pixel. We produce 16 pixels out by
1581*c0909341SAndroid Build Coastguard Worker        // using 20 pixels in - the last pixel used is [19] of the ones
1582*c0909341SAndroid Build Coastguard Worker        // read/buffered.
1583*c0909341SAndroid Build Coastguard Worker        cmp             w3,  #18
1584*c0909341SAndroid Build Coastguard Worker        ld1             {v1.16b}, [x2], #16
1585*c0909341SAndroid Build Coastguard Worker        b.lt            2f                        // if (end + 1 < 19)
1586*c0909341SAndroid Build Coastguard Worker        ext             v2.16b,  v0.16b,  v1.16b,  #1
1587*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v0.16b,  v1.16b,  #2
1588*c0909341SAndroid Build Coastguard Worker        ext             v4.16b,  v0.16b,  v1.16b,  #3
1589*c0909341SAndroid Build Coastguard Worker        ext             v5.16b,  v0.16b,  v1.16b,  #4
1590*c0909341SAndroid Build Coastguard Worker        umull           v6.8h,   v0.8b,   v29.8b
1591*c0909341SAndroid Build Coastguard Worker        umlal           v6.8h,   v2.8b,   v30.8b
1592*c0909341SAndroid Build Coastguard Worker        umlal           v6.8h,   v3.8b,   v31.8b
1593*c0909341SAndroid Build Coastguard Worker        umlal           v6.8h,   v4.8b,   v30.8b
1594*c0909341SAndroid Build Coastguard Worker        umlal           v6.8h,   v5.8b,   v29.8b
1595*c0909341SAndroid Build Coastguard Worker        umull2          v7.8h,   v0.16b,  v29.16b
1596*c0909341SAndroid Build Coastguard Worker        umlal2          v7.8h,   v2.16b,  v30.16b
1597*c0909341SAndroid Build Coastguard Worker        umlal2          v7.8h,   v3.16b,  v31.16b
1598*c0909341SAndroid Build Coastguard Worker        umlal2          v7.8h,   v4.16b,  v30.16b
1599*c0909341SAndroid Build Coastguard Worker        umlal2          v7.8h,   v5.16b,  v29.16b
1600*c0909341SAndroid Build Coastguard Worker        subs            w1,  w1,  #16
1601*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v1.16b
1602*c0909341SAndroid Build Coastguard Worker        rshrn           v6.8b,   v6.8h,   #4
1603*c0909341SAndroid Build Coastguard Worker        rshrn2          v6.16b,  v7.8h,   #4
1604*c0909341SAndroid Build Coastguard Worker        sub             w3,  w3,  #16
1605*c0909341SAndroid Build Coastguard Worker        st1             {v6.16b}, [x0], #16
1606*c0909341SAndroid Build Coastguard Worker        b.gt            1b
1607*c0909341SAndroid Build Coastguard Worker        ret
1608*c0909341SAndroid Build Coastguard Worker2:
1609*c0909341SAndroid Build Coastguard Worker        // Right padding
1610*c0909341SAndroid Build Coastguard Worker
1611*c0909341SAndroid Build Coastguard Worker        // x2[w3+1-32] is the padding pixel (x2 points 32 bytes ahead)
1612*c0909341SAndroid Build Coastguard Worker        movrel          x5,  padding_mask, -1
1613*c0909341SAndroid Build Coastguard Worker        sub             w6,  w3,  #31
1614*c0909341SAndroid Build Coastguard Worker        sub             x5,  x5,  w3,  uxtw
1615*c0909341SAndroid Build Coastguard Worker        add             x6,  x2,  w6,  sxtw
1616*c0909341SAndroid Build Coastguard Worker
1617*c0909341SAndroid Build Coastguard Worker        ld1             {v2.16b, v3.16b}, [x5]    // padding_mask
1618*c0909341SAndroid Build Coastguard Worker
1619*c0909341SAndroid Build Coastguard Worker        ld1r            {v28.16b}, [x6]
1620*c0909341SAndroid Build Coastguard Worker        bit             v0.16b,  v28.16b, v2.16b  // Pad v0-v1
1621*c0909341SAndroid Build Coastguard Worker        bit             v1.16b,  v28.16b, v3.16b
1622*c0909341SAndroid Build Coastguard Worker4:
1623*c0909341SAndroid Build Coastguard Worker        // Filter one block
1624*c0909341SAndroid Build Coastguard Worker        ext             v2.16b,  v0.16b,  v1.16b,  #1
1625*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v0.16b,  v1.16b,  #2
1626*c0909341SAndroid Build Coastguard Worker        ext             v4.16b,  v0.16b,  v1.16b,  #3
1627*c0909341SAndroid Build Coastguard Worker        ext             v5.16b,  v0.16b,  v1.16b,  #4
1628*c0909341SAndroid Build Coastguard Worker        umull           v6.8h,   v0.8b,   v29.8b
1629*c0909341SAndroid Build Coastguard Worker        umlal           v6.8h,   v2.8b,   v30.8b
1630*c0909341SAndroid Build Coastguard Worker        umlal           v6.8h,   v3.8b,   v31.8b
1631*c0909341SAndroid Build Coastguard Worker        umlal           v6.8h,   v4.8b,   v30.8b
1632*c0909341SAndroid Build Coastguard Worker        umlal           v6.8h,   v5.8b,   v29.8b
1633*c0909341SAndroid Build Coastguard Worker        umull2          v7.8h,   v0.16b,  v29.16b
1634*c0909341SAndroid Build Coastguard Worker        umlal2          v7.8h,   v2.16b,  v30.16b
1635*c0909341SAndroid Build Coastguard Worker        umlal2          v7.8h,   v3.16b,  v31.16b
1636*c0909341SAndroid Build Coastguard Worker        umlal2          v7.8h,   v4.16b,  v30.16b
1637*c0909341SAndroid Build Coastguard Worker        umlal2          v7.8h,   v5.16b,  v29.16b
1638*c0909341SAndroid Build Coastguard Worker        subs            w1,  w1,  #16
1639*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v1.16b
1640*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v28.16b
1641*c0909341SAndroid Build Coastguard Worker        rshrn           v6.8b,   v6.8h,   #4
1642*c0909341SAndroid Build Coastguard Worker        rshrn2          v6.16b,  v7.8h,   #4
1643*c0909341SAndroid Build Coastguard Worker        sub             w3,  w3,  #16
1644*c0909341SAndroid Build Coastguard Worker        st1             {v6.16b}, [x0], #16
1645*c0909341SAndroid Build Coastguard Worker        b.le            9f
1646*c0909341SAndroid Build Coastguard Worker        // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to
1647*c0909341SAndroid Build Coastguard Worker        // filter properly once more - aka (w3 >= 0).
1648*c0909341SAndroid Build Coastguard Worker        cmp             w3,  #0
1649*c0909341SAndroid Build Coastguard Worker        b.ge            4b
1650*c0909341SAndroid Build Coastguard Worker5:
1651*c0909341SAndroid Build Coastguard Worker        // When w3 <= 0, all remaining pixels in v0-v1 are equal to the
1652*c0909341SAndroid Build Coastguard Worker        // last valid pixel - thus just output that without filtering.
1653*c0909341SAndroid Build Coastguard Worker        subs            w1,  w1,  #16
1654*c0909341SAndroid Build Coastguard Worker        st1             {v1.16b}, [x0], #16
1655*c0909341SAndroid Build Coastguard Worker        b.gt            5b
1656*c0909341SAndroid Build Coastguard Worker9:
1657*c0909341SAndroid Build Coastguard Worker        ret
1658*c0909341SAndroid Build Coastguard Workerendfunc
1659*c0909341SAndroid Build Coastguard Worker
1660*c0909341SAndroid Build Coastguard Worker// void ipred_pixel_set_8bpc_neon(pixel *out, const pixel px,
1661*c0909341SAndroid Build Coastguard Worker//                                const int n);
//
// Fills out[] with the byte px, 16 bytes per store. The store is executed
// before the loop condition is evaluated, so at least 16 bytes are always
// written and, for n > 0, the total written is n rounded up to a multiple
// of 16.
// Registers (prototype order, AAPCS64): x0 = out, w1 = px, w2 = n.
// x0 and w2 are advanced in place; v0 and the condition flags are clobbered.
1662*c0909341SAndroid Build Coastguard Workerfunction ipred_pixel_set_8bpc_neon, export=1
1663*c0909341SAndroid Build Coastguard Worker        dup             v0.16b,  w1               // broadcast px to all 16 byte lanes
1664*c0909341SAndroid Build Coastguard Worker1:
1665*c0909341SAndroid Build Coastguard Worker        subs            w2,  w2,  #16             // n -= 16; sets the flags tested by b.gt
1666*c0909341SAndroid Build Coastguard Worker        st1             {v0.16b}, [x0], #16       // store 16 bytes, out += 16 (flags unchanged)
1667*c0909341SAndroid Build Coastguard Worker        b.gt            1b                        // repeat while the remaining count is > 0 (signed)
1668*c0909341SAndroid Build Coastguard Worker        ret
1669*c0909341SAndroid Build Coastguard Workerendfunc
1670*c0909341SAndroid Build Coastguard Worker
1671*c0909341SAndroid Build Coastguard Worker// void ipred_z1_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
1672*c0909341SAndroid Build Coastguard Worker//                               const pixel *const top,
1673*c0909341SAndroid Build Coastguard Worker//                               const int width, const int height,
1674*c0909341SAndroid Build Coastguard Worker//                               const int dx, const int max_base_x);
//
// Fills a width x height block at dst, two rows per loop iteration (height
// is counted down in steps of 2). xpos starts at dx and advances by dx per
// row; for each row base = xpos >> 6 and frac = xpos & 0x3e, and every
// output pixel is
//   (top[base+x]*(64-frac) + top[base+x+1]*frac + 32) >> 6
// Only the first row of each pair is compared against max_base_x. Once its
// base is >= max_base_x (signed compare), that pair and all remaining rows
// are filled with the byte top[max_base_x].
//
// Registers (prototype order, AAPCS64):
//   x0 = dst, x1 = stride, x2 = top, w3 = width, w4 = height,
//   w5 = dx, w6 = max_base_x
//   w7 = xpos, w15 = 64, v31 = top[max_base_x] in every byte lane
//
// The entry point is selected through ipred_z1_fill1_tbl using
// clz(width) - 25 as index (five entries), so width is expected to be
// 4, 8, 16, 32 or 64.
// NOTE(review): AARCH64_VALID_JUMP_TARGET is a macro defined outside this
// chunk; presumably it marks the targets of the indirect `br x8` - confirm.
1675*c0909341SAndroid Build Coastguard Workerfunction ipred_z1_fill1_8bpc_neon, export=1
1676*c0909341SAndroid Build Coastguard Worker        clz             w9,  w3                   // leading zero count of width
1677*c0909341SAndroid Build Coastguard Worker        movrel          x8,  ipred_z1_fill1_tbl
1678*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #25             // table index: width 64 -> 0 ... width 4 -> 4
1679*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x8, w9, uxtw #2]    // sign-extended 32-bit offset from the table
1680*c0909341SAndroid Build Coastguard Worker        add             x10, x2,  w6,  uxtw       // top[max_base_x]
1681*c0909341SAndroid Build Coastguard Worker        add             x8,  x8,  x9              // table base + offset = entry address
1682*c0909341SAndroid Build Coastguard Worker        ld1r            {v31.16b}, [x10]          // padding
1683*c0909341SAndroid Build Coastguard Worker        mov             w7,  w5                   // xpos = dx
1684*c0909341SAndroid Build Coastguard Worker        mov             w15, #64                  // constant for computing 64 - frac
1685*c0909341SAndroid Build Coastguard Worker        br              x8                        // jump to the width-specific code
1686*c0909341SAndroid Build Coastguard Worker40:                                               // table entry for width 4
1687*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1688*c0909341SAndroid Build Coastguard Worker4:                                                // loop: two rows of 4 pixels
1689*c0909341SAndroid Build Coastguard Worker        lsr             w8,  w7,  #6              // base (first row)
1690*c0909341SAndroid Build Coastguard Worker        and             w9,  w7,  #0x3e           // frac (first row)
1691*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // xpos += dx
1692*c0909341SAndroid Build Coastguard Worker        cmp             w8,  w6                   // base >= max_base_x
1693*c0909341SAndroid Build Coastguard Worker        lsr             w10, w7,  #6              // base (second row)
1694*c0909341SAndroid Build Coastguard Worker        and             w11, w7,  #0x3e           // frac (second row)
1695*c0909341SAndroid Build Coastguard Worker        b.ge            49f                       // first row past the edge: padding only from here
1696*c0909341SAndroid Build Coastguard Worker        ldr             d0,  [x2, w8, uxtw]       // top[base]
1697*c0909341SAndroid Build Coastguard Worker        ldr             d2,  [x2, w10, uxtw]      // top[base] for the second row
1698*c0909341SAndroid Build Coastguard Worker        dup             v4.4h,   w9               // frac
1699*c0909341SAndroid Build Coastguard Worker        dup             v5.4h,   w11              // frac for the second row
1700*c0909341SAndroid Build Coastguard Worker        ext             v1.8b,   v0.8b,   v0.8b,   #1 // top[base+1]
1701*c0909341SAndroid Build Coastguard Worker        ext             v3.8b,   v2.8b,   v2.8b,   #1 // top[base+1] for the second row
1702*c0909341SAndroid Build Coastguard Worker        usubl           v6.8h,   v1.8b,   v0.8b   // top[base+1]-top[base]
1703*c0909341SAndroid Build Coastguard Worker        usubl           v7.8h,   v3.8b,   v2.8b
1704*c0909341SAndroid Build Coastguard Worker        ushll           v16.8h,  v0.8b,   #6      // top[base]*64
1705*c0909341SAndroid Build Coastguard Worker        ushll           v17.8h,  v2.8b,   #6
1706*c0909341SAndroid Build Coastguard Worker        mla             v16.4h,  v6.4h,   v4.4h   // + (top[base+1]-top[base])*frac
1707*c0909341SAndroid Build Coastguard Worker        mla             v17.4h,  v7.4h,   v5.4h
1708*c0909341SAndroid Build Coastguard Worker        rshrn           v16.8b,  v16.8h,  #6      // rounding shift back to 8 bit
1709*c0909341SAndroid Build Coastguard Worker        rshrn           v17.8b,  v17.8h,  #6
1710*c0909341SAndroid Build Coastguard Worker        st1             {v16.s}[0], [x0], x1      // store 4 pixels, dst += stride
1711*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // xpos += dx
1712*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2              // two rows done; flags used by b.gt below
1713*c0909341SAndroid Build Coastguard Worker        st1             {v17.s}[0], [x0], x1      // second row (does not alter the flags)
1714*c0909341SAndroid Build Coastguard Worker        b.gt            4b
1715*c0909341SAndroid Build Coastguard Worker        ret
1716*c0909341SAndroid Build Coastguard Worker
1717*c0909341SAndroid Build Coastguard Worker49:                                               // width 4: remaining rows are all padding
1718*c0909341SAndroid Build Coastguard Worker        st1             {v31.s}[0], [x0], x1
1719*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
1720*c0909341SAndroid Build Coastguard Worker        st1             {v31.s}[0], [x0], x1
1721*c0909341SAndroid Build Coastguard Worker        b.gt            49b
1722*c0909341SAndroid Build Coastguard Worker        ret
1723*c0909341SAndroid Build Coastguard Worker
1724*c0909341SAndroid Build Coastguard Worker80:                                               // table entry for width 8
1725*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1726*c0909341SAndroid Build Coastguard Worker8:                                                // loop: two rows of 8 pixels
1727*c0909341SAndroid Build Coastguard Worker        lsr             w8,  w7,  #6              // base (first row)
1728*c0909341SAndroid Build Coastguard Worker        and             w9,  w7,  #0x3e           // frac (first row)
1729*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // xpos += dx
1730*c0909341SAndroid Build Coastguard Worker        cmp             w8,  w6                   // base >= max_base_x
1731*c0909341SAndroid Build Coastguard Worker        lsr             w10, w7,  #6              // base (second row)
1732*c0909341SAndroid Build Coastguard Worker        and             w11, w7,  #0x3e           // frac (second row)
1733*c0909341SAndroid Build Coastguard Worker        b.ge            89f                       // first row past the edge: padding only from here
1734*c0909341SAndroid Build Coastguard Worker        ldr             q0,  [x2, w8, uxtw]       // top[base]; 16 bytes loaded, bytes 0..8 used
1735*c0909341SAndroid Build Coastguard Worker        ldr             q2,  [x2, w10, uxtw]
1736*c0909341SAndroid Build Coastguard Worker        dup             v4.8b,   w9               // frac
1737*c0909341SAndroid Build Coastguard Worker        dup             v5.8b,   w11
1738*c0909341SAndroid Build Coastguard Worker        sub             w9,  w15, w9              // 64 - frac
1739*c0909341SAndroid Build Coastguard Worker        sub             w11, w15, w11
1740*c0909341SAndroid Build Coastguard Worker        dup             v6.8b,   w9               // 64 - frac
1741*c0909341SAndroid Build Coastguard Worker        dup             v7.8b,   w11
1742*c0909341SAndroid Build Coastguard Worker        ext             v1.16b,  v0.16b,  v0.16b,  #1 // top[base+1]
1743*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v2.16b,  v2.16b,  #1
1744*c0909341SAndroid Build Coastguard Worker        umull           v16.8h,  v0.8b,   v6.8b   // top[base]*(64-frac)
1745*c0909341SAndroid Build Coastguard Worker        umlal           v16.8h,  v1.8b,   v4.8b   // + top[base+1]*frac
1746*c0909341SAndroid Build Coastguard Worker        umull           v17.8h,  v2.8b,   v7.8b
1747*c0909341SAndroid Build Coastguard Worker        umlal           v17.8h,  v3.8b,   v5.8b
1748*c0909341SAndroid Build Coastguard Worker        rshrn           v16.8b,  v16.8h,  #6      // rounding shift back to 8 bit
1749*c0909341SAndroid Build Coastguard Worker        rshrn           v17.8b,  v17.8h,  #6
1750*c0909341SAndroid Build Coastguard Worker        st1             {v16.8b}, [x0], x1        // store 8 pixels, dst += stride
1751*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // xpos += dx
1752*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2              // two rows done; flags used by b.gt below
1753*c0909341SAndroid Build Coastguard Worker        st1             {v17.8b}, [x0], x1
1754*c0909341SAndroid Build Coastguard Worker        b.gt            8b
1755*c0909341SAndroid Build Coastguard Worker        ret
1756*c0909341SAndroid Build Coastguard Worker
1757*c0909341SAndroid Build Coastguard Worker89:                                               // width 8: remaining rows are all padding
1758*c0909341SAndroid Build Coastguard Worker        st1             {v31.8b}, [x0], x1
1759*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
1760*c0909341SAndroid Build Coastguard Worker        st1             {v31.8b}, [x0], x1
1761*c0909341SAndroid Build Coastguard Worker        b.gt            89b
1762*c0909341SAndroid Build Coastguard Worker        ret
1763*c0909341SAndroid Build Coastguard Worker
1764*c0909341SAndroid Build Coastguard Worker160:                                              // table entries for width 16, 32 and 64
1765*c0909341SAndroid Build Coastguard Worker320:                                              // share one path that works in 16 pixel
1766*c0909341SAndroid Build Coastguard Worker640:                                              // chunks per row
1767*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1768*c0909341SAndroid Build Coastguard Worker
1769*c0909341SAndroid Build Coastguard Worker        mov             w12, w3                   // keep width; w3 becomes the per-row countdown
1770*c0909341SAndroid Build Coastguard Worker
1771*c0909341SAndroid Build Coastguard Worker        add             x13, x0,  x1              // x13 = second row of the pair
1772*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // 2*stride
1773*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  w3,  uxtw       // 2*stride - width: step applied after a full row
1774*c0909341SAndroid Build Coastguard Worker1:                                                // per row pair
1775*c0909341SAndroid Build Coastguard Worker        lsr             w8,  w7,  #6              // base (first row)
1776*c0909341SAndroid Build Coastguard Worker        and             w9,  w7,  #0x3e           // frac (first row)
1777*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // xpos += dx
1778*c0909341SAndroid Build Coastguard Worker        cmp             w8,  w6                   // base >= max_base_x
1779*c0909341SAndroid Build Coastguard Worker        lsr             w10, w7,  #6              // base (second row)
1780*c0909341SAndroid Build Coastguard Worker        and             w11, w7,  #0x3e           // frac (second row)
1781*c0909341SAndroid Build Coastguard Worker        b.ge            169f                      // first row past the edge: padding only from here
1782*c0909341SAndroid Build Coastguard Worker        add             x8,  x2,  w8,  uxtw       // x8 = &top[base] (first row)
1783*c0909341SAndroid Build Coastguard Worker        add             x10, x2,  w10, uxtw       // x10 = &top[base] (second row)
1784*c0909341SAndroid Build Coastguard Worker        dup             v4.16b,  w9               // frac
1785*c0909341SAndroid Build Coastguard Worker        dup             v5.16b,  w11
1786*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b}, [x8],  #32 // top[base]
1787*c0909341SAndroid Build Coastguard Worker        ld1             {v2.16b, v3.16b}, [x10], #32
1788*c0909341SAndroid Build Coastguard Worker        sub             w9,  w15, w9              // 64 - frac
1789*c0909341SAndroid Build Coastguard Worker        sub             w11, w15, w11
1790*c0909341SAndroid Build Coastguard Worker        dup             v6.16b,  w9               // 64 - frac
1791*c0909341SAndroid Build Coastguard Worker        dup             v7.16b,  w11
1792*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // xpos += dx
1793*c0909341SAndroid Build Coastguard Worker2:                                                // per 16 pixel chunk of both rows
1794*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v0.16b,  v1.16b,  #1 // top[base+1]
1795*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v2.16b,  v3.16b,  #1
1796*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #16             // flags are consumed by b.le after the stores
1797*c0909341SAndroid Build Coastguard Worker        umull           v18.8h,  v0.8b,   v6.8b   // top[base]*(64-frac)
1798*c0909341SAndroid Build Coastguard Worker        umlal           v18.8h,  v16.8b,  v4.8b   // + top[base+1]*frac
1799*c0909341SAndroid Build Coastguard Worker        umull2          v19.8h,  v0.16b,  v6.16b
1800*c0909341SAndroid Build Coastguard Worker        umlal2          v19.8h,  v16.16b, v4.16b
1801*c0909341SAndroid Build Coastguard Worker        umull           v20.8h,  v2.8b,   v7.8b
1802*c0909341SAndroid Build Coastguard Worker        umlal           v20.8h,  v17.8b,  v5.8b
1803*c0909341SAndroid Build Coastguard Worker        umull2          v21.8h,  v2.16b,  v7.16b
1804*c0909341SAndroid Build Coastguard Worker        umlal2          v21.8h,  v17.16b, v5.16b
1805*c0909341SAndroid Build Coastguard Worker        rshrn           v16.8b,  v18.8h,  #6      // rounding shift back to 8 bit
1806*c0909341SAndroid Build Coastguard Worker        rshrn2          v16.16b, v19.8h,  #6
1807*c0909341SAndroid Build Coastguard Worker        rshrn           v17.8b,  v20.8h,  #6
1808*c0909341SAndroid Build Coastguard Worker        rshrn2          v17.16b, v21.8h,  #6
1809*c0909341SAndroid Build Coastguard Worker        st1             {v16.16b}, [x0],  #16
1810*c0909341SAndroid Build Coastguard Worker        st1             {v17.16b}, [x13], #16
1811*c0909341SAndroid Build Coastguard Worker        b.le            3f                        // both rows complete
1812*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v1.16b           // slide the source window by 16 bytes
1813*c0909341SAndroid Build Coastguard Worker        ld1             {v1.16b}, [x8],  #16 // top[base]
1814*c0909341SAndroid Build Coastguard Worker        mov             v2.16b,  v3.16b
1815*c0909341SAndroid Build Coastguard Worker        ld1             {v3.16b}, [x10], #16
1816*c0909341SAndroid Build Coastguard Worker        b               2b
1817*c0909341SAndroid Build Coastguard Worker
1818*c0909341SAndroid Build Coastguard Worker3:
1819*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2              // two rows done
1820*c0909341SAndroid Build Coastguard Worker        b.le            9f
1821*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1              // advance both row pointers to the next pair
1822*c0909341SAndroid Build Coastguard Worker        add             x13, x13, x1
1823*c0909341SAndroid Build Coastguard Worker        mov             w3,  w12                  // reload the per-row countdown with width
1824*c0909341SAndroid Build Coastguard Worker        b               1b
1825*c0909341SAndroid Build Coastguard Worker9:
1826*c0909341SAndroid Build Coastguard Worker        ret
1827*c0909341SAndroid Build Coastguard Worker
1828*c0909341SAndroid Build Coastguard Worker169:                                              // wide path: remaining rows are all padding
1829*c0909341SAndroid Build Coastguard Worker        st1             {v31.16b}, [x0],  #16
1830*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #16
1831*c0909341SAndroid Build Coastguard Worker        st1             {v31.16b}, [x13], #16
1832*c0909341SAndroid Build Coastguard Worker        b.gt            169b                      // until both rows of the pair are filled
1833*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
1834*c0909341SAndroid Build Coastguard Worker        b.le            9b
1835*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1              // advance both row pointers to the next pair
1836*c0909341SAndroid Build Coastguard Worker        add             x13, x13, x1
1837*c0909341SAndroid Build Coastguard Worker        mov             w3,  w12                  // reload the per-row countdown with width
1838*c0909341SAndroid Build Coastguard Worker        b               169b
1839*c0909341SAndroid Build Coastguard Workerendfunc
1840*c0909341SAndroid Build Coastguard Worker
1841*c0909341SAndroid Build Coastguard Workerjumptable ipred_z1_fill1_tbl
// 32-bit offsets of the width-specific entry labels of
// ipred_z1_fill1_8bpc_neon, each relative to the start of this table.
// That function loads an entry with ldrsw using clz(width) - 25 as index
// and adds it to the table address, so the order below must stay
// widest-first.
// NOTE(review): jumptable/endjumptable are macros defined outside this
// chunk; what section/alignment they select is not visible here.
1842*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_z1_fill1_tbl           // index 0: width 64
1843*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_z1_fill1_tbl           // index 1: width 32
1844*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_z1_fill1_tbl           // index 2: width 16
1845*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_z1_fill1_tbl           // index 3: width 8
1846*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_z1_fill1_tbl           // index 4: width 4
1847*c0909341SAndroid Build Coastguard Workerendjumptable
1848*c0909341SAndroid Build Coastguard Worker
1849*c0909341SAndroid Build Coastguard Workerfunction ipred_z1_fill2_8bpc_neon, export=1
1850*c0909341SAndroid Build Coastguard Worker        cmp             w3,  #8
1851*c0909341SAndroid Build Coastguard Worker        add             x10, x2,  w6,  uxtw       // top[max_base_x]
1852*c0909341SAndroid Build Coastguard Worker        ld1r            {v31.16b}, [x10]          // padding
1853*c0909341SAndroid Build Coastguard Worker        mov             w7,  w5
1854*c0909341SAndroid Build Coastguard Worker        mov             w15, #64
1855*c0909341SAndroid Build Coastguard Worker        b.eq            8f
1856*c0909341SAndroid Build Coastguard Worker
1857*c0909341SAndroid Build Coastguard Worker4:      // w == 4
1858*c0909341SAndroid Build Coastguard Worker        lsr             w8,  w7,  #6              // base
1859*c0909341SAndroid Build Coastguard Worker        and             w9,  w7,  #0x3e           // frac
1860*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // xpos += dx
1861*c0909341SAndroid Build Coastguard Worker        cmp             w8,  w6                   // base >= max_base_x
1862*c0909341SAndroid Build Coastguard Worker        lsr             w10, w7,  #6              // base
1863*c0909341SAndroid Build Coastguard Worker        and             w11, w7,  #0x3e           // frac
1864*c0909341SAndroid Build Coastguard Worker        b.ge            49f
1865*c0909341SAndroid Build Coastguard Worker        ldr             d0,  [x2, w8, uxtw]       // top[base]
1866*c0909341SAndroid Build Coastguard Worker        ldr             d2,  [x2, w10, uxtw]
1867*c0909341SAndroid Build Coastguard Worker        dup             v4.4h,   w9               // frac
1868*c0909341SAndroid Build Coastguard Worker        dup             v5.4h,   w11
1869*c0909341SAndroid Build Coastguard Worker        uzp2            v1.8b,   v0.8b,   v0.8b   // top[base+1]
1870*c0909341SAndroid Build Coastguard Worker        uzp1            v0.8b,   v0.8b,   v0.8b   // top[base]
1871*c0909341SAndroid Build Coastguard Worker        uzp2            v3.8b,   v2.8b,   v2.8b
1872*c0909341SAndroid Build Coastguard Worker        uzp1            v2.8b,   v2.8b,   v2.8b
1873*c0909341SAndroid Build Coastguard Worker        usubl           v6.8h,   v1.8b,   v0.8b   // top[base+1]-top[base]
1874*c0909341SAndroid Build Coastguard Worker        usubl           v7.8h,   v3.8b,   v2.8b
1875*c0909341SAndroid Build Coastguard Worker        ushll           v16.8h,  v0.8b,   #6      // top[base]*64
1876*c0909341SAndroid Build Coastguard Worker        ushll           v17.8h,  v2.8b,   #6
1877*c0909341SAndroid Build Coastguard Worker        mla             v16.4h,  v6.4h,   v4.4h   // + top[base+1]*frac
1878*c0909341SAndroid Build Coastguard Worker        mla             v17.4h,  v7.4h,   v5.4h
1879*c0909341SAndroid Build Coastguard Worker        rshrn           v16.8b,  v16.8h,  #6
1880*c0909341SAndroid Build Coastguard Worker        rshrn           v17.8b,  v17.8h,  #6
1881*c0909341SAndroid Build Coastguard Worker        st1             {v16.s}[0], [x0], x1
1882*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // xpos += dx
1883*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
1884*c0909341SAndroid Build Coastguard Worker        st1             {v17.s}[0], [x0], x1
1885*c0909341SAndroid Build Coastguard Worker        b.gt            4b
1886*c0909341SAndroid Build Coastguard Worker        ret
1887*c0909341SAndroid Build Coastguard Worker
1888*c0909341SAndroid Build Coastguard Worker49:
1889*c0909341SAndroid Build Coastguard Worker        st1             {v31.s}[0], [x0], x1
1890*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
1891*c0909341SAndroid Build Coastguard Worker        st1             {v31.s}[0], [x0], x1
1892*c0909341SAndroid Build Coastguard Worker        b.gt            49b
1893*c0909341SAndroid Build Coastguard Worker        ret
1894*c0909341SAndroid Build Coastguard Worker
1895*c0909341SAndroid Build Coastguard Worker8:      // w == 8
1896*c0909341SAndroid Build Coastguard Worker        lsr             w8,  w7,  #6              // base
1897*c0909341SAndroid Build Coastguard Worker        and             w9,  w7,  #0x3e           // frac
1898*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // xpos += dx
1899*c0909341SAndroid Build Coastguard Worker        cmp             w8,  w6                   // base >= max_base_x
1900*c0909341SAndroid Build Coastguard Worker        lsr             w10, w7,  #6              // base
1901*c0909341SAndroid Build Coastguard Worker        and             w11, w7,  #0x3e           // frac
1902*c0909341SAndroid Build Coastguard Worker        b.ge            89f
1903*c0909341SAndroid Build Coastguard Worker        ldr             q0,  [x2, w8, uxtw]       // top[base]
1904*c0909341SAndroid Build Coastguard Worker        ldr             q2,  [x2, w10, uxtw]
1905*c0909341SAndroid Build Coastguard Worker        dup             v4.8b,   w9               // frac
1906*c0909341SAndroid Build Coastguard Worker        dup             v5.8b,   w11
1907*c0909341SAndroid Build Coastguard Worker        sub             w9,  w15, w9              // 64 - frac
1908*c0909341SAndroid Build Coastguard Worker        sub             w11, w15, w11
1909*c0909341SAndroid Build Coastguard Worker        dup             v6.8b,   w9               // 64 - frac
1910*c0909341SAndroid Build Coastguard Worker        dup             v7.8b,   w11
1911*c0909341SAndroid Build Coastguard Worker        uzp2            v1.16b,  v0.16b,  v0.16b  // top[base+1]
1912*c0909341SAndroid Build Coastguard Worker        uzp1            v0.16b,  v0.16b,  v0.16b  // top[base]
1913*c0909341SAndroid Build Coastguard Worker        uzp2            v3.16b,  v2.16b,  v2.16b
1914*c0909341SAndroid Build Coastguard Worker        uzp1            v2.16b,  v2.16b,  v2.16b
1915*c0909341SAndroid Build Coastguard Worker        umull           v16.8h,  v1.8b,   v4.8b   // top[base+1]*frac
1916*c0909341SAndroid Build Coastguard Worker        umlal           v16.8h,  v0.8b,   v6.8b   // + top[base]*(64-frac)
1917*c0909341SAndroid Build Coastguard Worker        umull           v17.8h,  v3.8b,   v5.8b
1918*c0909341SAndroid Build Coastguard Worker        umlal           v17.8h,  v2.8b,   v7.8b
1919*c0909341SAndroid Build Coastguard Worker        rshrn           v16.8b,  v16.8h,  #6
1920*c0909341SAndroid Build Coastguard Worker        rshrn           v17.8b,  v17.8h,  #6
1921*c0909341SAndroid Build Coastguard Worker        st1             {v16.8b}, [x0], x1
1922*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // xpos += dx
1923*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
1924*c0909341SAndroid Build Coastguard Worker        st1             {v17.8b}, [x0], x1
1925*c0909341SAndroid Build Coastguard Worker        b.gt            8b
1926*c0909341SAndroid Build Coastguard Worker        ret
1927*c0909341SAndroid Build Coastguard Worker
1928*c0909341SAndroid Build Coastguard Worker89:
1929*c0909341SAndroid Build Coastguard Worker        st1             {v31.8b}, [x0], x1
1930*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
1931*c0909341SAndroid Build Coastguard Worker        st1             {v31.8b}, [x0], x1
1932*c0909341SAndroid Build Coastguard Worker        b.gt            89b
1933*c0909341SAndroid Build Coastguard Worker        ret
1934*c0909341SAndroid Build Coastguard Workerendfunc
1935*c0909341SAndroid Build Coastguard Worker
// void ipred_reverse_8bpc_neon(pixel *dst, const pixel *const src,
//                              const int n);
//
// Writes the bytes that precede src to dst in reverse order, i.e.
// dst[i] = src[-(i + 1)]. Data is moved in 16-byte chunks until the
// remaining count drops to zero or below, so at least one chunk is always
// processed and the amount written is n rounded up to a multiple of 16.
//   x0 = dst, x1 = src, w2 = n
function ipred_reverse_8bpc_neon, export=1
        mov             x3,  #-16                 // stride for walking src backwards
        add             x1,  x1,  x3              // x1 = src - 16, start of the last chunk
1:
        ld1             {v0.16b}, [x1], x3        // load a chunk, then step back 16 bytes
        subs            w2,  w2,  #16             // n -= 16; flags are consumed by b.gt
        rev64           v0.16b,  v0.16b           // reverse the bytes inside each 8-byte half
        ext             v0.16b,  v0.16b,  v0.16b,  #8 // swap the halves -> full 16-byte reversal
        st1             {v0.16b}, [x0], #16       // store the chunk, dst += 16
        b.gt            1b                        // continue while bytes remain
        ret
endfunc
1952*c0909341SAndroid Build Coastguard Worker
// Ascending 16-bit values 0..15. ipred_z2_fill1_8bpc_neon loads the first
// eight entries as one 8h vector and, in its 16-wide path, the second eight
// from byte offset 16.
const increments
        .short          0,  1,  2,  3
        .short          4,  5,  6,  7
        .short          8,  9,  10, 11
        .short          12, 13, 14, 15
endconst
1957*c0909341SAndroid Build Coastguard Worker
1958*c0909341SAndroid Build Coastguard Worker// void ipred_z2_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
1959*c0909341SAndroid Build Coastguard Worker//                               const pixel *const top,
1960*c0909341SAndroid Build Coastguard Worker//                               const pixel *const left,
1961*c0909341SAndroid Build Coastguard Worker//                               const int width, const int height,
1962*c0909341SAndroid Build Coastguard Worker//                               const int dx, const int dy);
1963*c0909341SAndroid Build Coastguard Workerfunction ipred_z2_fill1_8bpc_neon, export=1
1964*c0909341SAndroid Build Coastguard Worker        clz             w10, w4
1965*c0909341SAndroid Build Coastguard Worker        movrel          x9,  ipred_z2_fill1_tbl
1966*c0909341SAndroid Build Coastguard Worker        sub             w10, w10, #25
1967*c0909341SAndroid Build Coastguard Worker        ldrsw           x10, [x9, w10, uxtw #2]
1968*c0909341SAndroid Build Coastguard Worker        mov             w8,  #(1 << 6)            // xpos = 1 << 6
1969*c0909341SAndroid Build Coastguard Worker        add             x9,  x9,  x10
1970*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
1971*c0909341SAndroid Build Coastguard Worker
1972*c0909341SAndroid Build Coastguard Worker        movrel          x11, increments
1973*c0909341SAndroid Build Coastguard Worker        ld1             {v31.8h},  [x11]          // increments
1974*c0909341SAndroid Build Coastguard Worker        neg             w7,  w7                   // -dy
1975*c0909341SAndroid Build Coastguard Worker
1976*c0909341SAndroid Build Coastguard Worker        br              x9
1977*c0909341SAndroid Build Coastguard Worker40:
1978*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1979*c0909341SAndroid Build Coastguard Worker
1980*c0909341SAndroid Build Coastguard Worker        dup             v30.4h,  w7               // -dy
1981*c0909341SAndroid Build Coastguard Worker        movi            v17.8b,  #1
1982*c0909341SAndroid Build Coastguard Worker
1983*c0909341SAndroid Build Coastguard Worker        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
1984*c0909341SAndroid Build Coastguard Worker        movi            v25.16b, #0x3e
1985*c0909341SAndroid Build Coastguard Worker        add             v30.4h,  v16.4h,  v30.4h  // -= dy
1986*c0909341SAndroid Build Coastguard Worker
1987*c0909341SAndroid Build Coastguard Worker        xtn             v31.8b,  v31.8h           // {0,1,2,3}
1988*c0909341SAndroid Build Coastguard Worker
1989*c0909341SAndroid Build Coastguard Worker        // Worst case height for w=4 is 16, but we need at least h+1 elements
1990*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b}, [x3]    // left[]
1991*c0909341SAndroid Build Coastguard Worker
1992*c0909341SAndroid Build Coastguard Worker        movi            v26.16b, #64
1993*c0909341SAndroid Build Coastguard Worker        movi            v19.16b, #2
1994*c0909341SAndroid Build Coastguard Worker
1995*c0909341SAndroid Build Coastguard Worker        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
1996*c0909341SAndroid Build Coastguard Worker        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
1997*c0909341SAndroid Build Coastguard Worker        and             v27.8b,  v27.8b,  v25.8b  // frac_y
1998*c0909341SAndroid Build Coastguard Worker
1999*c0909341SAndroid Build Coastguard Worker        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
2000*c0909341SAndroid Build Coastguard Worker
2001*c0909341SAndroid Build Coastguard Worker        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1
2002*c0909341SAndroid Build Coastguard Worker        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2
2003*c0909341SAndroid Build Coastguard Worker
2004*c0909341SAndroid Build Coastguard Worker        tbl             v16.8b, {v0.16b}, v29.8b  // left[base_y]
2005*c0909341SAndroid Build Coastguard Worker
2006*c0909341SAndroid Build Coastguard Worker        trn1            v30.2s,  v30.2s,  v28.2s  // base_y + 1, base_y + 2
2007*c0909341SAndroid Build Coastguard Worker
2008*c0909341SAndroid Build Coastguard Worker        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y
2009*c0909341SAndroid Build Coastguard Worker
2010*c0909341SAndroid Build Coastguard Worker        trn1            v31.2s,  v31.2s,  v31.2s  // {0,1,2,3,0,1,2,3}
2011*c0909341SAndroid Build Coastguard Worker
2012*c0909341SAndroid Build Coastguard Worker        trn1            v27.2s,  v27.2s,  v27.2s  // frac_y
2013*c0909341SAndroid Build Coastguard Worker        trn1            v28.2s,  v28.2s,  v28.2s  // 64 - frac_y
2014*c0909341SAndroid Build Coastguard Worker
2015*c0909341SAndroid Build Coastguard Worker        movi            v29.8b,  #2
2016*c0909341SAndroid Build Coastguard Worker4:
2017*c0909341SAndroid Build Coastguard Worker        asr             w9,  w8,  #6              // base_x
2018*c0909341SAndroid Build Coastguard Worker        dup             v6.4h,   w8               // xpos
2019*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2020*c0909341SAndroid Build Coastguard Worker        cmp             w9,  #-4                  // base_x <= -4
2021*c0909341SAndroid Build Coastguard Worker        asr             w11, w8,  #6              // base_x
2022*c0909341SAndroid Build Coastguard Worker        b.le            49f
2023*c0909341SAndroid Build Coastguard Worker
2024*c0909341SAndroid Build Coastguard Worker        dup             v7.4h,   w8               // xpos
2025*c0909341SAndroid Build Coastguard Worker
2026*c0909341SAndroid Build Coastguard Worker        ldr             d2,  [x2, w9, sxtw]       // top[base_x]
2027*c0909341SAndroid Build Coastguard Worker        ldr             d4,  [x2, w11, sxtw]
2028*c0909341SAndroid Build Coastguard Worker
2029*c0909341SAndroid Build Coastguard Worker        trn1            v6.2d,   v6.2d,   v7.2d   // xpos
2030*c0909341SAndroid Build Coastguard Worker
2031*c0909341SAndroid Build Coastguard Worker        // Cut corners here; only doing tbl over v0 here; we only
2032*c0909341SAndroid Build Coastguard Worker        // seem to need the last pixel, from v1, after skipping to the
2033*c0909341SAndroid Build Coastguard Worker        // left-only codepath below.
2034*c0909341SAndroid Build Coastguard Worker        tbl             v17.8b, {v0.16b}, v30.8b  // left[base_y+1], left[base_y+2]
2035*c0909341SAndroid Build Coastguard Worker
2036*c0909341SAndroid Build Coastguard Worker        shrn            v20.8b,  v6.8h,   #6      // first base_x for each row
2037*c0909341SAndroid Build Coastguard Worker        xtn             v6.8b,   v6.8h            // (uint8_t)xpos
2038*c0909341SAndroid Build Coastguard Worker
2039*c0909341SAndroid Build Coastguard Worker        ext             v3.8b,   v2.8b,   v2.8b,   #1 // top[base_x+1]
2040*c0909341SAndroid Build Coastguard Worker        ext             v5.8b,   v4.8b,   v4.8b,   #1
2041*c0909341SAndroid Build Coastguard Worker
2042*c0909341SAndroid Build Coastguard Worker        and             v6.8b,   v6.8b,   v25.8b  // frac_x
2043*c0909341SAndroid Build Coastguard Worker
2044*c0909341SAndroid Build Coastguard Worker        trn1            v16.2s,  v16.2s,  v17.2s  // left[base_y], left[base_y+1]
2045*c0909341SAndroid Build Coastguard Worker
2046*c0909341SAndroid Build Coastguard Worker        trn1            v2.2s,   v2.2s,   v4.2s   // top[base_x]
2047*c0909341SAndroid Build Coastguard Worker        trn1            v3.2s,   v3.2s,   v5.2s   // top[base_x+1]
2048*c0909341SAndroid Build Coastguard Worker
2049*c0909341SAndroid Build Coastguard Worker        sub             v7.8b,   v26.8b,  v6.8b   // 64 - frac_x
2050*c0909341SAndroid Build Coastguard Worker
2051*c0909341SAndroid Build Coastguard Worker        add             v20.8b,  v20.8b,  v31.8b  // actual base_x
2052*c0909341SAndroid Build Coastguard Worker
2053*c0909341SAndroid Build Coastguard Worker        umull           v16.8h,  v16.8b,  v28.8b  // left[base_y]*(64-frac_y)
2054*c0909341SAndroid Build Coastguard Worker        umlal           v16.8h,  v17.8b,  v27.8b  // + left[base_y+1]*frac_y
2055*c0909341SAndroid Build Coastguard Worker
2056*c0909341SAndroid Build Coastguard Worker        umull           v22.8h,  v2.8b,   v7.8b   // top[base_x]-*(64-frac_x)
2057*c0909341SAndroid Build Coastguard Worker        umlal           v22.8h,  v3.8b,   v6.8b   // + top[base_x+1]*frac_x
2058*c0909341SAndroid Build Coastguard Worker
2059*c0909341SAndroid Build Coastguard Worker        cmge            v20.8b,  v20.8b,  #0
2060*c0909341SAndroid Build Coastguard Worker
2061*c0909341SAndroid Build Coastguard Worker        rshrn           v16.8b,  v16.8h,  #6
2062*c0909341SAndroid Build Coastguard Worker        rshrn           v22.8b,  v22.8h,  #6
2063*c0909341SAndroid Build Coastguard Worker
2064*c0909341SAndroid Build Coastguard Worker        bit             v16.8b,  v22.8b,  v20.8b
2065*c0909341SAndroid Build Coastguard Worker
2066*c0909341SAndroid Build Coastguard Worker        st1             {v16.s}[0], [x0], x1
2067*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2068*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
2069*c0909341SAndroid Build Coastguard Worker        st1             {v16.s}[1], [x0], x1
2070*c0909341SAndroid Build Coastguard Worker        b.le            9f
2071*c0909341SAndroid Build Coastguard Worker
2072*c0909341SAndroid Build Coastguard Worker        ext             v16.8b,  v17.8b,  v17.8b, #4
2073*c0909341SAndroid Build Coastguard Worker        add             v30.8b,  v30.8b,  v29.8b  // base_y += 2
2074*c0909341SAndroid Build Coastguard Worker        b               4b
2075*c0909341SAndroid Build Coastguard Worker
2076*c0909341SAndroid Build Coastguard Worker49:
2077*c0909341SAndroid Build Coastguard Worker        tbl             v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+2]
2078*c0909341SAndroid Build Coastguard Worker
2079*c0909341SAndroid Build Coastguard Worker        trn1            v16.2s,  v16.2s,  v17.2s  // left[base_y], left[base_y+1]
2080*c0909341SAndroid Build Coastguard Worker
2081*c0909341SAndroid Build Coastguard Worker        umull           v18.8h,  v16.8b,  v28.8b  // left[base_y]*(64-frac_t)
2082*c0909341SAndroid Build Coastguard Worker        umlal           v18.8h,  v17.8b,  v27.8b  // + left[base_y+1]*frac_y
2083*c0909341SAndroid Build Coastguard Worker        rshrn           v18.8b,  v18.8h,  #6
2084*c0909341SAndroid Build Coastguard Worker
2085*c0909341SAndroid Build Coastguard Worker        st1             {v18.s}[0], [x0], x1
2086*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
2087*c0909341SAndroid Build Coastguard Worker        st1             {v18.s}[1], [x0], x1
2088*c0909341SAndroid Build Coastguard Worker        b.le            9f
2089*c0909341SAndroid Build Coastguard Worker
2090*c0909341SAndroid Build Coastguard Worker        ext             v16.8b,  v17.8b,  v17.8b, #4
2091*c0909341SAndroid Build Coastguard Worker        add             v30.8b,  v30.8b,  v29.8b  // base_y += 2
2092*c0909341SAndroid Build Coastguard Worker        b               49b
2093*c0909341SAndroid Build Coastguard Worker
2094*c0909341SAndroid Build Coastguard Worker9:
2095*c0909341SAndroid Build Coastguard Worker        ret
2096*c0909341SAndroid Build Coastguard Worker
2097*c0909341SAndroid Build Coastguard Worker80:
2098*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
2099*c0909341SAndroid Build Coastguard Worker
2100*c0909341SAndroid Build Coastguard Worker        dup             v30.8h,  w7               // -dy
2101*c0909341SAndroid Build Coastguard Worker        movi            v17.8b,  #1
2102*c0909341SAndroid Build Coastguard Worker
2103*c0909341SAndroid Build Coastguard Worker        mul             v16.8h,  v31.8h,  v30.8h  // {0,1,2,3,4,5,6,7}* -dy
2104*c0909341SAndroid Build Coastguard Worker        movi            v25.16b, #0x3e
2105*c0909341SAndroid Build Coastguard Worker        add             v30.8h,  v16.8h,  v30.8h  // -= dy
2106*c0909341SAndroid Build Coastguard Worker
2107*c0909341SAndroid Build Coastguard Worker        xtn             v31.8b,  v31.8h           // {0,1,2,3,4,5,6,7}
2108*c0909341SAndroid Build Coastguard Worker
2109*c0909341SAndroid Build Coastguard Worker        // Worst case height for w=8 is 32, but we need at least h+1 elements
2110*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b, v2.16b}, [x3]    // left[]
2111*c0909341SAndroid Build Coastguard Worker
2112*c0909341SAndroid Build Coastguard Worker        movi            v26.16b, #64
2113*c0909341SAndroid Build Coastguard Worker        movi            v19.16b, #2
2114*c0909341SAndroid Build Coastguard Worker
2115*c0909341SAndroid Build Coastguard Worker        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
2116*c0909341SAndroid Build Coastguard Worker        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
2117*c0909341SAndroid Build Coastguard Worker        and             v27.8b,  v27.8b,  v25.8b  // frac_y
2118*c0909341SAndroid Build Coastguard Worker
2119*c0909341SAndroid Build Coastguard Worker        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
2120*c0909341SAndroid Build Coastguard Worker
2121*c0909341SAndroid Build Coastguard Worker        // Cut corners here; for the first row we don't expect to need to
2122*c0909341SAndroid Build Coastguard Worker        // read outside of v0.
2123*c0909341SAndroid Build Coastguard Worker        tbl             v18.8b, {v0.16b}, v29.8b  // left[base_y]
2124*c0909341SAndroid Build Coastguard Worker
2125*c0909341SAndroid Build Coastguard Worker        add             v30.8b,  v29.8b,  v19.8b  // base_y + 2
2126*c0909341SAndroid Build Coastguard Worker        add             v29.8b,  v29.8b,  v17.8b  // base_y + 1
2127*c0909341SAndroid Build Coastguard Worker
2128*c0909341SAndroid Build Coastguard Worker        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y
2129*c0909341SAndroid Build Coastguard Worker
2130*c0909341SAndroid Build Coastguard Worker        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}
2131*c0909341SAndroid Build Coastguard Worker
2132*c0909341SAndroid Build Coastguard Worker        movi            v24.8b,  #2               // 2
2133*c0909341SAndroid Build Coastguard Worker8:
2134*c0909341SAndroid Build Coastguard Worker        asr             w9,  w8,  #6              // base_x
2135*c0909341SAndroid Build Coastguard Worker        dup             v16.8h,   w8              // xpos
2136*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2137*c0909341SAndroid Build Coastguard Worker        cmp             w9,  #-8                  // base_x <= -8
2138*c0909341SAndroid Build Coastguard Worker        asr             w11, w8,  #6              // base_x
2139*c0909341SAndroid Build Coastguard Worker        b.le            89f
2140*c0909341SAndroid Build Coastguard Worker
2141*c0909341SAndroid Build Coastguard Worker        dup             v17.8h,   w8              // xpos
2142*c0909341SAndroid Build Coastguard Worker
2143*c0909341SAndroid Build Coastguard Worker        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
2144*c0909341SAndroid Build Coastguard Worker        ldr             q6,  [x2, w11, sxtw]
2145*c0909341SAndroid Build Coastguard Worker
2146*c0909341SAndroid Build Coastguard Worker        // Cut corners here; only doing tbl over v0-v1 here; we only
2147*c0909341SAndroid Build Coastguard Worker        // seem to need the last pixel, from v2, after skipping to the
2148*c0909341SAndroid Build Coastguard Worker        // left-only codepath below.
2149*c0909341SAndroid Build Coastguard Worker        tbl             v19.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+1]
2150*c0909341SAndroid Build Coastguard Worker
2151*c0909341SAndroid Build Coastguard Worker        shrn            v21.8b,  v16.8h,  #6      // first base_x
2152*c0909341SAndroid Build Coastguard Worker        shrn2           v21.16b, v17.8h,  #6
2153*c0909341SAndroid Build Coastguard Worker        xtn             v16.8b,  v16.8h           // (uint8_t)xpos
2154*c0909341SAndroid Build Coastguard Worker        xtn2            v16.16b, v17.8h
2155*c0909341SAndroid Build Coastguard Worker
2156*c0909341SAndroid Build Coastguard Worker        tbl             v20.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+2]
2157*c0909341SAndroid Build Coastguard Worker
2158*c0909341SAndroid Build Coastguard Worker        ext             v5.16b,  v4.16b,  v4.16b,  #1 // top[base_x+1]
2159*c0909341SAndroid Build Coastguard Worker        ext             v7.16b,  v6.16b,  v6.16b,  #1
2160*c0909341SAndroid Build Coastguard Worker
2161*c0909341SAndroid Build Coastguard Worker        and             v16.16b, v16.16b, v25.16b // frac_x
2162*c0909341SAndroid Build Coastguard Worker
2163*c0909341SAndroid Build Coastguard Worker        trn1            v4.2d,   v4.2d,   v6.2d   // top[base_x]
2164*c0909341SAndroid Build Coastguard Worker        trn1            v5.2d,   v5.2d,   v7.2d   // top[base_x+1]
2165*c0909341SAndroid Build Coastguard Worker
2166*c0909341SAndroid Build Coastguard Worker        sub             v7.16b,  v26.16b, v16.16b // 64 - frac_x
2167*c0909341SAndroid Build Coastguard Worker
2168*c0909341SAndroid Build Coastguard Worker        add             v21.16b, v21.16b, v31.16b // actual base_x
2169*c0909341SAndroid Build Coastguard Worker
2170*c0909341SAndroid Build Coastguard Worker        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
2171*c0909341SAndroid Build Coastguard Worker        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
2172*c0909341SAndroid Build Coastguard Worker        umull           v17.8h,  v19.8b,  v28.8b
2173*c0909341SAndroid Build Coastguard Worker        umlal           v17.8h,  v20.8b,  v27.8b
2174*c0909341SAndroid Build Coastguard Worker
2175*c0909341SAndroid Build Coastguard Worker        umull           v22.8h,  v4.8b,   v7.8b   // top[base_x]-*(64-frac_x)
2176*c0909341SAndroid Build Coastguard Worker        umlal           v22.8h,  v5.8b,   v16.8b  // + top[base_x+1]*frac_x
2177*c0909341SAndroid Build Coastguard Worker        umull2          v23.8h,  v4.16b,  v7.16b
2178*c0909341SAndroid Build Coastguard Worker        umlal2          v23.8h,  v5.16b,  v16.16b
2179*c0909341SAndroid Build Coastguard Worker
2180*c0909341SAndroid Build Coastguard Worker        cmge            v21.16b, v21.16b, #0
2181*c0909341SAndroid Build Coastguard Worker
2182*c0909341SAndroid Build Coastguard Worker        rshrn           v6.8b,   v6.8h,   #6
2183*c0909341SAndroid Build Coastguard Worker        rshrn2          v6.16b,  v17.8h,  #6
2184*c0909341SAndroid Build Coastguard Worker        rshrn           v22.8b,  v22.8h,  #6
2185*c0909341SAndroid Build Coastguard Worker        rshrn2          v22.16b, v23.8h,  #6
2186*c0909341SAndroid Build Coastguard Worker
2187*c0909341SAndroid Build Coastguard Worker        bit             v6.16b,  v22.16b, v21.16b
2188*c0909341SAndroid Build Coastguard Worker
2189*c0909341SAndroid Build Coastguard Worker        st1             {v6.d}[0], [x0], x1
2190*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2191*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
2192*c0909341SAndroid Build Coastguard Worker        st1             {v6.d}[1], [x0], x1
2193*c0909341SAndroid Build Coastguard Worker        b.le            9f
2194*c0909341SAndroid Build Coastguard Worker
2195*c0909341SAndroid Build Coastguard Worker        mov             v18.8b,  v20.8b
2196*c0909341SAndroid Build Coastguard Worker        add             v29.8b,  v29.8b,  v24.8b  // base_y += 2
2197*c0909341SAndroid Build Coastguard Worker        add             v30.8b,  v30.8b,  v24.8b  // base_y += 2
2198*c0909341SAndroid Build Coastguard Worker        b               8b
2199*c0909341SAndroid Build Coastguard Worker
2200*c0909341SAndroid Build Coastguard Worker89:
2201*c0909341SAndroid Build Coastguard Worker        tbl             v19.8b, {v0.16b, v1.16b, v2.16b}, v29.8b // left[base_y+1]
2202*c0909341SAndroid Build Coastguard Worker        tbl             v20.8b, {v0.16b, v1.16b, v2.16b}, v30.8b // left[base_y+2]
2203*c0909341SAndroid Build Coastguard Worker
2204*c0909341SAndroid Build Coastguard Worker        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
2205*c0909341SAndroid Build Coastguard Worker        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
2206*c0909341SAndroid Build Coastguard Worker        umull           v17.8h,  v19.8b,  v28.8b
2207*c0909341SAndroid Build Coastguard Worker        umlal           v17.8h,  v20.8b,  v27.8b
2208*c0909341SAndroid Build Coastguard Worker
2209*c0909341SAndroid Build Coastguard Worker        rshrn           v6.8b,   v6.8h,   #6
2210*c0909341SAndroid Build Coastguard Worker        rshrn2          v6.16b,  v17.8h,  #6
2211*c0909341SAndroid Build Coastguard Worker
2212*c0909341SAndroid Build Coastguard Worker        st1             {v6.d}[0], [x0], x1
2213*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
2214*c0909341SAndroid Build Coastguard Worker        st1             {v6.d}[1], [x0], x1
2215*c0909341SAndroid Build Coastguard Worker        b.le            9f
2216*c0909341SAndroid Build Coastguard Worker
2217*c0909341SAndroid Build Coastguard Worker        mov             v18.8b,  v20.8b
2218*c0909341SAndroid Build Coastguard Worker        add             v29.8b,  v29.8b,  v24.8b  // base_y += 2
2219*c0909341SAndroid Build Coastguard Worker        add             v30.8b,  v30.8b,  v24.8b  // base_y += 2
2220*c0909341SAndroid Build Coastguard Worker        b               89b
2221*c0909341SAndroid Build Coastguard Worker
2222*c0909341SAndroid Build Coastguard Worker9:
2223*c0909341SAndroid Build Coastguard Worker        ret
2224*c0909341SAndroid Build Coastguard Worker
2225*c0909341SAndroid Build Coastguard Worker160:
2226*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
2227*c0909341SAndroid Build Coastguard Worker
2228*c0909341SAndroid Build Coastguard Worker        stp             d8,  d9,  [sp, #-0x40]!
2229*c0909341SAndroid Build Coastguard Worker        stp             d10, d11, [sp, #0x10]
2230*c0909341SAndroid Build Coastguard Worker        stp             d12, d13, [sp, #0x20]
2231*c0909341SAndroid Build Coastguard Worker        stp             d14, d15, [sp, #0x30]
2232*c0909341SAndroid Build Coastguard Worker
2233*c0909341SAndroid Build Coastguard Worker        add             x11, x11, #16             // increments
2234*c0909341SAndroid Build Coastguard Worker
2235*c0909341SAndroid Build Coastguard Worker        dup             v18.8h,  w7               // -dy
2236*c0909341SAndroid Build Coastguard Worker        movi            v17.16b, #1
2237*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  #1              // Skip past left[0]
2238*c0909341SAndroid Build Coastguard Worker
2239*c0909341SAndroid Build Coastguard Worker        ld1             {v14.8h}, [x11]           // {8,9,10,11,12,13,14,15}
2240*c0909341SAndroid Build Coastguard Worker
2241*c0909341SAndroid Build Coastguard Worker        mul             v16.8h,  v31.8h,  v18.8h  // {0,1,2,3,4,5,6,7}* -dy
2242*c0909341SAndroid Build Coastguard Worker        mul             v19.8h,  v14.8h,  v18.8h  // {8,9,10,11,12,13,14,15}* -dy
2243*c0909341SAndroid Build Coastguard Worker        movi            v25.16b, #0x3e
2244*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v18.8h  // -= dy
2245*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v19.8h,  v18.8h
2246*c0909341SAndroid Build Coastguard Worker
2247*c0909341SAndroid Build Coastguard Worker        xtn             v31.8b,  v31.8h           // {0,1,2,3,4,5,6,7}
2248*c0909341SAndroid Build Coastguard Worker        xtn2            v31.16b, v14.8h           // {8,9,10,11,12,13,14,15}
2249*c0909341SAndroid Build Coastguard Worker
2250*c0909341SAndroid Build Coastguard Worker        // Worst case height is 64.
2251*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[]
2252*c0909341SAndroid Build Coastguard Worker        ld1r            {v15.16b}, [x2]           // left[0] == top[0]
2253*c0909341SAndroid Build Coastguard Worker
2254*c0909341SAndroid Build Coastguard Worker        movi            v26.16b, #64
2255*c0909341SAndroid Build Coastguard Worker        movi            v19.16b, #2
2256*c0909341SAndroid Build Coastguard Worker
2257*c0909341SAndroid Build Coastguard Worker        xtn             v27.8b,  v16.8h           // (uint8_t)ypos
2258*c0909341SAndroid Build Coastguard Worker        xtn2            v27.16b, v18.8h
2259*c0909341SAndroid Build Coastguard Worker        shrn            v29.8b,  v16.8h,  #6      // ypos >> 6
2260*c0909341SAndroid Build Coastguard Worker        shrn2           v29.16b, v18.8h,  #6
2261*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v15.16b          // left[0]
2262*c0909341SAndroid Build Coastguard Worker        and             v27.16b, v27.16b, v25.16b // frac_y
2263*c0909341SAndroid Build Coastguard Worker
2264*c0909341SAndroid Build Coastguard Worker        // Cut corners here; for the first row we don't expect to need to
2265*c0909341SAndroid Build Coastguard Worker        // read outside of v0.
2266*c0909341SAndroid Build Coastguard Worker        tbx             v18.16b, {v0.16b}, v29.16b // left[base_y]
2267*c0909341SAndroid Build Coastguard Worker
2268*c0909341SAndroid Build Coastguard Worker        add             v30.16b, v29.16b, v19.16b // base_y + 2
2269*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v17.16b // base_y + 1
2270*c0909341SAndroid Build Coastguard Worker
2271*c0909341SAndroid Build Coastguard Worker        sub             v28.16b, v26.16b, v27.16b // 64 - frac_y
2272*c0909341SAndroid Build Coastguard Worker
2273*c0909341SAndroid Build Coastguard Worker        movi            v24.16b, #2               // 2
2274*c0909341SAndroid Build Coastguard Worker16:
2275*c0909341SAndroid Build Coastguard Worker        asr             w9,  w8,  #6              // base_x
2276*c0909341SAndroid Build Coastguard Worker        dup             v16.8h,   w8              // xpos
2277*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2278*c0909341SAndroid Build Coastguard Worker        cmp             w9,  #-16                 // base_x <= -16
2279*c0909341SAndroid Build Coastguard Worker        asr             w11, w8,  #6              // base_x
2280*c0909341SAndroid Build Coastguard Worker        b.le            169f
2281*c0909341SAndroid Build Coastguard Worker
2282*c0909341SAndroid Build Coastguard Worker        dup             v17.8h,   w8              // xpos
2283*c0909341SAndroid Build Coastguard Worker
2284*c0909341SAndroid Build Coastguard Worker        add             x9,  x2,  w9,  sxtw
2285*c0909341SAndroid Build Coastguard Worker        add             x11, x2,  w11, sxtw
2286*c0909341SAndroid Build Coastguard Worker
2287*c0909341SAndroid Build Coastguard Worker        ld1             {v4.16b, v5.16b}, [x9]    // top[base_x]
2288*c0909341SAndroid Build Coastguard Worker        mov             v19.16b, v15.16b          // left[0]
2289*c0909341SAndroid Build Coastguard Worker        ld1             {v6.16b, v7.16b}, [x11]
2290*c0909341SAndroid Build Coastguard Worker
2291*c0909341SAndroid Build Coastguard Worker        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2292*c0909341SAndroid Build Coastguard Worker
2293*c0909341SAndroid Build Coastguard Worker        mov             v20.16b, v15.16b          // left[0]
2294*c0909341SAndroid Build Coastguard Worker
2295*c0909341SAndroid Build Coastguard Worker        shrn            v21.8b,  v16.8h,  #6      // first base_x
2296*c0909341SAndroid Build Coastguard Worker        shrn            v22.8b,  v17.8h,  #6
2297*c0909341SAndroid Build Coastguard Worker        xtn             v16.8b,  v16.8h           // (uint8_t)xpos
2298*c0909341SAndroid Build Coastguard Worker        xtn             v17.8b,  v17.8h
2299*c0909341SAndroid Build Coastguard Worker
2300*c0909341SAndroid Build Coastguard Worker        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
2301*c0909341SAndroid Build Coastguard Worker
2302*c0909341SAndroid Build Coastguard Worker        trn1            v21.2d,  v21.2d,  v21.2d  // first base_x
2303*c0909341SAndroid Build Coastguard Worker        trn1            v22.2d,  v22.2d,  v22.2d
2304*c0909341SAndroid Build Coastguard Worker        trn1            v16.2d,  v16.2d,  v16.2d  // (uint8_t)xpos
2305*c0909341SAndroid Build Coastguard Worker        trn1            v17.2d,  v17.2d,  v17.2d
2306*c0909341SAndroid Build Coastguard Worker
2307*c0909341SAndroid Build Coastguard Worker        ext             v5.16b,  v4.16b,  v5.16b,  #1 // top[base_x+1]
2308*c0909341SAndroid Build Coastguard Worker        ext             v7.16b,  v6.16b,  v7.16b,  #1
2309*c0909341SAndroid Build Coastguard Worker
2310*c0909341SAndroid Build Coastguard Worker        and             v16.16b, v16.16b, v25.16b // frac_x
2311*c0909341SAndroid Build Coastguard Worker        and             v17.16b, v17.16b, v25.16b
2312*c0909341SAndroid Build Coastguard Worker
2313*c0909341SAndroid Build Coastguard Worker        umull           v10.8h,  v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
2314*c0909341SAndroid Build Coastguard Worker        umlal           v10.8h,  v19.8b,  v27.8b  // + left[base_y+1]*frac_y
2315*c0909341SAndroid Build Coastguard Worker
2316*c0909341SAndroid Build Coastguard Worker        sub             v8.16b,  v26.16b, v16.16b // 64 - frac_x
2317*c0909341SAndroid Build Coastguard Worker        sub             v9.16b,  v26.16b, v17.16b
2318*c0909341SAndroid Build Coastguard Worker
2319*c0909341SAndroid Build Coastguard Worker        umull2          v11.8h,  v18.16b, v28.16b
2320*c0909341SAndroid Build Coastguard Worker        umlal2          v11.8h,  v19.16b, v27.16b
2321*c0909341SAndroid Build Coastguard Worker
2322*c0909341SAndroid Build Coastguard Worker        add             v21.16b, v21.16b, v31.16b // actual base_x
2323*c0909341SAndroid Build Coastguard Worker        add             v22.16b, v22.16b, v31.16b
2324*c0909341SAndroid Build Coastguard Worker
2325*c0909341SAndroid Build Coastguard Worker        umull           v12.8h,  v19.8b,  v28.8b
2326*c0909341SAndroid Build Coastguard Worker        umlal           v12.8h,  v20.8b,  v27.8b
2327*c0909341SAndroid Build Coastguard Worker        umull2          v13.8h,  v19.16b, v28.16b
2328*c0909341SAndroid Build Coastguard Worker        umlal2          v13.8h,  v20.16b, v27.16b
2329*c0909341SAndroid Build Coastguard Worker
2330*c0909341SAndroid Build Coastguard Worker        rshrn           v10.8b,  v10.8h,  #6
2331*c0909341SAndroid Build Coastguard Worker        rshrn2          v10.16b, v11.8h,  #6
2332*c0909341SAndroid Build Coastguard Worker        rshrn           v11.8b,  v12.8h,  #6
2333*c0909341SAndroid Build Coastguard Worker        rshrn2          v11.16b, v13.8h,  #6
2334*c0909341SAndroid Build Coastguard Worker
2335*c0909341SAndroid Build Coastguard Worker        umull           v12.8h,  v4.8b,   v8.8b   // top[base_x]-*(64-frac_x)
2336*c0909341SAndroid Build Coastguard Worker        umlal           v12.8h,  v5.8b,   v16.8b  // + top[base_x+1]*frac_x
2337*c0909341SAndroid Build Coastguard Worker        umull2          v13.8h,  v4.16b,  v8.16b
2338*c0909341SAndroid Build Coastguard Worker        umlal2          v13.8h,  v5.16b,  v16.16b
2339*c0909341SAndroid Build Coastguard Worker        umull           v14.8h,  v6.8b,   v9.8b
2340*c0909341SAndroid Build Coastguard Worker        umlal           v14.8h,  v7.8b,   v17.8b
2341*c0909341SAndroid Build Coastguard Worker        umull2          v18.8h,  v6.16b,  v9.16b
2342*c0909341SAndroid Build Coastguard Worker        umlal2          v18.8h,  v7.16b,  v17.16b
2343*c0909341SAndroid Build Coastguard Worker
2344*c0909341SAndroid Build Coastguard Worker        cmge            v21.16b, v21.16b, #0
2345*c0909341SAndroid Build Coastguard Worker        cmge            v22.16b, v22.16b, #0
2346*c0909341SAndroid Build Coastguard Worker
2347*c0909341SAndroid Build Coastguard Worker        rshrn           v12.8b,  v12.8h,  #6
2348*c0909341SAndroid Build Coastguard Worker        rshrn2          v12.16b, v13.8h,  #6
2349*c0909341SAndroid Build Coastguard Worker        rshrn           v13.8b,  v14.8h,  #6
2350*c0909341SAndroid Build Coastguard Worker        rshrn2          v13.16b, v18.8h,  #6
2351*c0909341SAndroid Build Coastguard Worker
2352*c0909341SAndroid Build Coastguard Worker        bit             v10.16b, v12.16b, v21.16b
2353*c0909341SAndroid Build Coastguard Worker        bit             v11.16b, v13.16b, v22.16b
2354*c0909341SAndroid Build Coastguard Worker
2355*c0909341SAndroid Build Coastguard Worker        st1             {v10.16b}, [x0], x1
2356*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
2357*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2358*c0909341SAndroid Build Coastguard Worker        st1             {v11.16b}, [x0], x1
2359*c0909341SAndroid Build Coastguard Worker        b.le            9f
2360*c0909341SAndroid Build Coastguard Worker
2361*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v20.16b
2362*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v24.16b // base_y += 2
2363*c0909341SAndroid Build Coastguard Worker        add             v30.16b, v30.16b, v24.16b // base_y += 2
2364*c0909341SAndroid Build Coastguard Worker        b               16b
2365*c0909341SAndroid Build Coastguard Worker
2366*c0909341SAndroid Build Coastguard Worker169:
2367*c0909341SAndroid Build Coastguard Worker        mov             v19.16b, v15.16b
2368*c0909341SAndroid Build Coastguard Worker        mov             v20.16b, v15.16b
2369*c0909341SAndroid Build Coastguard Worker        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2370*c0909341SAndroid Build Coastguard Worker        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
2371*c0909341SAndroid Build Coastguard Worker
2372*c0909341SAndroid Build Coastguard Worker        umull           v4.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
2373*c0909341SAndroid Build Coastguard Worker        umlal           v4.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
2374*c0909341SAndroid Build Coastguard Worker        umull2          v5.8h,   v18.16b, v28.16b
2375*c0909341SAndroid Build Coastguard Worker        umlal2          v5.8h,   v19.16b, v27.16b
2376*c0909341SAndroid Build Coastguard Worker        umull           v6.8h,   v19.8b,  v28.8b
2377*c0909341SAndroid Build Coastguard Worker        umlal           v6.8h,   v20.8b,  v27.8b
2378*c0909341SAndroid Build Coastguard Worker        umull2          v7.8h,   v19.16b, v28.16b
2379*c0909341SAndroid Build Coastguard Worker        umlal2          v7.8h,   v20.16b, v27.16b
2380*c0909341SAndroid Build Coastguard Worker
2381*c0909341SAndroid Build Coastguard Worker        rshrn           v4.8b,   v4.8h,   #6
2382*c0909341SAndroid Build Coastguard Worker        rshrn2          v4.16b,  v5.8h,   #6
2383*c0909341SAndroid Build Coastguard Worker        rshrn           v5.8b,   v6.8h,   #6
2384*c0909341SAndroid Build Coastguard Worker        rshrn2          v5.16b,  v7.8h,   #6
2385*c0909341SAndroid Build Coastguard Worker
2386*c0909341SAndroid Build Coastguard Worker        st1             {v4.16b}, [x0], x1
2387*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
2388*c0909341SAndroid Build Coastguard Worker        st1             {v5.16b}, [x0], x1
2389*c0909341SAndroid Build Coastguard Worker        b.le            9f
2390*c0909341SAndroid Build Coastguard Worker
2391*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v20.16b
2392*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v24.16b // base_y += 2
2393*c0909341SAndroid Build Coastguard Worker        add             v30.16b, v30.16b, v24.16b // base_y += 2
2394*c0909341SAndroid Build Coastguard Worker        b               169b
2395*c0909341SAndroid Build Coastguard Worker
2396*c0909341SAndroid Build Coastguard Worker9:
2397*c0909341SAndroid Build Coastguard Worker        ldp             d14, d15, [sp, #0x30]
2398*c0909341SAndroid Build Coastguard Worker        ldp             d12, d13, [sp, #0x20]
2399*c0909341SAndroid Build Coastguard Worker        ldp             d10, d11, [sp, #0x10]
2400*c0909341SAndroid Build Coastguard Worker        ldp             d8,  d9,  [sp], 0x40
2401*c0909341SAndroid Build Coastguard Worker        ret
2402*c0909341SAndroid Build Coastguard Worker
2403*c0909341SAndroid Build Coastguard Worker320:    // Same location as 640: below; presumably the w == 32 entry - TODO confirm
2404*c0909341SAndroid Build Coastguard Worker640:    // Referenced from ipred_z2_fill1_tbl; the block is produced in 16 pixel wide steps
2405*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
2406*c0909341SAndroid Build Coastguard Worker
2407*c0909341SAndroid Build Coastguard Worker        stp             d8,  d9,  [sp, #-0x40]!   // save d8-d15 (callee-saved, clobbered below)
2408*c0909341SAndroid Build Coastguard Worker        stp             d10, d11, [sp, #0x10]
2409*c0909341SAndroid Build Coastguard Worker        stp             d12, d13, [sp, #0x20]
2410*c0909341SAndroid Build Coastguard Worker        stp             d14, d15, [sp, #0x30]
2411*c0909341SAndroid Build Coastguard Worker
2412*c0909341SAndroid Build Coastguard Worker        add             x11, x11, #16             // increments: step to {8,...,15}
2413*c0909341SAndroid Build Coastguard Worker
2414*c0909341SAndroid Build Coastguard Worker        dup             v25.8h,  w7               // -dy
2415*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  #1              // Skip past left[0]
2416*c0909341SAndroid Build Coastguard Worker
2417*c0909341SAndroid Build Coastguard Worker        ld1             {v14.8h}, [x11]           // {8,9,10,11,12,13,14,15}
2418*c0909341SAndroid Build Coastguard Worker
2419*c0909341SAndroid Build Coastguard Worker        add             x13, x0,  x1              // alternating row
2420*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // stride *= 2
2421*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  w4,  uxtw       // stride -= width (row stores post-increment by 16)
2422*c0909341SAndroid Build Coastguard Worker
2423*c0909341SAndroid Build Coastguard Worker        movi            v11.8h,  #8               // 8
2424*c0909341SAndroid Build Coastguard Worker        mul             v26.8h,  v31.8h,  v25.8h  // {0,1,2,3,4,5,6,7}* -dy
2425*c0909341SAndroid Build Coastguard Worker        add             v26.8h,  v26.8h,  v25.8h  // -= dy; ypos[x] = (x+1) * -dy
2426*c0909341SAndroid Build Coastguard Worker        mul             v25.8h,  v25.8h,  v11.8h  // -8*dy
2427*c0909341SAndroid Build Coastguard Worker
2428*c0909341SAndroid Build Coastguard Worker        xtn             v31.8b,  v31.8h           // {0,1,2,3,4,5,6,7}
2429*c0909341SAndroid Build Coastguard Worker        xtn2            v31.16b, v14.8h           // {8,9,10,11,12,13,14,15}
2430*c0909341SAndroid Build Coastguard Worker
2431*c0909341SAndroid Build Coastguard Worker        // Worst case height is 64.
2432*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[]
2433*c0909341SAndroid Build Coastguard Worker        ld1r            {v15.16b}, [x2]           // left[0] == top[0]
2434*c0909341SAndroid Build Coastguard Worker
2435*c0909341SAndroid Build Coastguard Worker        mov             w12, w4                   // orig w
2436*c0909341SAndroid Build Coastguard Worker        neg             w14, w4                   // -w
2437*c0909341SAndroid Build Coastguard Worker
2438*c0909341SAndroid Build Coastguard Worker1:      // Outer loop: two output rows per iteration
2439*c0909341SAndroid Build Coastguard Worker        mov             v23.16b, v26.16b          // reset ypos
2440*c0909341SAndroid Build Coastguard Worker
2441*c0909341SAndroid Build Coastguard Worker        asr             w9,  w8,  #6              // base_x (first row)
2442*c0909341SAndroid Build Coastguard Worker        dup             v16.8h,   w8              // xpos (first row)
2443*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2444*c0909341SAndroid Build Coastguard Worker        cmp             w9,  w14                  // base_x <= -w
2445*c0909341SAndroid Build Coastguard Worker        asr             w11, w8,  #6              // base_x (second row)
2446*c0909341SAndroid Build Coastguard Worker        b.le            329f                      // no more top[] input; finish from left[] only
2447*c0909341SAndroid Build Coastguard Worker
2448*c0909341SAndroid Build Coastguard Worker        dup             v17.8h,   w8              // xpos (second row)
2449*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2450*c0909341SAndroid Build Coastguard Worker
2451*c0909341SAndroid Build Coastguard Worker        add             x9,  x2,  w9,  sxtw       // &top[base_x] (first row)
2452*c0909341SAndroid Build Coastguard Worker        add             x11, x2,  w11, sxtw       // &top[base_x] (second row)
2453*c0909341SAndroid Build Coastguard Worker
2454*c0909341SAndroid Build Coastguard Worker        sqshrn          v21.8b,  v16.8h,  #6      // first base_x (saturated to int8)
2455*c0909341SAndroid Build Coastguard Worker        sqshrn          v22.8b,  v17.8h,  #6      // same for the second row
2456*c0909341SAndroid Build Coastguard Worker        xtn             v16.8b,  v16.8h           // (uint8_t)xpos
2457*c0909341SAndroid Build Coastguard Worker        xtn             v17.8b,  v17.8h
2458*c0909341SAndroid Build Coastguard Worker
2459*c0909341SAndroid Build Coastguard Worker        ld1             {v4.16b}, [x9], #16       // top[base_x]
2460*c0909341SAndroid Build Coastguard Worker        ld1             {v6.16b}, [x11], #16
2461*c0909341SAndroid Build Coastguard Worker
2462*c0909341SAndroid Build Coastguard Worker        trn1            v21.2d,  v21.2d,  v21.2d  // first base_x
2463*c0909341SAndroid Build Coastguard Worker        trn1            v22.2d,  v22.2d,  v22.2d
2464*c0909341SAndroid Build Coastguard Worker        trn1            v16.2d,  v16.2d,  v16.2d  // (uint8_t)xpos
2465*c0909341SAndroid Build Coastguard Worker        trn1            v17.2d,  v17.2d,  v17.2d
2466*c0909341SAndroid Build Coastguard Worker
2467*c0909341SAndroid Build Coastguard Worker        movi            v10.16b, #0x3e            // frac mask
2468*c0909341SAndroid Build Coastguard Worker        movi            v11.16b, #64              // 64
2469*c0909341SAndroid Build Coastguard Worker
2470*c0909341SAndroid Build Coastguard Worker        and             v16.16b, v16.16b, v10.16b // frac_x
2471*c0909341SAndroid Build Coastguard Worker        and             v17.16b, v17.16b, v10.16b
2472*c0909341SAndroid Build Coastguard Worker
2473*c0909341SAndroid Build Coastguard Worker        sub             v8.16b,  v11.16b, v16.16b // 64 - frac_x
2474*c0909341SAndroid Build Coastguard Worker        sub             v9.16b,  v11.16b, v17.16b
2475*c0909341SAndroid Build Coastguard Worker
2476*c0909341SAndroid Build Coastguard Worker        add             v21.16b, v21.16b, v31.16b // actual base_x
2477*c0909341SAndroid Build Coastguard Worker        add             v22.16b, v22.16b, v31.16b
2478*c0909341SAndroid Build Coastguard Worker
2479*c0909341SAndroid Build Coastguard Worker2:      // Inner loop: 16 pixels of both rows per iteration
2480*c0909341SAndroid Build Coastguard Worker        add             v13.8h,  v23.8h,  v25.8h  // ypos -= 8*dy (columns 8-15)
2481*c0909341SAndroid Build Coastguard Worker        movi            v12.16b, #64              // 64
2482*c0909341SAndroid Build Coastguard Worker        movi            v20.16b, #2               // NOTE(review): looks dead; v20 is rewritten before being read
2483*c0909341SAndroid Build Coastguard Worker        movi            v10.16b, #0x3e            // frac mask
2484*c0909341SAndroid Build Coastguard Worker
2485*c0909341SAndroid Build Coastguard Worker        smov            w10,     v22.b[0]         // second row's leftmost base_x
2486*c0909341SAndroid Build Coastguard Worker
2487*c0909341SAndroid Build Coastguard Worker        xtn             v27.8b,  v23.8h           // (uint8_t)ypos
2488*c0909341SAndroid Build Coastguard Worker        xtn2            v27.16b, v13.8h
2489*c0909341SAndroid Build Coastguard Worker        shrn            v29.8b,  v23.8h,  #6      // ypos >> 6
2490*c0909341SAndroid Build Coastguard Worker        shrn2           v29.16b, v13.8h,  #6
2491*c0909341SAndroid Build Coastguard Worker        cmp             w10, #0                   // base_x (bottom left) >= 0
2492*c0909341SAndroid Build Coastguard Worker        and             v27.16b, v27.16b, v10.16b // frac_y
2493*c0909341SAndroid Build Coastguard Worker
2494*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v15.16b          // left[0]
2495*c0909341SAndroid Build Coastguard Worker
2496*c0909341SAndroid Build Coastguard Worker        b.ge            4f                        // left[] not needed for the rest of the row
2497*c0909341SAndroid Build Coastguard Worker
2498*c0909341SAndroid Build Coastguard Worker        add             v23.8h,  v13.8h,  v25.8h  // ypos -= 8*dy (next 16 columns)
2499*c0909341SAndroid Build Coastguard Worker        movi            v13.16b, #1               // 1
2500*c0909341SAndroid Build Coastguard Worker
2501*c0909341SAndroid Build Coastguard Worker        tbx             v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
2502*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v13.16b // base_y + 1
2503*c0909341SAndroid Build Coastguard Worker        mov             v19.16b, v15.16b          // left[0]
2504*c0909341SAndroid Build Coastguard Worker
2505*c0909341SAndroid Build Coastguard Worker        sub             v28.16b, v12.16b, v27.16b // 64 - frac_y
2506*c0909341SAndroid Build Coastguard Worker
2507*c0909341SAndroid Build Coastguard Worker        ld1             {v5.16b}, [x9], #16       // top[base_x+16]
2508*c0909341SAndroid Build Coastguard Worker        ld1             {v7.16b}, [x11], #16
2509*c0909341SAndroid Build Coastguard Worker
2510*c0909341SAndroid Build Coastguard Worker        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2511*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v13.16b // base_y + 2
2512*c0909341SAndroid Build Coastguard Worker
2513*c0909341SAndroid Build Coastguard Worker        mov             v20.16b, v15.16b          // left[0]
2514*c0909341SAndroid Build Coastguard Worker        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
2515*c0909341SAndroid Build Coastguard Worker
2516*c0909341SAndroid Build Coastguard Worker        umull           v10.8h,  v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
2517*c0909341SAndroid Build Coastguard Worker        umlal           v10.8h,  v19.8b,  v27.8b  // + left[base_y+1]*frac_y
2518*c0909341SAndroid Build Coastguard Worker        umull2          v11.8h,  v18.16b, v28.16b
2519*c0909341SAndroid Build Coastguard Worker        umlal2          v11.8h,  v19.16b, v27.16b
2520*c0909341SAndroid Build Coastguard Worker        umull           v12.8h,  v19.8b,  v28.8b  // second row: left[base_y+1]*(64-frac_y)
2521*c0909341SAndroid Build Coastguard Worker        umlal           v12.8h,  v20.8b,  v27.8b  // + left[base_y+2]*frac_y
2522*c0909341SAndroid Build Coastguard Worker        umull2          v13.8h,  v19.16b, v28.16b
2523*c0909341SAndroid Build Coastguard Worker        umlal2          v13.8h,  v20.16b, v27.16b
2524*c0909341SAndroid Build Coastguard Worker
2525*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v4.16b,  v5.16b,  #1 // top[base_x+1]
2526*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v6.16b,  v7.16b,  #1
2527*c0909341SAndroid Build Coastguard Worker
2528*c0909341SAndroid Build Coastguard Worker        rshrn           v10.8b,  v10.8h,  #6      // left[] prediction, first row
2529*c0909341SAndroid Build Coastguard Worker        rshrn2          v10.16b, v11.8h,  #6
2530*c0909341SAndroid Build Coastguard Worker        rshrn           v11.8b,  v12.8h,  #6      // left[] prediction, second row
2531*c0909341SAndroid Build Coastguard Worker        rshrn2          v11.16b, v13.8h,  #6
2532*c0909341SAndroid Build Coastguard Worker
2533*c0909341SAndroid Build Coastguard Worker        umull           v12.8h,  v4.8b,   v8.8b   // top[base_x]*(64-frac_x)
2534*c0909341SAndroid Build Coastguard Worker        umlal           v12.8h,  v18.8b,  v16.8b  // + top[base_x+1]*frac_x
2535*c0909341SAndroid Build Coastguard Worker        umull2          v13.8h,  v4.16b,  v8.16b
2536*c0909341SAndroid Build Coastguard Worker        umlal2          v13.8h,  v18.16b, v16.16b
2537*c0909341SAndroid Build Coastguard Worker        umull           v14.8h,  v6.8b,   v9.8b   // second row
2538*c0909341SAndroid Build Coastguard Worker        umlal           v14.8h,  v19.8b,  v17.8b
2539*c0909341SAndroid Build Coastguard Worker        umull2          v20.8h,  v6.16b,  v9.16b
2540*c0909341SAndroid Build Coastguard Worker        umlal2          v20.8h,  v19.16b, v17.16b
2541*c0909341SAndroid Build Coastguard Worker
2542*c0909341SAndroid Build Coastguard Worker        cmge            v18.16b, v21.16b, #0      // mask: base_x >= 0, first row
2543*c0909341SAndroid Build Coastguard Worker        cmge            v19.16b, v22.16b, #0      // mask: base_x >= 0, second row
2544*c0909341SAndroid Build Coastguard Worker
2545*c0909341SAndroid Build Coastguard Worker        rshrn           v12.8b,  v12.8h,  #6      // top[] prediction, first row
2546*c0909341SAndroid Build Coastguard Worker        rshrn2          v12.16b, v13.8h,  #6
2547*c0909341SAndroid Build Coastguard Worker        rshrn           v13.8b,  v14.8h,  #6      // top[] prediction, second row
2548*c0909341SAndroid Build Coastguard Worker        rshrn2          v13.16b, v20.8h,  #6
2549*c0909341SAndroid Build Coastguard Worker
2550*c0909341SAndroid Build Coastguard Worker        bit             v10.16b, v12.16b, v18.16b // take top[] where base_x >= 0
2551*c0909341SAndroid Build Coastguard Worker        bit             v11.16b, v13.16b, v19.16b
2552*c0909341SAndroid Build Coastguard Worker
2553*c0909341SAndroid Build Coastguard Worker        st1             {v10.16b}, [x0], #16
2554*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #16
2555*c0909341SAndroid Build Coastguard Worker        st1             {v11.16b}, [x13], #16
2556*c0909341SAndroid Build Coastguard Worker        b.le            3f                        // row pair complete
2557*c0909341SAndroid Build Coastguard Worker
2558*c0909341SAndroid Build Coastguard Worker        movi            v10.16b, #16              // 16
2559*c0909341SAndroid Build Coastguard Worker        mov             v4.16b,  v5.16b           // slide the top[] window by 16
2560*c0909341SAndroid Build Coastguard Worker        mov             v6.16b,  v7.16b
2561*c0909341SAndroid Build Coastguard Worker        add             v21.16b, v21.16b, v10.16b // base_x += 16
2562*c0909341SAndroid Build Coastguard Worker        add             v22.16b, v22.16b, v10.16b
2563*c0909341SAndroid Build Coastguard Worker        b               2b
2564*c0909341SAndroid Build Coastguard Worker
2565*c0909341SAndroid Build Coastguard Worker3:      // Advance to the next two rows
2566*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2              // h -= 2
2567*c0909341SAndroid Build Coastguard Worker        b.le            9f
2568*c0909341SAndroid Build Coastguard Worker        movi            v10.8h, #128              // 2 << 6
2569*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1              // x1 == 2*stride - width here
2570*c0909341SAndroid Build Coastguard Worker        add             x13, x13, x1
2571*c0909341SAndroid Build Coastguard Worker        mov             w4,  w12                  // reset w
2572*c0909341SAndroid Build Coastguard Worker        add             v26.8h,  v26.8h,  v10.8h  // ypos += 2*(1<<6)
2573*c0909341SAndroid Build Coastguard Worker        b               1b
2574*c0909341SAndroid Build Coastguard Worker
2575*c0909341SAndroid Build Coastguard Worker4:      // The rest of the row only predicted from top[]
2576*c0909341SAndroid Build Coastguard Worker        ld1             {v5.16b}, [x9], #16       // top[base_x+16]
2577*c0909341SAndroid Build Coastguard Worker        ld1             {v7.16b}, [x11], #16
2578*c0909341SAndroid Build Coastguard Worker
2579*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v4.16b,  v5.16b,  #1 // top[base_x+1]
2580*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v6.16b,  v7.16b,  #1
2581*c0909341SAndroid Build Coastguard Worker
2582*c0909341SAndroid Build Coastguard Worker        umull           v12.8h,  v4.8b,   v8.8b   // top[base_x]*(64-frac_x)
2583*c0909341SAndroid Build Coastguard Worker        umlal           v12.8h,  v18.8b,  v16.8b  // + top[base_x+1]*frac_x
2584*c0909341SAndroid Build Coastguard Worker        umull2          v13.8h,  v4.16b,  v8.16b
2585*c0909341SAndroid Build Coastguard Worker        umlal2          v13.8h,  v18.16b, v16.16b
2586*c0909341SAndroid Build Coastguard Worker        umull           v14.8h,  v6.8b,   v9.8b   // second row
2587*c0909341SAndroid Build Coastguard Worker        umlal           v14.8h,  v19.8b,  v17.8b
2588*c0909341SAndroid Build Coastguard Worker        umull2          v20.8h,  v6.16b,  v9.16b
2589*c0909341SAndroid Build Coastguard Worker        umlal2          v20.8h,  v19.16b, v17.16b
2590*c0909341SAndroid Build Coastguard Worker
2591*c0909341SAndroid Build Coastguard Worker        rshrn           v12.8b,  v12.8h,  #6
2592*c0909341SAndroid Build Coastguard Worker        rshrn2          v12.16b, v13.8h,  #6
2593*c0909341SAndroid Build Coastguard Worker        rshrn           v13.8b,  v14.8h,  #6
2594*c0909341SAndroid Build Coastguard Worker        rshrn2          v13.16b, v20.8h,  #6
2595*c0909341SAndroid Build Coastguard Worker
2596*c0909341SAndroid Build Coastguard Worker        st1             {v12.16b}, [x0], #16
2597*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #16
2598*c0909341SAndroid Build Coastguard Worker        st1             {v13.16b}, [x13], #16
2599*c0909341SAndroid Build Coastguard Worker        b.le            3b                        // row pair complete
2600*c0909341SAndroid Build Coastguard Worker
2601*c0909341SAndroid Build Coastguard Worker        mov             v4.16b,  v5.16b           // slide the top[] window by 16
2602*c0909341SAndroid Build Coastguard Worker        mov             v6.16b,  v7.16b
2603*c0909341SAndroid Build Coastguard Worker        b               4b
2604*c0909341SAndroid Build Coastguard Worker
2605*c0909341SAndroid Build Coastguard Worker329:    // The rest of the block only predicted from left[]
2606*c0909341SAndroid Build Coastguard Worker        add             x1,  x1,  w4,  uxtw       // restore stride (back to 2*stride)
2607*c0909341SAndroid Build Coastguard Worker        mov             w12, w5                   // orig remaining h
2608*c0909341SAndroid Build Coastguard Worker1:      // One 16 pixel wide column strip per iteration
2609*c0909341SAndroid Build Coastguard Worker        add             v13.8h,  v23.8h,  v25.8h  // ypos -= 8*dy (columns 8-15)
2610*c0909341SAndroid Build Coastguard Worker        movi            v12.16b, #64              // 64
2611*c0909341SAndroid Build Coastguard Worker        movi            v10.16b, #0x3e            // frac mask
2612*c0909341SAndroid Build Coastguard Worker
2613*c0909341SAndroid Build Coastguard Worker        xtn             v27.8b,  v23.8h           // (uint8_t)ypos
2614*c0909341SAndroid Build Coastguard Worker        xtn2            v27.16b, v13.8h
2615*c0909341SAndroid Build Coastguard Worker        shrn            v29.8b,  v23.8h,  #6      // ypos >> 6
2616*c0909341SAndroid Build Coastguard Worker        shrn2           v29.16b, v13.8h,  #6
2617*c0909341SAndroid Build Coastguard Worker        and             v27.16b, v27.16b, v10.16b // frac_y
2618*c0909341SAndroid Build Coastguard Worker
2619*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v15.16b          // left[0]
2620*c0909341SAndroid Build Coastguard Worker        add             v23.8h,  v13.8h,  v25.8h  // ypos -= 8*dy (next strip)
2621*c0909341SAndroid Build Coastguard Worker        movi            v21.16b, #1               // 1
2622*c0909341SAndroid Build Coastguard Worker
2623*c0909341SAndroid Build Coastguard Worker        tbx             v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
2624*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v21.16b // base_y + 1
2625*c0909341SAndroid Build Coastguard Worker
2626*c0909341SAndroid Build Coastguard Worker        sub             v28.16b, v12.16b, v27.16b // 64 - frac_y
2627*c0909341SAndroid Build Coastguard Worker2:      // Two rows of the strip per iteration
2628*c0909341SAndroid Build Coastguard Worker        mov             v19.16b, v15.16b          // left[0]
2629*c0909341SAndroid Build Coastguard Worker        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2630*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v21.16b // base_y + 2
2631*c0909341SAndroid Build Coastguard Worker        mov             v20.16b, v15.16b          // left[0]
2632*c0909341SAndroid Build Coastguard Worker        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
2633*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v21.16b // next base_y
2634*c0909341SAndroid Build Coastguard Worker
2635*c0909341SAndroid Build Coastguard Worker        umull           v10.8h,  v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
2636*c0909341SAndroid Build Coastguard Worker        umlal           v10.8h,  v19.8b,  v27.8b  // + left[base_y+1]*frac_y
2637*c0909341SAndroid Build Coastguard Worker        umull2          v11.8h,  v18.16b, v28.16b
2638*c0909341SAndroid Build Coastguard Worker        umlal2          v11.8h,  v19.16b, v27.16b
2639*c0909341SAndroid Build Coastguard Worker        umull           v12.8h,  v19.8b,  v28.8b  // second row: left[base_y+1]*(64-frac_y)
2640*c0909341SAndroid Build Coastguard Worker        umlal           v12.8h,  v20.8b,  v27.8b  // + left[base_y+2]*frac_y
2641*c0909341SAndroid Build Coastguard Worker        umull2          v13.8h,  v19.16b, v28.16b
2642*c0909341SAndroid Build Coastguard Worker        umlal2          v13.8h,  v20.16b, v27.16b
2643*c0909341SAndroid Build Coastguard Worker
2644*c0909341SAndroid Build Coastguard Worker        rshrn           v10.8b,  v10.8h,  #6
2645*c0909341SAndroid Build Coastguard Worker        rshrn2          v10.16b, v11.8h,  #6
2646*c0909341SAndroid Build Coastguard Worker        rshrn           v11.8b,  v12.8h,  #6
2647*c0909341SAndroid Build Coastguard Worker        rshrn2          v11.16b, v13.8h,  #6
2648*c0909341SAndroid Build Coastguard Worker
2649*c0909341SAndroid Build Coastguard Worker        st1             {v10.16b}, [x0], x1
2650*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
2651*c0909341SAndroid Build Coastguard Worker        st1             {v11.16b}, [x13], x1
2652*c0909341SAndroid Build Coastguard Worker        b.le            3f                        // strip complete
2653*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v20.16b          // left[base_y+2] is the next left[base_y]
2654*c0909341SAndroid Build Coastguard Worker        b               2b
2655*c0909341SAndroid Build Coastguard Worker
2656*c0909341SAndroid Build Coastguard Worker3:      // Step to the next column strip
2657*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #16             // w -= 16
2658*c0909341SAndroid Build Coastguard Worker        b.le            9f
2659*c0909341SAndroid Build Coastguard Worker
2660*c0909341SAndroid Build Coastguard Worker        lsr             x1,  x1,  #1              // x1 = stride
2661*c0909341SAndroid Build Coastguard Worker        msub            x0,  x1,  x12, x0         // ptr -= h * stride
2662*c0909341SAndroid Build Coastguard Worker        msub            x13, x1,  x12, x13
2663*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // x1 = 2*stride
2664*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #16
2665*c0909341SAndroid Build Coastguard Worker        add             x13, x13, #16
2666*c0909341SAndroid Build Coastguard Worker        mov             w5,  w12                  // reset h
2667*c0909341SAndroid Build Coastguard Worker        b               1b
2668*c0909341SAndroid Build Coastguard Worker
2669*c0909341SAndroid Build Coastguard Worker9:      // Restore d8-d15, release the 0x40 byte frame and return
2670*c0909341SAndroid Build Coastguard Worker        ldp             d14, d15, [sp, #0x30]
2671*c0909341SAndroid Build Coastguard Worker        ldp             d12, d13, [sp, #0x20]
2672*c0909341SAndroid Build Coastguard Worker        ldp             d10, d11, [sp, #0x10]
2673*c0909341SAndroid Build Coastguard Worker        ldp             d8,  d9,  [sp], 0x40
2674*c0909341SAndroid Build Coastguard Worker        ret
2675*c0909341SAndroid Build Coastguard Workerendfunc
2676*c0909341SAndroid Build Coastguard Worker
// Jump table: each entry is a 32-bit offset from the start of the table to
// one of the numeric local labels 640/320/160/80/40 defined earlier in the
// file (the "b" suffix is a backward reference).
// NOTE(review): presumably indexed by block width (64 down to 4) by the
// ipred_z2_fill1 dispatch code, which is outside this chunk -- confirm.
2677*c0909341SAndroid Build Coastguard Workerjumptable ipred_z2_fill1_tbl
2678*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_z2_fill1_tbl
2679*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_z2_fill1_tbl
2680*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_z2_fill1_tbl
2681*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_z2_fill1_tbl
2682*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_z2_fill1_tbl
2683*c0909341SAndroid Build Coastguard Workerendjumptable
2684*c0909341SAndroid Build Coastguard Worker
// ipred_z2_fill2_8bpc_neon: z2 fill for the upsampled-top case. Every output
// pixel is either interpolated from the left edge or from the top edge; the
// top edge is read deinterleaved (uzp1/uzp2), i.e. base_x advances by two
// top[] bytes per output pixel.
//
// Register usage, as established by the code below:
//   x0 = dst, written with post-increment by x1 (stride) after every row
//   x2 = top, x3 = left
//   w4 = path select: 8 -> 8 pixels per row (label 80), anything else ->
//        4 pixels per row (label 40). Presumably the block width; confirm
//        against the caller.
//   w5 = remaining rows; two rows are produced per loop iteration
//   w6 = dx, w7 = dy
// Only x0-x11, v0-v7 and v16-v31 are touched, so nothing is saved on the
// stack.
2685*c0909341SAndroid Build Coastguard Workerfunction ipred_z2_fill2_8bpc_neon, export=1
2686*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #8                   // flags stay live until the b.eq below
2687*c0909341SAndroid Build Coastguard Worker        mov             w8,  #(2 << 6)            // xpos = 2 << 6
2688*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2689*c0909341SAndroid Build Coastguard Worker
2690*c0909341SAndroid Build Coastguard Worker        movrel          x11, increments
2691*c0909341SAndroid Build Coastguard Worker        ld1             {v31.8h},  [x11]          // increments
2692*c0909341SAndroid Build Coastguard Worker        neg             w7,  w7                   // -dy
2693*c0909341SAndroid Build Coastguard Worker        b.eq            80f                       // w4 == 8: 8 pixels per row
2694*c0909341SAndroid Build Coastguard Worker
2695*c0909341SAndroid Build Coastguard Worker40:                                               // 4 pixels per row
2696*c0909341SAndroid Build Coastguard Worker        dup             v30.4h,  w7               // -dy
2697*c0909341SAndroid Build Coastguard Worker        movi            v17.8b,  #1               // constant 1 for the base_y offsets
2698*c0909341SAndroid Build Coastguard Worker
2699*c0909341SAndroid Build Coastguard Worker        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
2700*c0909341SAndroid Build Coastguard Worker        movi            v25.16b, #0x3e            // mask used to extract frac_x/frac_y
2701*c0909341SAndroid Build Coastguard Worker        add             v30.4h,  v16.4h,  v30.4h  // ypos = {1,2,3,4} * -dy
2702*c0909341SAndroid Build Coastguard Worker
2703*c0909341SAndroid Build Coastguard Worker        xtn             v31.8b,  v31.8h           // {0,1,2,3}
2704*c0909341SAndroid Build Coastguard Worker
2705*c0909341SAndroid Build Coastguard Worker        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
2706*c0909341SAndroid Build Coastguard Worker        // from left.
2707*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b}, [x3]            // left[]
2708*c0909341SAndroid Build Coastguard Worker
2709*c0909341SAndroid Build Coastguard Worker        movi            v26.16b, #64              // 64, for 64 - frac
2710*c0909341SAndroid Build Coastguard Worker        movi            v19.16b, #2               // constant 2 for base_y + 2
2711*c0909341SAndroid Build Coastguard Worker
2712*c0909341SAndroid Build Coastguard Worker        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
2713*c0909341SAndroid Build Coastguard Worker        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
2714*c0909341SAndroid Build Coastguard Worker        and             v27.8b,  v27.8b,  v25.8b  // frac_y
2715*c0909341SAndroid Build Coastguard Worker
2716*c0909341SAndroid Build Coastguard Worker        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
2717*c0909341SAndroid Build Coastguard Worker
2718*c0909341SAndroid Build Coastguard Worker        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1
2719*c0909341SAndroid Build Coastguard Worker        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2
2720*c0909341SAndroid Build Coastguard Worker
2721*c0909341SAndroid Build Coastguard Worker        tbl             v16.8b, {v0.16b}, v29.8b  // left[base_y]
2722*c0909341SAndroid Build Coastguard Worker
2723*c0909341SAndroid Build Coastguard Worker        trn1            v30.2s,  v30.2s,  v28.2s  // base_y + 1, base_y + 2
2724*c0909341SAndroid Build Coastguard Worker
2725*c0909341SAndroid Build Coastguard Worker        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y
2726*c0909341SAndroid Build Coastguard Worker
2727*c0909341SAndroid Build Coastguard Worker        trn1            v31.2s,  v31.2s,  v31.2s  // {0,1,2,3,0,1,2,3}
2728*c0909341SAndroid Build Coastguard Worker
2729*c0909341SAndroid Build Coastguard Worker        trn1            v27.2s,  v27.2s,  v27.2s  // frac_y
2730*c0909341SAndroid Build Coastguard Worker        trn1            v28.2s,  v28.2s,  v28.2s  // 64 - frac_y
2731*c0909341SAndroid Build Coastguard Worker
2732*c0909341SAndroid Build Coastguard Worker        movi            v29.8b,  #2               // v29 reused: base_y step per iteration (2 rows)
2733*c0909341SAndroid Build Coastguard Worker        add             v31.8b,  v31.8b,  v31.8b  // {0,2,4,6,0,2,4,6}
2734*c0909341SAndroid Build Coastguard Worker4:                                                // blended loop, 2 rows per iteration
2735*c0909341SAndroid Build Coastguard Worker        asr             w9,  w8,  #6              // base_x
2736*c0909341SAndroid Build Coastguard Worker        dup             v6.4h,   w8               // xpos
2737*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2738*c0909341SAndroid Build Coastguard Worker        cmp             w9,  #-8                  // base_x <= -8
2739*c0909341SAndroid Build Coastguard Worker        asr             w11, w8,  #6              // base_x for the second row
2740*c0909341SAndroid Build Coastguard Worker        b.le            49f                       // base_x + {0,2,4,6} all < 0: left-only loop
2741*c0909341SAndroid Build Coastguard Worker
2742*c0909341SAndroid Build Coastguard Worker        dup             v7.4h,   w8               // xpos
2743*c0909341SAndroid Build Coastguard Worker
2744*c0909341SAndroid Build Coastguard Worker        ldr             d2,  [x2, w9, sxtw]       // top[base_x]
2745*c0909341SAndroid Build Coastguard Worker        ldr             d4,  [x2, w11, sxtw]      // top[base_x] for the second row
2746*c0909341SAndroid Build Coastguard Worker
2747*c0909341SAndroid Build Coastguard Worker        trn1            v6.2d,   v6.2d,   v7.2d   // xpos
2748*c0909341SAndroid Build Coastguard Worker
2749*c0909341SAndroid Build Coastguard Worker        tbl             v17.8b, {v0.16b}, v30.8b  // left[base_y+1], left[base_y+2]
2750*c0909341SAndroid Build Coastguard Worker
2751*c0909341SAndroid Build Coastguard Worker        shrn            v20.8b,  v6.8h,   #6      // first base_x for each row
2752*c0909341SAndroid Build Coastguard Worker        xtn             v6.8b,   v6.8h            // (uint8_t)xpos
2753*c0909341SAndroid Build Coastguard Worker
2754*c0909341SAndroid Build Coastguard Worker        uzp2            v3.8b,   v2.8b,   v4.8b   // top[base_x+1]
2755*c0909341SAndroid Build Coastguard Worker        uzp1            v2.8b,   v2.8b,   v4.8b   // top[base_x]
2756*c0909341SAndroid Build Coastguard Worker
2757*c0909341SAndroid Build Coastguard Worker        and             v6.8b,   v6.8b,   v25.8b  // frac_x
2758*c0909341SAndroid Build Coastguard Worker
2759*c0909341SAndroid Build Coastguard Worker        trn1            v16.2s,  v16.2s,  v17.2s  // left[base_y], left[base_y+1]
2760*c0909341SAndroid Build Coastguard Worker
2761*c0909341SAndroid Build Coastguard Worker        sub             v7.8b,   v26.8b,  v6.8b   // 64 - frac_x
2762*c0909341SAndroid Build Coastguard Worker
2763*c0909341SAndroid Build Coastguard Worker        add             v20.8b,  v20.8b,  v31.8b  // actual base_x
2764*c0909341SAndroid Build Coastguard Worker
2765*c0909341SAndroid Build Coastguard Worker        umull           v16.8h,  v16.8b,  v28.8b  // left[base_y]*(64-frac_y)
2766*c0909341SAndroid Build Coastguard Worker        umlal           v16.8h,  v17.8b,  v27.8b  // + left[base_y+1]*frac_y
2767*c0909341SAndroid Build Coastguard Worker
2768*c0909341SAndroid Build Coastguard Worker        umull           v22.8h,  v2.8b,   v7.8b   // top[base_x]*(64-frac_x)
2769*c0909341SAndroid Build Coastguard Worker        umlal           v22.8h,  v3.8b,   v6.8b   // + top[base_x+1]*frac_x
2770*c0909341SAndroid Build Coastguard Worker
2771*c0909341SAndroid Build Coastguard Worker        cmge            v20.8b,  v20.8b,  #0      // per-pixel mask: base_x >= 0
2772*c0909341SAndroid Build Coastguard Worker
2773*c0909341SAndroid Build Coastguard Worker        rshrn           v16.8b,  v16.8h,  #6
2774*c0909341SAndroid Build Coastguard Worker        rshrn           v22.8b,  v22.8h,  #6
2775*c0909341SAndroid Build Coastguard Worker
2776*c0909341SAndroid Build Coastguard Worker        bit             v16.8b,  v22.8b,  v20.8b  // top result where base_x >= 0, else left
2777*c0909341SAndroid Build Coastguard Worker
2778*c0909341SAndroid Build Coastguard Worker        st1             {v16.s}[0], [x0], x1      // first row (4 pixels)
2779*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2780*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2              // two rows done
2781*c0909341SAndroid Build Coastguard Worker        st1             {v16.s}[1], [x0], x1      // second row (4 pixels)
2782*c0909341SAndroid Build Coastguard Worker        b.le            9f
2783*c0909341SAndroid Build Coastguard Worker
2784*c0909341SAndroid Build Coastguard Worker        ext             v16.8b,  v17.8b,  v17.8b, #4 // old left[base_y+2] becomes the next left[base_y]
2785*c0909341SAndroid Build Coastguard Worker        add             v30.8b,  v30.8b,  v29.8b  // base_y += 2
2786*c0909341SAndroid Build Coastguard Worker        b               4b
2787*c0909341SAndroid Build Coastguard Worker
2788*c0909341SAndroid Build Coastguard Worker49:                                               // left-only loop, 2 rows per iteration
2789*c0909341SAndroid Build Coastguard Worker        tbl             v17.8b, {v0.16b}, v30.8b  // left[base_y+1], left[base_y+2]
2790*c0909341SAndroid Build Coastguard Worker
2791*c0909341SAndroid Build Coastguard Worker        trn1            v16.2s,  v16.2s,  v17.2s  // left[base_y], left[base_y+1]
2792*c0909341SAndroid Build Coastguard Worker
2793*c0909341SAndroid Build Coastguard Worker        umull           v18.8h,  v16.8b,  v28.8b  // left[base_y]*(64-frac_y)
2794*c0909341SAndroid Build Coastguard Worker        umlal           v18.8h,  v17.8b,  v27.8b  // + left[base_y+1]*frac_y
2795*c0909341SAndroid Build Coastguard Worker        rshrn           v18.8b,  v18.8h,  #6
2796*c0909341SAndroid Build Coastguard Worker
2797*c0909341SAndroid Build Coastguard Worker        st1             {v18.s}[0], [x0], x1      // first row (4 pixels)
2798*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2              // two rows done
2799*c0909341SAndroid Build Coastguard Worker        st1             {v18.s}[1], [x0], x1      // second row (4 pixels)
2800*c0909341SAndroid Build Coastguard Worker        b.le            9f
2801*c0909341SAndroid Build Coastguard Worker
2802*c0909341SAndroid Build Coastguard Worker        ext             v16.8b,  v17.8b,  v17.8b, #4 // old left[base_y+2] becomes the next left[base_y]
2803*c0909341SAndroid Build Coastguard Worker        add             v30.8b,  v30.8b,  v29.8b  // base_y += 2
2804*c0909341SAndroid Build Coastguard Worker        b               49b
2805*c0909341SAndroid Build Coastguard Worker
2806*c0909341SAndroid Build Coastguard Worker9:
2807*c0909341SAndroid Build Coastguard Worker        ret
2808*c0909341SAndroid Build Coastguard Worker
2809*c0909341SAndroid Build Coastguard Worker80:                                               // 8 pixels per row
2810*c0909341SAndroid Build Coastguard Worker        dup             v30.8h,  w7               // -dy
2811*c0909341SAndroid Build Coastguard Worker        movi            v17.8b,  #1               // constant 1 for the base_y offsets
2812*c0909341SAndroid Build Coastguard Worker
2813*c0909341SAndroid Build Coastguard Worker        mul             v16.8h,  v31.8h,  v30.8h  // {0,1,2,3,4,5,6,7}* -dy
2814*c0909341SAndroid Build Coastguard Worker        movi            v25.16b, #0x3e            // mask used to extract frac_x/frac_y
2815*c0909341SAndroid Build Coastguard Worker        add             v30.8h,  v16.8h,  v30.8h  // ypos = {1,2,...,8} * -dy
2816*c0909341SAndroid Build Coastguard Worker
2817*c0909341SAndroid Build Coastguard Worker        xtn             v31.8b,  v31.8h           // {0,1,2,3,4,5,6,7}
2818*c0909341SAndroid Build Coastguard Worker
2819*c0909341SAndroid Build Coastguard Worker        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
2820*c0909341SAndroid Build Coastguard Worker        // from left.
2821*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b}, [x3]    // left[]
2822*c0909341SAndroid Build Coastguard Worker
2823*c0909341SAndroid Build Coastguard Worker        movi            v26.16b, #64              // 64, for 64 - frac
2824*c0909341SAndroid Build Coastguard Worker        movi            v19.16b, #2               // constant 2 for base_y + 2
2825*c0909341SAndroid Build Coastguard Worker
2826*c0909341SAndroid Build Coastguard Worker        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
2827*c0909341SAndroid Build Coastguard Worker        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
2828*c0909341SAndroid Build Coastguard Worker        and             v27.8b,  v27.8b,  v25.8b  // frac_y
2829*c0909341SAndroid Build Coastguard Worker
2830*c0909341SAndroid Build Coastguard Worker        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
2831*c0909341SAndroid Build Coastguard Worker
2832*c0909341SAndroid Build Coastguard Worker        tbl             v18.8b, {v0.16b}, v29.8b  // left[base_y]
2833*c0909341SAndroid Build Coastguard Worker
2834*c0909341SAndroid Build Coastguard Worker        add             v30.8b,  v29.8b,  v19.8b  // base_y + 2
2835*c0909341SAndroid Build Coastguard Worker        add             v29.8b,  v29.8b,  v17.8b  // base_y + 1
2836*c0909341SAndroid Build Coastguard Worker
2837*c0909341SAndroid Build Coastguard Worker        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y
2838*c0909341SAndroid Build Coastguard Worker
2839*c0909341SAndroid Build Coastguard Worker        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}
2840*c0909341SAndroid Build Coastguard Worker
2841*c0909341SAndroid Build Coastguard Worker        movi            v24.8b,  #2               // base_y step per iteration (2 rows)
2842*c0909341SAndroid Build Coastguard Worker        add             v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14}
2843*c0909341SAndroid Build Coastguard Worker8:                                                // blended loop, 2 rows per iteration
2844*c0909341SAndroid Build Coastguard Worker        asr             w9,  w8,  #6              // base_x
2845*c0909341SAndroid Build Coastguard Worker        dup             v16.8h,   w8              // xpos
2846*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2847*c0909341SAndroid Build Coastguard Worker        cmp             w9,  #-16                 // base_x <= -16
2848*c0909341SAndroid Build Coastguard Worker        asr             w11, w8,  #6              // base_x for the second row
2849*c0909341SAndroid Build Coastguard Worker        b.le            89f                       // base_x + {0,2,...,14} all < 0: left-only loop
2850*c0909341SAndroid Build Coastguard Worker
2851*c0909341SAndroid Build Coastguard Worker        dup             v17.8h,   w8              // xpos
2852*c0909341SAndroid Build Coastguard Worker
2853*c0909341SAndroid Build Coastguard Worker        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
2854*c0909341SAndroid Build Coastguard Worker        ldr             q6,  [x2, w11, sxtw]      // top[base_x] for the second row
2855*c0909341SAndroid Build Coastguard Worker
2856*c0909341SAndroid Build Coastguard Worker        tbl             v19.8b, {v0.16b}, v29.8b  // left[base_y+1]
2857*c0909341SAndroid Build Coastguard Worker
2858*c0909341SAndroid Build Coastguard Worker        shrn            v21.8b,  v16.8h,  #6      // first base_x
2859*c0909341SAndroid Build Coastguard Worker        shrn2           v21.16b, v17.8h,  #6
2860*c0909341SAndroid Build Coastguard Worker        xtn             v16.8b,  v16.8h           // (uint8_t)xpos
2861*c0909341SAndroid Build Coastguard Worker        xtn2            v16.16b, v17.8h
2862*c0909341SAndroid Build Coastguard Worker
2863*c0909341SAndroid Build Coastguard Worker        tbl             v20.8b, {v0.16b}, v30.8b  // left[base_y+2]
2864*c0909341SAndroid Build Coastguard Worker
2865*c0909341SAndroid Build Coastguard Worker        uzp2            v5.16b,  v4.16b,  v6.16b  // top[base_x+1]
2866*c0909341SAndroid Build Coastguard Worker        uzp1            v4.16b,  v4.16b,  v6.16b  // top[base_x]
2867*c0909341SAndroid Build Coastguard Worker
2868*c0909341SAndroid Build Coastguard Worker        and             v16.16b, v16.16b, v25.16b // frac_x
2869*c0909341SAndroid Build Coastguard Worker
2870*c0909341SAndroid Build Coastguard Worker        sub             v7.16b,  v26.16b, v16.16b // 64 - frac_x
2871*c0909341SAndroid Build Coastguard Worker
2872*c0909341SAndroid Build Coastguard Worker        add             v21.16b, v21.16b, v31.16b // actual base_x
2873*c0909341SAndroid Build Coastguard Worker
2874*c0909341SAndroid Build Coastguard Worker        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
2875*c0909341SAndroid Build Coastguard Worker        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
2876*c0909341SAndroid Build Coastguard Worker        umull           v17.8h,  v19.8b,  v28.8b
2877*c0909341SAndroid Build Coastguard Worker        umlal           v17.8h,  v20.8b,  v27.8b
2878*c0909341SAndroid Build Coastguard Worker
2879*c0909341SAndroid Build Coastguard Worker        umull           v22.8h,  v4.8b,   v7.8b   // top[base_x]*(64-frac_x)
2880*c0909341SAndroid Build Coastguard Worker        umlal           v22.8h,  v5.8b,   v16.8b  // + top[base_x+1]*frac_x
2881*c0909341SAndroid Build Coastguard Worker        umull2          v23.8h,  v4.16b,  v7.16b
2882*c0909341SAndroid Build Coastguard Worker        umlal2          v23.8h,  v5.16b,  v16.16b
2883*c0909341SAndroid Build Coastguard Worker
2884*c0909341SAndroid Build Coastguard Worker        cmge            v21.16b, v21.16b, #0      // per-pixel mask: base_x >= 0
2885*c0909341SAndroid Build Coastguard Worker
2886*c0909341SAndroid Build Coastguard Worker        rshrn           v6.8b,   v6.8h,   #6
2887*c0909341SAndroid Build Coastguard Worker        rshrn2          v6.16b,  v17.8h,  #6
2888*c0909341SAndroid Build Coastguard Worker        rshrn           v22.8b,  v22.8h,  #6
2889*c0909341SAndroid Build Coastguard Worker        rshrn2          v22.16b, v23.8h,  #6
2890*c0909341SAndroid Build Coastguard Worker
2891*c0909341SAndroid Build Coastguard Worker        bit             v6.16b,  v22.16b, v21.16b // top result where base_x >= 0, else left
2892*c0909341SAndroid Build Coastguard Worker
2893*c0909341SAndroid Build Coastguard Worker        st1             {v6.d}[0], [x0], x1       // first row (8 pixels)
2894*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2895*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2              // two rows done
2896*c0909341SAndroid Build Coastguard Worker        st1             {v6.d}[1], [x0], x1       // second row (8 pixels)
2897*c0909341SAndroid Build Coastguard Worker        b.le            9f
2898*c0909341SAndroid Build Coastguard Worker
2899*c0909341SAndroid Build Coastguard Worker        mov             v18.8b,  v20.8b           // old left[base_y+2] becomes the next left[base_y]
2900*c0909341SAndroid Build Coastguard Worker        add             v29.8b,  v29.8b,  v24.8b  // base_y += 2
2901*c0909341SAndroid Build Coastguard Worker        add             v30.8b,  v30.8b,  v24.8b  // base_y += 2
2902*c0909341SAndroid Build Coastguard Worker        b               8b
2903*c0909341SAndroid Build Coastguard Worker
2904*c0909341SAndroid Build Coastguard Worker89:                                               // left-only loop, 2 rows per iteration
2905*c0909341SAndroid Build Coastguard Worker        tbl             v19.8b, {v0.16b}, v29.8b  // left[base_y+1]
2906*c0909341SAndroid Build Coastguard Worker        tbl             v20.8b, {v0.16b}, v30.8b  // left[base_y+2]
2907*c0909341SAndroid Build Coastguard Worker
2908*c0909341SAndroid Build Coastguard Worker        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
2909*c0909341SAndroid Build Coastguard Worker        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
2910*c0909341SAndroid Build Coastguard Worker        umull           v17.8h,  v19.8b,  v28.8b
2911*c0909341SAndroid Build Coastguard Worker        umlal           v17.8h,  v20.8b,  v27.8b
2912*c0909341SAndroid Build Coastguard Worker
2913*c0909341SAndroid Build Coastguard Worker        rshrn           v6.8b,   v6.8h,   #6
2914*c0909341SAndroid Build Coastguard Worker        rshrn2          v6.16b,  v17.8h,  #6
2915*c0909341SAndroid Build Coastguard Worker
2916*c0909341SAndroid Build Coastguard Worker        st1             {v6.d}[0], [x0], x1       // first row (8 pixels)
2917*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2              // two rows done
2918*c0909341SAndroid Build Coastguard Worker        st1             {v6.d}[1], [x0], x1       // second row (8 pixels)
2919*c0909341SAndroid Build Coastguard Worker        b.le            9f
2920*c0909341SAndroid Build Coastguard Worker
2921*c0909341SAndroid Build Coastguard Worker        mov             v18.8b,  v20.8b           // old left[base_y+2] becomes the next left[base_y]
2922*c0909341SAndroid Build Coastguard Worker        add             v29.8b,  v29.8b,  v24.8b  // base_y += 2
2923*c0909341SAndroid Build Coastguard Worker        add             v30.8b,  v30.8b,  v24.8b  // base_y += 2
2924*c0909341SAndroid Build Coastguard Worker        b               89b
2925*c0909341SAndroid Build Coastguard Worker
2926*c0909341SAndroid Build Coastguard Worker9:
2927*c0909341SAndroid Build Coastguard Worker        ret
2928*c0909341SAndroid Build Coastguard Workerendfunc
2929*c0909341SAndroid Build Coastguard Worker
2930*c0909341SAndroid Build Coastguard Workerfunction ipred_z2_fill3_8bpc_neon, export=1
2931*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #8
2932*c0909341SAndroid Build Coastguard Worker        mov             w8,  #(1 << 6)            // xpos = 1 << 6
2933*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2934*c0909341SAndroid Build Coastguard Worker
2935*c0909341SAndroid Build Coastguard Worker        movrel          x11, increments
2936*c0909341SAndroid Build Coastguard Worker        ld1             {v31.8h},  [x11]          // increments
2937*c0909341SAndroid Build Coastguard Worker        neg             w7,  w7                   // -dy
2938*c0909341SAndroid Build Coastguard Worker        b.eq            80f
2939*c0909341SAndroid Build Coastguard Worker
2940*c0909341SAndroid Build Coastguard Worker40:
2941*c0909341SAndroid Build Coastguard Worker        dup             v30.4h,  w7               // -dy
2942*c0909341SAndroid Build Coastguard Worker        movi            v17.8b,  #1
2943*c0909341SAndroid Build Coastguard Worker
2944*c0909341SAndroid Build Coastguard Worker        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
2945*c0909341SAndroid Build Coastguard Worker        movi            v25.16b, #0x3e
2946*c0909341SAndroid Build Coastguard Worker        add             v30.4h,  v16.4h,  v30.4h  // -= dy
2947*c0909341SAndroid Build Coastguard Worker
2948*c0909341SAndroid Build Coastguard Worker        xtn             v31.8b,  v31.8h           // {0,1,2,3}
2949*c0909341SAndroid Build Coastguard Worker
2950*c0909341SAndroid Build Coastguard Worker        // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
2951*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b}, [x3]    // left[]
2952*c0909341SAndroid Build Coastguard Worker
2953*c0909341SAndroid Build Coastguard Worker        movi            v26.16b, #64
2954*c0909341SAndroid Build Coastguard Worker        movi            v19.16b, #2
2955*c0909341SAndroid Build Coastguard Worker
2956*c0909341SAndroid Build Coastguard Worker        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
2957*c0909341SAndroid Build Coastguard Worker        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
2958*c0909341SAndroid Build Coastguard Worker        and             v27.8b,  v27.8b,  v25.8b  // frac_y
2959*c0909341SAndroid Build Coastguard Worker
2960*c0909341SAndroid Build Coastguard Worker        add             v29.8b,  v29.8b,  v19.8b  // base_y = (ypos >> 6) + 2
2961*c0909341SAndroid Build Coastguard Worker
2962*c0909341SAndroid Build Coastguard Worker        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1
2963*c0909341SAndroid Build Coastguard Worker        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2
2964*c0909341SAndroid Build Coastguard Worker
2965*c0909341SAndroid Build Coastguard Worker        trn1            v31.2s,  v31.2s,  v31.2s  // {0,1,2,3,0,1,2,3}
2966*c0909341SAndroid Build Coastguard Worker
2967*c0909341SAndroid Build Coastguard Worker        add             v24.8b,  v30.8b,  v19.8b  // base_y + 3
2968*c0909341SAndroid Build Coastguard Worker
2969*c0909341SAndroid Build Coastguard Worker        trn1            v29.2s,  v29.2s,  v28.2s  // base_y + 0, base_y + 2
2970*c0909341SAndroid Build Coastguard Worker        trn1            v30.2s,  v30.2s,  v24.2s  // base_y + 1, base_y + 3
2971*c0909341SAndroid Build Coastguard Worker
2972*c0909341SAndroid Build Coastguard Worker        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y
2973*c0909341SAndroid Build Coastguard Worker
2974*c0909341SAndroid Build Coastguard Worker        trn1            v27.2s,  v27.2s,  v27.2s  // frac_y
2975*c0909341SAndroid Build Coastguard Worker        trn1            v28.2s,  v28.2s,  v28.2s  // 64 - frac_y
2976*c0909341SAndroid Build Coastguard Worker
2977*c0909341SAndroid Build Coastguard Worker        movi            v24.8b,  #4
2978*c0909341SAndroid Build Coastguard Worker4:
2979*c0909341SAndroid Build Coastguard Worker        asr             w9,  w8,  #6              // base_x
2980*c0909341SAndroid Build Coastguard Worker        dup             v6.4h,   w8               // xpos
2981*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2982*c0909341SAndroid Build Coastguard Worker        cmp             w9,  #-4                  // base_x <= -4
2983*c0909341SAndroid Build Coastguard Worker        asr             w11, w8,  #6              // base_x
2984*c0909341SAndroid Build Coastguard Worker        b.le            49f
2985*c0909341SAndroid Build Coastguard Worker
2986*c0909341SAndroid Build Coastguard Worker        dup             v7.4h,   w8               // xpos
2987*c0909341SAndroid Build Coastguard Worker
2988*c0909341SAndroid Build Coastguard Worker        ldr             d2,  [x2, w9, sxtw]       // top[base_x]
2989*c0909341SAndroid Build Coastguard Worker        ldr             d4,  [x2, w11, sxtw]
2990*c0909341SAndroid Build Coastguard Worker
2991*c0909341SAndroid Build Coastguard Worker        trn1            v6.2d,   v6.2d,   v7.2d   // xpos
2992*c0909341SAndroid Build Coastguard Worker
2993*c0909341SAndroid Build Coastguard Worker        tbl             v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2]
2994*c0909341SAndroid Build Coastguard Worker        tbl             v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3]
2995*c0909341SAndroid Build Coastguard Worker
2996*c0909341SAndroid Build Coastguard Worker        shrn            v20.8b,  v6.8h,   #6      // first base_x for each row
2997*c0909341SAndroid Build Coastguard Worker        xtn             v6.8b,   v6.8h            // (uint8_t)xpos
2998*c0909341SAndroid Build Coastguard Worker
2999*c0909341SAndroid Build Coastguard Worker        ext             v3.8b,   v2.8b,   v2.8b,   #1 // top[base_x+1]
3000*c0909341SAndroid Build Coastguard Worker        ext             v5.8b,   v4.8b,   v4.8b,   #1
3001*c0909341SAndroid Build Coastguard Worker
3002*c0909341SAndroid Build Coastguard Worker        and             v6.8b,   v6.8b,   v25.8b  // frac_x
3003*c0909341SAndroid Build Coastguard Worker
3004*c0909341SAndroid Build Coastguard Worker        trn1            v2.2s,   v2.2s,   v4.2s   // top[base_x]
3005*c0909341SAndroid Build Coastguard Worker        trn1            v3.2s,   v3.2s,   v5.2s   // top[base_x+1]
3006*c0909341SAndroid Build Coastguard Worker
3007*c0909341SAndroid Build Coastguard Worker        sub             v7.8b,   v26.8b,  v6.8b   // 64 - frac_x
3008*c0909341SAndroid Build Coastguard Worker
3009*c0909341SAndroid Build Coastguard Worker        add             v20.8b,  v20.8b,  v31.8b  // actual base_x
3010*c0909341SAndroid Build Coastguard Worker
3011*c0909341SAndroid Build Coastguard Worker        umull           v16.8h,  v16.8b,  v28.8b  // left[base_y]*(64-frac_y)
3012*c0909341SAndroid Build Coastguard Worker        umlal           v16.8h,  v17.8b,  v27.8b  // + left[base_y+1]*frac_y
3013*c0909341SAndroid Build Coastguard Worker
3014*c0909341SAndroid Build Coastguard Worker        umull           v22.8h,  v2.8b,   v7.8b   // top[base_x]-*(64-frac_x)
3015*c0909341SAndroid Build Coastguard Worker        umlal           v22.8h,  v3.8b,   v6.8b   // + top[base_x+1]*frac_x
3016*c0909341SAndroid Build Coastguard Worker
3017*c0909341SAndroid Build Coastguard Worker        cmge            v20.8b,  v20.8b,  #0
3018*c0909341SAndroid Build Coastguard Worker
3019*c0909341SAndroid Build Coastguard Worker        rshrn           v16.8b,  v16.8h,  #6
3020*c0909341SAndroid Build Coastguard Worker        rshrn           v22.8b,  v22.8h,  #6
3021*c0909341SAndroid Build Coastguard Worker
3022*c0909341SAndroid Build Coastguard Worker        bit             v16.8b,  v22.8b,  v20.8b
3023*c0909341SAndroid Build Coastguard Worker
3024*c0909341SAndroid Build Coastguard Worker        st1             {v16.s}[0], [x0], x1
3025*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
3026*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
3027*c0909341SAndroid Build Coastguard Worker        st1             {v16.s}[1], [x0], x1
3028*c0909341SAndroid Build Coastguard Worker        b.le            9f
3029*c0909341SAndroid Build Coastguard Worker
3030*c0909341SAndroid Build Coastguard Worker        add             v29.8b,  v29.8b,  v24.8b  // base_y += 4
3031*c0909341SAndroid Build Coastguard Worker        add             v30.8b,  v30.8b,  v24.8b  // base_y += 4
3032*c0909341SAndroid Build Coastguard Worker        b               4b
3033*c0909341SAndroid Build Coastguard Worker
3034*c0909341SAndroid Build Coastguard Worker49:      // Left-only loop: two 4-pixel rows per iteration, interpolated from left[] alone. It only exits through 9f and never returns to 4b.
3035*c0909341SAndroid Build Coastguard Worker        tbl             v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2]
3036*c0909341SAndroid Build Coastguard Worker        tbl             v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3]
3037*c0909341SAndroid Build Coastguard Worker
3038*c0909341SAndroid Build Coastguard Worker        umull           v18.8h,  v16.8b,  v28.8b  // left[base_y]*(64-frac_y)
3039*c0909341SAndroid Build Coastguard Worker        umlal           v18.8h,  v17.8b,  v27.8b  // + left[base_y+1]*frac_y
3040*c0909341SAndroid Build Coastguard Worker        rshrn           v18.8b,  v18.8h,  #6      // rounding narrow: (sum + 32) >> 6
3041*c0909341SAndroid Build Coastguard Worker
3042*c0909341SAndroid Build Coastguard Worker        st1             {v18.s}[0], [x0], x1      // first row (4 bytes), then dst += stride
3043*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2              // two rows consumed; flags are read by b.le below (st1 leaves them alone)
3044*c0909341SAndroid Build Coastguard Worker        st1             {v18.s}[1], [x0], x1      // second row (4 bytes), then dst += stride
3045*c0909341SAndroid Build Coastguard Worker        b.le            9f
3046*c0909341SAndroid Build Coastguard Worker
3047*c0909341SAndroid Build Coastguard Worker        add             v29.8b,  v29.8b,  v24.8b  // base_y += 4
3048*c0909341SAndroid Build Coastguard Worker        add             v30.8b,  v30.8b,  v24.8b  // base_y += 4
3049*c0909341SAndroid Build Coastguard Worker        b               49b
3050*c0909341SAndroid Build Coastguard Worker
3051*c0909341SAndroid Build Coastguard Worker9:       // Common exit, reached from the b.le 9f branches above.
3052*c0909341SAndroid Build Coastguard Worker        ret
3053*c0909341SAndroid Build Coastguard Worker
3054*c0909341SAndroid Build Coastguard Worker80:      // 8-pixel-wide path: each iteration produces two 8-byte rows (low and high half of a q register).
3055*c0909341SAndroid Build Coastguard Worker        dup             v30.8h,  w7               // -dy
3056*c0909341SAndroid Build Coastguard Worker        movi            v17.8b,  #1
3057*c0909341SAndroid Build Coastguard Worker        // Per the comments below, on entry here: v31.8h = {0..7}, x2 = top, x3 = left, w5 = rows left, w6 = dx, w8 = xpos.
3058*c0909341SAndroid Build Coastguard Worker        mul             v16.8h,  v31.8h,  v30.8h  // {0,1,2,3,4,5,6,7}* -dy
3059*c0909341SAndroid Build Coastguard Worker        movi            v25.16b, #0x3e            // mask applied to the low byte of a position to get frac
3060*c0909341SAndroid Build Coastguard Worker        add             v30.8h,  v16.8h,  v30.8h  // -= dy
3061*c0909341SAndroid Build Coastguard Worker
3062*c0909341SAndroid Build Coastguard Worker        xtn             v31.8b,  v31.8h           // {0,1,2,3,4,5,6,7}
3063*c0909341SAndroid Build Coastguard Worker
3064*c0909341SAndroid Build Coastguard Worker        // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
3065*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b, v2.16b}, [x3]    // left[]
3066*c0909341SAndroid Build Coastguard Worker
3067*c0909341SAndroid Build Coastguard Worker        movi            v26.16b, #64
3068*c0909341SAndroid Build Coastguard Worker        movi            v19.16b, #2
3069*c0909341SAndroid Build Coastguard Worker
3070*c0909341SAndroid Build Coastguard Worker        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
3071*c0909341SAndroid Build Coastguard Worker        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
3072*c0909341SAndroid Build Coastguard Worker        and             v27.8b,  v27.8b,  v25.8b  // frac_y
3073*c0909341SAndroid Build Coastguard Worker
3074*c0909341SAndroid Build Coastguard Worker        add             v29.8b,  v29.8b,  v19.8b  // base_y = (ypos >> 6) + 2
3075*c0909341SAndroid Build Coastguard Worker
3076*c0909341SAndroid Build Coastguard Worker        add             v28.8b,  v29.8b,  v17.8b  // base_y + 1
3077*c0909341SAndroid Build Coastguard Worker        add             v30.8b,  v29.8b,  v19.8b  // base_y + 2
3078*c0909341SAndroid Build Coastguard Worker
3079*c0909341SAndroid Build Coastguard Worker        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}
3080*c0909341SAndroid Build Coastguard Worker        add             v24.8b,  v28.8b,  v19.8b  // base_y + 3
3081*c0909341SAndroid Build Coastguard Worker        // From here on v29/v30 hold the tbl indices for two rows at once: low half = first row, high half = second row.
3082*c0909341SAndroid Build Coastguard Worker        trn1            v29.2d,  v29.2d,  v30.2d  // base_y + 0, base_y + 2
3083*c0909341SAndroid Build Coastguard Worker        trn1            v30.2d,  v28.2d,  v24.2d  // base_y + 1, base_y + 3
3084*c0909341SAndroid Build Coastguard Worker
3085*c0909341SAndroid Build Coastguard Worker        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y
3086*c0909341SAndroid Build Coastguard Worker
3087*c0909341SAndroid Build Coastguard Worker        movi            v24.16b, #4               // v24 is reused: now the per-iteration index step added at the loop ends
3088*c0909341SAndroid Build Coastguard Worker
3089*c0909341SAndroid Build Coastguard Worker        trn1            v27.2d,  v27.2d,  v27.2d  // frac_y
3090*c0909341SAndroid Build Coastguard Worker        trn1            v28.2d,  v28.2d,  v28.2d  // 64 - frac_y
3091*c0909341SAndroid Build Coastguard Worker8:       // Main loop: blend a left-based and a top-based prediction, selecting per pixel on the sign of base_x.
3092*c0909341SAndroid Build Coastguard Worker        asr             w9,  w8,  #6              // base_x
3093*c0909341SAndroid Build Coastguard Worker        dup             v16.8h,   w8              // xpos
3094*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
3095*c0909341SAndroid Build Coastguard Worker        cmp             w9,  #-8                  // base_x <= -8
3096*c0909341SAndroid Build Coastguard Worker        asr             w11, w8,  #6              // base_x
3097*c0909341SAndroid Build Coastguard Worker        b.le            89f                       // base_x + {0..7} is negative in every lane, so the top blend would never be selected
3098*c0909341SAndroid Build Coastguard Worker
3099*c0909341SAndroid Build Coastguard Worker        dup             v17.8h,   w8              // xpos
3100*c0909341SAndroid Build Coastguard Worker
3101*c0909341SAndroid Build Coastguard Worker        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
3102*c0909341SAndroid Build Coastguard Worker        ldr             q6,  [x2, w11, sxtw]      // top[base_x] for the second row (index is sign-extended)
3103*c0909341SAndroid Build Coastguard Worker
3104*c0909341SAndroid Build Coastguard Worker        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
3105*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]
3106*c0909341SAndroid Build Coastguard Worker
3107*c0909341SAndroid Build Coastguard Worker        shrn            v21.8b,  v16.8h,  #6      // first base_x
3108*c0909341SAndroid Build Coastguard Worker        shrn2           v21.16b, v17.8h,  #6      // second row's base_x in the high half
3109*c0909341SAndroid Build Coastguard Worker        xtn             v16.8b,  v16.8h           // (uint8_t)xpos
3110*c0909341SAndroid Build Coastguard Worker        xtn2            v16.16b, v17.8h           // second row's (uint8_t)xpos in the high half
3111*c0909341SAndroid Build Coastguard Worker
3112*c0909341SAndroid Build Coastguard Worker        ext             v5.16b,  v4.16b,  v4.16b,  #1 // top[base_x+1]
3113*c0909341SAndroid Build Coastguard Worker        ext             v7.16b,  v6.16b,  v6.16b,  #1
3114*c0909341SAndroid Build Coastguard Worker
3115*c0909341SAndroid Build Coastguard Worker        and             v16.16b, v16.16b, v25.16b // frac_x
3116*c0909341SAndroid Build Coastguard Worker
3117*c0909341SAndroid Build Coastguard Worker        trn1            v4.2d,   v4.2d,   v6.2d   // top[base_x]
3118*c0909341SAndroid Build Coastguard Worker        trn1            v5.2d,   v5.2d,   v7.2d   // top[base_x+1]
3119*c0909341SAndroid Build Coastguard Worker
3120*c0909341SAndroid Build Coastguard Worker        sub             v7.16b,  v26.16b, v16.16b // 64 - frac_x
3121*c0909341SAndroid Build Coastguard Worker
3122*c0909341SAndroid Build Coastguard Worker        add             v21.16b, v21.16b, v31.16b // actual base_x
3123*c0909341SAndroid Build Coastguard Worker
3124*c0909341SAndroid Build Coastguard Worker        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
3125*c0909341SAndroid Build Coastguard Worker        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
3126*c0909341SAndroid Build Coastguard Worker        umull2          v17.8h,  v18.16b, v28.16b
3127*c0909341SAndroid Build Coastguard Worker        umlal2          v17.8h,  v19.16b, v27.16b
3128*c0909341SAndroid Build Coastguard Worker
3129*c0909341SAndroid Build Coastguard Worker        umull           v22.8h,  v4.8b,   v7.8b   // top[base_x]*(64-frac_x)
3130*c0909341SAndroid Build Coastguard Worker        umlal           v22.8h,  v5.8b,   v16.8b  // + top[base_x+1]*frac_x
3131*c0909341SAndroid Build Coastguard Worker        umull2          v23.8h,  v4.16b,  v7.16b
3132*c0909341SAndroid Build Coastguard Worker        umlal2          v23.8h,  v5.16b,  v16.16b
3133*c0909341SAndroid Build Coastguard Worker
3134*c0909341SAndroid Build Coastguard Worker        cmge            v21.16b, v21.16b, #0      // per-pixel mask: all ones where base_x >= 0
3135*c0909341SAndroid Build Coastguard Worker
3136*c0909341SAndroid Build Coastguard Worker        rshrn           v6.8b,   v6.8h,   #6      // left-based result, (sum + 32) >> 6
3137*c0909341SAndroid Build Coastguard Worker        rshrn2          v6.16b,  v17.8h,  #6
3138*c0909341SAndroid Build Coastguard Worker        rshrn           v22.8b,  v22.8h,  #6      // top-based result, (sum + 32) >> 6
3139*c0909341SAndroid Build Coastguard Worker        rshrn2          v22.16b, v23.8h,  #6
3140*c0909341SAndroid Build Coastguard Worker
3141*c0909341SAndroid Build Coastguard Worker        bit             v6.16b,  v22.16b, v21.16b // take the top-based result where base_x >= 0, keep the left-based one elsewhere
3142*c0909341SAndroid Build Coastguard Worker
3143*c0909341SAndroid Build Coastguard Worker        st1             {v6.d}[0], [x0], x1       // first row (8 bytes), then dst += stride
3144*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
3145*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2              // two rows done; flags are read by b.le below
3146*c0909341SAndroid Build Coastguard Worker        st1             {v6.d}[1], [x0], x1       // second row (8 bytes), then dst += stride
3147*c0909341SAndroid Build Coastguard Worker        b.le            9f
3148*c0909341SAndroid Build Coastguard Worker
3149*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v24.16b // base_y += 4
3150*c0909341SAndroid Build Coastguard Worker        add             v30.16b, v30.16b, v24.16b // base_y += 4
3151*c0909341SAndroid Build Coastguard Worker        b               8b
3152*c0909341SAndroid Build Coastguard Worker
3153*c0909341SAndroid Build Coastguard Worker89:      // Left-only loop: same left[] interpolation as in 8b with the top blend omitted. It never branches back to 8b.
3154*c0909341SAndroid Build Coastguard Worker        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
3155*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]
3156*c0909341SAndroid Build Coastguard Worker
3157*c0909341SAndroid Build Coastguard Worker        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
3158*c0909341SAndroid Build Coastguard Worker        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
3159*c0909341SAndroid Build Coastguard Worker        umull2          v17.8h,  v18.16b, v28.16b
3160*c0909341SAndroid Build Coastguard Worker        umlal2          v17.8h,  v19.16b, v27.16b
3161*c0909341SAndroid Build Coastguard Worker
3162*c0909341SAndroid Build Coastguard Worker        rshrn           v6.8b,   v6.8h,   #6
3163*c0909341SAndroid Build Coastguard Worker        rshrn2          v6.16b,  v17.8h,  #6
3164*c0909341SAndroid Build Coastguard Worker
3165*c0909341SAndroid Build Coastguard Worker        st1             {v6.d}[0], [x0], x1
3166*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
3167*c0909341SAndroid Build Coastguard Worker        st1             {v6.d}[1], [x0], x1
3168*c0909341SAndroid Build Coastguard Worker        b.le            9f
3169*c0909341SAndroid Build Coastguard Worker
3170*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v24.16b // base_y += 4
3171*c0909341SAndroid Build Coastguard Worker        add             v30.16b, v30.16b, v24.16b // base_y += 4
3172*c0909341SAndroid Build Coastguard Worker        b               89b
3173*c0909341SAndroid Build Coastguard Worker
3174*c0909341SAndroid Build Coastguard Worker9:
3175*c0909341SAndroid Build Coastguard Worker        ret
3176*c0909341SAndroid Build Coastguard Workerendfunc
3177*c0909341SAndroid Build Coastguard Worker
3178*c0909341SAndroid Build Coastguard Worker
3179*c0909341SAndroid Build Coastguard Worker// void ipred_z3_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
3180*c0909341SAndroid Build Coastguard Worker//                               const pixel *const left,
3181*c0909341SAndroid Build Coastguard Worker//                               const int width, const int height,
3182*c0909341SAndroid Build Coastguard Worker//                               const int dy, const int max_base_y);
3183*c0909341SAndroid Build Coastguard Workerfunction ipred_z3_fill1_8bpc_neon, export=1
3184*c0909341SAndroid Build Coastguard Worker        cmp             w6,  #64                  // max_base_y vs 64; the flags stay live until the b.gt below
3185*c0909341SAndroid Build Coastguard Worker        clz             w9,  w3                   // clz(width)
3186*c0909341SAndroid Build Coastguard Worker        movrel          x8,  ipred_z3_fill1_tbl
3187*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #25             // table index: width 64 -> 0, 32 -> 1, 16 -> 2, 8 -> 3, 4 -> 4
3188*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x8, w9, uxtw #2]    // signed 32-bit table entry (table layout is not visible here)
3189*c0909341SAndroid Build Coastguard Worker        add             x10, x2,  w6,  uxtw       // left[max_base_y]
3190*c0909341SAndroid Build Coastguard Worker        add             x8,  x8,  x9              // handler address = table base + entry
3191*c0909341SAndroid Build Coastguard Worker        movrel          x11, increments
3192*c0909341SAndroid Build Coastguard Worker        ld1r            {v31.16b}, [x10]          // padding
3193*c0909341SAndroid Build Coastguard Worker        ld1             {v30.8h},  [x11]          // increments
3194*c0909341SAndroid Build Coastguard Worker        mov             w7,  w5                   // copy of dy; the handlers visible below read w5, so presumably for the large_h16 path - TODO confirm
3195*c0909341SAndroid Build Coastguard Worker        b.gt            L(ipred_z3_fill1_large_h16) // max_base_y > 64: left[] does not fit the 64-byte tbx table
3196*c0909341SAndroid Build Coastguard Worker        br              x8                        // otherwise jump to the width-specific handler
3197*c0909341SAndroid Build Coastguard Worker
3198*c0909341SAndroid Build Coastguard Worker40:      // width == 4: two 4-pixel rows per iteration, packed into the two 32-bit halves of a d register.
3199*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3200*c0909341SAndroid Build Coastguard Worker        dup             v29.4h,  w5               // dy
3201*c0909341SAndroid Build Coastguard Worker
3202*c0909341SAndroid Build Coastguard Worker        mul             v30.4h,  v30.4h,  v29.4h  // {0,1,2,3}*dy (only the low 4 lanes, the upper half of v30 becomes zero)
3203*c0909341SAndroid Build Coastguard Worker        movi            v23.16b, #0x3e
3204*c0909341SAndroid Build Coastguard Worker
3205*c0909341SAndroid Build Coastguard Worker        // Worst case max_base_y is width+height-1, for w=4, h=16, <= 32
3206*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b}, [x2] // left[]
3207*c0909341SAndroid Build Coastguard Worker        add             v30.4h,  v29.4h,  v30.4h  // ypos
3208*c0909341SAndroid Build Coastguard Worker
3209*c0909341SAndroid Build Coastguard Worker        movi            v22.16b, #64
3210*c0909341SAndroid Build Coastguard Worker        movi            v20.16b, #1
3211*c0909341SAndroid Build Coastguard Worker        movi            v21.16b, #2
3212*c0909341SAndroid Build Coastguard Worker
3213*c0909341SAndroid Build Coastguard Worker        xtn             v24.8b,  v30.8h           // (uint8_t)ypos
3214*c0909341SAndroid Build Coastguard Worker        uqshrn          v26.8b,  v30.8h,  #6      // base
3215*c0909341SAndroid Build Coastguard Worker        and             v24.8b,  v24.8b,  v23.8b  // frac
3216*c0909341SAndroid Build Coastguard Worker
3217*c0909341SAndroid Build Coastguard Worker        mov             v4.8b,   v31.8b           // preload padding: tbx leaves lanes with an out-of-table index untouched
3218*c0909341SAndroid Build Coastguard Worker        uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1
3219*c0909341SAndroid Build Coastguard Worker        uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2
3220*c0909341SAndroid Build Coastguard Worker        sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac
3221*c0909341SAndroid Build Coastguard Worker
3222*c0909341SAndroid Build Coastguard Worker        tbx             v4.8b, {v0.16b, v1.16b}, v26.8b // left[base]
3223*c0909341SAndroid Build Coastguard Worker
3224*c0909341SAndroid Build Coastguard Worker        trn1            v27.2s,  v27.2s,  v28.2s  // base + 1, base + 2
3225*c0909341SAndroid Build Coastguard Worker        trn1            v24.2s,  v24.2s,  v24.2s  // frac
3226*c0909341SAndroid Build Coastguard Worker        trn1            v25.2s,  v25.2s,  v25.2s  // 64 - frac
3227*c0909341SAndroid Build Coastguard Worker1:       // Loop invariant: v4.s[0] holds left[base] for the first row of this pair.
3228*c0909341SAndroid Build Coastguard Worker        mov             v5.8b,   v31.8b           // padding default for out-of-table lanes
3229*c0909341SAndroid Build Coastguard Worker        tbx             v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+2]
3230*c0909341SAndroid Build Coastguard Worker
3231*c0909341SAndroid Build Coastguard Worker        trn1            v4.2s,   v4.2s,   v5.2s   // left[base], left[base+1]
3232*c0909341SAndroid Build Coastguard Worker
3233*c0909341SAndroid Build Coastguard Worker        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
3234*c0909341SAndroid Build Coastguard Worker        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
3235*c0909341SAndroid Build Coastguard Worker        rshrn           v16.8b,  v16.8h,  #6      // rounding narrow: (sum + 32) >> 6
3236*c0909341SAndroid Build Coastguard Worker        st1             {v16.s}[0], [x0], x1      // first row (4 bytes), then dst += stride
3237*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2              // height -= 2
3238*c0909341SAndroid Build Coastguard Worker        st1             {v16.s}[1], [x0], x1      // second row (4 bytes), then dst += stride
3239*c0909341SAndroid Build Coastguard Worker        b.le            9f
3240*c0909341SAndroid Build Coastguard Worker
3241*c0909341SAndroid Build Coastguard Worker        ext             v4.8b,   v5.8b,   v5.8b,  #4 // old left[base+2] moves to the low half = left[base] of the next pair
3242*c0909341SAndroid Build Coastguard Worker        uqadd           v27.8b,  v27.8b,  v21.8b  // base += 2
3243*c0909341SAndroid Build Coastguard Worker        b               1b
3244*c0909341SAndroid Build Coastguard Worker
3245*c0909341SAndroid Build Coastguard Worker9:
3246*c0909341SAndroid Build Coastguard Worker        ret
3247*c0909341SAndroid Build Coastguard Worker
3248*c0909341SAndroid Build Coastguard Worker80:      // width == 8: two 8-pixel rows per iteration, the second row using base+1 and base+2.
3249*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3250*c0909341SAndroid Build Coastguard Worker        dup             v29.8h,  w5               // dy
3251*c0909341SAndroid Build Coastguard Worker
3252*c0909341SAndroid Build Coastguard Worker        mul             v30.8h,  v30.8h,  v29.8h  // {0,1,2,3,4,5,6,7}*dy
3253*c0909341SAndroid Build Coastguard Worker        movi            v23.16b, #0x3e
3254*c0909341SAndroid Build Coastguard Worker
3255*c0909341SAndroid Build Coastguard Worker        // Worst case max_base_y is width+height-1, for w=8, h=32, <= 48
3256*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b, v2.16b}, [x2] // left[]
3257*c0909341SAndroid Build Coastguard Worker        add             v30.8h,  v29.8h,  v30.8h  // ypos
3258*c0909341SAndroid Build Coastguard Worker
3259*c0909341SAndroid Build Coastguard Worker        movi            v22.16b, #64
3260*c0909341SAndroid Build Coastguard Worker        movi            v20.16b, #1
3261*c0909341SAndroid Build Coastguard Worker        movi            v21.16b, #2
3262*c0909341SAndroid Build Coastguard Worker
3263*c0909341SAndroid Build Coastguard Worker        xtn             v24.8b,  v30.8h           // (uint8_t)ypos
3264*c0909341SAndroid Build Coastguard Worker        uqshrn          v26.8b,  v30.8h,  #6      // base
3265*c0909341SAndroid Build Coastguard Worker        and             v24.8b,  v24.8b,  v23.8b  // frac
3266*c0909341SAndroid Build Coastguard Worker
3267*c0909341SAndroid Build Coastguard Worker        mov             v4.8b,   v31.8b           // preload padding: tbx leaves lanes with an out-of-table index untouched
3268*c0909341SAndroid Build Coastguard Worker        uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1
3269*c0909341SAndroid Build Coastguard Worker        uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2
3270*c0909341SAndroid Build Coastguard Worker        sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac
3271*c0909341SAndroid Build Coastguard Worker
3272*c0909341SAndroid Build Coastguard Worker        tbx             v4.8b, {v0.16b, v1.16b, v2.16b}, v26.8b // left[base]
3273*c0909341SAndroid Build Coastguard Worker1:       // Loop invariant: v4 holds left[base] for the first row of this pair.
3274*c0909341SAndroid Build Coastguard Worker        mov             v5.8b,   v31.8b
3275*c0909341SAndroid Build Coastguard Worker        mov             v6.8b,   v31.8b
3276*c0909341SAndroid Build Coastguard Worker        tbx             v5.8b, {v0.16b, v1.16b, v2.16b}, v27.8b // left[base+1]
3277*c0909341SAndroid Build Coastguard Worker        tbx             v6.8b, {v0.16b, v1.16b, v2.16b}, v28.8b // left[base+2]
3278*c0909341SAndroid Build Coastguard Worker
3279*c0909341SAndroid Build Coastguard Worker        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
3280*c0909341SAndroid Build Coastguard Worker        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
3281*c0909341SAndroid Build Coastguard Worker        umull           v17.8h,  v5.8b,   v25.8b  // second row: left[base+1]*(64-frac)
3282*c0909341SAndroid Build Coastguard Worker        umlal           v17.8h,  v6.8b,   v24.8b  // + left[base+2]*frac
3283*c0909341SAndroid Build Coastguard Worker        rshrn           v16.8b,  v16.8h,  #6
3284*c0909341SAndroid Build Coastguard Worker        rshrn           v17.8b,  v17.8h,  #6
3285*c0909341SAndroid Build Coastguard Worker        st1             {v16.8b}, [x0], x1        // first row, then dst += stride
3286*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2              // height -= 2
3287*c0909341SAndroid Build Coastguard Worker        st1             {v17.8b}, [x0], x1        // second row, then dst += stride
3288*c0909341SAndroid Build Coastguard Worker        b.le            9f
3289*c0909341SAndroid Build Coastguard Worker
3290*c0909341SAndroid Build Coastguard Worker        mov             v4.8b,   v6.8b            // left[base+2] becomes left[base] of the next pair
3291*c0909341SAndroid Build Coastguard Worker        uqadd           v27.8b,  v27.8b,  v21.8b  // base += 2
3292*c0909341SAndroid Build Coastguard Worker        uqadd           v28.8b,  v28.8b,  v21.8b  // base += 2
3293*c0909341SAndroid Build Coastguard Worker        b               1b
3294*c0909341SAndroid Build Coastguard Worker
3295*c0909341SAndroid Build Coastguard Worker9:
3296*c0909341SAndroid Build Coastguard Worker        ret
3297*c0909341SAndroid Build Coastguard Worker
3298*c0909341SAndroid Build Coastguard Worker160:     // width == 16: two 16-pixel rows per iteration, with a 64-byte left[] table for tbx.
3299*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3300*c0909341SAndroid Build Coastguard Worker        dup             v28.8h,  w5               // dy
3301*c0909341SAndroid Build Coastguard Worker
3302*c0909341SAndroid Build Coastguard Worker        shl             v29.8h,  v28.8h,  #3      // 8*dy
3303*c0909341SAndroid Build Coastguard Worker        mul             v30.8h,  v30.8h,  v28.8h  // {0,1,2,3,4,5,6,7}*dy
3304*c0909341SAndroid Build Coastguard Worker        movi            v23.16b, #0x3e
3305*c0909341SAndroid Build Coastguard Worker
3306*c0909341SAndroid Build Coastguard Worker        // This is only executed if we've checked that max_base_y <= 64.
3307*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[]
3308*c0909341SAndroid Build Coastguard Worker        add             v28.8h,  v28.8h,  v30.8h  // ypos
3309*c0909341SAndroid Build Coastguard Worker
3310*c0909341SAndroid Build Coastguard Worker        movi            v22.16b, #64
3311*c0909341SAndroid Build Coastguard Worker        movi            v20.16b, #1
3312*c0909341SAndroid Build Coastguard Worker        movi            v21.16b, #2
3313*c0909341SAndroid Build Coastguard Worker
3314*c0909341SAndroid Build Coastguard Worker        add             v29.8h,  v28.8h,  v29.8h  // ypos + 8*dy
3315*c0909341SAndroid Build Coastguard Worker
3316*c0909341SAndroid Build Coastguard Worker        xtn             v24.8b,  v28.8h           // (uint8_t)ypos
3317*c0909341SAndroid Build Coastguard Worker        xtn2            v24.16b, v29.8h           // columns 8..15 in the high half
3318*c0909341SAndroid Build Coastguard Worker        uqshrn          v26.8b,  v28.8h,  #6      // base
3319*c0909341SAndroid Build Coastguard Worker        uqshrn2         v26.16b, v29.8h,  #6      // columns 8..15 in the high half
3320*c0909341SAndroid Build Coastguard Worker        and             v24.16b, v24.16b, v23.16b // frac
3321*c0909341SAndroid Build Coastguard Worker
3322*c0909341SAndroid Build Coastguard Worker        mov             v4.16b,  v31.16b          // preload padding: tbx leaves lanes with an out-of-table index untouched
3323*c0909341SAndroid Build Coastguard Worker        uqadd           v27.16b, v26.16b, v20.16b // base + 1
3324*c0909341SAndroid Build Coastguard Worker        uqadd           v28.16b, v26.16b, v21.16b // base + 2
3325*c0909341SAndroid Build Coastguard Worker        sub             v25.16b, v22.16b, v24.16b // 64 - frac
3326*c0909341SAndroid Build Coastguard Worker
3327*c0909341SAndroid Build Coastguard Worker        tbx             v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.16b // left[base]
3328*c0909341SAndroid Build Coastguard Worker1:       // Loop invariant: v4 holds left[base] for the first row of this pair.
3329*c0909341SAndroid Build Coastguard Worker        mov             v5.16b,  v31.16b
3330*c0909341SAndroid Build Coastguard Worker        mov             v6.16b,  v31.16b
3331*c0909341SAndroid Build Coastguard Worker        tbx             v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.16b // left[base+1]
3332*c0909341SAndroid Build Coastguard Worker        tbx             v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v28.16b // left[base+2]
3333*c0909341SAndroid Build Coastguard Worker
3334*c0909341SAndroid Build Coastguard Worker        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
3335*c0909341SAndroid Build Coastguard Worker        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
3336*c0909341SAndroid Build Coastguard Worker        umull2          v17.8h,  v4.16b,  v25.16b
3337*c0909341SAndroid Build Coastguard Worker        umlal2          v17.8h,  v5.16b,  v24.16b
3338*c0909341SAndroid Build Coastguard Worker        umull           v18.8h,  v5.8b,   v25.8b  // second row: left[base+1]*(64-frac)
3339*c0909341SAndroid Build Coastguard Worker        umlal           v18.8h,  v6.8b,   v24.8b  // + left[base+2]*frac
3340*c0909341SAndroid Build Coastguard Worker        umull2          v19.8h,  v5.16b,  v25.16b
3341*c0909341SAndroid Build Coastguard Worker        umlal2          v19.8h,  v6.16b,  v24.16b
3342*c0909341SAndroid Build Coastguard Worker        rshrn           v16.8b,  v16.8h,  #6
3343*c0909341SAndroid Build Coastguard Worker        rshrn2          v16.16b, v17.8h,  #6
3344*c0909341SAndroid Build Coastguard Worker        rshrn           v17.8b,  v18.8h,  #6
3345*c0909341SAndroid Build Coastguard Worker        rshrn2          v17.16b, v19.8h,  #6
3346*c0909341SAndroid Build Coastguard Worker        st1             {v16.16b}, [x0], x1       // first row, then dst += stride
3347*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2              // height -= 2
3348*c0909341SAndroid Build Coastguard Worker        st1             {v17.16b}, [x0], x1       // second row, then dst += stride
3349*c0909341SAndroid Build Coastguard Worker        b.le            9f
3350*c0909341SAndroid Build Coastguard Worker
3351*c0909341SAndroid Build Coastguard Worker        mov             v4.16b,  v6.16b           // left[base+2] becomes left[base] of the next pair
3352*c0909341SAndroid Build Coastguard Worker        uqadd           v27.16b, v27.16b, v21.16b // base += 2
3353*c0909341SAndroid Build Coastguard Worker        uqadd           v28.16b, v28.16b, v21.16b // base += 2
3354*c0909341SAndroid Build Coastguard Worker        b               1b
3355*c0909341SAndroid Build Coastguard Worker
3356*c0909341SAndroid Build Coastguard Worker9:
3357*c0909341SAndroid Build Coastguard Worker        ret
3358*c0909341SAndroid Build Coastguard Worker320:     // width == 32 and width == 64 share one handler: rows are produced in pairs, 16 columns at a time.
3359*c0909341SAndroid Build Coastguard Worker640:
3360*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3361*c0909341SAndroid Build Coastguard Worker        dup             v28.8h,  w5               // dy
3362*c0909341SAndroid Build Coastguard Worker        mov             w12, w3                   // keep width so w3 can be reloaded for every row pair
3363*c0909341SAndroid Build Coastguard Worker
3364*c0909341SAndroid Build Coastguard Worker        add             x13, x0,  x1              // x13 = second row of the pair (dst + stride)
3365*c0909341SAndroid Build Coastguard Worker
3366*c0909341SAndroid Build Coastguard Worker        shl             v29.8h,  v28.8h,  #3      // 8*dy
3367*c0909341SAndroid Build Coastguard Worker        mul             v30.8h,  v30.8h,  v28.8h  // {0,1,2,3,4,5,6,7}*dy
3368*c0909341SAndroid Build Coastguard Worker        movi            v23.16b, #0x3e
3369*c0909341SAndroid Build Coastguard Worker
3370*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // 2*stride
3371*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  w3,  uxtw       // 2*stride - width: step to the next row pair after width bytes were written
3372*c0909341SAndroid Build Coastguard Worker        add             v30.8h,  v28.8h,  v30.8h  // ypos
3373*c0909341SAndroid Build Coastguard Worker
3374*c0909341SAndroid Build Coastguard Worker        // This is only executed if we've checked that max_base_y <= 64.
3375*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[]
3376*c0909341SAndroid Build Coastguard Worker
3377*c0909341SAndroid Build Coastguard Worker        movi            v22.16b, #64
3378*c0909341SAndroid Build Coastguard Worker        movi            v20.16b, #1
3379*c0909341SAndroid Build Coastguard Worker        movi            v21.16b, #2
3380*c0909341SAndroid Build Coastguard Worker
3381*c0909341SAndroid Build Coastguard Worker1:       // Outer loop: one pass per pair of rows.
3382*c0909341SAndroid Build Coastguard Worker        mov             v26.16b,  v30.16b         // reset ypos
3383*c0909341SAndroid Build Coastguard Worker
3384*c0909341SAndroid Build Coastguard Worker2:       // Inner loop: 16 columns of both rows per pass.
3385*c0909341SAndroid Build Coastguard Worker        add             v27.8h,  v26.8h,  v29.8h  // ypos + 8*dy
3386*c0909341SAndroid Build Coastguard Worker        uqshrn          v16.8b,  v26.8h,  #6      // base
3387*c0909341SAndroid Build Coastguard Worker        uqshrn2         v16.16b, v27.8h,  #6
3388*c0909341SAndroid Build Coastguard Worker        xtn             v24.8b,  v26.8h           // (uint8_t)ypos
3389*c0909341SAndroid Build Coastguard Worker        xtn2            v24.16b, v27.8h
3390*c0909341SAndroid Build Coastguard Worker        umov            w14,     v16.b[0]         // base of the first column in this group of 16
3391*c0909341SAndroid Build Coastguard Worker        and             v24.16b, v24.16b, v23.16b // frac
3392*c0909341SAndroid Build Coastguard Worker
3393*c0909341SAndroid Build Coastguard Worker        uqadd           v17.16b, v16.16b, v20.16b // base + 1
3394*c0909341SAndroid Build Coastguard Worker        cmp             w14, w6                   // base >= max_base_y
3395*c0909341SAndroid Build Coastguard Worker        uqadd           v18.16b, v16.16b, v21.16b // base + 2
3396*c0909341SAndroid Build Coastguard Worker        sub             v25.16b, v22.16b, v24.16b // 64 - frac
3397*c0909341SAndroid Build Coastguard Worker
3398*c0909341SAndroid Build Coastguard Worker        b.ge            4f                        // first column is already at or past max_base_y: fill the rest of both rows with padding
3399*c0909341SAndroid Build Coastguard Worker
3400*c0909341SAndroid Build Coastguard Worker        mov             v4.16b,  v31.16b          // preload padding: tbx leaves lanes with an out-of-table index untouched
3401*c0909341SAndroid Build Coastguard Worker        mov             v5.16b,  v31.16b
3402*c0909341SAndroid Build Coastguard Worker        mov             v6.16b,  v31.16b
3403*c0909341SAndroid Build Coastguard Worker        tbx             v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v16.16b // left[base]
3404*c0909341SAndroid Build Coastguard Worker        tbx             v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v17.16b // left[base+1]
3405*c0909341SAndroid Build Coastguard Worker        tbx             v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v18.16b // left[base+2]
3406*c0909341SAndroid Build Coastguard Worker
3407*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #16             // 16 columns done; flags are read by b.le 3f below (the NEON ops and stores leave them alone)
3408*c0909341SAndroid Build Coastguard Worker        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
3409*c0909341SAndroid Build Coastguard Worker        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
3410*c0909341SAndroid Build Coastguard Worker        umull2          v17.8h,  v4.16b,  v25.16b
3411*c0909341SAndroid Build Coastguard Worker        umlal2          v17.8h,  v5.16b,  v24.16b
3412*c0909341SAndroid Build Coastguard Worker        umull           v18.8h,  v5.8b,   v25.8b  // second row: left[base+1]*(64-frac)
3413*c0909341SAndroid Build Coastguard Worker        umlal           v18.8h,  v6.8b,   v24.8b  // + left[base+2]*frac
3414*c0909341SAndroid Build Coastguard Worker        umull2          v19.8h,  v5.16b,  v25.16b
3415*c0909341SAndroid Build Coastguard Worker        umlal2          v19.8h,  v6.16b,  v24.16b
3416*c0909341SAndroid Build Coastguard Worker        rshrn           v16.8b,  v16.8h,  #6
3417*c0909341SAndroid Build Coastguard Worker        rshrn2          v16.16b, v17.8h,  #6
3418*c0909341SAndroid Build Coastguard Worker        rshrn           v17.8b,  v18.8h,  #6
3419*c0909341SAndroid Build Coastguard Worker        rshrn2          v17.16b, v19.8h,  #6
3420*c0909341SAndroid Build Coastguard Worker        st1             {v16.16b}, [x0],  #16     // first row, advance 16 columns
3421*c0909341SAndroid Build Coastguard Worker        st1             {v17.16b}, [x13], #16     // second row, advance 16 columns
3422*c0909341SAndroid Build Coastguard Worker        b.le            3f
3423*c0909341SAndroid Build Coastguard Worker        add             v26.8h,  v27.8h,  v29.8h  // ypos += 16*dy
3424*c0909341SAndroid Build Coastguard Worker        b               2b
3425*c0909341SAndroid Build Coastguard Worker
3426*c0909341SAndroid Build Coastguard Worker3:       // Row pair finished: return when height is exhausted, otherwise set up the next pair.
3427*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
3428*c0909341SAndroid Build Coastguard Worker        b.le            9f
3429*c0909341SAndroid Build Coastguard Worker        movi            v16.8h,  #128             // 2 rows * 64 position units per row
3430*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1              // x1 is 2*stride - width here, so this lands on the start of the next row pair
3431*c0909341SAndroid Build Coastguard Worker        add             x13, x13, x1
3432*c0909341SAndroid Build Coastguard Worker        add             v30.8h,  v30.8h,  v16.8h  // ypos = dy + y*(1<<6)*2
3433*c0909341SAndroid Build Coastguard Worker        mov             w3,  w12                  // reload the column counter with the full width
3434*c0909341SAndroid Build Coastguard Worker        b               1b
3435*c0909341SAndroid Build Coastguard Worker
3436*c0909341SAndroid Build Coastguard Worker4:       // Padding fill: write v31 to the remaining columns of both rows, 16 bytes at a time.
3437*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #16
3438*c0909341SAndroid Build Coastguard Worker        st1             {v31.16b}, [x0],  #16
3439*c0909341SAndroid Build Coastguard Worker        st1             {v31.16b}, [x13], #16
3440*c0909341SAndroid Build Coastguard Worker        b.gt            4b
3441*c0909341SAndroid Build Coastguard Worker        b               3b
3442*c0909341SAndroid Build Coastguard Worker
3443*c0909341SAndroid Build Coastguard Worker9:
3444*c0909341SAndroid Build Coastguard Worker        ret
3445*c0909341SAndroid Build Coastguard Worker
3446*c0909341SAndroid Build Coastguard WorkerL(ipred_z3_fill1_large_h16):
3447*c0909341SAndroid Build Coastguard Worker        // Fallback case for max_base_y > 64; similar to the z1
3448*c0909341SAndroid Build Coastguard Worker        // implementation. This does the filtering vertically, filling out
3449*c0909341SAndroid Build Coastguard Worker        // a 2x pixel column at a time.
3450*c0909341SAndroid Build Coastguard Worker        mov             w15, #64
3451*c0909341SAndroid Build Coastguard Worker        add             x13, x0,  x1
3452*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
3453*c0909341SAndroid Build Coastguard Worker
3454*c0909341SAndroid Build Coastguard Worker        mov             w12, w4
3455*c0909341SAndroid Build Coastguard Worker1:
3456*c0909341SAndroid Build Coastguard Worker        lsr             w8,  w7,  #6              // base
3457*c0909341SAndroid Build Coastguard Worker        and             w9,  w7,  #0x3e           // frac
3458*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // ypos += dy
3459*c0909341SAndroid Build Coastguard Worker        cmp             w8,  w6                   // base >= max_base_y
3460*c0909341SAndroid Build Coastguard Worker        lsr             w10, w7,  #6              // base
3461*c0909341SAndroid Build Coastguard Worker        and             w11, w7,  #0x3e           // frac
3462*c0909341SAndroid Build Coastguard Worker        b.ge            ipred_z3_fill_padding_neon
3463*c0909341SAndroid Build Coastguard Worker        add             x8,  x2,  w8,  uxtw
3464*c0909341SAndroid Build Coastguard Worker        add             x10, x2,  w10, uxtw
3465*c0909341SAndroid Build Coastguard Worker        dup             v4.16b,  w9               // frac
3466*c0909341SAndroid Build Coastguard Worker        dup             v5.16b,  w11
3467*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b}, [x8],  #32 // left[base]
3468*c0909341SAndroid Build Coastguard Worker        ld1             {v2.16b, v3.16b}, [x10], #32
3469*c0909341SAndroid Build Coastguard Worker        sub             w9,  w15, w9              // 64 - frac
3470*c0909341SAndroid Build Coastguard Worker        sub             w11, w15, w11
3471*c0909341SAndroid Build Coastguard Worker        dup             v6.16b,  w9               // 64 - frac
3472*c0909341SAndroid Build Coastguard Worker        dup             v7.16b,  w11
3473*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // ypos += dy
3474*c0909341SAndroid Build Coastguard Worker2:
3475*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v0.16b,  v1.16b,  #1 // left[base+1]
3476*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v2.16b,  v3.16b,  #1
3477*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #16
3478*c0909341SAndroid Build Coastguard Worker        umull           v18.8h,  v16.8b,  v4.8b   // left[base+1]*frac
3479*c0909341SAndroid Build Coastguard Worker        umlal           v18.8h,  v0.8b,   v6.8b   // + left[base]*(64-frac)
3480*c0909341SAndroid Build Coastguard Worker        umull2          v19.8h,  v16.16b, v4.16b
3481*c0909341SAndroid Build Coastguard Worker        umlal2          v19.8h,  v0.16b,  v6.16b
3482*c0909341SAndroid Build Coastguard Worker        umull           v20.8h,  v17.8b,  v5.8b
3483*c0909341SAndroid Build Coastguard Worker        umlal           v20.8h,  v2.8b,   v7.8b
3484*c0909341SAndroid Build Coastguard Worker        umull2          v21.8h,  v17.16b, v5.16b
3485*c0909341SAndroid Build Coastguard Worker        umlal2          v21.8h,  v2.16b,  v7.16b
3486*c0909341SAndroid Build Coastguard Worker        rshrn           v16.8b,  v18.8h,  #6
3487*c0909341SAndroid Build Coastguard Worker        rshrn2          v16.16b, v19.8h,  #6
3488*c0909341SAndroid Build Coastguard Worker        rshrn           v17.8b,  v20.8h,  #6
3489*c0909341SAndroid Build Coastguard Worker        rshrn2          v17.16b, v21.8h,  #6
3490*c0909341SAndroid Build Coastguard Worker        zip1            v18.16b, v16.16b, v17.16b
3491*c0909341SAndroid Build Coastguard Worker        zip2            v19.16b, v16.16b, v17.16b
3492*c0909341SAndroid Build Coastguard Worker        st1             {v18.h}[0], [x0],  x1
3493*c0909341SAndroid Build Coastguard Worker        st1             {v18.h}[1], [x13], x1
3494*c0909341SAndroid Build Coastguard Worker        st1             {v18.h}[2], [x0],  x1
3495*c0909341SAndroid Build Coastguard Worker        st1             {v18.h}[3], [x13], x1
3496*c0909341SAndroid Build Coastguard Worker        st1             {v18.h}[4], [x0],  x1
3497*c0909341SAndroid Build Coastguard Worker        st1             {v18.h}[5], [x13], x1
3498*c0909341SAndroid Build Coastguard Worker        st1             {v18.h}[6], [x0],  x1
3499*c0909341SAndroid Build Coastguard Worker        st1             {v18.h}[7], [x13], x1
3500*c0909341SAndroid Build Coastguard Worker        st1             {v19.h}[0], [x0],  x1
3501*c0909341SAndroid Build Coastguard Worker        st1             {v19.h}[1], [x13], x1
3502*c0909341SAndroid Build Coastguard Worker        st1             {v19.h}[2], [x0],  x1
3503*c0909341SAndroid Build Coastguard Worker        st1             {v19.h}[3], [x13], x1
3504*c0909341SAndroid Build Coastguard Worker        st1             {v19.h}[4], [x0],  x1
3505*c0909341SAndroid Build Coastguard Worker        st1             {v19.h}[5], [x13], x1
3506*c0909341SAndroid Build Coastguard Worker        st1             {v19.h}[6], [x0],  x1
3507*c0909341SAndroid Build Coastguard Worker        st1             {v19.h}[7], [x13], x1
3508*c0909341SAndroid Build Coastguard Worker        b.le            3f
3509*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v1.16b
3510*c0909341SAndroid Build Coastguard Worker        ld1             {v1.16b}, [x8],  #16      // left[base]
3511*c0909341SAndroid Build Coastguard Worker        mov             v2.16b,  v3.16b
3512*c0909341SAndroid Build Coastguard Worker        ld1             {v3.16b}, [x10], #16
3513*c0909341SAndroid Build Coastguard Worker        b               2b
3514*c0909341SAndroid Build Coastguard Worker
3515*c0909341SAndroid Build Coastguard Worker3:
3516*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #2
3517*c0909341SAndroid Build Coastguard Worker        b.le            9f
3518*c0909341SAndroid Build Coastguard Worker        lsr             x1,  x1,  #1
3519*c0909341SAndroid Build Coastguard Worker        msub            x0,  x1,  x12, x0         // ptr -= h * stride
3520*c0909341SAndroid Build Coastguard Worker        msub            x13, x1,  x12, x13
3521*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
3522*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #2
3523*c0909341SAndroid Build Coastguard Worker        add             x13, x13, #2
3524*c0909341SAndroid Build Coastguard Worker        mov             w4,  w12
3525*c0909341SAndroid Build Coastguard Worker        b               1b
3526*c0909341SAndroid Build Coastguard Worker9:
3527*c0909341SAndroid Build Coastguard Worker        ret
3528*c0909341SAndroid Build Coastguard Workerendfunc
3529*c0909341SAndroid Build Coastguard Worker
// Jump table with five entries. Each .word is the distance from the start
// of this table to one of the numeric labels 640, 320, 160, 80 and 40; the
// "b" suffix makes every reference resolve to the closest definition of
// that label preceding this point in the file.
// NOTE(review): the code that indexes this table lies outside this excerpt;
// presumably the index is derived from clz of a block dimension - confirm
// against the dispatcher before reordering or adding entries.
3530*c0909341SAndroid Build Coastguard Workerjumptable ipred_z3_fill1_tbl
3531*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_z3_fill1_tbl
3532*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_z3_fill1_tbl
3533*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_z3_fill1_tbl
3534*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_z3_fill1_tbl
3535*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_z3_fill1_tbl
3536*c0909341SAndroid Build Coastguard Workerendjumptable
3537*c0909341SAndroid Build Coastguard Worker
// ipred_z3_fill_padding_neon: fill a width x height rectangle with the
// padding bytes held in v31, one column strip of 2, 4, 8 or 16 bytes at a
// time. Widths above 16 are handed over to ipred_z3_fill_padding_wide.
//
// Register use, as read from the code below:
//   x0  = address of the current strip in the first row
//   x13 = address of the current strip in the following row
//         (L(ipred_z3_fill1_large_h16) enters with x13 = x0 + stride)
//   x1  = step applied after every store; it is halved before being
//         multiplied by the height, i.e. it holds twice the row stride
//   w3  = remaining width in bytes
//   w4  = height, copied to w12 so it can be reloaded for every strip
//   v31 = padding bytes; set up by the caller, never loaded here
// Scratch: x8 (table base), x9 (table index / branch target), w12, flags.
// The four stores per loop iteration cover four consecutive rows.
3538*c0909341SAndroid Build Coastguard Workerfunction ipred_z3_fill_padding_neon, export=0
3539*c0909341SAndroid Build Coastguard Worker        cmp             w3,  #16                  // more than 16 bytes wide?
3540*c0909341SAndroid Build Coastguard Worker        movrel          x8,  ipred_z3_fill_padding_tbl
3541*c0909341SAndroid Build Coastguard Worker        b.gt            ipred_z3_fill_padding_wide
3542*c0909341SAndroid Build Coastguard Worker        // w3 = remaining width, w4 = constant height
3543*c0909341SAndroid Build Coastguard Worker        mov             w12, w4                   // keep the height for rewinding/reloading
3544*c0909341SAndroid Build Coastguard Worker
3545*c0909341SAndroid Build Coastguard Worker1:
3546*c0909341SAndroid Build Coastguard Worker        // Fill a WxH rectangle with padding. W can be any number;
3547*c0909341SAndroid Build Coastguard Worker        // this fills the exact width by filling in the largest
3548*c0909341SAndroid Build Coastguard Worker        // power of two in the remaining width, and repeating.
3549*c0909341SAndroid Build Coastguard Worker        clz             w9,  w3                   // leading zeros of the remaining width
3550*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #25             // table index: 0 for 64..127 down to 5 for 2..3
3551*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x8, w9, uxtw #2]    // signed 32 bit offset of the handler
3552*c0909341SAndroid Build Coastguard Worker        add             x9,  x8,  x9              // handler address = table base + offset
3553*c0909341SAndroid Build Coastguard Worker        br              x9
3554*c0909341SAndroid Build Coastguard Worker
3555*c0909341SAndroid Build Coastguard Worker20:
3556*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3557*c0909341SAndroid Build Coastguard Worker2:
3558*c0909341SAndroid Build Coastguard Worker        st1             {v31.h}[0], [x0],  x1
3559*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4              // four rows are written per iteration
3560*c0909341SAndroid Build Coastguard Worker        st1             {v31.h}[0], [x13], x1
3561*c0909341SAndroid Build Coastguard Worker        st1             {v31.h}[0], [x0],  x1
3562*c0909341SAndroid Build Coastguard Worker        st1             {v31.h}[0], [x13], x1
3563*c0909341SAndroid Build Coastguard Worker        b.gt            2b
3564*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #2              // 2 byte strip done
3565*c0909341SAndroid Build Coastguard Worker        lsr             x1,  x1,  #1              // back to a single row stride
3566*c0909341SAndroid Build Coastguard Worker        msub            x0,  x1,  x12, x0         // ptr -= h * stride
3567*c0909341SAndroid Build Coastguard Worker        msub            x13, x1,  x12, x13
3568*c0909341SAndroid Build Coastguard Worker        b.le            9f                        // flags still come from the w3 update
3569*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // restore the doubled stride
3570*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #2              // step right to the next strip
3571*c0909341SAndroid Build Coastguard Worker        add             x13, x13, #2
3572*c0909341SAndroid Build Coastguard Worker        mov             w4,  w12                  // reload the row counter
3573*c0909341SAndroid Build Coastguard Worker        b               1b                        // dispatch on the remaining width
3574*c0909341SAndroid Build Coastguard Worker
3575*c0909341SAndroid Build Coastguard Worker40:
3576*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3577*c0909341SAndroid Build Coastguard Worker4:
3578*c0909341SAndroid Build Coastguard Worker        st1             {v31.s}[0], [x0],  x1
3579*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4              // four rows are written per iteration
3580*c0909341SAndroid Build Coastguard Worker        st1             {v31.s}[0], [x13], x1
3581*c0909341SAndroid Build Coastguard Worker        st1             {v31.s}[0], [x0],  x1
3582*c0909341SAndroid Build Coastguard Worker        st1             {v31.s}[0], [x13], x1
3583*c0909341SAndroid Build Coastguard Worker        b.gt            4b
3584*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #4              // 4 byte strip done
3585*c0909341SAndroid Build Coastguard Worker        lsr             x1,  x1,  #1              // back to a single row stride
3586*c0909341SAndroid Build Coastguard Worker        msub            x0,  x1,  x12, x0         // ptr -= h * stride
3587*c0909341SAndroid Build Coastguard Worker        msub            x13, x1,  x12, x13
3588*c0909341SAndroid Build Coastguard Worker        b.le            9f                        // flags still come from the w3 update
3589*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // restore the doubled stride
3590*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #4              // step right to the next strip
3591*c0909341SAndroid Build Coastguard Worker        add             x13, x13, #4
3592*c0909341SAndroid Build Coastguard Worker        mov             w4,  w12                  // reload the row counter
3593*c0909341SAndroid Build Coastguard Worker        b               1b                        // dispatch on the remaining width
3594*c0909341SAndroid Build Coastguard Worker
3595*c0909341SAndroid Build Coastguard Worker80:
3596*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3597*c0909341SAndroid Build Coastguard Worker8:
3598*c0909341SAndroid Build Coastguard Worker        st1             {v31.8b}, [x0],  x1
3599*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4              // four rows are written per iteration
3600*c0909341SAndroid Build Coastguard Worker        st1             {v31.8b}, [x13], x1
3601*c0909341SAndroid Build Coastguard Worker        st1             {v31.8b}, [x0],  x1
3602*c0909341SAndroid Build Coastguard Worker        st1             {v31.8b}, [x13], x1
3603*c0909341SAndroid Build Coastguard Worker        b.gt            8b
3604*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #8              // 8 byte strip done
3605*c0909341SAndroid Build Coastguard Worker        lsr             x1,  x1,  #1              // back to a single row stride
3606*c0909341SAndroid Build Coastguard Worker        msub            x0,  x1,  x12, x0         // ptr -= h * stride
3607*c0909341SAndroid Build Coastguard Worker        msub            x13, x1,  x12, x13
3608*c0909341SAndroid Build Coastguard Worker        b.le            9f                        // flags still come from the w3 update
3609*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // restore the doubled stride
3610*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #8              // step right to the next strip
3611*c0909341SAndroid Build Coastguard Worker        add             x13, x13, #8
3612*c0909341SAndroid Build Coastguard Worker        mov             w4,  w12                  // reload the row counter
3613*c0909341SAndroid Build Coastguard Worker        b               1b                        // dispatch on the remaining width
3614*c0909341SAndroid Build Coastguard Worker
        // The 160, 320 and 640 table slots all share the 16 byte handler.
3615*c0909341SAndroid Build Coastguard Worker160:
3616*c0909341SAndroid Build Coastguard Worker320:
3617*c0909341SAndroid Build Coastguard Worker640:
3618*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3619*c0909341SAndroid Build Coastguard Worker16:
3620*c0909341SAndroid Build Coastguard Worker        st1             {v31.16b}, [x0],  x1
3621*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4              // four rows are written per iteration
3622*c0909341SAndroid Build Coastguard Worker        st1             {v31.16b}, [x13], x1
3623*c0909341SAndroid Build Coastguard Worker        st1             {v31.16b}, [x0],  x1
3624*c0909341SAndroid Build Coastguard Worker        st1             {v31.16b}, [x13], x1
3625*c0909341SAndroid Build Coastguard Worker        b.gt            16b
3626*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #16             // 16 byte strip done
3627*c0909341SAndroid Build Coastguard Worker        lsr             x1,  x1,  #1              // back to a single row stride
3628*c0909341SAndroid Build Coastguard Worker        msub            x0,  x1,  x12, x0         // ptr -= h * stride
3629*c0909341SAndroid Build Coastguard Worker        msub            x13, x1,  x12, x13
3630*c0909341SAndroid Build Coastguard Worker        b.le            9f                        // flags still come from the w3 update
3631*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // restore the doubled stride
3632*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #16             // step right to the next strip
3633*c0909341SAndroid Build Coastguard Worker        add             x13, x13, #16
3634*c0909341SAndroid Build Coastguard Worker        mov             w4,  w12                  // reload the row counter
3635*c0909341SAndroid Build Coastguard Worker        b               1b                        // dispatch on the remaining width
3636*c0909341SAndroid Build Coastguard Worker
3637*c0909341SAndroid Build Coastguard Worker9:
3638*c0909341SAndroid Build Coastguard Worker        ret
3639*c0909341SAndroid Build Coastguard Workerendfunc
3640*c0909341SAndroid Build Coastguard Worker
// Jump table used by ipred_z3_fill_padding_neon. That function indexes it
// with clz(remaining width) - 25, loads the selected entry with ldrsw and
// adds it to the table address, so each .word is the signed 32 bit distance
// from the table start to a strip handler label. Entry 0 is reached for a
// width of 64..127 and entry 5 for a width of 2..3; the entry order must
// therefore follow descending powers of two. The labels 640, 320 and 160
// are defined at the same address, so the first three entries select the
// same handler.
3641*c0909341SAndroid Build Coastguard Workerjumptable ipred_z3_fill_padding_tbl
3642*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_z3_fill_padding_tbl
3643*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_z3_fill_padding_tbl
3644*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_z3_fill_padding_tbl
3645*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_z3_fill_padding_tbl
3646*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_z3_fill_padding_tbl
3647*c0909341SAndroid Build Coastguard Worker        .word 20b  - ipred_z3_fill_padding_tbl
3648*c0909341SAndroid Build Coastguard Workerendjumptable
3649*c0909341SAndroid Build Coastguard Worker
// ipred_z3_fill_padding_wide: row by row padding fill for widths above 16.
// Reached by a branch from ipred_z3_fill_padding_neon when w3 > 16.
//
// Register use, as read from the code below:
//   x0  = address of the first byte to fill in the current row
//   x1  = twice the row stride on entry; turned into (stride - width) so
//         that adding it after a row lands on the start of the next row
//   w3  = width in bytes, copied to w12 and reloaded for every row
//   w4  = number of rows still to fill
//   v31 = padding bytes; set up by the caller, never loaded here
// Scratch: w5 (width modulo 16), w12, flags. Only x0 is used for stores.
3650*c0909341SAndroid Build Coastguard Workerfunction ipred_z3_fill_padding_wide
3651*c0909341SAndroid Build Coastguard Worker        // Fill a WxH rectangle with padding, with W > 16.
3652*c0909341SAndroid Build Coastguard Worker        lsr             x1,  x1,  #1              // doubled stride -> single row stride
3653*c0909341SAndroid Build Coastguard Worker        mov             w12, w3                   // keep the width for the following rows
3654*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  w3,  uxtw       // x1 = stride - width
3655*c0909341SAndroid Build Coastguard Worker1:
3656*c0909341SAndroid Build Coastguard Worker        ands            w5,  w3,  #15             // w5 = width % 16
3657*c0909341SAndroid Build Coastguard Worker        b.eq            2f
3658*c0909341SAndroid Build Coastguard Worker        // If the width isn't a multiple of 16, first do one 16 byte write
3659*c0909341SAndroid Build Coastguard Worker        // and advance by width % 16 only, leaving a multiple of 16 bytes to
        // fill; the next store overlaps the tail of this one.
3660*c0909341SAndroid Build Coastguard Worker        sub             w3,  w3,  w5
3661*c0909341SAndroid Build Coastguard Worker        st1             {v31.16b}, [x0]
3662*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  w5,  uxtw
3663*c0909341SAndroid Build Coastguard Worker2:
3664*c0909341SAndroid Build Coastguard Worker        // Fill the rest of the line with 16 byte writes.
3665*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #16
3666*c0909341SAndroid Build Coastguard Worker        st1             {v31.16b}, [x0], #16
3667*c0909341SAndroid Build Coastguard Worker        b.gt            2b
3668*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #1              // one row finished
3669*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1              // start of the next row
3670*c0909341SAndroid Build Coastguard Worker        b.le            9f                        // add does not touch the flags from subs
3671*c0909341SAndroid Build Coastguard Worker        mov             w3,  w12                  // reload the width
3672*c0909341SAndroid Build Coastguard Worker        b               1b
3673*c0909341SAndroid Build Coastguard Worker9:
3674*c0909341SAndroid Build Coastguard Worker        ret
3675*c0909341SAndroid Build Coastguard Workerendfunc
3676*c0909341SAndroid Build Coastguard Worker
// ipred_z3_fill2_8bpc_neon: z3 directional fill for 4 or 8 pixel wide
// blocks where the index into left[] advances by 2 for every row.
// NOTE(review): presumably this is the variant used when the left edge has
// been upsampled 2x - confirm against the C caller.
//
// Register use, as read from the code below:
//   x0 = dst, advanced by x1 after every row that is stored
//   x1 = row stride in bytes
//   x2 = left[]; only the first 32 bytes are loaded (v0, v1)
//   w3 = width, compared against 8 to pick the 4 or 8 wide path
//   w4 = height, two rows are produced per loop iteration
//   w5 = dy, w6 = max_base_y
// For column x: ypos = (x + 1) * dy, base = ypos >> 6, frac = ypos & 0x3e.
// Row y outputs (left[base + 2*y] * (64 - frac) +
//                left[base + 2*y + 1] * frac + 32) >> 6.
// Table indices outside the 32 byte table keep the padding value
// left[max_base_y] that was preloaded into the tbx destination from v31;
// the saturating index arithmetic (uqshrn/uqadd) keeps large indices large.
// Clobbers x10, x11, v0, v1, v4, v5, v16, v17, v20-v31 and the flags.
3677*c0909341SAndroid Build Coastguard Workerfunction ipred_z3_fill2_8bpc_neon, export=1
3678*c0909341SAndroid Build Coastguard Worker        cmp             w3,  #8
3679*c0909341SAndroid Build Coastguard Worker        add             x10, x2,  w6,  uxtw       // left[max_base_y]
3680*c0909341SAndroid Build Coastguard Worker        movrel          x11, increments
3681*c0909341SAndroid Build Coastguard Worker        ld1r            {v31.16b}, [x10]          // padding
3682*c0909341SAndroid Build Coastguard Worker        ld1             {v30.8h},  [x11]          // increments
3683*c0909341SAndroid Build Coastguard Worker        b.eq            80f
3684*c0909341SAndroid Build Coastguard Worker
3685*c0909341SAndroid Build Coastguard Worker40:     // w == 4
3686*c0909341SAndroid Build Coastguard Worker        dup             v29.4h,  w5               // dy
3687*c0909341SAndroid Build Coastguard Worker
3688*c0909341SAndroid Build Coastguard Worker        mul             v30.4h,  v30.4h,  v29.4h  // {0,1,2,3,4,5,6,7}*dy
3689*c0909341SAndroid Build Coastguard Worker        movi            v23.16b, #0x3e
3690*c0909341SAndroid Build Coastguard Worker
3691*c0909341SAndroid Build Coastguard Worker        // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16,
3692*c0909341SAndroid Build Coastguard Worker        // so max_base_y <= 32.
3693*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b}, [x2] // left[]
3694*c0909341SAndroid Build Coastguard Worker        add             v30.4h,  v29.4h,  v30.4h  // ypos
3695*c0909341SAndroid Build Coastguard Worker
3696*c0909341SAndroid Build Coastguard Worker        movi            v22.16b, #64
3697*c0909341SAndroid Build Coastguard Worker        movi            v20.16b, #1
3698*c0909341SAndroid Build Coastguard Worker        movi            v21.16b, #2
3699*c0909341SAndroid Build Coastguard Worker
3700*c0909341SAndroid Build Coastguard Worker        xtn             v24.8b,  v30.8h           // (uint8_t)ypos
3701*c0909341SAndroid Build Coastguard Worker        uqshrn          v26.8b,  v30.8h,  #6      // base
3702*c0909341SAndroid Build Coastguard Worker        and             v24.8b,  v24.8b,  v23.8b  // frac
3703*c0909341SAndroid Build Coastguard Worker
3704*c0909341SAndroid Build Coastguard Worker        uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1
3705*c0909341SAndroid Build Coastguard Worker        uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2
3706*c0909341SAndroid Build Coastguard Worker        sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac
3707*c0909341SAndroid Build Coastguard Worker        uqadd           v29.8b,  v27.8b,  v21.8b  // base + 3
3708*c0909341SAndroid Build Coastguard Worker
        // Pack two rows into one 8 byte vector: bytes 0-3 belong to row y,
        // bytes 4-7 to row y+1.
3709*c0909341SAndroid Build Coastguard Worker        trn1            v24.2s,  v24.2s,  v24.2s  // frac
3710*c0909341SAndroid Build Coastguard Worker        trn1            v26.2s,  v26.2s,  v28.2s  // base + 0, base + 2
3711*c0909341SAndroid Build Coastguard Worker        trn1            v27.2s,  v27.2s,  v29.2s  // base + 1, base + 3
3712*c0909341SAndroid Build Coastguard Worker        trn1            v25.2s,  v25.2s,  v25.2s  // 64 - frac
3713*c0909341SAndroid Build Coastguard Worker
3714*c0909341SAndroid Build Coastguard Worker        movi            v21.16b, #4
3715*c0909341SAndroid Build Coastguard Worker1:
3716*c0909341SAndroid Build Coastguard Worker        mov             v4.8b,   v31.8b           // preload padding for out of range indices
3717*c0909341SAndroid Build Coastguard Worker        mov             v5.8b,   v31.8b
3718*c0909341SAndroid Build Coastguard Worker        tbx             v4.8b, {v0.16b, v1.16b}, v26.8b // left[base], left[base+2]
3719*c0909341SAndroid Build Coastguard Worker        tbx             v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+3]
3720*c0909341SAndroid Build Coastguard Worker
3721*c0909341SAndroid Build Coastguard Worker        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
3722*c0909341SAndroid Build Coastguard Worker        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
3723*c0909341SAndroid Build Coastguard Worker        rshrn           v16.8b,  v16.8h,  #6
3724*c0909341SAndroid Build Coastguard Worker        st1             {v16.s}[0], [x0], x1      // row y
3725*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
3726*c0909341SAndroid Build Coastguard Worker        st1             {v16.s}[1], [x0], x1      // row y+1
3727*c0909341SAndroid Build Coastguard Worker        b.le            9f
3728*c0909341SAndroid Build Coastguard Worker
3729*c0909341SAndroid Build Coastguard Worker        uqadd           v26.8b,  v26.8b,  v21.8b  // base += 4
3730*c0909341SAndroid Build Coastguard Worker        uqadd           v27.8b,  v27.8b,  v21.8b  // base += 4
3731*c0909341SAndroid Build Coastguard Worker        b               1b
3732*c0909341SAndroid Build Coastguard Worker
3733*c0909341SAndroid Build Coastguard Worker9:
3734*c0909341SAndroid Build Coastguard Worker        ret
3735*c0909341SAndroid Build Coastguard Worker
3736*c0909341SAndroid Build Coastguard Worker80:     // w == 8
3737*c0909341SAndroid Build Coastguard Worker        dup             v29.8h,  w5               // dy
3738*c0909341SAndroid Build Coastguard Worker
3739*c0909341SAndroid Build Coastguard Worker        mul             v30.8h,  v30.8h,  v29.8h  // {0,1,2,3,4,5,6,7}*dy
3740*c0909341SAndroid Build Coastguard Worker        movi            v23.16b, #0x3e
3741*c0909341SAndroid Build Coastguard Worker
3742*c0909341SAndroid Build Coastguard Worker        // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16,
3743*c0909341SAndroid Build Coastguard Worker        // so max_base_y <= 32.
3744*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b}, [x2] // left[]
3745*c0909341SAndroid Build Coastguard Worker        add             v30.8h,  v29.8h,  v30.8h  // ypos
3746*c0909341SAndroid Build Coastguard Worker
3747*c0909341SAndroid Build Coastguard Worker        movi            v22.16b, #64
3748*c0909341SAndroid Build Coastguard Worker        movi            v20.16b, #1
3749*c0909341SAndroid Build Coastguard Worker        movi            v21.16b, #2
3750*c0909341SAndroid Build Coastguard Worker
3751*c0909341SAndroid Build Coastguard Worker        xtn             v24.8b,  v30.8h           // (uint8_t)ypos
3752*c0909341SAndroid Build Coastguard Worker        uqshrn          v26.8b,  v30.8h,  #6      // base
3753*c0909341SAndroid Build Coastguard Worker        and             v24.8b,  v24.8b,  v23.8b  // frac
3754*c0909341SAndroid Build Coastguard Worker
3755*c0909341SAndroid Build Coastguard Worker        uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1
3756*c0909341SAndroid Build Coastguard Worker        uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2
3757*c0909341SAndroid Build Coastguard Worker        sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac
3758*c0909341SAndroid Build Coastguard Worker        uqadd           v29.8b,  v27.8b,  v21.8b  // base + 3
3759*c0909341SAndroid Build Coastguard Worker
        // Pack two rows into one 16 byte vector: the low half belongs to
        // row y, the high half to row y+1.
3760*c0909341SAndroid Build Coastguard Worker        trn1            v24.2d,  v24.2d,  v24.2d  // frac
3761*c0909341SAndroid Build Coastguard Worker        trn1            v26.2d,  v26.2d,  v28.2d  // base + 0, base + 2
3762*c0909341SAndroid Build Coastguard Worker        trn1            v27.2d,  v27.2d,  v29.2d  // base + 1, base + 3
3763*c0909341SAndroid Build Coastguard Worker        trn1            v25.2d,  v25.2d,  v25.2d  // 64 - frac
3764*c0909341SAndroid Build Coastguard Worker
3765*c0909341SAndroid Build Coastguard Worker        movi            v21.16b, #4
3766*c0909341SAndroid Build Coastguard Worker1:
3767*c0909341SAndroid Build Coastguard Worker        mov             v4.16b,  v31.16b          // preload padding for out of range indices
3768*c0909341SAndroid Build Coastguard Worker        mov             v5.16b,  v31.16b
        // Only v0 and v1 hold left[] here, so the lookup table must be
        // exactly these two registers: indices >= 32 then leave the
        // padding in place instead of reading whatever v2/v3 contain.
3769*c0909341SAndroid Build Coastguard Worker        tbx             v4.16b, {v0.16b, v1.16b}, v26.16b // left[base], left[base+2]
3770*c0909341SAndroid Build Coastguard Worker        tbx             v5.16b, {v0.16b, v1.16b}, v27.16b // left[base+1], left[base+3]
3771*c0909341SAndroid Build Coastguard Worker
3772*c0909341SAndroid Build Coastguard Worker        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
3773*c0909341SAndroid Build Coastguard Worker        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
3774*c0909341SAndroid Build Coastguard Worker        umull2          v17.8h,  v4.16b,  v25.16b // same for row y+1 (high halves)
3775*c0909341SAndroid Build Coastguard Worker        umlal2          v17.8h,  v5.16b,  v24.16b
3776*c0909341SAndroid Build Coastguard Worker        rshrn           v16.8b,  v16.8h,  #6
3777*c0909341SAndroid Build Coastguard Worker        rshrn           v17.8b,  v17.8h,  #6
3778*c0909341SAndroid Build Coastguard Worker        st1             {v16.8b}, [x0], x1        // row y
3779*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
3780*c0909341SAndroid Build Coastguard Worker        st1             {v17.8b}, [x0], x1        // row y+1
3781*c0909341SAndroid Build Coastguard Worker        b.le            9f
3782*c0909341SAndroid Build Coastguard Worker
3783*c0909341SAndroid Build Coastguard Worker        uqadd           v26.16b, v26.16b, v21.16b // base += 4
3784*c0909341SAndroid Build Coastguard Worker        uqadd           v27.16b, v27.16b, v21.16b // base += 4
3785*c0909341SAndroid Build Coastguard Worker        b               1b
3786*c0909341SAndroid Build Coastguard Worker
3787*c0909341SAndroid Build Coastguard Worker9:
3788*c0909341SAndroid Build Coastguard Worker        ret
3789*c0909341SAndroid Build Coastguard Workerendfunc
3790*c0909341SAndroid Build Coastguard Worker
3791*c0909341SAndroid Build Coastguard Worker
3792*c0909341SAndroid Build Coastguard Worker// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride,
3793*c0909341SAndroid Build Coastguard Worker//                             const pixel *const topleft,
3794*c0909341SAndroid Build Coastguard Worker//                             const int width, const int height, const int filt_idx,
3795*c0909341SAndroid Build Coastguard Worker//                             const int max_width, const int max_height);
//
// Filter intra prediction for 8 bit pixels. The output is produced in blocks
// of 4x2 pixels. Each block is a weighted sum of seven neighbours p0-p6
// (topleft, four top pixels, two left pixels); the sum is rounded, shifted
// right by 4 and saturated to 8 bits. Lanes 0-3 of a result vector are the
// upper row of the block, lanes 4-7 the lower row. The lower row and the
// rightmost column of a finished block are reused as the top/left neighbours
// of the following blocks.
//
// Register use (arguments as passed under AAPCS64):
//   x0 = dst, pointer to the upper row of the current row pair
//   x6 = dst + stride, pointer to the lower row of the current row pair
//   x1 = stride on entry, 2 * stride after the setup
//   x2 = topleft; moved to topleft - 2 and stepped by -2 per row pair
//   w3 = width, w4 = height (rows still to produce), w5 = filt_idx
//   v16-v22 = tap vectors for p0-p6, sign extended to 16 bit
//   The max_width/max_height arguments (w6/w7) are never read; x6 and x7
//   are overwritten and used as scratch.
3796*c0909341SAndroid Build Coastguard Workerfunction ipred_filter_8bpc_neon, export=1
3797*c0909341SAndroid Build Coastguard Worker        and             w5,  w5,  #511            // keep the low 9 bits of filt_idx
3798*c0909341SAndroid Build Coastguard Worker        movrel          x6,  X(filter_intra_taps)
3799*c0909341SAndroid Build Coastguard Worker        lsl             w5,  w5,  #6              // byte offset = filt_idx * 64
3800*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  w5, uxtw        // x6 = filter_intra_taps + byte offset
3801*c0909341SAndroid Build Coastguard Worker        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32 // taps for p0-p3, 8 bytes each
3802*c0909341SAndroid Build Coastguard Worker        clz             w9,  w3                   // leading zeros of width
3803*c0909341SAndroid Build Coastguard Worker        movrel          x5,  ipred_filter_tbl
3804*c0909341SAndroid Build Coastguard Worker        ld1             {v20.8b, v21.8b, v22.8b}, [x6] // taps for p4-p6
3805*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #26             // table index: 0 for width 32 ... 3 for width 4
3806*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x5, w9, uxtw #2]    // signed 32 bit offset relative to the table
3807*c0909341SAndroid Build Coastguard Worker        sxtl            v16.8h,  v16.8b           // taps are signed bytes; widen to 16 bit
3808*c0909341SAndroid Build Coastguard Worker        sxtl            v17.8h,  v17.8b
3809*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x9              // x5 = address of the width specific code
3810*c0909341SAndroid Build Coastguard Worker        sxtl            v18.8h,  v18.8b
3811*c0909341SAndroid Build Coastguard Worker        sxtl            v19.8h,  v19.8b
3812*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1              // x6 = lower row of the first row pair
3813*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // x1 = 2 * stride, two rows per iteration
3814*c0909341SAndroid Build Coastguard Worker        sxtl            v20.8h,  v20.8b
3815*c0909341SAndroid Build Coastguard Worker        sxtl            v21.8h,  v21.8b
3816*c0909341SAndroid Build Coastguard Worker        sxtl            v22.8h,  v22.8b
3817*c0909341SAndroid Build Coastguard Worker        br              x5
3818*c0909341SAndroid Build Coastguard Worker40:
3819*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3820*c0909341SAndroid Build Coastguard Worker        ldur            s0,  [x2, #1]             // top (0-3)
3821*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #2              // x2 = topleft - 2, start of the 4 byte left/topleft load
3822*c0909341SAndroid Build Coastguard Worker        mov             x7,  #-2                  // post-index step: two left pixels per row pair
3823*c0909341SAndroid Build Coastguard Worker        uxtl            v0.8h,   v0.8b            // top (0-3)
3824*c0909341SAndroid Build Coastguard Worker4:
3825*c0909341SAndroid Build Coastguard Worker        ld1             {v1.s}[0], [x2], x7       // left (0-1) + topleft (2)
3826*c0909341SAndroid Build Coastguard Worker        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
3827*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
3828*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
3829*c0909341SAndroid Build Coastguard Worker        uxtl            v1.8h,   v1.8b            // left (0-1) + topleft (2)
3830*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
3831*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
3832*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
3833*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
3834*c0909341SAndroid Build Coastguard Worker        sqrshrun        v2.8b,   v2.8h,   #4      // round, >> 4, saturate to unsigned 8 bit
3835*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2              // two rows done
3836*c0909341SAndroid Build Coastguard Worker        st1             {v2.s}[0], [x0], x1       // store the upper row
3837*c0909341SAndroid Build Coastguard Worker        uxtl            v0.8h,   v2.8b            // widen the output; its lower row becomes the next top
3838*c0909341SAndroid Build Coastguard Worker        st1             {v2.s}[1], [x6], x1       // store the lower row
3839*c0909341SAndroid Build Coastguard Worker        ext             v0.16b,  v0.16b,  v0.16b, #8 // move top from [4-7] to [0-3]
3840*c0909341SAndroid Build Coastguard Worker        b.gt            4b
3841*c0909341SAndroid Build Coastguard Worker        ret
3842*c0909341SAndroid Build Coastguard Worker80:
3843*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3844*c0909341SAndroid Build Coastguard Worker        ldur            d0,  [x2, #1]             // top (0-7)
3845*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #2              // x2 = topleft - 2, start of the 4 byte left/topleft load
3846*c0909341SAndroid Build Coastguard Worker        mov             x7,  #-2                  // post-index step: two left pixels per row pair
3847*c0909341SAndroid Build Coastguard Worker        uxtl            v0.8h,   v0.8b            // top (0-7)
3848*c0909341SAndroid Build Coastguard Worker8:
3849*c0909341SAndroid Build Coastguard Worker        ld1             {v1.s}[0], [x2], x7       // left (0-1) + topleft (2)
3850*c0909341SAndroid Build Coastguard Worker        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
3851*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
3852*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
3853*c0909341SAndroid Build Coastguard Worker        uxtl            v1.8h,   v1.8b            // left (0-1) + topleft (2)
3854*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
3855*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
3856*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
3857*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
3858*c0909341SAndroid Build Coastguard Worker        mul             v3.8h,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
3859*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
3860*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
3861*c0909341SAndroid Build Coastguard Worker        sqrshrun        v2.8b,   v2.8h,   #4      // first block: round, >> 4, saturate to unsigned 8 bit
3862*c0909341SAndroid Build Coastguard Worker        uxtl            v1.8h,   v2.8b            // first block, in 16 bit
3863*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
3864*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
3865*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v21.8h,  v1.h[3] // p5(left[0]) * filter(5)
3866*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v22.8h,  v1.h[7] // p6(left[1]) * filter(6)
3867*c0909341SAndroid Build Coastguard Worker        sqrshrun        v3.8b,   v3.8h,   #4      // second block: round, >> 4, saturate to unsigned 8 bit
3868*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2              // two rows done
3869*c0909341SAndroid Build Coastguard Worker        st2             {v2.s, v3.s}[0], [x0], x1 // upper row: 4 pixels of each block, 8 contiguous bytes
3870*c0909341SAndroid Build Coastguard Worker        zip2            v0.2s,   v2.2s,   v3.2s   // lower row of both blocks = top for the next row pair
3871*c0909341SAndroid Build Coastguard Worker        st2             {v2.s, v3.s}[1], [x6], x1 // lower row
3872*c0909341SAndroid Build Coastguard Worker        uxtl            v0.8h,   v0.8b            // widen the new top to 16 bit
3873*c0909341SAndroid Build Coastguard Worker        b.gt            8b
3874*c0909341SAndroid Build Coastguard Worker        ret
3875*c0909341SAndroid Build Coastguard Worker160:
3876*c0909341SAndroid Build Coastguard Worker320:
3877*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3878*c0909341SAndroid Build Coastguard Worker        add             x8,  x2,  #1              // x8 = pointer to the top row (topleft + 1)
3879*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #2              // x2 = topleft - 2, start of the 4 byte left/topleft load
3880*c0909341SAndroid Build Coastguard Worker        mov             x7,  #-2                  // post-index step: two left pixels per row pair
3881*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  w3, uxtw        // x1 = 2 * stride - width; the stores advance x0/x6 by width per row
3882*c0909341SAndroid Build Coastguard Worker        mov             w9,  w3                   // keep width; w3 is the column counter
3883*c0909341SAndroid Build Coastguard Worker
3884*c0909341SAndroid Build Coastguard Worker1:
3885*c0909341SAndroid Build Coastguard Worker        ld1             {v0.s}[0], [x2], x7       // left (0-1) + topleft (2)
3886*c0909341SAndroid Build Coastguard Worker        uxtl            v0.8h,   v0.8b            // left (0-1) + topleft (2)
3887*c0909341SAndroid Build Coastguard Worker2:
3888*c0909341SAndroid Build Coastguard Worker        ld1             {v2.16b}, [x8],   #16     // top(0-15)
3889*c0909341SAndroid Build Coastguard Worker        mul             v3.8h,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
3890*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
3891*c0909341SAndroid Build Coastguard Worker        uxtl            v1.8h,   v2.8b            // top(0-7)
3892*c0909341SAndroid Build Coastguard Worker        uxtl2           v2.8h,   v2.16b           // top(8-15)
3893*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
3894*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
3895*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
3896*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
3897*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)
3898*c0909341SAndroid Build Coastguard Worker
3899*c0909341SAndroid Build Coastguard Worker        mul             v4.8h,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
3900*c0909341SAndroid Build Coastguard Worker        mla             v4.8h,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
3901*c0909341SAndroid Build Coastguard Worker        mla             v4.8h,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
3902*c0909341SAndroid Build Coastguard Worker        sqrshrun        v3.8b,   v3.8h,   #4      // first block: round, >> 4, saturate to unsigned 8 bit
3903*c0909341SAndroid Build Coastguard Worker        uxtl            v0.8h,   v3.8b            // first block, in 16 bit
3904*c0909341SAndroid Build Coastguard Worker        mla             v4.8h,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
3905*c0909341SAndroid Build Coastguard Worker        mla             v4.8h,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
3906*c0909341SAndroid Build Coastguard Worker        mla             v4.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5)
3907*c0909341SAndroid Build Coastguard Worker        mla             v4.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6)
3908*c0909341SAndroid Build Coastguard Worker
3909*c0909341SAndroid Build Coastguard Worker        mul             v5.8h,   v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
3910*c0909341SAndroid Build Coastguard Worker        mla             v5.8h,   v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
3911*c0909341SAndroid Build Coastguard Worker        mla             v5.8h,   v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
3912*c0909341SAndroid Build Coastguard Worker        sqrshrun        v4.8b,   v4.8h,   #4      // second block: round, >> 4, saturate to unsigned 8 bit
3913*c0909341SAndroid Build Coastguard Worker        uxtl            v0.8h,   v4.8b            // second block, in 16 bit
3914*c0909341SAndroid Build Coastguard Worker        mla             v5.8h,   v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
3915*c0909341SAndroid Build Coastguard Worker        mla             v5.8h,   v16.8h,  v1.h[7] // p0(topleft) * filter(0)
3916*c0909341SAndroid Build Coastguard Worker        mla             v5.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5)
3917*c0909341SAndroid Build Coastguard Worker        mla             v5.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6)
3918*c0909341SAndroid Build Coastguard Worker
3919*c0909341SAndroid Build Coastguard Worker        mul             v6.8h,   v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
3920*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
3921*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
3922*c0909341SAndroid Build Coastguard Worker        sqrshrun        v5.8b,   v5.8h,   #4      // third block: round, >> 4, saturate to unsigned 8 bit
3923*c0909341SAndroid Build Coastguard Worker        uxtl            v0.8h,   v5.8b            // third block, in 16 bit
3924*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
3925*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v16.8h,  v2.h[3] // p0(topleft) * filter(0)
3926*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5)
3927*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6)
3928*c0909341SAndroid Build Coastguard Worker
3929*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #16             // 16 columns done
3930*c0909341SAndroid Build Coastguard Worker        sqrshrun        v6.8b,   v6.8h,   #4      // fourth block: round, >> 4, saturate to unsigned 8 bit
3931*c0909341SAndroid Build Coastguard Worker
3932*c0909341SAndroid Build Coastguard Worker        st4             {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16 // upper row of the four blocks, 16 bytes
3933*c0909341SAndroid Build Coastguard Worker        st4             {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16 // lower row of the four blocks
3934*c0909341SAndroid Build Coastguard Worker        b.le            8f                        // no columns left in this row pair
3935*c0909341SAndroid Build Coastguard Worker        ins             v0.h[2], v2.h[7]          // next topleft = top[15]
3936*c0909341SAndroid Build Coastguard Worker        ins             v0.b[0], v6.b[7]          // next left[1] = last pixel of the lower row
3937*c0909341SAndroid Build Coastguard Worker        ins             v0.b[2], v6.b[3]          // next left[0] = last pixel of the upper row
3938*c0909341SAndroid Build Coastguard Worker        b               2b
3939*c0909341SAndroid Build Coastguard Worker8:
3940*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2              // two rows done
3941*c0909341SAndroid Build Coastguard Worker        b.le            9f
3942*c0909341SAndroid Build Coastguard Worker        sub             x8,  x6,  w9, uxtw        // new top = start of the lower row just written
3943*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1              // advance both row pointers to the next row pair
3944*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  x1
3945*c0909341SAndroid Build Coastguard Worker        mov             w3,  w9                   // restore the column counter
3946*c0909341SAndroid Build Coastguard Worker        b               1b
3947*c0909341SAndroid Build Coastguard Worker9:
3948*c0909341SAndroid Build Coastguard Worker        ret
3949*c0909341SAndroid Build Coastguard Workerendfunc
3950*c0909341SAndroid Build Coastguard Worker
3951*c0909341SAndroid Build Coastguard Workerjumptable ipred_filter_tbl
// Offsets of the width specific entry points of ipred_filter_8bpc_neon,
// relative to the start of this table. The function indexes it with
// clz(width) - 26 and adds the loaded 32 bit value to the table address,
// so the entries run from the widest block to the narrowest.
3952*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_filter_tbl // index 0: width 32
3953*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_filter_tbl // index 1: width 16 (same code as width 32)
3954*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_filter_tbl // index 2: width 8
3955*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_filter_tbl // index 3: width 4
3956*c0909341SAndroid Build Coastguard Workerendjumptable
3957*c0909341SAndroid Build Coastguard Worker
3958*c0909341SAndroid Build Coastguard Worker// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
3959*c0909341SAndroid Build Coastguard Worker//                         const pixel *const pal, const uint8_t *idx,
3960*c0909341SAndroid Build Coastguard Worker//                         const int w, const int h);
//
// Palette prediction for 8 bit pixels. Every byte read from idx carries two
// palette indices: the low bits (masked with 7) belong to the earlier pixel,
// the upper four bits to the following one. The two halves are separated,
// interleaved back into pixel order and replaced by palette bytes with tbl.
//
// Register use (arguments as passed under AAPCS64):
//   x0 = dst, x1 = stride (doubled below), x3 = idx, w4 = w, w5 = h
//   x2 = pal on entry; after the palette is loaded it is reused as dst + stride
//   v0 = the 8 palette bytes, v31 = mask 7 in every byte
3961*c0909341SAndroid Build Coastguard Workerfunction pal_pred_8bpc_neon, export=1
3962*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b}, [x2]             // load 8 palette bytes
3963*c0909341SAndroid Build Coastguard Worker        clz             w9,  w4                   // leading zeros of w
3964*c0909341SAndroid Build Coastguard Worker        movrel          x6,  pal_pred_tbl
3965*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #25             // table index: 0 for w = 64 ... 4 for w = 4
3966*c0909341SAndroid Build Coastguard Worker        movi            v31.16b, #7               // mask for the low index of each byte
3967*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x6, w9, uxtw #2]    // signed 32 bit offset relative to the table
3968*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  x9              // x6 = address of the width specific code
3969*c0909341SAndroid Build Coastguard Worker        add             x2,  x0,  x1              // x2 = second row of dst (pal pointer no longer needed)
3970*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // x1 = 2 * stride; x0 and x2 each step over alternate rows
3971*c0909341SAndroid Build Coastguard Worker        br              x6
3972*c0909341SAndroid Build Coastguard Worker40:
3973*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3974*c0909341SAndroid Build Coastguard Worker4:
3975*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8b}, [x3], #8         // 8 idx bytes = 16 pixels = 4 rows of 4
3976*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #4              // four rows per iteration
3977*c0909341SAndroid Build Coastguard Worker        ushr            v3.8b,   v1.8b,   #4      // second index of each byte
3978*c0909341SAndroid Build Coastguard Worker        and             v2.8b,   v1.8b,   v31.8b  // first index of each byte
3979*c0909341SAndroid Build Coastguard Worker        zip1            v1.16b,  v2.16b,  v3.16b  // interleave into pixel order
3980*c0909341SAndroid Build Coastguard Worker        tbl             v1.16b, {v0.16b}, v1.16b  // index -> palette byte
3981*c0909341SAndroid Build Coastguard Worker        st1             {v1.s}[0], [x0], x1       // row 0
3982*c0909341SAndroid Build Coastguard Worker        st1             {v1.s}[1], [x2], x1       // row 1
3983*c0909341SAndroid Build Coastguard Worker        st1             {v1.s}[2], [x0], x1       // row 2
3984*c0909341SAndroid Build Coastguard Worker        st1             {v1.s}[3], [x2], x1       // row 3
3985*c0909341SAndroid Build Coastguard Worker        b.gt            4b
3986*c0909341SAndroid Build Coastguard Worker        ret
3987*c0909341SAndroid Build Coastguard Worker80:
3988*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3989*c0909341SAndroid Build Coastguard Worker8:
3990*c0909341SAndroid Build Coastguard Worker        ld1             {v1.16b}, [x3], #16       // 16 idx bytes = 32 pixels = 4 rows of 8
3991*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #4              // four rows per iteration
3992*c0909341SAndroid Build Coastguard Worker        ushr            v4.16b,  v1.16b,  #4      // second index of each byte
3993*c0909341SAndroid Build Coastguard Worker        and             v3.16b,  v1.16b,  v31.16b // first index of each byte
3994*c0909341SAndroid Build Coastguard Worker        zip1            v1.16b,  v3.16b,  v4.16b  // pixel order, rows 0-1
3995*c0909341SAndroid Build Coastguard Worker        zip2            v2.16b,  v3.16b,  v4.16b  // pixel order, rows 2-3
3996*c0909341SAndroid Build Coastguard Worker        tbl             v1.16b, {v0.16b}, v1.16b  // index -> palette byte
3997*c0909341SAndroid Build Coastguard Worker        st1             {v1.d}[0], [x0], x1       // row 0
3998*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v0.16b}, v2.16b
3999*c0909341SAndroid Build Coastguard Worker        st1             {v1.d}[1], [x2], x1       // row 1
4000*c0909341SAndroid Build Coastguard Worker        st1             {v2.d}[0], [x0], x1       // row 2
4001*c0909341SAndroid Build Coastguard Worker        st1             {v2.d}[1], [x2], x1       // row 3
4002*c0909341SAndroid Build Coastguard Worker        b.gt            8b
4003*c0909341SAndroid Build Coastguard Worker        ret
4004*c0909341SAndroid Build Coastguard Worker160:
4005*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4006*c0909341SAndroid Build Coastguard Worker16:
4007*c0909341SAndroid Build Coastguard Worker        ld1             {v1.16b, v2.16b}, [x3], #32 // 32 idx bytes = 64 pixels = 4 rows of 16
4008*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #4              // four rows per iteration
4009*c0909341SAndroid Build Coastguard Worker        ushr            v5.16b,  v1.16b,  #4      // second index of each byte
4010*c0909341SAndroid Build Coastguard Worker        and             v4.16b,  v1.16b,  v31.16b // first index of each byte
4011*c0909341SAndroid Build Coastguard Worker        ushr            v7.16b,  v2.16b,  #4
4012*c0909341SAndroid Build Coastguard Worker        and             v6.16b,  v2.16b,  v31.16b
4013*c0909341SAndroid Build Coastguard Worker        zip1            v1.16b,  v4.16b,  v5.16b  // pixel order, row 0
4014*c0909341SAndroid Build Coastguard Worker        zip2            v2.16b,  v4.16b,  v5.16b  // pixel order, row 1
4015*c0909341SAndroid Build Coastguard Worker        zip1            v3.16b,  v6.16b,  v7.16b  // pixel order, row 2
4016*c0909341SAndroid Build Coastguard Worker        tbl             v1.16b, {v0.16b}, v1.16b  // index -> palette byte
4017*c0909341SAndroid Build Coastguard Worker        zip2            v4.16b,  v6.16b,  v7.16b  // pixel order, row 3
4018*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v0.16b}, v2.16b
4019*c0909341SAndroid Build Coastguard Worker        st1             {v1.16b}, [x0], x1        // row 0
4020*c0909341SAndroid Build Coastguard Worker        tbl             v3.16b, {v0.16b}, v3.16b
4021*c0909341SAndroid Build Coastguard Worker        st1             {v2.16b}, [x2], x1        // row 1
4022*c0909341SAndroid Build Coastguard Worker        tbl             v4.16b, {v0.16b}, v4.16b
4023*c0909341SAndroid Build Coastguard Worker        st1             {v3.16b}, [x0], x1        // row 2
4024*c0909341SAndroid Build Coastguard Worker        st1             {v4.16b}, [x2], x1        // row 3
4025*c0909341SAndroid Build Coastguard Worker        b.gt            16b
4026*c0909341SAndroid Build Coastguard Worker        ret
4027*c0909341SAndroid Build Coastguard Worker320:
4028*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4029*c0909341SAndroid Build Coastguard Worker32:
4030*c0909341SAndroid Build Coastguard Worker        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 // 64 idx bytes = 128 pixels = 4 rows of 32
4031*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #4              // four rows per iteration
4032*c0909341SAndroid Build Coastguard Worker        ushr            v21.16b, v16.16b, #4      // second index of each byte
4033*c0909341SAndroid Build Coastguard Worker        and             v20.16b, v16.16b, v31.16b // first index of each byte
4034*c0909341SAndroid Build Coastguard Worker        ushr            v23.16b, v17.16b, #4
4035*c0909341SAndroid Build Coastguard Worker        and             v22.16b, v17.16b, v31.16b
4036*c0909341SAndroid Build Coastguard Worker        ushr            v25.16b, v18.16b, #4
4037*c0909341SAndroid Build Coastguard Worker        and             v24.16b, v18.16b, v31.16b
4038*c0909341SAndroid Build Coastguard Worker        ushr            v27.16b, v19.16b, #4
4039*c0909341SAndroid Build Coastguard Worker        and             v26.16b, v19.16b, v31.16b
4040*c0909341SAndroid Build Coastguard Worker        zip1            v16.16b, v20.16b, v21.16b // pixel order, row 0 (v16-v17)
4041*c0909341SAndroid Build Coastguard Worker        zip2            v17.16b, v20.16b, v21.16b
4042*c0909341SAndroid Build Coastguard Worker        zip1            v18.16b, v22.16b, v23.16b // pixel order, row 1 (v18-v19)
4043*c0909341SAndroid Build Coastguard Worker        zip2            v19.16b, v22.16b, v23.16b
4044*c0909341SAndroid Build Coastguard Worker        zip1            v20.16b, v24.16b, v25.16b // pixel order, row 2 (v20-v21)
4045*c0909341SAndroid Build Coastguard Worker        zip2            v21.16b, v24.16b, v25.16b
4046*c0909341SAndroid Build Coastguard Worker        tbl             v16.16b, {v0.16b}, v16.16b // index -> palette byte
4047*c0909341SAndroid Build Coastguard Worker        zip1            v22.16b, v26.16b, v27.16b // pixel order, row 3 (v22-v23)
4048*c0909341SAndroid Build Coastguard Worker        tbl             v17.16b, {v0.16b}, v17.16b
4049*c0909341SAndroid Build Coastguard Worker        zip2            v23.16b, v26.16b, v27.16b
4050*c0909341SAndroid Build Coastguard Worker        tbl             v18.16b, {v0.16b}, v18.16b
4051*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v0.16b}, v19.16b
4052*c0909341SAndroid Build Coastguard Worker        tbl             v20.16b, {v0.16b}, v20.16b
4053*c0909341SAndroid Build Coastguard Worker        st1             {v16.16b, v17.16b}, [x0], x1 // row 0
4054*c0909341SAndroid Build Coastguard Worker        tbl             v21.16b, {v0.16b}, v21.16b
4055*c0909341SAndroid Build Coastguard Worker        st1             {v18.16b, v19.16b}, [x2], x1 // row 1
4056*c0909341SAndroid Build Coastguard Worker        tbl             v22.16b, {v0.16b}, v22.16b
4057*c0909341SAndroid Build Coastguard Worker        st1             {v20.16b, v21.16b}, [x0], x1 // row 2
4058*c0909341SAndroid Build Coastguard Worker        tbl             v23.16b, {v0.16b}, v23.16b
4059*c0909341SAndroid Build Coastguard Worker        st1             {v22.16b, v23.16b}, [x2], x1 // row 3
4060*c0909341SAndroid Build Coastguard Worker        b.gt            32b
4061*c0909341SAndroid Build Coastguard Worker        ret
4062*c0909341SAndroid Build Coastguard Worker640:
4063*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4064*c0909341SAndroid Build Coastguard Worker64:
4065*c0909341SAndroid Build Coastguard Worker        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 // 64 idx bytes = 128 pixels = 2 rows of 64
4066*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2              // two rows per iteration
4067*c0909341SAndroid Build Coastguard Worker        ushr            v21.16b, v16.16b, #4      // second index of each byte
4068*c0909341SAndroid Build Coastguard Worker        and             v20.16b, v16.16b, v31.16b // first index of each byte
4069*c0909341SAndroid Build Coastguard Worker        ushr            v23.16b, v17.16b, #4
4070*c0909341SAndroid Build Coastguard Worker        and             v22.16b, v17.16b, v31.16b
4071*c0909341SAndroid Build Coastguard Worker        ushr            v25.16b, v18.16b, #4
4072*c0909341SAndroid Build Coastguard Worker        and             v24.16b, v18.16b, v31.16b
4073*c0909341SAndroid Build Coastguard Worker        ushr            v27.16b, v19.16b, #4
4074*c0909341SAndroid Build Coastguard Worker        and             v26.16b, v19.16b, v31.16b
4075*c0909341SAndroid Build Coastguard Worker        zip1            v16.16b, v20.16b, v21.16b // pixel order, row 0 (v16-v19)
4076*c0909341SAndroid Build Coastguard Worker        zip2            v17.16b, v20.16b, v21.16b
4077*c0909341SAndroid Build Coastguard Worker        zip1            v18.16b, v22.16b, v23.16b
4078*c0909341SAndroid Build Coastguard Worker        zip2            v19.16b, v22.16b, v23.16b
4079*c0909341SAndroid Build Coastguard Worker        zip1            v20.16b, v24.16b, v25.16b // pixel order, row 1 (v20-v23)
4080*c0909341SAndroid Build Coastguard Worker        zip2            v21.16b, v24.16b, v25.16b
4081*c0909341SAndroid Build Coastguard Worker        tbl             v16.16b, {v0.16b}, v16.16b // index -> palette byte
4082*c0909341SAndroid Build Coastguard Worker        zip1            v22.16b, v26.16b, v27.16b
4083*c0909341SAndroid Build Coastguard Worker        tbl             v17.16b, {v0.16b}, v17.16b
4084*c0909341SAndroid Build Coastguard Worker        zip2            v23.16b, v26.16b, v27.16b
4085*c0909341SAndroid Build Coastguard Worker        tbl             v18.16b, {v0.16b}, v18.16b
4086*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v0.16b}, v19.16b
4087*c0909341SAndroid Build Coastguard Worker        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1 // row 0
4088*c0909341SAndroid Build Coastguard Worker        tbl             v20.16b, {v0.16b}, v20.16b
4089*c0909341SAndroid Build Coastguard Worker        tbl             v21.16b, {v0.16b}, v21.16b
4090*c0909341SAndroid Build Coastguard Worker        tbl             v22.16b, {v0.16b}, v22.16b
4091*c0909341SAndroid Build Coastguard Worker        tbl             v23.16b, {v0.16b}, v23.16b
4092*c0909341SAndroid Build Coastguard Worker        st1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1 // row 1
4093*c0909341SAndroid Build Coastguard Worker        b.gt            64b
4094*c0909341SAndroid Build Coastguard Worker        ret
4095*c0909341SAndroid Build Coastguard Workerendfunc
4096*c0909341SAndroid Build Coastguard Worker
4097*c0909341SAndroid Build Coastguard Workerjumptable pal_pred_tbl
// Offsets of the width specific entry points of pal_pred_8bpc_neon, relative
// to the start of this table. The function indexes it with clz(w) - 25 and
// adds the loaded 32 bit value to the table address, so the entries run from
// the widest block to the narrowest.
4098*c0909341SAndroid Build Coastguard Worker        .word 640b - pal_pred_tbl // index 0: w = 64
4099*c0909341SAndroid Build Coastguard Worker        .word 320b - pal_pred_tbl // index 1: w = 32
4100*c0909341SAndroid Build Coastguard Worker        .word 160b - pal_pred_tbl // index 2: w = 16
4101*c0909341SAndroid Build Coastguard Worker        .word 80b  - pal_pred_tbl // index 3: w = 8
4102*c0909341SAndroid Build Coastguard Worker        .word 40b  - pal_pred_tbl // index 4: w = 4
4103*c0909341SAndroid Build Coastguard Workerendjumptable
4104*c0909341SAndroid Build Coastguard Worker
4105*c0909341SAndroid Build Coastguard Worker// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
4106*c0909341SAndroid Build Coastguard Worker//                              const pixel *const topleft,
4107*c0909341SAndroid Build Coastguard Worker//                              const int width, const int height,
4108*c0909341SAndroid Build Coastguard Worker//                              const int16_t *ac, const int alpha);
4109*c0909341SAndroid Build Coastguard Workerfunction ipred_cfl_128_8bpc_neon, export=1
4110*c0909341SAndroid Build Coastguard Worker        clz             w9,  w3
4111*c0909341SAndroid Build Coastguard Worker        movrel          x7,  ipred_cfl_128_tbl
4112*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #26
4113*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x7, w9, uxtw #2]
4114*c0909341SAndroid Build Coastguard Worker        movi            v0.8h,   #128 // dc
4115*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   w6   // alpha
4116*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  x9
4117*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1
4118*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
4119*c0909341SAndroid Build Coastguard Worker        br              x7
4120*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_splat_w4):
4121*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4122*c0909341SAndroid Build Coastguard Worker1:
4123*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x5], #32
4124*c0909341SAndroid Build Coastguard Worker        mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
4125*c0909341SAndroid Build Coastguard Worker        mul             v3.8h,   v3.8h,   v1.8h
4126*c0909341SAndroid Build Coastguard Worker        cmlt            v4.8h,   v2.8h,   #0     // sign
4127*c0909341SAndroid Build Coastguard Worker        cmlt            v5.8h,   v3.8h,   #0
4128*c0909341SAndroid Build Coastguard Worker        add             v2.8h,   v2.8h,   v4.8h  // diff + sign
4129*c0909341SAndroid Build Coastguard Worker        add             v3.8h,   v3.8h,   v5.8h
4130*c0909341SAndroid Build Coastguard Worker        srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign()
4131*c0909341SAndroid Build Coastguard Worker        srshr           v3.8h,   v3.8h,   #6
4132*c0909341SAndroid Build Coastguard Worker        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
4133*c0909341SAndroid Build Coastguard Worker        add             v3.8h,   v3.8h,   v0.8h
4134*c0909341SAndroid Build Coastguard Worker        sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign())
4135*c0909341SAndroid Build Coastguard Worker        sqxtun          v3.8b,   v3.8h
4136*c0909341SAndroid Build Coastguard Worker        st1             {v2.s}[0],  [x0], x1
4137*c0909341SAndroid Build Coastguard Worker        st1             {v2.s}[1],  [x6], x1
4138*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
4139*c0909341SAndroid Build Coastguard Worker        st1             {v3.s}[0],  [x0], x1
4140*c0909341SAndroid Build Coastguard Worker        st1             {v3.s}[1],  [x6], x1
4141*c0909341SAndroid Build Coastguard Worker        b.gt            1b
4142*c0909341SAndroid Build Coastguard Worker        ret
// Shared CfL output loop for 8-pixel-wide blocks; the dc-computing entry
// points branch here. Register state on entry, as set up by the callers
// visible in this file:
//   x0 = dst (row 0), x6 = dst + stride (row 1), x1 = 2 * stride,
//   x5 = ac, w4 = height, v0.8h = dc (broadcast), v1.8h = alpha (broadcast).
// Each iteration consumes 32 consecutive int16 ac values (4 rows of 8) and
// stores 4 rows of 8 pixels, alternating between the x0 and x6 row pointers.
4143*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_splat_w8):
4144*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4145*c0909341SAndroid Build Coastguard Worker1:
4146*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64
4147*c0909341SAndroid Build Coastguard Worker        mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
4148*c0909341SAndroid Build Coastguard Worker        mul             v3.8h,   v3.8h,   v1.8h
4149*c0909341SAndroid Build Coastguard Worker        mul             v4.8h,   v4.8h,   v1.8h
4150*c0909341SAndroid Build Coastguard Worker        mul             v5.8h,   v5.8h,   v1.8h
        // cmlt produces all-ones (-1) in every lane whose diff is negative,
        // so the add below subtracts 1 from negative diffs only.
4151*c0909341SAndroid Build Coastguard Worker        cmlt            v16.8h,  v2.8h,   #0     // sign
4152*c0909341SAndroid Build Coastguard Worker        cmlt            v17.8h,  v3.8h,   #0
4153*c0909341SAndroid Build Coastguard Worker        cmlt            v18.8h,  v4.8h,   #0
4154*c0909341SAndroid Build Coastguard Worker        cmlt            v19.8h,  v5.8h,   #0
4155*c0909341SAndroid Build Coastguard Worker        add             v2.8h,   v2.8h,   v16.8h // diff + sign
4156*c0909341SAndroid Build Coastguard Worker        add             v3.8h,   v3.8h,   v17.8h
4157*c0909341SAndroid Build Coastguard Worker        add             v4.8h,   v4.8h,   v18.8h
4158*c0909341SAndroid Build Coastguard Worker        add             v5.8h,   v5.8h,   v19.8h
4159*c0909341SAndroid Build Coastguard Worker        srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign()
4160*c0909341SAndroid Build Coastguard Worker        srshr           v3.8h,   v3.8h,   #6
4161*c0909341SAndroid Build Coastguard Worker        srshr           v4.8h,   v4.8h,   #6
4162*c0909341SAndroid Build Coastguard Worker        srshr           v5.8h,   v5.8h,   #6
4163*c0909341SAndroid Build Coastguard Worker        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
4164*c0909341SAndroid Build Coastguard Worker        add             v3.8h,   v3.8h,   v0.8h
4165*c0909341SAndroid Build Coastguard Worker        add             v4.8h,   v4.8h,   v0.8h
4166*c0909341SAndroid Build Coastguard Worker        add             v5.8h,   v5.8h,   v0.8h
4167*c0909341SAndroid Build Coastguard Worker        sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign())
4168*c0909341SAndroid Build Coastguard Worker        sqxtun          v3.8b,   v3.8h
4169*c0909341SAndroid Build Coastguard Worker        sqxtun          v4.8b,   v4.8h
4170*c0909341SAndroid Build Coastguard Worker        sqxtun          v5.8b,   v5.8h
        // Rows 0 and 2 are written through x0, rows 1 and 3 through x6; both
        // pointers advance by 2 * stride per store. The flags produced by subs
        // (height -= 4) are still valid at b.gt because the st1 stores in
        // between do not modify the condition flags.
4171*c0909341SAndroid Build Coastguard Worker        st1             {v2.8b},  [x0], x1
4172*c0909341SAndroid Build Coastguard Worker        st1             {v3.8b},  [x6], x1
4173*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
4174*c0909341SAndroid Build Coastguard Worker        st1             {v4.8b},  [x0], x1
4175*c0909341SAndroid Build Coastguard Worker        st1             {v5.8b},  [x6], x1
4176*c0909341SAndroid Build Coastguard Worker        b.gt            1b
4177*c0909341SAndroid Build Coastguard Worker        ret
// Shared CfL output loop for 16- and 32-pixel-wide blocks. It handles two
// rows at a time, walking each row pair in 16-pixel columns. Register state
// on entry, as set up by the callers visible in this file:
//   x0 = dst (row 0), x6 = dst + stride (row 1), x1 = 2 * stride,
//   x5 = ac, w3 = width, w4 = height,
//   v0.8h = dc (broadcast), v1.8h = alpha (broadcast).
4178*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_splat_w16):
4179*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // x7 = ac + width * 2 bytes: the ac values of the second row of the pair.
4180*c0909341SAndroid Build Coastguard Worker        add             x7,  x5,  w3, uxtw #1
        // x1 = 2 * stride - width: the stores below already advance x0/x6 by
        // `width` bytes per row, so adding x1 afterwards moves down two rows.
4181*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  w3, uxtw
        // w9 keeps the width so the column counter w3 can be reloaded per row pair.
4182*c0909341SAndroid Build Coastguard Worker        mov             w9,  w3
4183*c0909341SAndroid Build Coastguard Worker1:
        // v2/v3 = 16 ac values of the first row, v4/v5 = 16 of the second row.
4184*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x5], #32
4185*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h}, [x7], #32
4186*c0909341SAndroid Build Coastguard Worker        mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
4187*c0909341SAndroid Build Coastguard Worker        mul             v3.8h,   v3.8h,   v1.8h
4188*c0909341SAndroid Build Coastguard Worker        mul             v4.8h,   v4.8h,   v1.8h
4189*c0909341SAndroid Build Coastguard Worker        mul             v5.8h,   v5.8h,   v1.8h
4190*c0909341SAndroid Build Coastguard Worker        cmlt            v16.8h,  v2.8h,   #0     // sign
4191*c0909341SAndroid Build Coastguard Worker        cmlt            v17.8h,  v3.8h,   #0
4192*c0909341SAndroid Build Coastguard Worker        cmlt            v18.8h,  v4.8h,   #0
4193*c0909341SAndroid Build Coastguard Worker        cmlt            v19.8h,  v5.8h,   #0
4194*c0909341SAndroid Build Coastguard Worker        add             v2.8h,   v2.8h,   v16.8h // diff + sign
4195*c0909341SAndroid Build Coastguard Worker        add             v3.8h,   v3.8h,   v17.8h
4196*c0909341SAndroid Build Coastguard Worker        add             v4.8h,   v4.8h,   v18.8h
4197*c0909341SAndroid Build Coastguard Worker        add             v5.8h,   v5.8h,   v19.8h
4198*c0909341SAndroid Build Coastguard Worker        srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign()
4199*c0909341SAndroid Build Coastguard Worker        srshr           v3.8h,   v3.8h,   #6
4200*c0909341SAndroid Build Coastguard Worker        srshr           v4.8h,   v4.8h,   #6
4201*c0909341SAndroid Build Coastguard Worker        srshr           v5.8h,   v5.8h,   #6
4202*c0909341SAndroid Build Coastguard Worker        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
4203*c0909341SAndroid Build Coastguard Worker        add             v3.8h,   v3.8h,   v0.8h
4204*c0909341SAndroid Build Coastguard Worker        add             v4.8h,   v4.8h,   v0.8h
4205*c0909341SAndroid Build Coastguard Worker        add             v5.8h,   v5.8h,   v0.8h
4206*c0909341SAndroid Build Coastguard Worker        sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign())
4207*c0909341SAndroid Build Coastguard Worker        sqxtun          v3.8b,   v3.8h
4208*c0909341SAndroid Build Coastguard Worker        sqxtun          v4.8b,   v4.8h
4209*c0909341SAndroid Build Coastguard Worker        sqxtun          v5.8b,   v5.8h
        // Column counter: 16 pixels of both rows done. The stores do not touch
        // the flags, so b.gt still tests the result of this subs.
4210*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #16
4211*c0909341SAndroid Build Coastguard Worker        st1             {v2.8b, v3.8b},  [x0], #16
4212*c0909341SAndroid Build Coastguard Worker        st1             {v4.8b, v5.8b},  [x6], #16
4213*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // Row pair finished. x5 and x7 have each moved one ac row forward, so
        // skipping one more row (width * 2 bytes) puts them on the next pair.
        // The adds and the mov below leave the flags from this subs untouched
        // for the final b.gt.
4214*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
4215*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  w9, uxtw #1
4216*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  w9, uxtw #1
4217*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1
4218*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  x1
4219*c0909341SAndroid Build Coastguard Worker        mov             w3,  w9
4220*c0909341SAndroid Build Coastguard Worker        b.gt            1b
4221*c0909341SAndroid Build Coastguard Worker        ret
4222*c0909341SAndroid Build Coastguard Workerendfunc
4223*c0909341SAndroid Build Coastguard Worker
// Width-indexed dispatch table for the shared splat loops. As used by
// ipred_cfl_left_8bpc_neon, index = clz(width) - 26, i.e. 0 for width 32,
// 1 for 16, 2 for 8 and 3 for 4; widths 32 and 16 share the w16 loop.
// Entries are offsets from ipred_cfl_128_tbl. ipred_cfl_left_8bpc_neon adds
// them to the address of ipred_cfl_splat_tbl instead, which relies on both
// labels resolving to the same address.
4224*c0909341SAndroid Build Coastguard Workerjumptable ipred_cfl_128_tbl
4225*c0909341SAndroid Build Coastguard Workeripred_cfl_splat_tbl:
4226*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl
4227*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl
4228*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_splat_w8)  - ipred_cfl_128_tbl
4229*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_splat_w4)  - ipred_cfl_128_tbl
4230*c0909341SAndroid Build Coastguard Workerendjumptable
4231*c0909341SAndroid Build Coastguard Worker
4232*c0909341SAndroid Build Coastguard Worker// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
4233*c0909341SAndroid Build Coastguard Worker//                              const pixel *const topleft,
4234*c0909341SAndroid Build Coastguard Worker//                              const int width, const int height,
4235*c0909341SAndroid Build Coastguard Worker//                              const int16_t *ac, const int alpha);
//
// Computes dc as the rounded mean of the `width` pixels starting at
// topleft + 1, broadcasts it into v0.8h and branches to the shared splat
// loop for that width, which applies ac * alpha and writes the block.
// Arguments arrive as x0 = dst, x1 = stride, x2 = topleft, w3 = width,
// w4 = height, x5 = ac, w6 = alpha.
4236*c0909341SAndroid Build Coastguard Workerfunction ipred_cfl_top_8bpc_neon, export=1
        // Table index = clz(width) - 26: 0 for width 32 down to 3 for width 4.
4237*c0909341SAndroid Build Coastguard Worker        clz             w9,  w3
4238*c0909341SAndroid Build Coastguard Worker        movrel          x7,  ipred_cfl_top_tbl
4239*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #26
4240*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x7, w9, uxtw #2]
4241*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   w6   // alpha
        // x2 = topleft + 1: first pixel that goes into the sum.
4242*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #1
4243*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  x9
        // x6 = dst + stride (second row), x1 = 2 * stride: the splat loops
        // write alternate rows through x0 and x6.
4244*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1
4245*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
4246*c0909341SAndroid Build Coastguard Worker        br              x7
4247*c0909341SAndroid Build Coastguard Worker4:
4248*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // ld1r replicates the 4 loaded bytes into both 32-bit lanes, so uaddlv
        // returns 2 * sum and the rounding shift by 3 yields (sum + 2) >> 2.
4249*c0909341SAndroid Build Coastguard Worker        ld1r            {v0.2s},  [x2]
4250*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.8b
4251*c0909341SAndroid Build Coastguard Worker        urshr           v0.4h,   v0.4h,   #3
4252*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
4253*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_splat_w4)
4254*c0909341SAndroid Build Coastguard Worker8:
4255*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // dc = (sum of 8 pixels + 4) >> 3
4256*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b},  [x2]
4257*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.8b
4258*c0909341SAndroid Build Coastguard Worker        urshr           v0.4h,   v0.4h,   #3
4259*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
4260*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_splat_w8)
4261*c0909341SAndroid Build Coastguard Worker16:
4262*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // dc = (sum of 16 pixels + 8) >> 4
4263*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b}, [x2]
4264*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.16b
4265*c0909341SAndroid Build Coastguard Worker        urshr           v0.4h,   v0.4h,   #4
4266*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
4267*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_splat_w16)
4268*c0909341SAndroid Build Coastguard Worker32:
4269*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // dc = (sum of 32 pixels + 16) >> 5; the two 16-byte halves are summed
        // separately and then added.
4270*c0909341SAndroid Build Coastguard Worker        ld1             {v2.16b, v3.16b}, [x2]
4271*c0909341SAndroid Build Coastguard Worker        uaddlv          h2,      v2.16b
4272*c0909341SAndroid Build Coastguard Worker        uaddlv          h3,      v3.16b
4273*c0909341SAndroid Build Coastguard Worker        add             v2.4h,   v2.4h,   v3.4h
4274*c0909341SAndroid Build Coastguard Worker        urshr           v2.4h,   v2.4h,   #5
4275*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v2.h[0]
4276*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_splat_w16)
4277*c0909341SAndroid Build Coastguard Workerendfunc
4278*c0909341SAndroid Build Coastguard Worker
// Width-indexed dispatch table for ipred_cfl_top_8bpc_neon:
// index = clz(width) - 26, so the entries are ordered 32, 16, 8, 4.
// The numeric labels are the ones defined inside that function.
4279*c0909341SAndroid Build Coastguard Workerjumptable ipred_cfl_top_tbl
4280*c0909341SAndroid Build Coastguard Worker        .word 32b - ipred_cfl_top_tbl
4281*c0909341SAndroid Build Coastguard Worker        .word 16b - ipred_cfl_top_tbl
4282*c0909341SAndroid Build Coastguard Worker        .word 8b  - ipred_cfl_top_tbl
4283*c0909341SAndroid Build Coastguard Worker        .word 4b  - ipred_cfl_top_tbl
4284*c0909341SAndroid Build Coastguard Workerendjumptable
4285*c0909341SAndroid Build Coastguard Worker
4286*c0909341SAndroid Build Coastguard Worker// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
4287*c0909341SAndroid Build Coastguard Worker//                               const pixel *const topleft,
4288*c0909341SAndroid Build Coastguard Worker//                               const int width, const int height,
4289*c0909341SAndroid Build Coastguard Worker//                               const int16_t *ac, const int alpha);
//
// Computes dc as the rounded mean of the `height` pixels starting at
// topleft - height, broadcasts it into v0.8h and jumps to the shared splat
// loop selected by the width. Two table lookups are done up front: x7 gets
// the dc routine for this height, x9 the splat loop for this width.
// Arguments arrive as x0 = dst, x1 = stride, x2 = topleft, w3 = width,
// w4 = height, x5 = ac, w6 = alpha.
4290*c0909341SAndroid Build Coastguard Workerfunction ipred_cfl_left_8bpc_neon, export=1
        // x2 = topleft - height: first pixel that goes into the sum.
4291*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  w4, uxtw
        // Both table indices are clz(n) - 26: 0 for 32 down to 3 for 4.
4292*c0909341SAndroid Build Coastguard Worker        clz             w9,  w3
4293*c0909341SAndroid Build Coastguard Worker        clz             w8,  w4
4294*c0909341SAndroid Build Coastguard Worker        movrel          x10, ipred_cfl_splat_tbl
4295*c0909341SAndroid Build Coastguard Worker        movrel          x7,  ipred_cfl_left_tbl
4296*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #26
4297*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  #26
4298*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x10, w9, uxtw #2]
4299*c0909341SAndroid Build Coastguard Worker        ldrsw           x8,  [x7,  w8, uxtw #2]
4300*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   w6   // alpha
        // x9 = splat loop for this width, x7 = dc routine for this height.
4301*c0909341SAndroid Build Coastguard Worker        add             x9,  x10, x9
4302*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  x8
        // x6 = dst + stride (second row), x1 = 2 * stride: the splat loops
        // write alternate rows through x0 and x6.
4303*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1
4304*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
4305*c0909341SAndroid Build Coastguard Worker        br              x7
4306*c0909341SAndroid Build Coastguard Worker
4307*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_left_h4):
4308*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // ld1r replicates the 4 loaded bytes into both 32-bit lanes, so uaddlv
        // returns 2 * sum and the rounding shift by 3 yields (sum + 2) >> 2.
4309*c0909341SAndroid Build Coastguard Worker        ld1r            {v0.2s},  [x2]
4310*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.8b
4311*c0909341SAndroid Build Coastguard Worker        urshr           v0.4h,   v0.4h,   #3
4312*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
4313*c0909341SAndroid Build Coastguard Worker        br              x9
4314*c0909341SAndroid Build Coastguard Worker
4315*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_left_h8):
4316*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // dc = (sum of 8 pixels + 4) >> 3
4317*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b},  [x2]
4318*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.8b
4319*c0909341SAndroid Build Coastguard Worker        urshr           v0.4h,   v0.4h,   #3
4320*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
4321*c0909341SAndroid Build Coastguard Worker        br              x9
4322*c0909341SAndroid Build Coastguard Worker
4323*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_left_h16):
4324*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // dc = (sum of 16 pixels + 8) >> 4
4325*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b}, [x2]
4326*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.16b
4327*c0909341SAndroid Build Coastguard Worker        urshr           v0.4h,   v0.4h,   #4
4328*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
4329*c0909341SAndroid Build Coastguard Worker        br              x9
4330*c0909341SAndroid Build Coastguard Worker
4331*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_left_h32):
4332*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // dc = (sum of 32 pixels + 16) >> 5; the two 16-byte halves are summed
        // separately and then added.
4333*c0909341SAndroid Build Coastguard Worker        ld1             {v2.16b, v3.16b}, [x2]
4334*c0909341SAndroid Build Coastguard Worker        uaddlv          h2,      v2.16b
4335*c0909341SAndroid Build Coastguard Worker        uaddlv          h3,      v3.16b
4336*c0909341SAndroid Build Coastguard Worker        add             v2.4h,   v2.4h,   v3.4h
4337*c0909341SAndroid Build Coastguard Worker        urshr           v2.4h,   v2.4h,   #5
4338*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v2.h[0]
4339*c0909341SAndroid Build Coastguard Worker        br              x9
4340*c0909341SAndroid Build Coastguard Workerendfunc
4341*c0909341SAndroid Build Coastguard Worker
// Height-indexed dispatch table for ipred_cfl_left_8bpc_neon:
// index = clz(height) - 26, so the entries are ordered 32, 16, 8, 4.
4342*c0909341SAndroid Build Coastguard Workerjumptable ipred_cfl_left_tbl
4343*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_left_h32) - ipred_cfl_left_tbl
4344*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_left_h16) - ipred_cfl_left_tbl
4345*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_left_h8)  - ipred_cfl_left_tbl
4346*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_left_h4)  - ipred_cfl_left_tbl
4347*c0909341SAndroid Build Coastguard Workerendjumptable
4348*c0909341SAndroid Build Coastguard Worker
4349*c0909341SAndroid Build Coastguard Worker// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
4350*c0909341SAndroid Build Coastguard Worker//                          const pixel *const topleft,
4351*c0909341SAndroid Build Coastguard Worker//                          const int width, const int height,
4352*c0909341SAndroid Build Coastguard Worker//                          const int16_t *ac, const int alpha);
//
// Computes dc from two pixel runs: `height` pixels starting at
// topleft - height and `width` pixels starting at topleft + 1:
//   dc = (sum + ((width + height) >> 1)) / (width + height)
// The division is a right shift by ctz(width + height). For non-square
// blocks the remaining factor of 3 or 5 is removed by a fixed-point multiply
// with 0x5556 (~2^16 / 3) or 0x3334 (~2^16 / 5); sqdmulh computes
// (2 * a * b) >> 16, which is why the constants are loaded halved.
//
// Control flow: jump to the L(ipred_cfl_hN) block selected by height (sums the
// first run into v0), which jumps through x9 to the L(ipred_cfl_wN) block
// selected by width (adds the second run, divides, broadcasts dc into v0.8h)
// and finally branches to the shared splat loop for that width.
// Arguments arrive as x0 = dst, x1 = stride, x2 = topleft, w3 = width,
// w4 = height, x5 = ac, w6 = alpha.
4353*c0909341SAndroid Build Coastguard Workerfunction ipred_cfl_8bpc_neon, export=1
        // x2 = topleft - height: first pixel of the first run.
4354*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  w4, uxtw
4355*c0909341SAndroid Build Coastguard Worker        add             w8,  w3,  w4             // width + height
4356*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   w6              // alpha
4357*c0909341SAndroid Build Coastguard Worker        clz             w9,  w3
        // alpha is already broadcast in v1, so w6 is reused as scratch here.
4358*c0909341SAndroid Build Coastguard Worker        clz             w6,  w4
4359*c0909341SAndroid Build Coastguard Worker        dup             v16.8h, w8               // width + height
4360*c0909341SAndroid Build Coastguard Worker        movrel          x7,  ipred_cfl_tbl
4361*c0909341SAndroid Build Coastguard Worker        rbit            w8,  w8                  // rbit(width + height)
4362*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #22            // 26 leading bits, minus table offset 4
4363*c0909341SAndroid Build Coastguard Worker        sub             w6,  w6,  #26
4364*c0909341SAndroid Build Coastguard Worker        clz             w8,  w8                  // ctz(width + height)
4365*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x7, w9, uxtw #2]
4366*c0909341SAndroid Build Coastguard Worker        ldrsw           x6,  [x7, w6, uxtw #2]
4367*c0909341SAndroid Build Coastguard Worker        neg             w8,  w8                  // -ctz(width + height)
        // x9 = width-selected second stage, x7 = height-selected first stage.
4368*c0909341SAndroid Build Coastguard Worker        add             x9,  x7,  x9
4369*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  x6
4370*c0909341SAndroid Build Coastguard Worker        ushr            v16.8h,  v16.8h,  #1     // (width + height) >> 1
4371*c0909341SAndroid Build Coastguard Worker        dup             v17.8h,  w8              // -ctz(width + height)
        // x6 = dst + stride (second row), x1 = 2 * stride: the splat loops
        // write alternate rows through x0 and x6.
4372*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1
4373*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
4374*c0909341SAndroid Build Coastguard Worker        br              x7
4375*c0909341SAndroid Build Coastguard Worker
4376*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_h4):
4377*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // The lane load leaves the other lanes of v0 untouched, so s[1] is
        // cleared to make the 8-byte uaddlv sum only the 4 loaded pixels.
        // Post-increment by 4 plus the add of 1 leaves x2 at topleft + 1.
4378*c0909341SAndroid Build Coastguard Worker        ld1             {v0.s}[0],  [x2], #4
4379*c0909341SAndroid Build Coastguard Worker        ins             v0.s[1], wzr
4380*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #1
4381*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.8b
4382*c0909341SAndroid Build Coastguard Worker        br              x9
4383*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_w4):
4384*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // v0 = first-run sum + ((w + h) >> 1) + second-run sum, then shifted
        // right by ctz(w + h) (ushl with a negative count shifts right).
4385*c0909341SAndroid Build Coastguard Worker        ld1             {v2.s}[0],  [x2]
4386*c0909341SAndroid Build Coastguard Worker        ins             v2.s[1], wzr
4387*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v16.4h
4388*c0909341SAndroid Build Coastguard Worker        uaddlv          h2,      v2.8b
4389*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #4
4390*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v2.4h
4391*c0909341SAndroid Build Coastguard Worker        ushl            v0.4h,   v0.4h,   v17.4h
        // Square block: w + h is a power of two, the shift finished the division.
4392*c0909341SAndroid Build Coastguard Worker        b.eq            1f
4393*c0909341SAndroid Build Coastguard Worker        // h = 8/16
        // w16 packs both reciprocals: low half ~1/5, high half ~1/3. LSR by
        // register uses the count modulo 32, so 2*h = 16 (h = 8, w + h = 12)
        // selects the high half and 2*h = 32 (h = 16, w + h = 20) shifts by 0
        // and keeps the low half. dup .4h only uses the low 16 bits of w16.
4394*c0909341SAndroid Build Coastguard Worker        mov             w16, #(0x3334/2)
4395*c0909341SAndroid Build Coastguard Worker        movk            w16, #(0x5556/2), lsl #16
4396*c0909341SAndroid Build Coastguard Worker        add             w17, w4,  w4  // w17 = 2*h = 16 or 32
4397*c0909341SAndroid Build Coastguard Worker        lsr             w16, w16, w17
4398*c0909341SAndroid Build Coastguard Worker        dup             v16.4h,  w16
4399*c0909341SAndroid Build Coastguard Worker        sqdmulh         v0.4h,   v0.4h,   v16.4h
4400*c0909341SAndroid Build Coastguard Worker1:
4401*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
4402*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_splat_w4)
4403*c0909341SAndroid Build Coastguard Worker
4404*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_h8):
4405*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // Sum 8 pixels; x2 ends up at topleft + 1.
4406*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b},  [x2], #8
4407*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.8b
4408*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #1
4409*c0909341SAndroid Build Coastguard Worker        br              x9
4410*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_w8):
4411*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4412*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8b},  [x2]
4413*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v16.4h
4414*c0909341SAndroid Build Coastguard Worker        uaddlv          h2,      v2.8b
4415*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #8
4416*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v2.4h
4417*c0909341SAndroid Build Coastguard Worker        ushl            v0.4h,   v0.4h,   v17.4h
4418*c0909341SAndroid Build Coastguard Worker        b.eq            1f
4419*c0909341SAndroid Build Coastguard Worker        // h = 4/16/32
        // h = 32 gives w + h = 40 = 8 * 5, so use ~1/5; h = 4 (12) and
        // h = 16 (24) leave a factor of 3, so use ~1/3.
4420*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #32
4421*c0909341SAndroid Build Coastguard Worker        mov             w16, #(0x3334/2)
4422*c0909341SAndroid Build Coastguard Worker        mov             w17, #(0x5556/2)
4423*c0909341SAndroid Build Coastguard Worker        csel            w16, w16, w17, eq
4424*c0909341SAndroid Build Coastguard Worker        dup             v16.4h,  w16
4425*c0909341SAndroid Build Coastguard Worker        sqdmulh         v0.4h,   v0.4h,   v16.4h
4426*c0909341SAndroid Build Coastguard Worker1:
4427*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
4428*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_splat_w8)
4429*c0909341SAndroid Build Coastguard Worker
4430*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_h16):
4431*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // Sum 16 pixels; x2 ends up at topleft + 1.
4432*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b}, [x2], #16
4433*c0909341SAndroid Build Coastguard Worker        uaddlv          h0,      v0.16b
4434*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #1
4435*c0909341SAndroid Build Coastguard Worker        br              x9
4436*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_w16):
4437*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4438*c0909341SAndroid Build Coastguard Worker        ld1             {v2.16b}, [x2]
4439*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v16.4h
4440*c0909341SAndroid Build Coastguard Worker        uaddlv          h2,      v2.16b
4441*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #16
4442*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v2.4h
4443*c0909341SAndroid Build Coastguard Worker        ushl            v0.4h,   v0.4h,   v17.4h
4444*c0909341SAndroid Build Coastguard Worker        b.eq            1f
4445*c0909341SAndroid Build Coastguard Worker        // h = 4/8/32
        // h = 4 gives w + h = 20 = 4 * 5, so use ~1/5; h = 8 (24) and
        // h = 32 (48) leave a factor of 3, so use ~1/3.
4446*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #4
4447*c0909341SAndroid Build Coastguard Worker        mov             w16, #(0x3334/2)
4448*c0909341SAndroid Build Coastguard Worker        mov             w17, #(0x5556/2)
4449*c0909341SAndroid Build Coastguard Worker        csel            w16, w16, w17, eq
4450*c0909341SAndroid Build Coastguard Worker        dup             v16.4h,  w16
4451*c0909341SAndroid Build Coastguard Worker        sqdmulh         v0.4h,   v0.4h,   v16.4h
4452*c0909341SAndroid Build Coastguard Worker1:
4453*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
4454*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_splat_w16)
4455*c0909341SAndroid Build Coastguard Worker
4456*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_h32):
4457*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // Sum 32 pixels as two 16-byte halves into v0; x2 ends up at topleft + 1.
4458*c0909341SAndroid Build Coastguard Worker        ld1             {v2.16b, v3.16b}, [x2], #32
4459*c0909341SAndroid Build Coastguard Worker        uaddlv          h2,      v2.16b
4460*c0909341SAndroid Build Coastguard Worker        uaddlv          h3,      v3.16b
4461*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #1
4462*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v2.4h,   v3.4h
4463*c0909341SAndroid Build Coastguard Worker        br              x9
4464*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_w32):
4465*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4466*c0909341SAndroid Build Coastguard Worker        ld1             {v2.16b, v3.16b}, [x2]
4467*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v16.4h
4468*c0909341SAndroid Build Coastguard Worker        uaddlv          h2,      v2.16b
4469*c0909341SAndroid Build Coastguard Worker        uaddlv          h3,      v3.16b
4470*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #32
4471*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v2.4h
4472*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v3.4h
4473*c0909341SAndroid Build Coastguard Worker        ushl            v0.4h,   v0.4h,   v17.4h
4474*c0909341SAndroid Build Coastguard Worker        b.eq            1f
4475*c0909341SAndroid Build Coastguard Worker        // h = 8/16
        // Same packing trick as in the w4 case but with the halves swapped:
        // low half ~1/3, high half ~1/5. 2*h = 16 (h = 8, w + h = 40) selects
        // the high half; 2*h = 32 (h = 16, w + h = 48) shifts by 0 (count
        // modulo 32) and keeps the low half.
4476*c0909341SAndroid Build Coastguard Worker        mov             w16, #(0x5556/2)
4477*c0909341SAndroid Build Coastguard Worker        movk            w16, #(0x3334/2), lsl #16
4478*c0909341SAndroid Build Coastguard Worker        add             w17, w4,  w4  // w17 = 2*h = 16 or 32
4479*c0909341SAndroid Build Coastguard Worker        lsr             w16, w16, w17
4480*c0909341SAndroid Build Coastguard Worker        dup             v16.4h,  w16
4481*c0909341SAndroid Build Coastguard Worker        sqdmulh         v0.4h,   v0.4h,   v16.4h
4482*c0909341SAndroid Build Coastguard Worker1:
4483*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
4484*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_splat_w16)
4485*c0909341SAndroid Build Coastguard Workerendfunc
4486*c0909341SAndroid Build Coastguard Worker
// Dispatch table for ipred_cfl_8bpc_neon. Entries 0-3 are selected by height
// (index = clz(height) - 26) and entries 4-7 by width
// (index = clz(width) - 22); each group is ordered 32, 16, 8, 4.
4487*c0909341SAndroid Build Coastguard Workerjumptable ipred_cfl_tbl
4488*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_h32) - ipred_cfl_tbl
4489*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_h16) - ipred_cfl_tbl
4490*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_h8)  - ipred_cfl_tbl
4491*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_h4)  - ipred_cfl_tbl
4492*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_w32) - ipred_cfl_tbl
4493*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_w16) - ipred_cfl_tbl
4494*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_w8)  - ipred_cfl_tbl
4495*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_w4)  - ipred_cfl_tbl
4496*c0909341SAndroid Build Coastguard Workerendjumptable
4497*c0909341SAndroid Build Coastguard Worker
4498*c0909341SAndroid Build Coastguard Worker// void ipred_cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx,
4499*c0909341SAndroid Build Coastguard Worker//                                 const ptrdiff_t stride, const int w_pad,
4500*c0909341SAndroid Build Coastguard Worker//                                 const int h_pad, const int cw, const int ch);
4501*c0909341SAndroid Build Coastguard Workerfunction ipred_cfl_ac_420_8bpc_neon, export=1
4502*c0909341SAndroid Build Coastguard Worker        clz             w8,  w5
4503*c0909341SAndroid Build Coastguard Worker        lsl             w4,  w4,  #2
4504*c0909341SAndroid Build Coastguard Worker        movrel          x7,  ipred_cfl_ac_420_tbl
4505*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  #27
4506*c0909341SAndroid Build Coastguard Worker        ldrsw           x8,  [x7, w8, uxtw #2]
4507*c0909341SAndroid Build Coastguard Worker        movi            v16.8h,  #0
4508*c0909341SAndroid Build Coastguard Worker        movi            v17.8h,  #0
4509*c0909341SAndroid Build Coastguard Worker        movi            v18.8h,  #0
4510*c0909341SAndroid Build Coastguard Worker        movi            v19.8h,  #0
4511*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  x8
4512*c0909341SAndroid Build Coastguard Worker        sub             w8,  w6,  w4         // height - h_pad
4513*c0909341SAndroid Build Coastguard Worker        rbit            w9,  w5              // rbit(width)
4514*c0909341SAndroid Build Coastguard Worker        rbit            w10, w6              // rbit(height)
4515*c0909341SAndroid Build Coastguard Worker        clz             w9,  w9              // ctz(width)
4516*c0909341SAndroid Build Coastguard Worker        clz             w10, w10             // ctz(height)
4517*c0909341SAndroid Build Coastguard Worker        add             w9,  w9,  w10        // log2sz
4518*c0909341SAndroid Build Coastguard Worker        add             x10, x1,  x2
4519*c0909341SAndroid Build Coastguard Worker        dup             v31.4s,  w9
4520*c0909341SAndroid Build Coastguard Worker        lsl             x2,  x2,  #1
4521*c0909341SAndroid Build Coastguard Worker        neg             v31.4s,  v31.4s      // -log2sz
4522*c0909341SAndroid Build Coastguard Worker        br              x7
4523*c0909341SAndroid Build Coastguard Worker
4524*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w4):
4525*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4526*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input
4527*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b},   [x1],  x2
4528*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8b},   [x10], x2
4529*c0909341SAndroid Build Coastguard Worker        ld1             {v0.d}[1], [x1],  x2
4530*c0909341SAndroid Build Coastguard Worker        ld1             {v1.d}[1], [x10], x2
4531*c0909341SAndroid Build Coastguard Worker        uaddlp          v0.8h,   v0.16b
4532*c0909341SAndroid Build Coastguard Worker        uaddlp          v1.8h,   v1.16b
4533*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v1.8h
4534*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #1
4535*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
4536*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h}, [x0], #16
4537*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
4538*c0909341SAndroid Build Coastguard Worker        b.gt            1b
4539*c0909341SAndroid Build Coastguard Worker        trn2            v1.2d,   v0.2d,   v0.2d
4540*c0909341SAndroid Build Coastguard Worker        trn2            v0.2d,   v0.2d,   v0.2d
4541*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w4_hpad):
4542*c0909341SAndroid Build Coastguard Worker        cbz             w4,  3f
4543*c0909341SAndroid Build Coastguard Worker2:      // Vertical padding (h_pad > 0)
4544*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
4545*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], #32
4546*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
4547*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
4548*c0909341SAndroid Build Coastguard Worker        b.gt            2b
4549*c0909341SAndroid Build Coastguard Worker3:
4550*c0909341SAndroid Build Coastguard Worker        // Aggregate the sums
4551*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v16.8h,  v17.8h
4552*c0909341SAndroid Build Coastguard Worker        uaddlv          s0,  v0.8h                // sum
4553*c0909341SAndroid Build Coastguard Worker        sub             x0,  x0,  w6, uxtw #3
4554*c0909341SAndroid Build Coastguard Worker        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
4555*c0909341SAndroid Build Coastguard Worker        dup             v4.8h,   v4.h[0]
4556*c0909341SAndroid Build Coastguard Worker6:      // Subtract dc from ac
4557*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x0]
4558*c0909341SAndroid Build Coastguard Worker        subs            w6,  w6,  #4
4559*c0909341SAndroid Build Coastguard Worker        sub             v0.8h,   v0.8h,   v4.8h
4560*c0909341SAndroid Build Coastguard Worker        sub             v1.8h,   v1.8h,   v4.8h
4561*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], #32
4562*c0909341SAndroid Build Coastguard Worker        b.gt            6b
4563*c0909341SAndroid Build Coastguard Worker        ret
4564*c0909341SAndroid Build Coastguard Worker
4565*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w8):
4566*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4567*c0909341SAndroid Build Coastguard Worker        cbnz            w3,  L(ipred_cfl_ac_420_w8_wpad)
4568*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, without padding
4569*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b}, [x1],  x2
4570*c0909341SAndroid Build Coastguard Worker        ld1             {v1.16b}, [x10], x2
4571*c0909341SAndroid Build Coastguard Worker        ld1             {v2.16b}, [x1],  x2
4572*c0909341SAndroid Build Coastguard Worker        uaddlp          v0.8h,   v0.16b
4573*c0909341SAndroid Build Coastguard Worker        ld1             {v3.16b}, [x10], x2
4574*c0909341SAndroid Build Coastguard Worker        uaddlp          v1.8h,   v1.16b
4575*c0909341SAndroid Build Coastguard Worker        uaddlp          v2.8h,   v2.16b
4576*c0909341SAndroid Build Coastguard Worker        uaddlp          v3.8h,   v3.16b
4577*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v1.8h
4578*c0909341SAndroid Build Coastguard Worker        add             v2.8h,   v2.8h,   v3.8h
4579*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #1
4580*c0909341SAndroid Build Coastguard Worker        shl             v1.8h,   v2.8h,   #1
4581*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
4582*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], #32
4583*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
4584*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
4585*c0909341SAndroid Build Coastguard Worker        b.gt            1b
4586*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v1.16b
4587*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w8_hpad)
4588*c0909341SAndroid Build Coastguard Worker
4589*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w8_wpad):
4590*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, padding 4
4591*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b},   [x1],  x2
4592*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8b},   [x10], x2
4593*c0909341SAndroid Build Coastguard Worker        ld1             {v0.d}[1], [x1],  x2
4594*c0909341SAndroid Build Coastguard Worker        ld1             {v1.d}[1], [x10], x2
4595*c0909341SAndroid Build Coastguard Worker        uaddlp          v0.8h,   v0.16b
4596*c0909341SAndroid Build Coastguard Worker        uaddlp          v1.8h,   v1.16b
4597*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v1.8h
4598*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #1
4599*c0909341SAndroid Build Coastguard Worker        dup             v1.4h,   v0.h[3]
4600*c0909341SAndroid Build Coastguard Worker        dup             v3.4h,   v0.h[7]
4601*c0909341SAndroid Build Coastguard Worker        trn2            v2.2d,   v0.2d,   v0.2d
4602*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
4603*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
4604*c0909341SAndroid Build Coastguard Worker        add             v16.4h,  v16.4h,  v0.4h
4605*c0909341SAndroid Build Coastguard Worker        add             v17.4h,  v17.4h,  v1.4h
4606*c0909341SAndroid Build Coastguard Worker        add             v18.4h,  v18.4h,  v2.4h
4607*c0909341SAndroid Build Coastguard Worker        add             v19.4h,  v19.4h,  v3.4h
4608*c0909341SAndroid Build Coastguard Worker        b.gt            1b
4609*c0909341SAndroid Build Coastguard Worker        trn1            v0.2d,   v2.2d,   v3.2d
4610*c0909341SAndroid Build Coastguard Worker        trn1            v1.2d,   v2.2d,   v3.2d
4611*c0909341SAndroid Build Coastguard Worker
4612*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w8_hpad):
4613*c0909341SAndroid Build Coastguard Worker        cbz             w4,  3f
4614*c0909341SAndroid Build Coastguard Worker2:      // Vertical padding (h_pad > 0)
4615*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
4616*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], #32
4617*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
4618*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
4619*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], #32
4620*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v0.8h
4621*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v1.8h
4622*c0909341SAndroid Build Coastguard Worker        b.gt            2b
4623*c0909341SAndroid Build Coastguard Worker3:
4624*c0909341SAndroid Build Coastguard Worker
4625*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w8_calc_subtract_dc):
4626*c0909341SAndroid Build Coastguard Worker        // Aggregate the sums
4627*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v16.8h,  v17.8h
4628*c0909341SAndroid Build Coastguard Worker        add             v2.8h,   v18.8h,  v19.8h
4629*c0909341SAndroid Build Coastguard Worker        uaddlp          v0.4s,   v0.8h
4630*c0909341SAndroid Build Coastguard Worker        uaddlp          v2.4s,   v2.8h
4631*c0909341SAndroid Build Coastguard Worker        add             v0.4s,   v0.4s,   v2.4s
4632*c0909341SAndroid Build Coastguard Worker        addv            s0,  v0.4s                // sum
4633*c0909341SAndroid Build Coastguard Worker        sub             x0,  x0,  w6, uxtw #4
4634*c0909341SAndroid Build Coastguard Worker        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
4635*c0909341SAndroid Build Coastguard Worker        dup             v4.8h,   v4.h[0]
4636*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w8_subtract_dc):
4637*c0909341SAndroid Build Coastguard Worker6:      // Subtract dc from ac
4638*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
4639*c0909341SAndroid Build Coastguard Worker        subs            w6,  w6,  #4
4640*c0909341SAndroid Build Coastguard Worker        sub             v0.8h,   v0.8h,   v4.8h
4641*c0909341SAndroid Build Coastguard Worker        sub             v1.8h,   v1.8h,   v4.8h
4642*c0909341SAndroid Build Coastguard Worker        sub             v2.8h,   v2.8h,   v4.8h
4643*c0909341SAndroid Build Coastguard Worker        sub             v3.8h,   v3.8h,   v4.8h
4644*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4645*c0909341SAndroid Build Coastguard Worker        b.gt            6b
4646*c0909341SAndroid Build Coastguard Worker        ret
4647*c0909341SAndroid Build Coastguard Worker
4648*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w16):
4649*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4650*c0909341SAndroid Build Coastguard Worker        movrel          x7,  ipred_cfl_ac_420_w16_tbl
4651*c0909341SAndroid Build Coastguard Worker        ldrsw           x3,  [x7, w3, uxtw #2]
4652*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  x3
4653*c0909341SAndroid Build Coastguard Worker        br              x7
4654*c0909341SAndroid Build Coastguard Worker
4655*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w16_wpad0):
4656*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4657*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, without padding
4658*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b}, [x1],  x2
4659*c0909341SAndroid Build Coastguard Worker        ld1             {v2.16b, v3.16b}, [x10], x2
4660*c0909341SAndroid Build Coastguard Worker        uaddlp          v0.8h,   v0.16b
4661*c0909341SAndroid Build Coastguard Worker        ld1             {v4.16b, v5.16b}, [x1],  x2
4662*c0909341SAndroid Build Coastguard Worker        uaddlp          v1.8h,   v1.16b
4663*c0909341SAndroid Build Coastguard Worker        ld1             {v6.16b, v7.16b}, [x10], x2
4664*c0909341SAndroid Build Coastguard Worker        uaddlp          v2.8h,   v2.16b
4665*c0909341SAndroid Build Coastguard Worker        uaddlp          v3.8h,   v3.16b
4666*c0909341SAndroid Build Coastguard Worker        uaddlp          v4.8h,   v4.16b
4667*c0909341SAndroid Build Coastguard Worker        uaddlp          v5.8h,   v5.16b
4668*c0909341SAndroid Build Coastguard Worker        uaddlp          v6.8h,   v6.16b
4669*c0909341SAndroid Build Coastguard Worker        uaddlp          v7.8h,   v7.16b
4670*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v2.8h
4671*c0909341SAndroid Build Coastguard Worker        add             v1.8h,   v1.8h,   v3.8h
4672*c0909341SAndroid Build Coastguard Worker        add             v4.8h,   v4.8h,   v6.8h
4673*c0909341SAndroid Build Coastguard Worker        add             v5.8h,   v5.8h,   v7.8h
4674*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #1
4675*c0909341SAndroid Build Coastguard Worker        shl             v1.8h,   v1.8h,   #1
4676*c0909341SAndroid Build Coastguard Worker        shl             v2.8h,   v4.8h,   #1
4677*c0909341SAndroid Build Coastguard Worker        shl             v3.8h,   v5.8h,   #1
4678*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
4679*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4680*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
4681*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
4682*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v2.8h
4683*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v3.8h
4684*c0909341SAndroid Build Coastguard Worker        b.gt            1b
4685*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v2.16b
4686*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
4687*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w16_hpad)
4688*c0909341SAndroid Build Coastguard Worker
4689*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w16_wpad1):
4690*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4691*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, padding 4
4692*c0909341SAndroid Build Coastguard Worker        ldr             d1,  [x1,  #16]
4693*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b}, [x1],  x2
4694*c0909341SAndroid Build Coastguard Worker        ldr             d3,  [x10, #16]
4695*c0909341SAndroid Build Coastguard Worker        ld1             {v2.16b}, [x10], x2
4696*c0909341SAndroid Build Coastguard Worker        uaddlp          v1.4h,   v1.8b
4697*c0909341SAndroid Build Coastguard Worker        ldr             d5,  [x1,  #16]
4698*c0909341SAndroid Build Coastguard Worker        uaddlp          v0.8h,   v0.16b
4699*c0909341SAndroid Build Coastguard Worker        ld1             {v4.16b}, [x1],  x2
4700*c0909341SAndroid Build Coastguard Worker        uaddlp          v3.4h,   v3.8b
4701*c0909341SAndroid Build Coastguard Worker        ldr             d7,  [x10, #16]
4702*c0909341SAndroid Build Coastguard Worker        uaddlp          v2.8h,   v2.16b
4703*c0909341SAndroid Build Coastguard Worker        ld1             {v6.16b}, [x10], x2
4704*c0909341SAndroid Build Coastguard Worker        uaddlp          v5.4h,   v5.8b
4705*c0909341SAndroid Build Coastguard Worker        uaddlp          v4.8h,   v4.16b
4706*c0909341SAndroid Build Coastguard Worker        uaddlp          v7.4h,   v7.8b
4707*c0909341SAndroid Build Coastguard Worker        uaddlp          v6.8h,   v6.16b
4708*c0909341SAndroid Build Coastguard Worker        add             v1.4h,   v1.4h,   v3.4h
4709*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v2.8h
4710*c0909341SAndroid Build Coastguard Worker        add             v5.4h,   v5.4h,   v7.4h
4711*c0909341SAndroid Build Coastguard Worker        add             v4.8h,   v4.8h,   v6.8h
4712*c0909341SAndroid Build Coastguard Worker        shl             v1.4h,   v1.4h,   #1
4713*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #1
4714*c0909341SAndroid Build Coastguard Worker        shl             v3.4h,   v5.4h,   #1
4715*c0909341SAndroid Build Coastguard Worker        shl             v2.8h,   v4.8h,   #1
4716*c0909341SAndroid Build Coastguard Worker        dup             v4.4h,   v1.h[3]
4717*c0909341SAndroid Build Coastguard Worker        dup             v5.4h,   v3.h[3]
4718*c0909341SAndroid Build Coastguard Worker        trn1            v1.2d,   v1.2d,   v4.2d
4719*c0909341SAndroid Build Coastguard Worker        trn1            v3.2d,   v3.2d,   v5.2d
4720*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
4721*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4722*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
4723*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
4724*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v2.8h
4725*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v3.8h
4726*c0909341SAndroid Build Coastguard Worker        b.gt            1b
4727*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v2.16b
4728*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
4729*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w16_hpad)
4730*c0909341SAndroid Build Coastguard Worker
4731*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w16_wpad2):
4732*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4733*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, padding 8
4734*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b}, [x1],  x2
4735*c0909341SAndroid Build Coastguard Worker        ld1             {v2.16b}, [x10], x2
4736*c0909341SAndroid Build Coastguard Worker        ld1             {v4.16b}, [x1],  x2
4737*c0909341SAndroid Build Coastguard Worker        uaddlp          v0.8h,   v0.16b
4738*c0909341SAndroid Build Coastguard Worker        ld1             {v6.16b}, [x10], x2
4739*c0909341SAndroid Build Coastguard Worker        uaddlp          v2.8h,   v2.16b
4740*c0909341SAndroid Build Coastguard Worker        uaddlp          v4.8h,   v4.16b
4741*c0909341SAndroid Build Coastguard Worker        uaddlp          v6.8h,   v6.16b
4742*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v2.8h
4743*c0909341SAndroid Build Coastguard Worker        add             v4.8h,   v4.8h,   v6.8h
4744*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #1
4745*c0909341SAndroid Build Coastguard Worker        shl             v2.8h,   v4.8h,   #1
4746*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   v0.h[7]
4747*c0909341SAndroid Build Coastguard Worker        dup             v3.8h,   v2.h[7]
4748*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
4749*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4750*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
4751*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
4752*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v2.8h
4753*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v3.8h
4754*c0909341SAndroid Build Coastguard Worker        b.gt            1b
4755*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v2.16b
4756*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
4757*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w16_hpad)
4758*c0909341SAndroid Build Coastguard Worker
4759*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w16_wpad3):
4760*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4761*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, padding 12
4762*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b}, [x1],  x2
4763*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8b}, [x10], x2
4764*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8b}, [x1],  x2
4765*c0909341SAndroid Build Coastguard Worker        uaddlp          v0.4h,   v0.8b
4766*c0909341SAndroid Build Coastguard Worker        ld1             {v6.8b}, [x10], x2
4767*c0909341SAndroid Build Coastguard Worker        uaddlp          v2.4h,   v2.8b
4768*c0909341SAndroid Build Coastguard Worker        uaddlp          v4.4h,   v4.8b
4769*c0909341SAndroid Build Coastguard Worker        uaddlp          v6.4h,   v6.8b
4770*c0909341SAndroid Build Coastguard Worker        add             v0.4h,   v0.4h,   v2.4h
4771*c0909341SAndroid Build Coastguard Worker        add             v4.4h,   v4.4h,   v6.4h
4772*c0909341SAndroid Build Coastguard Worker        shl             v0.4h,   v0.4h,   #1
4773*c0909341SAndroid Build Coastguard Worker        shl             v2.4h,   v4.4h,   #1
4774*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   v0.h[3]
4775*c0909341SAndroid Build Coastguard Worker        dup             v3.8h,   v2.h[3]
4776*c0909341SAndroid Build Coastguard Worker        trn1            v0.2d,   v0.2d,   v1.2d
4777*c0909341SAndroid Build Coastguard Worker        trn1            v2.2d,   v2.2d,   v3.2d
4778*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
4779*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4780*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
4781*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
4782*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v2.8h
4783*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v3.8h
4784*c0909341SAndroid Build Coastguard Worker        b.gt            1b
4785*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v2.16b
4786*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
4787*c0909341SAndroid Build Coastguard Worker
4788*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w16_hpad):
4789*c0909341SAndroid Build Coastguard Worker        cbz             w4,  3f
4790*c0909341SAndroid Build Coastguard Worker2:      // Vertical padding (h_pad > 0)
4791*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
4792*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4793*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
4794*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
4795*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v2.8h
4796*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v3.8h
4797*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4798*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
4799*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
4800*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v2.8h
4801*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v3.8h
4802*c0909341SAndroid Build Coastguard Worker        b.gt            2b
4803*c0909341SAndroid Build Coastguard Worker3:
4804*c0909341SAndroid Build Coastguard Worker
4805*c0909341SAndroid Build Coastguard Worker        // Double the height and reuse the w8 summing/subtracting
4806*c0909341SAndroid Build Coastguard Worker        lsl             w6,  w6,  #1
4807*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w8_calc_subtract_dc)
4808*c0909341SAndroid Build Coastguard Workerendfunc
4809*c0909341SAndroid Build Coastguard Worker
// Jump table for the 4:2:0 cfl_ac code above: one entry per block-width
// handler, in the order w16, w8, w4. Each entry is the 32-bit offset of the
// handler label from the start of this table.
// NOTE(review): the code that indexes this table is outside the visible
// chunk; presumably it uses clz(width) - 27 like the 4:2:2 prologue does for
// its own table, which would give index 0 = w16, 1 = w8, 2 = w4 -- confirm
// before reordering or adding entries.
4810*c0909341SAndroid Build Coastguard Workerjumptable ipred_cfl_ac_420_tbl
4811*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_420_w16) - ipred_cfl_ac_420_tbl
4812*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_420_w8)  - ipred_cfl_ac_420_tbl
4813*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_420_w4)  - ipred_cfl_ac_420_tbl
4814*c0909341SAndroid Build Coastguard Workerendjumptable
4815*c0909341SAndroid Build Coastguard Worker
// Secondary jump table for the 16-wide 4:2:0 path. L(ipred_cfl_ac_420_w16)
// loads the sign-extended 32-bit entry at index w3 (ldrsw with uxtw #2, i.e.
// a 4-byte stride), adds it to this table's address and branches there, so
// each entry is the offset of a handler from the start of the table.
// The per-entry notes below repeat the handlers' own loop descriptions.
// NOTE(review): w3 is presumably the w_pad argument -- confirm against the
// 4:2:0 function prototype, which is outside the visible chunk.
4816*c0909341SAndroid Build Coastguard Workerjumptable ipred_cfl_ac_420_w16_tbl
4817*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_420_w16_wpad0) - ipred_cfl_ac_420_w16_tbl  // w3 == 0: without padding
4818*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_420_w16_wpad1) - ipred_cfl_ac_420_w16_tbl  // w3 == 1: padding 4
4819*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_420_w16_wpad2) - ipred_cfl_ac_420_w16_tbl  // w3 == 2: padding 8
4820*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_420_w16_wpad3) - ipred_cfl_ac_420_w16_tbl  // w3 == 3: padding 12
4821*c0909341SAndroid Build Coastguard Workerendjumptable
4822*c0909341SAndroid Build Coastguard Worker
4823*c0909341SAndroid Build Coastguard Worker// void ipred_cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx,
4824*c0909341SAndroid Build Coastguard Worker//                                 const ptrdiff_t stride, const int w_pad,
4825*c0909341SAndroid Build Coastguard Worker//                                 const int h_pad, const int cw, const int ch);
4826*c0909341SAndroid Build Coastguard Workerfunction ipred_cfl_ac_422_8bpc_neon, export=1
4827*c0909341SAndroid Build Coastguard Worker        clz             w8,  w5
4828*c0909341SAndroid Build Coastguard Worker        lsl             w4,  w4,  #2
4829*c0909341SAndroid Build Coastguard Worker        movrel          x7,  ipred_cfl_ac_422_tbl
4830*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  #27
4831*c0909341SAndroid Build Coastguard Worker        ldrsw           x8,  [x7, w8, uxtw #2]
4832*c0909341SAndroid Build Coastguard Worker        movi            v16.8h,  #0
4833*c0909341SAndroid Build Coastguard Worker        movi            v17.8h,  #0
4834*c0909341SAndroid Build Coastguard Worker        movi            v18.8h,  #0
4835*c0909341SAndroid Build Coastguard Worker        movi            v19.8h,  #0
4836*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  x8
4837*c0909341SAndroid Build Coastguard Worker        sub             w8,  w6,  w4         // height - h_pad
4838*c0909341SAndroid Build Coastguard Worker        rbit            w9,  w5              // rbit(width)
4839*c0909341SAndroid Build Coastguard Worker        rbit            w10, w6              // rbit(height)
4840*c0909341SAndroid Build Coastguard Worker        clz             w9,  w9              // ctz(width)
4841*c0909341SAndroid Build Coastguard Worker        clz             w10, w10             // ctz(height)
4842*c0909341SAndroid Build Coastguard Worker        add             w9,  w9,  w10        // log2sz
4843*c0909341SAndroid Build Coastguard Worker        add             x10, x1,  x2
4844*c0909341SAndroid Build Coastguard Worker        dup             v31.4s,  w9
4845*c0909341SAndroid Build Coastguard Worker        lsl             x2,  x2,  #1
4846*c0909341SAndroid Build Coastguard Worker        neg             v31.4s,  v31.4s      // -log2sz
4847*c0909341SAndroid Build Coastguard Worker        br              x7
4848*c0909341SAndroid Build Coastguard Worker
4849*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_422_w4):
4850*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4851*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input
4852*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b},   [x1],  x2
4853*c0909341SAndroid Build Coastguard Worker        ld1             {v0.d}[1], [x10], x2
4854*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8b},   [x1],  x2
4855*c0909341SAndroid Build Coastguard Worker        ld1             {v1.d}[1], [x10], x2
4856*c0909341SAndroid Build Coastguard Worker        uaddlp          v0.8h,   v0.16b
4857*c0909341SAndroid Build Coastguard Worker        uaddlp          v1.8h,   v1.16b
4858*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #2
4859*c0909341SAndroid Build Coastguard Worker        shl             v1.8h,   v1.8h,   #2
4860*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #4
4861*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
4862*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
4863*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], #32
4864*c0909341SAndroid Build Coastguard Worker        b.gt            1b
4865*c0909341SAndroid Build Coastguard Worker        trn2            v0.2d,   v1.2d,   v1.2d
4866*c0909341SAndroid Build Coastguard Worker        trn2            v1.2d,   v1.2d,   v1.2d
4867*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w4_hpad)
4868*c0909341SAndroid Build Coastguard Worker
4869*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_422_w8):
4870*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4871*c0909341SAndroid Build Coastguard Worker        cbnz            w3,  L(ipred_cfl_ac_422_w8_wpad)
4872*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, without padding
4873*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b}, [x1],  x2
4874*c0909341SAndroid Build Coastguard Worker        ld1             {v1.16b}, [x10], x2
4875*c0909341SAndroid Build Coastguard Worker        ld1             {v2.16b}, [x1],  x2
4876*c0909341SAndroid Build Coastguard Worker        uaddlp          v0.8h,   v0.16b
4877*c0909341SAndroid Build Coastguard Worker        ld1             {v3.16b}, [x10], x2
4878*c0909341SAndroid Build Coastguard Worker        uaddlp          v1.8h,   v1.16b
4879*c0909341SAndroid Build Coastguard Worker        uaddlp          v2.8h,   v2.16b
4880*c0909341SAndroid Build Coastguard Worker        uaddlp          v3.8h,   v3.16b
4881*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #2
4882*c0909341SAndroid Build Coastguard Worker        shl             v1.8h,   v1.8h,   #2
4883*c0909341SAndroid Build Coastguard Worker        shl             v2.8h,   v2.8h,   #2
4884*c0909341SAndroid Build Coastguard Worker        shl             v3.8h,   v3.8h,   #2
4885*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #4
4886*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4887*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
4888*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
4889*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v2.8h
4890*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v3.8h
4891*c0909341SAndroid Build Coastguard Worker        b.gt            1b
4892*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v3.16b
4893*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
4894*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w8_hpad)
4895*c0909341SAndroid Build Coastguard Worker
4896*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_422_w8_wpad):
        // 4:2:2 path for 8 output columns with right-edge padding: only
        // 8 source bytes (= 4 output samples) are read per row, and the last
        // valid sample of each row is repeated into output columns 4-7.
        // NOTE(review): x1, x10, x2 and w8 are initialised in the function
        // prologue, which is outside this excerpt; presumably x10 = x1 + stride,
        // x2 = 2*stride and w8 = number of rows left to read - confirm.
4897*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, padding 4
        // Four rows per iteration, two rows packed per register:
        // v0 = row 0 (low half, via x1) | row 1 (high half, via x10),
        // v2 = row 2 | row 3.
4898*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b},   [x1],  x2
4899*c0909341SAndroid Build Coastguard Worker        ld1             {v0.d}[1], [x10], x2
4900*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8b},   [x1],  x2
4901*c0909341SAndroid Build Coastguard Worker        ld1             {v2.d}[1], [x10], x2
        // Add each pair of horizontally adjacent bytes into a 16-bit lane,
        // then scale the pair sum by 4.
4902*c0909341SAndroid Build Coastguard Worker        uaddlp          v0.8h,   v0.16b
4903*c0909341SAndroid Build Coastguard Worker        uaddlp          v2.8h,   v2.16b
4904*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #2
4905*c0909341SAndroid Build Coastguard Worker        shl             v2.8h,   v2.8h,   #2
        // Broadcast the last valid sample of each row (lane 3 for the row in
        // the low half, lane 7 for the row in the high half).
4906*c0909341SAndroid Build Coastguard Worker        dup             v4.4h,   v0.h[3]
4907*c0909341SAndroid Build Coastguard Worker        dup             v5.8h,   v0.h[7]
4908*c0909341SAndroid Build Coastguard Worker        dup             v6.4h,   v2.h[3]
4909*c0909341SAndroid Build Coastguard Worker        dup             v7.8h,   v2.h[7]
        // Unpack to one full 8-sample row per register (4 valid + 4 padding).
        // trn2 must run before trn1 because trn1 overwrites v0/v2 in place.
4910*c0909341SAndroid Build Coastguard Worker        trn2            v1.2d,   v0.2d,   v5.2d
4911*c0909341SAndroid Build Coastguard Worker        trn1            v0.2d,   v0.2d,   v4.2d
4912*c0909341SAndroid Build Coastguard Worker        trn2            v3.2d,   v2.2d,   v7.2d
4913*c0909341SAndroid Build Coastguard Worker        trn1            v2.2d,   v2.2d,   v6.2d
        // Row counter; the flags set here are consumed by b.gt below (the
        // st1/add instructions in between do not modify NZCV).
4914*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #4
        // Store 4 rows x 8 int16 and add them to the running sums v16-v19.
4915*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4916*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
4917*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
4918*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v2.8h
4919*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v3.8h
4920*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // Leave the last stored row (v3) in v0 and v1 as well before jumping
        // to the shared w8 vertical-padding code (defined outside this
        // excerpt; presumably it replicates v0-v3 for the h_pad rows).
4921*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v3.16b
4922*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
4923*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w8_hpad)
4924*c0909341SAndroid Build Coastguard Worker
4925*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_422_w16):
        // Second-level dispatch for the 16-column 4:2:2 case: w3 selects one
        // of the entries of ipred_cfl_ac_422_w16_tbl (wpad0..wpad3, in that
        // order). NOTE(review): w3 is presumably the w_pad argument - the
        // function prologue is outside this excerpt.
4926*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4927*c0909341SAndroid Build Coastguard Worker        movrel          x7,  ipred_cfl_ac_422_w16_tbl
        // Table entries are signed 32-bit offsets relative to the table base;
        // x3 is overwritten with the loaded offset.
4928*c0909341SAndroid Build Coastguard Worker        ldrsw           x3,  [x7, w3, uxtw #2]
4929*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  x3
4930*c0909341SAndroid Build Coastguard Worker        br              x7
4931*c0909341SAndroid Build Coastguard Worker
4932*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_422_w16_wpad0):
        // 4:2:2, 16 output columns, no horizontal padding: all 32 source
        // bytes of each row are read.
        // NOTE(review): x10/x2/w8 come from the function prologue (not in this
        // excerpt); presumably x10 = x1 + stride, x2 = 2*stride, w8 = rows
        // left to read - confirm.
4933*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4934*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, without padding
        // Two rows per iteration: v0/v1 = row via x1, v2/v3 = row via x10.
4935*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b, v1.16b}, [x1],  x2
4936*c0909341SAndroid Build Coastguard Worker        ld1             {v2.16b, v3.16b}, [x10], x2
        // Pairwise-add adjacent bytes into 16-bit lanes, then multiply by 4.
4937*c0909341SAndroid Build Coastguard Worker        uaddlp          v0.8h,   v0.16b
4938*c0909341SAndroid Build Coastguard Worker        uaddlp          v1.8h,   v1.16b
4939*c0909341SAndroid Build Coastguard Worker        uaddlp          v2.8h,   v2.16b
4940*c0909341SAndroid Build Coastguard Worker        uaddlp          v3.8h,   v3.16b
4941*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #2
4942*c0909341SAndroid Build Coastguard Worker        shl             v1.8h,   v1.8h,   #2
4943*c0909341SAndroid Build Coastguard Worker        shl             v2.8h,   v2.8h,   #2
4944*c0909341SAndroid Build Coastguard Worker        shl             v3.8h,   v3.8h,   #2
        // Row counter; flags are consumed by b.gt after the store/accumulate.
4945*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
        // Store 2 rows x 16 int16 and add them to the running sums v16-v19.
4946*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4947*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
4948*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
4949*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v2.8h
4950*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v3.8h
4951*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // Copy the last row (v2/v3) into v0/v1 so v0-v3 hold that row twice,
        // then jump to the shared w16 vertical-padding code (defined outside
        // this excerpt).
4952*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v2.16b
4953*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
4954*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w16_hpad)
4955*c0909341SAndroid Build Coastguard Worker
4956*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_422_w16_wpad1):
        // 4:2:2, 16 output columns, last 4 padded: 24 source bytes
        // (= 12 output samples) are read per row and the last valid sample is
        // repeated into columns 12-15.
        // NOTE(review): x10/x2/w8 come from the function prologue (not in this
        // excerpt); presumably x10 = x1 + stride, x2 = 2*stride, w8 = rows
        // left to read - confirm.
4957*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4958*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, padding 4
        // Bytes 16-23 are fetched with ldr first because the following ld1
        // post-increments the row pointer.
4959*c0909341SAndroid Build Coastguard Worker        ldr             d1,  [x1,  #16]
4960*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b}, [x1],  x2
4961*c0909341SAndroid Build Coastguard Worker        ldr             d3,  [x10, #16]
4962*c0909341SAndroid Build Coastguard Worker        ld1             {v2.16b}, [x10], x2
        // Pairwise-add adjacent bytes into 16-bit lanes, then multiply by 4.
        // v1/v3 only carry 4 valid lanes at this point.
4963*c0909341SAndroid Build Coastguard Worker        uaddlp          v1.4h,   v1.8b
4964*c0909341SAndroid Build Coastguard Worker        uaddlp          v0.8h,   v0.16b
4965*c0909341SAndroid Build Coastguard Worker        uaddlp          v3.4h,   v3.8b
4966*c0909341SAndroid Build Coastguard Worker        uaddlp          v2.8h,   v2.16b
4967*c0909341SAndroid Build Coastguard Worker        shl             v1.4h,   v1.4h,   #2
4968*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #2
4969*c0909341SAndroid Build Coastguard Worker        shl             v3.4h,   v3.4h,   #2
4970*c0909341SAndroid Build Coastguard Worker        shl             v2.8h,   v2.8h,   #2
        // Broadcast the last valid sample of each row and place it in the
        // upper half of v1/v3 (4 valid + 4 padding lanes).
4971*c0909341SAndroid Build Coastguard Worker        dup             v4.4h,   v1.h[3]
4972*c0909341SAndroid Build Coastguard Worker        dup             v5.4h,   v3.h[3]
4973*c0909341SAndroid Build Coastguard Worker        trn1            v1.2d,   v1.2d,   v4.2d
4974*c0909341SAndroid Build Coastguard Worker        trn1            v3.2d,   v3.2d,   v5.2d
        // Row counter; flags are consumed by b.gt after the store/accumulate.
4975*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
        // Store 2 rows x 16 int16 and add them to the running sums v16-v19.
4976*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4977*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
4978*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
4979*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v2.8h
4980*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v3.8h
4981*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // Copy the last row (v2/v3) into v0/v1 so v0-v3 hold that row twice,
        // then jump to the shared w16 vertical-padding code (defined outside
        // this excerpt).
4982*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v2.16b
4983*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
4984*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w16_hpad)
4985*c0909341SAndroid Build Coastguard Worker
4986*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_422_w16_wpad2):
        // 4:2:2, 16 output columns, last 8 padded: 16 source bytes
        // (= 8 output samples) are read per row and the last valid sample
        // fills columns 8-15.
        // NOTE(review): x10/x2/w8 come from the function prologue (not in this
        // excerpt); presumably x10 = x1 + stride, x2 = 2*stride, w8 = rows
        // left to read - confirm.
4987*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4988*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, padding 8
        // Two rows per iteration: v0 = row via x1, v2 = row via x10.
4989*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b}, [x1],  x2
4990*c0909341SAndroid Build Coastguard Worker        ld1             {v2.16b}, [x10], x2
        // Pairwise-add adjacent bytes into 16-bit lanes, then multiply by 4.
4991*c0909341SAndroid Build Coastguard Worker        uaddlp          v0.8h,   v0.16b
4992*c0909341SAndroid Build Coastguard Worker        uaddlp          v2.8h,   v2.16b
4993*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #2
4994*c0909341SAndroid Build Coastguard Worker        shl             v2.8h,   v2.8h,   #2
        // The right half of each output row is the last valid sample repeated.
4995*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   v0.h[7]
4996*c0909341SAndroid Build Coastguard Worker        dup             v3.8h,   v2.h[7]
        // Row counter; flags are consumed by b.gt after the store/accumulate.
4997*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
        // Store 2 rows x 16 int16 and add them to the running sums v16-v19.
4998*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
4999*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
5000*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
5001*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v2.8h
5002*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v3.8h
5003*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // Copy the last row (v2/v3) into v0/v1 so v0-v3 hold that row twice,
        // then jump to the shared w16 vertical-padding code (defined outside
        // this excerpt).
5004*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v2.16b
5005*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
5006*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w16_hpad)
5007*c0909341SAndroid Build Coastguard Worker
5008*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_422_w16_wpad3):
        // 4:2:2, 16 output columns, last 12 padded: only 8 source bytes
        // (= 4 output samples) are read per row and the last valid sample
        // fills columns 4-15.
        // NOTE(review): x10/x2/w8 come from the function prologue (not in this
        // excerpt); presumably x10 = x1 + stride, x2 = 2*stride, w8 = rows
        // left to read - confirm.
5009*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5010*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, padding 12
        // Two rows per iteration: v0 = row via x1, v2 = row via x10.
5011*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b}, [x1],  x2
5012*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8b}, [x10], x2
        // Pairwise-add adjacent bytes into 16-bit lanes, then multiply by 4.
5013*c0909341SAndroid Build Coastguard Worker        uaddlp          v0.4h,   v0.8b
5014*c0909341SAndroid Build Coastguard Worker        uaddlp          v2.4h,   v2.8b
5015*c0909341SAndroid Build Coastguard Worker        shl             v0.4h,   v0.4h,   #2
5016*c0909341SAndroid Build Coastguard Worker        shl             v2.4h,   v2.4h,   #2
        // v1/v3 = last valid sample in all 8 lanes (columns 8-15); the same
        // value is then merged into the upper half of v0/v2 (columns 4-7).
5017*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   v0.h[3]
5018*c0909341SAndroid Build Coastguard Worker        dup             v3.8h,   v2.h[3]
5019*c0909341SAndroid Build Coastguard Worker        trn1            v0.2d,   v0.2d,   v1.2d
5020*c0909341SAndroid Build Coastguard Worker        trn1            v2.2d,   v2.2d,   v3.2d
        // Row counter; flags are consumed by b.gt after the store/accumulate.
5021*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
        // Store 2 rows x 16 int16 and add them to the running sums v16-v19.
5022*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5023*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
5024*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
5025*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v2.8h
5026*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v3.8h
5027*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // Copy the last row (v2/v3) into v0/v1 so v0-v3 hold that row twice,
        // then jump to the shared w16 vertical-padding code (defined outside
        // this excerpt).
5028*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v2.16b
5029*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
5030*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w16_hpad)
5031*c0909341SAndroid Build Coastguard Workerendfunc
5032*c0909341SAndroid Build Coastguard Worker
// First-level dispatch table for the 4:2:2 CfL AC function: one 32-bit entry
// per block width (16, 8, 4, in that order), each holding the offset of the
// handler label relative to the start of the table.
// NOTE(review): the code that indexes this table is in the function prologue,
// outside this excerpt - confirm the index computation matches this order.
5033*c0909341SAndroid Build Coastguard Workerjumptable ipred_cfl_ac_422_tbl
5034*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_422_w16) - ipred_cfl_ac_422_tbl
5035*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_422_w8) - ipred_cfl_ac_422_tbl
5036*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_422_w4) - ipred_cfl_ac_422_tbl
5037*c0909341SAndroid Build Coastguard Workerendjumptable
5038*c0909341SAndroid Build Coastguard Worker
// Second-level dispatch table for the 16-column 4:2:2 case. It is indexed
// directly by w3 in L(ipred_cfl_ac_422_w16) (index 0..3 -> wpad0..wpad3);
// each 32-bit entry is the handler's offset relative to the table start.
5039*c0909341SAndroid Build Coastguard Workerjumptable ipred_cfl_ac_422_w16_tbl
5040*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_422_w16_wpad0) - ipred_cfl_ac_422_w16_tbl
5041*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_422_w16_wpad1) - ipred_cfl_ac_422_w16_tbl
5042*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_422_w16_wpad2) - ipred_cfl_ac_422_w16_tbl
5043*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_422_w16_wpad3) - ipred_cfl_ac_422_w16_tbl
5044*c0909341SAndroid Build Coastguard Workerendjumptable
5045*c0909341SAndroid Build Coastguard Worker
5046*c0909341SAndroid Build Coastguard Worker// void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx,
5047*c0909341SAndroid Build Coastguard Worker//                           const ptrdiff_t stride, const int w_pad,
5048*c0909341SAndroid Build Coastguard Worker//                           const int h_pad, const int cw, const int ch);
5049*c0909341SAndroid Build Coastguard Workerfunction ipred_cfl_ac_444_8bpc_neon, export=1
        // Prologue: register setup and dispatch on the block width.
        // Arguments (AAPCS64 order, per the C prototype above):
        //   x0 = ac, x1 = ypx, x2 = stride, w3 = w_pad, w4 = h_pad,
        //   w5 = cw, w6 = ch.
        // State handed to the width handlers:
        //   x1/x10 = pointers to two consecutive source rows, x2 = 2*stride,
        //   w8 = rows to read (ch - 4*h_pad), w4 = rows to pad (4*h_pad),
        //   v16-v19 = zeroed running sums, v31 = -log2(cw*ch) in every lane.
        // Table index = clz(cw) - 26: cw=32 -> 0, 16 -> 1, 8 -> 2, 4 -> 3.
5050*c0909341SAndroid Build Coastguard Worker        clz             w8,  w5
        // h_pad is given in units of 4 rows; convert it to a row count.
5051*c0909341SAndroid Build Coastguard Worker        lsl             w4,  w4,  #2
5052*c0909341SAndroid Build Coastguard Worker        movrel          x7,  ipred_cfl_ac_444_tbl
5053*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  #26
        // Table entries are signed 32-bit offsets relative to the table base.
5054*c0909341SAndroid Build Coastguard Worker        ldrsw           x8,  [x7, w8, uxtw #2]
5055*c0909341SAndroid Build Coastguard Worker        movi            v16.8h,  #0
5056*c0909341SAndroid Build Coastguard Worker        movi            v17.8h,  #0
5057*c0909341SAndroid Build Coastguard Worker        movi            v18.8h,  #0
5058*c0909341SAndroid Build Coastguard Worker        movi            v19.8h,  #0
5059*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  x8
5060*c0909341SAndroid Build Coastguard Worker        sub             w8,  w6,  w4         // height - h_pad
5061*c0909341SAndroid Build Coastguard Worker        rbit            w9,  w5              // rbit(width)
5062*c0909341SAndroid Build Coastguard Worker        rbit            w10, w6              // rbit(height)
5063*c0909341SAndroid Build Coastguard Worker        clz             w9,  w9              // ctz(width)
5064*c0909341SAndroid Build Coastguard Worker        clz             w10, w10             // ctz(height)
5065*c0909341SAndroid Build Coastguard Worker        add             w9,  w9,  w10        // log2sz
        // x10 = second source row; both row pointers then advance by two rows
        // per load because x2 is doubled below.
5066*c0909341SAndroid Build Coastguard Worker        add             x10, x1,  x2
5067*c0909341SAndroid Build Coastguard Worker        dup             v31.4s,  w9
5068*c0909341SAndroid Build Coastguard Worker        lsl             x2,  x2,  #1
        // NOTE(review): v31 is not used in the visible part of this function;
        // presumably it is the shift count for averaging the sums in the
        // shared tail code - confirm.
5069*c0909341SAndroid Build Coastguard Worker        neg             v31.4s,  v31.4s      // -log2sz
5070*c0909341SAndroid Build Coastguard Worker        br              x7
5071*c0909341SAndroid Build Coastguard Worker
5072*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_444_w4):
        // 4:4:4, 4 columns: no subsampling, each source byte becomes one
        // int16 output equal to pixel * 8. Uses x1/x10/x2/w8 as set up in the
        // function prologue (two interleaved row pointers, doubled stride,
        // rows to read).
5073*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5074*c0909341SAndroid Build Coastguard Worker1:      // Copy and expand input
        // Four rows per iteration, two 4-byte rows packed per register:
        // v0 = row 0 | row 1, v1 = row 2 | row 3.
5075*c0909341SAndroid Build Coastguard Worker        ld1             {v0.s}[0], [x1],  x2
5076*c0909341SAndroid Build Coastguard Worker        ld1             {v0.s}[1], [x10], x2
5077*c0909341SAndroid Build Coastguard Worker        ld1             {v1.s}[0], [x1],  x2
5078*c0909341SAndroid Build Coastguard Worker        ld1             {v1.s}[1], [x10], x2
        // Widen u8 -> u16 and shift left by 3 (multiply by 8).
5079*c0909341SAndroid Build Coastguard Worker        ushll           v0.8h,   v0.8b,   #3
5080*c0909341SAndroid Build Coastguard Worker        ushll           v1.8h,   v1.8b,   #3
        // Row counter; flags are consumed by b.gt after the accumulate/store.
5081*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #4
        // Only v16/v17 accumulate on this path; 4 rows x 4 int16 are stored.
5082*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
5083*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
5084*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], #32
5085*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // The upper half of v1 is the last row read; duplicate it into both
        // halves of v0 and v1 before jumping to the shared w4
        // vertical-padding code (defined outside this excerpt).
5086*c0909341SAndroid Build Coastguard Worker        trn2            v0.2d,   v1.2d,   v1.2d
5087*c0909341SAndroid Build Coastguard Worker        trn2            v1.2d,   v1.2d,   v1.2d
5088*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w4_hpad)
5089*c0909341SAndroid Build Coastguard Worker
5090*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_444_w8):
        // 4:4:4, 8 columns: each source byte becomes one int16 output equal
        // to pixel * 8. Uses x1/x10/x2/w8 as set up in the function prologue.
5091*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5092*c0909341SAndroid Build Coastguard Worker1:      // Copy and expand input
        // Four rows per iteration, one 8-byte row per register (v0-v3); the
        // loads alternate between the two row pointers and are interleaved
        // with the first widening shift.
5093*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b}, [x1],  x2
5094*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8b}, [x10], x2
5095*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8b}, [x1],  x2
5096*c0909341SAndroid Build Coastguard Worker        ushll           v0.8h,   v0.8b,   #3
5097*c0909341SAndroid Build Coastguard Worker        ld1             {v3.8b}, [x10], x2
5098*c0909341SAndroid Build Coastguard Worker        ushll           v1.8h,   v1.8b,   #3
5099*c0909341SAndroid Build Coastguard Worker        ushll           v2.8h,   v2.8b,   #3
5100*c0909341SAndroid Build Coastguard Worker        ushll           v3.8h,   v3.8b,   #3
        // Row counter; flags are consumed by b.gt after the store/accumulate.
5101*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #4
        // Store 4 rows x 8 int16 and add them to the running sums v16-v19.
5102*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5103*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
5104*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
5105*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v2.8h
5106*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v3.8h
5107*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // Leave the last row (v3) in v0 and v1 as well before jumping to the
        // shared w8 vertical-padding code (defined outside this excerpt).
5108*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v3.16b
5109*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
5110*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w8_hpad)
5111*c0909341SAndroid Build Coastguard Worker
5112*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_444_w16):
        // 4:4:4, 16 columns. A non-zero w3 (w_pad) diverts to the padded
        // variant; otherwise all 16 source bytes of each row are read and
        // expanded to pixel * 8. Uses x1/x10/x2/w8 as set up in the function
        // prologue.
5113*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5114*c0909341SAndroid Build Coastguard Worker        cbnz            w3,  L(ipred_cfl_ac_444_w16_wpad)
5115*c0909341SAndroid Build Coastguard Worker1:      // Copy and expand input, without padding
        // Four rows per iteration; each row ends up in an even/odd register
        // pair (v0/v1, v2/v3, v4/v5, v6/v7). For every pair, ushll2 (upper
        // 8 bytes -> odd register) must precede ushll, which overwrites the
        // even source register in place.
5116*c0909341SAndroid Build Coastguard Worker        ld1             {v0.16b}, [x1],  x2
5117*c0909341SAndroid Build Coastguard Worker        ld1             {v2.16b}, [x10], x2
5118*c0909341SAndroid Build Coastguard Worker        ld1             {v4.16b}, [x1],  x2
5119*c0909341SAndroid Build Coastguard Worker        ushll2          v1.8h,   v0.16b,  #3
5120*c0909341SAndroid Build Coastguard Worker        ushll           v0.8h,   v0.8b,   #3
5121*c0909341SAndroid Build Coastguard Worker        ld1             {v6.16b}, [x10], x2
5122*c0909341SAndroid Build Coastguard Worker        ushll2          v3.8h,   v2.16b,  #3
5123*c0909341SAndroid Build Coastguard Worker        ushll           v2.8h,   v2.8b,   #3
5124*c0909341SAndroid Build Coastguard Worker        ushll2          v5.8h,   v4.16b,  #3
5125*c0909341SAndroid Build Coastguard Worker        ushll           v4.8h,   v4.8b,   #3
5126*c0909341SAndroid Build Coastguard Worker        ushll2          v7.8h,   v6.16b,  #3
5127*c0909341SAndroid Build Coastguard Worker        ushll           v6.8h,   v6.8b,   #3
        // Row counter; flags are consumed by b.gt after the stores/accumulates.
5128*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #4
        // Rows 0-1: store 2 x 16 int16 and accumulate into v16-v19.
5129*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5130*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
5131*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
5132*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v2.8h
5133*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v3.8h
        // Rows 2-3: same again.
5134*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5135*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v4.8h
5136*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v5.8h
5137*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v6.8h
5138*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v7.8h
5139*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // Fill v0-v3 with two copies of the last row (v6/v7) before jumping
        // to the shared w16 vertical-padding code (defined outside this
        // excerpt).
5140*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v6.16b
5141*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v7.16b
5142*c0909341SAndroid Build Coastguard Worker        mov             v2.16b,  v6.16b
5143*c0909341SAndroid Build Coastguard Worker        mov             v3.16b,  v7.16b
5144*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w16_hpad)
5145*c0909341SAndroid Build Coastguard Worker
5146*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_444_w16_wpad):
        // 4:4:4, 16 columns with the right 8 padded: only 8 source bytes are
        // read per row and the last valid sample fills columns 8-15.
        // Entered through the direct cbnz in L(ipred_cfl_ac_444_w16), so it
        // carries no AARCH64_VALID_JUMP_TARGET marker.
5147*c0909341SAndroid Build Coastguard Worker1:      // Copy and expand input, padding 8
        // Four rows per iteration; the valid halves land in v0, v2, v4, v6.
5148*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b}, [x1],  x2
5149*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8b}, [x10], x2
5150*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8b}, [x1],  x2
5151*c0909341SAndroid Build Coastguard Worker        ld1             {v6.8b}, [x10], x2
        // Widen u8 -> u16 and shift left by 3 (multiply by 8).
5152*c0909341SAndroid Build Coastguard Worker        ushll           v0.8h,   v0.8b,   #3
5153*c0909341SAndroid Build Coastguard Worker        ushll           v2.8h,   v2.8b,   #3
5154*c0909341SAndroid Build Coastguard Worker        ushll           v4.8h,   v4.8b,   #3
5155*c0909341SAndroid Build Coastguard Worker        ushll           v6.8h,   v6.8b,   #3
        // Padding halves (odd registers) = last valid sample of the row,
        // broadcast to all 8 lanes.
5156*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   v0.h[7]
5157*c0909341SAndroid Build Coastguard Worker        dup             v3.8h,   v2.h[7]
5158*c0909341SAndroid Build Coastguard Worker        dup             v5.8h,   v4.h[7]
5159*c0909341SAndroid Build Coastguard Worker        dup             v7.8h,   v6.h[7]
        // Row counter; flags are consumed by b.gt after the stores/accumulates.
5160*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #4
        // Rows 0-1: store 2 x 16 int16 and accumulate into v16-v19.
5161*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5162*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
5163*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
5164*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v2.8h
5165*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v3.8h
        // Rows 2-3: same again.
5166*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5167*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v4.8h
5168*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v5.8h
5169*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v6.8h
5170*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v7.8h
5171*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // Fill v0-v3 with two copies of the last row (v6/v7) before jumping
        // to the shared w16 vertical-padding code (defined outside this
        // excerpt).
5172*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v6.16b
5173*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v7.16b
5174*c0909341SAndroid Build Coastguard Worker        mov             v2.16b,  v6.16b
5175*c0909341SAndroid Build Coastguard Worker        mov             v3.16b,  v7.16b
5176*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w16_hpad)
5177*c0909341SAndroid Build Coastguard Worker
5178*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_444_w32):
        // Second-level dispatch for the 32-column 4:4:4 case, keyed on
        // w3 (w_pad) halved.
        // NOTE(review): ipred_cfl_ac_444_w32_tbl is defined outside this
        // excerpt; judging by the handler names (wpad0/2/4/6), w_pad is
        // expected to be even here and w_pad/2 selects entry 0..3 - confirm.
5179*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5180*c0909341SAndroid Build Coastguard Worker        movrel          x7,  ipred_cfl_ac_444_w32_tbl
5181*c0909341SAndroid Build Coastguard Worker        lsr             w3,  w3,  #1
        // Table entries are signed 32-bit offsets relative to the table base;
        // x3 is overwritten with the loaded offset.
5182*c0909341SAndroid Build Coastguard Worker        ldrsw           x3,  [x7, w3, uxtw #2]
5183*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  x3
5184*c0909341SAndroid Build Coastguard Worker        br              x7
5185*c0909341SAndroid Build Coastguard Worker
5186*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_444_w32_wpad0):
        // 4:4:4, 32 columns, no horizontal padding: all 32 source bytes of
        // each row are read and expanded to pixel * 8.
5187*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5188*c0909341SAndroid Build Coastguard Worker1:      // Copy and expand input, without padding
        // Two rows per iteration: row via x1 -> v0-v3, row via x10 -> v4-v7.
        // The raw bytes are loaded into v2/v3 and v6/v7, so the widening order
        // matters: each source register is fully consumed before (or by) the
        // instruction that overwrites it.
5189*c0909341SAndroid Build Coastguard Worker        ld1             {v2.16b, v3.16b}, [x1],  x2
5190*c0909341SAndroid Build Coastguard Worker        ld1             {v6.16b, v7.16b}, [x10], x2
5191*c0909341SAndroid Build Coastguard Worker        ushll           v0.8h,   v2.8b,   #3
5192*c0909341SAndroid Build Coastguard Worker        ushll2          v1.8h,   v2.16b,  #3
5193*c0909341SAndroid Build Coastguard Worker        ushll           v2.8h,   v3.8b,   #3
5194*c0909341SAndroid Build Coastguard Worker        ushll2          v3.8h,   v3.16b,  #3
5195*c0909341SAndroid Build Coastguard Worker        ushll           v4.8h,   v6.8b,   #3
5196*c0909341SAndroid Build Coastguard Worker        ushll2          v5.8h,   v6.16b,  #3
5197*c0909341SAndroid Build Coastguard Worker        ushll           v6.8h,   v7.8b,   #3
5198*c0909341SAndroid Build Coastguard Worker        ushll2          v7.8h,   v7.16b,  #3
        // Row counter; flags are consumed by b.gt after the stores/accumulates.
5199*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
        // First row: store 32 int16 and accumulate into v16-v19.
5200*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5201*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
5202*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
5203*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v2.8h
5204*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v3.8h
        // Second row: same again.
5205*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5206*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v4.8h
5207*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v5.8h
5208*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v6.8h
5209*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v7.8h
5210*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // v4-v7 still hold the last row, which the hpad code stores again for
        // the vertically padded rows.
5211*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_444_w32_hpad)
5212*c0909341SAndroid Build Coastguard Worker
5213*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_444_w32_wpad2):
        // 4:4:4, 32 columns with the right 8 padded: 24 source bytes are read
        // per row and the last valid sample fills columns 24-31.
5214*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5215*c0909341SAndroid Build Coastguard Worker1:      // Copy and expand input, padding 8
        // Two rows per iteration. Bytes 16-23 are fetched with ldr first
        // because the following ld1 post-increments the row pointer.
5216*c0909341SAndroid Build Coastguard Worker        ldr             d2,  [x1,  #16]
5217*c0909341SAndroid Build Coastguard Worker        ld1             {v1.16b}, [x1],  x2
5218*c0909341SAndroid Build Coastguard Worker        ldr             d6,  [x10, #16]
5219*c0909341SAndroid Build Coastguard Worker        ld1             {v5.16b}, [x10], x2
        // Widen to pixel * 8. The low half of v1/v5 is extracted (into v0/v4)
        // before ushll2 overwrites v1/v5 in place.
5220*c0909341SAndroid Build Coastguard Worker        ushll           v2.8h,   v2.8b,   #3
5221*c0909341SAndroid Build Coastguard Worker        ushll           v0.8h,   v1.8b,   #3
5222*c0909341SAndroid Build Coastguard Worker        ushll2          v1.8h,   v1.16b,  #3
5223*c0909341SAndroid Build Coastguard Worker        ushll           v6.8h,   v6.8b,   #3
5224*c0909341SAndroid Build Coastguard Worker        ushll           v4.8h,   v5.8b,   #3
5225*c0909341SAndroid Build Coastguard Worker        ushll2          v5.8h,   v5.16b,  #3
        // Padding columns = last valid sample of the row, broadcast.
5226*c0909341SAndroid Build Coastguard Worker        dup             v3.8h,   v2.h[7]
5227*c0909341SAndroid Build Coastguard Worker        dup             v7.8h,   v6.h[7]
        // Row counter; flags are consumed by b.gt after the stores/accumulates.
5228*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
        // First row: store 32 int16 and accumulate into v16-v19.
5229*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5230*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
5231*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
5232*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v2.8h
5233*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v3.8h
        // Second row: same again.
5234*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5235*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v4.8h
5236*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v5.8h
5237*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v6.8h
5238*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v7.8h
5239*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // v4-v7 still hold the last row, which the hpad code stores again for
        // the vertically padded rows.
5240*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_444_w32_hpad)
5241*c0909341SAndroid Build Coastguard Worker
5242*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_444_w32_wpad4):
        // 4:4:4, 32 columns with the right 16 padded: 16 source bytes are
        // read per row and the last valid sample fills columns 16-31.
5243*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5244*c0909341SAndroid Build Coastguard Worker1:      // Copy and expand input, padding 16
        // Two rows per iteration: row via x1 -> v0-v3, row via x10 -> v4-v7.
5245*c0909341SAndroid Build Coastguard Worker        ld1             {v1.16b}, [x1],  x2
5246*c0909341SAndroid Build Coastguard Worker        ld1             {v5.16b}, [x10], x2
        // Widen to pixel * 8. The low half is extracted (into v0/v4) before
        // ushll2 overwrites v1/v5 in place.
5247*c0909341SAndroid Build Coastguard Worker        ushll           v0.8h,   v1.8b,   #3
5248*c0909341SAndroid Build Coastguard Worker        ushll2          v1.8h,   v1.16b,  #3
5249*c0909341SAndroid Build Coastguard Worker        ushll           v4.8h,   v5.8b,   #3
5250*c0909341SAndroid Build Coastguard Worker        ushll2          v5.8h,   v5.16b,  #3
        // Padding columns = last valid sample of the row, broadcast.
5251*c0909341SAndroid Build Coastguard Worker        dup             v2.8h,   v1.h[7]
5252*c0909341SAndroid Build Coastguard Worker        dup             v3.8h,   v1.h[7]
5253*c0909341SAndroid Build Coastguard Worker        dup             v6.8h,   v5.h[7]
5254*c0909341SAndroid Build Coastguard Worker        dup             v7.8h,   v5.h[7]
        // Row counter; flags are consumed by b.gt after the stores/accumulates.
5255*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
        // First row: store 32 int16 and accumulate into v16-v19.
5256*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5257*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
5258*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
5259*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v2.8h
5260*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v3.8h
        // Second row: same again.
5261*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5262*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v4.8h
5263*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v5.8h
5264*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v6.8h
5265*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v7.8h
5266*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // v4-v7 still hold the last row, which the hpad code stores again for
        // the vertically padded rows.
5267*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_444_w32_hpad)
5268*c0909341SAndroid Build Coastguard Worker
5269*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_444_w32_wpad6):
        // 4:4:4, 32 columns with the right 24 padded: only 8 source bytes are
        // read per row and the last valid sample fills columns 8-31.
5270*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5271*c0909341SAndroid Build Coastguard Worker1:      // Copy and expand input, padding 24
        // Two rows per iteration: row via x1 -> v0-v3, row via x10 -> v4-v7.
5272*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8b}, [x1],  x2
5273*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8b}, [x10], x2
        // Widen u8 -> u16 and shift left by 3 (multiply by 8).
5274*c0909341SAndroid Build Coastguard Worker        ushll           v0.8h,   v0.8b,   #3
5275*c0909341SAndroid Build Coastguard Worker        ushll           v4.8h,   v4.8b,   #3
        // Padding columns = last valid sample of the row, broadcast.
5276*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   v0.h[7]
5277*c0909341SAndroid Build Coastguard Worker        dup             v2.8h,   v0.h[7]
5278*c0909341SAndroid Build Coastguard Worker        dup             v3.8h,   v0.h[7]
5279*c0909341SAndroid Build Coastguard Worker        dup             v5.8h,   v4.h[7]
5280*c0909341SAndroid Build Coastguard Worker        dup             v6.8h,   v4.h[7]
5281*c0909341SAndroid Build Coastguard Worker        dup             v7.8h,   v4.h[7]
        // Row counter; flags are consumed by b.gt after the stores/accumulates.
5282*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
        // First row: store 32 int16 and accumulate into v16-v19.
5283*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5284*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v0.8h
5285*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v1.8h
5286*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v2.8h
5287*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v3.8h
        // Second row: same again.
5288*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5289*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v4.8h
5290*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v5.8h
5291*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v6.8h
5292*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v7.8h
        // No branch after the loop: execution falls through into
        // L(ipred_cfl_ac_444_w32_hpad), which follows directly, with v4-v7
        // still holding the last row.
5293*c0909341SAndroid Build Coastguard Worker        b.gt            1b
5294*c0909341SAndroid Build Coastguard Worker
// Tail of the w32 paths: optionally appends vertically padded rows, then
// reduces the running sums in v16-v19 to a single rounded value, broadcasts it
// into v4 and branches to L(ipred_cfl_ac_420_w8_subtract_dc).
// NOTE(review): w4 (rows of vertical padding), w6 (height) and v31 (shift
// amount) are set up before this chunk -- confirm their exact meaning there.
5295*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_444_w32_hpad):
        // Skip the padding loop entirely when w4 is zero.
5296*c0909341SAndroid Build Coastguard Worker        cbz             w4,  3f
5297*c0909341SAndroid Build Coastguard Worker2:      // Vertical padding (h_pad > 0)
        // v4-v7 are not modified in this loop; the same row is stored twice
        // per iteration (w4 drops by 2) and added to the sums each time.
5298*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
5299*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5300*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v4.8h
5301*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v5.8h
5302*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v6.8h
5303*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v7.8h
5304*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5305*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v4.8h
5306*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v17.8h,  v5.8h
5307*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v6.8h
5308*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v19.8h,  v7.8h
5309*c0909341SAndroid Build Coastguard Worker        b.gt            2b
5310*c0909341SAndroid Build Coastguard Worker3:
5311*c0909341SAndroid Build Coastguard Worker
5312*c0909341SAndroid Build Coastguard Worker        // Quadruple the height and reuse the w8 subtracting
        // (w6 * 4 is also what the x0 rewind below is based on.)
5313*c0909341SAndroid Build Coastguard Worker        lsl             w6,  w6,  #2
5314*c0909341SAndroid Build Coastguard Worker        // Aggregate the sums, with wider intermediates earlier than in
5315*c0909341SAndroid Build Coastguard Worker        // ipred_cfl_ac_420_w8_calc_subtract_dc.
        // Pairwise-add adjacent 16-bit sums into 32-bit lanes, then fold the
        // four vectors into v0 and across its lanes.
5316*c0909341SAndroid Build Coastguard Worker        uaddlp          v0.4s,   v16.8h
5317*c0909341SAndroid Build Coastguard Worker        uaddlp          v1.4s,   v17.8h
5318*c0909341SAndroid Build Coastguard Worker        uaddlp          v2.4s,   v18.8h
5319*c0909341SAndroid Build Coastguard Worker        uaddlp          v3.4s,   v19.8h
5320*c0909341SAndroid Build Coastguard Worker        add             v0.4s,   v0.4s,   v1.4s
5321*c0909341SAndroid Build Coastguard Worker        add             v2.4s,   v2.4s,   v3.4s
5322*c0909341SAndroid Build Coastguard Worker        add             v0.4s,   v0.4s,   v2.4s
5323*c0909341SAndroid Build Coastguard Worker        addv            s0,  v0.4s                // sum
        // Move x0 back by (already quadrupled) w6 * 16 bytes.
        // NOTE(review): presumably this returns x0 to the start of the
        // buffer written above -- confirm against the value of w6 on entry.
5324*c0909341SAndroid Build Coastguard Worker        sub             x0,  x0,  w6, uxtw #4
5325*c0909341SAndroid Build Coastguard Worker        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
        // Broadcast the low halfword of the result to all eight lanes of v4
        // before handing over to the shared subtract-DC code.
5326*c0909341SAndroid Build Coastguard Worker        dup             v4.8h,   v4.h[0]
5327*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w8_subtract_dc)
5328*c0909341SAndroid Build Coastguard Workerendfunc
5329*c0909341SAndroid Build Coastguard Worker
// Jump table for the ipred_cfl_ac_444 width handlers. Each entry is the
// offset of a handler label from the start of this table; entries are listed
// from the widest handler (w32) down to the narrowest (w4).
// NOTE(review): the code that indexes this table is outside this chunk, so
// the entry order must be kept in sync with that index computation.
5330*c0909341SAndroid Build Coastguard Workerjumptable ipred_cfl_ac_444_tbl
5331*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_444_w32) - ipred_cfl_ac_444_tbl
5332*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_444_w16) - ipred_cfl_ac_444_tbl
5333*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_444_w8)  - ipred_cfl_ac_444_tbl
5334*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_444_w4)  - ipred_cfl_ac_444_tbl
5335*c0909341SAndroid Build Coastguard Workerendjumptable
5336*c0909341SAndroid Build Coastguard Worker
// Secondary jump table for the w32 handler: one entry per horizontal padding
// variant (wpad0, wpad2, wpad4, wpad6), each stored as the label's offset
// from the start of this table.
// NOTE(review): the dispatch code that selects an entry is outside this
// chunk; presumably it indexes by the w_pad argument -- confirm before
// reordering or adding entries.
5337*c0909341SAndroid Build Coastguard Workerjumptable ipred_cfl_ac_444_w32_tbl
5338*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_444_w32_wpad0) - ipred_cfl_ac_444_w32_tbl
5339*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_444_w32_wpad2) - ipred_cfl_ac_444_w32_tbl
5340*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_444_w32_wpad4) - ipred_cfl_ac_444_w32_tbl
5341*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_444_w32_wpad6) - ipred_cfl_ac_444_w32_tbl
5342*c0909341SAndroid Build Coastguard Workerendjumptable
5343