xref: /aosp_15_r20/external/libdav1d/src/arm/64/ipred16.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1*c0909341SAndroid Build Coastguard Worker/*
2*c0909341SAndroid Build Coastguard Worker * Copyright © 2018, VideoLAN and dav1d authors
3*c0909341SAndroid Build Coastguard Worker * Copyright © 2019, Martin Storsjo
4*c0909341SAndroid Build Coastguard Worker * All rights reserved.
5*c0909341SAndroid Build Coastguard Worker *
6*c0909341SAndroid Build Coastguard Worker * Redistribution and use in source and binary forms, with or without
7*c0909341SAndroid Build Coastguard Worker * modification, are permitted provided that the following conditions are met:
8*c0909341SAndroid Build Coastguard Worker *
9*c0909341SAndroid Build Coastguard Worker * 1. Redistributions of source code must retain the above copyright notice, this
10*c0909341SAndroid Build Coastguard Worker *    list of conditions and the following disclaimer.
11*c0909341SAndroid Build Coastguard Worker *
12*c0909341SAndroid Build Coastguard Worker * 2. Redistributions in binary form must reproduce the above copyright notice,
13*c0909341SAndroid Build Coastguard Worker *    this list of conditions and the following disclaimer in the documentation
14*c0909341SAndroid Build Coastguard Worker *    and/or other materials provided with the distribution.
15*c0909341SAndroid Build Coastguard Worker *
16*c0909341SAndroid Build Coastguard Worker * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17*c0909341SAndroid Build Coastguard Worker * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18*c0909341SAndroid Build Coastguard Worker * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19*c0909341SAndroid Build Coastguard Worker * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20*c0909341SAndroid Build Coastguard Worker * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21*c0909341SAndroid Build Coastguard Worker * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22*c0909341SAndroid Build Coastguard Worker * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23*c0909341SAndroid Build Coastguard Worker * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24*c0909341SAndroid Build Coastguard Worker * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25*c0909341SAndroid Build Coastguard Worker * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*c0909341SAndroid Build Coastguard Worker */
27*c0909341SAndroid Build Coastguard Worker
28*c0909341SAndroid Build Coastguard Worker#include "src/arm/asm.S"
29*c0909341SAndroid Build Coastguard Worker#include "util.S"
30*c0909341SAndroid Build Coastguard Worker
31*c0909341SAndroid Build Coastguard Worker// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
32*c0909341SAndroid Build Coastguard Worker//                              const pixel *const topleft,
33*c0909341SAndroid Build Coastguard Worker//                              const int width, const int height, const int a,
34*c0909341SAndroid Build Coastguard Worker//                              const int max_width, const int max_height,
35*c0909341SAndroid Build Coastguard Worker//                              const int bitdepth_max);
36*c0909341SAndroid Build Coastguard Workerfunction ipred_dc_128_16bpc_neon, export=1
        // Fills the width x height block at dst with a single constant
        // 16-bit value, (bitdepth_max + 1) >> 1.
        //
        // Register roles (argument names as in the C prototype above):
        //   x0 = dst, walks rows 0, 2, 4, ...
        //   x1 = stride in bytes on entry, doubled before the dispatch
        //   w3 = width on entry, then the jump table index
        //   w4 = height, used as the remaining-rows counter
        //   x5 = jump table base, then the branch target
        //   x6 = dst + stride, walks rows 1, 3, 5, ...
        //   w8 = bitdepth_max
        //   v0..v3 = fill value replicated in every 16-bit lane
        // topleft (x2) is never read by this function.
        //
        // Every loop stores four rows and only then tests the counter, so
        // at least four rows are always written.
        // NOTE(review): this presumably relies on callers passing a height
        // that is a multiple of 4 - confirm against the C caller.
37*c0909341SAndroid Build Coastguard Worker        ldr             w8,  [sp]                    // first stack slot, bitdepth_max per the prototype above
38*c0909341SAndroid Build Coastguard Worker        clz             w3,  w3                      // 25 for width 64 ... 29 for width 4
39*c0909341SAndroid Build Coastguard Worker        movrel          x5,  ipred_dc_128_tbl        // movrel is a macro defined outside this file, x5 is used as the table address
40*c0909341SAndroid Build Coastguard Worker        sub             w3,  w3,  #25                // table index, 0 (width 64) ... 4 (width 4)
41*c0909341SAndroid Build Coastguard Worker        ldrsw           x3,  [x5, w3, uxtw #2]       // sign-extended 32-bit entry, label offset relative to the table
42*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   w8                  // broadcast bitdepth_max to all eight lanes
43*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x3                 // absolute address of the width-specific code
44*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1                 // second row pointer
45*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1                 // each pointer now skips two rows per store
46*c0909341SAndroid Build Coastguard Worker        urshr           v0.8h,   v0.8h,  #1          // rounding shift, (bitdepth_max + 1) >> 1
47*c0909341SAndroid Build Coastguard Worker        br              x5
48*c0909341SAndroid Build Coastguard Worker40:     // width 4, 8 bytes per row
49*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET                    // macro from outside this file, presumably the landing pad for the indirect br
50*c0909341SAndroid Build Coastguard Worker4:
51*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h},  [x0], x1           // row 0
52*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h},  [x6], x1           // row 1
53*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4                 // four rows per iteration, the stores below leave the flags untouched
54*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h},  [x0], x1           // row 2
55*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h},  [x6], x1           // row 3
56*c0909341SAndroid Build Coastguard Worker        b.gt            4b                           // loop while rows remain
57*c0909341SAndroid Build Coastguard Worker        ret
58*c0909341SAndroid Build Coastguard Worker80:     // width 8, 16 bytes per row
59*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
60*c0909341SAndroid Build Coastguard Worker8:
61*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h},  [x0], x1
62*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h},  [x6], x1
63*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
64*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h},  [x0], x1
65*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h},  [x6], x1
66*c0909341SAndroid Build Coastguard Worker        b.gt            8b
67*c0909341SAndroid Build Coastguard Worker        ret
68*c0909341SAndroid Build Coastguard Worker160:    // width 16, 32 bytes per row
69*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
70*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v0.16b              // second copy so one st1 covers the whole row
71*c0909341SAndroid Build Coastguard Worker16:
72*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], x1
73*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x6], x1
74*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
75*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], x1
76*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x6], x1
77*c0909341SAndroid Build Coastguard Worker        b.gt            16b
78*c0909341SAndroid Build Coastguard Worker        ret
79*c0909341SAndroid Build Coastguard Worker320:    // width 32, 64 bytes per row
80*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
81*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v0.16b
82*c0909341SAndroid Build Coastguard Worker        mov             v2.16b,  v0.16b
83*c0909341SAndroid Build Coastguard Worker        mov             v3.16b,  v0.16b
84*c0909341SAndroid Build Coastguard Worker32:
85*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
86*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
87*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
88*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
89*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
90*c0909341SAndroid Build Coastguard Worker        b.gt            32b
91*c0909341SAndroid Build Coastguard Worker        ret
92*c0909341SAndroid Build Coastguard Worker640:    // width 64, 128 bytes per row, written as two 64-byte halves
93*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
94*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v0.16b
95*c0909341SAndroid Build Coastguard Worker        mov             v2.16b,  v0.16b
96*c0909341SAndroid Build Coastguard Worker        mov             v3.16b,  v0.16b
97*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  #64                // the first half-row store already advances 64 bytes
98*c0909341SAndroid Build Coastguard Worker64:
99*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64   // row 0, first half
100*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64   // row 1, first half
101*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1    // row 0, second half, then on to row 2
102*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1    // row 1, second half, then on to row 3
103*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
104*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
105*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
106*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
107*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
108*c0909341SAndroid Build Coastguard Worker        b.gt            64b
109*c0909341SAndroid Build Coastguard Worker        ret
110*c0909341SAndroid Build Coastguard Workerendfunc
111*c0909341SAndroid Build Coastguard Worker
112*c0909341SAndroid Build Coastguard Workerjumptable ipred_dc_128_tbl
        // Dispatch table for ipred_dc_128_16bpc_neon. Each entry is a 32-bit
        // offset from the start of this table to a width-specific entry
        // label. The function loads it with ldrsw and adds the table base.
        // Entries are indexed by clz(width) - 25, so the widest block comes
        // first. The NNNb references bind to the nearest preceding
        // definition of each numeric label, i.e. those in the function
        // directly above. jumptable and endjumptable are macros defined
        // outside this file.
113*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_dc_128_tbl   // index 0, width 64
114*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_dc_128_tbl   // index 1, width 32
115*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_dc_128_tbl   // index 2, width 16
116*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_dc_128_tbl   // index 3, width 8
117*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_dc_128_tbl   // index 4, width 4
118*c0909341SAndroid Build Coastguard Workerendjumptable
119*c0909341SAndroid Build Coastguard Worker
120*c0909341SAndroid Build Coastguard Worker// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
121*c0909341SAndroid Build Coastguard Worker//                         const pixel *const topleft,
122*c0909341SAndroid Build Coastguard Worker//                         const int width, const int height, const int a,
123*c0909341SAndroid Build Coastguard Worker//                         const int max_width, const int max_height);
124*c0909341SAndroid Build Coastguard Workerfunction ipred_v_16bpc_neon, export=1
        // Copies one row of width 16-bit pixels, read once from topleft + 2
        // bytes, into every row of the width x height block at dst.
        //
        // Register roles (argument names as in the C prototype above):
        //   x0 = dst, walks rows 0, 2, 4, ...
        //   x1 = stride in bytes on entry, doubled before the dispatch
        //   x2 = topleft on entry, advanced by one pixel before the loads
        //   w3 = width on entry, then the jump table index
        //   w4 = height, used as the remaining-rows counter
        //   x5 = jump table base, then the branch target
        //   x6 = dst + stride, walks rows 1, 3, 5, ...
        //   v0..v7 = the source row, loaded once before the loop
        //
        // Every loop stores four rows and only then tests the counter.
        // NOTE(review): this presumably relies on callers passing a height
        // that is a multiple of 4 - confirm against the C caller.
125*c0909341SAndroid Build Coastguard Worker        clz             w3,  w3                      // 25 for width 64 ... 29 for width 4
126*c0909341SAndroid Build Coastguard Worker        movrel          x5,  ipred_v_tbl             // movrel is a macro defined outside this file, x5 is used as the table address
127*c0909341SAndroid Build Coastguard Worker        sub             w3,  w3,  #25                // table index, 0 (width 64) ... 4 (width 4)
128*c0909341SAndroid Build Coastguard Worker        ldrsw           x3,  [x5, w3, uxtw #2]       // sign-extended 32-bit entry, label offset relative to the table
129*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #2                 // step over one 16-bit pixel, the source row starts here
130*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x3                 // absolute address of the width-specific code
131*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1                 // second row pointer
132*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1                 // each pointer now skips two rows per store
133*c0909341SAndroid Build Coastguard Worker        br              x5
134*c0909341SAndroid Build Coastguard Worker40:     // width 4, 8 bytes per row
135*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET                    // macro from outside this file, presumably the landing pad for the indirect br
136*c0909341SAndroid Build Coastguard Worker        ld1             {v0.4h},  [x2]               // the four source pixels
137*c0909341SAndroid Build Coastguard Worker4:
138*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h},  [x0], x1           // row 0
139*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h},  [x6], x1           // row 1
140*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4                 // four rows per iteration, the stores below leave the flags untouched
141*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h},  [x0], x1           // row 2
142*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h},  [x6], x1           // row 3
143*c0909341SAndroid Build Coastguard Worker        b.gt            4b                           // loop while rows remain
144*c0909341SAndroid Build Coastguard Worker        ret
145*c0909341SAndroid Build Coastguard Worker80:     // width 8, 16 bytes per row
146*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
147*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h},  [x2]
148*c0909341SAndroid Build Coastguard Worker8:
149*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h},  [x0], x1
150*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h},  [x6], x1
151*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
152*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h},  [x0], x1
153*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h},  [x6], x1
154*c0909341SAndroid Build Coastguard Worker        b.gt            8b
155*c0909341SAndroid Build Coastguard Worker        ret
156*c0909341SAndroid Build Coastguard Worker160:    // width 16, 32 bytes per row
157*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
158*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x2]
159*c0909341SAndroid Build Coastguard Worker16:
160*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], x1
161*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x6], x1
162*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
163*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], x1
164*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x6], x1
165*c0909341SAndroid Build Coastguard Worker        b.gt            16b
166*c0909341SAndroid Build Coastguard Worker        ret
167*c0909341SAndroid Build Coastguard Worker320:    // width 32, 64 bytes per row
168*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
169*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
170*c0909341SAndroid Build Coastguard Worker32:
171*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
172*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
173*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
174*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
175*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
176*c0909341SAndroid Build Coastguard Worker        b.gt            32b
177*c0909341SAndroid Build Coastguard Worker        ret
178*c0909341SAndroid Build Coastguard Worker640:    // width 64, 128 bytes per row, held in v0..v3 and v4..v7
179*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
180*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64   // source pixels 0..31
181*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  #64                             // the first half-row store already advances 64 bytes
182*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]        // source pixels 32..63
183*c0909341SAndroid Build Coastguard Worker64:
184*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64   // row 0, first half
185*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64   // row 1, first half
186*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1    // row 0, second half, then on to row 2
187*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1    // row 1, second half, then on to row 3
188*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
189*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
190*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
191*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
192*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
193*c0909341SAndroid Build Coastguard Worker        b.gt            64b
194*c0909341SAndroid Build Coastguard Worker        ret
195*c0909341SAndroid Build Coastguard Workerendfunc
196*c0909341SAndroid Build Coastguard Worker
197*c0909341SAndroid Build Coastguard Workerjumptable ipred_v_tbl
        // Dispatch table for ipred_v_16bpc_neon. Each entry is a 32-bit
        // offset from the start of this table to a width-specific entry
        // label. The function loads it with ldrsw and adds the table base.
        // Entries are indexed by clz(width) - 25, so the widest block comes
        // first. The NNNb references bind to the nearest preceding
        // definition of each numeric label, i.e. those in the function
        // directly above.
198*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_v_tbl   // index 0, width 64
199*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_v_tbl   // index 1, width 32
200*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_v_tbl   // index 2, width 16
201*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_v_tbl   // index 3, width 8
202*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_v_tbl   // index 4, width 4
203*c0909341SAndroid Build Coastguard Workerendjumptable
204*c0909341SAndroid Build Coastguard Worker
205*c0909341SAndroid Build Coastguard Worker// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
206*c0909341SAndroid Build Coastguard Worker//                         const pixel *const topleft,
207*c0909341SAndroid Build Coastguard Worker//                         const int width, const int height, const int a,
208*c0909341SAndroid Build Coastguard Worker//                         const int max_width, const int max_height);
209*c0909341SAndroid Build Coastguard Workerfunction ipred_h_16bpc_neon, export=1
        // Fills each row of the width x height block at dst with a single
        // 16-bit pixel taken from memory just below topleft. Row y is
        // filled with the pixel at topleft[-(1 + y)].
        //
        // Register roles (argument names as in the C prototype above):
        //   x0 = dst, walks rows 0, 2, 4, ...
        //   x1 = stride in bytes on entry, doubled before the dispatch
        //   x2 = source pointer, starts at topleft - 8 bytes and moves down
        //        by 8 bytes (four pixels) per iteration
        //   w3 = width on entry, then the jump table index
        //   w4 = height, used as the remaining-rows counter
        //   x5 = jump table base, then the branch target
        //   x6 = dst + stride, walks rows 1, 3, 5, ...
        //   x7 = -8, the post-index step for the ld4r
        //
        // Each ld4r reads four consecutive pixels and replicates them, one
        // per register: v0 gets the lowest address and v3 the highest. The
        // four rows of an iteration are therefore written from v3, v2, v1,
        // v0 in that order.
        //
        // For widths 16 and up, the str and stp stores at fixed offsets
        // must come before the st1 that writes bytes 0..15, because that
        // st1 also post-increments the row pointer.
        //
        // Every loop stores four rows and only then tests the counter.
        // NOTE(review): this presumably relies on callers passing a height
        // that is a multiple of 4 - confirm against the C caller.
210*c0909341SAndroid Build Coastguard Worker        clz             w3,  w3                      // 25 for width 64 ... 29 for width 4
211*c0909341SAndroid Build Coastguard Worker        movrel          x5,  ipred_h_tbl             // movrel is a macro defined outside this file, x5 is used as the table address
212*c0909341SAndroid Build Coastguard Worker        sub             w3,  w3,  #25                // table index, 0 (width 64) ... 4 (width 4)
213*c0909341SAndroid Build Coastguard Worker        ldrsw           x3,  [x5, w3, uxtw #2]       // sign-extended 32-bit entry, label offset relative to the table
214*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #8                 // point at topleft[-4], first group is topleft[-4..-1]
215*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x3                 // absolute address of the width-specific code
216*c0909341SAndroid Build Coastguard Worker        mov             x7,  #-8                     // step back four pixels after every load
217*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1                 // second row pointer
218*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1                 // each pointer now skips two rows per store
219*c0909341SAndroid Build Coastguard Worker        br              x5
220*c0909341SAndroid Build Coastguard Worker40:     // width 4, 8 bytes per row
221*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET                    // macro from outside this file, presumably the landing pad for the indirect br
222*c0909341SAndroid Build Coastguard Worker4:
223*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7   // four source pixels, each replicated across one register
224*c0909341SAndroid Build Coastguard Worker        st1             {v3.4h},  [x0], x1           // row 0, highest-address pixel
225*c0909341SAndroid Build Coastguard Worker        st1             {v2.4h},  [x6], x1           // row 1
226*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4                 // four rows per iteration, the stores below leave the flags untouched
227*c0909341SAndroid Build Coastguard Worker        st1             {v1.4h},  [x0], x1           // row 2
228*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h},  [x6], x1           // row 3, lowest-address pixel
229*c0909341SAndroid Build Coastguard Worker        b.gt            4b                           // loop while rows remain
230*c0909341SAndroid Build Coastguard Worker        ret
231*c0909341SAndroid Build Coastguard Worker80:     // width 8, 16 bytes per row
232*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
233*c0909341SAndroid Build Coastguard Worker8:
234*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
235*c0909341SAndroid Build Coastguard Worker        st1             {v3.8h},  [x0], x1
236*c0909341SAndroid Build Coastguard Worker        st1             {v2.8h},  [x6], x1
237*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
238*c0909341SAndroid Build Coastguard Worker        st1             {v1.8h},  [x0], x1
239*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h},  [x6], x1
240*c0909341SAndroid Build Coastguard Worker        b.gt            8b
241*c0909341SAndroid Build Coastguard Worker        ret
242*c0909341SAndroid Build Coastguard Worker160:    // width 16, 32 bytes per row
243*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
244*c0909341SAndroid Build Coastguard Worker16:
245*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
246*c0909341SAndroid Build Coastguard Worker        str             q3,  [x0, #16]               // row 0, bytes 16..31
247*c0909341SAndroid Build Coastguard Worker        str             q2,  [x6, #16]               // row 1, bytes 16..31
248*c0909341SAndroid Build Coastguard Worker        st1             {v3.8h}, [x0], x1            // row 0, bytes 0..15, then on to row 2
249*c0909341SAndroid Build Coastguard Worker        st1             {v2.8h}, [x6], x1            // row 1, bytes 0..15, then on to row 3
250*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
251*c0909341SAndroid Build Coastguard Worker        str             q1,  [x0, #16]
252*c0909341SAndroid Build Coastguard Worker        str             q0,  [x6, #16]
253*c0909341SAndroid Build Coastguard Worker        st1             {v1.8h}, [x0], x1
254*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h}, [x6], x1
255*c0909341SAndroid Build Coastguard Worker        b.gt            16b
256*c0909341SAndroid Build Coastguard Worker        ret
257*c0909341SAndroid Build Coastguard Worker320:    // width 32, 64 bytes per row
258*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
259*c0909341SAndroid Build Coastguard Worker32:
260*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
261*c0909341SAndroid Build Coastguard Worker        str             q3,  [x0, #16]               // row 0, bytes 16..31
262*c0909341SAndroid Build Coastguard Worker        str             q2,  [x6, #16]
263*c0909341SAndroid Build Coastguard Worker        stp             q3,  q3,  [x0, #32]          // row 0, bytes 32..63
264*c0909341SAndroid Build Coastguard Worker        stp             q2,  q2,  [x6, #32]
265*c0909341SAndroid Build Coastguard Worker        st1             {v3.8h}, [x0], x1            // row 0, bytes 0..15, then on to row 2
266*c0909341SAndroid Build Coastguard Worker        st1             {v2.8h}, [x6], x1
267*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
268*c0909341SAndroid Build Coastguard Worker        str             q1,  [x0, #16]
269*c0909341SAndroid Build Coastguard Worker        str             q0,  [x6, #16]
270*c0909341SAndroid Build Coastguard Worker        stp             q1,  q1,  [x0, #32]
271*c0909341SAndroid Build Coastguard Worker        stp             q0,  q0,  [x6, #32]
272*c0909341SAndroid Build Coastguard Worker        st1             {v1.8h}, [x0], x1
273*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h}, [x6], x1
274*c0909341SAndroid Build Coastguard Worker        b.gt            32b
275*c0909341SAndroid Build Coastguard Worker        ret
276*c0909341SAndroid Build Coastguard Worker640:    // width 64, 128 bytes per row
277*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
278*c0909341SAndroid Build Coastguard Worker64:
279*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
280*c0909341SAndroid Build Coastguard Worker        str             q3,  [x0, #16]               // row 0, bytes 16..31
281*c0909341SAndroid Build Coastguard Worker        str             q2,  [x6, #16]
282*c0909341SAndroid Build Coastguard Worker        stp             q3,  q3,  [x0, #32]          // row 0, bytes 32..63
283*c0909341SAndroid Build Coastguard Worker        stp             q2,  q2,  [x6, #32]
284*c0909341SAndroid Build Coastguard Worker        stp             q3,  q3,  [x0, #64]          // row 0, bytes 64..95
285*c0909341SAndroid Build Coastguard Worker        stp             q2,  q2,  [x6, #64]
286*c0909341SAndroid Build Coastguard Worker        stp             q3,  q3,  [x0, #96]          // row 0, bytes 96..127
287*c0909341SAndroid Build Coastguard Worker        stp             q2,  q2,  [x6, #96]
288*c0909341SAndroid Build Coastguard Worker        st1             {v3.8h}, [x0], x1            // row 0, bytes 0..15, then on to row 2
289*c0909341SAndroid Build Coastguard Worker        st1             {v2.8h}, [x6], x1
290*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
291*c0909341SAndroid Build Coastguard Worker        str             q1,  [x0, #16]
292*c0909341SAndroid Build Coastguard Worker        str             q0,  [x6, #16]
293*c0909341SAndroid Build Coastguard Worker        stp             q1,  q1,  [x0, #32]
294*c0909341SAndroid Build Coastguard Worker        stp             q0,  q0,  [x6, #32]
295*c0909341SAndroid Build Coastguard Worker        stp             q1,  q1,  [x0, #64]
296*c0909341SAndroid Build Coastguard Worker        stp             q0,  q0,  [x6, #64]
297*c0909341SAndroid Build Coastguard Worker        stp             q1,  q1,  [x0, #96]
298*c0909341SAndroid Build Coastguard Worker        stp             q0,  q0,  [x6, #96]
299*c0909341SAndroid Build Coastguard Worker        st1             {v1.8h}, [x0], x1
300*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h}, [x6], x1
301*c0909341SAndroid Build Coastguard Worker        b.gt            64b
302*c0909341SAndroid Build Coastguard Worker        ret
303*c0909341SAndroid Build Coastguard Workerendfunc
304*c0909341SAndroid Build Coastguard Worker
305*c0909341SAndroid Build Coastguard Workerjumptable ipred_h_tbl
        // Dispatch table for ipred_h_16bpc_neon. Each entry is a 32-bit
        // offset from the start of this table to a width-specific entry
        // label. The function loads it with ldrsw and adds the table base.
        // Entries are indexed by clz(width) - 25, so the widest block comes
        // first. The NNNb references bind to the nearest preceding
        // definition of each numeric label, i.e. those in the function
        // directly above.
306*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_h_tbl   // index 0, width 64
307*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_h_tbl   // index 1, width 32
308*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_h_tbl   // index 2, width 16
309*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_h_tbl   // index 3, width 8
310*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_h_tbl   // index 4, width 4
311*c0909341SAndroid Build Coastguard Workerendjumptable
312*c0909341SAndroid Build Coastguard Worker
313*c0909341SAndroid Build Coastguard Worker// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
314*c0909341SAndroid Build Coastguard Worker//                              const pixel *const topleft,
315*c0909341SAndroid Build Coastguard Worker//                              const int width, const int height, const int a,
316*c0909341SAndroid Build Coastguard Worker//                              const int max_width, const int max_height);
317*c0909341SAndroid Build Coastguard Workerfunction ipred_dc_top_16bpc_neon, export=1
318*c0909341SAndroid Build Coastguard Worker        clz             w3,  w3
319*c0909341SAndroid Build Coastguard Worker        movrel          x5,  ipred_dc_top_tbl
320*c0909341SAndroid Build Coastguard Worker        sub             w3,  w3,  #25
321*c0909341SAndroid Build Coastguard Worker        ldrsw           x3,  [x5, w3, uxtw #2]
322*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #2
323*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x3
324*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1
325*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
326*c0909341SAndroid Build Coastguard Worker        br              x5
327*c0909341SAndroid Build Coastguard Worker40:
328*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
329*c0909341SAndroid Build Coastguard Worker        ld1             {v0.4h},  [x2]
330*c0909341SAndroid Build Coastguard Worker        addv            h0,      v0.4h
331*c0909341SAndroid Build Coastguard Worker        urshr           v0.4h,   v0.4h,   #2
332*c0909341SAndroid Build Coastguard Worker        dup             v0.4h,   v0.h[0]
333*c0909341SAndroid Build Coastguard Worker4:
334*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h},  [x0], x1
335*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h},  [x6], x1
336*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
337*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h},  [x0], x1
338*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h},  [x6], x1
339*c0909341SAndroid Build Coastguard Worker        b.gt            4b
340*c0909341SAndroid Build Coastguard Worker        ret
341*c0909341SAndroid Build Coastguard Worker80:
342*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
343*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h},  [x2]
344*c0909341SAndroid Build Coastguard Worker        addv            h0,      v0.8h
345*c0909341SAndroid Build Coastguard Worker        urshr           v0.4h,   v0.4h,   #3
346*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
347*c0909341SAndroid Build Coastguard Worker8:
348*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h},  [x0], x1
349*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h},  [x6], x1
350*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
351*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h},  [x0], x1
352*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h},  [x6], x1
353*c0909341SAndroid Build Coastguard Worker        b.gt            8b
354*c0909341SAndroid Build Coastguard Worker        ret
355*c0909341SAndroid Build Coastguard Worker160:
356*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
357*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x2]
358*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v1.8h
359*c0909341SAndroid Build Coastguard Worker        addv            h0,      v0.8h
360*c0909341SAndroid Build Coastguard Worker        urshr           v2.4h,   v0.4h,   #4
361*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v2.h[0]
362*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   v2.h[0]
363*c0909341SAndroid Build Coastguard Worker16:
364*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], x1
365*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x6], x1
366*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
367*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], x1
368*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x6], x1
369*c0909341SAndroid Build Coastguard Worker        b.gt            16b
370*c0909341SAndroid Build Coastguard Worker        ret
371*c0909341SAndroid Build Coastguard Worker320:
372*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
373*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
374*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v1.8h
375*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v3.8h
376*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v2.8h
377*c0909341SAndroid Build Coastguard Worker        uaddlv          s0,      v0.8h
378*c0909341SAndroid Build Coastguard Worker        rshrn           v4.4h,   v0.4s,   #5
379*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v4.h[0]
380*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   v4.h[0]
381*c0909341SAndroid Build Coastguard Worker        dup             v2.8h,   v4.h[0]
382*c0909341SAndroid Build Coastguard Worker        dup             v3.8h,   v4.h[0]
383*c0909341SAndroid Build Coastguard Worker32:
384*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
385*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
386*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
387*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
388*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
389*c0909341SAndroid Build Coastguard Worker        b.gt            32b
390*c0909341SAndroid Build Coastguard Worker        ret
391*c0909341SAndroid Build Coastguard Worker640:
392*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
393*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
394*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v1.8h
395*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
396*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v3.8h
397*c0909341SAndroid Build Coastguard Worker        addp            v4.8h,   v4.8h,   v5.8h
398*c0909341SAndroid Build Coastguard Worker        addp            v6.8h,   v6.8h,   v7.8h
399*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v2.8h
400*c0909341SAndroid Build Coastguard Worker        addp            v4.8h,   v4.8h,   v6.8h
401*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v4.8h
402*c0909341SAndroid Build Coastguard Worker        uaddlv          s0,      v0.8h
403*c0909341SAndroid Build Coastguard Worker        rshrn           v4.4h,   v0.4s,   #6
404*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  #64
405*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v4.h[0]
406*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   v4.h[0]
407*c0909341SAndroid Build Coastguard Worker        dup             v2.8h,   v4.h[0]
408*c0909341SAndroid Build Coastguard Worker        dup             v3.8h,   v4.h[0]
409*c0909341SAndroid Build Coastguard Worker64:
410*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
411*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
412*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
413*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
414*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
415*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
416*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
417*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
418*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
419*c0909341SAndroid Build Coastguard Worker        b.gt            64b
420*c0909341SAndroid Build Coastguard Worker        ret
421*c0909341SAndroid Build Coastguard Workerendfunc
422*c0909341SAndroid Build Coastguard Worker
423*c0909341SAndroid Build Coastguard Workerjumptable ipred_dc_top_tbl
// Jump table of 32-bit entries. Each entry is the distance from the start of
// this table to one of the numeric local labels 640/320/160/80/40; the `b`
// suffix selects the nearest definition of that label above this point.
// NOTE(review): the code that indexes this table is outside this view;
// presumably the entries are ordered for widths 64, 32, 16, 8, 4 - confirm
// against the dispatch code at the top of the function.
424*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_dc_top_tbl
425*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_dc_top_tbl
426*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_dc_top_tbl
427*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_dc_top_tbl
428*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_dc_top_tbl
429*c0909341SAndroid Build Coastguard Workerendjumptable
430*c0909341SAndroid Build Coastguard Worker
431*c0909341SAndroid Build Coastguard Worker// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
432*c0909341SAndroid Build Coastguard Worker//                               const pixel *const topleft,
433*c0909341SAndroid Build Coastguard Worker//                               const int width, const int height, const int a,
434*c0909341SAndroid Build Coastguard Worker//                               const int max_width, const int max_height);
435*c0909341SAndroid Build Coastguard Workerfunction ipred_dc_left_16bpc_neon, export=1
// Fills a width x height block of 16-bit pixels with one value: the rounded
// average of the `height` pixels stored immediately before `topleft`.
//
// Register use (arguments arrive in the order of the C prototype):
//   x0 = dst            x1 = stride in bytes (doubled in the prologue)
//   x2 = topleft        w3 = width           w4 = height
//   x3 = width-specific store routine, x5 = height-specific summing routine
//   x6 = dst + stride (pointer used for every second row)
//
// Flow: the prologue looks both routines up in ipred_dc_left_tbl and branches
// to the height routine, which computes the average, broadcasts it into v0
// and branches to the width routine via x3. The width routine stores four
// rows per loop pass, so height is assumed to be a multiple of 4.
// a, max_width and max_height are not read by this function.
436*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  w4, uxtw #1    // x2 = topleft - height * 2 bytes
437*c0909341SAndroid Build Coastguard Worker        clz             w3,  w3                  // leading zero count of width
438*c0909341SAndroid Build Coastguard Worker        clz             w7,  w4                  // leading zero count of height
439*c0909341SAndroid Build Coastguard Worker        movrel          x5,  ipred_dc_left_tbl   // x5 = address of the jump table
440*c0909341SAndroid Build Coastguard Worker        sub             w3,  w3,  #20 // 25 leading bits, minus table offset 5
441*c0909341SAndroid Build Coastguard Worker        sub             w7,  w7,  #25            // height index: 64 -> 0 ... 4 -> 4
442*c0909341SAndroid Build Coastguard Worker        ldrsw           x3,  [x5, w3, uxtw #2]   // signed 32-bit offset of the width routine
443*c0909341SAndroid Build Coastguard Worker        ldrsw           x7,  [x5, w7, uxtw #2]   // signed 32-bit offset of the height routine
444*c0909341SAndroid Build Coastguard Worker        add             x3,  x5,  x3             // x3 = table base + offset = width routine
445*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x7             // x5 = table base + offset = height routine
446*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1             // x6 = dst + stride (second row)
447*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1             // x1 = 2 * stride: x0 and x6 each take every other row
448*c0909341SAndroid Build Coastguard Worker        br              x5                       // to the height routine, which ends with br x3
449*c0909341SAndroid Build Coastguard Worker
450*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_left_h4):                             // height 4: average of 4 pixels
451*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET                // macro defined elsewhere; presumably an indirect-branch landing pad
452*c0909341SAndroid Build Coastguard Worker        ld1             {v0.4h},  [x2]           // load the 4 pixels
453*c0909341SAndroid Build Coastguard Worker        addv            h0,      v0.4h           // h0 = sum of the 4 lanes
454*c0909341SAndroid Build Coastguard Worker        urshr           v0.4h,   v0.4h,   #2     // rounding shift right by 2 = divide by 4
455*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]         // broadcast the average to all 8 lanes
456*c0909341SAndroid Build Coastguard Worker        br              x3                       // continue in the width routine
457*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_left_w4):                             // width 4: one 8-byte store per row
458*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
459*c0909341SAndroid Build Coastguard Worker1:
460*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h},  [x0], x1       // row 0 of this group; x0 += 2 * stride
461*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h},  [x6], x1       // row 1 of this group; x6 += 2 * stride
462*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4             // 4 rows are written per pass
463*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h},  [x0], x1       // row 2
464*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h},  [x6], x1       // row 3
465*c0909341SAndroid Build Coastguard Worker        b.gt            1b                       // loop while rows remain
466*c0909341SAndroid Build Coastguard Worker        ret
467*c0909341SAndroid Build Coastguard Worker
468*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_left_h8):                             // height 8: average of 8 pixels
469*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
470*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h},  [x2]           // load the 8 pixels
471*c0909341SAndroid Build Coastguard Worker        addv            h0,      v0.8h           // h0 = sum of the 8 lanes (16-bit result)
472*c0909341SAndroid Build Coastguard Worker        urshr           v0.4h,   v0.4h,   #3     // rounding shift right by 3 = divide by 8
473*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]         // broadcast the average
474*c0909341SAndroid Build Coastguard Worker        br              x3
475*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_left_w8):                             // width 8: one 16-byte store per row
476*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
477*c0909341SAndroid Build Coastguard Worker1:
478*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h},  [x0], x1       // even rows go through x0
479*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h},  [x6], x1       // odd rows go through x6
480*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4             // 4 rows per pass
481*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h},  [x0], x1
482*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h},  [x6], x1
483*c0909341SAndroid Build Coastguard Worker        b.gt            1b
484*c0909341SAndroid Build Coastguard Worker        ret
485*c0909341SAndroid Build Coastguard Worker
486*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_left_h16):                            // height 16: average of 16 pixels
487*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
488*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x2]     // load the 16 pixels
489*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v1.8h  // pairwise add: 16 pixels -> 8 partial sums
490*c0909341SAndroid Build Coastguard Worker        addv            h0,      v0.8h           // h0 = total, kept in 16 bits
491*c0909341SAndroid Build Coastguard Worker        urshr           v2.4h,   v0.4h,   #4     // rounding shift right by 4 = divide by 16
492*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v2.h[0]         // broadcast the average into v0
493*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   v2.h[0]         // and into v1
494*c0909341SAndroid Build Coastguard Worker        br              x3
495*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_left_w16):                            // width 16: one 32-byte store per row
496*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
497*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v0.16b          // v1 = v0: most height routines only fill v0
498*c0909341SAndroid Build Coastguard Worker1:
499*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], x1
500*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x6], x1
501*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4             // 4 rows per pass
502*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], x1
503*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x6], x1
504*c0909341SAndroid Build Coastguard Worker        b.gt            1b
505*c0909341SAndroid Build Coastguard Worker        ret
506*c0909341SAndroid Build Coastguard Worker
507*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_left_h32):                            // height 32: average of 32 pixels
508*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
509*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] // load the 32 pixels
510*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v1.8h  // pixels 0-15  -> 8 sums of 2
511*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v3.8h  // pixels 16-31 -> 8 sums of 2
512*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v2.8h  // 8 lanes, each the sum of 4 pixels
513*c0909341SAndroid Build Coastguard Worker        uaddlp          v0.4s,   v0.8h           // widening pairwise add: 4 lanes of 32 bits
514*c0909341SAndroid Build Coastguard Worker        addv            s0,      v0.4s           // s0 = total as a 32-bit value
515*c0909341SAndroid Build Coastguard Worker        rshrn           v4.4h,   v0.4s,   #5     // rounding shift right by 5, narrowed to 16 bits
516*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v4.h[0]         // broadcast the average into v0 only
517*c0909341SAndroid Build Coastguard Worker        br              x3
518*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_left_w32):                            // width 32: one 64-byte store per row
519*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
520*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v0.16b          // replicate v0 into v1-v3 for the
521*c0909341SAndroid Build Coastguard Worker        mov             v2.16b,  v0.16b          // four-register stores below
522*c0909341SAndroid Build Coastguard Worker        mov             v3.16b,  v0.16b
523*c0909341SAndroid Build Coastguard Worker1:
524*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
525*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
526*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4             // 4 rows per pass
527*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
528*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
529*c0909341SAndroid Build Coastguard Worker        b.gt            1b
530*c0909341SAndroid Build Coastguard Worker        ret
531*c0909341SAndroid Build Coastguard Worker
532*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_left_h64):                            // height 64: average of 64 pixels
533*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
534*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 // pixels 0-31; x2 += 64 bytes
535*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v1.8h  // starts reducing while the second load is issued
536*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]      // pixels 32-63
537*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v3.8h
538*c0909341SAndroid Build Coastguard Worker        addp            v4.8h,   v4.8h,   v5.8h
539*c0909341SAndroid Build Coastguard Worker        addp            v6.8h,   v6.8h,   v7.8h  // v0/v2/v4/v6: sums of 2 pixels per lane
540*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v2.8h
541*c0909341SAndroid Build Coastguard Worker        addp            v4.8h,   v4.8h,   v6.8h  // v0/v4: sums of 4 pixels per lane
542*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v4.8h  // v0: sums of 8 pixels per 16-bit lane
543*c0909341SAndroid Build Coastguard Worker        uaddlv          s0,      v0.8h           // widening sum across lanes: s0 = 32-bit total
544*c0909341SAndroid Build Coastguard Worker        rshrn           v4.4h,   v0.4s,   #6     // rounding shift right by 6, narrowed to 16 bits
545*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v4.h[0]         // broadcast the average into v0 only
546*c0909341SAndroid Build Coastguard Worker        br              x3
547*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_left_w64):                            // width 64: two 64-byte stores per row
548*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
549*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v0.16b          // replicate v0 into v1-v3
550*c0909341SAndroid Build Coastguard Worker        mov             v2.16b,  v0.16b
551*c0909341SAndroid Build Coastguard Worker        mov             v3.16b,  v0.16b
552*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  #64            // first half-row store already advances by 64 bytes
553*c0909341SAndroid Build Coastguard Worker1:
554*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 // row 0, first 32 pixels
555*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 // row 1, first 32 pixels
556*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1  // row 0, last 32 pixels; on to row 2
557*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1  // row 1, last 32 pixels; on to row 3
558*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4             // 4 rows per pass
559*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 // row 2
560*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64 // row 3
561*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
562*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
563*c0909341SAndroid Build Coastguard Worker        b.gt            1b                       // loop while rows remain
564*c0909341SAndroid Build Coastguard Worker        ret
565*c0909341SAndroid Build Coastguard Workerendfunc
566*c0909341SAndroid Build Coastguard Worker
567*c0909341SAndroid Build Coastguard Workerjumptable ipred_dc_left_tbl
// Jump table for ipred_dc_left_16bpc_neon. Each 32-bit entry is the distance
// from the start of this table to a routine; the function loads it with ldrsw
// and adds the table address. The order is fixed by the index arithmetic
// there: entries 0-4 are selected by clz(height) - 25, entries 5-9 by
// clz(width) - 20.
568*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_left_h64) - ipred_dc_left_tbl // index 0: height 64
569*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_left_h32) - ipred_dc_left_tbl // index 1: height 32
570*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_left_h16) - ipred_dc_left_tbl // index 2: height 16
571*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_left_h8)  - ipred_dc_left_tbl // index 3: height 8
572*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_left_h4)  - ipred_dc_left_tbl // index 4: height 4
573*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_left_w64) - ipred_dc_left_tbl // index 5: width 64
574*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_left_w32) - ipred_dc_left_tbl // index 6: width 32
575*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_left_w16) - ipred_dc_left_tbl // index 7: width 16
576*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_left_w8)  - ipred_dc_left_tbl // index 8: width 8
577*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_left_w4)  - ipred_dc_left_tbl // index 9: width 4
578*c0909341SAndroid Build Coastguard Workerendjumptable
579*c0909341SAndroid Build Coastguard Worker
580*c0909341SAndroid Build Coastguard Worker// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
581*c0909341SAndroid Build Coastguard Worker//                          const pixel *const topleft,
582*c0909341SAndroid Build Coastguard Worker//                          const int width, const int height, const int a,
583*c0909341SAndroid Build Coastguard Worker//                          const int max_width, const int max_height);
584*c0909341SAndroid Build Coastguard Workerfunction ipred_dc_16bpc_neon, export=1
585*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  w4, uxtw #1
586*c0909341SAndroid Build Coastguard Worker        add             w7,  w3,  w4             // width + height
587*c0909341SAndroid Build Coastguard Worker        clz             w3,  w3
588*c0909341SAndroid Build Coastguard Worker        clz             w6,  w4
589*c0909341SAndroid Build Coastguard Worker        dup             v16.4s, w7               // width + height
590*c0909341SAndroid Build Coastguard Worker        movrel          x5,  ipred_dc_tbl
591*c0909341SAndroid Build Coastguard Worker        rbit            w7,  w7                  // rbit(width + height)
592*c0909341SAndroid Build Coastguard Worker        sub             w3,  w3,  #20            // 25 leading bits, minus table offset 5
593*c0909341SAndroid Build Coastguard Worker        sub             w6,  w6,  #25
594*c0909341SAndroid Build Coastguard Worker        clz             w7,  w7                  // ctz(width + height)
595*c0909341SAndroid Build Coastguard Worker        ldrsw           x3,  [x5, w3, uxtw #2]
596*c0909341SAndroid Build Coastguard Worker        ldrsw           x6,  [x5, w6, uxtw #2]
597*c0909341SAndroid Build Coastguard Worker        neg             w7,  w7                  // -ctz(width + height)
598*c0909341SAndroid Build Coastguard Worker        add             x3,  x5,  x3
599*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x6
600*c0909341SAndroid Build Coastguard Worker        ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
601*c0909341SAndroid Build Coastguard Worker        dup             v17.4s,  w7              // -ctz(width + height)
602*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1
603*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
604*c0909341SAndroid Build Coastguard Worker        br              x5
605*c0909341SAndroid Build Coastguard Worker
606*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_h4):
607*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
608*c0909341SAndroid Build Coastguard Worker        ld1             {v0.4h},  [x2], #8
609*c0909341SAndroid Build Coastguard Worker        uaddlv          s0,      v0.4h
610*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #2
611*c0909341SAndroid Build Coastguard Worker        br              x3
612*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_w4):
613*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
614*c0909341SAndroid Build Coastguard Worker        ld1             {v1.4h},  [x2]
615*c0909341SAndroid Build Coastguard Worker        add             v0.2s,   v0.2s,   v16.2s
616*c0909341SAndroid Build Coastguard Worker        uaddlv          s1,      v1.4h
617*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #4
618*c0909341SAndroid Build Coastguard Worker        add             v0.2s,   v0.2s,   v1.2s
619*c0909341SAndroid Build Coastguard Worker        ushl            v0.2s,   v0.2s,   v17.2s
620*c0909341SAndroid Build Coastguard Worker        b.eq            1f
621*c0909341SAndroid Build Coastguard Worker        // h = 8/16
622*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #16
623*c0909341SAndroid Build Coastguard Worker        mov             w16, #0x6667
624*c0909341SAndroid Build Coastguard Worker        mov             w17, #0xAAAB
625*c0909341SAndroid Build Coastguard Worker        csel            w16, w16, w17, eq
626*c0909341SAndroid Build Coastguard Worker        dup             v16.2s,  w16
627*c0909341SAndroid Build Coastguard Worker        mul             v0.2s,   v0.2s,   v16.2s
628*c0909341SAndroid Build Coastguard Worker        ushr            v0.2s,   v0.2s,   #17
629*c0909341SAndroid Build Coastguard Worker1:
630*c0909341SAndroid Build Coastguard Worker        dup             v0.4h,   v0.h[0]
631*c0909341SAndroid Build Coastguard Worker2:
632*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h},  [x0], x1
633*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h},  [x6], x1
634*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
635*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h},  [x0], x1
636*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h},  [x6], x1
637*c0909341SAndroid Build Coastguard Worker        b.gt            2b
638*c0909341SAndroid Build Coastguard Worker        ret
639*c0909341SAndroid Build Coastguard Worker
640*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_h8):
641*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
642*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h},  [x2], #16
643*c0909341SAndroid Build Coastguard Worker        uaddlv          s0,      v0.8h
644*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #2
645*c0909341SAndroid Build Coastguard Worker        br              x3
646*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_w8):
647*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
648*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h},  [x2]
649*c0909341SAndroid Build Coastguard Worker        add             v0.2s,   v0.2s,   v16.2s
650*c0909341SAndroid Build Coastguard Worker        uaddlv          s1,      v1.8h
651*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #8
652*c0909341SAndroid Build Coastguard Worker        add             v0.2s,   v0.2s,   v1.2s
653*c0909341SAndroid Build Coastguard Worker        ushl            v0.2s,   v0.2s,   v17.2s
654*c0909341SAndroid Build Coastguard Worker        b.eq            1f
655*c0909341SAndroid Build Coastguard Worker        // h = 4/16/32
656*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #32
657*c0909341SAndroid Build Coastguard Worker        mov             w16, #0x6667
658*c0909341SAndroid Build Coastguard Worker        mov             w17, #0xAAAB
659*c0909341SAndroid Build Coastguard Worker        csel            w16, w16, w17, eq
660*c0909341SAndroid Build Coastguard Worker        dup             v16.2s,  w16
661*c0909341SAndroid Build Coastguard Worker        mul             v0.2s,   v0.2s,   v16.2s
662*c0909341SAndroid Build Coastguard Worker        ushr            v0.2s,   v0.2s,   #17
663*c0909341SAndroid Build Coastguard Worker1:
664*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
665*c0909341SAndroid Build Coastguard Worker2:
666*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h},  [x0], x1
667*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h},  [x6], x1
668*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
669*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h},  [x0], x1
670*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h},  [x6], x1
671*c0909341SAndroid Build Coastguard Worker        b.gt            2b
672*c0909341SAndroid Build Coastguard Worker        ret
673*c0909341SAndroid Build Coastguard Worker
674*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_h16):
675*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
676*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x2], #32
677*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v1.8h
678*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #2
679*c0909341SAndroid Build Coastguard Worker        uaddlv          s0,      v0.8h
680*c0909341SAndroid Build Coastguard Worker        br              x3
681*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_w16):
682*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
683*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h, v2.8h}, [x2]
684*c0909341SAndroid Build Coastguard Worker        add             v0.2s,   v0.2s,   v16.2s
685*c0909341SAndroid Build Coastguard Worker        addp            v1.8h,   v1.8h,   v2.8h
686*c0909341SAndroid Build Coastguard Worker        uaddlv          s1,      v1.8h
687*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #16
688*c0909341SAndroid Build Coastguard Worker        add             v0.2s,   v0.2s,   v1.2s
689*c0909341SAndroid Build Coastguard Worker        ushl            v4.2s,   v0.2s,   v17.2s
690*c0909341SAndroid Build Coastguard Worker        b.eq            1f
691*c0909341SAndroid Build Coastguard Worker        // h = 4/8/32/64
692*c0909341SAndroid Build Coastguard Worker        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
693*c0909341SAndroid Build Coastguard Worker        mov             w16, #0x6667
694*c0909341SAndroid Build Coastguard Worker        mov             w17, #0xAAAB
695*c0909341SAndroid Build Coastguard Worker        csel            w16, w16, w17, eq
696*c0909341SAndroid Build Coastguard Worker        dup             v16.2s,  w16
697*c0909341SAndroid Build Coastguard Worker        mul             v4.2s,   v4.2s,   v16.2s
698*c0909341SAndroid Build Coastguard Worker        ushr            v4.2s,   v4.2s,   #17
699*c0909341SAndroid Build Coastguard Worker1:
700*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v4.h[0]
701*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   v4.h[0]
702*c0909341SAndroid Build Coastguard Worker2:
703*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], x1
704*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x6], x1
705*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
706*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], x1
707*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x6], x1
708*c0909341SAndroid Build Coastguard Worker        b.gt            2b
709*c0909341SAndroid Build Coastguard Worker        ret
710*c0909341SAndroid Build Coastguard Worker
711*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_h32):
712*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
713*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
714*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v1.8h
715*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v3.8h
716*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v2.8h
717*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #2
718*c0909341SAndroid Build Coastguard Worker        uaddlv          s0,      v0.8h
719*c0909341SAndroid Build Coastguard Worker        br              x3
720*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_w32):
721*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
722*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
723*c0909341SAndroid Build Coastguard Worker        add             v0.2s,   v0.2s,   v16.2s
724*c0909341SAndroid Build Coastguard Worker        addp            v1.8h,   v1.8h,   v2.8h
725*c0909341SAndroid Build Coastguard Worker        addp            v3.8h,   v3.8h,   v4.8h
726*c0909341SAndroid Build Coastguard Worker        addp            v1.8h,   v1.8h,   v3.8h
727*c0909341SAndroid Build Coastguard Worker        uaddlv          s1,      v1.8h
728*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #32
729*c0909341SAndroid Build Coastguard Worker        add             v0.2s,   v0.2s,   v1.2s
730*c0909341SAndroid Build Coastguard Worker        ushl            v4.2s,   v0.2s,   v17.2s
731*c0909341SAndroid Build Coastguard Worker        b.eq            1f
732*c0909341SAndroid Build Coastguard Worker        // h = 8/16/64
733*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #8
734*c0909341SAndroid Build Coastguard Worker        mov             w16, #0x6667
735*c0909341SAndroid Build Coastguard Worker        mov             w17, #0xAAAB
736*c0909341SAndroid Build Coastguard Worker        csel            w16, w16, w17, eq
737*c0909341SAndroid Build Coastguard Worker        dup             v16.2s,  w16
738*c0909341SAndroid Build Coastguard Worker        mul             v4.2s,   v4.2s,   v16.2s
739*c0909341SAndroid Build Coastguard Worker        ushr            v4.2s,   v4.2s,   #17
740*c0909341SAndroid Build Coastguard Worker1:
741*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v4.h[0]
742*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   v4.h[0]
743*c0909341SAndroid Build Coastguard Worker        dup             v2.8h,   v4.h[0]
744*c0909341SAndroid Build Coastguard Worker        dup             v3.8h,   v4.h[0]
745*c0909341SAndroid Build Coastguard Worker2:
746*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
747*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
748*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
749*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
750*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
751*c0909341SAndroid Build Coastguard Worker        b.gt            2b
752*c0909341SAndroid Build Coastguard Worker        ret
753*c0909341SAndroid Build Coastguard Worker
754*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_h64):
755*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
756*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
757*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v1.8h
758*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
759*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v3.8h
760*c0909341SAndroid Build Coastguard Worker        addp            v4.8h,   v4.8h,   v5.8h
761*c0909341SAndroid Build Coastguard Worker        addp            v6.8h,   v6.8h,   v7.8h
762*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v2.8h
763*c0909341SAndroid Build Coastguard Worker        addp            v4.8h,   v4.8h,   v6.8h
764*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v4.8h
765*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #2
766*c0909341SAndroid Build Coastguard Worker        uaddlv          s0,      v0.8h
767*c0909341SAndroid Build Coastguard Worker        br              x3
768*c0909341SAndroid Build Coastguard WorkerL(ipred_dc_w64):
769*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
770*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64
771*c0909341SAndroid Build Coastguard Worker        add             v0.2s,   v0.2s,   v16.2s
772*c0909341SAndroid Build Coastguard Worker        addp            v1.8h,   v1.8h,   v2.8h
773*c0909341SAndroid Build Coastguard Worker        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2]
774*c0909341SAndroid Build Coastguard Worker        addp            v3.8h,   v3.8h,   v4.8h
775*c0909341SAndroid Build Coastguard Worker        addp            v20.8h,  v20.8h,  v21.8h
776*c0909341SAndroid Build Coastguard Worker        addp            v22.8h,  v22.8h,  v23.8h
777*c0909341SAndroid Build Coastguard Worker        addp            v1.8h,   v1.8h,   v3.8h
778*c0909341SAndroid Build Coastguard Worker        addp            v20.8h,  v20.8h,  v22.8h
779*c0909341SAndroid Build Coastguard Worker        addp            v1.8h,   v1.8h,   v20.8h
780*c0909341SAndroid Build Coastguard Worker        uaddlv          s1,      v1.8h
781*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #64
782*c0909341SAndroid Build Coastguard Worker        add             v0.2s,   v0.2s,   v1.2s
783*c0909341SAndroid Build Coastguard Worker        ushl            v4.2s,   v0.2s,   v17.2s
784*c0909341SAndroid Build Coastguard Worker        b.eq            1f
785*c0909341SAndroid Build Coastguard Worker        // h = 16/32
786*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #16
787*c0909341SAndroid Build Coastguard Worker        mov             w16, #0x6667
788*c0909341SAndroid Build Coastguard Worker        mov             w17, #0xAAAB
789*c0909341SAndroid Build Coastguard Worker        csel            w16, w16, w17, eq
790*c0909341SAndroid Build Coastguard Worker        dup             v16.2s,  w16
791*c0909341SAndroid Build Coastguard Worker        mul             v4.2s,   v4.2s,   v16.2s
792*c0909341SAndroid Build Coastguard Worker        ushr            v4.2s,   v4.2s,   #17
793*c0909341SAndroid Build Coastguard Worker1:
794*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  #64
795*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v4.h[0]
796*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   v4.h[0]
797*c0909341SAndroid Build Coastguard Worker        dup             v2.8h,   v4.h[0]
798*c0909341SAndroid Build Coastguard Worker        dup             v3.8h,   v4.h[0]
799*c0909341SAndroid Build Coastguard Worker2:
800*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
801*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
802*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
803*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
804*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
805*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
806*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
807*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
808*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
809*c0909341SAndroid Build Coastguard Worker        b.gt            2b
810*c0909341SAndroid Build Coastguard Worker        ret
811*c0909341SAndroid Build Coastguard Workerendfunc
812*c0909341SAndroid Build Coastguard Worker
813*c0909341SAndroid Build Coastguard Workerjumptable ipred_dc_tbl
        // Branch targets of the ipred_dc function, each stored as a 32-bit
        // offset relative to the start of this table. The five height
        // handlers (h64 down to h4) come first, followed by the five width
        // handlers (w64 down to w4).
        // NOTE(review): the code that indexes this table lies outside this
        // excerpt - confirm the index computation against the function entry.
814*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_h64) - ipred_dc_tbl
815*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_h32) - ipred_dc_tbl
816*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_h16) - ipred_dc_tbl
817*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_h8)  - ipred_dc_tbl
818*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_h4)  - ipred_dc_tbl
819*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_w64) - ipred_dc_tbl
820*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_w32) - ipred_dc_tbl
821*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_w16) - ipred_dc_tbl
822*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_w8)  - ipred_dc_tbl
823*c0909341SAndroid Build Coastguard Worker        .word L(ipred_dc_w4)  - ipred_dc_tbl
824*c0909341SAndroid Build Coastguard Workerendjumptable
825*c0909341SAndroid Build Coastguard Worker
826*c0909341SAndroid Build Coastguard Worker// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
827*c0909341SAndroid Build Coastguard Worker//                             const pixel *const topleft,
828*c0909341SAndroid Build Coastguard Worker//                             const int width, const int height, const int a,
829*c0909341SAndroid Build Coastguard Worker//                             const int max_width, const int max_height);
//
// Paeth intra prediction on 16-bit pixels. For every output pixel:
//   base   = left + top - topleft
//   ldiff  = |left - base|, tdiff = |top - base|, tldiff = |topleft - base|
//   result = ldiff <= min(tdiff, tldiff) ? left
//          : tdiff <= tldiff             ? top : topleft
//
// In (AAPCS64):
//   x0 = dst, x1 = stride in bytes, x2 = topleft, w3 = width, w4 = height.
//   The remaining arguments are not read; x5-x7 are overwritten as scratch.
// Edge layout as accessed here: the top row starts at topleft[1], the left
// column is read backwards from topleft[-1] (topleft[-1] feeds the first row).
// Four rows are produced per iteration of the outer loop.
830*c0909341SAndroid Build Coastguard Workerfunction ipred_paeth_16bpc_neon, export=1
831*c0909341SAndroid Build Coastguard Worker        clz             w9,  w3                   // 25 for width 64 ... 29 for width 4
832*c0909341SAndroid Build Coastguard Worker        movrel          x5,  ipred_paeth_tbl
833*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #25             // table index: 0 (w=64) ... 4 (w=4)
834*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x5, w9, uxtw #2]    // signed 32-bit offset of the handler
835*c0909341SAndroid Build Coastguard Worker        ld1r            {v4.8h},  [x2]            // v4 = topleft pixel in every lane
836*c0909341SAndroid Build Coastguard Worker        add             x8,  x2,  #2              // x8 = &topleft[1], start of the top row
837*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #8              // x2 = &topleft[-4], four left pixels
838*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x9              // x5 = table base + offset = handler
839*c0909341SAndroid Build Coastguard Worker        mov             x7,  #-8                  // post-index step: back four left pixels
840*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1              // x6 = dst + stride (second row)
841*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // x1 = 2*stride
842*c0909341SAndroid Build Coastguard Worker        br              x5
843*c0909341SAndroid Build Coastguard Worker40:  // width 4
844*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
845*c0909341SAndroid Build Coastguard Worker        ld1r            {v5.2d},  [x8]            // four top pixels, copied to both halves
846*c0909341SAndroid Build Coastguard Worker        sub             v6.8h,   v5.8h,   v4.8h   // top - topleft
847*c0909341SAndroid Build Coastguard Worker4:  // four rows per iteration, two rows packed per register
848*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7  // v0..v3 = topleft[-4..-1], broadcast
849*c0909341SAndroid Build Coastguard Worker        zip1            v0.2d,   v0.2d,   v1.2d   // v0 = { topleft[-4], topleft[-3] }
850*c0909341SAndroid Build Coastguard Worker        zip1            v2.2d,   v2.2d,   v3.2d   // v2 = { topleft[-2], topleft[-1] }
851*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v6.8h,   v0.8h   // base
852*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v6.8h,   v2.8h
853*c0909341SAndroid Build Coastguard Worker        sabd            v20.8h,  v5.8h,   v16.8h  // tdiff
854*c0909341SAndroid Build Coastguard Worker        sabd            v21.8h,  v5.8h,   v17.8h
855*c0909341SAndroid Build Coastguard Worker        sabd            v22.8h,  v4.8h,   v16.8h  // tldiff
856*c0909341SAndroid Build Coastguard Worker        sabd            v23.8h,  v4.8h,   v17.8h
857*c0909341SAndroid Build Coastguard Worker        sabd            v16.8h,  v0.8h,   v16.8h  // ldiff
858*c0909341SAndroid Build Coastguard Worker        sabd            v17.8h,  v2.8h,   v17.8h
859*c0909341SAndroid Build Coastguard Worker        umin            v18.8h,  v20.8h,  v22.8h  // min(tdiff, tldiff)
860*c0909341SAndroid Build Coastguard Worker        umin            v19.8h,  v21.8h,  v23.8h
861*c0909341SAndroid Build Coastguard Worker        cmge            v20.8h,  v22.8h,  v20.8h  // tldiff >= tdiff
862*c0909341SAndroid Build Coastguard Worker        cmge            v21.8h,  v23.8h,  v21.8h
863*c0909341SAndroid Build Coastguard Worker        cmge            v16.8h,  v18.8h,  v16.8h  // min(tdiff, tldiff) >= ldiff
864*c0909341SAndroid Build Coastguard Worker        cmge            v17.8h,  v19.8h,  v17.8h
865*c0909341SAndroid Build Coastguard Worker        bsl             v21.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
866*c0909341SAndroid Build Coastguard Worker        bsl             v20.16b, v5.16b,  v4.16b
867*c0909341SAndroid Build Coastguard Worker        bit             v21.16b, v2.16b,  v17.16b // ldiff <= min ? left : ...
868*c0909341SAndroid Build Coastguard Worker        bit             v20.16b, v0.16b,  v16.16b
869*c0909341SAndroid Build Coastguard Worker        st1             {v21.d}[1], [x0], x1      // row predicted from topleft[-1]
870*c0909341SAndroid Build Coastguard Worker        st1             {v21.d}[0], [x6], x1      // row predicted from topleft[-2]
871*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4              // four rows done
872*c0909341SAndroid Build Coastguard Worker        st1             {v20.d}[1], [x0], x1      // row predicted from topleft[-3]
873*c0909341SAndroid Build Coastguard Worker        st1             {v20.d}[0], [x6], x1      // row predicted from topleft[-4]
874*c0909341SAndroid Build Coastguard Worker        b.gt            4b
875*c0909341SAndroid Build Coastguard Worker        ret
876*c0909341SAndroid Build Coastguard Worker80:  // widths 8, 16, 32 and 64 share one handler
877*c0909341SAndroid Build Coastguard Worker160:
878*c0909341SAndroid Build Coastguard Worker320:
879*c0909341SAndroid Build Coastguard Worker640:
880*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
881*c0909341SAndroid Build Coastguard Worker        ld1             {v5.8h},  [x8], #16       // first eight top pixels
882*c0909341SAndroid Build Coastguard Worker        mov             w9,  w3                   // keep the width; w3 is the column counter
883*c0909341SAndroid Build Coastguard Worker        // Set up pointers for four rows in parallel; x0, x6, x5, x10
884*c0909341SAndroid Build Coastguard Worker        add             x5,  x0,  x1              // x5  = dst + 2*stride
885*c0909341SAndroid Build Coastguard Worker        add             x10, x6,  x1              // x10 = dst + 3*stride
886*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // x1 = 4*stride
887*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  w3, uxtw #1     // minus the 2*width bytes written per row
888*c0909341SAndroid Build Coastguard Worker1:  // once per group of four rows
889*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7  // v0..v3 = topleft[-4..-1], broadcast
890*c0909341SAndroid Build Coastguard Worker2:  // once per eight columns
891*c0909341SAndroid Build Coastguard Worker        sub             v6.8h,   v5.8h,   v4.8h   // top - topleft
892*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v6.8h,   v0.8h   // base
893*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v6.8h,   v1.8h
894*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v6.8h,   v2.8h
895*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v6.8h,   v3.8h
896*c0909341SAndroid Build Coastguard Worker        sabd            v20.8h,  v5.8h,   v16.8h  // tdiff
897*c0909341SAndroid Build Coastguard Worker        sabd            v21.8h,  v5.8h,   v17.8h
898*c0909341SAndroid Build Coastguard Worker        sabd            v22.8h,  v5.8h,   v18.8h
899*c0909341SAndroid Build Coastguard Worker        sabd            v23.8h,  v5.8h,   v19.8h
900*c0909341SAndroid Build Coastguard Worker        sabd            v24.8h,  v4.8h,   v16.8h  // tldiff
901*c0909341SAndroid Build Coastguard Worker        sabd            v25.8h,  v4.8h,   v17.8h
902*c0909341SAndroid Build Coastguard Worker        sabd            v26.8h,  v4.8h,   v18.8h
903*c0909341SAndroid Build Coastguard Worker        sabd            v27.8h,  v4.8h,   v19.8h
904*c0909341SAndroid Build Coastguard Worker        sabd            v16.8h,  v0.8h,   v16.8h  // ldiff
905*c0909341SAndroid Build Coastguard Worker        sabd            v17.8h,  v1.8h,   v17.8h
906*c0909341SAndroid Build Coastguard Worker        sabd            v18.8h,  v2.8h,   v18.8h
907*c0909341SAndroid Build Coastguard Worker        sabd            v19.8h,  v3.8h,   v19.8h
908*c0909341SAndroid Build Coastguard Worker        umin            v28.8h,  v20.8h,  v24.8h  // min(tdiff, tldiff)
909*c0909341SAndroid Build Coastguard Worker        umin            v29.8h,  v21.8h,  v25.8h
910*c0909341SAndroid Build Coastguard Worker        umin            v30.8h,  v22.8h,  v26.8h
911*c0909341SAndroid Build Coastguard Worker        umin            v31.8h,  v23.8h,  v27.8h
912*c0909341SAndroid Build Coastguard Worker        cmge            v20.8h,  v24.8h,  v20.8h  // tldiff >= tdiff
913*c0909341SAndroid Build Coastguard Worker        cmge            v21.8h,  v25.8h,  v21.8h
914*c0909341SAndroid Build Coastguard Worker        cmge            v22.8h,  v26.8h,  v22.8h
915*c0909341SAndroid Build Coastguard Worker        cmge            v23.8h,  v27.8h,  v23.8h
916*c0909341SAndroid Build Coastguard Worker        cmge            v16.8h,  v28.8h,  v16.8h  // min(tdiff, tldiff) >= ldiff
917*c0909341SAndroid Build Coastguard Worker        cmge            v17.8h,  v29.8h,  v17.8h
918*c0909341SAndroid Build Coastguard Worker        cmge            v18.8h,  v30.8h,  v18.8h
919*c0909341SAndroid Build Coastguard Worker        cmge            v19.8h,  v31.8h,  v19.8h
920*c0909341SAndroid Build Coastguard Worker        bsl             v23.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
921*c0909341SAndroid Build Coastguard Worker        bsl             v22.16b, v5.16b,  v4.16b
922*c0909341SAndroid Build Coastguard Worker        bsl             v21.16b, v5.16b,  v4.16b
923*c0909341SAndroid Build Coastguard Worker        bsl             v20.16b, v5.16b,  v4.16b
924*c0909341SAndroid Build Coastguard Worker        bit             v23.16b, v3.16b,  v19.16b // ldiff <= min ? left : ...
925*c0909341SAndroid Build Coastguard Worker        bit             v22.16b, v2.16b,  v18.16b
926*c0909341SAndroid Build Coastguard Worker        bit             v21.16b, v1.16b,  v17.16b
927*c0909341SAndroid Build Coastguard Worker        bit             v20.16b, v0.16b,  v16.16b
928*c0909341SAndroid Build Coastguard Worker        st1             {v23.8h}, [x0], #16       // row predicted from topleft[-1]
929*c0909341SAndroid Build Coastguard Worker        st1             {v22.8h}, [x6], #16       // row predicted from topleft[-2]
930*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #8              // eight columns done
931*c0909341SAndroid Build Coastguard Worker        st1             {v21.8h}, [x5], #16       // row predicted from topleft[-3]
932*c0909341SAndroid Build Coastguard Worker        st1             {v20.8h}, [x10], #16      // row predicted from topleft[-4]
933*c0909341SAndroid Build Coastguard Worker        b.le            8f
934*c0909341SAndroid Build Coastguard Worker        ld1             {v5.8h},  [x8], #16       // next eight top pixels
935*c0909341SAndroid Build Coastguard Worker        b               2b
936*c0909341SAndroid Build Coastguard Worker8:
937*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4              // four rows done
938*c0909341SAndroid Build Coastguard Worker        b.le            9f
939*c0909341SAndroid Build Coastguard Worker        // End of horizontal loop, move pointers to next four rows
940*c0909341SAndroid Build Coastguard Worker        sub             x8,  x8,  w9, uxtw #1     // rewind the top pointer by 2*width bytes
941*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1
942*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  x1
943*c0909341SAndroid Build Coastguard Worker        // Load the top row as early as possible
944*c0909341SAndroid Build Coastguard Worker        ld1             {v5.8h},  [x8], #16
945*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x1
946*c0909341SAndroid Build Coastguard Worker        add             x10, x10, x1
947*c0909341SAndroid Build Coastguard Worker        mov             w3,  w9                   // restore the column counter
948*c0909341SAndroid Build Coastguard Worker        b               1b
949*c0909341SAndroid Build Coastguard Worker9:
950*c0909341SAndroid Build Coastguard Worker        ret
951*c0909341SAndroid Build Coastguard Workerendfunc
952*c0909341SAndroid Build Coastguard Worker
953*c0909341SAndroid Build Coastguard Workerjumptable ipred_paeth_tbl
        // Handlers of ipred_paeth_16bpc_neon as 32-bit offsets relative to the
        // start of this table. The function indexes it with clz(width) - 25,
        // so entry 0 serves width 64 and entry 4 serves width 4.
954*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_paeth_tbl
955*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_paeth_tbl
956*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_paeth_tbl
957*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_paeth_tbl
958*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_paeth_tbl
959*c0909341SAndroid Build Coastguard Workerendjumptable
960*c0909341SAndroid Build Coastguard Worker
961*c0909341SAndroid Build Coastguard Worker// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
962*c0909341SAndroid Build Coastguard Worker//                              const pixel *const topleft,
963*c0909341SAndroid Build Coastguard Worker//                              const int width, const int height, const int a,
964*c0909341SAndroid Build Coastguard Worker//                              const int max_width, const int max_height);
//
// Smooth intra prediction on 16-bit pixels. With
//   bottom = topleft[-height], right = topleft[width],
//   top[x] = topleft[1 + x],   left[y] = topleft[-1 - y],
// every output pixel is computed as
//   (256*(bottom + right) + (left[y] - right)*weights_hor[x]
//                         + (top[x] - bottom)*weights_ver[y] + 256) >> 9
// where weights_hor = sm_weights + width and weights_ver = sm_weights + height
// are byte weights from the external sm_weights table.
//
// In (AAPCS64):
//   x0 = dst, x1 = stride in bytes, x2 = topleft, w3 = width, w4 = height.
//   The remaining arguments are not read; x5-x7 are overwritten as scratch.
965*c0909341SAndroid Build Coastguard Workerfunction ipred_smooth_16bpc_neon, export=1
966*c0909341SAndroid Build Coastguard Worker        movrel          x10, X(sm_weights)
967*c0909341SAndroid Build Coastguard Worker        add             x11, x10, w4, uxtw        // x11 = &sm_weights[height] (weights_ver)
968*c0909341SAndroid Build Coastguard Worker        add             x10, x10, w3, uxtw        // x10 = &sm_weights[width]  (weights_hor)
969*c0909341SAndroid Build Coastguard Worker        clz             w9,  w3                   // 25 for width 64 ... 29 for width 4
970*c0909341SAndroid Build Coastguard Worker        movrel          x5,  ipred_smooth_tbl
971*c0909341SAndroid Build Coastguard Worker        sub             x12, x2,  w4, uxtw #1     // x12 = &topleft[-height]
972*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #25             // table index: 0 (w=64) ... 4 (w=4)
973*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x5, w9, uxtw #2]    // signed 32-bit offset of the handler
974*c0909341SAndroid Build Coastguard Worker        ld1r            {v4.8h},  [x12] // bottom
975*c0909341SAndroid Build Coastguard Worker        add             x8,  x2,  #2              // x8 = &topleft[1], start of the top row
976*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x9              // x5 = table base + offset = handler
977*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1              // x6 = dst + stride (second row)
978*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // x1 = 2*stride
979*c0909341SAndroid Build Coastguard Worker        br              x5
980*c0909341SAndroid Build Coastguard Worker40:  // width 4
981*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
982*c0909341SAndroid Build Coastguard Worker        ld1r            {v6.2d}, [x8]             // top
983*c0909341SAndroid Build Coastguard Worker        ld1r            {v7.2s}, [x10]            // weights_hor
984*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #8              // x2 = &topleft[-4], four left pixels
985*c0909341SAndroid Build Coastguard Worker        mov             x7,  #-8                  // post-index step: back four left pixels
986*c0909341SAndroid Build Coastguard Worker        dup             v5.8h,   v6.h[3]          // right
987*c0909341SAndroid Build Coastguard Worker        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
988*c0909341SAndroid Build Coastguard Worker        uxtl            v7.8h,   v7.8b            // weights_hor
989*c0909341SAndroid Build Coastguard Worker        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
990*c0909341SAndroid Build Coastguard Worker4:  // four rows per iteration
991*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
992*c0909341SAndroid Build Coastguard Worker        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
993*c0909341SAndroid Build Coastguard Worker        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
994*c0909341SAndroid Build Coastguard Worker        ushll           v21.4s,  v31.4h,  #8
995*c0909341SAndroid Build Coastguard Worker        ushll           v22.4s,  v31.4h,  #8
996*c0909341SAndroid Build Coastguard Worker        ushll           v23.4s,  v31.4h,  #8
997*c0909341SAndroid Build Coastguard Worker        zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
998*c0909341SAndroid Build Coastguard Worker        zip1            v0.2d,   v3.2d,   v2.2d
999*c0909341SAndroid Build Coastguard Worker        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
1000*c0909341SAndroid Build Coastguard Worker        zip1            v18.2s,  v18.2s,  v19.2s
1001*c0909341SAndroid Build Coastguard Worker        sub             v0.8h,   v0.8h,   v5.8h   // left-right
1002*c0909341SAndroid Build Coastguard Worker        sub             v1.8h,   v1.8h,   v5.8h
1003*c0909341SAndroid Build Coastguard Worker        uxtl            v16.8h,  v16.8b           // weights_ver
1004*c0909341SAndroid Build Coastguard Worker        uxtl            v18.8h,  v18.8b
1005*c0909341SAndroid Build Coastguard Worker        smlal           v20.4s,  v0.4h,   v7.4h   // += (left-right)*weights_hor
1006*c0909341SAndroid Build Coastguard Worker        smlal2          v21.4s,  v0.8h,   v7.8h
1007*c0909341SAndroid Build Coastguard Worker        smlal           v22.4s,  v1.4h,   v7.4h
1008*c0909341SAndroid Build Coastguard Worker        smlal2          v23.4s,  v1.8h,   v7.8h
1009*c0909341SAndroid Build Coastguard Worker        smlal           v20.4s,  v6.4h,   v16.4h  // += (top-bottom)*weights_ver
1010*c0909341SAndroid Build Coastguard Worker        smlal2          v21.4s,  v6.8h,   v16.8h
1011*c0909341SAndroid Build Coastguard Worker        smlal           v22.4s,  v6.4h,   v18.4h
1012*c0909341SAndroid Build Coastguard Worker        smlal2          v23.4s,  v6.8h,   v18.8h
1013*c0909341SAndroid Build Coastguard Worker        rshrn           v20.4h,  v20.4s,  #9      // (sum + 256) >> 9, narrowed to 16 bit
1014*c0909341SAndroid Build Coastguard Worker        rshrn           v21.4h,  v21.4s,  #9
1015*c0909341SAndroid Build Coastguard Worker        rshrn           v22.4h,  v22.4s,  #9
1016*c0909341SAndroid Build Coastguard Worker        rshrn           v23.4h,  v23.4s,  #9
1017*c0909341SAndroid Build Coastguard Worker        st1             {v20.4h}, [x0], x1        // row using topleft[-1]
1018*c0909341SAndroid Build Coastguard Worker        st1             {v21.4h}, [x6], x1        // row using topleft[-2]
1019*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4              // four rows done
1020*c0909341SAndroid Build Coastguard Worker        st1             {v22.4h}, [x0], x1        // row using topleft[-3]
1021*c0909341SAndroid Build Coastguard Worker        st1             {v23.4h}, [x6], x1        // row using topleft[-4]
1022*c0909341SAndroid Build Coastguard Worker        b.gt            4b
1023*c0909341SAndroid Build Coastguard Worker        ret
1024*c0909341SAndroid Build Coastguard Worker80:  // width 8
1025*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1026*c0909341SAndroid Build Coastguard Worker        ld1             {v6.8h}, [x8]             // top
1027*c0909341SAndroid Build Coastguard Worker        ld1             {v7.8b}, [x10]            // weights_hor
1028*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #8              // x2 = &topleft[-4], four left pixels
1029*c0909341SAndroid Build Coastguard Worker        mov             x7,  #-8                  // post-index step: back four left pixels
1030*c0909341SAndroid Build Coastguard Worker        dup             v5.8h,   v6.h[7]          // right
1031*c0909341SAndroid Build Coastguard Worker        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
1032*c0909341SAndroid Build Coastguard Worker        uxtl            v7.8h,   v7.8b            // weights_hor
1033*c0909341SAndroid Build Coastguard Worker        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
1034*c0909341SAndroid Build Coastguard Worker8:  // four rows per iteration
1035*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
1036*c0909341SAndroid Build Coastguard Worker        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
1037*c0909341SAndroid Build Coastguard Worker        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
1038*c0909341SAndroid Build Coastguard Worker        ushll           v21.4s,  v31.4h,  #8
1039*c0909341SAndroid Build Coastguard Worker        ushll           v22.4s,  v31.4h,  #8
1040*c0909341SAndroid Build Coastguard Worker        ushll           v23.4s,  v31.4h,  #8
1041*c0909341SAndroid Build Coastguard Worker        ushll           v24.4s,  v31.4h,  #8
1042*c0909341SAndroid Build Coastguard Worker        ushll           v25.4s,  v31.4h,  #8
1043*c0909341SAndroid Build Coastguard Worker        ushll           v26.4s,  v31.4h,  #8
1044*c0909341SAndroid Build Coastguard Worker        ushll           v27.4s,  v31.4h,  #8
1045*c0909341SAndroid Build Coastguard Worker        sub             v0.8h,   v0.8h,   v5.8h   // left-right
1046*c0909341SAndroid Build Coastguard Worker        sub             v1.8h,   v1.8h,   v5.8h
1047*c0909341SAndroid Build Coastguard Worker        sub             v2.8h,   v2.8h,   v5.8h
1048*c0909341SAndroid Build Coastguard Worker        sub             v3.8h,   v3.8h,   v5.8h
1049*c0909341SAndroid Build Coastguard Worker        uxtl            v16.8h,  v16.8b           // weights_ver
1050*c0909341SAndroid Build Coastguard Worker        uxtl            v17.8h,  v17.8b
1051*c0909341SAndroid Build Coastguard Worker        uxtl            v18.8h,  v18.8b
1052*c0909341SAndroid Build Coastguard Worker        uxtl            v19.8h,  v19.8b
1053*c0909341SAndroid Build Coastguard Worker        smlal           v20.4s,  v3.4h,   v7.4h   // += (left-right)*weights_hor
1054*c0909341SAndroid Build Coastguard Worker        smlal2          v21.4s,  v3.8h,   v7.8h   // (left flipped)
1055*c0909341SAndroid Build Coastguard Worker        smlal           v22.4s,  v2.4h,   v7.4h
1056*c0909341SAndroid Build Coastguard Worker        smlal2          v23.4s,  v2.8h,   v7.8h
1057*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v1.4h,   v7.4h
1058*c0909341SAndroid Build Coastguard Worker        smlal2          v25.4s,  v1.8h,   v7.8h
1059*c0909341SAndroid Build Coastguard Worker        smlal           v26.4s,  v0.4h,   v7.4h
1060*c0909341SAndroid Build Coastguard Worker        smlal2          v27.4s,  v0.8h,   v7.8h
1061*c0909341SAndroid Build Coastguard Worker        smlal           v20.4s,  v6.4h,   v16.4h  // += (top-bottom)*weights_ver
1062*c0909341SAndroid Build Coastguard Worker        smlal2          v21.4s,  v6.8h,   v16.8h
1063*c0909341SAndroid Build Coastguard Worker        smlal           v22.4s,  v6.4h,   v17.4h
1064*c0909341SAndroid Build Coastguard Worker        smlal2          v23.4s,  v6.8h,   v17.8h
1065*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v6.4h,   v18.4h
1066*c0909341SAndroid Build Coastguard Worker        smlal2          v25.4s,  v6.8h,   v18.8h
1067*c0909341SAndroid Build Coastguard Worker        smlal           v26.4s,  v6.4h,   v19.4h
1068*c0909341SAndroid Build Coastguard Worker        smlal2          v27.4s,  v6.8h,   v19.8h
1069*c0909341SAndroid Build Coastguard Worker        rshrn           v20.4h,  v20.4s,  #9      // (sum + 256) >> 9, narrowed to 16 bit
1070*c0909341SAndroid Build Coastguard Worker        rshrn2          v20.8h,  v21.4s,  #9
1071*c0909341SAndroid Build Coastguard Worker        rshrn           v21.4h,  v22.4s,  #9
1072*c0909341SAndroid Build Coastguard Worker        rshrn2          v21.8h,  v23.4s,  #9
1073*c0909341SAndroid Build Coastguard Worker        rshrn           v22.4h,  v24.4s,  #9
1074*c0909341SAndroid Build Coastguard Worker        rshrn2          v22.8h,  v25.4s,  #9
1075*c0909341SAndroid Build Coastguard Worker        rshrn           v23.4h,  v26.4s,  #9
1076*c0909341SAndroid Build Coastguard Worker        rshrn2          v23.8h,  v27.4s,  #9
1077*c0909341SAndroid Build Coastguard Worker        st1             {v20.8h}, [x0], x1        // row using topleft[-1]
1078*c0909341SAndroid Build Coastguard Worker        st1             {v21.8h}, [x6], x1        // row using topleft[-2]
1079*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4              // four rows done
1080*c0909341SAndroid Build Coastguard Worker        st1             {v22.8h}, [x0], x1        // row using topleft[-3]
1081*c0909341SAndroid Build Coastguard Worker        st1             {v23.8h}, [x6], x1        // row using topleft[-4]
1082*c0909341SAndroid Build Coastguard Worker        b.gt            8b
1083*c0909341SAndroid Build Coastguard Worker        ret
1084*c0909341SAndroid Build Coastguard Worker160:  // widths 16, 32 and 64 share one handler
1085*c0909341SAndroid Build Coastguard Worker320:
1086*c0909341SAndroid Build Coastguard Worker640:
1087*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1088*c0909341SAndroid Build Coastguard Worker        add             x12, x2,  w3, uxtw #1     // x12 = &topleft[width]
1089*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  w3, uxtw #1     // x1 = 2*stride - 2*width bytes written per row
1090*c0909341SAndroid Build Coastguard Worker        ld1r            {v5.8h}, [x12]            // right
1091*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #4              // x2 = &topleft[-2], two left pixels
1092*c0909341SAndroid Build Coastguard Worker        mov             x7,  #-4                  // post-index step: back two left pixels
1093*c0909341SAndroid Build Coastguard Worker        mov             w9,  w3                   // keep the width; w3 is the column counter
1094*c0909341SAndroid Build Coastguard Worker        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
1095*c0909341SAndroid Build Coastguard Worker
1096*c0909341SAndroid Build Coastguard Worker1:  // once per pair of rows
1097*c0909341SAndroid Build Coastguard Worker        ld2r            {v0.8h, v1.8h},   [x2],  x7 // left
1098*c0909341SAndroid Build Coastguard Worker        ld2r            {v16.8b, v17.8b}, [x11], #2 // weights_ver
1099*c0909341SAndroid Build Coastguard Worker        sub             v0.8h,   v0.8h,   v5.8h   // left-right
1100*c0909341SAndroid Build Coastguard Worker        sub             v1.8h,   v1.8h,   v5.8h
1101*c0909341SAndroid Build Coastguard Worker        uxtl            v16.8h,  v16.8b           // weights_ver
1102*c0909341SAndroid Build Coastguard Worker        uxtl            v17.8h,  v17.8b
1103*c0909341SAndroid Build Coastguard Worker2:  // once per sixteen columns
1104*c0909341SAndroid Build Coastguard Worker        ld1             {v7.16b}, [x10],  #16     // weights_hor
1105*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x8], #32 // top
1106*c0909341SAndroid Build Coastguard Worker        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
1107*c0909341SAndroid Build Coastguard Worker        ushll           v21.4s,  v31.4h,  #8
1108*c0909341SAndroid Build Coastguard Worker        ushll           v22.4s,  v31.4h,  #8
1109*c0909341SAndroid Build Coastguard Worker        ushll           v23.4s,  v31.4h,  #8
1110*c0909341SAndroid Build Coastguard Worker        ushll           v24.4s,  v31.4h,  #8
1111*c0909341SAndroid Build Coastguard Worker        ushll           v25.4s,  v31.4h,  #8
1112*c0909341SAndroid Build Coastguard Worker        ushll           v26.4s,  v31.4h,  #8
1113*c0909341SAndroid Build Coastguard Worker        ushll           v27.4s,  v31.4h,  #8
1114*c0909341SAndroid Build Coastguard Worker        uxtl            v6.8h,   v7.8b            // weights_hor
1115*c0909341SAndroid Build Coastguard Worker        uxtl2           v7.8h,   v7.16b
1116*c0909341SAndroid Build Coastguard Worker        sub             v2.8h,   v2.8h,   v4.8h   // top-bottom
1117*c0909341SAndroid Build Coastguard Worker        sub             v3.8h,   v3.8h,   v4.8h
1118*c0909341SAndroid Build Coastguard Worker        smlal           v20.4s,  v1.4h,   v6.4h   // += (left-right)*weights_hor
1119*c0909341SAndroid Build Coastguard Worker        smlal2          v21.4s,  v1.8h,   v6.8h   // (left flipped)
1120*c0909341SAndroid Build Coastguard Worker        smlal           v22.4s,  v1.4h,   v7.4h
1121*c0909341SAndroid Build Coastguard Worker        smlal2          v23.4s,  v1.8h,   v7.8h
1122*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v0.4h,   v6.4h
1123*c0909341SAndroid Build Coastguard Worker        smlal2          v25.4s,  v0.8h,   v6.8h
1124*c0909341SAndroid Build Coastguard Worker        smlal           v26.4s,  v0.4h,   v7.4h
1125*c0909341SAndroid Build Coastguard Worker        smlal2          v27.4s,  v0.8h,   v7.8h
1126*c0909341SAndroid Build Coastguard Worker        smlal           v20.4s,  v2.4h,   v16.4h  // += (top-bottom)*weights_ver
1127*c0909341SAndroid Build Coastguard Worker        smlal2          v21.4s,  v2.8h,   v16.8h
1128*c0909341SAndroid Build Coastguard Worker        smlal           v22.4s,  v3.4h,   v16.4h
1129*c0909341SAndroid Build Coastguard Worker        smlal2          v23.4s,  v3.8h,   v16.8h
1130*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v2.4h,   v17.4h
1131*c0909341SAndroid Build Coastguard Worker        smlal2          v25.4s,  v2.8h,   v17.8h
1132*c0909341SAndroid Build Coastguard Worker        smlal           v26.4s,  v3.4h,   v17.4h
1133*c0909341SAndroid Build Coastguard Worker        smlal2          v27.4s,  v3.8h,   v17.8h
1134*c0909341SAndroid Build Coastguard Worker        rshrn           v20.4h,  v20.4s,  #9      // (sum + 256) >> 9, narrowed to 16 bit
1135*c0909341SAndroid Build Coastguard Worker        rshrn2          v20.8h,  v21.4s,  #9
1136*c0909341SAndroid Build Coastguard Worker        rshrn           v21.4h,  v22.4s,  #9
1137*c0909341SAndroid Build Coastguard Worker        rshrn2          v21.8h,  v23.4s,  #9
1138*c0909341SAndroid Build Coastguard Worker        rshrn           v22.4h,  v24.4s,  #9
1139*c0909341SAndroid Build Coastguard Worker        rshrn2          v22.8h,  v25.4s,  #9
1140*c0909341SAndroid Build Coastguard Worker        rshrn           v23.4h,  v26.4s,  #9
1141*c0909341SAndroid Build Coastguard Worker        rshrn2          v23.8h,  v27.4s,  #9
1142*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #16             // sixteen columns done
1143*c0909341SAndroid Build Coastguard Worker        st1             {v20.8h, v21.8h}, [x0], #32  // row using topleft[-1] of this pair
1144*c0909341SAndroid Build Coastguard Worker        st1             {v22.8h, v23.8h}, [x6], #32  // row using topleft[-2] of this pair
1145*c0909341SAndroid Build Coastguard Worker        b.gt            2b
1146*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2              // two rows done
1147*c0909341SAndroid Build Coastguard Worker        b.le            9f
1148*c0909341SAndroid Build Coastguard Worker        sub             x8,  x8,  w9, uxtw #1     // rewind the top pointer by 2*width bytes
1149*c0909341SAndroid Build Coastguard Worker        sub             x10, x10, w9, uxtw        // rewind weights_hor by width bytes
1150*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1              // advance both row pointers by two rows
1151*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  x1
1152*c0909341SAndroid Build Coastguard Worker        mov             w3,  w9                   // restore the column counter
1153*c0909341SAndroid Build Coastguard Worker        b               1b
1154*c0909341SAndroid Build Coastguard Worker9:
1155*c0909341SAndroid Build Coastguard Worker        ret
1156*c0909341SAndroid Build Coastguard Workerendfunc
1157*c0909341SAndroid Build Coastguard Worker
// Dispatch table for the function that ends just above. Each entry is a
// 32-bit offset, relative to the start of this table, of one of that
// function's numeric local labels; the "b" suffix refers to the nearest
// preceding definition of the label. Entries run from label 640 down to
// label 40.
// NOTE(review): presumably indexed with clz(width) - 25, like the
// ipred_smooth_v/ipred_smooth_h tables later in this file; confirm against
// the dispatch code of the owning function.
1158*c0909341SAndroid Build Coastguard Workerjumptable ipred_smooth_tbl
1159*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_smooth_tbl
1160*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_smooth_tbl
1161*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_smooth_tbl
1162*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_smooth_tbl
1163*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_smooth_tbl
1164*c0909341SAndroid Build Coastguard Workerendjumptable
1165*c0909341SAndroid Build Coastguard Worker
// Vertical "smooth" intra prediction for 16 bpc pixels. Every output pixel
// blends the top-edge pixel of its column with one "bottom" pixel, using a
// per-row weight:
//   dst[y][x] = bottom + (((top[x] - bottom) * weights_ver[y] + 128) >> 8)
// where, as set up below, bottom = topleft[-height], top = &topleft[1] and
// weights_ver = &sm_weights[height] (one byte per row).
//
// Register use on entry: x0 = dst, x1 = stride (in bytes), x2 = topleft,
// w3 = width, w4 = height. The remaining arguments are not read here.
// The code path is selected by width through ipred_smooth_v_tbl; every path
// produces four rows per iteration and counts height down in steps of 4.
//
// The multiply is done with sqrdmulh on (weight << 7): sqrdmulh returns
// (2*a*b + 0x8000) >> 16, which for b = weight << 7 equals
// (a*weight + 128) >> 8.
1166*c0909341SAndroid Build Coastguard Worker// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
1167*c0909341SAndroid Build Coastguard Worker//                                const pixel *const topleft,
1168*c0909341SAndroid Build Coastguard Worker//                                const int width, const int height, const int a,
1169*c0909341SAndroid Build Coastguard Worker//                                const int max_width, const int max_height);
1170*c0909341SAndroid Build Coastguard Workerfunction ipred_smooth_v_16bpc_neon, export=1
1171*c0909341SAndroid Build Coastguard Worker        movrel          x7,  X(sm_weights)
1172*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  w4, uxtw        // x7 = &sm_weights[height]
1173*c0909341SAndroid Build Coastguard Worker        clz             w9,  w3                   // clz(width) picks the table entry
1174*c0909341SAndroid Build Coastguard Worker        movrel          x5,  ipred_smooth_v_tbl
1175*c0909341SAndroid Build Coastguard Worker        sub             x8,  x2,  w4, uxtw #1     // x8 = &topleft[-height]
1176*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #25             // width 64 -> 0 ... width 4 -> 4
1177*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x5, w9, uxtw #2]    // signed 32-bit offset from the table
1178*c0909341SAndroid Build Coastguard Worker        ld1r            {v4.8h},  [x8] // bottom
1179*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #2              // x2 = &topleft[1], the first top pixel
1180*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x9              // table base + offset = jump target
1181*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1              // x6 = dst row 1
1182*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // x1 = 2*stride; x0/x6 take even/odd rows
1183*c0909341SAndroid Build Coastguard Worker        br              x5
        // Width 4: the four top pixels are replicated into both 64-bit
        // halves, so each vector holds two rows.
1184*c0909341SAndroid Build Coastguard Worker40:
1185*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1186*c0909341SAndroid Build Coastguard Worker        ld1r            {v6.2d}, [x2]             // top
1187*c0909341SAndroid Build Coastguard Worker        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
1188*c0909341SAndroid Build Coastguard Worker4:
        // Load the weights of four rows, each replicated across 8 bytes,
        // then pack rows 0/1 into v16 and rows 2/3 into v18 (4 bytes each).
1189*c0909341SAndroid Build Coastguard Worker        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
1190*c0909341SAndroid Build Coastguard Worker        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
1191*c0909341SAndroid Build Coastguard Worker        zip1            v18.2s,  v18.2s,  v19.2s
1192*c0909341SAndroid Build Coastguard Worker        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
1193*c0909341SAndroid Build Coastguard Worker        ushll           v18.8h,  v18.8b,  #7
1194*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
1195*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v21.8h,  v6.8h,   v18.8h
1196*c0909341SAndroid Build Coastguard Worker        add             v20.8h,  v20.8h,  v4.8h   // + bottom
1197*c0909341SAndroid Build Coastguard Worker        add             v21.8h,  v21.8h,  v4.8h
1198*c0909341SAndroid Build Coastguard Worker        st1             {v20.d}[0], [x0], x1      // row 0
1199*c0909341SAndroid Build Coastguard Worker        st1             {v20.d}[1], [x6], x1      // row 1
1200*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4              // four rows done; flags used by b.gt below
1201*c0909341SAndroid Build Coastguard Worker        st1             {v21.d}[0], [x0], x1      // row 2
1202*c0909341SAndroid Build Coastguard Worker        st1             {v21.d}[1], [x6], x1      // row 3
1203*c0909341SAndroid Build Coastguard Worker        b.gt            4b
1204*c0909341SAndroid Build Coastguard Worker        ret
        // Width 8: one vector per row.
1205*c0909341SAndroid Build Coastguard Worker80:
1206*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1207*c0909341SAndroid Build Coastguard Worker        ld1             {v6.8h}, [x2]             // top
1208*c0909341SAndroid Build Coastguard Worker        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
1209*c0909341SAndroid Build Coastguard Worker8:
1210*c0909341SAndroid Build Coastguard Worker        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
1211*c0909341SAndroid Build Coastguard Worker        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
1212*c0909341SAndroid Build Coastguard Worker        ushll           v17.8h,  v17.8b,  #7
1213*c0909341SAndroid Build Coastguard Worker        ushll           v18.8h,  v18.8b,  #7
1214*c0909341SAndroid Build Coastguard Worker        ushll           v19.8h,  v19.8b,  #7
1215*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
1216*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v21.8h,  v6.8h,   v17.8h
1217*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v22.8h,  v6.8h,   v18.8h
1218*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v23.8h,  v6.8h,   v19.8h
1219*c0909341SAndroid Build Coastguard Worker        add             v20.8h,  v20.8h,  v4.8h   // + bottom
1220*c0909341SAndroid Build Coastguard Worker        add             v21.8h,  v21.8h,  v4.8h
1221*c0909341SAndroid Build Coastguard Worker        add             v22.8h,  v22.8h,  v4.8h
1222*c0909341SAndroid Build Coastguard Worker        add             v23.8h,  v23.8h,  v4.8h
1223*c0909341SAndroid Build Coastguard Worker        st1             {v20.8h}, [x0], x1        // row 0
1224*c0909341SAndroid Build Coastguard Worker        st1             {v21.8h}, [x6], x1        // row 1
1225*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
1226*c0909341SAndroid Build Coastguard Worker        st1             {v22.8h}, [x0], x1        // row 2
1227*c0909341SAndroid Build Coastguard Worker        st1             {v23.8h}, [x6], x1        // row 3
1228*c0909341SAndroid Build Coastguard Worker        b.gt            8b
1229*c0909341SAndroid Build Coastguard Worker        ret
        // Widths 16, 32 and 64 share one path: four row pointers, 16 pixels
        // per inner-loop iteration. x5 and x8 are free to reuse here (the
        // jump target and the bottom pointer have already been consumed).
1230*c0909341SAndroid Build Coastguard Worker160:
1231*c0909341SAndroid Build Coastguard Worker320:
1232*c0909341SAndroid Build Coastguard Worker640:
1233*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1234*c0909341SAndroid Build Coastguard Worker        // Set up pointers for four rows in parallel; x0, x6, x5, x8
1235*c0909341SAndroid Build Coastguard Worker        add             x5,  x0,  x1              // row 2 (x1 is 2*stride here)
1236*c0909341SAndroid Build Coastguard Worker        add             x8,  x6,  x1              // row 3
1237*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // x1 = 4*stride
1238*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  w3, uxtw #1     // minus the width*2 bytes written per row
1239*c0909341SAndroid Build Coastguard Worker        mov             w9,  w3                   // keep width; w3 becomes the column counter
1240*c0909341SAndroid Build Coastguard Worker
        // Outer loop: one group of four rows. Their weights are loaded once,
        // each replicated across a whole vector.
1241*c0909341SAndroid Build Coastguard Worker1:
1242*c0909341SAndroid Build Coastguard Worker        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
1243*c0909341SAndroid Build Coastguard Worker        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
1244*c0909341SAndroid Build Coastguard Worker        ushll           v17.8h,  v17.8b,  #7
1245*c0909341SAndroid Build Coastguard Worker        ushll           v18.8h,  v18.8b,  #7
1246*c0909341SAndroid Build Coastguard Worker        ushll           v19.8h,  v19.8b,  #7
        // Inner loop: 16 columns of all four rows per iteration.
1247*c0909341SAndroid Build Coastguard Worker2:
1248*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x2], #32 // top
1249*c0909341SAndroid Build Coastguard Worker        sub             v2.8h,   v2.8h,   v4.8h   // top-bottom
1250*c0909341SAndroid Build Coastguard Worker        sub             v3.8h,   v3.8h,   v4.8h
1251*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v20.8h,  v2.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
1252*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v21.8h,  v3.8h,   v16.8h
1253*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v22.8h,  v2.8h,   v17.8h
1254*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v23.8h,  v3.8h,   v17.8h
1255*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v24.8h,  v2.8h,   v18.8h
1256*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v25.8h,  v3.8h,   v18.8h
1257*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v26.8h,  v2.8h,   v19.8h
1258*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v27.8h,  v3.8h,   v19.8h
1259*c0909341SAndroid Build Coastguard Worker        add             v20.8h,  v20.8h,  v4.8h   // + bottom
1260*c0909341SAndroid Build Coastguard Worker        add             v21.8h,  v21.8h,  v4.8h
1261*c0909341SAndroid Build Coastguard Worker        add             v22.8h,  v22.8h,  v4.8h
1262*c0909341SAndroid Build Coastguard Worker        add             v23.8h,  v23.8h,  v4.8h
1263*c0909341SAndroid Build Coastguard Worker        add             v24.8h,  v24.8h,  v4.8h
1264*c0909341SAndroid Build Coastguard Worker        add             v25.8h,  v25.8h,  v4.8h
1265*c0909341SAndroid Build Coastguard Worker        add             v26.8h,  v26.8h,  v4.8h
1266*c0909341SAndroid Build Coastguard Worker        add             v27.8h,  v27.8h,  v4.8h
1267*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #16             // columns left; flags used by b.gt below
1268*c0909341SAndroid Build Coastguard Worker        st1             {v20.8h, v21.8h}, [x0], #32 // row 0
1269*c0909341SAndroid Build Coastguard Worker        st1             {v22.8h, v23.8h}, [x6], #32 // row 1
1270*c0909341SAndroid Build Coastguard Worker        st1             {v24.8h, v25.8h}, [x5], #32 // row 2
1271*c0909341SAndroid Build Coastguard Worker        st1             {v26.8h, v27.8h}, [x8], #32 // row 3
1272*c0909341SAndroid Build Coastguard Worker        b.gt            2b
1273*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4              // four rows done
1274*c0909341SAndroid Build Coastguard Worker        b.le            9f
1275*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  w9, uxtw #1     // rewind top to &topleft[1]
1276*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1              // move all four pointers down four rows
1277*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  x1
1278*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x1
1279*c0909341SAndroid Build Coastguard Worker        add             x8,  x8,  x1
1280*c0909341SAndroid Build Coastguard Worker        mov             w3,  w9                   // reset the column counter
1281*c0909341SAndroid Build Coastguard Worker        b               1b
1282*c0909341SAndroid Build Coastguard Worker9:
1283*c0909341SAndroid Build Coastguard Worker        ret
1284*c0909341SAndroid Build Coastguard Workerendfunc
1285*c0909341SAndroid Build Coastguard Worker
// Dispatch table for ipred_smooth_v_16bpc_neon. Each entry is the 32-bit
// offset of one of that function's numeric local labels relative to the
// start of this table ("b" = nearest preceding definition of the label).
// The function indexes it with clz(width) - 25, loads the entry with ldrsw
// and adds it to the table address, so the order below must not change.
1286*c0909341SAndroid Build Coastguard Workerjumptable ipred_smooth_v_tbl
1287*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_smooth_v_tbl           // index 0: width 64
1288*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_smooth_v_tbl           // index 1: width 32
1289*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_smooth_v_tbl           // index 2: width 16
1290*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_smooth_v_tbl           // index 3: width 8
1291*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_smooth_v_tbl           // index 4: width 4
1292*c0909341SAndroid Build Coastguard Workerendjumptable
1293*c0909341SAndroid Build Coastguard Worker
// Horizontal "smooth" intra prediction for 16 bpc pixels. Every output
// pixel blends the left-edge pixel of its row with one "right" pixel, using
// a per-column weight:
//   dst[y][x] = right + (((left[y] - right) * weights_hor[x] + 128) >> 8)
// where, as set up below, right = topleft[width], left[y] = topleft[-(1+y)]
// and weights_hor = &sm_weights[width] (one byte per column).
//
// Register use on entry: x0 = dst, x1 = stride (in bytes), x2 = topleft,
// w3 = width, w4 = height. The remaining arguments are not read here.
// The code path is selected by width through ipred_smooth_h_tbl; every path
// produces four rows per iteration and counts height down in steps of 4.
//
// The left pixels are read walking backwards through memory: ld4r fetches
// four of them at once and post-decrements x2 by 8 bytes, so the LAST
// register of each ld4r holds the FIRST row of the group ("left flipped").
//
// The multiply is done with sqrdmulh on (weight << 7): sqrdmulh returns
// (2*a*b + 0x8000) >> 16, which for b = weight << 7 equals
// (a*weight + 128) >> 8.
1294*c0909341SAndroid Build Coastguard Worker// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
1295*c0909341SAndroid Build Coastguard Worker//                                const pixel *const topleft,
1296*c0909341SAndroid Build Coastguard Worker//                                const int width, const int height, const int a,
1297*c0909341SAndroid Build Coastguard Worker//                                const int max_width, const int max_height);
1298*c0909341SAndroid Build Coastguard Workerfunction ipred_smooth_h_16bpc_neon, export=1
1299*c0909341SAndroid Build Coastguard Worker        movrel          x8,  X(sm_weights)
1300*c0909341SAndroid Build Coastguard Worker        add             x8,  x8,  w3, uxtw        // x8 = &sm_weights[width]
1301*c0909341SAndroid Build Coastguard Worker        clz             w9,  w3                   // clz(width) picks the table entry
1302*c0909341SAndroid Build Coastguard Worker        movrel          x5,  ipred_smooth_h_tbl
1303*c0909341SAndroid Build Coastguard Worker        add             x12, x2,  w3, uxtw #1     // x12 = &topleft[width]
1304*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #25             // width 64 -> 0 ... width 4 -> 4
1305*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x5, w9, uxtw #2]    // signed 32-bit offset from the table
1306*c0909341SAndroid Build Coastguard Worker        ld1r            {v5.8h},  [x12] // right
1307*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x9              // table base + offset = jump target
1308*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1              // x6 = dst row 1
1309*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // x1 = 2*stride; x0/x6 take even/odd rows
1310*c0909341SAndroid Build Coastguard Worker        br              x5
        // Width 4: the four weights are replicated into both 64-bit halves,
        // so each vector holds two rows.
1311*c0909341SAndroid Build Coastguard Worker40:
1312*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1313*c0909341SAndroid Build Coastguard Worker        ld1r            {v7.2s}, [x8]             // weights_hor
1314*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #8              // x2 = &topleft[-4]
1315*c0909341SAndroid Build Coastguard Worker        mov             x7,  #-8                  // ld4r post-index: four pixels back
1316*c0909341SAndroid Build Coastguard Worker        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
1317*c0909341SAndroid Build Coastguard Worker4:
1318*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
        // v1 must be built first: it reads the old v0 (row 3) before the
        // second zip1 overwrites v0 with rows 0/1.
1319*c0909341SAndroid Build Coastguard Worker        zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
1320*c0909341SAndroid Build Coastguard Worker        zip1            v0.2d,   v3.2d,   v2.2d
1321*c0909341SAndroid Build Coastguard Worker        sub             v0.8h,   v0.8h,   v5.8h   // left-right
1322*c0909341SAndroid Build Coastguard Worker        sub             v1.8h,   v1.8h,   v5.8h
1323*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v20.8h,  v0.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
1324*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v21.8h,  v1.8h,   v7.8h
1325*c0909341SAndroid Build Coastguard Worker        add             v20.8h,  v20.8h,  v5.8h   // + right
1326*c0909341SAndroid Build Coastguard Worker        add             v21.8h,  v21.8h,  v5.8h
1327*c0909341SAndroid Build Coastguard Worker        st1             {v20.d}[0], [x0], x1      // row 0
1328*c0909341SAndroid Build Coastguard Worker        st1             {v20.d}[1], [x6], x1      // row 1
1329*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4              // four rows done; flags used by b.gt below
1330*c0909341SAndroid Build Coastguard Worker        st1             {v21.d}[0], [x0], x1      // row 2
1331*c0909341SAndroid Build Coastguard Worker        st1             {v21.d}[1], [x6], x1      // row 3
1332*c0909341SAndroid Build Coastguard Worker        b.gt            4b
1333*c0909341SAndroid Build Coastguard Worker        ret
        // Width 8: one vector per row.
1334*c0909341SAndroid Build Coastguard Worker80:
1335*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1336*c0909341SAndroid Build Coastguard Worker        ld1             {v7.8b}, [x8]             // weights_hor
1337*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #8              // x2 = &topleft[-4]
1338*c0909341SAndroid Build Coastguard Worker        mov             x7,  #-8                  // ld4r post-index: four pixels back
1339*c0909341SAndroid Build Coastguard Worker        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
1340*c0909341SAndroid Build Coastguard Worker8:
1341*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
1342*c0909341SAndroid Build Coastguard Worker        sub             v3.8h,   v3.8h,   v5.8h   // left-right
1343*c0909341SAndroid Build Coastguard Worker        sub             v2.8h,   v2.8h,   v5.8h
1344*c0909341SAndroid Build Coastguard Worker        sub             v1.8h,   v1.8h,   v5.8h
1345*c0909341SAndroid Build Coastguard Worker        sub             v0.8h,   v0.8h,   v5.8h
1346*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v20.8h,  v3.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
1347*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v21.8h,  v2.8h,   v7.8h   // (left flipped)
1348*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v22.8h,  v1.8h,   v7.8h
1349*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v23.8h,  v0.8h,   v7.8h
1350*c0909341SAndroid Build Coastguard Worker        add             v20.8h,  v20.8h,  v5.8h   // + right
1351*c0909341SAndroid Build Coastguard Worker        add             v21.8h,  v21.8h,  v5.8h
1352*c0909341SAndroid Build Coastguard Worker        add             v22.8h,  v22.8h,  v5.8h
1353*c0909341SAndroid Build Coastguard Worker        add             v23.8h,  v23.8h,  v5.8h
1354*c0909341SAndroid Build Coastguard Worker        st1             {v20.8h}, [x0], x1        // row 0
1355*c0909341SAndroid Build Coastguard Worker        st1             {v21.8h}, [x6], x1        // row 1
1356*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
1357*c0909341SAndroid Build Coastguard Worker        st1             {v22.8h}, [x0], x1        // row 2
1358*c0909341SAndroid Build Coastguard Worker        st1             {v23.8h}, [x6], x1        // row 3
1359*c0909341SAndroid Build Coastguard Worker        b.gt            8b
1360*c0909341SAndroid Build Coastguard Worker        ret
        // Widths 16, 32 and 64 share one path: four row pointers, 16 pixels
        // per inner-loop iteration. x5 is free to reuse here (the jump
        // target has already been consumed by the br above).
1361*c0909341SAndroid Build Coastguard Worker160:
1362*c0909341SAndroid Build Coastguard Worker320:
1363*c0909341SAndroid Build Coastguard Worker640:
1364*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1365*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #8              // x2 = &topleft[-4]
1366*c0909341SAndroid Build Coastguard Worker        mov             x7,  #-8                  // ld4r post-index: four pixels back
1367*c0909341SAndroid Build Coastguard Worker        // Set up pointers for four rows in parallel; x0, x6, x5, x10
1368*c0909341SAndroid Build Coastguard Worker        add             x5,  x0,  x1              // row 2 (x1 is 2*stride here)
1369*c0909341SAndroid Build Coastguard Worker        add             x10, x6,  x1              // row 3
1370*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // x1 = 4*stride
1371*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  w3, uxtw #1     // minus the width*2 bytes written per row
1372*c0909341SAndroid Build Coastguard Worker        mov             w9,  w3                   // keep width; w3 becomes the column counter
1373*c0909341SAndroid Build Coastguard Worker
        // Outer loop: one group of four rows; their left pixels are loaded
        // once, each replicated across a whole vector.
1374*c0909341SAndroid Build Coastguard Worker1:
1375*c0909341SAndroid Build Coastguard Worker        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},   [x2],  x7 // left
1376*c0909341SAndroid Build Coastguard Worker        sub             v0.8h,   v0.8h,   v5.8h   // left-right
1377*c0909341SAndroid Build Coastguard Worker        sub             v1.8h,   v1.8h,   v5.8h
1378*c0909341SAndroid Build Coastguard Worker        sub             v2.8h,   v2.8h,   v5.8h
1379*c0909341SAndroid Build Coastguard Worker        sub             v3.8h,   v3.8h,   v5.8h
        // Inner loop: 16 columns of all four rows per iteration. v6 gets
        // the low eight weights; ushll2 then widens the high eight in place.
1380*c0909341SAndroid Build Coastguard Worker2:
1381*c0909341SAndroid Build Coastguard Worker        ld1             {v7.16b}, [x8],   #16     // weights_hor
1382*c0909341SAndroid Build Coastguard Worker        ushll           v6.8h,   v7.8b,   #7      // weights_hor << 7
1383*c0909341SAndroid Build Coastguard Worker        ushll2          v7.8h,   v7.16b,  #7
1384*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v20.8h,  v3.8h,   v6.8h   // ((left-right)*weights_hor + 128) >> 8
1385*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v21.8h,  v3.8h,   v7.8h   // (left flipped)
1386*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v22.8h,  v2.8h,   v6.8h
1387*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v23.8h,  v2.8h,   v7.8h
1388*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v24.8h,  v1.8h,   v6.8h
1389*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v25.8h,  v1.8h,   v7.8h
1390*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v26.8h,  v0.8h,   v6.8h
1391*c0909341SAndroid Build Coastguard Worker        sqrdmulh        v27.8h,  v0.8h,   v7.8h
1392*c0909341SAndroid Build Coastguard Worker        add             v20.8h,  v20.8h,  v5.8h   // + right
1393*c0909341SAndroid Build Coastguard Worker        add             v21.8h,  v21.8h,  v5.8h
1394*c0909341SAndroid Build Coastguard Worker        add             v22.8h,  v22.8h,  v5.8h
1395*c0909341SAndroid Build Coastguard Worker        add             v23.8h,  v23.8h,  v5.8h
1396*c0909341SAndroid Build Coastguard Worker        add             v24.8h,  v24.8h,  v5.8h
1397*c0909341SAndroid Build Coastguard Worker        add             v25.8h,  v25.8h,  v5.8h
1398*c0909341SAndroid Build Coastguard Worker        add             v26.8h,  v26.8h,  v5.8h
1399*c0909341SAndroid Build Coastguard Worker        add             v27.8h,  v27.8h,  v5.8h
1400*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #16             // columns left; flags used by b.gt below
1401*c0909341SAndroid Build Coastguard Worker        st1             {v20.8h, v21.8h}, [x0],  #32 // row 0
1402*c0909341SAndroid Build Coastguard Worker        st1             {v22.8h, v23.8h}, [x6],  #32 // row 1
1403*c0909341SAndroid Build Coastguard Worker        st1             {v24.8h, v25.8h}, [x5],  #32 // row 2
1404*c0909341SAndroid Build Coastguard Worker        st1             {v26.8h, v27.8h}, [x10], #32 // row 3
1405*c0909341SAndroid Build Coastguard Worker        b.gt            2b
1406*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4              // four rows done
1407*c0909341SAndroid Build Coastguard Worker        b.le            9f
1408*c0909341SAndroid Build Coastguard Worker        sub             x8,  x8,  w9, uxtw        // rewind weights to &sm_weights[width]
1409*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1              // move all four pointers down four rows
1410*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  x1
1411*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x1
1412*c0909341SAndroid Build Coastguard Worker        add             x10, x10, x1
1413*c0909341SAndroid Build Coastguard Worker        mov             w3,  w9                   // reset the column counter
1414*c0909341SAndroid Build Coastguard Worker        b               1b
1415*c0909341SAndroid Build Coastguard Worker9:
1416*c0909341SAndroid Build Coastguard Worker        ret
1417*c0909341SAndroid Build Coastguard Workerendfunc
1418*c0909341SAndroid Build Coastguard Worker
// Dispatch table for ipred_smooth_h_16bpc_neon. Each entry is the 32-bit
// offset of one of that function's numeric local labels relative to the
// start of this table ("b" = nearest preceding definition of the label).
// The function indexes it with clz(width) - 25, loads the entry with ldrsw
// and adds it to the table address, so the order below must not change.
1419*c0909341SAndroid Build Coastguard Workerjumptable ipred_smooth_h_tbl
1420*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_smooth_h_tbl           // index 0: width 64
1421*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_smooth_h_tbl           // index 1: width 32
1422*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_smooth_h_tbl           // index 2: width 16
1423*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_smooth_h_tbl           // index 3: width 8
1424*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_smooth_h_tbl           // index 4: width 4
1425*c0909341SAndroid Build Coastguard Workerendjumptable
1426*c0909341SAndroid Build Coastguard Worker
// Sliding select mask: 48 bytes of 0x00 immediately followed, at the label
// padding_mask, by 48 bytes of 0xff. The edge functions below load 32 bytes
// from (padding_mask - 2*n); the first n 16-bit lanes of that load are zero
// and the remaining lanes are all-ones (all 32 bytes are zero once n >= 16).
// With `bit` this keeps the first n input pixels and replaces the rest by a
// padding value. The zero and 0xff runs must stay contiguous, and the 48
// zero bytes bound the usable range to n <= 24.
1427*c0909341SAndroid Build Coastguard Workerconst padding_mask_buf
1428*c0909341SAndroid Build Coastguard Worker        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
1429*c0909341SAndroid Build Coastguard Worker        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
1430*c0909341SAndroid Build Coastguard Worker        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
1431*c0909341SAndroid Build Coastguard Worker        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
1432*c0909341SAndroid Build Coastguard Worker        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
1433*c0909341SAndroid Build Coastguard Worker        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        // Callers address the table relative to this label, not to
        // padding_mask_buf.
1434*c0909341SAndroid Build Coastguard Workerpadding_mask:
1435*c0909341SAndroid Build Coastguard Worker        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
1436*c0909341SAndroid Build Coastguard Worker        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
1437*c0909341SAndroid Build Coastguard Worker        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
1438*c0909341SAndroid Build Coastguard Worker        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
1439*c0909341SAndroid Build Coastguard Worker        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
1440*c0909341SAndroid Build Coastguard Worker        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
1441*c0909341SAndroid Build Coastguard Workerendconst
1442*c0909341SAndroid Build Coastguard Worker
// 2x upsampling of an edge for 16 bpc pixels. The output interleaves the
// input pixels with filtered in-between values:
//   out[2*i]   = in[i+1]
//   out[2*i+1] = clamp((9*(in[i+1] + in[i+2]) - (in[i] + in[i+3]) + 8) >> 4,
//                      0, bitdepth_max)
// for i = 0..15, with every in[k], k > end, read as in[end].
//
// Register use on entry: x0 = out, x2 = in, w3 = end, w4 = bitdepth_max.
// hsz (w1) is not read: the function always loads 16 input pixels (32
// bytes) from `in` and always stores 32 output pixels (64 bytes) to `out`.
// NOTE(review): callers presumably guarantee both buffers are large enough
// for these fixed-size accesses; confirm at the call sites.
1443*c0909341SAndroid Build Coastguard Worker// void ipred_z1_upsample_edge_16bpc_neon(pixel *out, const int hsz,
1444*c0909341SAndroid Build Coastguard Worker//                                        const pixel *const in, const int end,
1445*c0909341SAndroid Build Coastguard Worker//                                        const int bitdepth_max);
1446*c0909341SAndroid Build Coastguard Workerfunction ipred_z1_upsample_edge_16bpc_neon, export=1
        // w4 must be consumed before x4 is reused as the mask pointer.
1447*c0909341SAndroid Build Coastguard Worker        dup             v30.8h,  w4               // bitdepth_max
1448*c0909341SAndroid Build Coastguard Worker        movrel          x4,  padding_mask
1449*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h},  [x2]     // in[]
1450*c0909341SAndroid Build Coastguard Worker        add             x5,  x2,  w3,  uxtw #1    // in[end]
1451*c0909341SAndroid Build Coastguard Worker        sub             x4,  x4,  w3,  uxtw #1    // mask: first `end` lanes zero, rest ones
1452*c0909341SAndroid Build Coastguard Worker
1453*c0909341SAndroid Build Coastguard Worker        ld1r            {v2.8h},  [x5]            // padding
1454*c0909341SAndroid Build Coastguard Worker        ld1             {v3.8h, v4.8h}, [x4]      // padding_mask
1455*c0909341SAndroid Build Coastguard Worker
1456*c0909341SAndroid Build Coastguard Worker        movi            v31.8h,  #9               // filter coefficient for the inner taps
1457*c0909341SAndroid Build Coastguard Worker
        // Replace every lane at index >= end by in[end].
1458*c0909341SAndroid Build Coastguard Worker        bit             v0.16b,  v2.16b,  v3.16b  // padded in[]
1459*c0909341SAndroid Build Coastguard Worker        bit             v1.16b,  v2.16b,  v4.16b
1460*c0909341SAndroid Build Coastguard Worker
        // Shifted views of the padded input; lanes past the 16 loaded pixels
        // are filled from the padding vector v2.
1461*c0909341SAndroid Build Coastguard Worker        ext             v4.16b,  v0.16b,  v1.16b,  #2    // in[i+1]
1462*c0909341SAndroid Build Coastguard Worker        ext             v5.16b,  v1.16b,  v2.16b,  #2
1463*c0909341SAndroid Build Coastguard Worker        ext             v6.16b,  v0.16b,  v1.16b,  #4    // in[i+2]
1464*c0909341SAndroid Build Coastguard Worker        ext             v7.16b,  v1.16b,  v2.16b,  #4
1465*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v0.16b,  v1.16b,  #6    // in[i+3]
1466*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v1.16b,  v2.16b,  #6
1467*c0909341SAndroid Build Coastguard Worker
1468*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v4.8h,   v6.8h   // in[i+1] + in[i+2]
1469*c0909341SAndroid Build Coastguard Worker        add             v19.8h,  v5.8h,   v7.8h
1470*c0909341SAndroid Build Coastguard Worker        add             v20.8h,  v0.8h,   v16.8h  // in[i+0] + in[i+3]
1471*c0909341SAndroid Build Coastguard Worker        add             v21.8h,  v1.8h,   v17.8h
1472*c0909341SAndroid Build Coastguard Worker        umull           v22.4s,  v18.4h,  v31.4h  // 9*(in[i+1] + in[i+2])
1473*c0909341SAndroid Build Coastguard Worker        umull2          v23.4s,  v18.8h,  v31.8h
1474*c0909341SAndroid Build Coastguard Worker        umull           v24.4s,  v19.4h,  v31.4h
1475*c0909341SAndroid Build Coastguard Worker        umull2          v25.4s,  v19.8h,  v31.8h
1476*c0909341SAndroid Build Coastguard Worker        usubw           v22.4s,  v22.4s,  v20.4h  // - (in[i+0] + in[i+3]), in 32 bits
1477*c0909341SAndroid Build Coastguard Worker        usubw2          v23.4s,  v23.4s,  v20.8h
1478*c0909341SAndroid Build Coastguard Worker        usubw           v24.4s,  v24.4s,  v21.4h
1479*c0909341SAndroid Build Coastguard Worker        usubw2          v25.4s,  v25.4s,  v21.8h
1480*c0909341SAndroid Build Coastguard Worker
        // Rounding shift by 4 with unsigned saturation: negative sums
        // become 0 while narrowing back to 16 bits.
1481*c0909341SAndroid Build Coastguard Worker        sqrshrun        v16.4h,  v22.4s,  #4
1482*c0909341SAndroid Build Coastguard Worker        sqrshrun2       v16.8h,  v23.4s,  #4
1483*c0909341SAndroid Build Coastguard Worker        sqrshrun        v17.4h,  v24.4s,  #4
1484*c0909341SAndroid Build Coastguard Worker        sqrshrun2       v17.8h,  v25.4s,  #4
1485*c0909341SAndroid Build Coastguard Worker
1486*c0909341SAndroid Build Coastguard Worker        smin            v16.8h,  v16.8h,  v30.8h  // upper clamp to bitdepth_max
1487*c0909341SAndroid Build Coastguard Worker        smin            v17.8h,  v17.8h,  v30.8h
1488*c0909341SAndroid Build Coastguard Worker
        // Interleave: even outputs are in[i+1], odd outputs the filtered
        // values.
1489*c0909341SAndroid Build Coastguard Worker        zip1            v0.8h,   v4.8h,   v16.8h
1490*c0909341SAndroid Build Coastguard Worker        zip2            v1.8h,   v4.8h,   v16.8h
1491*c0909341SAndroid Build Coastguard Worker        zip1            v2.8h,   v5.8h,   v17.8h
1492*c0909341SAndroid Build Coastguard Worker        zip2            v3.8h,   v5.8h,   v17.8h
1493*c0909341SAndroid Build Coastguard Worker
1494*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
1495*c0909341SAndroid Build Coastguard Worker
1496*c0909341SAndroid Build Coastguard Worker        ret
1497*c0909341SAndroid Build Coastguard Workerendfunc
1498*c0909341SAndroid Build Coastguard Worker
1499*c0909341SAndroid Build Coastguard Worker// void ipred_z2_upsample_edge_16bpc_neon(pixel *out, const int sz,
1500*c0909341SAndroid Build Coastguard Worker//                                        const pixel *const in,
1501*c0909341SAndroid Build Coastguard Worker//                                        const int bitdepth_max);
//
// Doubles the resolution of an edge: the even outputs are the input pixels,
// the odd outputs are the 4-tap interpolation
//   (9*(in[i] + in[i+1]) - (in[i-1] + in[i+2]) + 8) >> 4
// saturated to >= 0 by sqrshrun and then limited with smin against
// bitdepth_max.
//
// Arguments as used below: x0 = out, w1 = sz, x2 = in, w3 = bitdepth_max.
// in[-1] is replaced by in[0], and the lanes selected by the padding_mask
// are replaced by in[sz].
// NOTE(review): padding_mask is defined outside this chunk; the addressing
// below presumably relies on it being preceded by zeros and followed by
// all-ones so that lanes >= sz get selected - confirm against its definition.
// Always stores 16 pixels at out[0..15] plus one pixel at out[16].
1502*c0909341SAndroid Build Coastguard Workerfunction ipred_z2_upsample_edge_16bpc_neon, export=1
1503*c0909341SAndroid Build Coastguard Worker        dup             v30.8h,  w3               // bitdepth_max
1504*c0909341SAndroid Build Coastguard Worker        // Here, sz is 4 or 8, and we produce 2*sz+1 output elements.
1505*c0909341SAndroid Build Coastguard Worker        movrel          x4,  padding_mask
1506*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x2]      // in[]
1507*c0909341SAndroid Build Coastguard Worker        add             x5,  x2,  w1,  uxtw #1    // in[sz]
1508*c0909341SAndroid Build Coastguard Worker        sub             x4,  x4,  w1,  uxtw #1    // padding_mask - sz halfwords
1509*c0909341SAndroid Build Coastguard Worker
1510*c0909341SAndroid Build Coastguard Worker        ld1r            {v3.8h},  [x2]            // in[0] for padding
1511*c0909341SAndroid Build Coastguard Worker        ld1r            {v2.8h},  [x5]            // padding
1512*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h}, [x4]      // padding_mask
1513*c0909341SAndroid Build Coastguard Worker
1514*c0909341SAndroid Build Coastguard Worker        movi            v31.8h,  #9               // weight of the two centre taps
1515*c0909341SAndroid Build Coastguard Worker
1516*c0909341SAndroid Build Coastguard Worker        bit             v0.16b,  v2.16b,  v4.16b  // padded in[]
1517*c0909341SAndroid Build Coastguard Worker        bit             v1.16b,  v2.16b,  v5.16b
1518*c0909341SAndroid Build Coastguard Worker
1519*c0909341SAndroid Build Coastguard Worker        ext             v4.16b,  v3.16b,  v0.16b,  #14 // in[i-1], with in[0] standing in for in[-1]
1520*c0909341SAndroid Build Coastguard Worker        ext             v5.16b,  v0.16b,  v1.16b,  #2  // in[i+1]
1521*c0909341SAndroid Build Coastguard Worker        ext             v6.16b,  v0.16b,  v1.16b,  #4  // in[i+2]
1522*c0909341SAndroid Build Coastguard Worker
1523*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v0.8h,   v5.8h   // in[i+0] + in[i+1]
1524*c0909341SAndroid Build Coastguard Worker        add             v17.8h,  v4.8h,   v6.8h   // in[i-1] + in[i+2]
1525*c0909341SAndroid Build Coastguard Worker        umull           v18.4s,  v16.4h,  v31.4h  // 9*(in[i+0] + in[i+1])
1526*c0909341SAndroid Build Coastguard Worker        umull2          v19.4s,  v16.8h,  v31.8h
1527*c0909341SAndroid Build Coastguard Worker        usubw           v18.4s,  v18.4s,  v17.4h  // - (in[i-1] + in[i+2])
1528*c0909341SAndroid Build Coastguard Worker        usubw2          v19.4s,  v19.4s,  v17.8h
1529*c0909341SAndroid Build Coastguard Worker
1530*c0909341SAndroid Build Coastguard Worker        sqrshrun        v16.4h,  v18.4s,  #4      // (x + 8) >> 4, saturated to unsigned 16 bit
1531*c0909341SAndroid Build Coastguard Worker        sqrshrun2       v16.8h,  v19.4s,  #4
1532*c0909341SAndroid Build Coastguard Worker
1533*c0909341SAndroid Build Coastguard Worker        add             x5,  x0,  #2*16           // &out[16]
1534*c0909341SAndroid Build Coastguard Worker
1535*c0909341SAndroid Build Coastguard Worker        smin            v16.8h,  v16.8h,  v30.8h  // limit to bitdepth_max
1536*c0909341SAndroid Build Coastguard Worker
1537*c0909341SAndroid Build Coastguard Worker        zip1            v4.8h,   v0.8h,   v16.8h  // interleave: in[i], interpolated[i]
1538*c0909341SAndroid Build Coastguard Worker        zip2            v5.8h,   v0.8h,   v16.8h
1539*c0909341SAndroid Build Coastguard Worker
1540*c0909341SAndroid Build Coastguard Worker        st1             {v2.h}[0], [x5]
1541*c0909341SAndroid Build Coastguard Worker        // In case sz=8, output one single pixel in out[16].
1542*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h}, [x0]
1543*c0909341SAndroid Build Coastguard Worker
1544*c0909341SAndroid Build Coastguard Worker        ret
1545*c0909341SAndroid Build Coastguard Workerendfunc
1546*c0909341SAndroid Build Coastguard Worker
// Three-tap edge filter coefficients, one row of four halfwords per
// strength (row 0 = strength 1, row 1 = strength 2).
// ipred_z1_filter_edge_16bpc_neon loads halfwords [1] and [2] of the selected
// row: [1] is applied to both outer taps and [2] to the centre tap, giving
// the kernels {4, 8, 4} and {5, 6, 5} (each sums to 16).
1547*c0909341SAndroid Build Coastguard Workerconst edge_filter
1548*c0909341SAndroid Build Coastguard Worker        .short 0, 4, 8, 0
1549*c0909341SAndroid Build Coastguard Worker        .short 0, 5, 6, 0
1550*c0909341SAndroid Build Coastguard Worker// Leaving out the coeffs for strength=3; that case never reads this table,
// the five-tap path builds its 2, 4, 4, 4, 2 kernel from immediates instead.
1551*c0909341SAndroid Build Coastguard Worker//      .byte 2, 4, 4, 0
1552*c0909341SAndroid Build Coastguard Workerendconst
1553*c0909341SAndroid Build Coastguard Worker
1554*c0909341SAndroid Build Coastguard Worker// void ipred_z1_filter_edge_16bpc_neon(pixel *out, const int sz,
1555*c0909341SAndroid Build Coastguard Worker//                                      const pixel *const in, const int end,
1556*c0909341SAndroid Build Coastguard Worker//                                      const int strength);
//
// Smooths an edge of 16 bit pixels. Arguments as used below:
//   x0 = out, w1 = sz (number of output pixels), x2 = in,
//   w3 = end (index of the last valid input pixel), w4 = strength.
//
// strength 1 and 2 use a three-tap kernel {k1, k2, k1} taken from
// edge_filter:
//   out[i] = (k1*in[i] + k2*in[i+1] + k1*in[i+2] + 8) >> 4
// strength 3 uses the five-tap kernel {2, 4, 4, 4, 2} on a window that
// starts one pixel earlier (in[i-1] .. in[i+3], with in[-1] replaced by
// in[0]).
//
// Input beyond the last valid pixel is replaced by that pixel before
// filtering. Output is produced in blocks of 16 pixels while filtering and
// in blocks of 8 pixels once only padding remains, so up to the next block
// boundary past sz may be written.
// NOTE(review): the padding relies on padding_mask, which is defined
// outside this chunk; presumably zeros before the label and all-ones
// after it - confirm against its definition.
1557*c0909341SAndroid Build Coastguard Workerfunction ipred_z1_filter_edge_16bpc_neon, export=1
1558*c0909341SAndroid Build Coastguard Worker        cmp             w4, #3
1559*c0909341SAndroid Build Coastguard Worker        b.eq            L(fivetap)                // if (strength == 3) goto fivetap
1560*c0909341SAndroid Build Coastguard Worker
1561*c0909341SAndroid Build Coastguard Worker        movrel          x5,  edge_filter, -6
1562*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  w4,  uxtw #3    // edge_filter + 2*((strength - 1)*4 + 1)
1563*c0909341SAndroid Build Coastguard Worker
1564*c0909341SAndroid Build Coastguard Worker        ld1             {v31.s}[0], [x5]          // kernel[1-2]
1565*c0909341SAndroid Build Coastguard Worker
1566*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h}, [x2], #16        // first 8 input pixels
1567*c0909341SAndroid Build Coastguard Worker
1568*c0909341SAndroid Build Coastguard Worker        dup             v30.8h, v31.h[0]          // k1: outer taps
1569*c0909341SAndroid Build Coastguard Worker        dup             v31.8h, v31.h[1]          // k2: centre tap
1570*c0909341SAndroid Build Coastguard Worker1:
1571*c0909341SAndroid Build Coastguard Worker        // in[end], is the last valid pixel. We produce 16 pixels out by
1572*c0909341SAndroid Build Coastguard Worker        // using 18 pixels in - the last pixel used is [17] of the ones
1573*c0909341SAndroid Build Coastguard Worker        // read/buffered.
1574*c0909341SAndroid Build Coastguard Worker        cmp             w3,  #17
1575*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h, v2.8h}, [x2], #32
1576*c0909341SAndroid Build Coastguard Worker        b.lt            2f
1577*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v0.16b,  v1.16b,  #2 // in[i+1]
1578*c0909341SAndroid Build Coastguard Worker        ext             v4.16b,  v1.16b,  v2.16b,  #2
1579*c0909341SAndroid Build Coastguard Worker        ext             v5.16b,  v0.16b,  v1.16b,  #4 // in[i+2]
1580*c0909341SAndroid Build Coastguard Worker        ext             v6.16b,  v1.16b,  v2.16b,  #4
1581*c0909341SAndroid Build Coastguard Worker        mul             v16.8h,  v0.8h,   v30.8h
1582*c0909341SAndroid Build Coastguard Worker        mla             v16.8h,  v3.8h,   v31.8h
1583*c0909341SAndroid Build Coastguard Worker        mla             v16.8h,  v5.8h,   v30.8h
1584*c0909341SAndroid Build Coastguard Worker        mul             v17.8h,  v1.8h,   v30.8h
1585*c0909341SAndroid Build Coastguard Worker        mla             v17.8h,  v4.8h,   v31.8h
1586*c0909341SAndroid Build Coastguard Worker        mla             v17.8h,  v6.8h,   v30.8h
1587*c0909341SAndroid Build Coastguard Worker        subs            w1,  w1,  #16             // flags consumed by b.gt below
1588*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v2.16b           // carry the last 8 pixels into the next block
1589*c0909341SAndroid Build Coastguard Worker        urshr           v16.8h,  v16.8h,  #4
1590*c0909341SAndroid Build Coastguard Worker        urshr           v17.8h,  v17.8h,  #4
1591*c0909341SAndroid Build Coastguard Worker        sub             w3,  w3,  #16             // end, relative to the next block
1592*c0909341SAndroid Build Coastguard Worker        st1             {v16.8h, v17.8h}, [x0], #32
1593*c0909341SAndroid Build Coastguard Worker        b.gt            1b
1594*c0909341SAndroid Build Coastguard Worker        ret
1595*c0909341SAndroid Build Coastguard Worker2:
1596*c0909341SAndroid Build Coastguard Worker        // Right padding
1597*c0909341SAndroid Build Coastguard Worker
1598*c0909341SAndroid Build Coastguard Worker        // x2[w3-24] is the padding pixel (x2 points 24 pixels ahead)
1599*c0909341SAndroid Build Coastguard Worker        movrel          x5,  padding_mask
1600*c0909341SAndroid Build Coastguard Worker        sub             w6,  w3,  #24
1601*c0909341SAndroid Build Coastguard Worker        sub             x5,  x5,  w3,  uxtw #1
1602*c0909341SAndroid Build Coastguard Worker        add             x6,  x2,  w6,  sxtw #1
1603*c0909341SAndroid Build Coastguard Worker
1604*c0909341SAndroid Build Coastguard Worker        ld1             {v3.8h, v4.8h}, [x5] // padding_mask
1605*c0909341SAndroid Build Coastguard Worker
1606*c0909341SAndroid Build Coastguard Worker        ld1r            {v2.8h}, [x6]             // v2 = 8 copies of the padding pixel
1607*c0909341SAndroid Build Coastguard Worker        bit             v0.16b,  v2.16b,  v3.16b  // Pad v0-v1
1608*c0909341SAndroid Build Coastguard Worker        bit             v1.16b,  v2.16b,  v4.16b
1609*c0909341SAndroid Build Coastguard Worker
1610*c0909341SAndroid Build Coastguard Worker        // Filter one block
1611*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v0.16b,  v1.16b,  #2
1612*c0909341SAndroid Build Coastguard Worker        ext             v4.16b,  v1.16b,  v2.16b,  #2
1613*c0909341SAndroid Build Coastguard Worker        ext             v5.16b,  v0.16b,  v1.16b,  #4
1614*c0909341SAndroid Build Coastguard Worker        ext             v6.16b,  v1.16b,  v2.16b,  #4
1615*c0909341SAndroid Build Coastguard Worker        mul             v16.8h,  v0.8h,   v30.8h
1616*c0909341SAndroid Build Coastguard Worker        mla             v16.8h,  v3.8h,   v31.8h
1617*c0909341SAndroid Build Coastguard Worker        mla             v16.8h,  v5.8h,   v30.8h
1618*c0909341SAndroid Build Coastguard Worker        mul             v17.8h,  v1.8h,   v30.8h
1619*c0909341SAndroid Build Coastguard Worker        mla             v17.8h,  v4.8h,   v31.8h
1620*c0909341SAndroid Build Coastguard Worker        mla             v17.8h,  v6.8h,   v30.8h
1621*c0909341SAndroid Build Coastguard Worker        subs            w1,  w1,  #16
1622*c0909341SAndroid Build Coastguard Worker        urshr           v16.8h,  v16.8h,  #4
1623*c0909341SAndroid Build Coastguard Worker        urshr           v17.8h,  v17.8h,  #4
1624*c0909341SAndroid Build Coastguard Worker        st1             {v16.8h, v17.8h}, [x0], #32
1625*c0909341SAndroid Build Coastguard Worker        b.le            9f
1626*c0909341SAndroid Build Coastguard Worker5:
1627*c0909341SAndroid Build Coastguard Worker        // After one block, any remaining output would only be filtering
1628*c0909341SAndroid Build Coastguard Worker        // padding - thus just store the padding.
        // Each store writes 16 bytes, i.e. 8 pixels of 16 bit, so the
        // pixel counter must drop by 8 (not 16) per iteration; otherwise
        // only half of the remaining output gets filled.
1629*c0909341SAndroid Build Coastguard Worker        subs            w1,  w1,  #8
1630*c0909341SAndroid Build Coastguard Worker        st1             {v2.16b}, [x0], #16
1631*c0909341SAndroid Build Coastguard Worker        b.gt            5b
1632*c0909341SAndroid Build Coastguard Worker9:
1633*c0909341SAndroid Build Coastguard Worker        ret
1634*c0909341SAndroid Build Coastguard Worker
1635*c0909341SAndroid Build Coastguard WorkerL(fivetap):
1636*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #2              // topleft -= 1 pixel
1637*c0909341SAndroid Build Coastguard Worker        movi            v29.8h, #2                // outermost taps
1638*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h}, [x2], #16
1639*c0909341SAndroid Build Coastguard Worker        movi            v30.8h, #4                // taps next to the centre
1640*c0909341SAndroid Build Coastguard Worker        movi            v31.8h, #4                // centre tap
1641*c0909341SAndroid Build Coastguard Worker        ins             v0.h[0], v0.h[1]          // replace in[-1] by in[0]
1642*c0909341SAndroid Build Coastguard Worker1:
1643*c0909341SAndroid Build Coastguard Worker        // in[end+1], is the last valid pixel. We produce 16 pixels out by
1644*c0909341SAndroid Build Coastguard Worker        // using 20 pixels in - the last pixel used is [19] of the ones
1645*c0909341SAndroid Build Coastguard Worker        // read/buffered.
1646*c0909341SAndroid Build Coastguard Worker        cmp             w3,  #18
1647*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h, v2.8h}, [x2], #32
1648*c0909341SAndroid Build Coastguard Worker        b.lt            2f                        // if (end + 1 < 19)
1649*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v0.16b,  v1.16b,  #2
1650*c0909341SAndroid Build Coastguard Worker        ext             v4.16b,  v1.16b,  v2.16b,  #2
1651*c0909341SAndroid Build Coastguard Worker        ext             v5.16b,  v0.16b,  v1.16b,  #4
1652*c0909341SAndroid Build Coastguard Worker        ext             v6.16b,  v1.16b,  v2.16b,  #4
1653*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v0.16b,  v1.16b,  #6
1654*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v1.16b,  v2.16b,  #6
1655*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v0.16b,  v1.16b,  #8
1656*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v1.16b,  v2.16b,  #8
1657*c0909341SAndroid Build Coastguard Worker        mul             v20.8h,  v0.8h,   v29.8h
1658*c0909341SAndroid Build Coastguard Worker        mla             v20.8h,  v3.8h,   v30.8h
1659*c0909341SAndroid Build Coastguard Worker        mla             v20.8h,  v5.8h,   v31.8h
1660*c0909341SAndroid Build Coastguard Worker        mla             v20.8h,  v16.8h,  v30.8h
1661*c0909341SAndroid Build Coastguard Worker        mla             v20.8h,  v18.8h,  v29.8h
1662*c0909341SAndroid Build Coastguard Worker        mul             v21.8h,  v1.8h,   v29.8h
1663*c0909341SAndroid Build Coastguard Worker        mla             v21.8h,  v4.8h,   v30.8h
1664*c0909341SAndroid Build Coastguard Worker        mla             v21.8h,  v6.8h,   v31.8h
1665*c0909341SAndroid Build Coastguard Worker        mla             v21.8h,  v17.8h,  v30.8h
1666*c0909341SAndroid Build Coastguard Worker        mla             v21.8h,  v19.8h,  v29.8h
1667*c0909341SAndroid Build Coastguard Worker        subs            w1,  w1,  #16             // flags consumed by b.gt below
1668*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v2.16b
1669*c0909341SAndroid Build Coastguard Worker        urshr           v20.8h,  v20.8h,  #4
1670*c0909341SAndroid Build Coastguard Worker        urshr           v21.8h,  v21.8h,  #4
1671*c0909341SAndroid Build Coastguard Worker        sub             w3,  w3,  #16
1672*c0909341SAndroid Build Coastguard Worker        st1             {v20.8h, v21.8h}, [x0], #32
1673*c0909341SAndroid Build Coastguard Worker        b.gt            1b
1674*c0909341SAndroid Build Coastguard Worker        ret
1675*c0909341SAndroid Build Coastguard Worker2:
1676*c0909341SAndroid Build Coastguard Worker        // Right padding
1677*c0909341SAndroid Build Coastguard Worker
1678*c0909341SAndroid Build Coastguard Worker        // x2[w3+1-24] is the padding pixel (x2 points 24 pixels ahead)
1679*c0909341SAndroid Build Coastguard Worker        movrel          x5,  padding_mask, -2
1680*c0909341SAndroid Build Coastguard Worker        sub             w6,  w3,  #23
1681*c0909341SAndroid Build Coastguard Worker        sub             x5,  x5,  w3,  uxtw #1
1682*c0909341SAndroid Build Coastguard Worker        add             x6,  x2,  w6,  sxtw #1
1683*c0909341SAndroid Build Coastguard Worker
1684*c0909341SAndroid Build Coastguard Worker        ld1             {v3.8h, v4.8h, v5.8h}, [x5] // padding_mask
1685*c0909341SAndroid Build Coastguard Worker
1686*c0909341SAndroid Build Coastguard Worker        ld1r            {v28.8h}, [x6]            // v28 = 8 copies of the padding pixel
1687*c0909341SAndroid Build Coastguard Worker        bit             v0.16b,  v28.16b, v3.16b  // Pad v0-v2
1688*c0909341SAndroid Build Coastguard Worker        bit             v1.16b,  v28.16b, v4.16b
1689*c0909341SAndroid Build Coastguard Worker        bit             v2.16b,  v28.16b, v5.16b
1690*c0909341SAndroid Build Coastguard Worker4:
1691*c0909341SAndroid Build Coastguard Worker        // Filter one block
1692*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v0.16b,  v1.16b,  #2
1693*c0909341SAndroid Build Coastguard Worker        ext             v4.16b,  v1.16b,  v2.16b,  #2
1694*c0909341SAndroid Build Coastguard Worker        ext             v5.16b,  v0.16b,  v1.16b,  #4
1695*c0909341SAndroid Build Coastguard Worker        ext             v6.16b,  v1.16b,  v2.16b,  #4
1696*c0909341SAndroid Build Coastguard Worker        ext             v16.16b, v0.16b,  v1.16b,  #6
1697*c0909341SAndroid Build Coastguard Worker        ext             v17.16b, v1.16b,  v2.16b,  #6
1698*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v0.16b,  v1.16b,  #8
1699*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v1.16b,  v2.16b,  #8
1700*c0909341SAndroid Build Coastguard Worker        mul             v20.8h,  v0.8h,   v29.8h
1701*c0909341SAndroid Build Coastguard Worker        mla             v20.8h,  v3.8h,   v30.8h
1702*c0909341SAndroid Build Coastguard Worker        mla             v20.8h,  v5.8h,   v31.8h
1703*c0909341SAndroid Build Coastguard Worker        mla             v20.8h,  v16.8h,  v30.8h
1704*c0909341SAndroid Build Coastguard Worker        mla             v20.8h,  v18.8h,  v29.8h
1705*c0909341SAndroid Build Coastguard Worker        mul             v21.8h,  v1.8h,   v29.8h
1706*c0909341SAndroid Build Coastguard Worker        mla             v21.8h,  v4.8h,   v30.8h
1707*c0909341SAndroid Build Coastguard Worker        mla             v21.8h,  v6.8h,   v31.8h
1708*c0909341SAndroid Build Coastguard Worker        mla             v21.8h,  v17.8h,  v30.8h
1709*c0909341SAndroid Build Coastguard Worker        mla             v21.8h,  v19.8h,  v29.8h
1710*c0909341SAndroid Build Coastguard Worker        subs            w1,  w1,  #16             // flags consumed by b.le below
1711*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v2.16b           // next block: v2, then pure padding
1712*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v28.16b
1713*c0909341SAndroid Build Coastguard Worker        mov             v2.16b,  v28.16b
1714*c0909341SAndroid Build Coastguard Worker        urshr           v20.8h,  v20.8h,  #4
1715*c0909341SAndroid Build Coastguard Worker        urshr           v21.8h,  v21.8h,  #4
1716*c0909341SAndroid Build Coastguard Worker        sub             w3,  w3,  #16
1717*c0909341SAndroid Build Coastguard Worker        st1             {v20.8h, v21.8h}, [x0], #32
1718*c0909341SAndroid Build Coastguard Worker        b.le            9f
1719*c0909341SAndroid Build Coastguard Worker        // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to
1720*c0909341SAndroid Build Coastguard Worker        // filter properly once more - aka (w3 >= 0).
1721*c0909341SAndroid Build Coastguard Worker        cmp             w3,  #0
1722*c0909341SAndroid Build Coastguard Worker        b.ge            4b
1723*c0909341SAndroid Build Coastguard Worker5:
1724*c0909341SAndroid Build Coastguard Worker        // When w3 < 0, all remaining pixels in v0-v1 are equal to the
1725*c0909341SAndroid Build Coastguard Worker        // last valid pixel - thus just output that without filtering.
1726*c0909341SAndroid Build Coastguard Worker        subs            w1,  w1,  #8
1727*c0909341SAndroid Build Coastguard Worker        st1             {v28.8h}, [x0], #16
1728*c0909341SAndroid Build Coastguard Worker        b.gt            5b
1729*c0909341SAndroid Build Coastguard Worker9:
1730*c0909341SAndroid Build Coastguard Worker        ret
1731*c0909341SAndroid Build Coastguard Workerendfunc
1732*c0909341SAndroid Build Coastguard Worker
1733*c0909341SAndroid Build Coastguard Worker// void ipred_pixel_set_16bpc_neon(pixel *out, const pixel px,
1734*c0909341SAndroid Build Coastguard Worker//                                 const int n);
//
// Fills out[] with the 16 bit pixel value px (x0 = out, w1 = px, w2 = n).
// Pixels are written 8 at a time and the count is only tested after each
// store, so n is effectively rounded up to a multiple of 8 and at least
// 8 pixels are always written.
1735*c0909341SAndroid Build Coastguard Workerfunction ipred_pixel_set_16bpc_neon, export=1
1736*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   w1               // 8 copies of px
1737*c0909341SAndroid Build Coastguard Worker1:
1738*c0909341SAndroid Build Coastguard Worker        subs            w2,  w2,  #8              // 8 pixels per store; flags for b.gt
1739*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h}, [x0], #16
1740*c0909341SAndroid Build Coastguard Worker        b.gt            1b
1741*c0909341SAndroid Build Coastguard Worker        ret
1742*c0909341SAndroid Build Coastguard Workerendfunc
1743*c0909341SAndroid Build Coastguard Worker
1744*c0909341SAndroid Build Coastguard Worker// void ipred_z1_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
1745*c0909341SAndroid Build Coastguard Worker//                                const pixel *const top,
1746*c0909341SAndroid Build Coastguard Worker//                                const int width, const int height,
1747*c0909341SAndroid Build Coastguard Worker//                                const int dx, const int max_base_x);
1748*c0909341SAndroid Build Coastguard Workerfunction ipred_z1_fill1_16bpc_neon, export=1
1749*c0909341SAndroid Build Coastguard Worker        clz             w9,  w3
1750*c0909341SAndroid Build Coastguard Worker        movrel          x8,  ipred_z1_fill1_tbl
1751*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #25
1752*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x8, w9, uxtw #2]
1753*c0909341SAndroid Build Coastguard Worker        add             x10, x2,  w6,  uxtw #1    // top[max_base_x]
1754*c0909341SAndroid Build Coastguard Worker        add             x8,  x8,  x9
1755*c0909341SAndroid Build Coastguard Worker        ld1r            {v31.8h}, [x10]           // padding
1756*c0909341SAndroid Build Coastguard Worker        mov             w7,  w5
1757*c0909341SAndroid Build Coastguard Worker        mov             w15, #64
1758*c0909341SAndroid Build Coastguard Worker        br              x8
1759*c0909341SAndroid Build Coastguard Worker40:
1760*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1761*c0909341SAndroid Build Coastguard Worker4:
1762*c0909341SAndroid Build Coastguard Worker        lsr             w8,  w7,  #6              // base
1763*c0909341SAndroid Build Coastguard Worker        and             w9,  w7,  #0x3e           // frac
1764*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // xpos += dx
1765*c0909341SAndroid Build Coastguard Worker        cmp             w8,  w6                   // base >= max_base_x
1766*c0909341SAndroid Build Coastguard Worker        lsr             w10, w7,  #6              // base
1767*c0909341SAndroid Build Coastguard Worker        and             w11, w7,  #0x3e           // frac
1768*c0909341SAndroid Build Coastguard Worker        b.ge            49f
1769*c0909341SAndroid Build Coastguard Worker        lsl             w8,  w8,  #1
1770*c0909341SAndroid Build Coastguard Worker        lsl             w10, w10, #1
1771*c0909341SAndroid Build Coastguard Worker        ldr             q0,  [x2, w8, uxtw]       // top[base]
1772*c0909341SAndroid Build Coastguard Worker        ldr             q2,  [x2, w10, uxtw]
1773*c0909341SAndroid Build Coastguard Worker        dup             v4.4h,   w9               // frac
1774*c0909341SAndroid Build Coastguard Worker        dup             v5.4h,   w11
1775*c0909341SAndroid Build Coastguard Worker        ext             v1.16b,  v0.16b,  v0.16b,  #2 // top[base+1]
1776*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v2.16b,  v2.16b,  #2
1777*c0909341SAndroid Build Coastguard Worker        sub             v6.4h,   v1.4h,   v0.4h   // top[base+1]-top[base]
1778*c0909341SAndroid Build Coastguard Worker        sub             v7.4h,   v3.4h,   v2.4h
1779*c0909341SAndroid Build Coastguard Worker        ushll           v16.4s,  v0.4h,   #6      // top[base]*64
1780*c0909341SAndroid Build Coastguard Worker        ushll           v17.4s,  v2.4h,   #6
1781*c0909341SAndroid Build Coastguard Worker        smlal           v16.4s,  v6.4h,   v4.4h   // + top[base+1]*frac
1782*c0909341SAndroid Build Coastguard Worker        smlal           v17.4s,  v7.4h,   v5.4h
1783*c0909341SAndroid Build Coastguard Worker        rshrn           v16.4h,  v16.4s,  #6
1784*c0909341SAndroid Build Coastguard Worker        rshrn           v17.4h,  v17.4s,  #6
1785*c0909341SAndroid Build Coastguard Worker        st1             {v16.4h}, [x0], x1
1786*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // xpos += dx
1787*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
1788*c0909341SAndroid Build Coastguard Worker        st1             {v17.4h}, [x0], x1
1789*c0909341SAndroid Build Coastguard Worker        b.gt            4b
1790*c0909341SAndroid Build Coastguard Worker        ret
1791*c0909341SAndroid Build Coastguard Worker
1792*c0909341SAndroid Build Coastguard Worker49:
1793*c0909341SAndroid Build Coastguard Worker        st1             {v31.4h}, [x0], x1
1794*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
1795*c0909341SAndroid Build Coastguard Worker        st1             {v31.4h}, [x0], x1
1796*c0909341SAndroid Build Coastguard Worker        b.gt            49b
1797*c0909341SAndroid Build Coastguard Worker        ret
1798*c0909341SAndroid Build Coastguard Worker
1799*c0909341SAndroid Build Coastguard Worker80:
1800*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1801*c0909341SAndroid Build Coastguard Worker8:
1802*c0909341SAndroid Build Coastguard Worker        lsr             w8,  w7,  #6              // base
1803*c0909341SAndroid Build Coastguard Worker        and             w9,  w7,  #0x3e           // frac
1804*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // xpos += dx
1805*c0909341SAndroid Build Coastguard Worker        cmp             w8,  w6                   // base >= max_base_x
1806*c0909341SAndroid Build Coastguard Worker        lsr             w10, w7,  #6              // base
1807*c0909341SAndroid Build Coastguard Worker        and             w11, w7,  #0x3e           // frac
1808*c0909341SAndroid Build Coastguard Worker        b.ge            89f
1809*c0909341SAndroid Build Coastguard Worker        add             x8,  x2,  w8,  uxtw #1
1810*c0909341SAndroid Build Coastguard Worker        add             x10, x2,  w10, uxtw #1
1811*c0909341SAndroid Build Coastguard Worker        dup             v4.8h,   w9               // frac
1812*c0909341SAndroid Build Coastguard Worker        dup             v5.8h,   w11
1813*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h},  [x8]            // top[base]
1814*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h},  [x10]
1815*c0909341SAndroid Build Coastguard Worker        sub             w9,  w15, w9              // 64 - frac
1816*c0909341SAndroid Build Coastguard Worker        sub             w11, w15, w11
1817*c0909341SAndroid Build Coastguard Worker        ldr             h1, [x8, #16]
1818*c0909341SAndroid Build Coastguard Worker        ldr             h3, [x10, #16]
1819*c0909341SAndroid Build Coastguard Worker        dup             v6.8h,   w9               // 64 - frac
1820*c0909341SAndroid Build Coastguard Worker        dup             v7.8h,   w11
1821*c0909341SAndroid Build Coastguard Worker        ext             v1.16b,  v0.16b,  v1.16b,  #2 // top[base+1]
1822*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v2.16b,  v3.16b,  #2
1823*c0909341SAndroid Build Coastguard Worker        umull           v16.4s,  v0.4h,   v6.4h   // top[base]*(64-frac)
1824*c0909341SAndroid Build Coastguard Worker        umlal           v16.4s,  v1.4h,   v4.4h   // + top[base+1]*frac
1825*c0909341SAndroid Build Coastguard Worker        umull2          v17.4s,  v0.8h,   v6.8h
1826*c0909341SAndroid Build Coastguard Worker        umlal2          v17.4s,  v1.8h,   v4.8h
1827*c0909341SAndroid Build Coastguard Worker        umull           v18.4s,  v2.4h,   v7.4h
1828*c0909341SAndroid Build Coastguard Worker        umlal           v18.4s,  v3.4h,   v5.4h
1829*c0909341SAndroid Build Coastguard Worker        umull2          v19.4s,  v2.8h,   v7.8h
1830*c0909341SAndroid Build Coastguard Worker        umlal2          v19.4s,  v3.8h,   v5.8h
1831*c0909341SAndroid Build Coastguard Worker        rshrn           v16.4h,  v16.4s,  #6
1832*c0909341SAndroid Build Coastguard Worker        rshrn2          v16.8h,  v17.4s,  #6
1833*c0909341SAndroid Build Coastguard Worker        rshrn           v17.4h,  v18.4s,  #6
1834*c0909341SAndroid Build Coastguard Worker        rshrn2          v17.8h,  v19.4s,  #6
1835*c0909341SAndroid Build Coastguard Worker        st1             {v16.8h}, [x0], x1
1836*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // xpos += dx
1837*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
1838*c0909341SAndroid Build Coastguard Worker        st1             {v17.8h}, [x0], x1
1839*c0909341SAndroid Build Coastguard Worker        b.gt            8b
1840*c0909341SAndroid Build Coastguard Worker        ret
1841*c0909341SAndroid Build Coastguard Worker
1842*c0909341SAndroid Build Coastguard Worker89:
1843*c0909341SAndroid Build Coastguard Worker        st1             {v31.8h}, [x0], x1
1844*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
1845*c0909341SAndroid Build Coastguard Worker        st1             {v31.8h}, [x0], x1
1846*c0909341SAndroid Build Coastguard Worker        b.gt            89b
1847*c0909341SAndroid Build Coastguard Worker        ret
1848*c0909341SAndroid Build Coastguard Worker
1849*c0909341SAndroid Build Coastguard Worker160:
1850*c0909341SAndroid Build Coastguard Worker320:
1851*c0909341SAndroid Build Coastguard Worker640:
1852*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
1853*c0909341SAndroid Build Coastguard Worker
1854*c0909341SAndroid Build Coastguard Worker        mov             w12, w3
1855*c0909341SAndroid Build Coastguard Worker
1856*c0909341SAndroid Build Coastguard Worker        add             x13, x0,  x1
1857*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
1858*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  w3,  uxtw #1
1859*c0909341SAndroid Build Coastguard Worker1:
1860*c0909341SAndroid Build Coastguard Worker        lsr             w8,  w7,  #6              // base
1861*c0909341SAndroid Build Coastguard Worker        and             w9,  w7,  #0x3e           // frac
1862*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // xpos += dx
1863*c0909341SAndroid Build Coastguard Worker        cmp             w8,  w6                   // base >= max_base_x
1864*c0909341SAndroid Build Coastguard Worker        lsr             w10, w7,  #6              // base
1865*c0909341SAndroid Build Coastguard Worker        and             w11, w7,  #0x3e           // frac
1866*c0909341SAndroid Build Coastguard Worker        b.ge            169f
1867*c0909341SAndroid Build Coastguard Worker        add             x8,  x2,  w8,  uxtw #1
1868*c0909341SAndroid Build Coastguard Worker        add             x10, x2,  w10, uxtw #1
1869*c0909341SAndroid Build Coastguard Worker        dup             v6.8h,   w9               // frac
1870*c0909341SAndroid Build Coastguard Worker        dup             v7.8h,   w11
1871*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h}, [x8],  #48 // top[base]
1872*c0909341SAndroid Build Coastguard Worker        ld1             {v3.8h, v4.8h, v5.8h}, [x10], #48
1873*c0909341SAndroid Build Coastguard Worker        sub             w9,  w15, w9              // 64 - frac
1874*c0909341SAndroid Build Coastguard Worker        sub             w11, w15, w11
1875*c0909341SAndroid Build Coastguard Worker        dup             v16.8h,  w9               // 64 - frac
1876*c0909341SAndroid Build Coastguard Worker        dup             v17.8h,  w11
1877*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // xpos += dx
1878*c0909341SAndroid Build Coastguard Worker2:
1879*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v0.16b,  v1.16b,  #2 // top[base+1]
1880*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v1.16b,  v2.16b,  #2
1881*c0909341SAndroid Build Coastguard Worker        ext             v20.16b, v3.16b,  v4.16b,  #2
1882*c0909341SAndroid Build Coastguard Worker        ext             v21.16b, v4.16b,  v5.16b,  #2
1883*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #16
1884*c0909341SAndroid Build Coastguard Worker        umull           v22.4s,  v0.4h,   v16.4h  // top[base]*(64-frac)
1885*c0909341SAndroid Build Coastguard Worker        umlal           v22.4s,  v18.4h,  v6.4h   // + top[base+1]*frac
1886*c0909341SAndroid Build Coastguard Worker        umull2          v23.4s,  v0.8h,   v16.8h
1887*c0909341SAndroid Build Coastguard Worker        umlal2          v23.4s,  v18.8h,  v6.8h
1888*c0909341SAndroid Build Coastguard Worker        umull           v24.4s,  v1.4h,   v16.4h
1889*c0909341SAndroid Build Coastguard Worker        umlal           v24.4s,  v19.4h,  v6.4h
1890*c0909341SAndroid Build Coastguard Worker        umull2          v25.4s,  v1.8h,   v16.8h
1891*c0909341SAndroid Build Coastguard Worker        umlal2          v25.4s,  v19.8h,  v6.8h
1892*c0909341SAndroid Build Coastguard Worker        umull           v26.4s,  v3.4h,   v17.4h
1893*c0909341SAndroid Build Coastguard Worker        umlal           v26.4s,  v20.4h,  v7.4h
1894*c0909341SAndroid Build Coastguard Worker        umull2          v27.4s,  v3.8h,   v17.8h
1895*c0909341SAndroid Build Coastguard Worker        umlal2          v27.4s,  v20.8h,  v7.8h
1896*c0909341SAndroid Build Coastguard Worker        umull           v28.4s,  v4.4h,   v17.4h
1897*c0909341SAndroid Build Coastguard Worker        umlal           v28.4s,  v21.4h,  v7.4h
1898*c0909341SAndroid Build Coastguard Worker        umull2          v29.4s,  v4.8h,   v17.8h
1899*c0909341SAndroid Build Coastguard Worker        umlal2          v29.4s,  v21.8h,  v7.8h
1900*c0909341SAndroid Build Coastguard Worker        rshrn           v22.4h,  v22.4s,  #6
1901*c0909341SAndroid Build Coastguard Worker        rshrn2          v22.8h,  v23.4s,  #6
1902*c0909341SAndroid Build Coastguard Worker        rshrn           v23.4h,  v24.4s,  #6
1903*c0909341SAndroid Build Coastguard Worker        rshrn2          v23.8h,  v25.4s,  #6
1904*c0909341SAndroid Build Coastguard Worker        rshrn           v24.4h,  v26.4s,  #6
1905*c0909341SAndroid Build Coastguard Worker        rshrn2          v24.8h,  v27.4s,  #6
1906*c0909341SAndroid Build Coastguard Worker        rshrn           v25.4h,  v28.4s,  #6
1907*c0909341SAndroid Build Coastguard Worker        rshrn2          v25.8h,  v29.4s,  #6
1908*c0909341SAndroid Build Coastguard Worker        st1             {v22.8h, v23.8h}, [x0],  #32
1909*c0909341SAndroid Build Coastguard Worker        st1             {v24.8h, v25.8h}, [x13], #32
1910*c0909341SAndroid Build Coastguard Worker        b.le            3f
1911*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v2.16b
1912*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h, v2.8h}, [x8],  #32 // top[base]
1913*c0909341SAndroid Build Coastguard Worker        mov             v3.16b,  v5.16b
1914*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h}, [x10], #32
1915*c0909341SAndroid Build Coastguard Worker        b               2b
1916*c0909341SAndroid Build Coastguard Worker
1917*c0909341SAndroid Build Coastguard Worker3:
1918*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
1919*c0909341SAndroid Build Coastguard Worker        b.le            9f
1920*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1
1921*c0909341SAndroid Build Coastguard Worker        add             x13, x13, x1
1922*c0909341SAndroid Build Coastguard Worker        mov             w3,  w12
1923*c0909341SAndroid Build Coastguard Worker        b               1b
1924*c0909341SAndroid Build Coastguard Worker9:
1925*c0909341SAndroid Build Coastguard Worker        ret
1926*c0909341SAndroid Build Coastguard Worker
1927*c0909341SAndroid Build Coastguard Worker169:
1928*c0909341SAndroid Build Coastguard Worker        st1             {v31.8h}, [x0],  #16
1929*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #8
1930*c0909341SAndroid Build Coastguard Worker        st1             {v31.8h}, [x13], #16
1931*c0909341SAndroid Build Coastguard Worker        b.gt            169b
1932*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
1933*c0909341SAndroid Build Coastguard Worker        b.le            9b
1934*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1
1935*c0909341SAndroid Build Coastguard Worker        add             x13, x13, x1
1936*c0909341SAndroid Build Coastguard Worker        mov             w3,  w12
1937*c0909341SAndroid Build Coastguard Worker        b               169b
1938*c0909341SAndroid Build Coastguard Workerendfunc
1939*c0909341SAndroid Build Coastguard Worker
// Jump table: each .word entry is the distance from the start of this table
// to one of the numeric local labels 640, 320, 160, 80 and 40 (the "b" suffix
// refers to the nearest definition of that label earlier in the file).
// Adding an entry to the table's address yields the branch target.
// NOTE(review): the labels and the dispatch code are outside this view;
// presumably they are the per-width paths (w == 64 down to w == 4) of
// ipred_z1_fill1_16bpc_neon, indexed by clz(width) - 25 like the dispatch at
// the top of ipred_z2_fill1_16bpc_neon - confirm. The entry order must match
// that index computation.
1940*c0909341SAndroid Build Coastguard Workerjumptable ipred_z1_fill1_tbl
1941*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_z1_fill1_tbl
1942*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_z1_fill1_tbl
1943*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_z1_fill1_tbl
1944*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_z1_fill1_tbl
1945*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_z1_fill1_tbl
1946*c0909341SAndroid Build Coastguard Workerendjumptable
1947*c0909341SAndroid Build Coastguard Worker
// ipred_z1_fill2_16bpc_neon
//
// Fills a block of 16-bit pixels two rows per iteration from the `top` edge.
// Within a row, horizontally adjacent output pixels are two edge elements
// apart: output x blends top[base + 2*x] and top[base + 2*x + 1].
// NOTE(review): presumably this is the variant used when the top edge has
// been upsampled by 2 - confirm against the caller.
//
// Register use, as read from the code below:
//   x0 = dst (advanced by x1 after every row)   x1 = stride in bytes
//   x2 = top (16-bit elements)                  w3 = width (8 -> "8:" path,
//                                                    otherwise the "4:" path)
//   w4 = height (consumed two rows at a time)   w5 = dx
//   w6 = max_base_x (index into top)
//   w7 = xpos, starts at dx and grows by dx per row
//   v31 = top[max_base_x] replicated, used as padding
//
// Per row: base = xpos >> 6, frac = xpos & 0x3e, and
//   out = (top[b]*(64 - frac) + top[b+1]*frac + 32) >> 6
// Only the base of the first row of each pair is compared with max_base_x;
// once it reaches max_base_x all remaining rows are filled with the padding
// pixel.
1948*c0909341SAndroid Build Coastguard Workerfunction ipred_z1_fill2_16bpc_neon, export=1
1949*c0909341SAndroid Build Coastguard Worker        cmp             w3,  #8
        // top[] holds 16-bit pixels, so the index must be scaled by 2 and a
        // whole halfword replicated; the padding paths store v31 as .4h/.8h.
1950*c0909341SAndroid Build Coastguard Worker        add             x10, x2,  w6,  uxtw #1    // top[max_base_x]
1951*c0909341SAndroid Build Coastguard Worker        ld1r            {v31.8h}, [x10]           // padding
1952*c0909341SAndroid Build Coastguard Worker        mov             w7,  w5
1953*c0909341SAndroid Build Coastguard Worker        mov             w15, #64
1954*c0909341SAndroid Build Coastguard Worker        b.eq            8f
1955*c0909341SAndroid Build Coastguard Worker
1956*c0909341SAndroid Build Coastguard Worker4:      // w == 4
1957*c0909341SAndroid Build Coastguard Worker        lsr             w8,  w7,  #6              // base
1958*c0909341SAndroid Build Coastguard Worker        and             w9,  w7,  #0x3e           // frac
1959*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // xpos += dx
1960*c0909341SAndroid Build Coastguard Worker        cmp             w8,  w6                   // base >= max_base_x
1961*c0909341SAndroid Build Coastguard Worker        lsr             w10, w7,  #6              // base
1962*c0909341SAndroid Build Coastguard Worker        and             w11, w7,  #0x3e           // frac
1963*c0909341SAndroid Build Coastguard Worker        b.ge            49f
1964*c0909341SAndroid Build Coastguard Worker        lsl             w8,  w8,  #1
1965*c0909341SAndroid Build Coastguard Worker        lsl             w10, w10, #1
1966*c0909341SAndroid Build Coastguard Worker        ldr             q0,  [x2, w8, uxtw]       // top[base]
1967*c0909341SAndroid Build Coastguard Worker        ldr             q2,  [x2, w10, uxtw]
1968*c0909341SAndroid Build Coastguard Worker        dup             v4.4h,   w9               // frac
1969*c0909341SAndroid Build Coastguard Worker        dup             v5.4h,   w11
        // Deinterleave: odd elements are the "+1" neighbours of the even ones.
1970*c0909341SAndroid Build Coastguard Worker        uzp2            v1.8h,   v0.8h,   v0.8h   // top[base+1]
1971*c0909341SAndroid Build Coastguard Worker        uzp1            v0.8h,   v0.8h,   v0.8h   // top[base]
1972*c0909341SAndroid Build Coastguard Worker        uzp2            v3.8h,   v2.8h,   v2.8h
1973*c0909341SAndroid Build Coastguard Worker        uzp1            v2.8h,   v2.8h,   v2.8h
1974*c0909341SAndroid Build Coastguard Worker        sub             v6.4h,   v1.4h,   v0.4h   // top[base+1]-top[base]
1975*c0909341SAndroid Build Coastguard Worker        sub             v7.4h,   v3.4h,   v2.4h
1976*c0909341SAndroid Build Coastguard Worker        ushll           v16.4s,  v0.4h,   #6      // top[base]*64
1977*c0909341SAndroid Build Coastguard Worker        ushll           v17.4s,  v2.4h,   #6
1978*c0909341SAndroid Build Coastguard Worker        smlal           v16.4s,  v6.4h,   v4.4h   // + (top[base+1]-top[base])*frac
1979*c0909341SAndroid Build Coastguard Worker        smlal           v17.4s,  v7.4h,   v5.4h
1980*c0909341SAndroid Build Coastguard Worker        rshrn           v16.4h,  v16.4s,  #6
1981*c0909341SAndroid Build Coastguard Worker        rshrn           v17.4h,  v17.4s,  #6
1982*c0909341SAndroid Build Coastguard Worker        st1             {v16.4h}, [x0], x1
1983*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // xpos += dx
1984*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
1985*c0909341SAndroid Build Coastguard Worker        st1             {v17.4h}, [x0], x1
1986*c0909341SAndroid Build Coastguard Worker        b.gt            4b
1987*c0909341SAndroid Build Coastguard Worker        ret
1988*c0909341SAndroid Build Coastguard Worker
1989*c0909341SAndroid Build Coastguard Worker49:     // w == 4: fill all remaining rows with the padding pixel
1990*c0909341SAndroid Build Coastguard Worker        st1             {v31.4h}, [x0], x1
1991*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
1992*c0909341SAndroid Build Coastguard Worker        st1             {v31.4h}, [x0], x1
1993*c0909341SAndroid Build Coastguard Worker        b.gt            49b
1994*c0909341SAndroid Build Coastguard Worker        ret
1995*c0909341SAndroid Build Coastguard Worker
1996*c0909341SAndroid Build Coastguard Worker8:      // w == 8
1997*c0909341SAndroid Build Coastguard Worker        lsr             w8,  w7,  #6              // base
1998*c0909341SAndroid Build Coastguard Worker        and             w9,  w7,  #0x3e           // frac
1999*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // xpos += dx
2000*c0909341SAndroid Build Coastguard Worker        cmp             w8,  w6                   // base >= max_base_x
2001*c0909341SAndroid Build Coastguard Worker        lsr             w10, w7,  #6              // base
2002*c0909341SAndroid Build Coastguard Worker        and             w11, w7,  #0x3e           // frac
2003*c0909341SAndroid Build Coastguard Worker        b.ge            89f
2004*c0909341SAndroid Build Coastguard Worker        add             x8,  x2,  w8,  uxtw #1
2005*c0909341SAndroid Build Coastguard Worker        add             x10, x2,  w10, uxtw #1
2006*c0909341SAndroid Build Coastguard Worker        dup             v4.8h,   w9               // frac
2007*c0909341SAndroid Build Coastguard Worker        dup             v5.8h,   w11
2008*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h},  [x8]     // top[base]
2009*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h},  [x10]
2010*c0909341SAndroid Build Coastguard Worker        sub             w9,  w15, w9              // 64 - frac
2011*c0909341SAndroid Build Coastguard Worker        sub             w11, w15, w11
2012*c0909341SAndroid Build Coastguard Worker        dup             v6.8h,   w9               // 64 - frac
2013*c0909341SAndroid Build Coastguard Worker        dup             v7.8h,   w11
        // Deinterleave 16 loaded elements into 8 even and 8 odd ones.
2014*c0909341SAndroid Build Coastguard Worker        uzp2            v20.8h,  v0.8h,   v1.8h   // top[base+1]
2015*c0909341SAndroid Build Coastguard Worker        uzp1            v0.8h,   v0.8h,   v1.8h   // top[base]
2016*c0909341SAndroid Build Coastguard Worker        uzp2            v21.8h,  v2.8h,   v3.8h
2017*c0909341SAndroid Build Coastguard Worker        uzp1            v2.8h,   v2.8h,   v3.8h
2018*c0909341SAndroid Build Coastguard Worker        umull           v16.4s,  v0.4h,   v6.4h   // top[base]*(64-frac)
2019*c0909341SAndroid Build Coastguard Worker        umlal           v16.4s,  v20.4h,  v4.4h   // + top[base+1]*frac
2020*c0909341SAndroid Build Coastguard Worker        umull2          v17.4s,  v0.8h,   v6.8h
2021*c0909341SAndroid Build Coastguard Worker        umlal2          v17.4s,  v20.8h,  v4.8h
2022*c0909341SAndroid Build Coastguard Worker        umull           v18.4s,  v2.4h,   v7.4h
2023*c0909341SAndroid Build Coastguard Worker        umlal           v18.4s,  v21.4h,  v5.4h
2024*c0909341SAndroid Build Coastguard Worker        umull2          v19.4s,  v2.8h,   v7.8h
2025*c0909341SAndroid Build Coastguard Worker        umlal2          v19.4s,  v21.8h,  v5.8h
2026*c0909341SAndroid Build Coastguard Worker        rshrn           v16.4h,  v16.4s,  #6
2027*c0909341SAndroid Build Coastguard Worker        rshrn2          v16.8h,  v17.4s,  #6
2028*c0909341SAndroid Build Coastguard Worker        rshrn           v17.4h,  v18.4s,  #6
2029*c0909341SAndroid Build Coastguard Worker        rshrn2          v17.8h,  v19.4s,  #6
2030*c0909341SAndroid Build Coastguard Worker        st1             {v16.8h}, [x0], x1
2031*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // xpos += dx
2032*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
2033*c0909341SAndroid Build Coastguard Worker        st1             {v17.8h}, [x0], x1
2034*c0909341SAndroid Build Coastguard Worker        b.gt            8b
2035*c0909341SAndroid Build Coastguard Worker        ret
2036*c0909341SAndroid Build Coastguard Worker
2037*c0909341SAndroid Build Coastguard Worker89:     // w == 8: fill all remaining rows with the padding pixel
2038*c0909341SAndroid Build Coastguard Worker        st1             {v31.8h}, [x0], x1
2039*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
2040*c0909341SAndroid Build Coastguard Worker        st1             {v31.8h}, [x0], x1
2041*c0909341SAndroid Build Coastguard Worker        b.gt            89b
2042*c0909341SAndroid Build Coastguard Worker        ret
2043*c0909341SAndroid Build Coastguard Workerendfunc
2044*c0909341SAndroid Build Coastguard Worker
2045*c0909341SAndroid Build Coastguard Worker// void ipred_reverse_16bpc_neon(pixel *dst, const pixel *const src,
2046*c0909341SAndroid Build Coastguard Worker//                               const int n);
//
// Copies the 16-bit pixels located immediately before `src` to `dst` in
// reversed order, i.e. dst[i] = src[-1 - i]. Eight pixels (16 bytes) are
// handled per iteration and the loop repeats while the remaining count is
// still positive after subtracting 8, so at least one iteration always runs
// and n is effectively rounded up to a multiple of 8.
// Registers: x0 = dst, x1 = src, w2 = n (argument order of the prototype
// above); x3 = second store pointer (dst + 8 bytes); x4 = store stride (16).
2047*c0909341SAndroid Build Coastguard Workerfunction ipred_reverse_16bpc_neon, export=1
2048*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  #16             // point at src[-8]
2049*c0909341SAndroid Build Coastguard Worker        add             x3,  x0,  #8              // dst + 4 pixels
2050*c0909341SAndroid Build Coastguard Worker        mov             x4,  #16                  // both dst pointers advance 8 pixels
2051*c0909341SAndroid Build Coastguard Worker1:
2052*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h}, [x1]             // 8 pixels, lowest address first
2053*c0909341SAndroid Build Coastguard Worker        subs            w2,  w2,  #8              // flags consumed by b.gt below
2054*c0909341SAndroid Build Coastguard Worker        rev64           v0.8h,  v0.8h             // reverse the 4 pixels in each 64-bit half
2055*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  #16             // step back to the previous 8 pixels
2056*c0909341SAndroid Build Coastguard Worker        st1             {v0.d}[1], [x0], x4       // upper half goes first in dst
2057*c0909341SAndroid Build Coastguard Worker        st1             {v0.d}[0], [x3], x4       // lower half follows at dst + 8 bytes
2058*c0909341SAndroid Build Coastguard Worker        b.gt            1b
2059*c0909341SAndroid Build Coastguard Worker        ret
2060*c0909341SAndroid Build Coastguard Workerendfunc
2061*c0909341SAndroid Build Coastguard Worker
// Lane indices 0..7 as 16-bit values. ipred_z2_fill1_16bpc_neon loads this
// vector into v31 and uses it to form per-lane multiples of -dy and per-lane
// base_x offsets.
2062*c0909341SAndroid Build Coastguard Workerconst increments
2063*c0909341SAndroid Build Coastguard Worker        .short          0,  1,  2,  3,  4,  5,  6,  7
2064*c0909341SAndroid Build Coastguard Workerendconst
2065*c0909341SAndroid Build Coastguard Worker
2066*c0909341SAndroid Build Coastguard Worker// void ipred_z2_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
2067*c0909341SAndroid Build Coastguard Worker//                                const pixel *const top,
2068*c0909341SAndroid Build Coastguard Worker//                                const pixel *const left,
2069*c0909341SAndroid Build Coastguard Worker//                                const int width, const int height,
2070*c0909341SAndroid Build Coastguard Worker//                                const int dx, const int dy);
2071*c0909341SAndroid Build Coastguard Workerfunction ipred_z2_fill1_16bpc_neon, export=1
2072*c0909341SAndroid Build Coastguard Worker        clz             w10, w4
2073*c0909341SAndroid Build Coastguard Worker        movrel          x9,  ipred_z2_fill1_tbl
2074*c0909341SAndroid Build Coastguard Worker        sub             w10, w10, #25
2075*c0909341SAndroid Build Coastguard Worker        ldrsw           x10, [x9, w10, uxtw #2]
2076*c0909341SAndroid Build Coastguard Worker        mov             w8,  #(1 << 6)            // xpos = 1 << 6
2077*c0909341SAndroid Build Coastguard Worker        add             x9,  x9,  x10
2078*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2079*c0909341SAndroid Build Coastguard Worker
2080*c0909341SAndroid Build Coastguard Worker        movrel          x11, increments
2081*c0909341SAndroid Build Coastguard Worker        ld1             {v31.8h},  [x11]          // increments
2082*c0909341SAndroid Build Coastguard Worker        neg             w7,  w7                   // -dy
2083*c0909341SAndroid Build Coastguard Worker
2084*c0909341SAndroid Build Coastguard Worker        br              x9
2085*c0909341SAndroid Build Coastguard Worker40:
2086*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
2087*c0909341SAndroid Build Coastguard Worker
2088*c0909341SAndroid Build Coastguard Worker        dup             v30.4h,  w7               // -dy
2089*c0909341SAndroid Build Coastguard Worker        movi            v17.8b,  #1
2090*c0909341SAndroid Build Coastguard Worker
2091*c0909341SAndroid Build Coastguard Worker        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
2092*c0909341SAndroid Build Coastguard Worker        movi            v25.8h,  #0x3e
2093*c0909341SAndroid Build Coastguard Worker        add             v30.4h,  v16.4h,  v30.4h  // -= dy
2094*c0909341SAndroid Build Coastguard Worker
2095*c0909341SAndroid Build Coastguard Worker        // Worst case height for w=4 is 16, but we need at least h+1 elements
2096*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h}, [x3]    // left[]
2097*c0909341SAndroid Build Coastguard Worker
2098*c0909341SAndroid Build Coastguard Worker        movi            v26.8h,  #64
2099*c0909341SAndroid Build Coastguard Worker        movi            v19.16b, #4
2100*c0909341SAndroid Build Coastguard Worker
2101*c0909341SAndroid Build Coastguard Worker        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
2102*c0909341SAndroid Build Coastguard Worker        and             v27.8b,  v30.8b,  v25.8b  // frac_y
2103*c0909341SAndroid Build Coastguard Worker
2104*c0909341SAndroid Build Coastguard Worker        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
2105*c0909341SAndroid Build Coastguard Worker
2106*c0909341SAndroid Build Coastguard Worker        movi            v23.4h,  #1, lsl #8
2107*c0909341SAndroid Build Coastguard Worker        shl             v29.8b,  v29.8b,  #1      // 2*base_y
2108*c0909341SAndroid Build Coastguard Worker        zip1            v29.8b,  v29.8b,  v29.8b  // duplicate elements
2109*c0909341SAndroid Build Coastguard Worker        movi            v17.8b,  #2
2110*c0909341SAndroid Build Coastguard Worker        add             v29.8b,  v29.8b,  v23.8b  // 2*base, 2*base+1, ...
2111*c0909341SAndroid Build Coastguard Worker
2112*c0909341SAndroid Build Coastguard Worker        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1 (*2)
2113*c0909341SAndroid Build Coastguard Worker        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2 (*2)
2114*c0909341SAndroid Build Coastguard Worker
2115*c0909341SAndroid Build Coastguard Worker        tbl             v18.8b, {v0.16b}, v29.8b  // left[base_y]
2116*c0909341SAndroid Build Coastguard Worker
2117*c0909341SAndroid Build Coastguard Worker        trn1            v30.2d,  v30.2d,  v28.2d  // base_y + 1, base_y + 2
2118*c0909341SAndroid Build Coastguard Worker
2119*c0909341SAndroid Build Coastguard Worker        sub             v28.4h,  v26.4h,  v27.4h  // 64 - frac_y
2120*c0909341SAndroid Build Coastguard Worker
2121*c0909341SAndroid Build Coastguard Worker        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,0,1,2,3}
2122*c0909341SAndroid Build Coastguard Worker
2123*c0909341SAndroid Build Coastguard Worker        trn1            v27.2d,  v27.2d,  v27.2d  // frac_y
2124*c0909341SAndroid Build Coastguard Worker        trn1            v28.2d,  v28.2d,  v28.2d  // 64 - frac_y
2125*c0909341SAndroid Build Coastguard Worker
2126*c0909341SAndroid Build Coastguard Worker        movi            v29.16b, #4
2127*c0909341SAndroid Build Coastguard Worker4:
2128*c0909341SAndroid Build Coastguard Worker        asr             w9,  w8,  #6              // base_x
2129*c0909341SAndroid Build Coastguard Worker        dup             v16.4h,  w8               // xpos
2130*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2131*c0909341SAndroid Build Coastguard Worker        cmp             w9,  #-4                  // base_x <= -4
2132*c0909341SAndroid Build Coastguard Worker        asr             w11, w8,  #6              // base_x
2133*c0909341SAndroid Build Coastguard Worker        b.le            49f
2134*c0909341SAndroid Build Coastguard Worker
2135*c0909341SAndroid Build Coastguard Worker        lsl             w9,  w9,  #1
2136*c0909341SAndroid Build Coastguard Worker        lsl             w11, w11, #1
2137*c0909341SAndroid Build Coastguard Worker
2138*c0909341SAndroid Build Coastguard Worker        dup             v17.4h,  w8               // xpos
2139*c0909341SAndroid Build Coastguard Worker
2140*c0909341SAndroid Build Coastguard Worker        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
2141*c0909341SAndroid Build Coastguard Worker        ldr             q6,  [x2, w11, sxtw]
2142*c0909341SAndroid Build Coastguard Worker
2143*c0909341SAndroid Build Coastguard Worker        trn1            v16.2d,  v16.2d,  v17.2d  // xpos
2144*c0909341SAndroid Build Coastguard Worker
2145*c0909341SAndroid Build Coastguard Worker        // Cut corners here; only doing tbl over v0-v1 here; we only
2146*c0909341SAndroid Build Coastguard Worker        // seem to need the last pixel, from v2, after skipping to the
2147*c0909341SAndroid Build Coastguard Worker        // left-only codepath below.
2148*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]
2149*c0909341SAndroid Build Coastguard Worker
2150*c0909341SAndroid Build Coastguard Worker        sshr            v20.8h,  v16.8h,  #6      // first base_x for each row
2151*c0909341SAndroid Build Coastguard Worker
2152*c0909341SAndroid Build Coastguard Worker        ext             v5.16b,  v4.16b,  v4.16b,  #2 // top[base_x+1]
2153*c0909341SAndroid Build Coastguard Worker        ext             v7.16b,  v6.16b,  v6.16b,  #2
2154*c0909341SAndroid Build Coastguard Worker
2155*c0909341SAndroid Build Coastguard Worker        and             v16.16b, v16.16b, v25.16b // frac_x
2156*c0909341SAndroid Build Coastguard Worker
2157*c0909341SAndroid Build Coastguard Worker        trn1            v18.2d,  v18.2d,  v19.2d  // left[base_y], left[base_y+1]
2158*c0909341SAndroid Build Coastguard Worker
2159*c0909341SAndroid Build Coastguard Worker        trn1            v4.2d,   v4.2d,   v6.2d   // top[base_x]
2160*c0909341SAndroid Build Coastguard Worker        trn1            v5.2d,   v5.2d,   v7.2d   // top[base_x+1]
2161*c0909341SAndroid Build Coastguard Worker
2162*c0909341SAndroid Build Coastguard Worker        sub             v17.8h,  v26.8h,  v16.8h  // 64 - frac_x
2163*c0909341SAndroid Build Coastguard Worker
2164*c0909341SAndroid Build Coastguard Worker        add             v20.8h,  v20.8h,  v31.8h  // actual base_x
2165*c0909341SAndroid Build Coastguard Worker
2166*c0909341SAndroid Build Coastguard Worker        umull           v21.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2167*c0909341SAndroid Build Coastguard Worker        umlal           v21.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2168*c0909341SAndroid Build Coastguard Worker        umull2          v22.4s,  v18.8h,  v28.8h
2169*c0909341SAndroid Build Coastguard Worker        umlal2          v22.4s,  v19.8h,  v27.8h
2170*c0909341SAndroid Build Coastguard Worker
2171*c0909341SAndroid Build Coastguard Worker        umull           v23.4s,  v4.4h,   v17.4h  // top[base_x]-*(64-frac_x)
2172*c0909341SAndroid Build Coastguard Worker        umlal           v23.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
2173*c0909341SAndroid Build Coastguard Worker        umull2          v24.4s,  v4.8h,   v17.8h
2174*c0909341SAndroid Build Coastguard Worker        umlal2          v24.4s,  v5.8h,   v16.8h
2175*c0909341SAndroid Build Coastguard Worker
2176*c0909341SAndroid Build Coastguard Worker        cmge            v20.8h,  v20.8h,  #0
2177*c0909341SAndroid Build Coastguard Worker
2178*c0909341SAndroid Build Coastguard Worker        rshrn           v21.4h,  v21.4s,  #6
2179*c0909341SAndroid Build Coastguard Worker        rshrn2          v21.8h,  v22.4s,  #6
2180*c0909341SAndroid Build Coastguard Worker        rshrn           v22.4h,  v23.4s,  #6
2181*c0909341SAndroid Build Coastguard Worker        rshrn2          v22.8h,  v24.4s,  #6
2182*c0909341SAndroid Build Coastguard Worker
2183*c0909341SAndroid Build Coastguard Worker        bit             v21.16b, v22.16b, v20.16b
2184*c0909341SAndroid Build Coastguard Worker
2185*c0909341SAndroid Build Coastguard Worker        st1             {v21.d}[0], [x0], x1
2186*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2187*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
2188*c0909341SAndroid Build Coastguard Worker        st1             {v21.d}[1], [x0], x1
2189*c0909341SAndroid Build Coastguard Worker        b.le            9f
2190*c0909341SAndroid Build Coastguard Worker
2191*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v19.16b, v19.16b, #8
2192*c0909341SAndroid Build Coastguard Worker        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
2193*c0909341SAndroid Build Coastguard Worker        b               4b
2194*c0909341SAndroid Build Coastguard Worker
2195*c0909341SAndroid Build Coastguard Worker49:
2196*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+2]
2197*c0909341SAndroid Build Coastguard Worker
2198*c0909341SAndroid Build Coastguard Worker        trn1            v18.2d,  v18.2d,  v19.2d  // left[base_y], left[base_y+1]
2199*c0909341SAndroid Build Coastguard Worker
2200*c0909341SAndroid Build Coastguard Worker        umull           v20.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2201*c0909341SAndroid Build Coastguard Worker        umlal           v20.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2202*c0909341SAndroid Build Coastguard Worker        umull2          v21.4s,  v18.8h,  v28.8h
2203*c0909341SAndroid Build Coastguard Worker        umlal2          v21.4s,  v19.8h,  v27.8h
2204*c0909341SAndroid Build Coastguard Worker
2205*c0909341SAndroid Build Coastguard Worker        rshrn           v20.4h,  v20.4s,  #6
2206*c0909341SAndroid Build Coastguard Worker        rshrn2          v20.8h,  v21.4s,  #6
2207*c0909341SAndroid Build Coastguard Worker
2208*c0909341SAndroid Build Coastguard Worker        st1             {v20.d}[0], [x0], x1
2209*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
2210*c0909341SAndroid Build Coastguard Worker        st1             {v20.d}[1], [x0], x1
2211*c0909341SAndroid Build Coastguard Worker        b.le            9f
2212*c0909341SAndroid Build Coastguard Worker
2213*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v19.16b, v19.16b, #8
2214*c0909341SAndroid Build Coastguard Worker        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
2215*c0909341SAndroid Build Coastguard Worker        b               49b
2216*c0909341SAndroid Build Coastguard Worker
2217*c0909341SAndroid Build Coastguard Worker9:
2218*c0909341SAndroid Build Coastguard Worker        ret
2219*c0909341SAndroid Build Coastguard Worker
2220*c0909341SAndroid Build Coastguard Worker80:
2221*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
2222*c0909341SAndroid Build Coastguard Worker
2223*c0909341SAndroid Build Coastguard Worker        stp             d8,  d9,  [sp, #-0x40]!
2224*c0909341SAndroid Build Coastguard Worker        stp             d10, d11, [sp, #0x10]
2225*c0909341SAndroid Build Coastguard Worker        stp             d12, d13, [sp, #0x20]
2226*c0909341SAndroid Build Coastguard Worker        stp             d14, d15, [sp, #0x30]
2227*c0909341SAndroid Build Coastguard Worker
2228*c0909341SAndroid Build Coastguard Worker        dup             v18.8h,  w7               // -dy
2229*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  #2              // Skip past left[0]
2230*c0909341SAndroid Build Coastguard Worker
2231*c0909341SAndroid Build Coastguard Worker        mul             v16.8h,  v31.8h,  v18.8h  // {0,1,2,3,4,5,6,7}* -dy
2232*c0909341SAndroid Build Coastguard Worker        movi            v25.8h,  #0x3e
2233*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v18.8h  // -= dy
2234*c0909341SAndroid Build Coastguard Worker
2235*c0909341SAndroid Build Coastguard Worker        // Worst case height for w=8 is 32.
2236*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[]
2237*c0909341SAndroid Build Coastguard Worker        ld1r            {v15.8h}, [x2]            // left[0] == top[0]
2238*c0909341SAndroid Build Coastguard Worker
2239*c0909341SAndroid Build Coastguard Worker        movi            v26.8h,  #64
2240*c0909341SAndroid Build Coastguard Worker        movi            v19.16b, #4
2241*c0909341SAndroid Build Coastguard Worker
2242*c0909341SAndroid Build Coastguard Worker        shrn            v29.8b,  v16.8h,  #6      // ypos >> 6
2243*c0909341SAndroid Build Coastguard Worker        and             v27.16b, v16.16b, v25.16b // frac_y
2244*c0909341SAndroid Build Coastguard Worker
2245*c0909341SAndroid Build Coastguard Worker        movi            v23.8h,  #1, lsl #8
2246*c0909341SAndroid Build Coastguard Worker        shl             v29.8b,  v29.8b,  #1      // 2*base_y
2247*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v15.16b          // left[0]
2248*c0909341SAndroid Build Coastguard Worker        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
2249*c0909341SAndroid Build Coastguard Worker        movi            v17.16b, #2
2250*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ...
2251*c0909341SAndroid Build Coastguard Worker
2252*c0909341SAndroid Build Coastguard Worker        // Cut corners here; for the first row we don't expect to need to
2253*c0909341SAndroid Build Coastguard Worker        // read outside of v0.
2254*c0909341SAndroid Build Coastguard Worker        tbx             v18.16b, {v0.16b}, v29.16b // left[base_y]
2255*c0909341SAndroid Build Coastguard Worker
2256*c0909341SAndroid Build Coastguard Worker        add             v30.16b, v29.16b, v19.16b // base_y + 2 (*2)
2257*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v17.16b // base_y + 1 (*2)
2258*c0909341SAndroid Build Coastguard Worker
2259*c0909341SAndroid Build Coastguard Worker        sub             v28.8h,  v26.8h,  v27.8h  // 64 - frac_y
2260*c0909341SAndroid Build Coastguard Worker
2261*c0909341SAndroid Build Coastguard Worker        movi            v24.16b, #4
2262*c0909341SAndroid Build Coastguard Worker8:      // Two rows per iteration, each blended from top[] and left[]
2263*c0909341SAndroid Build Coastguard Worker        asr             w9,  w8,  #6              // base_x
2264*c0909341SAndroid Build Coastguard Worker        dup             v16.8h,   w8              // xpos
2265*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2266*c0909341SAndroid Build Coastguard Worker        cmp             w9,  #-16                 // base_x <= -16
2267*c0909341SAndroid Build Coastguard Worker        asr             w11, w8,  #6              // base_x (second row)
2268*c0909341SAndroid Build Coastguard Worker        b.le            89f                       // -> 89: predict from left[] only
2269*c0909341SAndroid Build Coastguard Worker
2270*c0909341SAndroid Build Coastguard Worker        dup             v17.8h,   w8              // xpos (second row)
2271*c0909341SAndroid Build Coastguard Worker
2272*c0909341SAndroid Build Coastguard Worker        add             x9,  x2,  w9,  sxtw #1    // &top[base_x], 2 bytes per pixel
2273*c0909341SAndroid Build Coastguard Worker        add             x11, x2,  w11, sxtw #1    // same for the second row
2274*c0909341SAndroid Build Coastguard Worker
2275*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h}, [x9]      // top[base_x]
2276*c0909341SAndroid Build Coastguard Worker        mov             v19.16b, v15.16b          // left[0]
2277*c0909341SAndroid Build Coastguard Worker        ld1             {v6.8h, v7.8h}, [x11]     // top[base_x] (second row)
2278*c0909341SAndroid Build Coastguard Worker
2279*c0909341SAndroid Build Coastguard Worker        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2280*c0909341SAndroid Build Coastguard Worker
2281*c0909341SAndroid Build Coastguard Worker        mov             v20.16b, v15.16b          // left[0]
2282*c0909341SAndroid Build Coastguard Worker
2283*c0909341SAndroid Build Coastguard Worker        sshr            v21.8h,  v16.8h,  #6      // first base_x
2284*c0909341SAndroid Build Coastguard Worker        sshr            v22.8h,  v17.8h,  #6      // first base_x (second row)
2285*c0909341SAndroid Build Coastguard Worker
2286*c0909341SAndroid Build Coastguard Worker        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
2287*c0909341SAndroid Build Coastguard Worker
2288*c0909341SAndroid Build Coastguard Worker        ext             v5.16b,  v4.16b,  v5.16b,  #2 // top[base_x+1]
2289*c0909341SAndroid Build Coastguard Worker        ext             v7.16b,  v6.16b,  v7.16b,  #2 // top[base_x+1] (second row)
2290*c0909341SAndroid Build Coastguard Worker
2291*c0909341SAndroid Build Coastguard Worker        and             v16.16b, v16.16b, v25.16b // frac_x
2292*c0909341SAndroid Build Coastguard Worker        and             v17.16b, v17.16b, v25.16b // frac_x (second row)
2293*c0909341SAndroid Build Coastguard Worker
2294*c0909341SAndroid Build Coastguard Worker        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2295*c0909341SAndroid Build Coastguard Worker        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2296*c0909341SAndroid Build Coastguard Worker
2297*c0909341SAndroid Build Coastguard Worker        sub             v8.8h,   v26.8h,  v16.8h  // 64 - frac_x
2298*c0909341SAndroid Build Coastguard Worker        sub             v9.8h,   v26.8h,  v17.8h  // 64 - frac_x (second row)
2299*c0909341SAndroid Build Coastguard Worker
2300*c0909341SAndroid Build Coastguard Worker        umull2          v11.4s,  v18.8h,  v28.8h  // same for the upper four lanes
2301*c0909341SAndroid Build Coastguard Worker        umlal2          v11.4s,  v19.8h,  v27.8h
2302*c0909341SAndroid Build Coastguard Worker
2303*c0909341SAndroid Build Coastguard Worker        add             v21.8h,  v21.8h,  v31.8h  // actual base_x
2304*c0909341SAndroid Build Coastguard Worker        add             v22.8h,  v22.8h,  v31.8h  // actual base_x (second row)
2305*c0909341SAndroid Build Coastguard Worker
2306*c0909341SAndroid Build Coastguard Worker        umull           v12.4s,  v19.4h,  v28.4h  // second row: left[base_y+1]*(64-frac_y)
2307*c0909341SAndroid Build Coastguard Worker        umlal           v12.4s,  v20.4h,  v27.4h  // + left[base_y+2]*frac_y
2308*c0909341SAndroid Build Coastguard Worker        umull2          v13.4s,  v19.8h,  v28.8h
2309*c0909341SAndroid Build Coastguard Worker        umlal2          v13.4s,  v20.8h,  v27.8h
2310*c0909341SAndroid Build Coastguard Worker
2311*c0909341SAndroid Build Coastguard Worker        rshrn           v10.4h,  v10.4s,  #6      // left prediction: (sum + 32) >> 6
2312*c0909341SAndroid Build Coastguard Worker        rshrn2          v10.8h,  v11.4s,  #6
2313*c0909341SAndroid Build Coastguard Worker        rshrn           v11.4h,  v12.4s,  #6      // same for the second row
2314*c0909341SAndroid Build Coastguard Worker        rshrn2          v11.8h,  v13.4s,  #6
2315*c0909341SAndroid Build Coastguard Worker
2316*c0909341SAndroid Build Coastguard Worker        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]*(64-frac_x)
2317*c0909341SAndroid Build Coastguard Worker        umlal           v12.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
2318*c0909341SAndroid Build Coastguard Worker        umull2          v13.4s,  v4.8h,   v8.8h
2319*c0909341SAndroid Build Coastguard Worker        umlal2          v13.4s,  v5.8h,   v16.8h
2320*c0909341SAndroid Build Coastguard Worker        umull           v14.4s,  v6.4h,   v9.4h   // second row
2321*c0909341SAndroid Build Coastguard Worker        umlal           v14.4s,  v7.4h,   v17.4h
2322*c0909341SAndroid Build Coastguard Worker        umull2          v18.4s,  v6.8h,   v9.8h   // v18 used as scratch; reloaded before looping
2323*c0909341SAndroid Build Coastguard Worker        umlal2          v18.4s,  v7.8h,   v17.8h
2324*c0909341SAndroid Build Coastguard Worker
2325*c0909341SAndroid Build Coastguard Worker        cmge            v21.8h,  v21.8h,  #0      // lane mask: base_x >= 0
2326*c0909341SAndroid Build Coastguard Worker        cmge            v22.8h,  v22.8h,  #0
2327*c0909341SAndroid Build Coastguard Worker
2328*c0909341SAndroid Build Coastguard Worker        rshrn           v12.4h,  v12.4s,  #6      // top prediction: (sum + 32) >> 6
2329*c0909341SAndroid Build Coastguard Worker        rshrn2          v12.8h,  v13.4s,  #6
2330*c0909341SAndroid Build Coastguard Worker        rshrn           v13.4h,  v14.4s,  #6
2331*c0909341SAndroid Build Coastguard Worker        rshrn2          v13.8h,  v18.4s,  #6
2332*c0909341SAndroid Build Coastguard Worker
2333*c0909341SAndroid Build Coastguard Worker        bit             v10.16b, v12.16b, v21.16b // take top[] where base_x >= 0, else keep left[]
2334*c0909341SAndroid Build Coastguard Worker        bit             v11.16b, v13.16b, v22.16b
2335*c0909341SAndroid Build Coastguard Worker
2336*c0909341SAndroid Build Coastguard Worker        st1             {v10.8h}, [x0], x1        // store first row, advance x0 by x1
2337*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2              // two rows done
2338*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2339*c0909341SAndroid Build Coastguard Worker        st1             {v11.8h}, [x0], x1        // store second row
2340*c0909341SAndroid Build Coastguard Worker        b.le            9f                        // no rows left
2341*c0909341SAndroid Build Coastguard Worker
2342*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v20.16b          // left[base_y+2] is the next left[base_y]
2343*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
2344*c0909341SAndroid Build Coastguard Worker        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
2345*c0909341SAndroid Build Coastguard Worker        b               8b
2346*c0909341SAndroid Build Coastguard Worker
2347*c0909341SAndroid Build Coastguard Worker89:     // The rest of the block only predicted from left[], two rows per iteration
2348*c0909341SAndroid Build Coastguard Worker        mov             v19.16b, v15.16b          // left[0], kept by tbx for out-of-range indices
2349*c0909341SAndroid Build Coastguard Worker        mov             v20.16b, v15.16b          // left[0]
2350*c0909341SAndroid Build Coastguard Worker        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2351*c0909341SAndroid Build Coastguard Worker        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
2352*c0909341SAndroid Build Coastguard Worker
2353*c0909341SAndroid Build Coastguard Worker        umull           v4.4s,   v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2354*c0909341SAndroid Build Coastguard Worker        umlal           v4.4s,   v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2355*c0909341SAndroid Build Coastguard Worker        umull2          v5.4s,   v18.8h,  v28.8h
2356*c0909341SAndroid Build Coastguard Worker        umlal2          v5.4s,   v19.8h,  v27.8h
2357*c0909341SAndroid Build Coastguard Worker        umull           v6.4s,   v19.4h,  v28.4h  // second row: left[base_y+1]*(64-frac_y)
2358*c0909341SAndroid Build Coastguard Worker        umlal           v6.4s,   v20.4h,  v27.4h  // + left[base_y+2]*frac_y
2359*c0909341SAndroid Build Coastguard Worker        umull2          v7.4s,   v19.8h,  v28.8h
2360*c0909341SAndroid Build Coastguard Worker        umlal2          v7.4s,   v20.8h,  v27.8h
2361*c0909341SAndroid Build Coastguard Worker
2362*c0909341SAndroid Build Coastguard Worker        rshrn           v4.4h,   v4.4s,   #6      // (sum + 32) >> 6
2363*c0909341SAndroid Build Coastguard Worker        rshrn2          v4.8h,   v5.4s,   #6
2364*c0909341SAndroid Build Coastguard Worker        rshrn           v5.4h,   v6.4s,   #6
2365*c0909341SAndroid Build Coastguard Worker        rshrn2          v5.8h,   v7.4s,   #6
2366*c0909341SAndroid Build Coastguard Worker
2367*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h}, [x0], x1         // store first row, advance x0 by x1
2368*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2              // two rows done
2369*c0909341SAndroid Build Coastguard Worker        st1             {v5.8h}, [x0], x1         // store second row
2370*c0909341SAndroid Build Coastguard Worker        b.le            9f                        // no rows left
2371*c0909341SAndroid Build Coastguard Worker
2372*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v20.16b          // left[base_y+2] is the next left[base_y]
2373*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
2374*c0909341SAndroid Build Coastguard Worker        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
2375*c0909341SAndroid Build Coastguard Worker        b               89b
2376*c0909341SAndroid Build Coastguard Worker
2377*c0909341SAndroid Build Coastguard Worker9:      // Common exit: restore the saved d8-d15 and return
2378*c0909341SAndroid Build Coastguard Worker        ldp             d14, d15, [sp, #0x30]
2379*c0909341SAndroid Build Coastguard Worker        ldp             d12, d13, [sp, #0x20]
2380*c0909341SAndroid Build Coastguard Worker        ldp             d10, d11, [sp, #0x10]
2381*c0909341SAndroid Build Coastguard Worker        ldp             d8,  d9,  [sp], 0x40      // post-increment releases the 0x40-byte save area
2382*c0909341SAndroid Build Coastguard Worker        ret
2383*c0909341SAndroid Build Coastguard Worker
2384*c0909341SAndroid Build Coastguard Worker160:    // Entry shared with 320: and 640:; rows are handled 8 pixels at a time
2385*c0909341SAndroid Build Coastguard Worker320:
2386*c0909341SAndroid Build Coastguard Worker640:
2387*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
2388*c0909341SAndroid Build Coastguard Worker
2389*c0909341SAndroid Build Coastguard Worker        stp             d8,  d9,  [sp, #-0x40]!   // save d8-d15 (callee-saved), restored at 9:
2390*c0909341SAndroid Build Coastguard Worker        stp             d10, d11, [sp, #0x10]
2391*c0909341SAndroid Build Coastguard Worker        stp             d12, d13, [sp, #0x20]
2392*c0909341SAndroid Build Coastguard Worker        stp             d14, d15, [sp, #0x30]
2393*c0909341SAndroid Build Coastguard Worker
2394*c0909341SAndroid Build Coastguard Worker        dup             v25.8h,  w7               // -dy
2395*c0909341SAndroid Build Coastguard Worker        add             x3,  x3,  #2              // Skip past left[0]
2396*c0909341SAndroid Build Coastguard Worker
2397*c0909341SAndroid Build Coastguard Worker        add             x13, x0,  x1              // alternating row (x0 + stride)
2398*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // stride *= 2
2399*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  w4,  uxtw #1    // stride -= width (2 bytes per pixel)
2400*c0909341SAndroid Build Coastguard Worker
2401*c0909341SAndroid Build Coastguard Worker        movi            v11.8h,  #8               // multiplier for the 8-pixel ypos step
2402*c0909341SAndroid Build Coastguard Worker        mul             v26.8h,  v31.8h,  v25.8h  // {0,1,2,3,4,5,6,7}* -dy
2403*c0909341SAndroid Build Coastguard Worker        add             v26.8h,  v26.8h,  v25.8h  // -= dy
2404*c0909341SAndroid Build Coastguard Worker        mul             v25.8h,  v25.8h,  v11.8h  // -8*dy
2405*c0909341SAndroid Build Coastguard Worker
2406*c0909341SAndroid Build Coastguard Worker        // Worst case height is 64, but we can only fit 32 pixels into
2407*c0909341SAndroid Build Coastguard Worker        // v0-v3 usable within one tbx instruction. As long as base_y is
2408*c0909341SAndroid Build Coastguard Worker        // up to 32, we use tbx.
2409*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[]
2410*c0909341SAndroid Build Coastguard Worker        ld1r            {v15.8h}, [x2]            // left[0] == top[0]
2411*c0909341SAndroid Build Coastguard Worker
2412*c0909341SAndroid Build Coastguard Worker        mov             w12, w4                   // orig w
2413*c0909341SAndroid Build Coastguard Worker        neg             w14, w4                   // -w
2414*c0909341SAndroid Build Coastguard Worker
2415*c0909341SAndroid Build Coastguard Worker1:      // Start of a pair of rows
2416*c0909341SAndroid Build Coastguard Worker        mov             v23.16b, v26.16b          // reset ypos
2417*c0909341SAndroid Build Coastguard Worker
2418*c0909341SAndroid Build Coastguard Worker        asr             w9,  w8,  #6              // base_x
2419*c0909341SAndroid Build Coastguard Worker        dup             v16.8h,   w8              // xpos
2420*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2421*c0909341SAndroid Build Coastguard Worker        cmp             w9,  w14                  // base_x <= -w
2422*c0909341SAndroid Build Coastguard Worker        asr             w11, w8,  #6              // base_x (second row)
2423*c0909341SAndroid Build Coastguard Worker        b.le            169f                      // -> 169: predict from left[] only
2424*c0909341SAndroid Build Coastguard Worker
2425*c0909341SAndroid Build Coastguard Worker        dup             v17.8h,   w8              // xpos (second row)
2426*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2427*c0909341SAndroid Build Coastguard Worker
2428*c0909341SAndroid Build Coastguard Worker        add             x9,  x2,  w9,  sxtw #1    // &top[base_x], 2 bytes per pixel
2429*c0909341SAndroid Build Coastguard Worker        add             x11, x2,  w11, sxtw #1    // same for the second row
2430*c0909341SAndroid Build Coastguard Worker
2431*c0909341SAndroid Build Coastguard Worker        sshr            v21.8h,  v16.8h,  #6      // first base_x
2432*c0909341SAndroid Build Coastguard Worker        sshr            v22.8h,  v17.8h,  #6      // first base_x (second row)
2433*c0909341SAndroid Build Coastguard Worker
2434*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h}, [x9], #16        // top[base_x]
2435*c0909341SAndroid Build Coastguard Worker        ld1             {v6.8h}, [x11], #16       // top[base_x] (second row)
2436*c0909341SAndroid Build Coastguard Worker
2437*c0909341SAndroid Build Coastguard Worker        movi            v10.8h,  #0x3e            // mask selecting the frac bits
2438*c0909341SAndroid Build Coastguard Worker        movi            v11.8h,  #64
2439*c0909341SAndroid Build Coastguard Worker
2440*c0909341SAndroid Build Coastguard Worker        and             v16.16b, v16.16b, v10.16b // frac_x
2441*c0909341SAndroid Build Coastguard Worker        and             v17.16b, v17.16b, v10.16b // frac_x (second row)
2442*c0909341SAndroid Build Coastguard Worker
2443*c0909341SAndroid Build Coastguard Worker        sub             v8.8h,   v11.8h,  v16.8h  // 64 - frac_x
2444*c0909341SAndroid Build Coastguard Worker        sub             v9.8h,   v11.8h,  v17.8h  // 64 - frac_x (second row)
2445*c0909341SAndroid Build Coastguard Worker
2446*c0909341SAndroid Build Coastguard Worker        add             v21.8h,  v21.8h,  v31.8h  // actual base_x
2447*c0909341SAndroid Build Coastguard Worker        add             v22.8h,  v22.8h,  v31.8h  // actual base_x (second row)
2448*c0909341SAndroid Build Coastguard Worker
2449*c0909341SAndroid Build Coastguard Worker2:      // Next 8 pixels of both rows: fetch the left[] inputs
2450*c0909341SAndroid Build Coastguard Worker        smov            w10,     v22.h[0]         // base_x of the second row, lane 0
2451*c0909341SAndroid Build Coastguard Worker
2452*c0909341SAndroid Build Coastguard Worker        shrn            v29.8b,  v23.8h,  #6      // ypos >> 6
2453*c0909341SAndroid Build Coastguard Worker        movi            v12.8h,  #64
2454*c0909341SAndroid Build Coastguard Worker        cmp             w10, #0                   // base_x (bottom left) >= 0
2455*c0909341SAndroid Build Coastguard Worker        smov            w10,     v29.b[0]         // base_y[0]
2456*c0909341SAndroid Build Coastguard Worker        movi            v10.8h,  #0x3e            // mask selecting the frac bits
2457*c0909341SAndroid Build Coastguard Worker
2458*c0909341SAndroid Build Coastguard Worker        b.ge            4f                        // -> 4: predict from top[] only
2459*c0909341SAndroid Build Coastguard Worker        and             v27.16b, v23.16b, v10.16b // frac_y
2460*c0909341SAndroid Build Coastguard Worker        cmp             w10,     #(32-3)          // is base_y[0]+2 within the 32 pixels in v0-v3?
2461*c0909341SAndroid Build Coastguard Worker
2462*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v15.16b          // left[0]
2463*c0909341SAndroid Build Coastguard Worker        sub             v28.8h,  v12.8h,  v27.8h  // 64 - frac_y
2464*c0909341SAndroid Build Coastguard Worker        b.gt            22f                       // no: use separate loads
2465*c0909341SAndroid Build Coastguard Worker
2466*c0909341SAndroid Build Coastguard Worker21:
2467*c0909341SAndroid Build Coastguard Worker        // base_y < 32, using tbx
2468*c0909341SAndroid Build Coastguard Worker        shl             v29.8b,  v29.8b,  #1      // 2*base_y
2469*c0909341SAndroid Build Coastguard Worker        movi            v11.8h,  #1, lsl #8       // bytes {0, 1} in each halfword
2470*c0909341SAndroid Build Coastguard Worker        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
2471*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ...
2472*c0909341SAndroid Build Coastguard Worker
2473*c0909341SAndroid Build Coastguard Worker        movi            v13.16b, #2               // byte-index step for one pixel
2474*c0909341SAndroid Build Coastguard Worker
2475*c0909341SAndroid Build Coastguard Worker        tbx             v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
2476*c0909341SAndroid Build Coastguard Worker
2477*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v13.16b // base_y + 1 (*2)
2478*c0909341SAndroid Build Coastguard Worker        mov             v19.16b, v15.16b          // left[0]
2479*c0909341SAndroid Build Coastguard Worker
2480*c0909341SAndroid Build Coastguard Worker        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2481*c0909341SAndroid Build Coastguard Worker
2482*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v13.16b // base_y + 2 (*2)
2483*c0909341SAndroid Build Coastguard Worker        mov             v20.16b, v15.16b          // left[0]
2484*c0909341SAndroid Build Coastguard Worker
2485*c0909341SAndroid Build Coastguard Worker        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
2486*c0909341SAndroid Build Coastguard Worker
2487*c0909341SAndroid Build Coastguard Worker        b               23f
2488*c0909341SAndroid Build Coastguard Worker
2489*c0909341SAndroid Build Coastguard Worker22:
2490*c0909341SAndroid Build Coastguard Worker        // base_y >= 32, using separate loads.
2491*c0909341SAndroid Build Coastguard Worker        smov            w15,     v29.b[1]         // base_y[1]
2492*c0909341SAndroid Build Coastguard Worker        smov            w16,     v29.b[2]         // base_y[2]
2493*c0909341SAndroid Build Coastguard Worker        add             x10, x3,  w10, sxtw #1    // x3 + 2*base_y[0]
2494*c0909341SAndroid Build Coastguard Worker        smov            w17,     v29.b[3]         // base_y[3]
2495*c0909341SAndroid Build Coastguard Worker        add             x15, x3,  w15, sxtw #1
2496*c0909341SAndroid Build Coastguard Worker        ld3             {v18.h, v19.h, v20.h}[0], [x10] // three consecutive pixels into lane 0 of v18/v19/v20
2497*c0909341SAndroid Build Coastguard Worker        smov            w10,     v29.b[4]         // base_y[4]
2498*c0909341SAndroid Build Coastguard Worker        add             x16, x3,  w16, sxtw #1
2499*c0909341SAndroid Build Coastguard Worker        ld3             {v18.h, v19.h, v20.h}[1], [x15]
2500*c0909341SAndroid Build Coastguard Worker        smov            w15,     v29.b[5]         // base_y[5]
2501*c0909341SAndroid Build Coastguard Worker        add             x17, x3,  w17, sxtw #1
2502*c0909341SAndroid Build Coastguard Worker        ld3             {v18.h, v19.h, v20.h}[2], [x16]
2503*c0909341SAndroid Build Coastguard Worker        smov            w16,     v29.b[6]         // base_y[6]
2504*c0909341SAndroid Build Coastguard Worker        add             x10, x3,  w10, sxtw #1
2505*c0909341SAndroid Build Coastguard Worker        ld3             {v18.h, v19.h, v20.h}[3], [x17]
2506*c0909341SAndroid Build Coastguard Worker        smov            w17,     v29.b[7]         // base_y[7]
2507*c0909341SAndroid Build Coastguard Worker        add             x15, x3,  w15, sxtw #1
2508*c0909341SAndroid Build Coastguard Worker        add             x16, x3,  w16, sxtw #1
2509*c0909341SAndroid Build Coastguard Worker        ld3             {v18.h, v19.h, v20.h}[4], [x10]
2510*c0909341SAndroid Build Coastguard Worker        add             x17, x3,  w17, sxtw #1
2511*c0909341SAndroid Build Coastguard Worker        ld3             {v18.h, v19.h, v20.h}[5], [x15]
2512*c0909341SAndroid Build Coastguard Worker        ld3             {v18.h, v19.h, v20.h}[6], [x16]
2513*c0909341SAndroid Build Coastguard Worker        ld3             {v18.h, v19.h, v20.h}[7], [x17]
2514*c0909341SAndroid Build Coastguard Worker
2515*c0909341SAndroid Build Coastguard Worker23:     // v18/v19/v20 = left[base_y], left[base_y+1], left[base_y+2]
2516*c0909341SAndroid Build Coastguard Worker
2517*c0909341SAndroid Build Coastguard Worker        ld1             {v5.8h}, [x9], #16        // top[base_x+8], the 8 pixels following v4
2518*c0909341SAndroid Build Coastguard Worker        ld1             {v7.8h}, [x11], #16       // same for the second row, following v6
2519*c0909341SAndroid Build Coastguard Worker
2520*c0909341SAndroid Build Coastguard Worker        add             v23.8h,  v23.8h,  v25.8h  // ypos -= 8*dy
2521*c0909341SAndroid Build Coastguard Worker
2522*c0909341SAndroid Build Coastguard Worker        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2523*c0909341SAndroid Build Coastguard Worker        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2524*c0909341SAndroid Build Coastguard Worker        umull2          v11.4s,  v18.8h,  v28.8h
2525*c0909341SAndroid Build Coastguard Worker        umlal2          v11.4s,  v19.8h,  v27.8h
2526*c0909341SAndroid Build Coastguard Worker        umull           v12.4s,  v19.4h,  v28.4h  // second row: left[base_y+1]*(64-frac_y)
2527*c0909341SAndroid Build Coastguard Worker        umlal           v12.4s,  v20.4h,  v27.4h  // + left[base_y+2]*frac_y
2528*c0909341SAndroid Build Coastguard Worker        umull2          v13.4s,  v19.8h,  v28.8h
2529*c0909341SAndroid Build Coastguard Worker        umlal2          v13.4s,  v20.8h,  v27.8h
2530*c0909341SAndroid Build Coastguard Worker
2531*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v4.16b,  v5.16b,  #2 // top[base_x+1]
2532*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v6.16b,  v7.16b,  #2 // top[base_x+1] (second row)
2533*c0909341SAndroid Build Coastguard Worker
2534*c0909341SAndroid Build Coastguard Worker        rshrn           v10.4h,  v10.4s,  #6      // left prediction: (sum + 32) >> 6
2535*c0909341SAndroid Build Coastguard Worker        rshrn2          v10.8h,  v11.4s,  #6
2536*c0909341SAndroid Build Coastguard Worker        rshrn           v11.4h,  v12.4s,  #6
2537*c0909341SAndroid Build Coastguard Worker        rshrn2          v11.8h,  v13.4s,  #6
2538*c0909341SAndroid Build Coastguard Worker
2539*c0909341SAndroid Build Coastguard Worker        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]*(64-frac_x)
2540*c0909341SAndroid Build Coastguard Worker        umlal           v12.4s,  v18.4h,  v16.4h  // + top[base_x+1]*frac_x
2541*c0909341SAndroid Build Coastguard Worker        umull2          v13.4s,  v4.8h,   v8.8h
2542*c0909341SAndroid Build Coastguard Worker        umlal2          v13.4s,  v18.8h,  v16.8h
2543*c0909341SAndroid Build Coastguard Worker        umull           v14.4s,  v6.4h,   v9.4h   // second row
2544*c0909341SAndroid Build Coastguard Worker        umlal           v14.4s,  v19.4h,  v17.4h
2545*c0909341SAndroid Build Coastguard Worker        umull2          v20.4s,  v6.8h,   v9.8h
2546*c0909341SAndroid Build Coastguard Worker        umlal2          v20.4s,  v19.8h,  v17.8h
2547*c0909341SAndroid Build Coastguard Worker
2548*c0909341SAndroid Build Coastguard Worker        cmge            v18.8h,  v21.8h,  #0      // lane mask: base_x >= 0
2549*c0909341SAndroid Build Coastguard Worker        cmge            v19.8h,  v22.8h,  #0
2550*c0909341SAndroid Build Coastguard Worker
2551*c0909341SAndroid Build Coastguard Worker        rshrn           v12.4h,  v12.4s,  #6      // top prediction: (sum + 32) >> 6
2552*c0909341SAndroid Build Coastguard Worker        rshrn2          v12.8h,  v13.4s,  #6
2553*c0909341SAndroid Build Coastguard Worker        rshrn           v13.4h,  v14.4s,  #6
2554*c0909341SAndroid Build Coastguard Worker        rshrn2          v13.8h,  v20.4s,  #6
2555*c0909341SAndroid Build Coastguard Worker
2556*c0909341SAndroid Build Coastguard Worker        bit             v10.16b, v12.16b, v18.16b // take top[] where base_x >= 0, else keep left[]
2557*c0909341SAndroid Build Coastguard Worker        bit             v11.16b, v13.16b, v19.16b
2558*c0909341SAndroid Build Coastguard Worker
2559*c0909341SAndroid Build Coastguard Worker        st1             {v10.8h}, [x0], #16       // first row, 8 pixels
2560*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #8              // 8 pixels of this row pair done
2561*c0909341SAndroid Build Coastguard Worker        st1             {v11.8h}, [x13], #16      // second row, 8 pixels
2562*c0909341SAndroid Build Coastguard Worker        b.le            3f                        // row pair finished
2563*c0909341SAndroid Build Coastguard Worker
2564*c0909341SAndroid Build Coastguard Worker        movi            v10.8h,  #8
2565*c0909341SAndroid Build Coastguard Worker        mov             v4.16b,  v5.16b           // already loaded pixels become the current top[]
2566*c0909341SAndroid Build Coastguard Worker        mov             v6.16b,  v7.16b
2567*c0909341SAndroid Build Coastguard Worker        add             v21.8h,  v21.8h,  v10.8h  // base_x += 8
2568*c0909341SAndroid Build Coastguard Worker        add             v22.8h,  v22.8h,  v10.8h
2569*c0909341SAndroid Build Coastguard Worker        b               2b
2570*c0909341SAndroid Build Coastguard Worker
2571*c0909341SAndroid Build Coastguard Worker3:      // A pair of rows is complete
2572*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2              // two rows done
2573*c0909341SAndroid Build Coastguard Worker        b.le            9f                        // no rows left
2574*c0909341SAndroid Build Coastguard Worker        movi            v10.8h, #128
2575*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1              // x1 = 2*stride minus the row bytes already stepped over
2576*c0909341SAndroid Build Coastguard Worker        add             x13, x13, x1
2577*c0909341SAndroid Build Coastguard Worker        mov             w4,  w12                  // reset w
2578*c0909341SAndroid Build Coastguard Worker        add             v26.8h,  v26.8h,  v10.8h  // ypos += 2*(1<<6)
2579*c0909341SAndroid Build Coastguard Worker        b               1b
2580*c0909341SAndroid Build Coastguard Worker
2581*c0909341SAndroid Build Coastguard Worker4:      // The rest of the row only predicted from top[]
2582*c0909341SAndroid Build Coastguard Worker        ld1             {v5.8h}, [x9], #16        // top[base_x+8], the 8 pixels following v4
2583*c0909341SAndroid Build Coastguard Worker        ld1             {v7.8h}, [x11], #16       // same for the second row, following v6
2584*c0909341SAndroid Build Coastguard Worker
2585*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v4.16b,  v5.16b,  #2 // top[base_x+1]
2586*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v6.16b,  v7.16b,  #2 // top[base_x+1] (second row)
2587*c0909341SAndroid Build Coastguard Worker
2588*c0909341SAndroid Build Coastguard Worker        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]*(64-frac_x)
2589*c0909341SAndroid Build Coastguard Worker        umlal           v12.4s,  v18.4h,  v16.4h  // + top[base_x+1]*frac_x
2590*c0909341SAndroid Build Coastguard Worker        umull2          v13.4s,  v4.8h,   v8.8h
2591*c0909341SAndroid Build Coastguard Worker        umlal2          v13.4s,  v18.8h,  v16.8h
2592*c0909341SAndroid Build Coastguard Worker        umull           v14.4s,  v6.4h,   v9.4h   // second row
2593*c0909341SAndroid Build Coastguard Worker        umlal           v14.4s,  v19.4h,  v17.4h
2594*c0909341SAndroid Build Coastguard Worker        umull2          v20.4s,  v6.8h,   v9.8h
2595*c0909341SAndroid Build Coastguard Worker        umlal2          v20.4s,  v19.8h,  v17.8h
2596*c0909341SAndroid Build Coastguard Worker
2597*c0909341SAndroid Build Coastguard Worker        rshrn           v12.4h,  v12.4s,  #6      // (sum + 32) >> 6
2598*c0909341SAndroid Build Coastguard Worker        rshrn2          v12.8h,  v13.4s,  #6
2599*c0909341SAndroid Build Coastguard Worker        rshrn           v13.4h,  v14.4s,  #6
2600*c0909341SAndroid Build Coastguard Worker        rshrn2          v13.8h,  v20.4s,  #6
2601*c0909341SAndroid Build Coastguard Worker
2602*c0909341SAndroid Build Coastguard Worker        st1             {v12.8h}, [x0], #16       // first row, 8 pixels
2603*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #8              // 8 pixels of this row pair done
2604*c0909341SAndroid Build Coastguard Worker        st1             {v13.8h}, [x13], #16      // second row, 8 pixels
2605*c0909341SAndroid Build Coastguard Worker        b.le            3b                        // row pair finished
2606*c0909341SAndroid Build Coastguard Worker
2607*c0909341SAndroid Build Coastguard Worker        mov             v4.16b,  v5.16b           // already loaded pixels become the current top[]
2608*c0909341SAndroid Build Coastguard Worker        mov             v6.16b,  v7.16b
2609*c0909341SAndroid Build Coastguard Worker        b               4b
2610*c0909341SAndroid Build Coastguard Worker
2611*c0909341SAndroid Build Coastguard Worker169:    // The rest of the block only predicted from left[]
2612*c0909341SAndroid Build Coastguard Worker        add             x1,  x1,  w4,  uxtw #1    // restore stride (undo the earlier -= width)
2613*c0909341SAndroid Build Coastguard Worker        mov             w12, w5                   // orig remaining h
2614*c0909341SAndroid Build Coastguard Worker1:
2615*c0909341SAndroid Build Coastguard Worker        movi            v12.8h,  #64
2616*c0909341SAndroid Build Coastguard Worker        movi            v10.8h,  #0x3e            // mask selecting the frac bits
2617*c0909341SAndroid Build Coastguard Worker
2618*c0909341SAndroid Build Coastguard Worker        shrn            v29.8b,  v23.8h,  #6      // ypos >> 6
2619*c0909341SAndroid Build Coastguard Worker        and             v27.16b, v23.16b, v10.16b // frac_y
2620*c0909341SAndroid Build Coastguard Worker
2621*c0909341SAndroid Build Coastguard Worker        smov            w10,     v29.b[0]         // base_y[0]
2622*c0909341SAndroid Build Coastguard Worker
2623*c0909341SAndroid Build Coastguard Worker        shl             v29.8b,  v29.8b,  #1      // 2*base_y
2624*c0909341SAndroid Build Coastguard Worker        movi            v11.8h,  #1, lsl #8       // bytes {0, 1} in each halfword
2625*c0909341SAndroid Build Coastguard Worker        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
2626*c0909341SAndroid Build Coastguard Worker        add             v23.8h,  v23.8h,  v25.8h  // ypos -= 8*dy
2627*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ...
2628*c0909341SAndroid Build Coastguard Worker
2629*c0909341SAndroid Build Coastguard Worker        cmp             w10,     #(32-1)          // is base_y[0] within the 32 pixels in v0-v3?
2630*c0909341SAndroid Build Coastguard Worker
2631*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v15.16b          // left[0]
2632*c0909341SAndroid Build Coastguard Worker        movi            v21.16b, #2               // byte-index step for one pixel
2633*c0909341SAndroid Build Coastguard Worker
2634*c0909341SAndroid Build Coastguard Worker        sub             v28.8h,  v12.8h,  v27.8h  // 64 - frac_y
2635*c0909341SAndroid Build Coastguard Worker
2636*c0909341SAndroid Build Coastguard Worker        b.gt            31f                       // no: base_y[0] >= 32, tbx cannot reach it
2637*c0909341SAndroid Build Coastguard Worker
2638*c0909341SAndroid Build Coastguard Worker        tbx             v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
2639*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v21.16b // base_y + 1 (*2)
2640*c0909341SAndroid Build Coastguard Worker
2641*c0909341SAndroid Build Coastguard Worker2:
2642*c0909341SAndroid Build Coastguard Worker        // base_y < 32, using tbx.
2643*c0909341SAndroid Build Coastguard Worker        smov            w10,     v29.b[0]         // base_y[0]
2644*c0909341SAndroid Build Coastguard Worker        mov             v19.16b, v15.16b          // left[0]
2645*c0909341SAndroid Build Coastguard Worker        cmp             w10,     #(64-4)
2646*c0909341SAndroid Build Coastguard Worker        b.gt            32f
2647*c0909341SAndroid Build Coastguard Worker        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2648*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v21.16b // base_y + 2 (*2)
2649*c0909341SAndroid Build Coastguard Worker        mov             v20.16b, v15.16b          // left[0]
2650*c0909341SAndroid Build Coastguard Worker        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
2651*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v21.16b // next base_y
2652*c0909341SAndroid Build Coastguard Worker
2653*c0909341SAndroid Build Coastguard Worker        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2654*c0909341SAndroid Build Coastguard Worker        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2655*c0909341SAndroid Build Coastguard Worker        umull2          v11.4s,  v18.8h,  v28.8h
2656*c0909341SAndroid Build Coastguard Worker        umlal2          v11.4s,  v19.8h,  v27.8h
2657*c0909341SAndroid Build Coastguard Worker        umull           v12.4s,  v19.4h,  v28.4h
2658*c0909341SAndroid Build Coastguard Worker        umlal           v12.4s,  v20.4h,  v27.4h
2659*c0909341SAndroid Build Coastguard Worker        umull2          v13.4s,  v19.8h,  v28.8h
2660*c0909341SAndroid Build Coastguard Worker        umlal2          v13.4s,  v20.8h,  v27.8h
2661*c0909341SAndroid Build Coastguard Worker
2662*c0909341SAndroid Build Coastguard Worker        rshrn           v10.4h,  v10.4s,  #6
2663*c0909341SAndroid Build Coastguard Worker        rshrn2          v10.8h,  v11.4s,  #6
2664*c0909341SAndroid Build Coastguard Worker        rshrn           v11.4h,  v12.4s,  #6
2665*c0909341SAndroid Build Coastguard Worker        rshrn2          v11.8h,  v13.4s,  #6
2666*c0909341SAndroid Build Coastguard Worker
2667*c0909341SAndroid Build Coastguard Worker        st1             {v10.8h}, [x0], x1
2668*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
2669*c0909341SAndroid Build Coastguard Worker        st1             {v11.8h}, [x13], x1
2670*c0909341SAndroid Build Coastguard Worker        b.le            4f
2671*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v20.16b
2672*c0909341SAndroid Build Coastguard Worker        b               2b
2673*c0909341SAndroid Build Coastguard Worker
2674*c0909341SAndroid Build Coastguard Worker31:     // base_y >= 32, using separate loads, loading v18 if we had to bail
2675*c0909341SAndroid Build Coastguard Worker        // in the prologue.
2676*c0909341SAndroid Build Coastguard Worker        smov            w10,     v29.b[0]
2677*c0909341SAndroid Build Coastguard Worker        smov            w15,     v29.b[2]
2678*c0909341SAndroid Build Coastguard Worker        movi            v21.16b, #2
2679*c0909341SAndroid Build Coastguard Worker        smov            w16,     v29.b[4]
2680*c0909341SAndroid Build Coastguard Worker        add             x10, x3,  w10, sxtw
2681*c0909341SAndroid Build Coastguard Worker        smov            w17,     v29.b[6]
2682*c0909341SAndroid Build Coastguard Worker        add             x15, x3,  w15, sxtw
2683*c0909341SAndroid Build Coastguard Worker        ld1             {v18.h}[0], [x10]
2684*c0909341SAndroid Build Coastguard Worker        smov            w10,     v29.b[8]
2685*c0909341SAndroid Build Coastguard Worker        add             x16, x3,  w16, sxtw
2686*c0909341SAndroid Build Coastguard Worker        ld1             {v18.h}[1], [x15]
2687*c0909341SAndroid Build Coastguard Worker        smov            w15,     v29.b[10]
2688*c0909341SAndroid Build Coastguard Worker        add             x17, x3,  w17, sxtw
2689*c0909341SAndroid Build Coastguard Worker        ld1             {v18.h}[2], [x16]
2690*c0909341SAndroid Build Coastguard Worker        smov            w16,     v29.b[12]
2691*c0909341SAndroid Build Coastguard Worker        add             x10, x3,  w10, sxtw
2692*c0909341SAndroid Build Coastguard Worker        ld1             {v18.h}[3], [x17]
2693*c0909341SAndroid Build Coastguard Worker        smov            w17,     v29.b[14]
2694*c0909341SAndroid Build Coastguard Worker        add             x15, x3,  w15, sxtw
2695*c0909341SAndroid Build Coastguard Worker        add             x16, x3,  w16, sxtw
2696*c0909341SAndroid Build Coastguard Worker        ld1             {v18.h}[4], [x10]
2697*c0909341SAndroid Build Coastguard Worker        add             x17, x3,  w17, sxtw
2698*c0909341SAndroid Build Coastguard Worker        ld1             {v18.h}[5], [x15]
2699*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v21.16b // next base_y
2700*c0909341SAndroid Build Coastguard Worker        ld1             {v18.h}[6], [x16]
2701*c0909341SAndroid Build Coastguard Worker        ld1             {v18.h}[7], [x17]
2702*c0909341SAndroid Build Coastguard Worker
2703*c0909341SAndroid Build Coastguard Worker32:     // base_y >= 32, using separate loads.
2704*c0909341SAndroid Build Coastguard Worker        cmp             w5,  #4
2705*c0909341SAndroid Build Coastguard Worker        b.lt            34f
2706*c0909341SAndroid Build Coastguard Worker33:     // h >= 4, preserving v18 from the previous round, loading v19-v22.
2707*c0909341SAndroid Build Coastguard Worker        smov            w10,     v29.b[0]
2708*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #4
2709*c0909341SAndroid Build Coastguard Worker        smov            w15,     v29.b[2]
2710*c0909341SAndroid Build Coastguard Worker        movi            v10.16b, #8
2711*c0909341SAndroid Build Coastguard Worker        smov            w16,     v29.b[4]
2712*c0909341SAndroid Build Coastguard Worker        add             x10, x3,  w10, sxtw
2713*c0909341SAndroid Build Coastguard Worker        smov            w17,     v29.b[6]
2714*c0909341SAndroid Build Coastguard Worker        add             x15, x3,  w15, sxtw
2715*c0909341SAndroid Build Coastguard Worker        ld4             {v19.h, v20.h, v21.h, v22.h}[0], [x10]
2716*c0909341SAndroid Build Coastguard Worker        smov            w10,     v29.b[8]
2717*c0909341SAndroid Build Coastguard Worker        add             x16, x3,  w16, sxtw
2718*c0909341SAndroid Build Coastguard Worker        ld4             {v19.h, v20.h, v21.h, v22.h}[1], [x15]
2719*c0909341SAndroid Build Coastguard Worker        smov            w15,     v29.b[10]
2720*c0909341SAndroid Build Coastguard Worker        add             x17, x3,  w17, sxtw
2721*c0909341SAndroid Build Coastguard Worker        ld4             {v19.h, v20.h, v21.h, v22.h}[2], [x16]
2722*c0909341SAndroid Build Coastguard Worker        smov            w16,     v29.b[12]
2723*c0909341SAndroid Build Coastguard Worker        add             x10, x3,  w10, sxtw
2724*c0909341SAndroid Build Coastguard Worker        ld4             {v19.h, v20.h, v21.h, v22.h}[3], [x17]
2725*c0909341SAndroid Build Coastguard Worker        smov            w17,     v29.b[14]
2726*c0909341SAndroid Build Coastguard Worker        add             x15, x3,  w15, sxtw
2727*c0909341SAndroid Build Coastguard Worker        add             x16, x3,  w16, sxtw
2728*c0909341SAndroid Build Coastguard Worker        ld4             {v19.h, v20.h, v21.h, v22.h}[4], [x10]
2729*c0909341SAndroid Build Coastguard Worker        add             x17, x3,  w17, sxtw
2730*c0909341SAndroid Build Coastguard Worker        ld4             {v19.h, v20.h, v21.h, v22.h}[5], [x15]
2731*c0909341SAndroid Build Coastguard Worker        ld4             {v19.h, v20.h, v21.h, v22.h}[6], [x16]
2732*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v10.16b // next base_y
2733*c0909341SAndroid Build Coastguard Worker        ld4             {v19.h, v20.h, v21.h, v22.h}[7], [x17]
2734*c0909341SAndroid Build Coastguard Worker
2735*c0909341SAndroid Build Coastguard Worker        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2736*c0909341SAndroid Build Coastguard Worker        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2737*c0909341SAndroid Build Coastguard Worker        umull2          v11.4s,  v18.8h,  v28.8h
2738*c0909341SAndroid Build Coastguard Worker        umlal2          v11.4s,  v19.8h,  v27.8h
2739*c0909341SAndroid Build Coastguard Worker        umull           v12.4s,  v19.4h,  v28.4h
2740*c0909341SAndroid Build Coastguard Worker        umlal           v12.4s,  v20.4h,  v27.4h
2741*c0909341SAndroid Build Coastguard Worker        umull2          v13.4s,  v19.8h,  v28.8h
2742*c0909341SAndroid Build Coastguard Worker        umlal2          v13.4s,  v20.8h,  v27.8h
2743*c0909341SAndroid Build Coastguard Worker
2744*c0909341SAndroid Build Coastguard Worker        rshrn           v10.4h,  v10.4s,  #6
2745*c0909341SAndroid Build Coastguard Worker        rshrn2          v10.8h,  v11.4s,  #6
2746*c0909341SAndroid Build Coastguard Worker        rshrn           v11.4h,  v12.4s,  #6
2747*c0909341SAndroid Build Coastguard Worker        rshrn2          v11.8h,  v13.4s,  #6
2748*c0909341SAndroid Build Coastguard Worker
2749*c0909341SAndroid Build Coastguard Worker        umull           v12.4s,  v20.4h,  v28.4h  // left[base_y]*(64-frac_y)
2750*c0909341SAndroid Build Coastguard Worker        umlal           v12.4s,  v21.4h,  v27.4h  // + left[base_y+1]*frac_y
2751*c0909341SAndroid Build Coastguard Worker        umull2          v13.4s,  v20.8h,  v28.8h
2752*c0909341SAndroid Build Coastguard Worker        umlal2          v13.4s,  v21.8h,  v27.8h
2753*c0909341SAndroid Build Coastguard Worker        umull           v14.4s,  v21.4h,  v28.4h
2754*c0909341SAndroid Build Coastguard Worker        umlal           v14.4s,  v22.4h,  v27.4h
2755*c0909341SAndroid Build Coastguard Worker        umull2          v18.4s,  v21.8h,  v28.8h
2756*c0909341SAndroid Build Coastguard Worker        umlal2          v18.4s,  v22.8h,  v27.8h
2757*c0909341SAndroid Build Coastguard Worker
2758*c0909341SAndroid Build Coastguard Worker        rshrn           v12.4h,  v12.4s,  #6
2759*c0909341SAndroid Build Coastguard Worker        rshrn2          v12.8h,  v13.4s,  #6
2760*c0909341SAndroid Build Coastguard Worker        rshrn           v13.4h,  v14.4s,  #6
2761*c0909341SAndroid Build Coastguard Worker        rshrn2          v13.8h,  v18.4s,  #6
2762*c0909341SAndroid Build Coastguard Worker
2763*c0909341SAndroid Build Coastguard Worker        st1             {v10.8h}, [x0],  x1
2764*c0909341SAndroid Build Coastguard Worker        cmp             w5,  #2
2765*c0909341SAndroid Build Coastguard Worker        st1             {v11.8h}, [x13], x1
2766*c0909341SAndroid Build Coastguard Worker        st1             {v12.8h}, [x0],  x1
2767*c0909341SAndroid Build Coastguard Worker        st1             {v13.8h}, [x13], x1
2768*c0909341SAndroid Build Coastguard Worker        b.lt            4f
2769*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v22.16b
2770*c0909341SAndroid Build Coastguard Worker        b.gt            33b
2771*c0909341SAndroid Build Coastguard Worker
2772*c0909341SAndroid Build Coastguard Worker34:     // h == 2, preserving v18 from the previous round, loading v19-v20.
2773*c0909341SAndroid Build Coastguard Worker        smov            w10,     v29.b[0]
2774*c0909341SAndroid Build Coastguard Worker        smov            w15,     v29.b[2]
2775*c0909341SAndroid Build Coastguard Worker        movi            v21.16b, #4
2776*c0909341SAndroid Build Coastguard Worker        smov            w16,     v29.b[4]
2777*c0909341SAndroid Build Coastguard Worker        add             x10, x3,  w10, sxtw
2778*c0909341SAndroid Build Coastguard Worker        smov            w17,     v29.b[6]
2779*c0909341SAndroid Build Coastguard Worker        add             x15, x3,  w15, sxtw
2780*c0909341SAndroid Build Coastguard Worker        ld2             {v19.h, v20.h}[0], [x10]
2781*c0909341SAndroid Build Coastguard Worker        smov            w10,     v29.b[8]
2782*c0909341SAndroid Build Coastguard Worker        add             x16, x3,  w16, sxtw
2783*c0909341SAndroid Build Coastguard Worker        ld2             {v19.h, v20.h}[1], [x15]
2784*c0909341SAndroid Build Coastguard Worker        smov            w15,     v29.b[10]
2785*c0909341SAndroid Build Coastguard Worker        add             x17, x3,  w17, sxtw
2786*c0909341SAndroid Build Coastguard Worker        ld2             {v19.h, v20.h}[2], [x16]
2787*c0909341SAndroid Build Coastguard Worker        smov            w16,     v29.b[12]
2788*c0909341SAndroid Build Coastguard Worker        add             x10, x3,  w10, sxtw
2789*c0909341SAndroid Build Coastguard Worker        ld2             {v19.h, v20.h}[3], [x17]
2790*c0909341SAndroid Build Coastguard Worker        smov            w17,     v29.b[14]
2791*c0909341SAndroid Build Coastguard Worker        add             x15, x3,  w15, sxtw
2792*c0909341SAndroid Build Coastguard Worker        add             x16, x3,  w16, sxtw
2793*c0909341SAndroid Build Coastguard Worker        ld2             {v19.h, v20.h}[4], [x10]
2794*c0909341SAndroid Build Coastguard Worker        add             x17, x3,  w17, sxtw
2795*c0909341SAndroid Build Coastguard Worker        ld2             {v19.h, v20.h}[5], [x15]
2796*c0909341SAndroid Build Coastguard Worker        ld2             {v19.h, v20.h}[6], [x16]
2797*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v21.16b // next base_y
2798*c0909341SAndroid Build Coastguard Worker        ld2             {v19.h, v20.h}[7], [x17]
2799*c0909341SAndroid Build Coastguard Worker
2800*c0909341SAndroid Build Coastguard Worker        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2801*c0909341SAndroid Build Coastguard Worker        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2802*c0909341SAndroid Build Coastguard Worker        umull2          v11.4s,  v18.8h,  v28.8h
2803*c0909341SAndroid Build Coastguard Worker        umlal2          v11.4s,  v19.8h,  v27.8h
2804*c0909341SAndroid Build Coastguard Worker        umull           v12.4s,  v19.4h,  v28.4h
2805*c0909341SAndroid Build Coastguard Worker        umlal           v12.4s,  v20.4h,  v27.4h
2806*c0909341SAndroid Build Coastguard Worker        umull2          v13.4s,  v19.8h,  v28.8h
2807*c0909341SAndroid Build Coastguard Worker        umlal2          v13.4s,  v20.8h,  v27.8h
2808*c0909341SAndroid Build Coastguard Worker
2809*c0909341SAndroid Build Coastguard Worker        rshrn           v10.4h,  v10.4s,  #6
2810*c0909341SAndroid Build Coastguard Worker        rshrn2          v10.8h,  v11.4s,  #6
2811*c0909341SAndroid Build Coastguard Worker        rshrn           v11.4h,  v12.4s,  #6
2812*c0909341SAndroid Build Coastguard Worker        rshrn2          v11.8h,  v13.4s,  #6
2813*c0909341SAndroid Build Coastguard Worker
2814*c0909341SAndroid Build Coastguard Worker        st1             {v10.8h}, [x0], x1
2815*c0909341SAndroid Build Coastguard Worker        st1             {v11.8h}, [x13], x1
2816*c0909341SAndroid Build Coastguard Worker        // The h==2 case only happens once at the end, if at all.
2817*c0909341SAndroid Build Coastguard Worker
2818*c0909341SAndroid Build Coastguard Worker4:
2819*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #8
2820*c0909341SAndroid Build Coastguard Worker        b.le            9f
2821*c0909341SAndroid Build Coastguard Worker
2822*c0909341SAndroid Build Coastguard Worker        lsr             x1,  x1,  #1
2823*c0909341SAndroid Build Coastguard Worker        msub            x0,  x1,  x12, x0         // ptr -= h * stride
2824*c0909341SAndroid Build Coastguard Worker        msub            x13, x1,  x12, x13
2825*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
2826*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #16
2827*c0909341SAndroid Build Coastguard Worker        add             x13, x13, #16
2828*c0909341SAndroid Build Coastguard Worker        mov             w5,  w12                  // reset h
2829*c0909341SAndroid Build Coastguard Worker        b               1b
2830*c0909341SAndroid Build Coastguard Worker
2831*c0909341SAndroid Build Coastguard Worker9:
2832*c0909341SAndroid Build Coastguard Worker        ldp             d14, d15, [sp, #0x30]
2833*c0909341SAndroid Build Coastguard Worker        ldp             d12, d13, [sp, #0x20]
2834*c0909341SAndroid Build Coastguard Worker        ldp             d10, d11, [sp, #0x10]
2835*c0909341SAndroid Build Coastguard Worker        ldp             d8,  d9,  [sp], 0x40
2836*c0909341SAndroid Build Coastguard Worker        ret
2837*c0909341SAndroid Build Coastguard Workerendfunc
2838*c0909341SAndroid Build Coastguard Worker
// Jump table for the function that ends just above this table.
// Each entry is a .word holding the distance from the table symbol
// ipred_z2_fill1_tbl to one of the numeric local labels 640, 320, 160, 80
// and 40. The "b" suffix refers to the nearest definition of that label
// earlier in the file, so every target lies in code before this table.
// NOTE(review): the names suggest one entry per block width (64, 32, 16, 8,
// 4), widest first - confirm against the dispatch code, which is not visible
// here. The entry order must stay in sync with however that code computes
// its index into the table.
2839*c0909341SAndroid Build Coastguard Workerjumptable ipred_z2_fill1_tbl
2840*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_z2_fill1_tbl // entry 0: label 640 (presumably w == 64)
2841*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_z2_fill1_tbl // entry 1: label 320 (presumably w == 32)
2842*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_z2_fill1_tbl // entry 2: label 160 (presumably w == 16)
2843*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_z2_fill1_tbl // entry 3: label 80  (presumably w == 8)
2844*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_z2_fill1_tbl // entry 4: label 40  (presumably w == 4)
2845*c0909341SAndroid Build Coastguard Workerendjumptable
2846*c0909341SAndroid Build Coastguard Worker
2847*c0909341SAndroid Build Coastguard Workerfunction ipred_z2_fill2_16bpc_neon, export=1
2848*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #8
2849*c0909341SAndroid Build Coastguard Worker        mov             w8,  #(2 << 6)            // xpos = 2 << 6
2850*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2851*c0909341SAndroid Build Coastguard Worker
2852*c0909341SAndroid Build Coastguard Worker        movrel          x11, increments
2853*c0909341SAndroid Build Coastguard Worker        ld1             {v31.8h},  [x11]          // increments
2854*c0909341SAndroid Build Coastguard Worker        neg             w7,  w7                   // -dy
2855*c0909341SAndroid Build Coastguard Worker        b.eq            80f
2856*c0909341SAndroid Build Coastguard Worker
2857*c0909341SAndroid Build Coastguard Worker40:
2858*c0909341SAndroid Build Coastguard Worker        dup             v30.4h,  w7               // -dy
2859*c0909341SAndroid Build Coastguard Worker        movi            v17.8b,  #1
2860*c0909341SAndroid Build Coastguard Worker
2861*c0909341SAndroid Build Coastguard Worker        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
2862*c0909341SAndroid Build Coastguard Worker        movi            v25.8h,  #0x3e
2863*c0909341SAndroid Build Coastguard Worker        add             v30.4h,  v16.4h,  v30.4h  // -= dy
2864*c0909341SAndroid Build Coastguard Worker
2865*c0909341SAndroid Build Coastguard Worker        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
2866*c0909341SAndroid Build Coastguard Worker        // from left.
2867*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x3]      // left[]
2868*c0909341SAndroid Build Coastguard Worker
2869*c0909341SAndroid Build Coastguard Worker        movi            v26.8h,  #64
2870*c0909341SAndroid Build Coastguard Worker        movi            v19.16b, #4
2871*c0909341SAndroid Build Coastguard Worker
2872*c0909341SAndroid Build Coastguard Worker        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
2873*c0909341SAndroid Build Coastguard Worker        and             v27.8b,  v30.8b,  v25.8b  // frac_y
2874*c0909341SAndroid Build Coastguard Worker
2875*c0909341SAndroid Build Coastguard Worker        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
2876*c0909341SAndroid Build Coastguard Worker
2877*c0909341SAndroid Build Coastguard Worker        movi            v23.4h,  #1, lsl #8
2878*c0909341SAndroid Build Coastguard Worker        shl             v29.8b,  v29.8b,  #1      // 2*base_y
2879*c0909341SAndroid Build Coastguard Worker        zip1            v29.8b,  v29.8b,  v29.8b  // duplicate elements
2880*c0909341SAndroid Build Coastguard Worker        movi            v17.8b,  #2
2881*c0909341SAndroid Build Coastguard Worker        add             v29.8b,  v29.8b,  v23.8b  // 2*base, 2*base+1, ...
2882*c0909341SAndroid Build Coastguard Worker
2883*c0909341SAndroid Build Coastguard Worker        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1 (*2)
2884*c0909341SAndroid Build Coastguard Worker        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2 (*2)
2885*c0909341SAndroid Build Coastguard Worker
2886*c0909341SAndroid Build Coastguard Worker        tbl             v18.8b, {v0.16b}, v29.8b  // left[base_y]
2887*c0909341SAndroid Build Coastguard Worker
2888*c0909341SAndroid Build Coastguard Worker        trn1            v30.2d,  v30.2d,  v28.2d  // base_y + 1, base_y + 2
2889*c0909341SAndroid Build Coastguard Worker
2890*c0909341SAndroid Build Coastguard Worker        sub             v28.4h,  v26.4h,  v27.4h  // 64 - frac_y
2891*c0909341SAndroid Build Coastguard Worker
2892*c0909341SAndroid Build Coastguard Worker        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,0,1,2,3}
2893*c0909341SAndroid Build Coastguard Worker
2894*c0909341SAndroid Build Coastguard Worker        trn1            v27.2d,  v27.2d,  v27.2d  // frac_y
2895*c0909341SAndroid Build Coastguard Worker        trn1            v28.2d,  v28.2d,  v28.2d  // 64 - frac_y
2896*c0909341SAndroid Build Coastguard Worker
2897*c0909341SAndroid Build Coastguard Worker        movi            v29.16b, #4
2898*c0909341SAndroid Build Coastguard Worker        add             v31.8h,  v31.8h,  v31.8h  // {0,2,4,6,0,2,4,6}
2899*c0909341SAndroid Build Coastguard Worker4:
2900*c0909341SAndroid Build Coastguard Worker        asr             w9,  w8,  #6              // base_x
2901*c0909341SAndroid Build Coastguard Worker        dup             v16.4h,  w8               // xpos
2902*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2903*c0909341SAndroid Build Coastguard Worker        cmp             w9,  #-8                  // base_x <= -8
2904*c0909341SAndroid Build Coastguard Worker        asr             w11, w8,  #6              // base_x
2905*c0909341SAndroid Build Coastguard Worker        b.le            49f
2906*c0909341SAndroid Build Coastguard Worker
2907*c0909341SAndroid Build Coastguard Worker        lsl             w9,  w9,  #1
2908*c0909341SAndroid Build Coastguard Worker        lsl             w11, w11, #1
2909*c0909341SAndroid Build Coastguard Worker
2910*c0909341SAndroid Build Coastguard Worker        dup             v17.4h,  w8               // xpos
2911*c0909341SAndroid Build Coastguard Worker
2912*c0909341SAndroid Build Coastguard Worker        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
2913*c0909341SAndroid Build Coastguard Worker        ldr             q6,  [x2, w11, sxtw]
2914*c0909341SAndroid Build Coastguard Worker
2915*c0909341SAndroid Build Coastguard Worker        trn1            v16.2d,  v16.2d,  v17.2d  // xpos
2916*c0909341SAndroid Build Coastguard Worker
2917*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]
2918*c0909341SAndroid Build Coastguard Worker
2919*c0909341SAndroid Build Coastguard Worker        sshr            v20.8h,  v16.8h,  #6      // first base_x for each row
2920*c0909341SAndroid Build Coastguard Worker
2921*c0909341SAndroid Build Coastguard Worker        uzp2            v5.8h,   v4.8h,   v6.8h   // top[base_x+1]
2922*c0909341SAndroid Build Coastguard Worker        uzp1            v4.8h,   v4.8h,   v6.8h   // top[base_x]
2923*c0909341SAndroid Build Coastguard Worker
2924*c0909341SAndroid Build Coastguard Worker        and             v16.16b, v16.16b, v25.16b // frac_x
2925*c0909341SAndroid Build Coastguard Worker
2926*c0909341SAndroid Build Coastguard Worker        trn1            v18.2d,  v18.2d,  v19.2d  // left[base_y], left[base_y+1]
2927*c0909341SAndroid Build Coastguard Worker
2928*c0909341SAndroid Build Coastguard Worker        sub             v17.8h,  v26.8h,  v16.8h  // 64 - frac_x
2929*c0909341SAndroid Build Coastguard Worker
2930*c0909341SAndroid Build Coastguard Worker        add             v20.8h,  v20.8h,  v31.8h  // actual base_x
2931*c0909341SAndroid Build Coastguard Worker
2932*c0909341SAndroid Build Coastguard Worker        umull           v21.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2933*c0909341SAndroid Build Coastguard Worker        umlal           v21.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2934*c0909341SAndroid Build Coastguard Worker        umull2          v22.4s,  v18.8h,  v28.8h
2935*c0909341SAndroid Build Coastguard Worker        umlal2          v22.4s,  v19.8h,  v27.8h
2936*c0909341SAndroid Build Coastguard Worker
2937*c0909341SAndroid Build Coastguard Worker        umull           v23.4s,  v4.4h,   v17.4h  // top[base_x]-*(64-frac_x)
2938*c0909341SAndroid Build Coastguard Worker        umlal           v23.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
2939*c0909341SAndroid Build Coastguard Worker        umull2          v24.4s,  v4.8h,   v17.8h
2940*c0909341SAndroid Build Coastguard Worker        umlal2          v24.4s,  v5.8h,   v16.8h
2941*c0909341SAndroid Build Coastguard Worker
2942*c0909341SAndroid Build Coastguard Worker        cmge            v20.8h,  v20.8h,  #0
2943*c0909341SAndroid Build Coastguard Worker
2944*c0909341SAndroid Build Coastguard Worker        rshrn           v21.4h,  v21.4s,  #6
2945*c0909341SAndroid Build Coastguard Worker        rshrn2          v21.8h,  v22.4s,  #6
2946*c0909341SAndroid Build Coastguard Worker        rshrn           v22.4h,  v23.4s,  #6
2947*c0909341SAndroid Build Coastguard Worker        rshrn2          v22.8h,  v24.4s,  #6
2948*c0909341SAndroid Build Coastguard Worker
2949*c0909341SAndroid Build Coastguard Worker        bit             v21.16b, v22.16b, v20.16b
2950*c0909341SAndroid Build Coastguard Worker
2951*c0909341SAndroid Build Coastguard Worker        st1             {v21.d}[0], [x0], x1
2952*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
2953*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
2954*c0909341SAndroid Build Coastguard Worker        st1             {v21.d}[1], [x0], x1
2955*c0909341SAndroid Build Coastguard Worker        b.le            9f
2956*c0909341SAndroid Build Coastguard Worker
2957*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v19.16b, v19.16b, #8
2958*c0909341SAndroid Build Coastguard Worker        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
2959*c0909341SAndroid Build Coastguard Worker        b               4b
2960*c0909341SAndroid Build Coastguard Worker
2961*c0909341SAndroid Build Coastguard Worker49:
2962*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]
2963*c0909341SAndroid Build Coastguard Worker
2964*c0909341SAndroid Build Coastguard Worker        trn1            v18.2d,  v18.2d,  v19.2d  // left[base_y], left[base_y+1]
2965*c0909341SAndroid Build Coastguard Worker
2966*c0909341SAndroid Build Coastguard Worker        umull           v20.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2967*c0909341SAndroid Build Coastguard Worker        umlal           v20.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2968*c0909341SAndroid Build Coastguard Worker        umull2          v21.4s,  v18.8h,  v28.8h
2969*c0909341SAndroid Build Coastguard Worker        umlal2          v21.4s,  v19.8h,  v27.8h
2970*c0909341SAndroid Build Coastguard Worker
2971*c0909341SAndroid Build Coastguard Worker        rshrn           v20.4h,  v20.4s,  #6
2972*c0909341SAndroid Build Coastguard Worker        rshrn2          v20.8h,  v21.4s,  #6
2973*c0909341SAndroid Build Coastguard Worker
2974*c0909341SAndroid Build Coastguard Worker        st1             {v20.d}[0], [x0], x1
2975*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
2976*c0909341SAndroid Build Coastguard Worker        st1             {v20.d}[1], [x0], x1
2977*c0909341SAndroid Build Coastguard Worker        b.le            9f
2978*c0909341SAndroid Build Coastguard Worker
2979*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v19.16b, v19.16b, #8
2980*c0909341SAndroid Build Coastguard Worker        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
2981*c0909341SAndroid Build Coastguard Worker        b               49b
2982*c0909341SAndroid Build Coastguard Worker
2983*c0909341SAndroid Build Coastguard Worker9:
2984*c0909341SAndroid Build Coastguard Worker        ret
2985*c0909341SAndroid Build Coastguard Worker
2986*c0909341SAndroid Build Coastguard Worker80:
2987*c0909341SAndroid Build Coastguard Worker        stp             d8,  d9,  [sp, #-0x40]!
2988*c0909341SAndroid Build Coastguard Worker        stp             d10, d11, [sp, #0x10]
2989*c0909341SAndroid Build Coastguard Worker        stp             d12, d13, [sp, #0x20]
2990*c0909341SAndroid Build Coastguard Worker        stp             d14, d15, [sp, #0x30]
2991*c0909341SAndroid Build Coastguard Worker
2992*c0909341SAndroid Build Coastguard Worker        dup             v18.8h,  w7               // -dy
2993*c0909341SAndroid Build Coastguard Worker        movi            v17.8b,  #1
2994*c0909341SAndroid Build Coastguard Worker
2995*c0909341SAndroid Build Coastguard Worker        mul             v16.8h,  v31.8h,  v18.8h  // {0,1,2,3,4,5,6,7}* -dy
2996*c0909341SAndroid Build Coastguard Worker        movi            v25.8h,  #0x3e
2997*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v18.8h  // -= dy
2998*c0909341SAndroid Build Coastguard Worker
2999*c0909341SAndroid Build Coastguard Worker        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
3000*c0909341SAndroid Build Coastguard Worker        // from left.
3001*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x3]      // left[]
3002*c0909341SAndroid Build Coastguard Worker
3003*c0909341SAndroid Build Coastguard Worker        movi            v26.8h,  #64
3004*c0909341SAndroid Build Coastguard Worker        movi            v19.16b, #4
3005*c0909341SAndroid Build Coastguard Worker
3006*c0909341SAndroid Build Coastguard Worker        shrn            v29.8b,  v16.8h,  #6      // ypos >> 6
3007*c0909341SAndroid Build Coastguard Worker        and             v27.16b, v16.16b, v25.16b // frac_y
3008*c0909341SAndroid Build Coastguard Worker
3009*c0909341SAndroid Build Coastguard Worker        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
3010*c0909341SAndroid Build Coastguard Worker
3011*c0909341SAndroid Build Coastguard Worker        movi            v23.8h,  #1, lsl #8
3012*c0909341SAndroid Build Coastguard Worker        shl             v29.8b,  v29.8b,  #1      // 2*base_y
3013*c0909341SAndroid Build Coastguard Worker        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
3014*c0909341SAndroid Build Coastguard Worker        movi            v17.16b, #2
3015*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ...
3016*c0909341SAndroid Build Coastguard Worker
3017*c0909341SAndroid Build Coastguard Worker        // Cut corners here; for the first row we don't expect to need to
3018*c0909341SAndroid Build Coastguard Worker        // read outside of v0.
3019*c0909341SAndroid Build Coastguard Worker        tbl             v18.16b, {v0.16b}, v29.16b // left[base_y]
3020*c0909341SAndroid Build Coastguard Worker
3021*c0909341SAndroid Build Coastguard Worker        add             v30.16b, v29.16b, v19.16b // base_y + 2 (*2)
3022*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v17.16b // base_y + 1 (*2)
3023*c0909341SAndroid Build Coastguard Worker
3024*c0909341SAndroid Build Coastguard Worker        sub             v28.8h,  v26.8h,  v27.8h  // 64 - frac_y
3025*c0909341SAndroid Build Coastguard Worker
3026*c0909341SAndroid Build Coastguard Worker        movi            v24.16b, #4
3027*c0909341SAndroid Build Coastguard Worker        add             v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14}
3028*c0909341SAndroid Build Coastguard Worker8:
3029*c0909341SAndroid Build Coastguard Worker        asr             w9,  w8,  #6              // base_x
3030*c0909341SAndroid Build Coastguard Worker        dup             v16.8h,   w8              // xpos
3031*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
3032*c0909341SAndroid Build Coastguard Worker        cmp             w9,  #-16                 // base_x <= -16
3033*c0909341SAndroid Build Coastguard Worker        asr             w11, w8,  #6              // base_x
3034*c0909341SAndroid Build Coastguard Worker        b.le            89f
3035*c0909341SAndroid Build Coastguard Worker
3036*c0909341SAndroid Build Coastguard Worker        dup             v17.8h,   w8              // xpos
3037*c0909341SAndroid Build Coastguard Worker
3038*c0909341SAndroid Build Coastguard Worker        add             x9,  x2,  w9,  sxtw #1
3039*c0909341SAndroid Build Coastguard Worker        add             x11, x2,  w11, sxtw #1
3040*c0909341SAndroid Build Coastguard Worker
3041*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h}, [x9]      // top[base_x]
3042*c0909341SAndroid Build Coastguard Worker        ld1             {v6.8h, v7.8h}, [x11]
3043*c0909341SAndroid Build Coastguard Worker
3044*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1]
3045*c0909341SAndroid Build Coastguard Worker
3046*c0909341SAndroid Build Coastguard Worker        sshr            v21.8h,  v16.8h,  #6      // first base_x
3047*c0909341SAndroid Build Coastguard Worker        sshr            v22.8h,  v17.8h,  #6
3048*c0909341SAndroid Build Coastguard Worker
3049*c0909341SAndroid Build Coastguard Worker        tbl             v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2]
3050*c0909341SAndroid Build Coastguard Worker
3051*c0909341SAndroid Build Coastguard Worker        uzp2            v2.8h,   v4.8h,   v5.8h   // top[base_x+1]
3052*c0909341SAndroid Build Coastguard Worker        uzp1            v4.8h,   v4.8h,   v5.8h   // top[base_x]
3053*c0909341SAndroid Build Coastguard Worker        uzp2            v3.8h,   v6.8h,   v7.8h
3054*c0909341SAndroid Build Coastguard Worker        uzp1            v6.8h,   v6.8h,   v7.8h
3055*c0909341SAndroid Build Coastguard Worker        mov             v5.16b,  v2.16b
3056*c0909341SAndroid Build Coastguard Worker        mov             v7.16b,  v3.16b
3057*c0909341SAndroid Build Coastguard Worker
3058*c0909341SAndroid Build Coastguard Worker        and             v16.16b, v16.16b, v25.16b // frac_x
3059*c0909341SAndroid Build Coastguard Worker        and             v17.16b, v17.16b, v25.16b
3060*c0909341SAndroid Build Coastguard Worker
3061*c0909341SAndroid Build Coastguard Worker        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
3062*c0909341SAndroid Build Coastguard Worker        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
3063*c0909341SAndroid Build Coastguard Worker
3064*c0909341SAndroid Build Coastguard Worker        sub             v8.8h,   v26.8h,  v16.8h  // 64 - frac_x
3065*c0909341SAndroid Build Coastguard Worker        sub             v9.8h,   v26.8h,  v17.8h
3066*c0909341SAndroid Build Coastguard Worker
3067*c0909341SAndroid Build Coastguard Worker        umull2          v11.4s,  v18.8h,  v28.8h
3068*c0909341SAndroid Build Coastguard Worker        umlal2          v11.4s,  v19.8h,  v27.8h
3069*c0909341SAndroid Build Coastguard Worker
3070*c0909341SAndroid Build Coastguard Worker        add             v21.8h,  v21.8h,  v31.8h  // actual base_x
3071*c0909341SAndroid Build Coastguard Worker        add             v22.8h,  v22.8h,  v31.8h
3072*c0909341SAndroid Build Coastguard Worker
3073*c0909341SAndroid Build Coastguard Worker        umull           v12.4s,  v19.4h,  v28.4h
3074*c0909341SAndroid Build Coastguard Worker        umlal           v12.4s,  v20.4h,  v27.4h
3075*c0909341SAndroid Build Coastguard Worker        umull2          v13.4s,  v19.8h,  v28.8h
3076*c0909341SAndroid Build Coastguard Worker        umlal2          v13.4s,  v20.8h,  v27.8h
3077*c0909341SAndroid Build Coastguard Worker
3078*c0909341SAndroid Build Coastguard Worker        rshrn           v10.4h,  v10.4s,  #6
3079*c0909341SAndroid Build Coastguard Worker        rshrn2          v10.8h,  v11.4s,  #6
3080*c0909341SAndroid Build Coastguard Worker        rshrn           v11.4h,  v12.4s,  #6
3081*c0909341SAndroid Build Coastguard Worker        rshrn2          v11.8h,  v13.4s,  #6
3082*c0909341SAndroid Build Coastguard Worker
3083*c0909341SAndroid Build Coastguard Worker        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]-*(64-frac_x)
3084*c0909341SAndroid Build Coastguard Worker        umlal           v12.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
3085*c0909341SAndroid Build Coastguard Worker        umull2          v13.4s,  v4.8h,   v8.8h
3086*c0909341SAndroid Build Coastguard Worker        umlal2          v13.4s,  v5.8h,   v16.8h
3087*c0909341SAndroid Build Coastguard Worker        umull           v14.4s,  v6.4h,   v9.4h
3088*c0909341SAndroid Build Coastguard Worker        umlal           v14.4s,  v7.4h,   v17.4h
3089*c0909341SAndroid Build Coastguard Worker        umull2          v18.4s,  v6.8h,   v9.8h
3090*c0909341SAndroid Build Coastguard Worker        umlal2          v18.4s,  v7.8h,   v17.8h
3091*c0909341SAndroid Build Coastguard Worker
3092*c0909341SAndroid Build Coastguard Worker        cmge            v21.8h,  v21.8h,  #0
3093*c0909341SAndroid Build Coastguard Worker        cmge            v22.8h,  v22.8h,  #0
3094*c0909341SAndroid Build Coastguard Worker
3095*c0909341SAndroid Build Coastguard Worker        rshrn           v12.4h,  v12.4s,  #6
3096*c0909341SAndroid Build Coastguard Worker        rshrn2          v12.8h,  v13.4s,  #6
3097*c0909341SAndroid Build Coastguard Worker        rshrn           v13.4h,  v14.4s,  #6
3098*c0909341SAndroid Build Coastguard Worker        rshrn2          v13.8h,  v18.4s,  #6
3099*c0909341SAndroid Build Coastguard Worker
3100*c0909341SAndroid Build Coastguard Worker        bit             v10.16b, v12.16b, v21.16b
3101*c0909341SAndroid Build Coastguard Worker        bit             v11.16b, v13.16b, v22.16b
3102*c0909341SAndroid Build Coastguard Worker
3103*c0909341SAndroid Build Coastguard Worker        st1             {v10.8h}, [x0], x1
3104*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
3105*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
3106*c0909341SAndroid Build Coastguard Worker        st1             {v11.8h}, [x0], x1
3107*c0909341SAndroid Build Coastguard Worker        b.le            9f
3108*c0909341SAndroid Build Coastguard Worker
3109*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v20.16b
3110*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
3111*c0909341SAndroid Build Coastguard Worker        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
3112*c0909341SAndroid Build Coastguard Worker        b               8b
3113*c0909341SAndroid Build Coastguard Worker
3114*c0909341SAndroid Build Coastguard Worker89:
3115*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1]
3116*c0909341SAndroid Build Coastguard Worker        tbl             v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2]
3117*c0909341SAndroid Build Coastguard Worker
3118*c0909341SAndroid Build Coastguard Worker        umull           v4.4s,   v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
3119*c0909341SAndroid Build Coastguard Worker        umlal           v4.4s,   v19.4h,  v27.4h  // + left[base_y+1]*frac_y
3120*c0909341SAndroid Build Coastguard Worker        umull2          v5.4s,   v18.8h,  v28.8h
3121*c0909341SAndroid Build Coastguard Worker        umlal2          v5.4s,   v19.8h,  v27.8h
3122*c0909341SAndroid Build Coastguard Worker        umull           v6.4s,   v19.4h,  v28.4h
3123*c0909341SAndroid Build Coastguard Worker        umlal           v6.4s,   v20.4h,  v27.4h
3124*c0909341SAndroid Build Coastguard Worker        umull2          v7.4s,   v19.8h,  v28.8h
3125*c0909341SAndroid Build Coastguard Worker        umlal2          v7.4s,   v20.8h,  v27.8h
3126*c0909341SAndroid Build Coastguard Worker
3127*c0909341SAndroid Build Coastguard Worker        rshrn           v4.4h,   v4.4s,   #6
3128*c0909341SAndroid Build Coastguard Worker        rshrn2          v4.8h,   v5.4s,   #6
3129*c0909341SAndroid Build Coastguard Worker        rshrn           v5.4h,   v6.4s,   #6
3130*c0909341SAndroid Build Coastguard Worker        rshrn2          v5.8h,   v7.4s,   #6
3131*c0909341SAndroid Build Coastguard Worker
3132*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h}, [x0], x1
3133*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
3134*c0909341SAndroid Build Coastguard Worker        st1             {v5.8h}, [x0], x1
3135*c0909341SAndroid Build Coastguard Worker        b.le            9f
3136*c0909341SAndroid Build Coastguard Worker
3137*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v20.16b
3138*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
3139*c0909341SAndroid Build Coastguard Worker        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
3140*c0909341SAndroid Build Coastguard Worker        b               89b
3141*c0909341SAndroid Build Coastguard Worker
3142*c0909341SAndroid Build Coastguard Worker9:
3143*c0909341SAndroid Build Coastguard Worker        ldp             d14, d15, [sp, #0x30]
3144*c0909341SAndroid Build Coastguard Worker        ldp             d12, d13, [sp, #0x20]
3145*c0909341SAndroid Build Coastguard Worker        ldp             d10, d11, [sp, #0x10]
3146*c0909341SAndroid Build Coastguard Worker        ldp             d8,  d9,  [sp], 0x40
3147*c0909341SAndroid Build Coastguard Worker        ret
3148*c0909341SAndroid Build Coastguard Workerendfunc
3149*c0909341SAndroid Build Coastguard Worker
3150*c0909341SAndroid Build Coastguard Workerfunction ipred_z2_fill3_16bpc_neon, export=1
3151*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #8
3152*c0909341SAndroid Build Coastguard Worker        mov             w8,  #(1 << 6)            // xpos = 1 << 6
3153*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
3154*c0909341SAndroid Build Coastguard Worker
3155*c0909341SAndroid Build Coastguard Worker        movrel          x11, increments
3156*c0909341SAndroid Build Coastguard Worker        ld1             {v31.8h},  [x11]          // increments
3157*c0909341SAndroid Build Coastguard Worker        neg             w7,  w7                   // -dy
3158*c0909341SAndroid Build Coastguard Worker        b.eq            80f
3159*c0909341SAndroid Build Coastguard Worker
3160*c0909341SAndroid Build Coastguard Worker40:
3161*c0909341SAndroid Build Coastguard Worker        dup             v30.4h,  w7               // -dy
3162*c0909341SAndroid Build Coastguard Worker        movi            v17.8b,  #1
3163*c0909341SAndroid Build Coastguard Worker
3164*c0909341SAndroid Build Coastguard Worker        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
3165*c0909341SAndroid Build Coastguard Worker        movi            v25.8h,  #0x3e
3166*c0909341SAndroid Build Coastguard Worker        add             v30.4h,  v16.4h,  v30.4h  // -= dy
3167*c0909341SAndroid Build Coastguard Worker
3168*c0909341SAndroid Build Coastguard Worker        // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
3169*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h}, [x3]    // left[]
3170*c0909341SAndroid Build Coastguard Worker
3171*c0909341SAndroid Build Coastguard Worker        movi            v26.8h,  #64
3172*c0909341SAndroid Build Coastguard Worker        movi            v19.16b, #2
3173*c0909341SAndroid Build Coastguard Worker
3174*c0909341SAndroid Build Coastguard Worker        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
3175*c0909341SAndroid Build Coastguard Worker        and             v27.8b,  v30.8b,  v25.8b  // frac_y
3176*c0909341SAndroid Build Coastguard Worker
3177*c0909341SAndroid Build Coastguard Worker        add             v29.8b,  v29.8b,  v19.8b  // base_y = (ypos >> 6) + 2
3178*c0909341SAndroid Build Coastguard Worker
3179*c0909341SAndroid Build Coastguard Worker        movi            v23.4h,  #1, lsl #8
3180*c0909341SAndroid Build Coastguard Worker        shl             v29.8b,  v29.8b,  #1      // 2*base_y
3181*c0909341SAndroid Build Coastguard Worker        movi            v19.16b, #4
3182*c0909341SAndroid Build Coastguard Worker        zip1            v29.8b,  v29.8b,  v29.8b  // duplicate elements
3183*c0909341SAndroid Build Coastguard Worker        movi            v17.8b,  #2
3184*c0909341SAndroid Build Coastguard Worker        add             v29.8b,  v29.8b,  v23.8b  // 2*base, 2*base+1, ...
3185*c0909341SAndroid Build Coastguard Worker
3186*c0909341SAndroid Build Coastguard Worker        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1 (*2)
3187*c0909341SAndroid Build Coastguard Worker        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2 (*2)
3188*c0909341SAndroid Build Coastguard Worker
3189*c0909341SAndroid Build Coastguard Worker        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,0,1,2,3}
3190*c0909341SAndroid Build Coastguard Worker
3191*c0909341SAndroid Build Coastguard Worker        add             v24.8b,  v30.8b,  v19.8b  // base_y + 3 (*2)
3192*c0909341SAndroid Build Coastguard Worker
3193*c0909341SAndroid Build Coastguard Worker        trn1            v29.2d,  v29.2d,  v28.2d  // base_y + 0, base_y + 2
3194*c0909341SAndroid Build Coastguard Worker        trn1            v30.2d,  v30.2d,  v24.2d  // base_y + 1, base_y + 3
3195*c0909341SAndroid Build Coastguard Worker
3196*c0909341SAndroid Build Coastguard Worker        sub             v28.4h,  v26.4h,  v27.4h  // 64 - frac_y
3197*c0909341SAndroid Build Coastguard Worker
3198*c0909341SAndroid Build Coastguard Worker        trn1            v27.2d,  v27.2d,  v27.2d  // frac_y
3199*c0909341SAndroid Build Coastguard Worker        trn1            v28.2d,  v28.2d,  v28.2d  // 64 - frac_y
3200*c0909341SAndroid Build Coastguard Worker
3201*c0909341SAndroid Build Coastguard Worker        movi            v24.16b, #8
3202*c0909341SAndroid Build Coastguard Worker4:
3203*c0909341SAndroid Build Coastguard Worker        asr             w9,  w8,  #6              // base_x
3204*c0909341SAndroid Build Coastguard Worker        dup             v16.4h,  w8               // xpos
3205*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
3206*c0909341SAndroid Build Coastguard Worker        cmp             w9,  #-4                  // base_x <= -4
3207*c0909341SAndroid Build Coastguard Worker        asr             w11, w8,  #6              // base_x
3208*c0909341SAndroid Build Coastguard Worker        b.le            49f
3209*c0909341SAndroid Build Coastguard Worker
3210*c0909341SAndroid Build Coastguard Worker        lsl             w9,  w9,  #1
3211*c0909341SAndroid Build Coastguard Worker        lsl             w11, w11, #1
3212*c0909341SAndroid Build Coastguard Worker
3213*c0909341SAndroid Build Coastguard Worker        dup             v17.4h,  w8               // xpos
3214*c0909341SAndroid Build Coastguard Worker
3215*c0909341SAndroid Build Coastguard Worker        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
3216*c0909341SAndroid Build Coastguard Worker        ldr             q6,  [x2, w11, sxtw]
3217*c0909341SAndroid Build Coastguard Worker
3218*c0909341SAndroid Build Coastguard Worker        trn1            v16.2d,  v16.2d,  v17.2d  // xpos
3219*c0909341SAndroid Build Coastguard Worker
3220*c0909341SAndroid Build Coastguard Worker        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
3221*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]
3222*c0909341SAndroid Build Coastguard Worker
3223*c0909341SAndroid Build Coastguard Worker        sshr            v20.8h,  v16.8h,  #6      // first base_x for each row
3224*c0909341SAndroid Build Coastguard Worker
3225*c0909341SAndroid Build Coastguard Worker        ext             v5.16b,  v4.16b,  v4.16b,  #2 // top[base_x+1]
3226*c0909341SAndroid Build Coastguard Worker        ext             v7.16b,  v6.16b,  v6.16b,  #2
3227*c0909341SAndroid Build Coastguard Worker
3228*c0909341SAndroid Build Coastguard Worker        and             v16.16b, v16.16b, v25.16b // frac_x
3229*c0909341SAndroid Build Coastguard Worker
3230*c0909341SAndroid Build Coastguard Worker        trn1            v4.2d,   v4.2d,   v6.2d   // top[base_x]
3231*c0909341SAndroid Build Coastguard Worker        trn1            v5.2d,   v5.2d,   v7.2d   // top[base_x+1]
3232*c0909341SAndroid Build Coastguard Worker
3233*c0909341SAndroid Build Coastguard Worker        sub             v17.8h,  v26.8h,  v16.8h  // 64 - frac_x
3234*c0909341SAndroid Build Coastguard Worker
3235*c0909341SAndroid Build Coastguard Worker        add             v20.8h,  v20.8h,  v31.8h  // actual base_x
3236*c0909341SAndroid Build Coastguard Worker
3237*c0909341SAndroid Build Coastguard Worker        umull           v21.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
3238*c0909341SAndroid Build Coastguard Worker        umlal           v21.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
3239*c0909341SAndroid Build Coastguard Worker        umull2          v22.4s,  v18.8h,  v28.8h
3240*c0909341SAndroid Build Coastguard Worker        umlal2          v22.4s,  v19.8h,  v27.8h
3241*c0909341SAndroid Build Coastguard Worker
3242*c0909341SAndroid Build Coastguard Worker        umull           v23.4s,  v4.4h,   v17.4h  // top[base_x]-*(64-frac_x)
3243*c0909341SAndroid Build Coastguard Worker        umlal           v23.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
3244*c0909341SAndroid Build Coastguard Worker        umull2          v24.4s,  v4.8h,   v17.8h
3245*c0909341SAndroid Build Coastguard Worker        umlal2          v24.4s,  v5.8h,   v16.8h
3246*c0909341SAndroid Build Coastguard Worker
3247*c0909341SAndroid Build Coastguard Worker        cmge            v20.8h,  v20.8h,  #0
3248*c0909341SAndroid Build Coastguard Worker
3249*c0909341SAndroid Build Coastguard Worker        rshrn           v21.4h,  v21.4s,  #6
3250*c0909341SAndroid Build Coastguard Worker        rshrn2          v21.8h,  v22.4s,  #6
3251*c0909341SAndroid Build Coastguard Worker        rshrn           v22.4h,  v23.4s,  #6
3252*c0909341SAndroid Build Coastguard Worker        rshrn2          v22.8h,  v24.4s,  #6
3253*c0909341SAndroid Build Coastguard Worker
3254*c0909341SAndroid Build Coastguard Worker        movi            v24.16b, #8
3255*c0909341SAndroid Build Coastguard Worker
3256*c0909341SAndroid Build Coastguard Worker        bit             v21.16b, v22.16b, v20.16b
3257*c0909341SAndroid Build Coastguard Worker
3258*c0909341SAndroid Build Coastguard Worker        st1             {v21.d}[0], [x0], x1
3259*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
3260*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
3261*c0909341SAndroid Build Coastguard Worker        st1             {v21.d}[1], [x0], x1
3262*c0909341SAndroid Build Coastguard Worker        b.le            9f
3263*c0909341SAndroid Build Coastguard Worker
3264*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v24.16b // base_y += 4 (*2)
3265*c0909341SAndroid Build Coastguard Worker        add             v30.16b, v30.16b, v24.16b // base_y += 4 (*2)
3266*c0909341SAndroid Build Coastguard Worker        b               4b
3267*c0909341SAndroid Build Coastguard Worker
3268*c0909341SAndroid Build Coastguard Worker49:
3269*c0909341SAndroid Build Coastguard Worker        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
3270*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]
3271*c0909341SAndroid Build Coastguard Worker
3272*c0909341SAndroid Build Coastguard Worker        umull           v20.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
3273*c0909341SAndroid Build Coastguard Worker        umlal           v20.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
3274*c0909341SAndroid Build Coastguard Worker        umull2          v21.4s,  v18.8h,  v28.8h
3275*c0909341SAndroid Build Coastguard Worker        umlal2          v21.4s,  v19.8h,  v27.8h
3276*c0909341SAndroid Build Coastguard Worker
3277*c0909341SAndroid Build Coastguard Worker        rshrn           v20.4h,  v20.4s,  #6
3278*c0909341SAndroid Build Coastguard Worker        rshrn2          v20.8h,  v21.4s,  #6
3279*c0909341SAndroid Build Coastguard Worker
3280*c0909341SAndroid Build Coastguard Worker        st1             {v20.d}[0], [x0], x1
3281*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
3282*c0909341SAndroid Build Coastguard Worker        st1             {v20.d}[1], [x0], x1
3283*c0909341SAndroid Build Coastguard Worker        b.le            9f
3284*c0909341SAndroid Build Coastguard Worker
3285*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v24.16b // base_y += 4 (*2)
3286*c0909341SAndroid Build Coastguard Worker        add             v30.16b, v30.16b, v24.16b // base_y += 4 (*2)
3287*c0909341SAndroid Build Coastguard Worker        b               49b
3288*c0909341SAndroid Build Coastguard Worker
3289*c0909341SAndroid Build Coastguard Worker9:
3290*c0909341SAndroid Build Coastguard Worker        ret
3291*c0909341SAndroid Build Coastguard Worker
3292*c0909341SAndroid Build Coastguard Worker80:
3293*c0909341SAndroid Build Coastguard Worker        stp             d8,  d9,  [sp, #-0x40]!
3294*c0909341SAndroid Build Coastguard Worker        stp             d10, d11, [sp, #0x10]
3295*c0909341SAndroid Build Coastguard Worker        stp             d12, d13, [sp, #0x20]
3296*c0909341SAndroid Build Coastguard Worker        stp             d14, d15, [sp, #0x30]
3297*c0909341SAndroid Build Coastguard Worker
3298*c0909341SAndroid Build Coastguard Worker        dup             v18.8h,  w7               // -dy
3299*c0909341SAndroid Build Coastguard Worker        movi            v17.16b, #2
3300*c0909341SAndroid Build Coastguard Worker
3301*c0909341SAndroid Build Coastguard Worker        mul             v16.8h,  v31.8h,  v18.8h  // {0,1,2,3,4,5,6,7}* -dy
3302*c0909341SAndroid Build Coastguard Worker        movi            v25.8h,  #0x3e
3303*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v18.8h  // -= dy
3304*c0909341SAndroid Build Coastguard Worker
3305*c0909341SAndroid Build Coastguard Worker        // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
3306*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h}, [x3]    // left[]
3307*c0909341SAndroid Build Coastguard Worker
3308*c0909341SAndroid Build Coastguard Worker        movi            v26.8h,  #64
3309*c0909341SAndroid Build Coastguard Worker        movi            v19.16b, #4
3310*c0909341SAndroid Build Coastguard Worker
3311*c0909341SAndroid Build Coastguard Worker        shrn            v29.8b,  v16.8h,  #6      // ypos >> 6
3312*c0909341SAndroid Build Coastguard Worker        and             v27.16b, v16.16b, v25.16b // frac_y
3313*c0909341SAndroid Build Coastguard Worker
3314*c0909341SAndroid Build Coastguard Worker        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 2
3315*c0909341SAndroid Build Coastguard Worker
3316*c0909341SAndroid Build Coastguard Worker        movi            v23.8h,  #1, lsl #8
3317*c0909341SAndroid Build Coastguard Worker        shl             v29.8b,  v29.8b,  #1      // 2*base_y
3318*c0909341SAndroid Build Coastguard Worker        mov             v18.16b, v15.16b          // left[0]
3319*c0909341SAndroid Build Coastguard Worker        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
3320*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ...
3321*c0909341SAndroid Build Coastguard Worker
3322*c0909341SAndroid Build Coastguard Worker        add             v30.16b, v29.16b, v17.16b // base_y + 1 (*2)
3323*c0909341SAndroid Build Coastguard Worker
3324*c0909341SAndroid Build Coastguard Worker        sub             v28.8h,  v26.8h,  v27.8h  // 64 - frac_y
3325*c0909341SAndroid Build Coastguard Worker
3326*c0909341SAndroid Build Coastguard Worker        movi            v24.16b, #4
3327*c0909341SAndroid Build Coastguard Worker8:
3328*c0909341SAndroid Build Coastguard Worker        asr             w9,  w8,  #6              // base_x
3329*c0909341SAndroid Build Coastguard Worker        dup             v16.8h,   w8              // xpos
3330*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
3331*c0909341SAndroid Build Coastguard Worker        cmp             w9,  #-16                 // base_x <= -16
3332*c0909341SAndroid Build Coastguard Worker        asr             w11, w8,  #6              // base_x
3333*c0909341SAndroid Build Coastguard Worker        b.le            89f
3334*c0909341SAndroid Build Coastguard Worker
3335*c0909341SAndroid Build Coastguard Worker        dup             v17.8h,   w8              // xpos
3336*c0909341SAndroid Build Coastguard Worker
3337*c0909341SAndroid Build Coastguard Worker        add             x9,  x2,  w9,  sxtw #1
3338*c0909341SAndroid Build Coastguard Worker        add             x11, x2,  w11, sxtw #1
3339*c0909341SAndroid Build Coastguard Worker
3340*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h}, [x9]      // top[base_x]
3341*c0909341SAndroid Build Coastguard Worker        ld1             {v6.8h, v7.8h}, [x11]
3342*c0909341SAndroid Build Coastguard Worker
3343*c0909341SAndroid Build Coastguard Worker        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0]
3344*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
3345*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1]
3346*c0909341SAndroid Build Coastguard Worker        add             v30.16b, v30.16b, v24.16b
3347*c0909341SAndroid Build Coastguard Worker
3348*c0909341SAndroid Build Coastguard Worker        sshr            v22.8h,  v16.8h,  #6      // first base_x
3349*c0909341SAndroid Build Coastguard Worker        tbl             v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2]
3350*c0909341SAndroid Build Coastguard Worker        sshr            v23.8h,  v17.8h,  #6
3351*c0909341SAndroid Build Coastguard Worker        tbl             v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3]
3352*c0909341SAndroid Build Coastguard Worker
3353*c0909341SAndroid Build Coastguard Worker        ext             v5.16b,  v4.16b,  v5.16b,  #2 // top[base_x+1]
3354*c0909341SAndroid Build Coastguard Worker        ext             v7.16b,  v6.16b,  v7.16b,  #2
3355*c0909341SAndroid Build Coastguard Worker
3356*c0909341SAndroid Build Coastguard Worker        and             v16.16b, v16.16b, v25.16b // frac_x
3357*c0909341SAndroid Build Coastguard Worker        and             v17.16b, v17.16b, v25.16b
3358*c0909341SAndroid Build Coastguard Worker
3359*c0909341SAndroid Build Coastguard Worker        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
3360*c0909341SAndroid Build Coastguard Worker        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
3361*c0909341SAndroid Build Coastguard Worker
3362*c0909341SAndroid Build Coastguard Worker        sub             v8.8h,   v26.8h,  v16.8h  // 64 - frac_x
3363*c0909341SAndroid Build Coastguard Worker        sub             v9.8h,   v26.8h,  v17.8h
3364*c0909341SAndroid Build Coastguard Worker
3365*c0909341SAndroid Build Coastguard Worker        umull2          v11.4s,  v18.8h,  v28.8h
3366*c0909341SAndroid Build Coastguard Worker        umlal2          v11.4s,  v19.8h,  v27.8h
3367*c0909341SAndroid Build Coastguard Worker
3368*c0909341SAndroid Build Coastguard Worker        add             v22.8h,  v22.8h,  v31.8h  // actual base_x
3369*c0909341SAndroid Build Coastguard Worker        add             v23.8h,  v23.8h,  v31.8h
3370*c0909341SAndroid Build Coastguard Worker
3371*c0909341SAndroid Build Coastguard Worker        umull           v12.4s,  v20.4h,  v28.4h
3372*c0909341SAndroid Build Coastguard Worker        umlal           v12.4s,  v21.4h,  v27.4h
3373*c0909341SAndroid Build Coastguard Worker        umull2          v13.4s,  v20.8h,  v28.8h
3374*c0909341SAndroid Build Coastguard Worker        umlal2          v13.4s,  v21.8h,  v27.8h
3375*c0909341SAndroid Build Coastguard Worker
3376*c0909341SAndroid Build Coastguard Worker        rshrn           v10.4h,  v10.4s,  #6
3377*c0909341SAndroid Build Coastguard Worker        rshrn2          v10.8h,  v11.4s,  #6
3378*c0909341SAndroid Build Coastguard Worker        rshrn           v11.4h,  v12.4s,  #6
3379*c0909341SAndroid Build Coastguard Worker        rshrn2          v11.8h,  v13.4s,  #6
3380*c0909341SAndroid Build Coastguard Worker
3381*c0909341SAndroid Build Coastguard Worker        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]-*(64-frac_x)
3382*c0909341SAndroid Build Coastguard Worker        umlal           v12.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
3383*c0909341SAndroid Build Coastguard Worker        umull2          v13.4s,  v4.8h,   v8.8h
3384*c0909341SAndroid Build Coastguard Worker        umlal2          v13.4s,  v5.8h,   v16.8h
3385*c0909341SAndroid Build Coastguard Worker        umull           v14.4s,  v6.4h,   v9.4h
3386*c0909341SAndroid Build Coastguard Worker        umlal           v14.4s,  v7.4h,   v17.4h
3387*c0909341SAndroid Build Coastguard Worker        umull2          v18.4s,  v6.8h,   v9.8h
3388*c0909341SAndroid Build Coastguard Worker        umlal2          v18.4s,  v7.8h,   v17.8h
3389*c0909341SAndroid Build Coastguard Worker
3390*c0909341SAndroid Build Coastguard Worker        cmge            v22.8h,  v22.8h,  #0
3391*c0909341SAndroid Build Coastguard Worker        cmge            v23.8h,  v23.8h,  #0
3392*c0909341SAndroid Build Coastguard Worker
3393*c0909341SAndroid Build Coastguard Worker        rshrn           v12.4h,  v12.4s,  #6
3394*c0909341SAndroid Build Coastguard Worker        rshrn2          v12.8h,  v13.4s,  #6
3395*c0909341SAndroid Build Coastguard Worker        rshrn           v13.4h,  v14.4s,  #6
3396*c0909341SAndroid Build Coastguard Worker        rshrn2          v13.8h,  v18.4s,  #6
3397*c0909341SAndroid Build Coastguard Worker
3398*c0909341SAndroid Build Coastguard Worker        bit             v10.16b, v12.16b, v22.16b
3399*c0909341SAndroid Build Coastguard Worker        bit             v11.16b, v13.16b, v23.16b
3400*c0909341SAndroid Build Coastguard Worker
3401*c0909341SAndroid Build Coastguard Worker        st1             {v10.8h}, [x0], x1
3402*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
3403*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  w6              // xpos -= dx
3404*c0909341SAndroid Build Coastguard Worker        st1             {v11.8h}, [x0], x1
3405*c0909341SAndroid Build Coastguard Worker        b.le            9f
3406*c0909341SAndroid Build Coastguard Worker
3407*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
3408*c0909341SAndroid Build Coastguard Worker        add             v30.16b, v30.16b, v24.16b
3409*c0909341SAndroid Build Coastguard Worker        b               8b
3410*c0909341SAndroid Build Coastguard Worker
3411*c0909341SAndroid Build Coastguard Worker89:
3412*c0909341SAndroid Build Coastguard Worker        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0]
3413*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
3414*c0909341SAndroid Build Coastguard Worker        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1]
3415*c0909341SAndroid Build Coastguard Worker        add             v30.16b, v30.16b, v24.16b
3416*c0909341SAndroid Build Coastguard Worker        tbl             v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2]
3417*c0909341SAndroid Build Coastguard Worker        tbl             v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3]
3418*c0909341SAndroid Build Coastguard Worker
3419*c0909341SAndroid Build Coastguard Worker        umull           v4.4s,   v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
3420*c0909341SAndroid Build Coastguard Worker        umlal           v4.4s,   v19.4h,  v27.4h  // + left[base_y+1]*frac_y
3421*c0909341SAndroid Build Coastguard Worker        umull2          v5.4s,   v18.8h,  v28.8h
3422*c0909341SAndroid Build Coastguard Worker        umlal2          v5.4s,   v19.8h,  v27.8h
3423*c0909341SAndroid Build Coastguard Worker        umull           v6.4s,   v20.4h,  v28.4h
3424*c0909341SAndroid Build Coastguard Worker        umlal           v6.4s,   v21.4h,  v27.4h
3425*c0909341SAndroid Build Coastguard Worker        umull2          v7.4s,   v20.8h,  v28.8h
3426*c0909341SAndroid Build Coastguard Worker        umlal2          v7.4s,   v21.8h,  v27.8h
3427*c0909341SAndroid Build Coastguard Worker
3428*c0909341SAndroid Build Coastguard Worker        rshrn           v4.4h,   v4.4s,   #6
3429*c0909341SAndroid Build Coastguard Worker        rshrn2          v4.8h,   v5.4s,   #6
3430*c0909341SAndroid Build Coastguard Worker        rshrn           v5.4h,   v6.4s,   #6
3431*c0909341SAndroid Build Coastguard Worker        rshrn2          v5.8h,   v7.4s,   #6
3432*c0909341SAndroid Build Coastguard Worker
3433*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h}, [x0], x1
3434*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
3435*c0909341SAndroid Build Coastguard Worker        st1             {v5.8h}, [x0], x1
3436*c0909341SAndroid Build Coastguard Worker        b.le            9f
3437*c0909341SAndroid Build Coastguard Worker
3438*c0909341SAndroid Build Coastguard Worker        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
3439*c0909341SAndroid Build Coastguard Worker        add             v30.16b, v30.16b, v24.16b
3440*c0909341SAndroid Build Coastguard Worker        b               89b
3441*c0909341SAndroid Build Coastguard Worker
3442*c0909341SAndroid Build Coastguard Worker9:
3443*c0909341SAndroid Build Coastguard Worker        ldp             d14, d15, [sp, #0x30]
3444*c0909341SAndroid Build Coastguard Worker        ldp             d12, d13, [sp, #0x20]
3445*c0909341SAndroid Build Coastguard Worker        ldp             d10, d11, [sp, #0x10]
3446*c0909341SAndroid Build Coastguard Worker        ldp             d8,  d9,  [sp], 0x40
3447*c0909341SAndroid Build Coastguard Worker        ret
3448*c0909341SAndroid Build Coastguard Workerendfunc
3449*c0909341SAndroid Build Coastguard Worker
3450*c0909341SAndroid Build Coastguard Worker// void ipred_z3_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
3451*c0909341SAndroid Build Coastguard Worker//                                const pixel *const left,
3452*c0909341SAndroid Build Coastguard Worker//                                const int width, const int height,
3453*c0909341SAndroid Build Coastguard Worker//                                const int dy, const int max_base_y);
3454*c0909341SAndroid Build Coastguard Workerfunction ipred_z3_fill1_16bpc_neon, export=1 // 2 output columns per pass, interpolated from left[]; pads via ipred_z3_fill_padding_neon
3455*c0909341SAndroid Build Coastguard Worker        clz             w9,  w4                   // w4 = height
3456*c0909341SAndroid Build Coastguard Worker        movrel          x8,  ipred_z3_fill1_tbl
3457*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #25             // table index: 0 for height 64 ... 4 for height 4
3458*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x8, w9, uxtw #2]    // signed 32-bit offset, relative to the table
3459*c0909341SAndroid Build Coastguard Worker        add             x10, x2,  w6,  uxtw #1    // left[max_base_y]
3460*c0909341SAndroid Build Coastguard Worker        add             x8,  x8,  x9              // address of the height-specific loop
3461*c0909341SAndroid Build Coastguard Worker        ld1r            {v31.8h}, [x10]           // padding
3462*c0909341SAndroid Build Coastguard Worker        mov             w7,  w5                   // ypos = dy for the first column
3463*c0909341SAndroid Build Coastguard Worker        mov             w15, #64                  // constant for 64 - frac
3464*c0909341SAndroid Build Coastguard Worker        add             x13, x0,  x1              // x13 = dst + stride: odd rows, x0: even rows
3465*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // x1 = 2*stride from here on
3466*c0909341SAndroid Build Coastguard Worker        br              x8
3467*c0909341SAndroid Build Coastguard Worker        // Height 4: each iteration produces two columns of 4 pixels.
3468*c0909341SAndroid Build Coastguard Worker40:
3469*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3470*c0909341SAndroid Build Coastguard Worker4:
3471*c0909341SAndroid Build Coastguard Worker        lsr             w8,  w7,  #6              // base
3472*c0909341SAndroid Build Coastguard Worker        and             w9,  w7,  #0x3e           // frac
3473*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // ypos += dy
3474*c0909341SAndroid Build Coastguard Worker        cmp             w8,  w6                   // base >= max_base_y
3475*c0909341SAndroid Build Coastguard Worker        lsr             w10, w7,  #6              // base
3476*c0909341SAndroid Build Coastguard Worker        and             w11, w7,  #0x3e           // frac
3477*c0909341SAndroid Build Coastguard Worker        b.ge            ipred_z3_fill_padding_neon // only the first column of the pair is tested; w3 columns remain
3478*c0909341SAndroid Build Coastguard Worker        lsl             w8,  w8,  #1              // pixel index -> byte offset
3479*c0909341SAndroid Build Coastguard Worker        lsl             w10, w10, #1
3480*c0909341SAndroid Build Coastguard Worker        ldr             q0,  [x2, w8, uxtw]       // left[base]
3481*c0909341SAndroid Build Coastguard Worker        ldr             q2,  [x2, w10, uxtw]
3482*c0909341SAndroid Build Coastguard Worker        dup             v4.8h,   w9               // frac
3483*c0909341SAndroid Build Coastguard Worker        dup             v5.8h,   w11
3484*c0909341SAndroid Build Coastguard Worker        ext             v1.16b,  v0.16b,  v0.16b,  #2 // left[base+1]
3485*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v2.16b,  v2.16b,  #2
3486*c0909341SAndroid Build Coastguard Worker        sub             v6.4h,   v1.4h,   v0.4h   // left[base+1]-left[base]
3487*c0909341SAndroid Build Coastguard Worker        sub             v7.4h,   v3.4h,   v2.4h
3488*c0909341SAndroid Build Coastguard Worker        ushll           v16.4s,  v0.4h,   #6      // left[base]*64
3489*c0909341SAndroid Build Coastguard Worker        ushll           v17.4s,  v2.4h,   #6
3490*c0909341SAndroid Build Coastguard Worker        smlal           v16.4s,  v6.4h,   v4.4h   // + (left[base+1]-left[base])*frac
3491*c0909341SAndroid Build Coastguard Worker        smlal           v17.4s,  v7.4h,   v5.4h
3492*c0909341SAndroid Build Coastguard Worker        rshrn           v16.4h,  v16.4s,  #6      // rounding shift back to pixel range
3493*c0909341SAndroid Build Coastguard Worker        rshrn           v17.4h,  v17.4s,  #6
3494*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #2              // width -= 2; flags are consumed by b.le below
3495*c0909341SAndroid Build Coastguard Worker        zip1            v18.8h,  v16.8h,  v17.8h  // interleave columns: each 32-bit lane = one row
3496*c0909341SAndroid Build Coastguard Worker        st1             {v18.s}[0], [x0],  x1
3497*c0909341SAndroid Build Coastguard Worker        st1             {v18.s}[1], [x13], x1
3498*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // ypos += dy
3499*c0909341SAndroid Build Coastguard Worker        st1             {v18.s}[2], [x0]
3500*c0909341SAndroid Build Coastguard Worker        st1             {v18.s}[3], [x13]
3501*c0909341SAndroid Build Coastguard Worker        b.le            9f
3502*c0909341SAndroid Build Coastguard Worker        sub             x0,  x0,  x1              // ptr -= 2*stride (undo the single post-increment)
3503*c0909341SAndroid Build Coastguard Worker        sub             x13, x13, x1
3504*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #4              // next column pair (2 pixels = 4 bytes)
3505*c0909341SAndroid Build Coastguard Worker        add             x13, x13, #4
3506*c0909341SAndroid Build Coastguard Worker        b               4b
3507*c0909341SAndroid Build Coastguard Worker9:
3508*c0909341SAndroid Build Coastguard Worker        ret
3509*c0909341SAndroid Build Coastguard Worker        // Height 8: each iteration produces two columns of 8 pixels.
3510*c0909341SAndroid Build Coastguard Worker80:
3511*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3512*c0909341SAndroid Build Coastguard Worker8:
3513*c0909341SAndroid Build Coastguard Worker        lsr             w8,  w7,  #6              // base
3514*c0909341SAndroid Build Coastguard Worker        and             w9,  w7,  #0x3e           // frac
3515*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // ypos += dy
3516*c0909341SAndroid Build Coastguard Worker        cmp             w8,  w6                   // base >= max_base_y
3517*c0909341SAndroid Build Coastguard Worker        lsr             w10, w7,  #6              // base
3518*c0909341SAndroid Build Coastguard Worker        and             w11, w7,  #0x3e           // frac
3519*c0909341SAndroid Build Coastguard Worker        b.ge            ipred_z3_fill_padding_neon // only the first column of the pair is tested; w3 columns remain
3520*c0909341SAndroid Build Coastguard Worker        add             x8,  x2,  w8,  uxtw #1    // &left[base] for the first column
3521*c0909341SAndroid Build Coastguard Worker        add             x10, x2,  w10, uxtw #1    // &left[base] for the second column
3522*c0909341SAndroid Build Coastguard Worker        dup             v4.8h,   w9               // frac
3523*c0909341SAndroid Build Coastguard Worker        dup             v5.8h,   w11
3524*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h},  [x8]            // left[base]
3525*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h},  [x10]
3526*c0909341SAndroid Build Coastguard Worker        sub             w9,  w15, w9              // 64 - frac
3527*c0909341SAndroid Build Coastguard Worker        sub             w11, w15, w11
3528*c0909341SAndroid Build Coastguard Worker        ldr             h1, [x8, #16]             // left[base+8], feeds the last lane of left[base+1]
3529*c0909341SAndroid Build Coastguard Worker        ldr             h3, [x10, #16]
3530*c0909341SAndroid Build Coastguard Worker        dup             v6.8h,   w9               // 64 - frac
3531*c0909341SAndroid Build Coastguard Worker        dup             v7.8h,   w11
3532*c0909341SAndroid Build Coastguard Worker        ext             v1.16b,  v0.16b,  v1.16b,  #2 // left[base+1]
3533*c0909341SAndroid Build Coastguard Worker        ext             v3.16b,  v2.16b,  v3.16b,  #2
3534*c0909341SAndroid Build Coastguard Worker        umull           v16.4s,  v0.4h,   v6.4h   // left[base]*(64-frac)
3535*c0909341SAndroid Build Coastguard Worker        umlal           v16.4s,  v1.4h,   v4.4h   // + left[base+1]*frac
3536*c0909341SAndroid Build Coastguard Worker        umull2          v17.4s,  v0.8h,   v6.8h
3537*c0909341SAndroid Build Coastguard Worker        umlal2          v17.4s,  v1.8h,   v4.8h
3538*c0909341SAndroid Build Coastguard Worker        umull           v18.4s,  v2.4h,   v7.4h
3539*c0909341SAndroid Build Coastguard Worker        umlal           v18.4s,  v3.4h,   v5.4h
3540*c0909341SAndroid Build Coastguard Worker        umull2          v19.4s,  v2.8h,   v7.8h
3541*c0909341SAndroid Build Coastguard Worker        umlal2          v19.4s,  v3.8h,   v5.8h
3542*c0909341SAndroid Build Coastguard Worker        rshrn           v16.4h,  v16.4s,  #6      // v16 = first column, rows 0-7
3543*c0909341SAndroid Build Coastguard Worker        rshrn2          v16.8h,  v17.4s,  #6
3544*c0909341SAndroid Build Coastguard Worker        rshrn           v17.4h,  v18.4s,  #6      // v17 = second column, rows 0-7
3545*c0909341SAndroid Build Coastguard Worker        rshrn2          v17.8h,  v19.4s,  #6
3546*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #2              // width -= 2; flags are consumed by b.le below
3547*c0909341SAndroid Build Coastguard Worker        zip1            v18.8h,  v16.8h,  v17.8h  // rows 0-3, one row per 32-bit lane
3548*c0909341SAndroid Build Coastguard Worker        zip2            v19.8h,  v16.8h,  v17.8h  // rows 4-7
3549*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // ypos += dy
3550*c0909341SAndroid Build Coastguard Worker        st1             {v18.s}[0], [x0],  x1
3551*c0909341SAndroid Build Coastguard Worker        st1             {v18.s}[1], [x13], x1
3552*c0909341SAndroid Build Coastguard Worker        st1             {v18.s}[2], [x0],  x1
3553*c0909341SAndroid Build Coastguard Worker        st1             {v18.s}[3], [x13], x1
3554*c0909341SAndroid Build Coastguard Worker        st1             {v19.s}[0], [x0],  x1
3555*c0909341SAndroid Build Coastguard Worker        st1             {v19.s}[1], [x13], x1
3556*c0909341SAndroid Build Coastguard Worker        st1             {v19.s}[2], [x0],  x1
3557*c0909341SAndroid Build Coastguard Worker        st1             {v19.s}[3], [x13], x1
3558*c0909341SAndroid Build Coastguard Worker        b.le            9f
3559*c0909341SAndroid Build Coastguard Worker        sub             x0,  x0,  x1, lsl #2      // ptr -= 4 * (2*stride)
3560*c0909341SAndroid Build Coastguard Worker        sub             x13, x13, x1, lsl #2
3561*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #4              // next column pair (2 pixels = 4 bytes)
3562*c0909341SAndroid Build Coastguard Worker        add             x13, x13, #4
3563*c0909341SAndroid Build Coastguard Worker        b               8b
3564*c0909341SAndroid Build Coastguard Worker9:
3565*c0909341SAndroid Build Coastguard Worker        ret
3566*c0909341SAndroid Build Coastguard Worker        // Heights 16/32/64: outer loop 1: per column pair, inner loop 2: per 16 rows.
3567*c0909341SAndroid Build Coastguard Worker160:
3568*c0909341SAndroid Build Coastguard Worker320:
3569*c0909341SAndroid Build Coastguard Worker640:
3570*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3571*c0909341SAndroid Build Coastguard Worker        mov             w12, w4                   // keep height; w4 is used as the row counter
3572*c0909341SAndroid Build Coastguard Worker1:
3573*c0909341SAndroid Build Coastguard Worker        lsr             w8,  w7,  #6              // base
3574*c0909341SAndroid Build Coastguard Worker        and             w9,  w7,  #0x3e           // frac
3575*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // ypos += dy
3576*c0909341SAndroid Build Coastguard Worker        cmp             w8,  w6                   // base >= max_base_y
3577*c0909341SAndroid Build Coastguard Worker        lsr             w10, w7,  #6              // base
3578*c0909341SAndroid Build Coastguard Worker        and             w11, w7,  #0x3e           // frac
3579*c0909341SAndroid Build Coastguard Worker        b.ge            ipred_z3_fill_padding_neon // only the first column of the pair is tested; w3 columns remain
3580*c0909341SAndroid Build Coastguard Worker        add             x8,  x2,  w8,  uxtw #1    // &left[base] for the first column
3581*c0909341SAndroid Build Coastguard Worker        add             x10, x2,  w10, uxtw #1    // &left[base] for the second column
3582*c0909341SAndroid Build Coastguard Worker        dup             v6.8h,   w9               // frac
3583*c0909341SAndroid Build Coastguard Worker        dup             v7.8h,   w11
3584*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h}, [x8],  #48 // left[base]
3585*c0909341SAndroid Build Coastguard Worker        ld1             {v3.8h, v4.8h, v5.8h}, [x10], #48
3586*c0909341SAndroid Build Coastguard Worker        sub             w9,  w15, w9              // 64 - frac
3587*c0909341SAndroid Build Coastguard Worker        sub             w11, w15, w11
3588*c0909341SAndroid Build Coastguard Worker        dup             v16.8h,  w9               // 64 - frac
3589*c0909341SAndroid Build Coastguard Worker        dup             v17.8h,  w11
3590*c0909341SAndroid Build Coastguard Worker        add             w7,  w7,  w5              // ypos += dy
3591*c0909341SAndroid Build Coastguard Worker2:
3592*c0909341SAndroid Build Coastguard Worker        ext             v18.16b, v0.16b,  v1.16b,  #2 // left[base+1]
3593*c0909341SAndroid Build Coastguard Worker        ext             v19.16b, v1.16b,  v2.16b,  #2
3594*c0909341SAndroid Build Coastguard Worker        ext             v20.16b, v3.16b,  v4.16b,  #2
3595*c0909341SAndroid Build Coastguard Worker        ext             v21.16b, v4.16b,  v5.16b,  #2
3596*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #16             // 16 rows per pass; flags are consumed by b.le 3f below
3597*c0909341SAndroid Build Coastguard Worker        umull           v22.4s,  v0.4h,   v16.4h  // left[base]*(64-frac)
3598*c0909341SAndroid Build Coastguard Worker        umlal           v22.4s,  v18.4h,  v6.4h   // + left[base+1]*frac
3599*c0909341SAndroid Build Coastguard Worker        umull2          v23.4s,  v0.8h,   v16.8h
3600*c0909341SAndroid Build Coastguard Worker        umlal2          v23.4s,  v18.8h,  v6.8h
3601*c0909341SAndroid Build Coastguard Worker        umull           v24.4s,  v1.4h,   v16.4h
3602*c0909341SAndroid Build Coastguard Worker        umlal           v24.4s,  v19.4h,  v6.4h
3603*c0909341SAndroid Build Coastguard Worker        umull2          v25.4s,  v1.8h,   v16.8h
3604*c0909341SAndroid Build Coastguard Worker        umlal2          v25.4s,  v19.8h,  v6.8h
3605*c0909341SAndroid Build Coastguard Worker        umull           v26.4s,  v3.4h,   v17.4h  // same for the second column
3606*c0909341SAndroid Build Coastguard Worker        umlal           v26.4s,  v20.4h,  v7.4h
3607*c0909341SAndroid Build Coastguard Worker        umull2          v27.4s,  v3.8h,   v17.8h
3608*c0909341SAndroid Build Coastguard Worker        umlal2          v27.4s,  v20.8h,  v7.8h
3609*c0909341SAndroid Build Coastguard Worker        umull           v28.4s,  v4.4h,   v17.4h
3610*c0909341SAndroid Build Coastguard Worker        umlal           v28.4s,  v21.4h,  v7.4h
3611*c0909341SAndroid Build Coastguard Worker        umull2          v29.4s,  v4.8h,   v17.8h
3612*c0909341SAndroid Build Coastguard Worker        umlal2          v29.4s,  v21.8h,  v7.8h
3613*c0909341SAndroid Build Coastguard Worker        rshrn           v22.4h,  v22.4s,  #6      // v22/v23 = first column, rows 0-7 / 8-15
3614*c0909341SAndroid Build Coastguard Worker        rshrn2          v22.8h,  v23.4s,  #6
3615*c0909341SAndroid Build Coastguard Worker        rshrn           v23.4h,  v24.4s,  #6
3616*c0909341SAndroid Build Coastguard Worker        rshrn2          v23.8h,  v25.4s,  #6
3617*c0909341SAndroid Build Coastguard Worker        rshrn           v24.4h,  v26.4s,  #6      // v24/v25 = second column, rows 0-7 / 8-15
3618*c0909341SAndroid Build Coastguard Worker        rshrn2          v24.8h,  v27.4s,  #6
3619*c0909341SAndroid Build Coastguard Worker        rshrn           v25.4h,  v28.4s,  #6
3620*c0909341SAndroid Build Coastguard Worker        rshrn2          v25.8h,  v29.4s,  #6
3621*c0909341SAndroid Build Coastguard Worker        zip1            v18.8h,  v22.8h,  v24.8h  // interleave columns: each 32-bit lane = one row
3622*c0909341SAndroid Build Coastguard Worker        zip2            v19.8h,  v22.8h,  v24.8h
3623*c0909341SAndroid Build Coastguard Worker        zip1            v20.8h,  v23.8h,  v25.8h
3624*c0909341SAndroid Build Coastguard Worker        zip2            v21.8h,  v23.8h,  v25.8h
3625*c0909341SAndroid Build Coastguard Worker        st1             {v18.s}[0], [x0],  x1
3626*c0909341SAndroid Build Coastguard Worker        st1             {v18.s}[1], [x13], x1
3627*c0909341SAndroid Build Coastguard Worker        st1             {v18.s}[2], [x0],  x1
3628*c0909341SAndroid Build Coastguard Worker        st1             {v18.s}[3], [x13], x1
3629*c0909341SAndroid Build Coastguard Worker        st1             {v19.s}[0], [x0],  x1
3630*c0909341SAndroid Build Coastguard Worker        st1             {v19.s}[1], [x13], x1
3631*c0909341SAndroid Build Coastguard Worker        st1             {v19.s}[2], [x0],  x1
3632*c0909341SAndroid Build Coastguard Worker        st1             {v19.s}[3], [x13], x1
3633*c0909341SAndroid Build Coastguard Worker        st1             {v20.s}[0], [x0],  x1
3634*c0909341SAndroid Build Coastguard Worker        st1             {v20.s}[1], [x13], x1
3635*c0909341SAndroid Build Coastguard Worker        st1             {v20.s}[2], [x0],  x1
3636*c0909341SAndroid Build Coastguard Worker        st1             {v20.s}[3], [x13], x1
3637*c0909341SAndroid Build Coastguard Worker        st1             {v21.s}[0], [x0],  x1
3638*c0909341SAndroid Build Coastguard Worker        st1             {v21.s}[1], [x13], x1
3639*c0909341SAndroid Build Coastguard Worker        st1             {v21.s}[2], [x0],  x1
3640*c0909341SAndroid Build Coastguard Worker        st1             {v21.s}[3], [x13], x1
3641*c0909341SAndroid Build Coastguard Worker        b.le            3f
3642*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v2.16b           // slide the window: last 8 loaded pixels become the first
3643*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h, v2.8h}, [x8],  #32      // left[base]
3644*c0909341SAndroid Build Coastguard Worker        mov             v3.16b,  v5.16b
3645*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h}, [x10], #32
3646*c0909341SAndroid Build Coastguard Worker        b               2b
3647*c0909341SAndroid Build Coastguard Worker        // Column pair finished: rewind to the top row and step 2 pixels right.
3648*c0909341SAndroid Build Coastguard Worker3:
3649*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #2              // width -= 2
3650*c0909341SAndroid Build Coastguard Worker        b.le            9f
3651*c0909341SAndroid Build Coastguard Worker        lsr             x1,  x1,  #1              // temporarily back to 1*stride
3652*c0909341SAndroid Build Coastguard Worker        msub            x0,  x1,  x12, x0         // ptr -= h * stride
3653*c0909341SAndroid Build Coastguard Worker        msub            x13, x1,  x12, x13
3654*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // restore 2*stride
3655*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #4              // next column pair (2 pixels = 4 bytes)
3656*c0909341SAndroid Build Coastguard Worker        add             x13, x13, #4
3657*c0909341SAndroid Build Coastguard Worker        mov             w4,  w12                  // reload the row counter
3658*c0909341SAndroid Build Coastguard Worker        b               1b
3659*c0909341SAndroid Build Coastguard Worker9:
3660*c0909341SAndroid Build Coastguard Worker        ret
3661*c0909341SAndroid Build Coastguard Workerendfunc
3662*c0909341SAndroid Build Coastguard Worker
3663*c0909341SAndroid Build Coastguard Workerjumptable ipred_z3_fill1_tbl
3664*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_z3_fill1_tbl           // index 0: clz(height) - 25 == 0, height 64
3665*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_z3_fill1_tbl           // index 1: height 32
3666*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_z3_fill1_tbl           // index 2: height 16
3667*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_z3_fill1_tbl           // index 3: height 8
3668*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_z3_fill1_tbl           // index 4: height 4
3669*c0909341SAndroid Build Coastguard Workerendjumptable
3670*c0909341SAndroid Build Coastguard Worker
3671*c0909341SAndroid Build Coastguard Workerfunction ipred_z3_fill_padding_neon, export=0 // branched to (not called) from the z3 fill loops; ret returns to their caller
3672*c0909341SAndroid Build Coastguard Worker        cmp             w3,  #8
3673*c0909341SAndroid Build Coastguard Worker        movrel          x8,  ipred_z3_fill_padding_tbl
3674*c0909341SAndroid Build Coastguard Worker        b.gt            ipred_z3_fill_padding_wide // more than 8 columns left: row-wise fill instead
3675*c0909341SAndroid Build Coastguard Worker        // w3 = remaining width, w4 = constant height; x1 = 2*stride, x13 = x0 + stride, v31 = padding pixel
3676*c0909341SAndroid Build Coastguard Worker        mov             w12, w4                   // keep height; w4 is used as the row counter
3677*c0909341SAndroid Build Coastguard Worker
3678*c0909341SAndroid Build Coastguard Worker1:
3679*c0909341SAndroid Build Coastguard Worker        // Fill a WxH rectangle with padding. W can be any number;
3680*c0909341SAndroid Build Coastguard Worker        // this fills the exact width by filling in the largest
3681*c0909341SAndroid Build Coastguard Worker        // power of two in the remaining width, and repeating.
3682*c0909341SAndroid Build Coastguard Worker        clz             w9,  w3
3683*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #25             // table index: 3 for width 8..15, 4 for 4..7, 5 for 2..3
3684*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x8, w9, uxtw #2]    // signed 32-bit offset, relative to the table
3685*c0909341SAndroid Build Coastguard Worker        add             x9,  x8,  x9
3686*c0909341SAndroid Build Coastguard Worker        br              x9
3687*c0909341SAndroid Build Coastguard Worker        // 2 pixels wide, 4 rows per iteration (x0: even rows, x13: odd rows).
3688*c0909341SAndroid Build Coastguard Worker20:
3689*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3690*c0909341SAndroid Build Coastguard Worker2:
3691*c0909341SAndroid Build Coastguard Worker        st1             {v31.s}[0], [x0],  x1
3692*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
3693*c0909341SAndroid Build Coastguard Worker        st1             {v31.s}[0], [x13], x1
3694*c0909341SAndroid Build Coastguard Worker        st1             {v31.s}[0], [x0],  x1
3695*c0909341SAndroid Build Coastguard Worker        st1             {v31.s}[0], [x13], x1
3696*c0909341SAndroid Build Coastguard Worker        b.gt            2b
3697*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #2              // width -= 2; flags are consumed by b.le below
3698*c0909341SAndroid Build Coastguard Worker        lsr             x1,  x1,  #1              // back to 1*stride for the multiply
3699*c0909341SAndroid Build Coastguard Worker        msub            x0,  x1,  x12, x0         // ptr -= h * stride
3700*c0909341SAndroid Build Coastguard Worker        msub            x13, x1,  x12, x13
3701*c0909341SAndroid Build Coastguard Worker        b.le            9f
3702*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // restore 2*stride
3703*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #4              // step right by 2 pixels (4 bytes)
3704*c0909341SAndroid Build Coastguard Worker        add             x13, x13, #4
3705*c0909341SAndroid Build Coastguard Worker        mov             w4,  w12                  // reload the row counter
3706*c0909341SAndroid Build Coastguard Worker        b               1b
3707*c0909341SAndroid Build Coastguard Worker        // 4 pixels wide, 4 rows per iteration.
3708*c0909341SAndroid Build Coastguard Worker40:
3709*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3710*c0909341SAndroid Build Coastguard Worker4:
3711*c0909341SAndroid Build Coastguard Worker        st1             {v31.4h}, [x0],  x1
3712*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
3713*c0909341SAndroid Build Coastguard Worker        st1             {v31.4h}, [x13], x1
3714*c0909341SAndroid Build Coastguard Worker        st1             {v31.4h}, [x0],  x1
3715*c0909341SAndroid Build Coastguard Worker        st1             {v31.4h}, [x13], x1
3716*c0909341SAndroid Build Coastguard Worker        b.gt            4b
3717*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #4              // width -= 4; flags are consumed by b.le below
3718*c0909341SAndroid Build Coastguard Worker        lsr             x1,  x1,  #1              // back to 1*stride for the multiply
3719*c0909341SAndroid Build Coastguard Worker        msub            x0,  x1,  x12, x0         // ptr -= h * stride
3720*c0909341SAndroid Build Coastguard Worker        msub            x13, x1,  x12, x13
3721*c0909341SAndroid Build Coastguard Worker        b.le            9f
3722*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // restore 2*stride
3723*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #8              // step right by 4 pixels (8 bytes)
3724*c0909341SAndroid Build Coastguard Worker        add             x13, x13, #8
3725*c0909341SAndroid Build Coastguard Worker        mov             w4,  w12                  // reload the row counter
3726*c0909341SAndroid Build Coastguard Worker        b               1b
3727*c0909341SAndroid Build Coastguard Worker        // 8 pixels wide, 4 rows per iteration; widths above 8 branch to the wide variant on entry.
3728*c0909341SAndroid Build Coastguard Worker80:
3729*c0909341SAndroid Build Coastguard Worker160:
3730*c0909341SAndroid Build Coastguard Worker320:
3731*c0909341SAndroid Build Coastguard Worker640:
3732*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3733*c0909341SAndroid Build Coastguard Worker8:
3734*c0909341SAndroid Build Coastguard Worker        st1             {v31.8h}, [x0],  x1
3735*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
3736*c0909341SAndroid Build Coastguard Worker        st1             {v31.8h}, [x13], x1
3737*c0909341SAndroid Build Coastguard Worker        st1             {v31.8h}, [x0],  x1
3738*c0909341SAndroid Build Coastguard Worker        st1             {v31.8h}, [x13], x1
3739*c0909341SAndroid Build Coastguard Worker        b.gt            8b
3740*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #8              // width -= 8; flags are consumed by b.le below
3741*c0909341SAndroid Build Coastguard Worker        lsr             x1,  x1,  #1              // back to 1*stride for the multiply
3742*c0909341SAndroid Build Coastguard Worker        msub            x0,  x1,  x12, x0         // ptr -= h * stride
3743*c0909341SAndroid Build Coastguard Worker        msub            x13, x1,  x12, x13
3744*c0909341SAndroid Build Coastguard Worker        b.le            9f
3745*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1              // restore 2*stride
3746*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  #16             // step right by 8 pixels (16 bytes)
3747*c0909341SAndroid Build Coastguard Worker        add             x13, x13, #16
3748*c0909341SAndroid Build Coastguard Worker        mov             w4,  w12                  // reload the row counter
3749*c0909341SAndroid Build Coastguard Worker        b               1b
3750*c0909341SAndroid Build Coastguard Worker
3751*c0909341SAndroid Build Coastguard Worker9:
3752*c0909341SAndroid Build Coastguard Worker        ret
3753*c0909341SAndroid Build Coastguard Workerendfunc
3754*c0909341SAndroid Build Coastguard Worker
3755*c0909341SAndroid Build Coastguard Workerjumptable ipred_z3_fill_padding_tbl
3756*c0909341SAndroid Build Coastguard Worker        .word 640b - ipred_z3_fill_padding_tbl    // index 0: clz(width) - 25 == 0, width 64..127
3757*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_z3_fill_padding_tbl    // index 1: width 32..63
3758*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_z3_fill_padding_tbl    // index 2: width 16..31
3759*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_z3_fill_padding_tbl    // index 3: width 8..15
3760*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_z3_fill_padding_tbl    // index 4: width 4..7
3761*c0909341SAndroid Build Coastguard Worker        .word 20b  - ipred_z3_fill_padding_tbl    // index 5: width 2..3
3762*c0909341SAndroid Build Coastguard Workerendjumptable
3763*c0909341SAndroid Build Coastguard Worker
3764*c0909341SAndroid Build Coastguard Workerfunction ipred_z3_fill_padding_wide
3765*c0909341SAndroid Build Coastguard Worker        // Fill a WxH rectangle with padding, with W > 8. Works row by row through x0 only; w3 = W, w4 = H, x1 = 2*stride on entry.
3766*c0909341SAndroid Build Coastguard Worker        lsr             x1,  x1,  #1              // back to 1*stride
3767*c0909341SAndroid Build Coastguard Worker        mov             w12, w3                   // keep the width to reload w3 for each row
3768*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  w3,  uxtw #1    // x1 = stride - W*2: end of one row -> start of the next
3769*c0909341SAndroid Build Coastguard Worker1:
3770*c0909341SAndroid Build Coastguard Worker        ands            w5,  w3,  #7              // w5 = W % 8 (w5 is reused as scratch here)
3771*c0909341SAndroid Build Coastguard Worker        b.eq            2f
3772*c0909341SAndroid Build Coastguard Worker        // If the width isn't a multiple of 8, first do one 8 pixel write
3773*c0909341SAndroid Build Coastguard Worker        // and advance by only W % 8 pixels, so the next write overlaps it.
3774*c0909341SAndroid Build Coastguard Worker        sub             w3,  w3,  w5              // remaining width is now a multiple of 8
3775*c0909341SAndroid Build Coastguard Worker        st1             {v31.8h}, [x0]
3776*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  w5,  uxtw #1
3777*c0909341SAndroid Build Coastguard Worker2:
3778*c0909341SAndroid Build Coastguard Worker        // Fill the rest of the line with full 8 pixel writes.
3779*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #8
3780*c0909341SAndroid Build Coastguard Worker        st1             {v31.8h}, [x0], #16
3781*c0909341SAndroid Build Coastguard Worker        b.gt            2b
3782*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #1              // one row done; flags are consumed by b.le below
3783*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1              // move to the start of the next row
3784*c0909341SAndroid Build Coastguard Worker        b.le            9f
3785*c0909341SAndroid Build Coastguard Worker        mov             w3,  w12                  // reload the width
3786*c0909341SAndroid Build Coastguard Worker        b               1b
3787*c0909341SAndroid Build Coastguard Worker9:
3788*c0909341SAndroid Build Coastguard Worker        ret
3789*c0909341SAndroid Build Coastguard Workerendfunc
3790*c0909341SAndroid Build Coastguard Worker
// ipred_z3_fill2_16bpc_neon
//
// Directional fill from a left edge that is read in steps of two pixels:
// output row i of a column uses the pixel pair left[base + 2*i] and
// left[base + 2*i + 1], blended with a 6 bit fraction:
//     (left[b]*(64 - frac) + left[b + 1]*frac + 32) >> 6
// Two adjacent columns are produced per iteration.
//
// Register usage, as read from the code below:
//   x0  = dst, top left pixel of the block
//   x1  = stride in bytes (doubled below; x0 walks even rows, x13 odd rows)
//   x2  = left edge pixels (16 bit each)
//   w3  = width; decremented by 2 per iteration
//   w4  = height; 8 selects the h == 8 loop, anything else the h == 4 loop
//   w5  = position step per column, in 1/64 pixel units
//   w6  = max_base_y, in pixels
// v31 holds the padding pixel for the tail call to
// ipred_z3_fill_padding_neon, taken once base >= max_base_y.
function ipred_z3_fill2_16bpc_neon, export=1
        cmp             w4,  #8
        // Pixels are 16 bit wide: scale the index by 2 and replicate the
        // whole halfword, so that v31 is a valid row of 8 padding pixels
        // for the st1 {v31.8h} stores in the padding routines.
        add             x10, x2,  w6,  uxtw #1    // left[max_base_y]
        ld1r            {v31.8h}, [x10]           // padding
        mov             w7,  w5                   // pos = step
        mov             w15, #64
        add             x13, x0,  x1              // pointer to the odd rows
        lsl             x1,  x1,  #1              // both pointers step 2 rows
        b.eq            8f                        // flags still from cmp w4, #8

4:      // h == 4
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // pos += step
        cmp             w8,  w6                   // base >= max_base_y
        lsr             w10, w7,  #6              // base of the second column
        and             w11, w7,  #0x3e           // frac of the second column
        // Only the first column's base is tested; lsr/and above leave the
        // flags from the cmp untouched.
        b.ge            ipred_z3_fill_padding_neon
        lsl             w8,  w8,  #1              // pixel index -> byte offset
        lsl             w10, w10, #1
        ldr             q0,  [x2, w8, uxtw]       // left[base], 8 pixels
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.4h,   w9               // frac
        dup             v5.4h,   w11
        // Deinterleave; uzp2 must come first as uzp1 overwrites its source.
        uzp2            v1.8h,   v0.8h,   v0.8h   // left[base+2*i+1]
        uzp1            v0.8h,   v0.8h,   v0.8h   // left[base+2*i]
        uzp2            v3.8h,   v2.8h,   v2.8h
        uzp1            v2.8h,   v2.8h,   v2.8h
        sub             v6.4h,   v1.4h,   v0.4h   // left[b+1]-left[b]
        sub             v7.4h,   v3.4h,   v2.4h
        ushll           v16.4s,  v0.4h,   #6      // left[b]*64
        ushll           v17.4s,  v2.4h,   #6
        smlal           v16.4s,  v6.4h,   v4.4h   // + (left[b+1]-left[b])*frac
        smlal           v17.4s,  v7.4h,   v5.4h
        rshrn           v16.4h,  v16.4s,  #6      // (x + 32) >> 6
        rshrn           v17.4h,  v17.4s,  #6
        subs            w3,  w3,  #2              // two columns done
        // Interleave the two columns: 32 bit lane i = the 2 pixels of row i.
        zip1            v18.8h,  v16.8h,  v17.8h
        st1             {v18.s}[0], [x0],  x1     // row 0
        st1             {v18.s}[1], [x13], x1     // row 1
        add             w7,  w7,  w5              // pos += step
        st1             {v18.s}[2], [x0]          // row 2
        st1             {v18.s}[3], [x13]         // row 3
        b.le            9f                        // flags from subs w3 above
        sub             x0,  x0,  x1              // ptr -= 1 * (2*stride), back to row 0
        sub             x13, x13, x1
        add             x0,  x0,  #4              // 2 pixels to the right
        add             x13, x13, #4
        b               4b
9:
        ret

8:      // h == 8
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // pos += step
        cmp             w8,  w6                   // base >= max_base_y
        lsr             w10, w7,  #6              // base of the second column
        and             w11, w7,  #0x3e           // frac of the second column
        b.ge            ipred_z3_fill_padding_neon
        add             x8,  x2,  w8,  uxtw #1    // &left[base]
        add             x10, x2,  w10, uxtw #1
        dup             v4.8h,   w9               // frac
        dup             v5.8h,   w11
        ld1             {v0.8h, v1.8h},  [x8]     // left[base], 16 pixels
        ld1             {v2.8h, v3.8h},  [x10]
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v6.8h,   w9               // 64 - frac
        dup             v7.8h,   w11
        uzp2            v20.8h,  v0.8h,   v1.8h   // left[base+2*i+1]
        uzp1            v0.8h,   v0.8h,   v1.8h   // left[base+2*i]
        uzp2            v21.8h,  v2.8h,   v3.8h
        uzp1            v2.8h,   v2.8h,   v3.8h
        umull           v16.4s,  v0.4h,   v6.4h   // left[b]*(64-frac)
        umlal           v16.4s,  v20.4h,  v4.4h   // + left[b+1]*frac
        umull2          v17.4s,  v0.8h,   v6.8h
        umlal2          v17.4s,  v20.8h,  v4.8h
        umull           v18.4s,  v2.4h,   v7.4h
        umlal           v18.4s,  v21.4h,  v5.4h
        umull2          v19.4s,  v2.8h,   v7.8h
        umlal2          v19.4s,  v21.8h,  v5.8h
        rshrn           v16.4h,  v16.4s,  #6      // (x + 32) >> 6
        rshrn2          v16.8h,  v17.4s,  #6
        rshrn           v17.4h,  v18.4s,  #6
        rshrn2          v17.8h,  v19.4s,  #6
        subs            w3,  w3,  #2              // two columns done
        // Interleave the two columns: 32 bit lane i = the 2 pixels of row i
        // (rows 0-3 in v18, rows 4-7 in v19).
        zip1            v18.8h,  v16.8h,  v17.8h
        zip2            v19.8h,  v16.8h,  v17.8h
        add             w7,  w7,  w5              // pos += step
        st1             {v18.s}[0], [x0],  x1     // row 0
        st1             {v18.s}[1], [x13], x1     // row 1
        st1             {v18.s}[2], [x0],  x1     // row 2
        st1             {v18.s}[3], [x13], x1     // row 3
        st1             {v19.s}[0], [x0],  x1     // row 4
        st1             {v19.s}[1], [x13], x1     // row 5
        st1             {v19.s}[2], [x0],  x1     // row 6
        st1             {v19.s}[3], [x13], x1     // row 7
        b.le            9f                        // flags from subs w3 above
        sub             x0,  x0,  x1, lsl #2      // ptr -= 4 * (2*stride)
        sub             x13, x13, x1, lsl #2
        add             x0,  x0,  #4              // 2 pixels to the right
        add             x13, x13, #4
        b               8b
9:
        ret
endfunc
3898*c0909341SAndroid Build Coastguard Worker
3899*c0909341SAndroid Build Coastguard Worker
3900*c0909341SAndroid Build Coastguard Worker// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride,
3901*c0909341SAndroid Build Coastguard Worker//                              const pixel *const topleft,
3902*c0909341SAndroid Build Coastguard Worker//                              const int width, const int height, const int filt_idx,
3903*c0909341SAndroid Build Coastguard Worker//                              const int max_width, const int max_height,
3904*c0909341SAndroid Build Coastguard Worker//                              const int bitdepth_max);
3905*c0909341SAndroid Build Coastguard Worker.macro filter_fn bpc
3906*c0909341SAndroid Build Coastguard Workerfunction ipred_filter_\bpc\()bpc_neon
3907*c0909341SAndroid Build Coastguard Worker        and             w5,  w5,  #511
3908*c0909341SAndroid Build Coastguard Worker        movrel          x6,  X(filter_intra_taps)
3909*c0909341SAndroid Build Coastguard Worker        lsl             w5,  w5,  #6
3910*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  w5, uxtw
3911*c0909341SAndroid Build Coastguard Worker        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
3912*c0909341SAndroid Build Coastguard Worker        clz             w9,  w3
3913*c0909341SAndroid Build Coastguard Worker        movrel          x5,  ipred_filter\bpc\()_tbl
3914*c0909341SAndroid Build Coastguard Worker        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
3915*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #26
3916*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x5, w9, uxtw #2]
3917*c0909341SAndroid Build Coastguard Worker        sxtl            v16.8h,  v16.8b
3918*c0909341SAndroid Build Coastguard Worker        sxtl            v17.8h,  v17.8b
3919*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  x9
3920*c0909341SAndroid Build Coastguard Worker        sxtl            v18.8h,  v18.8b
3921*c0909341SAndroid Build Coastguard Worker        sxtl            v19.8h,  v19.8b
3922*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1
3923*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
3924*c0909341SAndroid Build Coastguard Worker        sxtl            v20.8h,  v20.8b
3925*c0909341SAndroid Build Coastguard Worker        sxtl            v21.8h,  v21.8b
3926*c0909341SAndroid Build Coastguard Worker        sxtl            v22.8h,  v22.8b
3927*c0909341SAndroid Build Coastguard Worker        dup             v31.8h,  w8
3928*c0909341SAndroid Build Coastguard Worker.if \bpc == 10
3929*c0909341SAndroid Build Coastguard Worker        movi            v30.8h,  #0
3930*c0909341SAndroid Build Coastguard Worker.endif
3931*c0909341SAndroid Build Coastguard Worker        br              x5
3932*c0909341SAndroid Build Coastguard Worker40:
3933*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3934*c0909341SAndroid Build Coastguard Worker        ldur            d0,  [x2, #2]             // top (0-3)
3935*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #4
3936*c0909341SAndroid Build Coastguard Worker        mov             x7,  #-4
3937*c0909341SAndroid Build Coastguard Worker4:
3938*c0909341SAndroid Build Coastguard Worker        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
3939*c0909341SAndroid Build Coastguard Worker.if \bpc == 10
3940*c0909341SAndroid Build Coastguard Worker        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
3941*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
3942*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
3943*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
3944*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
3945*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
3946*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
3947*c0909341SAndroid Build Coastguard Worker        srshr           v2.8h,   v2.8h,   #4
3948*c0909341SAndroid Build Coastguard Worker        smax            v2.8h,   v2.8h,   v30.8h
3949*c0909341SAndroid Build Coastguard Worker.else
3950*c0909341SAndroid Build Coastguard Worker        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
3951*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
3952*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
3953*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
3954*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
3955*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
3956*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
3957*c0909341SAndroid Build Coastguard Worker        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
3958*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
3959*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
3960*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
3961*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
3962*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
3963*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
3964*c0909341SAndroid Build Coastguard Worker        sqrshrun        v2.4h,   v2.4s,   #4
3965*c0909341SAndroid Build Coastguard Worker        sqrshrun2       v2.8h,   v3.4s,   #4
3966*c0909341SAndroid Build Coastguard Worker.endif
3967*c0909341SAndroid Build Coastguard Worker        smin            v2.8h,   v2.8h,   v31.8h
3968*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
3969*c0909341SAndroid Build Coastguard Worker        st1             {v2.d}[0], [x0], x1
3970*c0909341SAndroid Build Coastguard Worker        ext             v0.16b,  v2.16b,  v2.16b, #8 // move top from [4-7] to [0-3]
3971*c0909341SAndroid Build Coastguard Worker        st1             {v2.d}[1], [x6], x1
3972*c0909341SAndroid Build Coastguard Worker        b.gt            4b
3973*c0909341SAndroid Build Coastguard Worker        ret
3974*c0909341SAndroid Build Coastguard Worker80:
3975*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
3976*c0909341SAndroid Build Coastguard Worker        ldur            q0,  [x2, #2]             // top (0-7)
3977*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #4
3978*c0909341SAndroid Build Coastguard Worker        mov             x7,  #-4
3979*c0909341SAndroid Build Coastguard Worker8:
3980*c0909341SAndroid Build Coastguard Worker        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
3981*c0909341SAndroid Build Coastguard Worker.if \bpc == 10
3982*c0909341SAndroid Build Coastguard Worker        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
3983*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
3984*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
3985*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
3986*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
3987*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
3988*c0909341SAndroid Build Coastguard Worker        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
3989*c0909341SAndroid Build Coastguard Worker        mul             v3.8h,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
3990*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
3991*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
3992*c0909341SAndroid Build Coastguard Worker        srshr           v2.8h,   v2.8h,   #4
3993*c0909341SAndroid Build Coastguard Worker        smax            v2.8h,   v2.8h,   v30.8h
3994*c0909341SAndroid Build Coastguard Worker        smin            v2.8h,   v2.8h,   v31.8h
3995*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
3996*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
3997*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
3998*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
3999*c0909341SAndroid Build Coastguard Worker        srshr           v3.8h,   v3.8h,   #4
4000*c0909341SAndroid Build Coastguard Worker        smax            v3.8h,   v3.8h,   v30.8h
4001*c0909341SAndroid Build Coastguard Worker.else
4002*c0909341SAndroid Build Coastguard Worker        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
4003*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
4004*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
4005*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
4006*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
4007*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
4008*c0909341SAndroid Build Coastguard Worker        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
4009*c0909341SAndroid Build Coastguard Worker        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
4010*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
4011*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
4012*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
4013*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
4014*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
4015*c0909341SAndroid Build Coastguard Worker        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
4016*c0909341SAndroid Build Coastguard Worker        smull           v4.4s,   v17.4h,  v0.h[4] // p1(top[0]) * filter(1)
4017*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v18.4h,  v0.h[5] // p2(top[1]) * filter(2)
4018*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v19.4h,  v0.h[6] // p3(top[2]) * filter(3)
4019*c0909341SAndroid Build Coastguard Worker        sqrshrun        v2.4h,   v2.4s,   #4
4020*c0909341SAndroid Build Coastguard Worker        sqrshrun2       v2.8h,   v3.4s,   #4
4021*c0909341SAndroid Build Coastguard Worker        smin            v2.8h,   v2.8h,   v31.8h
4022*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v20.4h,  v0.h[7] // p4(top[3]) * filter(4)
4023*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v16.4h,  v0.h[3] // p0(topleft) * filter(0)
4024*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v21.4h,  v2.h[3] // p5(left[0]) * filter(5)
4025*c0909341SAndroid Build Coastguard Worker        smlal           v4.4s,   v22.4h,  v2.h[7] // p6(left[1]) * filter(6)
4026*c0909341SAndroid Build Coastguard Worker        smull2          v5.4s,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
4027*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
4028*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
4029*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
4030*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
4031*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
4032*c0909341SAndroid Build Coastguard Worker        smlal2          v5.4s,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
4033*c0909341SAndroid Build Coastguard Worker        sqrshrun        v3.4h,   v4.4s,   #4
4034*c0909341SAndroid Build Coastguard Worker        sqrshrun2       v3.8h,   v5.4s,   #4
4035*c0909341SAndroid Build Coastguard Worker.endif
4036*c0909341SAndroid Build Coastguard Worker        smin            v3.8h,   v3.8h,   v31.8h
4037*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
4038*c0909341SAndroid Build Coastguard Worker        st2             {v2.d, v3.d}[0], [x0], x1
4039*c0909341SAndroid Build Coastguard Worker        zip2            v0.2d,   v2.2d,   v3.2d
4040*c0909341SAndroid Build Coastguard Worker        st2             {v2.d, v3.d}[1], [x6], x1
4041*c0909341SAndroid Build Coastguard Worker        b.gt            8b
4042*c0909341SAndroid Build Coastguard Worker        ret
4043*c0909341SAndroid Build Coastguard Worker160:
4044*c0909341SAndroid Build Coastguard Worker320:
4045*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4046*c0909341SAndroid Build Coastguard Worker        add             x8,  x2,  #2
4047*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  #4
4048*c0909341SAndroid Build Coastguard Worker        mov             x7,  #-4
4049*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  w3, uxtw #1
4050*c0909341SAndroid Build Coastguard Worker        mov             w9,  w3
4051*c0909341SAndroid Build Coastguard Worker
4052*c0909341SAndroid Build Coastguard Worker1:
4053*c0909341SAndroid Build Coastguard Worker        ld1             {v0.4h}, [x2], x7         // left (0-1) + topleft (2)
4054*c0909341SAndroid Build Coastguard Worker2:
4055*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h, v2.8h}, [x8], #32 // top(0-15)
4056*c0909341SAndroid Build Coastguard Worker.if \bpc == 10
4057*c0909341SAndroid Build Coastguard Worker        mul             v3.8h,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
4058*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
4059*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
4060*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
4061*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
4062*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
4063*c0909341SAndroid Build Coastguard Worker        mla             v3.8h,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)
4064*c0909341SAndroid Build Coastguard Worker
4065*c0909341SAndroid Build Coastguard Worker        mul             v4.8h,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
4066*c0909341SAndroid Build Coastguard Worker        mla             v4.8h,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
4067*c0909341SAndroid Build Coastguard Worker        mla             v4.8h,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
4068*c0909341SAndroid Build Coastguard Worker        srshr           v3.8h,   v3.8h,   #4
4069*c0909341SAndroid Build Coastguard Worker        smax            v3.8h,   v3.8h,   v30.8h
4070*c0909341SAndroid Build Coastguard Worker        smin            v3.8h,   v3.8h,   v31.8h
4071*c0909341SAndroid Build Coastguard Worker        mla             v4.8h,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
4072*c0909341SAndroid Build Coastguard Worker        mla             v4.8h,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
4073*c0909341SAndroid Build Coastguard Worker        mla             v4.8h,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
4074*c0909341SAndroid Build Coastguard Worker        mla             v4.8h,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)
4075*c0909341SAndroid Build Coastguard Worker
4076*c0909341SAndroid Build Coastguard Worker        mul             v5.8h,   v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
4077*c0909341SAndroid Build Coastguard Worker        mla             v5.8h,   v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
4078*c0909341SAndroid Build Coastguard Worker        mla             v5.8h,   v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
4079*c0909341SAndroid Build Coastguard Worker        srshr           v4.8h,   v4.8h,   #4
4080*c0909341SAndroid Build Coastguard Worker        smax            v4.8h,   v4.8h,   v30.8h
4081*c0909341SAndroid Build Coastguard Worker        smin            v4.8h,   v4.8h,   v31.8h
4082*c0909341SAndroid Build Coastguard Worker        mla             v5.8h,   v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
4083*c0909341SAndroid Build Coastguard Worker        mla             v5.8h,   v16.8h,  v1.h[7] // p0(topleft) * filter(0)
4084*c0909341SAndroid Build Coastguard Worker        mla             v5.8h,   v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
4085*c0909341SAndroid Build Coastguard Worker        mla             v5.8h,   v22.8h,  v4.h[7] // p6(left[1]) * filter(6)
4086*c0909341SAndroid Build Coastguard Worker
4087*c0909341SAndroid Build Coastguard Worker        mul             v6.8h,   v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
4088*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
4089*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
4090*c0909341SAndroid Build Coastguard Worker        srshr           v5.8h,   v5.8h,   #4
4091*c0909341SAndroid Build Coastguard Worker        smax            v5.8h,   v5.8h,   v30.8h
4092*c0909341SAndroid Build Coastguard Worker        smin            v5.8h,   v5.8h,   v31.8h
4093*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
4094*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v16.8h,  v2.h[3] // p0(topleft) * filter(0)
4095*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
4096*c0909341SAndroid Build Coastguard Worker        mla             v6.8h,   v22.8h,  v5.h[7] // p6(left[1]) * filter(6)
4097*c0909341SAndroid Build Coastguard Worker
4098*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #16
4099*c0909341SAndroid Build Coastguard Worker        srshr           v6.8h,   v6.8h,   #4
4100*c0909341SAndroid Build Coastguard Worker        smax            v6.8h,   v6.8h,   v30.8h
4101*c0909341SAndroid Build Coastguard Worker.else
4102*c0909341SAndroid Build Coastguard Worker        smull           v3.4s,   v16.4h,  v0.h[2] // p0(topleft) * filter(0)
4103*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v21.4h,  v0.h[1] // p5(left[0]) * filter(5)
4104*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v22.4h,  v0.h[0] // p6(left[1]) * filter(6)
4105*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v17.4h,  v1.h[0] // p1(top[0]) * filter(1)
4106*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v18.4h,  v1.h[1] // p2(top[1]) * filter(2)
4107*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v19.4h,  v1.h[2] // p3(top[2]) * filter(3)
4108*c0909341SAndroid Build Coastguard Worker        smlal           v3.4s,   v20.4h,  v1.h[3] // p4(top[3]) * filter(4)
4109*c0909341SAndroid Build Coastguard Worker        smull2          v4.4s,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
4110*c0909341SAndroid Build Coastguard Worker        smlal2          v4.4s,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
4111*c0909341SAndroid Build Coastguard Worker        smlal2          v4.4s,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
4112*c0909341SAndroid Build Coastguard Worker        smlal2          v4.4s,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
4113*c0909341SAndroid Build Coastguard Worker        smlal2          v4.4s,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
4114*c0909341SAndroid Build Coastguard Worker        smlal2          v4.4s,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
4115*c0909341SAndroid Build Coastguard Worker        smlal2          v4.4s,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)
4116*c0909341SAndroid Build Coastguard Worker
4117*c0909341SAndroid Build Coastguard Worker        smull           v5.4s,   v17.4h,  v1.h[4] // p1(top[0]) * filter(1)
4118*c0909341SAndroid Build Coastguard Worker        smlal           v5.4s,   v18.4h,  v1.h[5] // p2(top[1]) * filter(2)
4119*c0909341SAndroid Build Coastguard Worker        smlal           v5.4s,   v19.4h,  v1.h[6] // p3(top[2]) * filter(3)
4120*c0909341SAndroid Build Coastguard Worker        sqrshrun        v3.4h,   v3.4s,   #4
4121*c0909341SAndroid Build Coastguard Worker        sqrshrun2       v3.8h,   v4.4s,   #4
4122*c0909341SAndroid Build Coastguard Worker        smin            v3.8h,   v3.8h,   v31.8h
4123*c0909341SAndroid Build Coastguard Worker        smlal           v5.4s,   v20.4h,  v1.h[7] // p4(top[3]) * filter(4)
4124*c0909341SAndroid Build Coastguard Worker        smlal           v5.4s,   v16.4h,  v1.h[3] // p0(topleft) * filter(0)
4125*c0909341SAndroid Build Coastguard Worker        smlal           v5.4s,   v21.4h,  v3.h[3] // p5(left[0]) * filter(5)
4126*c0909341SAndroid Build Coastguard Worker        smlal           v5.4s,   v22.4h,  v3.h[7] // p6(left[1]) * filter(6)
4127*c0909341SAndroid Build Coastguard Worker        smull2          v6.4s,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
4128*c0909341SAndroid Build Coastguard Worker        smlal2          v6.4s,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
4129*c0909341SAndroid Build Coastguard Worker        smlal2          v6.4s,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
4130*c0909341SAndroid Build Coastguard Worker        smlal2          v6.4s,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
4131*c0909341SAndroid Build Coastguard Worker        smlal2          v6.4s,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
4132*c0909341SAndroid Build Coastguard Worker        smlal2          v6.4s,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
4133*c0909341SAndroid Build Coastguard Worker        smlal2          v6.4s,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)
4134*c0909341SAndroid Build Coastguard Worker
4135*c0909341SAndroid Build Coastguard Worker        smull           v24.4s,  v17.4h,  v2.h[0] // p1(top[0]) * filter(1)
4136*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v18.4h,  v2.h[1] // p2(top[1]) * filter(2)
4137*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v19.4h,  v2.h[2] // p3(top[2]) * filter(3)
4138*c0909341SAndroid Build Coastguard Worker        sqrshrun        v4.4h,   v5.4s,   #4
4139*c0909341SAndroid Build Coastguard Worker        sqrshrun2       v4.8h,   v6.4s,   #4
4140*c0909341SAndroid Build Coastguard Worker        smin            v4.8h,   v4.8h,   v31.8h
4141*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v20.4h,  v2.h[3] // p4(top[3]) * filter(4)
4142*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v16.4h,  v1.h[7] // p0(topleft) * filter(0)
4143*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v21.4h,  v4.h[3] // p5(left[0]) * filter(5)
4144*c0909341SAndroid Build Coastguard Worker        smlal           v24.4s,  v22.4h,  v4.h[7] // p6(left[1]) * filter(6)
4145*c0909341SAndroid Build Coastguard Worker        smull2          v25.4s,  v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
4146*c0909341SAndroid Build Coastguard Worker        smlal2          v25.4s,  v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
4147*c0909341SAndroid Build Coastguard Worker        smlal2          v25.4s,  v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
4148*c0909341SAndroid Build Coastguard Worker        smlal2          v25.4s,  v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
4149*c0909341SAndroid Build Coastguard Worker        smlal2          v25.4s,  v16.8h,  v1.h[7] // p0(topleft) * filter(0)
4150*c0909341SAndroid Build Coastguard Worker        smlal2          v25.4s,  v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
4151*c0909341SAndroid Build Coastguard Worker        smlal2          v25.4s,  v22.8h,  v4.h[7] // p6(left[1]) * filter(6)
4152*c0909341SAndroid Build Coastguard Worker
4153*c0909341SAndroid Build Coastguard Worker        smull           v26.4s,  v17.4h,  v2.h[4] // p1(top[0]) * filter(1)
4154*c0909341SAndroid Build Coastguard Worker        smlal           v26.4s,  v18.4h,  v2.h[5] // p2(top[1]) * filter(2)
4155*c0909341SAndroid Build Coastguard Worker        smlal           v26.4s,  v19.4h,  v2.h[6] // p3(top[2]) * filter(3)
4156*c0909341SAndroid Build Coastguard Worker        sqrshrun        v5.4h,   v24.4s,  #4
4157*c0909341SAndroid Build Coastguard Worker        sqrshrun2       v5.8h,   v25.4s,  #4
4158*c0909341SAndroid Build Coastguard Worker        smin            v5.8h,   v5.8h,   v31.8h
4159*c0909341SAndroid Build Coastguard Worker        smlal           v26.4s,  v20.4h,  v2.h[7] // p4(top[3]) * filter(4)
4160*c0909341SAndroid Build Coastguard Worker        smlal           v26.4s,  v16.4h,  v2.h[3] // p0(topleft) * filter(0)
4161*c0909341SAndroid Build Coastguard Worker        smlal           v26.4s,  v21.4h,  v5.h[3] // p5(left[0]) * filter(5)
4162*c0909341SAndroid Build Coastguard Worker        smlal           v26.4s,  v22.4h,  v5.h[7] // p6(left[1]) * filter(6)
4163*c0909341SAndroid Build Coastguard Worker        smull2          v27.4s,  v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
4164*c0909341SAndroid Build Coastguard Worker        smlal2          v27.4s,  v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
4165*c0909341SAndroid Build Coastguard Worker        smlal2          v27.4s,  v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
4166*c0909341SAndroid Build Coastguard Worker        smlal2          v27.4s,  v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
4167*c0909341SAndroid Build Coastguard Worker        smlal2          v27.4s,  v16.8h,  v2.h[3] // p0(topleft) * filter(0)
4168*c0909341SAndroid Build Coastguard Worker        smlal2          v27.4s,  v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
4169*c0909341SAndroid Build Coastguard Worker        smlal2          v27.4s,  v22.8h,  v5.h[7] // p6(left[1]) * filter(6)
4170*c0909341SAndroid Build Coastguard Worker
4171*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #16
4172*c0909341SAndroid Build Coastguard Worker        sqrshrun        v6.4h,   v26.4s,  #4
4173*c0909341SAndroid Build Coastguard Worker        sqrshrun2       v6.8h,   v27.4s,  #4
4174*c0909341SAndroid Build Coastguard Worker.endif
4175*c0909341SAndroid Build Coastguard Worker        smin            v6.8h,   v6.8h,   v31.8h
4176*c0909341SAndroid Build Coastguard Worker
4177*c0909341SAndroid Build Coastguard Worker        ins             v0.h[2], v2.h[7]
4178*c0909341SAndroid Build Coastguard Worker        st4             {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32
4179*c0909341SAndroid Build Coastguard Worker        ins             v0.h[0], v6.h[7]
4180*c0909341SAndroid Build Coastguard Worker        st4             {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32
4181*c0909341SAndroid Build Coastguard Worker        ins             v0.h[1], v6.h[3]
4182*c0909341SAndroid Build Coastguard Worker        b.gt            2b
4183*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
4184*c0909341SAndroid Build Coastguard Worker        b.le            9f
4185*c0909341SAndroid Build Coastguard Worker        sub             x8,  x6,  w9, uxtw #1
4186*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1
4187*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  x1
4188*c0909341SAndroid Build Coastguard Worker        mov             w3,  w9
4189*c0909341SAndroid Build Coastguard Worker        b               1b
4190*c0909341SAndroid Build Coastguard Worker9:
4191*c0909341SAndroid Build Coastguard Worker        ret
4192*c0909341SAndroid Build Coastguard Workerendfunc
4193*c0909341SAndroid Build Coastguard Worker
4194*c0909341SAndroid Build Coastguard Workerjumptable ipred_filter\bpc\()_tbl
4195*c0909341SAndroid Build Coastguard Worker        .word 320b - ipred_filter\bpc\()_tbl
4196*c0909341SAndroid Build Coastguard Worker        .word 160b - ipred_filter\bpc\()_tbl
4197*c0909341SAndroid Build Coastguard Worker        .word 80b  - ipred_filter\bpc\()_tbl
4198*c0909341SAndroid Build Coastguard Worker        .word 40b  - ipred_filter\bpc\()_tbl
4199*c0909341SAndroid Build Coastguard Workerendjumptable
4200*c0909341SAndroid Build Coastguard Worker.endm
4201*c0909341SAndroid Build Coastguard Worker
// Expand the filter_fn macro once with bpc = 10 and once with bpc = 12.
// Each expansion emits its own jump table, ipred_filter10_tbl and
// ipred_filter12_tbl respectively (from "ipred_filter\bpc\()_tbl" in the
// macro). Presumably these expansions also define the
// ipred_filter_10bpc_neon / ipred_filter_12bpc_neon entry points that
// ipred_filter_16bpc_neon branches to; the macro header is not shown here,
// so confirm against it.
4202*c0909341SAndroid Build Coastguard Workerfilter_fn 10
4203*c0909341SAndroid Build Coastguard Workerfilter_fn 12
4204*c0909341SAndroid Build Coastguard Worker
4205*c0909341SAndroid Build Coastguard Workerfunction ipred_filter_16bpc_neon, export=1
        // Bitdepth dispatcher. Picks the 10 bpc or the 12 bpc implementation
        // and tail-branches to it (plain "b", not "bl"), so x30 is untouched
        // and the selected routine returns straight to our caller.
        // Only w8 and the flags are written here; x0-x7 and sp reach the
        // target exactly as the caller set them up.
4206*c0909341SAndroid Build Coastguard Worker        ldr             w8,  [sp]               // first stack-passed argument; presumably bitdepth_max - confirm against the C prototype
4207*c0909341SAndroid Build Coastguard Worker        cmp             w8,  0x3ff              // 0x3ff = 1023, the largest 10-bit sample value
4208*c0909341SAndroid Build Coastguard Worker        b.le            ipred_filter_10bpc_neon // signed <= 0x3ff: take the 10 bpc variant
4209*c0909341SAndroid Build Coastguard Worker        b               ipred_filter_12bpc_neon // anything larger: take the 12 bpc variant
4210*c0909341SAndroid Build Coastguard Workerendfunc
4211*c0909341SAndroid Build Coastguard Worker
4212*c0909341SAndroid Build Coastguard Worker// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
4213*c0909341SAndroid Build Coastguard Worker//                          const pixel *const pal, const uint8_t *idx,
4214*c0909341SAndroid Build Coastguard Worker//                          const int w, const int h);
//
// Palette prediction: turns packed palette indices into 16-bit pixels.
//
// Arguments (AAPCS64): x0 = dst, x1 = stride in bytes, x2 = pal, x3 = idx,
// w4 = w, w5 = h.
//
//  - pal: 8 entries of 16 bits each are read (one 16-byte load into v30).
//  - idx: every byte carries two indices. The low 3 bits give the first
//    pixel and bits 7:4 give the following one. Rows are consumed back to
//    back; idx is only ever post-incremented, there is no idx stride.
//    The high nibble is not masked, and TBL returns 0 for byte indices
//    outside the 16-byte table, so presumably the caller guarantees every
//    index is < 8 - TODO confirm.
//  - w: selects the code path through pal_pred_tbl using clz(w) - 25, so it
//    is expected to be 4, 8, 16, 32 or 64.
//  - h: the loops handle 4 rows per iteration for w <= 16, 2 rows for
//    w == 32 and 1 row for w == 64, and stop once the remaining count is
//    <= 0. Presumably h is always a multiple of that step - TODO confirm.
//
// Lookup trick used by all paths: an index a is doubled and duplicated into
// the byte pair (2*a, 2*a), then 0x0100 is added to the halfword to get
// (2*a, 2*a+1). These are the byte offsets of palette entry a, so a byte-wise
// TBL on v30 yields the 16-bit pixel.
//
// Clobbers: x0-x3, x5, x6, x9, v0-v7, v29-v31 and the condition flags.
4215*c0909341SAndroid Build Coastguard Workerfunction pal_pred_16bpc_neon, export=1
4216*c0909341SAndroid Build Coastguard Worker        ld1             {v30.8h}, [x2]          // v30 = palette, used as the TBL table; x2 is free for reuse afterwards
4217*c0909341SAndroid Build Coastguard Worker        clz             w9,  w4
4218*c0909341SAndroid Build Coastguard Worker        movrel          x6,  pal_pred_tbl
4219*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #25           // table slot: 0 for w == 64 ... 4 for w == 4
4220*c0909341SAndroid Build Coastguard Worker        movi            v29.16b, #7             // per-byte mask keeping the low 3 bits of an idx byte
4221*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x6, w9, uxtw #2]  // signed 32-bit offset relative to the table base
4222*c0909341SAndroid Build Coastguard Worker        movi            v31.8h,  #1, lsl #8     // 0x0100 per halfword: adds 1 to the upper byte of each pair
4223*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  x9
4224*c0909341SAndroid Build Coastguard Worker        br              x6
4225*c0909341SAndroid Build Coastguard Worker40:
4226*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // w == 4. x0 walks rows 0, 2, ... and x2 walks rows 1, 3, ...;
        // both advance by two rows per store.
4227*c0909341SAndroid Build Coastguard Worker        add             x2,  x0,  x1
4228*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
4229*c0909341SAndroid Build Coastguard Worker4:
        // 8 idx bytes = 16 indices = 4 rows of 4 pixels.
4230*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8b}, [x3], #8
4231*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #4            // flags stay live until b.gt; nothing below writes them
4232*c0909341SAndroid Build Coastguard Worker        ushr            v3.8b,   v1.8b,   #4    // second index of each byte (bits 7:4)
4233*c0909341SAndroid Build Coastguard Worker        and             v2.8b,   v1.8b,   v29.8b // first index of each byte (bits 2:0)
4234*c0909341SAndroid Build Coastguard Worker        zip1            v1.16b,  v2.16b,  v3.16b // interleave: one index per byte, 16 in total
4235*c0909341SAndroid Build Coastguard Worker        // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
4236*c0909341SAndroid Build Coastguard Worker        add             v1.16b,  v1.16b,  v1.16b
4237*c0909341SAndroid Build Coastguard Worker        zip1            v0.16b,  v1.16b,  v1.16b
4238*c0909341SAndroid Build Coastguard Worker        zip2            v1.16b,  v1.16b,  v1.16b
4239*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v31.8h
4240*c0909341SAndroid Build Coastguard Worker        add             v1.8h,   v1.8h,   v31.8h
4241*c0909341SAndroid Build Coastguard Worker        tbl             v0.16b, {v30.16b}, v0.16b // v0 = rows 0 and 1 (4 pixels each)
4242*c0909341SAndroid Build Coastguard Worker        st1             {v0.d}[0], [x0], x1
4243*c0909341SAndroid Build Coastguard Worker        tbl             v1.16b, {v30.16b}, v1.16b // v1 = rows 2 and 3
4244*c0909341SAndroid Build Coastguard Worker        st1             {v0.d}[1], [x2], x1
4245*c0909341SAndroid Build Coastguard Worker        st1             {v1.d}[0], [x0], x1
4246*c0909341SAndroid Build Coastguard Worker        st1             {v1.d}[1], [x2], x1
4247*c0909341SAndroid Build Coastguard Worker        b.gt            4b
4248*c0909341SAndroid Build Coastguard Worker        ret
4249*c0909341SAndroid Build Coastguard Worker80:
4250*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // w == 8. Same even/odd row pointer setup as the w == 4 path.
4251*c0909341SAndroid Build Coastguard Worker        add             x2,  x0,  x1
4252*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
4253*c0909341SAndroid Build Coastguard Worker8:
        // 16 idx bytes = 32 indices = 4 rows of 8 pixels, one row per vector.
4254*c0909341SAndroid Build Coastguard Worker        ld1             {v2.16b}, [x3], #16
4255*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #4
4256*c0909341SAndroid Build Coastguard Worker        ushr            v4.16b,  v2.16b,  #4
4257*c0909341SAndroid Build Coastguard Worker        and             v3.16b,  v2.16b,  v29.16b
4258*c0909341SAndroid Build Coastguard Worker        zip1            v2.16b,  v3.16b,  v4.16b // indices 0..15
4259*c0909341SAndroid Build Coastguard Worker        zip2            v3.16b,  v3.16b,  v4.16b // indices 16..31
4260*c0909341SAndroid Build Coastguard Worker        add             v2.16b,  v2.16b,  v2.16b
4261*c0909341SAndroid Build Coastguard Worker        add             v3.16b,  v3.16b,  v3.16b
4262*c0909341SAndroid Build Coastguard Worker        zip1            v0.16b,  v2.16b,  v2.16b
4263*c0909341SAndroid Build Coastguard Worker        zip2            v1.16b,  v2.16b,  v2.16b
4264*c0909341SAndroid Build Coastguard Worker        zip1            v2.16b,  v3.16b,  v3.16b
4265*c0909341SAndroid Build Coastguard Worker        zip2            v3.16b,  v3.16b,  v3.16b
4266*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v31.8h
4267*c0909341SAndroid Build Coastguard Worker        add             v1.8h,   v1.8h,   v31.8h
4268*c0909341SAndroid Build Coastguard Worker        add             v2.8h,   v2.8h,   v31.8h
4269*c0909341SAndroid Build Coastguard Worker        add             v3.8h,   v3.8h,   v31.8h
4270*c0909341SAndroid Build Coastguard Worker        tbl             v0.16b, {v30.16b}, v0.16b
4271*c0909341SAndroid Build Coastguard Worker        tbl             v1.16b, {v30.16b}, v1.16b
4272*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h}, [x0], x1       // row 0
4273*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v30.16b}, v2.16b
4274*c0909341SAndroid Build Coastguard Worker        st1             {v1.8h}, [x2], x1       // row 1
4275*c0909341SAndroid Build Coastguard Worker        tbl             v3.16b, {v30.16b}, v3.16b
4276*c0909341SAndroid Build Coastguard Worker        st1             {v2.8h}, [x0], x1       // row 2
4277*c0909341SAndroid Build Coastguard Worker        st1             {v3.8h}, [x2], x1       // row 3
4278*c0909341SAndroid Build Coastguard Worker        b.gt            8b
4279*c0909341SAndroid Build Coastguard Worker        ret
4280*c0909341SAndroid Build Coastguard Worker160:
4281*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // w == 16. Same even/odd row pointer setup as above.
4282*c0909341SAndroid Build Coastguard Worker        add             x2,  x0,  x1
4283*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
4284*c0909341SAndroid Build Coastguard Worker16:
        // 32 idx bytes = 64 indices = 4 rows of 16 pixels, two vectors per row.
        // v4-v7 are reused as both source and destination below, so the
        // order of the zip instructions matters.
4285*c0909341SAndroid Build Coastguard Worker        ld1             {v4.16b, v5.16b}, [x3], #32
4286*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #4
4287*c0909341SAndroid Build Coastguard Worker        ushr            v7.16b,  v4.16b,  #4
4288*c0909341SAndroid Build Coastguard Worker        and             v6.16b,  v4.16b,  v29.16b
4289*c0909341SAndroid Build Coastguard Worker        ushr            v3.16b,  v5.16b,  #4
4290*c0909341SAndroid Build Coastguard Worker        and             v2.16b,  v5.16b,  v29.16b
4291*c0909341SAndroid Build Coastguard Worker        zip1            v4.16b,  v6.16b,  v7.16b // indices 0..15
4292*c0909341SAndroid Build Coastguard Worker        zip2            v5.16b,  v6.16b,  v7.16b // indices 16..31
4293*c0909341SAndroid Build Coastguard Worker        zip1            v6.16b,  v2.16b,  v3.16b // indices 32..47
4294*c0909341SAndroid Build Coastguard Worker        zip2            v7.16b,  v2.16b,  v3.16b // indices 48..63
4295*c0909341SAndroid Build Coastguard Worker        add             v4.16b,  v4.16b,  v4.16b
4296*c0909341SAndroid Build Coastguard Worker        add             v5.16b,  v5.16b,  v5.16b
4297*c0909341SAndroid Build Coastguard Worker        add             v6.16b,  v6.16b,  v6.16b
4298*c0909341SAndroid Build Coastguard Worker        add             v7.16b,  v7.16b,  v7.16b
4299*c0909341SAndroid Build Coastguard Worker        zip1            v0.16b,  v4.16b,  v4.16b
4300*c0909341SAndroid Build Coastguard Worker        zip2            v1.16b,  v4.16b,  v4.16b
4301*c0909341SAndroid Build Coastguard Worker        zip1            v2.16b,  v5.16b,  v5.16b
4302*c0909341SAndroid Build Coastguard Worker        zip2            v3.16b,  v5.16b,  v5.16b
4303*c0909341SAndroid Build Coastguard Worker        zip1            v4.16b,  v6.16b,  v6.16b
4304*c0909341SAndroid Build Coastguard Worker        zip2            v5.16b,  v6.16b,  v6.16b
4305*c0909341SAndroid Build Coastguard Worker        zip1            v6.16b,  v7.16b,  v7.16b
4306*c0909341SAndroid Build Coastguard Worker        zip2            v7.16b,  v7.16b,  v7.16b
4307*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v31.8h
4308*c0909341SAndroid Build Coastguard Worker        add             v1.8h,   v1.8h,   v31.8h
4309*c0909341SAndroid Build Coastguard Worker        add             v2.8h,   v2.8h,   v31.8h
4310*c0909341SAndroid Build Coastguard Worker        add             v3.8h,   v3.8h,   v31.8h
4311*c0909341SAndroid Build Coastguard Worker        add             v4.8h,   v4.8h,   v31.8h
4312*c0909341SAndroid Build Coastguard Worker        tbl             v0.16b, {v30.16b}, v0.16b
4313*c0909341SAndroid Build Coastguard Worker        add             v5.8h,   v5.8h,   v31.8h
4314*c0909341SAndroid Build Coastguard Worker        tbl             v1.16b, {v30.16b}, v1.16b
4315*c0909341SAndroid Build Coastguard Worker        add             v6.8h,   v6.8h,   v31.8h
4316*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v30.16b}, v2.16b
4317*c0909341SAndroid Build Coastguard Worker        add             v7.8h,   v7.8h,   v31.8h
4318*c0909341SAndroid Build Coastguard Worker        tbl             v3.16b, {v30.16b}, v3.16b
4319*c0909341SAndroid Build Coastguard Worker        tbl             v4.16b, {v30.16b}, v4.16b
4320*c0909341SAndroid Build Coastguard Worker        tbl             v5.16b, {v30.16b}, v5.16b
4321*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], x1 // row 0
4322*c0909341SAndroid Build Coastguard Worker        tbl             v6.16b, {v30.16b}, v6.16b
4323*c0909341SAndroid Build Coastguard Worker        st1             {v2.8h, v3.8h}, [x2], x1 // row 1
4324*c0909341SAndroid Build Coastguard Worker        tbl             v7.16b, {v30.16b}, v7.16b
4325*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h}, [x0], x1 // row 2
4326*c0909341SAndroid Build Coastguard Worker        st1             {v6.8h, v7.8h}, [x2], x1 // row 3
4327*c0909341SAndroid Build Coastguard Worker        b.gt            16b
4328*c0909341SAndroid Build Coastguard Worker        ret
4329*c0909341SAndroid Build Coastguard Worker320:
4330*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // w == 32. Same even/odd row pointer setup as above.
4331*c0909341SAndroid Build Coastguard Worker        add             x2,  x0,  x1
4332*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
4333*c0909341SAndroid Build Coastguard Worker32:
        // 32 idx bytes = 64 indices = 2 rows of 32 pixels, four vectors per
        // row. The index expansion is identical to the w == 16 loop.
4334*c0909341SAndroid Build Coastguard Worker        ld1             {v4.16b, v5.16b}, [x3], #32
4335*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #2
4336*c0909341SAndroid Build Coastguard Worker        ushr            v7.16b,  v4.16b,  #4
4337*c0909341SAndroid Build Coastguard Worker        and             v6.16b,  v4.16b,  v29.16b
4338*c0909341SAndroid Build Coastguard Worker        ushr            v3.16b,  v5.16b,  #4
4339*c0909341SAndroid Build Coastguard Worker        and             v2.16b,  v5.16b,  v29.16b
4340*c0909341SAndroid Build Coastguard Worker        zip1            v4.16b,  v6.16b,  v7.16b
4341*c0909341SAndroid Build Coastguard Worker        zip2            v5.16b,  v6.16b,  v7.16b
4342*c0909341SAndroid Build Coastguard Worker        zip1            v6.16b,  v2.16b,  v3.16b
4343*c0909341SAndroid Build Coastguard Worker        zip2            v7.16b,  v2.16b,  v3.16b
4344*c0909341SAndroid Build Coastguard Worker        add             v4.16b,  v4.16b,  v4.16b
4345*c0909341SAndroid Build Coastguard Worker        add             v5.16b,  v5.16b,  v5.16b
4346*c0909341SAndroid Build Coastguard Worker        add             v6.16b,  v6.16b,  v6.16b
4347*c0909341SAndroid Build Coastguard Worker        add             v7.16b,  v7.16b,  v7.16b
4348*c0909341SAndroid Build Coastguard Worker        zip1            v0.16b,  v4.16b,  v4.16b
4349*c0909341SAndroid Build Coastguard Worker        zip2            v1.16b,  v4.16b,  v4.16b
4350*c0909341SAndroid Build Coastguard Worker        zip1            v2.16b,  v5.16b,  v5.16b
4351*c0909341SAndroid Build Coastguard Worker        zip2            v3.16b,  v5.16b,  v5.16b
4352*c0909341SAndroid Build Coastguard Worker        zip1            v4.16b,  v6.16b,  v6.16b
4353*c0909341SAndroid Build Coastguard Worker        zip2            v5.16b,  v6.16b,  v6.16b
4354*c0909341SAndroid Build Coastguard Worker        zip1            v6.16b,  v7.16b,  v7.16b
4355*c0909341SAndroid Build Coastguard Worker        zip2            v7.16b,  v7.16b,  v7.16b
4356*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v31.8h
4357*c0909341SAndroid Build Coastguard Worker        add             v1.8h,   v1.8h,   v31.8h
4358*c0909341SAndroid Build Coastguard Worker        add             v2.8h,   v2.8h,   v31.8h
4359*c0909341SAndroid Build Coastguard Worker        add             v3.8h,   v3.8h,   v31.8h
4360*c0909341SAndroid Build Coastguard Worker        add             v4.8h,   v4.8h,   v31.8h
4361*c0909341SAndroid Build Coastguard Worker        tbl             v0.16b, {v30.16b}, v0.16b
4362*c0909341SAndroid Build Coastguard Worker        add             v5.8h,   v5.8h,   v31.8h
4363*c0909341SAndroid Build Coastguard Worker        tbl             v1.16b, {v30.16b}, v1.16b
4364*c0909341SAndroid Build Coastguard Worker        add             v6.8h,   v6.8h,   v31.8h
4365*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v30.16b}, v2.16b
4366*c0909341SAndroid Build Coastguard Worker        add             v7.8h,   v7.8h,   v31.8h
4367*c0909341SAndroid Build Coastguard Worker        tbl             v3.16b, {v30.16b}, v3.16b
4368*c0909341SAndroid Build Coastguard Worker        tbl             v4.16b, {v30.16b}, v4.16b
4369*c0909341SAndroid Build Coastguard Worker        tbl             v5.16b, {v30.16b}, v5.16b
4370*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 // row 0
4371*c0909341SAndroid Build Coastguard Worker        tbl             v6.16b, {v30.16b}, v6.16b
4372*c0909341SAndroid Build Coastguard Worker        tbl             v7.16b, {v30.16b}, v7.16b
4373*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 // row 1
4374*c0909341SAndroid Build Coastguard Worker        b.gt            32b
4375*c0909341SAndroid Build Coastguard Worker        ret
4376*c0909341SAndroid Build Coastguard Worker640:
4377*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // w == 64. Unlike the narrower paths, x2 points 64 bytes (32 pixels)
        // into the SAME row and x1 is left undoubled: each iteration writes
        // one full row as two 32-pixel halves.
4378*c0909341SAndroid Build Coastguard Worker        add             x2,  x0,  #64
4379*c0909341SAndroid Build Coastguard Worker64:
        // 32 idx bytes = 64 indices = 1 row of 64 pixels. The index expansion
        // is identical to the w == 16 loop.
4380*c0909341SAndroid Build Coastguard Worker        ld1             {v4.16b, v5.16b}, [x3], #32
4381*c0909341SAndroid Build Coastguard Worker        subs            w5,  w5,  #1
4382*c0909341SAndroid Build Coastguard Worker        ushr            v7.16b,  v4.16b,  #4
4383*c0909341SAndroid Build Coastguard Worker        and             v6.16b,  v4.16b,  v29.16b
4384*c0909341SAndroid Build Coastguard Worker        ushr            v3.16b,  v5.16b,  #4
4385*c0909341SAndroid Build Coastguard Worker        and             v2.16b,  v5.16b,  v29.16b
4386*c0909341SAndroid Build Coastguard Worker        zip1            v4.16b,  v6.16b,  v7.16b
4387*c0909341SAndroid Build Coastguard Worker        zip2            v5.16b,  v6.16b,  v7.16b
4388*c0909341SAndroid Build Coastguard Worker        zip1            v6.16b,  v2.16b,  v3.16b
4389*c0909341SAndroid Build Coastguard Worker        zip2            v7.16b,  v2.16b,  v3.16b
4390*c0909341SAndroid Build Coastguard Worker        add             v4.16b,  v4.16b,  v4.16b
4391*c0909341SAndroid Build Coastguard Worker        add             v5.16b,  v5.16b,  v5.16b
4392*c0909341SAndroid Build Coastguard Worker        add             v6.16b,  v6.16b,  v6.16b
4393*c0909341SAndroid Build Coastguard Worker        add             v7.16b,  v7.16b,  v7.16b
4394*c0909341SAndroid Build Coastguard Worker        zip1            v0.16b,  v4.16b,  v4.16b
4395*c0909341SAndroid Build Coastguard Worker        zip2            v1.16b,  v4.16b,  v4.16b
4396*c0909341SAndroid Build Coastguard Worker        zip1            v2.16b,  v5.16b,  v5.16b
4397*c0909341SAndroid Build Coastguard Worker        zip2            v3.16b,  v5.16b,  v5.16b
4398*c0909341SAndroid Build Coastguard Worker        zip1            v4.16b,  v6.16b,  v6.16b
4399*c0909341SAndroid Build Coastguard Worker        zip2            v5.16b,  v6.16b,  v6.16b
4400*c0909341SAndroid Build Coastguard Worker        zip1            v6.16b,  v7.16b,  v7.16b
4401*c0909341SAndroid Build Coastguard Worker        zip2            v7.16b,  v7.16b,  v7.16b
4402*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v31.8h
4403*c0909341SAndroid Build Coastguard Worker        add             v1.8h,   v1.8h,   v31.8h
4404*c0909341SAndroid Build Coastguard Worker        add             v2.8h,   v2.8h,   v31.8h
4405*c0909341SAndroid Build Coastguard Worker        add             v3.8h,   v3.8h,   v31.8h
4406*c0909341SAndroid Build Coastguard Worker        add             v4.8h,   v4.8h,   v31.8h
4407*c0909341SAndroid Build Coastguard Worker        tbl             v0.16b, {v30.16b}, v0.16b
4408*c0909341SAndroid Build Coastguard Worker        add             v5.8h,   v5.8h,   v31.8h
4409*c0909341SAndroid Build Coastguard Worker        tbl             v1.16b, {v30.16b}, v1.16b
4410*c0909341SAndroid Build Coastguard Worker        add             v6.8h,   v6.8h,   v31.8h
4411*c0909341SAndroid Build Coastguard Worker        tbl             v2.16b, {v30.16b}, v2.16b
4412*c0909341SAndroid Build Coastguard Worker        add             v7.8h,   v7.8h,   v31.8h
4413*c0909341SAndroid Build Coastguard Worker        tbl             v3.16b, {v30.16b}, v3.16b
4414*c0909341SAndroid Build Coastguard Worker        tbl             v4.16b, {v30.16b}, v4.16b
4415*c0909341SAndroid Build Coastguard Worker        tbl             v5.16b, {v30.16b}, v5.16b
4416*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 // pixels 0..31 of the row
4417*c0909341SAndroid Build Coastguard Worker        tbl             v6.16b, {v30.16b}, v6.16b
4418*c0909341SAndroid Build Coastguard Worker        tbl             v7.16b, {v30.16b}, v7.16b
4419*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1 // pixels 32..63 of the row
4420*c0909341SAndroid Build Coastguard Worker        b.gt            64b
4421*c0909341SAndroid Build Coastguard Worker        ret
4422*c0909341SAndroid Build Coastguard Workerendfunc
4423*c0909341SAndroid Build Coastguard Worker
4424*c0909341SAndroid Build Coastguard Workerjumptable pal_pred_tbl
        // Width dispatch table for pal_pred_16bpc_neon. Each entry is a
        // 32-bit offset from the start of this table to a width-specific
        // entry label. The function indexes it with clz(w) - 25, loads the
        // entry with ldrsw and adds it to the table address, so the entries
        // must stay ordered from the widest block to the narrowest.
4425*c0909341SAndroid Build Coastguard Worker        .word 640b - pal_pred_tbl               // index 0: w == 64
4426*c0909341SAndroid Build Coastguard Worker        .word 320b - pal_pred_tbl               // index 1: w == 32
4427*c0909341SAndroid Build Coastguard Worker        .word 160b - pal_pred_tbl               // index 2: w == 16
4428*c0909341SAndroid Build Coastguard Worker        .word 80b  - pal_pred_tbl               // index 3: w == 8
4429*c0909341SAndroid Build Coastguard Worker        .word 40b  - pal_pred_tbl               // index 4: w == 4
4430*c0909341SAndroid Build Coastguard Workerendjumptable
4431*c0909341SAndroid Build Coastguard Worker
4432*c0909341SAndroid Build Coastguard Worker// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
4433*c0909341SAndroid Build Coastguard Worker//                               const pixel *const topleft,
4434*c0909341SAndroid Build Coastguard Worker//                               const int width, const int height,
4435*c0909341SAndroid Build Coastguard Worker//                               const int16_t *ac, const int alpha,
4436*c0909341SAndroid Build Coastguard Worker//                               const int bitdepth_max);
// CfL prediction with a fixed DC of (bitdepth_max + 1) >> 1.
//
// Registers as used below (these match the C prototype under AAPCS64):
//   x0 = dst, x1 = stride in bytes, w3 = width, w4 = height,
//   x5 = ac, w6 = alpha, w7 = bitdepth_max. x2 (topleft) is not read.
//
// The prologue selects one of the L(ipred_cfl_splat_w*) loops through
// ipred_cfl_128_tbl, indexed by clz(width) - 26. Other code in this file also
// branches to those loops, so they expect this state on entry:
//   v0.8h = dc, v1.8h = alpha, v30.8h = 0, v31.8h = bitdepth_max,
//   x0 = dst row 0, x6 = dst row 1, x1 = 2 * stride,
//   w3 = width (only read by the w16 loop), w4 = height, x5 = ac.
//
// Every output pixel is clamp(dc + apply_sign(), 0, bitdepth_max), with
// apply_sign() = (ac * alpha + (ac * alpha < 0 ? -1 : 0) + 32) >> 6.
4437*c0909341SAndroid Build Coastguard Workerfunction ipred_cfl_128_16bpc_neon, export=1
4438*c0909341SAndroid Build Coastguard Worker        dup             v31.8h,  w7   // bitdepth_max
        // Table index = clz(width) - 26; for width 32/16/8/4 this is 0/1/2/3.
4439*c0909341SAndroid Build Coastguard Worker        clz             w9,  w3
4440*c0909341SAndroid Build Coastguard Worker        movrel          x7,  ipred_cfl_128_tbl
4441*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #26
        // Load the signed 32-bit table entry (offset of the target loop
        // relative to the table base).
4442*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x7, w9, uxtw #2]
        // dc = (bitdepth_max + 1) >> 1 (rounding shift), in every lane of v0.
4443*c0909341SAndroid Build Coastguard Worker        urshr           v0.8h,   v31.8h,  #1
4444*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   w6   // alpha
        // x7 = table base + offset = address of the selected loop.
4445*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  x9
        // x6 = dst + stride (second row). The stride is then doubled so that
        // x0 and x6 each step over two rows at a time.
4446*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1
4447*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
        // v30 = 0, the lower clamp bound.
4448*c0909341SAndroid Build Coastguard Worker        movi            v30.8h,  #0
4449*c0909341SAndroid Build Coastguard Worker        br              x7
// Width 4: each iteration consumes 16 ac coefficients (four rows of four
// pixels) and writes four rows.
4450*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_splat_w4):
4451*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4452*c0909341SAndroid Build Coastguard Worker1:
4453*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h}, [x5], #32
        // Four rows are handled per iteration; the flags set here are
        // consumed by the b.gt at the bottom of the loop.
4454*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
        // Widening signed multiplies: 16-bit ac * 16-bit alpha -> 32-bit.
4455*c0909341SAndroid Build Coastguard Worker        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
4456*c0909341SAndroid Build Coastguard Worker        smull2          v3.4s,   v4.8h,   v1.8h
4457*c0909341SAndroid Build Coastguard Worker        smull           v4.4s,   v5.4h,   v1.4h
4458*c0909341SAndroid Build Coastguard Worker        smull2          v5.4s,   v5.8h,   v1.8h
        // cmlt yields all-ones (-1) in lanes where diff < 0, else 0.
4459*c0909341SAndroid Build Coastguard Worker        cmlt            v16.4s,  v2.4s,   #0     // sign
4460*c0909341SAndroid Build Coastguard Worker        cmlt            v17.4s,  v3.4s,   #0
4461*c0909341SAndroid Build Coastguard Worker        cmlt            v18.4s,  v4.4s,   #0
4462*c0909341SAndroid Build Coastguard Worker        cmlt            v19.4s,  v5.4s,   #0
        // Subtracting 1 from negative products before the rounding shift
        // makes the result equal to -((-diff + 32) >> 6) for diff < 0.
4463*c0909341SAndroid Build Coastguard Worker        add             v2.4s,   v2.4s,   v16.4s // diff + sign
4464*c0909341SAndroid Build Coastguard Worker        add             v3.4s,   v3.4s,   v17.4s
4465*c0909341SAndroid Build Coastguard Worker        add             v4.4s,   v4.4s,   v18.4s
4466*c0909341SAndroid Build Coastguard Worker        add             v5.4s,   v5.4s,   v19.4s
        // Rounding shift right by 6 and narrow back to 16 bits per lane.
4467*c0909341SAndroid Build Coastguard Worker        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
4468*c0909341SAndroid Build Coastguard Worker        rshrn2          v2.8h,   v3.4s,   #6
4469*c0909341SAndroid Build Coastguard Worker        rshrn           v3.4h,   v4.4s,   #6
4470*c0909341SAndroid Build Coastguard Worker        rshrn2          v3.8h,   v5.4s,   #6
4471*c0909341SAndroid Build Coastguard Worker        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
4472*c0909341SAndroid Build Coastguard Worker        add             v3.8h,   v3.8h,   v0.8h
        // Clamp to [0, bitdepth_max] using signed max/min.
4473*c0909341SAndroid Build Coastguard Worker        smax            v2.8h,   v2.8h,   v30.8h
4474*c0909341SAndroid Build Coastguard Worker        smax            v3.8h,   v3.8h,   v30.8h
4475*c0909341SAndroid Build Coastguard Worker        smin            v2.8h,   v2.8h,   v31.8h
4476*c0909341SAndroid Build Coastguard Worker        smin            v3.8h,   v3.8h,   v31.8h
        // Each 64-bit half is one row of 4 pixels. Rows 0 and 2 go through
        // x0, rows 1 and 3 through x6; both pointers advance by 2 * stride.
4477*c0909341SAndroid Build Coastguard Worker        st1             {v2.d}[0],  [x0], x1
4478*c0909341SAndroid Build Coastguard Worker        st1             {v2.d}[1],  [x6], x1
4479*c0909341SAndroid Build Coastguard Worker        st1             {v3.d}[0],  [x0], x1
4480*c0909341SAndroid Build Coastguard Worker        st1             {v3.d}[1],  [x6], x1
        // Loop while rows remain (flags from the subs above; none of the NEON
        // instructions in between write the condition flags).
4481*c0909341SAndroid Build Coastguard Worker        b.gt            1b
4482*c0909341SAndroid Build Coastguard Worker        ret
// Width 8: each iteration consumes 16 ac coefficients (two rows of eight
// pixels) and writes two rows. The arithmetic is the same as in the w4 loop.
4483*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_splat_w8):
4484*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4485*c0909341SAndroid Build Coastguard Worker1:
4486*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h}, [x5], #32
        // Two rows are handled per iteration.
4487*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
4488*c0909341SAndroid Build Coastguard Worker        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
4489*c0909341SAndroid Build Coastguard Worker        smull2          v3.4s,   v4.8h,   v1.8h
4490*c0909341SAndroid Build Coastguard Worker        smull           v4.4s,   v5.4h,   v1.4h
4491*c0909341SAndroid Build Coastguard Worker        smull2          v5.4s,   v5.8h,   v1.8h
4492*c0909341SAndroid Build Coastguard Worker        cmlt            v16.4s,  v2.4s,   #0     // sign
4493*c0909341SAndroid Build Coastguard Worker        cmlt            v17.4s,  v3.4s,   #0
4494*c0909341SAndroid Build Coastguard Worker        cmlt            v18.4s,  v4.4s,   #0
4495*c0909341SAndroid Build Coastguard Worker        cmlt            v19.4s,  v5.4s,   #0
4496*c0909341SAndroid Build Coastguard Worker        add             v2.4s,   v2.4s,   v16.4s // diff + sign
4497*c0909341SAndroid Build Coastguard Worker        add             v3.4s,   v3.4s,   v17.4s
4498*c0909341SAndroid Build Coastguard Worker        add             v4.4s,   v4.4s,   v18.4s
4499*c0909341SAndroid Build Coastguard Worker        add             v5.4s,   v5.4s,   v19.4s
4500*c0909341SAndroid Build Coastguard Worker        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
4501*c0909341SAndroid Build Coastguard Worker        rshrn2          v2.8h,   v3.4s,   #6
4502*c0909341SAndroid Build Coastguard Worker        rshrn           v3.4h,   v4.4s,   #6
4503*c0909341SAndroid Build Coastguard Worker        rshrn2          v3.8h,   v5.4s,   #6
4504*c0909341SAndroid Build Coastguard Worker        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
4505*c0909341SAndroid Build Coastguard Worker        add             v3.8h,   v3.8h,   v0.8h
        // Clamp to [0, bitdepth_max].
4506*c0909341SAndroid Build Coastguard Worker        smax            v2.8h,   v2.8h,   v30.8h
4507*c0909341SAndroid Build Coastguard Worker        smax            v3.8h,   v3.8h,   v30.8h
4508*c0909341SAndroid Build Coastguard Worker        smin            v2.8h,   v2.8h,   v31.8h
4509*c0909341SAndroid Build Coastguard Worker        smin            v3.8h,   v3.8h,   v31.8h
        // One full vector per row: first row via x0, second row via x6.
4510*c0909341SAndroid Build Coastguard Worker        st1             {v2.8h},  [x0], x1
4511*c0909341SAndroid Build Coastguard Worker        st1             {v3.8h},  [x6], x1
4512*c0909341SAndroid Build Coastguard Worker        b.gt            1b
4513*c0909341SAndroid Build Coastguard Worker        ret
// Width 16 and 32 (both table slots point here): processes two rows at a
// time, 16 pixels of each row per inner-loop iteration.
4514*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_splat_w16):
4515*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
        // x7 = ac + width entries (ac is int16_t, hence width << 1 bytes):
        // the coefficient row paired with the one x5 points at.
4516*c0909341SAndroid Build Coastguard Worker        add             x7,  x5,  w3, uxtw #1
        // x1 = 2 * stride - 2 * width: the amount still to add to a row
        // pointer after a whole row was written with post-increment.
4517*c0909341SAndroid Build Coastguard Worker        sub             x1,  x1,  w3, uxtw #1
        // Keep width in w9; w3 is used as the per-row column counter.
4518*c0909341SAndroid Build Coastguard Worker        mov             w9,  w3
4519*c0909341SAndroid Build Coastguard Worker1:
        // v2/v3 = 16 coefficients of the first row, v4/v5 = 16 of the second.
4520*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x5], #32
4521*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h}, [x7], #32
        // 16 columns done per iteration; flags are consumed by the first
        // b.gt below.
4522*c0909341SAndroid Build Coastguard Worker        subs            w3,  w3,  #16
4523*c0909341SAndroid Build Coastguard Worker        smull           v16.4s,  v2.4h,   v1.4h  // diff = ac * alpha
4524*c0909341SAndroid Build Coastguard Worker        smull2          v17.4s,  v2.8h,   v1.8h
4525*c0909341SAndroid Build Coastguard Worker        smull           v18.4s,  v3.4h,   v1.4h
4526*c0909341SAndroid Build Coastguard Worker        smull2          v19.4s,  v3.8h,   v1.8h
4527*c0909341SAndroid Build Coastguard Worker        smull           v2.4s,   v4.4h,   v1.4h
4528*c0909341SAndroid Build Coastguard Worker        smull2          v3.4s,   v4.8h,   v1.8h
4529*c0909341SAndroid Build Coastguard Worker        smull           v4.4s,   v5.4h,   v1.4h
4530*c0909341SAndroid Build Coastguard Worker        smull2          v5.4s,   v5.8h,   v1.8h
4531*c0909341SAndroid Build Coastguard Worker        cmlt            v20.4s,  v16.4s,  #0     // sign
4532*c0909341SAndroid Build Coastguard Worker        cmlt            v21.4s,  v17.4s,  #0
4533*c0909341SAndroid Build Coastguard Worker        cmlt            v22.4s,  v18.4s,  #0
4534*c0909341SAndroid Build Coastguard Worker        cmlt            v23.4s,  v19.4s,  #0
4535*c0909341SAndroid Build Coastguard Worker        cmlt            v24.4s,  v2.4s,   #0
4536*c0909341SAndroid Build Coastguard Worker        cmlt            v25.4s,  v3.4s,   #0
4537*c0909341SAndroid Build Coastguard Worker        cmlt            v26.4s,  v4.4s,   #0
4538*c0909341SAndroid Build Coastguard Worker        cmlt            v27.4s,  v5.4s,   #0
4539*c0909341SAndroid Build Coastguard Worker        add             v16.4s,  v16.4s,  v20.4s // diff + sign
4540*c0909341SAndroid Build Coastguard Worker        add             v17.4s,  v17.4s,  v21.4s
4541*c0909341SAndroid Build Coastguard Worker        add             v18.4s,  v18.4s,  v22.4s
4542*c0909341SAndroid Build Coastguard Worker        add             v19.4s,  v19.4s,  v23.4s
4543*c0909341SAndroid Build Coastguard Worker        add             v2.4s,   v2.4s,   v24.4s
4544*c0909341SAndroid Build Coastguard Worker        add             v3.4s,   v3.4s,   v25.4s
4545*c0909341SAndroid Build Coastguard Worker        add             v4.4s,   v4.4s,   v26.4s
4546*c0909341SAndroid Build Coastguard Worker        add             v5.4s,   v5.4s,   v27.4s
4547*c0909341SAndroid Build Coastguard Worker        rshrn           v16.4h,  v16.4s,  #6     // (diff + sign + 32) >> 6 = apply_sign()
4548*c0909341SAndroid Build Coastguard Worker        rshrn2          v16.8h,  v17.4s,  #6
4549*c0909341SAndroid Build Coastguard Worker        rshrn           v17.4h,  v18.4s,  #6
4550*c0909341SAndroid Build Coastguard Worker        rshrn2          v17.8h,  v19.4s,  #6
        // The second row narrows into v6/v7 rather than in place, because
        // v2..v5 still hold the 32-bit products being read here.
4551*c0909341SAndroid Build Coastguard Worker        rshrn           v6.4h,   v2.4s,   #6
4552*c0909341SAndroid Build Coastguard Worker        rshrn2          v6.8h,   v3.4s,   #6
4553*c0909341SAndroid Build Coastguard Worker        rshrn           v7.4h,   v4.4s,   #6
4554*c0909341SAndroid Build Coastguard Worker        rshrn2          v7.8h,   v5.4s,   #6
4555*c0909341SAndroid Build Coastguard Worker        add             v2.8h,   v16.8h,  v0.8h  // dc + apply_sign()
4556*c0909341SAndroid Build Coastguard Worker        add             v3.8h,   v17.8h,  v0.8h
4557*c0909341SAndroid Build Coastguard Worker        add             v4.8h,   v6.8h,   v0.8h
4558*c0909341SAndroid Build Coastguard Worker        add             v5.8h,   v7.8h,   v0.8h
        // Clamp to [0, bitdepth_max].
4559*c0909341SAndroid Build Coastguard Worker        smax            v2.8h,   v2.8h,   v30.8h
4560*c0909341SAndroid Build Coastguard Worker        smax            v3.8h,   v3.8h,   v30.8h
4561*c0909341SAndroid Build Coastguard Worker        smax            v4.8h,   v4.8h,   v30.8h
4562*c0909341SAndroid Build Coastguard Worker        smax            v5.8h,   v5.8h,   v30.8h
4563*c0909341SAndroid Build Coastguard Worker        smin            v2.8h,   v2.8h,   v31.8h
4564*c0909341SAndroid Build Coastguard Worker        smin            v3.8h,   v3.8h,   v31.8h
4565*c0909341SAndroid Build Coastguard Worker        smin            v4.8h,   v4.8h,   v31.8h
4566*c0909341SAndroid Build Coastguard Worker        smin            v5.8h,   v5.8h,   v31.8h
        // 16 pixels (32 bytes) to each of the two rows, moving right.
4567*c0909341SAndroid Build Coastguard Worker        st1             {v2.8h, v3.8h},  [x0], #32
4568*c0909341SAndroid Build Coastguard Worker        st1             {v4.8h, v5.8h},  [x6], #32
        // More columns left in this row pair (flags from subs w3 above).
4569*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // Row pair finished. The flags set by this subs are consumed by the
        // final b.gt; the add/mov instructions in between do not change them.
4570*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2
        // Each ac pointer skips the row that the other pointer just consumed.
4571*c0909341SAndroid Build Coastguard Worker        add             x5,  x5,  w9, uxtw #1
4572*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  w9, uxtw #1
        // Move both dst pointers from the end of their row to the start of
        // the row two lines further down.
4573*c0909341SAndroid Build Coastguard Worker        add             x0,  x0,  x1
4574*c0909341SAndroid Build Coastguard Worker        add             x6,  x6,  x1
        // Reload the column counter with the saved width.
4575*c0909341SAndroid Build Coastguard Worker        mov             w3,  w9
4576*c0909341SAndroid Build Coastguard Worker        b.gt            1b
4577*c0909341SAndroid Build Coastguard Worker        ret
4578*c0909341SAndroid Build Coastguard Workerendfunc
4579*c0909341SAndroid Build Coastguard Worker
// Jump table for the shared CfL splat loops. Each entry is the offset of a
// loop label relative to ipred_cfl_128_tbl; the entry order matches an index
// of clz(width) - 26, i.e. width 32, 16, 8, 4. Widths 32 and 16 both use the
// w16 loop.
// ipred_cfl_splat_tbl is a second label on the first entry.
// ipred_cfl_left_16bpc_neon loads entries through it and adds them to its
// address, which relies on it being the same address as ipred_cfl_128_tbl.
4580*c0909341SAndroid Build Coastguard Workerjumptable ipred_cfl_128_tbl
4581*c0909341SAndroid Build Coastguard Workeripred_cfl_splat_tbl:
4582*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl
4583*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl
4584*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_splat_w8) - ipred_cfl_128_tbl
4585*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_splat_w4) - ipred_cfl_128_tbl
4586*c0909341SAndroid Build Coastguard Workerendjumptable
4587*c0909341SAndroid Build Coastguard Worker
4588*c0909341SAndroid Build Coastguard Worker// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
4589*c0909341SAndroid Build Coastguard Worker//                               const pixel *const topleft,
4590*c0909341SAndroid Build Coastguard Worker//                               const int width, const int height,
4591*c0909341SAndroid Build Coastguard Worker//                               const int16_t *ac, const int alpha,
4592*c0909341SAndroid Build Coastguard Worker//                               const int bitdepth_max);
// CfL prediction whose DC is the rounded mean of `width` pixels read from
// topleft + 1 pixel onwards (presumably the row above the block).
//
// Arguments arrive as in the prototype comment above this function. Each
// width case below computes the mean into every lane of v0.8h and then
// branches into the L(ipred_cfl_splat_w*) loops inside
// ipred_cfl_128_16bpc_neon, which produce the output and return. The
// prologue sets up v1, v30, v31, x1 and x6 with the same instructions that
// function's prologue uses.
4593*c0909341SAndroid Build Coastguard Workerfunction ipred_cfl_top_16bpc_neon, export=1
4594*c0909341SAndroid Build Coastguard Worker        dup             v31.8h,  w7   // bitdepth_max
        // Table index = clz(width) - 26; for width 32/16/8/4 this is 0/1/2/3.
4595*c0909341SAndroid Build Coastguard Worker        clz             w9,  w3
4596*c0909341SAndroid Build Coastguard Worker        movrel          x7,  ipred_cfl_top_tbl
4597*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #26
4598*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x7, w9, uxtw #2]
4599*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   w6   // alpha
        // Skip one 16-bit pixel: x2 = topleft + 1.
4600*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #2
        // x7 = table base + offset = address of the width case below.
4601*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  x9
        // x6 = dst + stride (second row), then x1 = 2 * stride.
4602*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1
4603*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
        // v30 = 0, the lower clamp bound used by the splat loops.
4604*c0909341SAndroid Build Coastguard Worker        movi            v30.8h,  #0
4605*c0909341SAndroid Build Coastguard Worker        br              x7
// width 4: dc = (sum of 4 pixels + 2) >> 2
4606*c0909341SAndroid Build Coastguard Worker4:
4607*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4608*c0909341SAndroid Build Coastguard Worker        ld1             {v0.4h},  [x2]
        // Sum across the four lanes into h0, rounding-shift by log2(4), then
        // broadcast lane 0 to all eight lanes.
4609*c0909341SAndroid Build Coastguard Worker        addv            h0,      v0.4h
4610*c0909341SAndroid Build Coastguard Worker        urshr           v0.4h,   v0.4h,   #2
4611*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
4612*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_splat_w4)
// width 8: dc = (sum of 8 pixels + 4) >> 3
4613*c0909341SAndroid Build Coastguard Worker8:
4614*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4615*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h},  [x2]
4616*c0909341SAndroid Build Coastguard Worker        addv            h0,      v0.8h
4617*c0909341SAndroid Build Coastguard Worker        urshr           v0.4h,   v0.4h,   #3
4618*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
4619*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_splat_w8)
// width 16: dc = (sum of 16 pixels + 8) >> 4
4620*c0909341SAndroid Build Coastguard Worker16:
4621*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4622*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x2]
        // Pairwise add reduces 16 lanes to 8; addv then sums those in 16 bits.
        // NOTE(review): the whole sum stays in an unsigned 16-bit lane, which
        // holds for samples of up to 12 bits (16 * 4095 = 65520) - confirm
        // the supported bitdepth range.
4623*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v2.8h,   v3.8h
4624*c0909341SAndroid Build Coastguard Worker        addv            h0,      v0.8h
4625*c0909341SAndroid Build Coastguard Worker        urshr           v0.4h,   v0.4h,   #4
4626*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
4627*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_splat_w16)
// width 32: dc = (sum of 32 pixels + 16) >> 5
4628*c0909341SAndroid Build Coastguard Worker32:
4629*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4630*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
        // Two rounds of pairwise adds leave eight 16-bit lanes, each holding
        // the sum of four pixels.
4631*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v3.8h
4632*c0909341SAndroid Build Coastguard Worker        addp            v4.8h,   v4.8h,   v5.8h
4633*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v2.8h,   v4.8h
        // The final reduction widens to 32 bits (s0); rshrn then applies the
        // rounding shift and narrows the result back to 16 bits.
4634*c0909341SAndroid Build Coastguard Worker        uaddlv          s0,      v0.8h
4635*c0909341SAndroid Build Coastguard Worker        rshrn           v0.4h,   v0.4s,   #5
4636*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
        // Width 32 shares the w16 splat loop, which iterates over the row in
        // 16-pixel steps using the width in w3.
4637*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_splat_w16)
4638*c0909341SAndroid Build Coastguard Workerendfunc
4639*c0909341SAndroid Build Coastguard Worker
// Jump table for ipred_cfl_top_16bpc_neon. Each entry is the offset of one of
// that function's numeric labels (32:, 16:, 8:, 4:) relative to the table
// base; the entry order matches an index of clz(width) - 26.
4640*c0909341SAndroid Build Coastguard Workerjumptable ipred_cfl_top_tbl
4641*c0909341SAndroid Build Coastguard Worker        .word 32b - ipred_cfl_top_tbl
4642*c0909341SAndroid Build Coastguard Worker        .word 16b - ipred_cfl_top_tbl
4643*c0909341SAndroid Build Coastguard Worker        .word 8b  - ipred_cfl_top_tbl
4644*c0909341SAndroid Build Coastguard Worker        .word 4b  - ipred_cfl_top_tbl
4645*c0909341SAndroid Build Coastguard Workerendjumptable
4646*c0909341SAndroid Build Coastguard Worker
4647*c0909341SAndroid Build Coastguard Worker// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
4648*c0909341SAndroid Build Coastguard Worker//                                const pixel *const topleft,
4649*c0909341SAndroid Build Coastguard Worker//                                const int width, const int height,
4650*c0909341SAndroid Build Coastguard Worker//                                const int16_t *ac, const int alpha,
4651*c0909341SAndroid Build Coastguard Worker//                                const int bitdepth_max);
// CfL prediction whose DC is the rounded mean of `height` pixels read from
// topleft - height pixels onwards (presumably the column left of the block).
//
// Arguments arrive as in the prototype comment above this function. Two
// table lookups are done up front:
//   x7 = height case below (via ipred_cfl_left_tbl, index clz(height) - 26),
//        which computes the mean into every lane of v0.8h;
//   x9 = splat loop for the block width (via ipred_cfl_splat_tbl, index
//        clz(width) - 26), which each height case jumps to with "br x9".
// The splat loops are inside ipred_cfl_128_16bpc_neon; they produce the
// output and return. The prologue sets up v1, v30, v31, x1 and x6 with the
// same instructions that function's prologue uses.
4652*c0909341SAndroid Build Coastguard Workerfunction ipred_cfl_left_16bpc_neon, export=1
4653*c0909341SAndroid Build Coastguard Worker        dup             v31.8h,  w7   // bitdepth_max
        // x2 = topleft - height pixels (height << 1 bytes for 16-bit pixels).
4654*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  w4, uxtw #1
        // w9 indexes the splat table by width, w8 the left table by height.
4655*c0909341SAndroid Build Coastguard Worker        clz             w9,  w3
4656*c0909341SAndroid Build Coastguard Worker        clz             w8,  w4
4657*c0909341SAndroid Build Coastguard Worker        movrel          x10, ipred_cfl_splat_tbl
4658*c0909341SAndroid Build Coastguard Worker        movrel          x7,  ipred_cfl_left_tbl
4659*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #26
4660*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  #26
4661*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x10, w9, uxtw #2]
4662*c0909341SAndroid Build Coastguard Worker        ldrsw           x8,  [x7,  w8, uxtw #2]
4663*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   w6   // alpha
        // x9 = address of the splat loop, x7 = address of the height case.
4664*c0909341SAndroid Build Coastguard Worker        add             x9,  x10, x9
4665*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  x8
        // x6 = dst + stride (second row), then x1 = 2 * stride.
4666*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1
4667*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
        // v30 = 0, the lower clamp bound used by the splat loops.
4668*c0909341SAndroid Build Coastguard Worker        movi            v30.8h,  #0
4669*c0909341SAndroid Build Coastguard Worker        br              x7
4670*c0909341SAndroid Build Coastguard Worker
// height 4: dc = (sum of 4 pixels + 2) >> 2
4671*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_left_h4):
4672*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4673*c0909341SAndroid Build Coastguard Worker        ld1             {v0.4h},  [x2]
        // Sum across lanes into h0, rounding-shift by log2(4), then broadcast
        // lane 0 to all eight lanes.
4674*c0909341SAndroid Build Coastguard Worker        addv            h0,      v0.4h
4675*c0909341SAndroid Build Coastguard Worker        urshr           v0.4h,   v0.4h,   #2
4676*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
4677*c0909341SAndroid Build Coastguard Worker        br              x9
4678*c0909341SAndroid Build Coastguard Worker
// height 8: dc = (sum of 8 pixels + 4) >> 3
4679*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_left_h8):
4680*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4681*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h},  [x2]
4682*c0909341SAndroid Build Coastguard Worker        addv            h0,      v0.8h
4683*c0909341SAndroid Build Coastguard Worker        urshr           v0.4h,   v0.4h,   #3
4684*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
4685*c0909341SAndroid Build Coastguard Worker        br              x9
4686*c0909341SAndroid Build Coastguard Worker
// height 16: dc = (sum of 16 pixels + 8) >> 4
4687*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_left_h16):
4688*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4689*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x2]
        // Pairwise add reduces 16 lanes to 8; addv then sums those in 16 bits.
        // NOTE(review): the whole sum stays in an unsigned 16-bit lane, which
        // holds for samples of up to 12 bits (16 * 4095 = 65520) - confirm
        // the supported bitdepth range.
4690*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v2.8h,   v3.8h
4691*c0909341SAndroid Build Coastguard Worker        addv            h0,      v0.8h
4692*c0909341SAndroid Build Coastguard Worker        urshr           v0.4h,   v0.4h,   #4
4693*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
4694*c0909341SAndroid Build Coastguard Worker        br              x9
4695*c0909341SAndroid Build Coastguard Worker
// height 32: dc = (sum of 32 pixels + 16) >> 5
4696*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_left_h32):
4697*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4698*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
        // Two rounds of pairwise adds leave eight 16-bit lanes, each holding
        // the sum of four pixels.
4699*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v3.8h
4700*c0909341SAndroid Build Coastguard Worker        addp            v4.8h,   v4.8h,   v5.8h
4701*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v2.8h,   v4.8h
        // The final reduction widens to 32 bits (s0); rshrn then applies the
        // rounding shift and narrows the result back to 16 bits.
4702*c0909341SAndroid Build Coastguard Worker        uaddlv          s0,      v0.8h
4703*c0909341SAndroid Build Coastguard Worker        rshrn           v0.4h,   v0.4s,   #5
4704*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
4705*c0909341SAndroid Build Coastguard Worker        br              x9
4706*c0909341SAndroid Build Coastguard Workerendfunc
4707*c0909341SAndroid Build Coastguard Worker
// Jump table for ipred_cfl_left_16bpc_neon. Each entry is the offset of one
// of that function's L(ipred_cfl_left_h*) labels relative to the table base;
// the entry order matches an index of clz(height) - 26, i.e. height 32, 16,
// 8, 4.
4708*c0909341SAndroid Build Coastguard Workerjumptable ipred_cfl_left_tbl
4709*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_left_h32) - ipred_cfl_left_tbl
4710*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_left_h16) - ipred_cfl_left_tbl
4711*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_left_h8)  - ipred_cfl_left_tbl
4712*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_left_h4)  - ipred_cfl_left_tbl
4713*c0909341SAndroid Build Coastguard Workerendjumptable
4714*c0909341SAndroid Build Coastguard Worker
4715*c0909341SAndroid Build Coastguard Worker// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
4716*c0909341SAndroid Build Coastguard Worker//                           const pixel *const topleft,
4717*c0909341SAndroid Build Coastguard Worker//                           const int width, const int height,
4718*c0909341SAndroid Build Coastguard Worker//                           const int16_t *ac, const int alpha,
4719*c0909341SAndroid Build Coastguard Worker//                           const int bitdepth_max);
4720*c0909341SAndroid Build Coastguard Workerfunction ipred_cfl_16bpc_neon, export=1
4721*c0909341SAndroid Build Coastguard Worker        dup             v31.8h,  w7              // bitdepth_max
4722*c0909341SAndroid Build Coastguard Worker        sub             x2,  x2,  w4, uxtw #1
4723*c0909341SAndroid Build Coastguard Worker        add             w8,  w3,  w4             // width + height
4724*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   w6              // alpha
4725*c0909341SAndroid Build Coastguard Worker        clz             w9,  w3
4726*c0909341SAndroid Build Coastguard Worker        clz             w6,  w4
4727*c0909341SAndroid Build Coastguard Worker        dup             v16.4s, w8               // width + height
4728*c0909341SAndroid Build Coastguard Worker        movrel          x7,  ipred_cfl_tbl
4729*c0909341SAndroid Build Coastguard Worker        rbit            w8,  w8                  // rbit(width + height)
4730*c0909341SAndroid Build Coastguard Worker        sub             w9,  w9,  #22            // 26 leading bits, minus table offset 4
4731*c0909341SAndroid Build Coastguard Worker        sub             w6,  w6,  #26
4732*c0909341SAndroid Build Coastguard Worker        clz             w8,  w8                  // ctz(width + height)
4733*c0909341SAndroid Build Coastguard Worker        ldrsw           x9,  [x7, w9, uxtw #2]
4734*c0909341SAndroid Build Coastguard Worker        ldrsw           x6,  [x7, w6, uxtw #2]
4735*c0909341SAndroid Build Coastguard Worker        neg             w8,  w8                  // -ctz(width + height)
4736*c0909341SAndroid Build Coastguard Worker        add             x9,  x7,  x9
4737*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  x6
4738*c0909341SAndroid Build Coastguard Worker        ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
4739*c0909341SAndroid Build Coastguard Worker        dup             v17.4s,  w8              // -ctz(width + height)
4740*c0909341SAndroid Build Coastguard Worker        add             x6,  x0,  x1
4741*c0909341SAndroid Build Coastguard Worker        lsl             x1,  x1,  #1
4742*c0909341SAndroid Build Coastguard Worker        movi            v30.8h,  #0
4743*c0909341SAndroid Build Coastguard Worker        br              x7
4744*c0909341SAndroid Build Coastguard Worker
4745*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_h4):
4746*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4747*c0909341SAndroid Build Coastguard Worker        ld1             {v0.4h},  [x2], #8
4748*c0909341SAndroid Build Coastguard Worker        uaddlv          s0,      v0.4h
4749*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #2
4750*c0909341SAndroid Build Coastguard Worker        br              x9
4751*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_w4):
4752*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4753*c0909341SAndroid Build Coastguard Worker        ld1             {v2.4h},  [x2]
4754*c0909341SAndroid Build Coastguard Worker        add             v0.2s,   v0.2s,   v16.2s
4755*c0909341SAndroid Build Coastguard Worker        uaddlv          s2,      v2.4h
4756*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #4
4757*c0909341SAndroid Build Coastguard Worker        add             v0.2s,   v0.2s,   v2.2s
4758*c0909341SAndroid Build Coastguard Worker        ushl            v0.2s,   v0.2s,   v17.2s
4759*c0909341SAndroid Build Coastguard Worker        b.eq            1f
4760*c0909341SAndroid Build Coastguard Worker        // h = 8/16
4761*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #16
4762*c0909341SAndroid Build Coastguard Worker        mov             w16, #0x6667
4763*c0909341SAndroid Build Coastguard Worker        mov             w17, #0xAAAB
4764*c0909341SAndroid Build Coastguard Worker        csel            w16, w16, w17, eq
4765*c0909341SAndroid Build Coastguard Worker        dup             v16.2s,  w16
4766*c0909341SAndroid Build Coastguard Worker        mul             v0.2s,   v0.2s,   v16.2s
4767*c0909341SAndroid Build Coastguard Worker        ushr            v0.2s,   v0.2s,   #17
4768*c0909341SAndroid Build Coastguard Worker1:
4769*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
4770*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_splat_w4)
4771*c0909341SAndroid Build Coastguard Worker
4772*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_h8):
4773*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4774*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h},  [x2], #16
4775*c0909341SAndroid Build Coastguard Worker        uaddlv          s0,      v0.8h
4776*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #2
4777*c0909341SAndroid Build Coastguard Worker        br              x9
4778*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_w8):
4779*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4780*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h},  [x2]
4781*c0909341SAndroid Build Coastguard Worker        add             v0.2s,   v0.2s,   v16.2s
4782*c0909341SAndroid Build Coastguard Worker        uaddlv          s2,      v2.8h
4783*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #8
4784*c0909341SAndroid Build Coastguard Worker        add             v0.2s,   v0.2s,   v2.2s
4785*c0909341SAndroid Build Coastguard Worker        ushl            v0.2s,   v0.2s,   v17.2s
4786*c0909341SAndroid Build Coastguard Worker        b.eq            1f
4787*c0909341SAndroid Build Coastguard Worker        // h = 4/16/32
4788*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #32
4789*c0909341SAndroid Build Coastguard Worker        mov             w16, #0x6667
4790*c0909341SAndroid Build Coastguard Worker        mov             w17, #0xAAAB
4791*c0909341SAndroid Build Coastguard Worker        csel            w16, w16, w17, eq
4792*c0909341SAndroid Build Coastguard Worker        dup             v16.2s,  w16
4793*c0909341SAndroid Build Coastguard Worker        mul             v0.2s,   v0.2s,   v16.2s
4794*c0909341SAndroid Build Coastguard Worker        ushr            v0.2s,   v0.2s,   #17
4795*c0909341SAndroid Build Coastguard Worker1:
4796*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
4797*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_splat_w8)
4798*c0909341SAndroid Build Coastguard Worker
4799*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_h16):
4800*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4801*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x2], #32
4802*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v2.8h,   v3.8h
4803*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #2
4804*c0909341SAndroid Build Coastguard Worker        uaddlv          s0,      v0.8h
4805*c0909341SAndroid Build Coastguard Worker        br              x9
4806*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_w16):
4807*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4808*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x2]
4809*c0909341SAndroid Build Coastguard Worker        add             v0.2s,   v0.2s,   v16.2s
4810*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v3.8h
4811*c0909341SAndroid Build Coastguard Worker        uaddlv          s2,      v2.8h
4812*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #16
4813*c0909341SAndroid Build Coastguard Worker        add             v0.2s,   v0.2s,   v2.2s
4814*c0909341SAndroid Build Coastguard Worker        ushl            v0.2s,   v0.2s,   v17.2s
4815*c0909341SAndroid Build Coastguard Worker        b.eq            1f
4816*c0909341SAndroid Build Coastguard Worker        // h = 4/8/32
4817*c0909341SAndroid Build Coastguard Worker        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
4818*c0909341SAndroid Build Coastguard Worker        mov             w16, #0x6667
4819*c0909341SAndroid Build Coastguard Worker        mov             w17, #0xAAAB
4820*c0909341SAndroid Build Coastguard Worker        csel            w16, w16, w17, eq
4821*c0909341SAndroid Build Coastguard Worker        dup             v16.2s,  w16
4822*c0909341SAndroid Build Coastguard Worker        mul             v0.2s,   v0.2s,   v16.2s
4823*c0909341SAndroid Build Coastguard Worker        ushr            v0.2s,   v0.2s,   #17
4824*c0909341SAndroid Build Coastguard Worker1:
4825*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
4826*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_splat_w16)
4827*c0909341SAndroid Build Coastguard Worker
4828*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_h32):
4829*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4830*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64
4831*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v3.8h
4832*c0909341SAndroid Build Coastguard Worker        addp            v4.8h,   v4.8h,   v5.8h
4833*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v2.8h,   v4.8h
4834*c0909341SAndroid Build Coastguard Worker        add             x2,  x2,  #2
4835*c0909341SAndroid Build Coastguard Worker        uaddlv          s0,      v0.8h
4836*c0909341SAndroid Build Coastguard Worker        br              x9
4837*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_w32):
4838*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4839*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
4840*c0909341SAndroid Build Coastguard Worker        add             v0.4s,   v0.4s,   v16.4s
4841*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v3.8h
4842*c0909341SAndroid Build Coastguard Worker        addp            v4.8h,   v4.8h,   v5.8h
4843*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v4.8h
4844*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #32
4845*c0909341SAndroid Build Coastguard Worker        uaddlv          s2,      v2.8h
4846*c0909341SAndroid Build Coastguard Worker        add             v0.2s,   v0.2s,   v2.2s
4847*c0909341SAndroid Build Coastguard Worker        ushl            v0.2s,   v0.2s,   v17.2s
4848*c0909341SAndroid Build Coastguard Worker        b.eq            1f
4849*c0909341SAndroid Build Coastguard Worker        // h = 8/16
4850*c0909341SAndroid Build Coastguard Worker        cmp             w4,  #8
4851*c0909341SAndroid Build Coastguard Worker        mov             w16, #0x6667
4852*c0909341SAndroid Build Coastguard Worker        mov             w17, #0xAAAB
4853*c0909341SAndroid Build Coastguard Worker        csel            w16, w16, w17, eq
4854*c0909341SAndroid Build Coastguard Worker        dup             v16.2s,  w16
4855*c0909341SAndroid Build Coastguard Worker        mul             v0.2s,   v0.2s,   v16.2s
4856*c0909341SAndroid Build Coastguard Worker        ushr            v0.2s,   v0.2s,   #17
4857*c0909341SAndroid Build Coastguard Worker1:
4858*c0909341SAndroid Build Coastguard Worker        dup             v0.8h,   v0.h[0]
4859*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_splat_w16)
4860*c0909341SAndroid Build Coastguard Workerendfunc
4861*c0909341SAndroid Build Coastguard Worker
4862*c0909341SAndroid Build Coastguard Workerjumptable ipred_cfl_tbl
        // Each entry is a 32-bit value: the distance from the start of this
        // table to a local label of the ipred_cfl function. Entries 0-3 are
        // the h32/h16/h8/h4 labels and entries 4-7 the w32/w16/w8/w4 labels,
        // both groups ordered from the largest size to the smallest.
        // NOTE(review): the code that indexes this table is outside this
        // excerpt; the entry order must stay in sync with it - confirm
        // before reordering or inserting entries.
4863*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_h32) - ipred_cfl_tbl
4864*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_h16) - ipred_cfl_tbl
4865*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_h8)  - ipred_cfl_tbl
4866*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_h4)  - ipred_cfl_tbl
4867*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_w32) - ipred_cfl_tbl
4868*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_w16) - ipred_cfl_tbl
4869*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_w8)  - ipred_cfl_tbl
4870*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_w4)  - ipred_cfl_tbl
4871*c0909341SAndroid Build Coastguard Workerendjumptable
4872*c0909341SAndroid Build Coastguard Worker
4873*c0909341SAndroid Build Coastguard Worker// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
4874*c0909341SAndroid Build Coastguard Worker//                            const ptrdiff_t stride, const int w_pad,
4875*c0909341SAndroid Build Coastguard Worker//                            const int h_pad, const int cw, const int ch);
//
// Builds the 4:2:0 subsampled AC buffer: every output value is the sum of a
// 2x2 group of input pixels, doubled (horizontal pairwise add, add of two
// adjacent rows, shift left by 1). Columns and rows that are marked as
// padding are filled by replicating the last computed column / row. Once the
// whole cw x ch buffer is written, its rounded average
// (sum >> log2sz with rounding, log2sz = ctz(cw) + ctz(ch)) is subtracted
// from every stored value.
//
// Arguments, in the order of the prototype above:
//   x0 = ac      output pointer, written sequentially, cw values per row
//   x1 = ypx     input pointer
//   x2 = stride  input row stride in bytes (used in address arithmetic)
//   w3 = w_pad   horizontal padding selector (zero / non-zero for cw == 8,
//                jump table index 0..3 for cw == 16, unused for cw == 4)
//   w4 = h_pad   vertical padding, multiplied by 4 on entry to get rows
//   w5 = cw      output width, 4, 8 or 16 (selects the code path via clz)
//   w6 = ch      output height
//
// Register roles inside the function:
//   x1 / x10  pointers to the even / odd input row of each row pair
//   x2        doubled stride, so both pointers advance two rows per load
//   w8        number of non-padded output rows still to produce
//   x7        computed branch target
//   v24-v27   32-bit partial sums of all stored values
//   v31       -log2sz in every lane, shift operand for the final urshl
4876*c0909341SAndroid Build Coastguard Workerfunction ipred_cfl_ac_420_16bpc_neon, export=1
4877*c0909341SAndroid Build Coastguard Worker        clz             w8,  w5              // clz(cw): 27 for 16, 28 for 8, 29 for 4
4878*c0909341SAndroid Build Coastguard Worker        lsl             w4,  w4,  #2         // h_pad *= 4
4879*c0909341SAndroid Build Coastguard Worker        movrel          x7,  ipred_cfl_ac_420_tbl
4880*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  #27        // table index: 0 = w16, 1 = w8, 2 = w4
4881*c0909341SAndroid Build Coastguard Worker        ldrsw           x8,  [x7, w8, uxtw #2]  // signed 32-bit offset of the handler
4882*c0909341SAndroid Build Coastguard Worker        movi            v24.4s,  #0          // clear the four sum accumulators
4883*c0909341SAndroid Build Coastguard Worker        movi            v25.4s,  #0
4884*c0909341SAndroid Build Coastguard Worker        movi            v26.4s,  #0
4885*c0909341SAndroid Build Coastguard Worker        movi            v27.4s,  #0
4886*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  x8         // handler address = table base + offset
4887*c0909341SAndroid Build Coastguard Worker        sub             w8,  w6,  w4         // height - h_pad
4888*c0909341SAndroid Build Coastguard Worker        rbit            w9,  w5              // rbit(width)
4889*c0909341SAndroid Build Coastguard Worker        rbit            w10, w6              // rbit(height)
4890*c0909341SAndroid Build Coastguard Worker        clz             w9,  w9              // ctz(width)
4891*c0909341SAndroid Build Coastguard Worker        clz             w10, w10             // ctz(height)
4892*c0909341SAndroid Build Coastguard Worker        add             w9,  w9,  w10        // log2sz
4893*c0909341SAndroid Build Coastguard Worker        add             x10, x1,  x2         // x10 = second input row
4894*c0909341SAndroid Build Coastguard Worker        dup             v31.4s,  w9
4895*c0909341SAndroid Build Coastguard Worker        lsl             x2,  x2,  #1         // stride *= 2: step over a row pair
4896*c0909341SAndroid Build Coastguard Worker        neg             v31.4s,  v31.4s      // -log2sz
4897*c0909341SAndroid Build Coastguard Worker        br              x7
4898*c0909341SAndroid Build Coastguard Worker
        // cw == 4. One iteration reads four input rows of 8 pixels and writes
        // two output rows of 4 values (16 bytes), so w8 drops by 2 each time.
4899*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w4):
4900*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4901*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input
4902*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h}, [x1],  x2
4903*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h}, [x10], x2
4904*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h}, [x1],  x2
4905*c0909341SAndroid Build Coastguard Worker        ld1             {v3.8h}, [x10], x2
4906*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v2.8h
4907*c0909341SAndroid Build Coastguard Worker        addp            v1.8h,   v1.8h,   v3.8h
4908*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v1.8h
4909*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #1
4910*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
4911*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h}, [x0], #16
4912*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
4913*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
4914*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // The upper half of v0 is the last output row. Replicate it into both
        // halves of v0 and v1 so that {v0, v1} holds four copies of that row.
4915*c0909341SAndroid Build Coastguard Worker        trn2            v1.2d,   v0.2d,   v0.2d
4916*c0909341SAndroid Build Coastguard Worker        trn2            v0.2d,   v0.2d,   v0.2d
4917*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w4_hpad):
4918*c0909341SAndroid Build Coastguard Worker        cbz             w4,  3f
        // Each iteration stores 32 bytes, i.e. four padded rows of 4 values,
        // and adds them to the sums as well.
4919*c0909341SAndroid Build Coastguard Worker2:      // Vertical padding (h_pad > 0)
4920*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
4921*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], #32
4922*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
4923*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
4924*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
4925*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
4926*c0909341SAndroid Build Coastguard Worker        b.gt            2b
4927*c0909341SAndroid Build Coastguard Worker3:
        // Shared tail of all widths. On entry w6 must equal the number of
        // stored values divided by 4: the w8 and w16 paths scale w6 by 2 and
        // 4 before branching here. x0 is rewound by w6 * 8 bytes to the start
        // of the buffer, and the loop below handles 16 values while counting
        // w6 down by 4.
4928*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w4_calc_subtract_dc):
4929*c0909341SAndroid Build Coastguard Worker        // Aggregate the sums
4930*c0909341SAndroid Build Coastguard Worker        add             v24.4s,  v24.4s,  v25.4s
4931*c0909341SAndroid Build Coastguard Worker        add             v26.4s,  v26.4s,  v27.4s
4932*c0909341SAndroid Build Coastguard Worker        add             v0.4s,   v24.4s,  v26.4s
4933*c0909341SAndroid Build Coastguard Worker        addv            s0,  v0.4s                // sum
4934*c0909341SAndroid Build Coastguard Worker        sub             x0,  x0,  w6, uxtw #3
4935*c0909341SAndroid Build Coastguard Worker        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1)))  >>= log2sz
4936*c0909341SAndroid Build Coastguard Worker        dup             v4.8h,   v4.h[0]
4937*c0909341SAndroid Build Coastguard Worker6:      // Subtract dc from ac
4938*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x0]
4939*c0909341SAndroid Build Coastguard Worker        subs            w6,  w6,  #4
4940*c0909341SAndroid Build Coastguard Worker        sub             v0.8h,   v0.8h,   v4.8h
4941*c0909341SAndroid Build Coastguard Worker        sub             v1.8h,   v1.8h,   v4.8h
4942*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], #32
4943*c0909341SAndroid Build Coastguard Worker        b.gt            6b
4944*c0909341SAndroid Build Coastguard Worker        ret
4945*c0909341SAndroid Build Coastguard Worker
        // cw == 8. A non-zero w_pad takes the padded variant. One iteration
        // reads four input rows of 16 pixels and writes two output rows of 8
        // values: v0 is the first row, v1 the second.
4946*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w8):
4947*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
4948*c0909341SAndroid Build Coastguard Worker        cbnz            w3,  L(ipred_cfl_ac_420_w8_wpad)
4949*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, without padding
4950*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x1],  x2
4951*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x10], x2
4952*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h}, [x1],  x2
4953*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v1.8h
4954*c0909341SAndroid Build Coastguard Worker        ld1             {v6.8h, v7.8h}, [x10], x2
4955*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v3.8h
4956*c0909341SAndroid Build Coastguard Worker        addp            v4.8h,   v4.8h,   v5.8h
4957*c0909341SAndroid Build Coastguard Worker        addp            v6.8h,   v6.8h,   v7.8h
4958*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v2.8h
4959*c0909341SAndroid Build Coastguard Worker        add             v4.8h,   v4.8h,   v6.8h
4960*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #1
4961*c0909341SAndroid Build Coastguard Worker        shl             v1.8h,   v4.8h,   #1
4962*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
4963*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], #32
4964*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
4965*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
4966*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
4967*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
4968*c0909341SAndroid Build Coastguard Worker        b.gt            1b
4969*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v1.16b      // v0 = v1 = last output row
4970*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w8_hpad)
4971*c0909341SAndroid Build Coastguard Worker
        // cw == 8 with horizontal padding: only 8 input pixels per row are
        // read, giving 4 computed values per output row. v0 low / high half
        // hold the computed values of the first / second row; lanes 3 and 7
        // (the last computed value of each row) are replicated into v1 / v3
        // to fill the remaining 4 columns.
4972*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w8_wpad):
4973*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, padding 4
4974*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h}, [x1],  x2
4975*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h}, [x10], x2
4976*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h}, [x1],  x2
4977*c0909341SAndroid Build Coastguard Worker        ld1             {v3.8h}, [x10], x2
4978*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v2.8h
4979*c0909341SAndroid Build Coastguard Worker        addp            v1.8h,   v1.8h,   v3.8h
4980*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v1.8h
4981*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #1
4982*c0909341SAndroid Build Coastguard Worker        dup             v1.4h,   v0.h[3]
4983*c0909341SAndroid Build Coastguard Worker        dup             v3.4h,   v0.h[7]
4984*c0909341SAndroid Build Coastguard Worker        trn2            v2.2d,   v0.2d,   v0.2d
4985*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
4986*c0909341SAndroid Build Coastguard Worker        st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
4987*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
4988*c0909341SAndroid Build Coastguard Worker        uaddw           v25.4s,  v25.4s,  v1.4h
4989*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v2.4h
4990*c0909341SAndroid Build Coastguard Worker        uaddw           v27.4s,  v27.4s,  v3.4h
4991*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // Rebuild the full last output row (computed half + padded half) in
        // both v0 and v1 for the vertical padding below.
4992*c0909341SAndroid Build Coastguard Worker        trn1            v0.2d,   v2.2d,   v3.2d
4993*c0909341SAndroid Build Coastguard Worker        trn1            v1.2d,   v2.2d,   v3.2d
4994*c0909341SAndroid Build Coastguard Worker
        // v0 and v1 both hold the last output row here. Each iteration stores
        // the pair twice, i.e. four padded rows of 8 values.
4995*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w8_hpad):
4996*c0909341SAndroid Build Coastguard Worker        cbz             w4,  3f
4997*c0909341SAndroid Build Coastguard Worker2:      // Vertical padding (h_pad > 0)
4998*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
4999*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], #32
5000*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5001*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5002*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5003*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5004*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], #32
5005*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5006*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5007*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5008*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5009*c0909341SAndroid Build Coastguard Worker        b.gt            2b
5010*c0909341SAndroid Build Coastguard Worker3:
5011*c0909341SAndroid Build Coastguard Worker
5012*c0909341SAndroid Build Coastguard Worker        // Double the height and reuse the w4 summing/subtracting
5013*c0909341SAndroid Build Coastguard Worker        lsl             w6,  w6,  #1
5014*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
5015*c0909341SAndroid Build Coastguard Worker
        // cw == 16: second-level dispatch, w_pad (w3) is used directly as the
        // index into ipred_cfl_ac_420_w16_tbl. In every variant below one
        // iteration writes two output rows of 16 values: {v0, v1} is the
        // first row and {v2, v3} the second.
5016*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w16):
5017*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5018*c0909341SAndroid Build Coastguard Worker        movrel          x7,  ipred_cfl_ac_420_w16_tbl
5019*c0909341SAndroid Build Coastguard Worker        ldrsw           x3,  [x7, w3, uxtw #2]
5020*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  x3
5021*c0909341SAndroid Build Coastguard Worker        br              x7
5022*c0909341SAndroid Build Coastguard Worker
        // No horizontal padding: 32 input pixels are read per row.
5023*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w16_wpad0):
5024*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5025*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, without padding
5026*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
5027*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
5028*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v1.8h
5029*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v3.8h
5030*c0909341SAndroid Build Coastguard Worker        addp            v4.8h,   v4.8h,   v5.8h
5031*c0909341SAndroid Build Coastguard Worker        addp            v6.8h,   v6.8h,   v7.8h
5032*c0909341SAndroid Build Coastguard Worker        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x1],  x2
5033*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v4.8h
5034*c0909341SAndroid Build Coastguard Worker        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2
5035*c0909341SAndroid Build Coastguard Worker        add             v2.8h,   v2.8h,   v6.8h
5036*c0909341SAndroid Build Coastguard Worker        addp            v16.8h,  v16.8h,  v17.8h
5037*c0909341SAndroid Build Coastguard Worker        addp            v18.8h,  v18.8h,  v19.8h
5038*c0909341SAndroid Build Coastguard Worker        addp            v20.8h,  v20.8h,  v21.8h
5039*c0909341SAndroid Build Coastguard Worker        addp            v22.8h,  v22.8h,  v23.8h
5040*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v20.8h
5041*c0909341SAndroid Build Coastguard Worker        add             v18.8h,  v18.8h,  v22.8h
5042*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #1
5043*c0909341SAndroid Build Coastguard Worker        shl             v1.8h,   v2.8h,   #1
5044*c0909341SAndroid Build Coastguard Worker        shl             v2.8h,   v16.8h,  #1
5045*c0909341SAndroid Build Coastguard Worker        shl             v3.8h,   v18.8h,  #1
5046*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
5047*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5048*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5049*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5050*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5051*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5052*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v2.4h
5053*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v2.8h
5054*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v3.4h
5055*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v3.8h
5056*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // Copy the last output row {v2, v3} over {v0, v1} so that v0-v3 hold
        // that row twice for the vertical padding.
5057*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v2.16b
5058*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
5059*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w16_hpad)
5060*c0909341SAndroid Build Coastguard Worker
        // 24 input pixels are read per row (16 via ld1 plus 8 via ldr at byte
        // offset 32), giving 12 computed values. The last computed value is
        // replicated into the upper half of v1 / v3.
5061*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w16_wpad1):
5062*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5063*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, padding 4
5064*c0909341SAndroid Build Coastguard Worker        ldr             q2,  [x1,  #32]
5065*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x1],  x2
5066*c0909341SAndroid Build Coastguard Worker        ldr             q5,  [x10, #32]
5067*c0909341SAndroid Build Coastguard Worker        ld1             {v3.8h, v4.8h}, [x10], x2
5068*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v2.8h
5069*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v1.8h
5070*c0909341SAndroid Build Coastguard Worker        addp            v5.8h,   v5.8h,   v5.8h
5071*c0909341SAndroid Build Coastguard Worker        addp            v3.8h,   v3.8h,   v4.8h
5072*c0909341SAndroid Build Coastguard Worker        ldr             q18, [x1,  #32]
5073*c0909341SAndroid Build Coastguard Worker        add             v2.4h,   v2.4h,   v5.4h
5074*c0909341SAndroid Build Coastguard Worker        ld1             {v16.8h, v17.8h}, [x1],  x2
5075*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v3.8h
5076*c0909341SAndroid Build Coastguard Worker        ldr             q21, [x10, #32]
5077*c0909341SAndroid Build Coastguard Worker        ld1             {v19.8h, v20.8h}, [x10], x2
5078*c0909341SAndroid Build Coastguard Worker        addp            v18.8h,  v18.8h,  v18.8h
5079*c0909341SAndroid Build Coastguard Worker        addp            v16.8h,  v16.8h,  v17.8h
5080*c0909341SAndroid Build Coastguard Worker        addp            v21.8h,  v21.8h,  v21.8h
5081*c0909341SAndroid Build Coastguard Worker        addp            v19.8h,  v19.8h,  v20.8h
5082*c0909341SAndroid Build Coastguard Worker        add             v18.4h,  v18.4h,  v21.4h
5083*c0909341SAndroid Build Coastguard Worker        add             v16.8h,  v16.8h,  v19.8h
5084*c0909341SAndroid Build Coastguard Worker        shl             v1.4h,   v2.4h,   #1
5085*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #1
5086*c0909341SAndroid Build Coastguard Worker        shl             v3.4h,   v18.4h,  #1
5087*c0909341SAndroid Build Coastguard Worker        shl             v2.8h,   v16.8h,  #1
5088*c0909341SAndroid Build Coastguard Worker        dup             v4.4h,   v1.h[3]
5089*c0909341SAndroid Build Coastguard Worker        dup             v5.4h,   v3.h[3]
5090*c0909341SAndroid Build Coastguard Worker        trn1            v1.2d,   v1.2d,   v4.2d
5091*c0909341SAndroid Build Coastguard Worker        trn1            v3.2d,   v3.2d,   v5.2d
5092*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
5093*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5094*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5095*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5096*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5097*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5098*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v2.4h
5099*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v2.8h
5100*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v3.4h
5101*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v3.8h
5102*c0909341SAndroid Build Coastguard Worker        b.gt            1b
5103*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v2.16b      // v0-v3 = last output row, twice
5104*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
5105*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w16_hpad)
5106*c0909341SAndroid Build Coastguard Worker
        // 16 input pixels are read per row, giving 8 computed values (v0 for
        // the first output row, v2 for the second). Lane 7 of each is
        // replicated into v1 / v3 to fill the other 8 columns.
5107*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w16_wpad2):
5108*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5109*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, padding 8
5110*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x1],  x2
5111*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x10], x2
5112*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h}, [x1],  x2
5113*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v1.8h
5114*c0909341SAndroid Build Coastguard Worker        ld1             {v6.8h, v7.8h}, [x10], x2
5115*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v3.8h
5116*c0909341SAndroid Build Coastguard Worker        addp            v4.8h,   v4.8h,   v5.8h
5117*c0909341SAndroid Build Coastguard Worker        addp            v6.8h,   v6.8h,   v7.8h
5118*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v2.8h
5119*c0909341SAndroid Build Coastguard Worker        add             v4.8h,   v4.8h,   v6.8h
5120*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #1
5121*c0909341SAndroid Build Coastguard Worker        shl             v2.8h,   v4.8h,   #1
5122*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   v0.h[7]
5123*c0909341SAndroid Build Coastguard Worker        dup             v3.8h,   v2.h[7]
5124*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
5125*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5126*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5127*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5128*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5129*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5130*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v2.4h
5131*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v2.8h
5132*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v3.4h
5133*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v3.8h
5134*c0909341SAndroid Build Coastguard Worker        b.gt            1b
5135*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v2.16b      // v0-v3 = last output row, twice
5136*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
5137*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w16_hpad)
5138*c0909341SAndroid Build Coastguard Worker
        // 8 input pixels are read per row, giving 4 computed values per
        // output row (v0 low half = first row, high half = second row).
        // Lane 3 / lane 7 are replicated into v1 / v3, then the trn2 / trn1
        // pair assembles v2 = second row (computed + pad) and v0 = first row
        // (computed + pad). The trn2 must read v0 before the trn1
        // overwrites it.
5139*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w16_wpad3):
5140*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5141*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, padding 12
5142*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h}, [x1],  x2
5143*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h}, [x10], x2
5144*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h}, [x1],  x2
5145*c0909341SAndroid Build Coastguard Worker        ld1             {v6.8h}, [x10], x2
5146*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v4.8h
5147*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v6.8h
5148*c0909341SAndroid Build Coastguard Worker        add             v0.8h,   v0.8h,   v2.8h
5149*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #1
5150*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   v0.h[3]
5151*c0909341SAndroid Build Coastguard Worker        dup             v3.8h,   v0.h[7]
5152*c0909341SAndroid Build Coastguard Worker        trn2            v2.2d,   v0.2d,   v3.2d
5153*c0909341SAndroid Build Coastguard Worker        trn1            v0.2d,   v0.2d,   v1.2d
5154*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
5155*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5156*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5157*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5158*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5159*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5160*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v2.4h
5161*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v2.8h
5162*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v3.4h
5163*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v3.8h
5164*c0909341SAndroid Build Coastguard Worker        b.gt            1b
5165*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v2.16b      // v0-v3 = last output row, twice
5166*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
5167*c0909341SAndroid Build Coastguard Worker
        // v0-v3 hold the last output row twice here. Each iteration stores
        // the four registers twice, i.e. four padded rows of 16 values.
5168*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_420_w16_hpad):
5169*c0909341SAndroid Build Coastguard Worker        cbz             w4,  3f
5170*c0909341SAndroid Build Coastguard Worker2:      // Vertical padding (h_pad > 0)
5171*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #4
5172*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5173*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5174*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5175*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5176*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5177*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v2.4h
5178*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v2.8h
5179*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v3.4h
5180*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v3.8h
5181*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5182*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5183*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5184*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5185*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5186*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v2.4h
5187*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v2.8h
5188*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v3.4h
5189*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v3.8h
5190*c0909341SAndroid Build Coastguard Worker        b.gt            2b
5191*c0909341SAndroid Build Coastguard Worker3:
5192*c0909341SAndroid Build Coastguard Worker
5193*c0909341SAndroid Build Coastguard Worker        // Quadruple the height and reuse the w4 summing/subtracting
5194*c0909341SAndroid Build Coastguard Worker        lsl             w6,  w6,  #2
5195*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
5196*c0909341SAndroid Build Coastguard Workerendfunc
5197*c0909341SAndroid Build Coastguard Worker
// Width dispatch table for the 4:2:0 variant. Each .word is the distance from
// the start of this table to one entry label, so a consumer adds the loaded
// value back onto the table address to get the branch target.
// Entry order is w16, w8, w4.
// NOTE(review): the code that indexes this table is outside this excerpt;
// presumably it uses clz(width) - 27 like the 4:2:2 dispatcher - confirm.
5198*c0909341SAndroid Build Coastguard Workerjumptable ipred_cfl_ac_420_tbl
5199*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_420_w16) - ipred_cfl_ac_420_tbl
5200*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_420_w8)  - ipred_cfl_ac_420_tbl
5201*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_420_w4)  - ipred_cfl_ac_420_tbl
5202*c0909341SAndroid Build Coastguard Workerendjumptable
5203*c0909341SAndroid Build Coastguard Worker
// Second-level dispatch table for the 16-wide 4:2:0 case. Each .word is the
// distance from the start of this table to one of the four wpad0..wpad3
// entry labels, listed in increasing padding order.
// NOTE(review): the indexing code is outside this excerpt; presumably the
// index is w_pad (0..3) as in the 4:2:2 w16 dispatcher - confirm.
5204*c0909341SAndroid Build Coastguard Workerjumptable ipred_cfl_ac_420_w16_tbl
5205*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_420_w16_wpad0) - ipred_cfl_ac_420_w16_tbl
5206*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_420_w16_wpad1) - ipred_cfl_ac_420_w16_tbl
5207*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_420_w16_wpad2) - ipred_cfl_ac_420_w16_tbl
5208*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_420_w16_wpad3) - ipred_cfl_ac_420_w16_tbl
5209*c0909341SAndroid Build Coastguard Workerendjumptable
5210*c0909341SAndroid Build Coastguard Worker
5211*c0909341SAndroid Build Coastguard Worker// void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
5212*c0909341SAndroid Build Coastguard Worker//                            const ptrdiff_t stride, const int w_pad,
5213*c0909341SAndroid Build Coastguard Worker//                            const int h_pad, const int cw, const int ch);
//
// Fills the ac buffer from 16-bit luma rows with horizontal-only subsampling:
// every output value is the sum of two horizontally adjacent input pixels,
// shifted left by 2. Rows are never combined with each other.
//
// Arguments (AAPCS64 order, matching the prototype above):
//   x0 = ac      output pointer, advanced as rows are stored
//   x1 = ypx     input pointer
//   x2 = stride  input row stride in bytes (doubled below)
//   w3 = w_pad   right-hand padding, in units of 4 output columns
//   w4 = h_pad   bottom padding; multiplied by 4 below to give rows
//   w5 = cw      output width  (4, 8 or 16 are dispatched here)
//   w6 = ch      output height
//
// State set up for the loops and for the shared tail code:
//   w8      = ch - 4*h_pad, the number of rows actually read from ypx
//   x1/x10  = pointers to even/odd input rows, each stepping by 2*stride
//   v24-v27 = 32-bit running sums of every value stored so far
//   v31     = -(ctz(cw) + ctz(ch)) broadcast to four 32-bit lanes
//
// There is no ret in this function: every path ends with a branch to an
// L(ipred_cfl_ac_420_*_hpad) label that lives in the 4:2:0 function.
// NOTE(review): those labels are outside this excerpt; presumably they
// replicate v0-v3 for the 4*h_pad padded rows and then subtract the average
// using v24-v27 and v31 - confirm against ipred_cfl_ac_420.
5214*c0909341SAndroid Build Coastguard Workerfunction ipred_cfl_ac_422_16bpc_neon, export=1
        // Table index = clz(cw) - 27: cw=16 -> 0, cw=8 -> 1, cw=4 -> 2.
5215*c0909341SAndroid Build Coastguard Worker        clz             w8,  w5
5216*c0909341SAndroid Build Coastguard Worker        lsl             w4,  w4,  #2
5217*c0909341SAndroid Build Coastguard Worker        movrel          x7,  ipred_cfl_ac_422_tbl
5218*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  #27
5219*c0909341SAndroid Build Coastguard Worker        ldrsw           x8,  [x7, w8, uxtw #2]
        // Clear the four 32-bit sum accumulators.
5220*c0909341SAndroid Build Coastguard Worker        movi            v24.4s,  #0
5221*c0909341SAndroid Build Coastguard Worker        movi            v25.4s,  #0
5222*c0909341SAndroid Build Coastguard Worker        movi            v26.4s,  #0
5223*c0909341SAndroid Build Coastguard Worker        movi            v27.4s,  #0
        // x7 = table base + signed offset = address of the width handler.
5224*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  x8
5225*c0909341SAndroid Build Coastguard Worker        sub             w8,  w6,  w4         // height - h_pad
5226*c0909341SAndroid Build Coastguard Worker        rbit            w9,  w5              // rbit(width)
5227*c0909341SAndroid Build Coastguard Worker        rbit            w10, w6              // rbit(height)
5228*c0909341SAndroid Build Coastguard Worker        clz             w9,  w9              // ctz(width)
5229*c0909341SAndroid Build Coastguard Worker        clz             w10, w10             // ctz(height)
5230*c0909341SAndroid Build Coastguard Worker        add             w9,  w9,  w10        // log2sz
        // x10 reuses the scratch register: it now points at the second row.
5231*c0909341SAndroid Build Coastguard Worker        add             x10, x1,  x2
5232*c0909341SAndroid Build Coastguard Worker        dup             v31.4s,  w9
        // Double the stride so x1 and x10 each skip over the other's row.
5233*c0909341SAndroid Build Coastguard Worker        lsl             x2,  x2,  #1
5234*c0909341SAndroid Build Coastguard Worker        neg             v31.4s,  v31.4s      // -log2sz
5235*c0909341SAndroid Build Coastguard Worker        br              x7
5236*c0909341SAndroid Build Coastguard Worker
        // cw == 4: four rows per iteration, 8 input pixels -> 4 outputs each.
5237*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_422_w4):
5238*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5239*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input
5240*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h}, [x1],  x2
5241*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h}, [x10], x2
5242*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h}, [x1],  x2
5243*c0909341SAndroid Build Coastguard Worker        ld1             {v3.8h}, [x10], x2
        // addp packs two rows per register: low half = pair sums of the first
        // operand, high half = pair sums of the second operand.
5244*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v1.8h
5245*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v3.8h
5246*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #2
5247*c0909341SAndroid Build Coastguard Worker        shl             v1.8h,   v2.8h,   #2
        // The flags from subs are consumed by b.gt below; the NEON store and
        // adds in between do not modify them.
5248*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #4
5249*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], #32
5250*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5251*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5252*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5253*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5254*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // Broadcast the high half of v1 (the last row stored) into both halves
        // of v0 and v1 before handing over to the shared tail.
5255*c0909341SAndroid Build Coastguard Worker        trn2            v0.2d,   v1.2d,   v1.2d
5256*c0909341SAndroid Build Coastguard Worker        trn2            v1.2d,   v1.2d,   v1.2d
5257*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w4_hpad)
5258*c0909341SAndroid Build Coastguard Worker
        // cw == 8: four rows per iteration. Any non-zero w_pad takes the
        // padded path, which reads only the left 8 input pixels of each row.
5259*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_422_w8):
5260*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5261*c0909341SAndroid Build Coastguard Worker        cbnz            w3,  L(ipred_cfl_ac_422_w8_wpad)
5262*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, without padding
        // Each row is 16 input pixels in a register pair; addp of the pair
        // yields that row's 8 outputs in one register.
5263*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x1],  x2
5264*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x10], x2
5265*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h}, [x1],  x2
5266*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v1.8h
5267*c0909341SAndroid Build Coastguard Worker        ld1             {v6.8h, v7.8h}, [x10], x2
5268*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v3.8h
5269*c0909341SAndroid Build Coastguard Worker        addp            v4.8h,   v4.8h,   v5.8h
5270*c0909341SAndroid Build Coastguard Worker        addp            v6.8h,   v6.8h,   v7.8h
        // The shifts also compact the four rows into v0-v3 for a single st1.
5271*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #2
5272*c0909341SAndroid Build Coastguard Worker        shl             v1.8h,   v2.8h,   #2
5273*c0909341SAndroid Build Coastguard Worker        shl             v2.8h,   v4.8h,   #2
5274*c0909341SAndroid Build Coastguard Worker        shl             v3.8h,   v6.8h,   #2
5275*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #4
5276*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5277*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5278*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5279*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5280*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5281*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v2.4h
5282*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v2.8h
5283*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v3.4h
5284*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v3.8h
5285*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // v3 is the last row stored; copy it so v0, v1 and v3 all hold it.
5286*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v3.16b
5287*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
5288*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w8_hpad)
5289*c0909341SAndroid Build Coastguard Worker
        // cw == 8 with w_pad != 0. Reached only through the direct cbnz above,
        // never through the jump table.
5290*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_422_w8_wpad):
5291*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, padding 4
5292*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h}, [x1],  x2
5293*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h}, [x10], x2
5294*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h}, [x1],  x2
5295*c0909341SAndroid Build Coastguard Worker        ld1             {v3.8h}, [x10], x2
        // After addp: v0 = {row0 x4, row1 x4}, v2 = {row2 x4, row3 x4}.
5296*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v1.8h
5297*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v3.8h
5298*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #2
5299*c0909341SAndroid Build Coastguard Worker        shl             v2.8h,   v2.8h,   #2
        // Broadcast the last valid output of each row to use as padding.
5300*c0909341SAndroid Build Coastguard Worker        dup             v4.4h,   v0.h[3]
5301*c0909341SAndroid Build Coastguard Worker        dup             v5.8h,   v0.h[7]
5302*c0909341SAndroid Build Coastguard Worker        dup             v6.4h,   v2.h[3]
5303*c0909341SAndroid Build Coastguard Worker        dup             v7.8h,   v2.h[7]
        // Rebuild full 8-wide rows as {4 valid, 4 padding}. The trn2 that reads
        // the high half of v0/v2 must run before the trn1 that overwrites it.
5304*c0909341SAndroid Build Coastguard Worker        trn2            v1.2d,   v0.2d,   v5.2d
5305*c0909341SAndroid Build Coastguard Worker        trn1            v0.2d,   v0.2d,   v4.2d
5306*c0909341SAndroid Build Coastguard Worker        trn2            v3.2d,   v2.2d,   v7.2d
5307*c0909341SAndroid Build Coastguard Worker        trn1            v2.2d,   v2.2d,   v6.2d
5308*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #4
5309*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5310*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5311*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5312*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5313*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5314*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v2.4h
5315*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v2.8h
5316*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v3.4h
5317*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v3.8h
5318*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // Same hand-over as the unpadded w8 path: v0 = v1 = v3 = last row.
5319*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v3.16b
5320*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
5321*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w8_hpad)
5322*c0909341SAndroid Build Coastguard Worker
        // cw == 16: second-level dispatch on w_pad, used directly as an index
        // into ipred_cfl_ac_422_w16_tbl. The ldrsw overwrites x3, so w_pad is
        // no longer available afterwards.
5323*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_422_w16):
5324*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5325*c0909341SAndroid Build Coastguard Worker        movrel          x7,  ipred_cfl_ac_422_w16_tbl
5326*c0909341SAndroid Build Coastguard Worker        ldrsw           x3,  [x7, w3, uxtw #2]
5327*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  x3
5328*c0909341SAndroid Build Coastguard Worker        br              x7
5329*c0909341SAndroid Build Coastguard Worker
        // All four w16 variants handle two rows per iteration and store them
        // as v0,v1 = first row and v2,v3 = second row (16 outputs each).
5330*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_422_w16_wpad0):
5331*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5332*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, without padding
        // 32 input pixels per row, loaded as four registers.
5333*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
5334*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
5335*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v1.8h
5336*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v3.8h
5337*c0909341SAndroid Build Coastguard Worker        addp            v4.8h,   v4.8h,   v5.8h
5338*c0909341SAndroid Build Coastguard Worker        addp            v6.8h,   v6.8h,   v7.8h
5339*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #2
5340*c0909341SAndroid Build Coastguard Worker        shl             v1.8h,   v2.8h,   #2
5341*c0909341SAndroid Build Coastguard Worker        shl             v2.8h,   v4.8h,   #2
5342*c0909341SAndroid Build Coastguard Worker        shl             v3.8h,   v6.8h,   #2
5343*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
5344*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5345*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5346*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5347*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5348*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5349*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v2.4h
5350*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v2.8h
5351*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v3.4h
5352*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v3.8h
5353*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // Copy the last row (v2,v3) over v0,v1 so v0-v3 hold it twice.
5354*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v2.16b
5355*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
5356*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w16_hpad)
5357*c0909341SAndroid Build Coastguard Worker
        // w_pad == 1: 24 input pixels are read per row -> 12 valid outputs,
        // followed by 4 copies of the last one.
5358*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_422_w16_wpad1):
5359*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5360*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, padding 4
        // The third 16-byte group is fetched with a plain offset load, which
        // has to happen before the post-incrementing ld1 moves the pointer.
5361*c0909341SAndroid Build Coastguard Worker        ldr             q2,  [x1,  #32]
5362*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x1],  x2
5363*c0909341SAndroid Build Coastguard Worker        ldr             q6,  [x10, #32]
5364*c0909341SAndroid Build Coastguard Worker        ld1             {v4.8h, v5.8h}, [x10], x2
        // addp of a register with itself leaves its 4 pair sums in the low
        // half; only that half is used by the .4h shifts below.
5365*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v2.8h
5366*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v1.8h
5367*c0909341SAndroid Build Coastguard Worker        addp            v6.8h,   v6.8h,   v6.8h
5368*c0909341SAndroid Build Coastguard Worker        addp            v4.8h,   v4.8h,   v5.8h
5369*c0909341SAndroid Build Coastguard Worker        shl             v1.4h,   v2.4h,   #2
5370*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #2
5371*c0909341SAndroid Build Coastguard Worker        shl             v3.4h,   v6.4h,   #2
5372*c0909341SAndroid Build Coastguard Worker        shl             v2.8h,   v4.8h,   #2
        // Fill the upper half of v1/v3 with the last valid output of the row.
5373*c0909341SAndroid Build Coastguard Worker        dup             v4.4h,   v1.h[3]
5374*c0909341SAndroid Build Coastguard Worker        dup             v5.4h,   v3.h[3]
5375*c0909341SAndroid Build Coastguard Worker        trn1            v1.2d,   v1.2d,   v4.2d
5376*c0909341SAndroid Build Coastguard Worker        trn1            v3.2d,   v3.2d,   v5.2d
5377*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
5378*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5379*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5380*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5381*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5382*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5383*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v2.4h
5384*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v2.8h
5385*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v3.4h
5386*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v3.8h
5387*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // Copy the last row (v2,v3) over v0,v1 so v0-v3 hold it twice.
5388*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v2.16b
5389*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
5390*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w16_hpad)
5391*c0909341SAndroid Build Coastguard Worker
        // w_pad == 2: 16 input pixels are read per row -> 8 valid outputs; the
        // second register of each row is entirely padding.
5392*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_422_w16_wpad2):
5393*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5394*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, padding 8
5395*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x1],  x2
5396*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x10], x2
5397*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v1.8h
5398*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v3.8h
5399*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #2
5400*c0909341SAndroid Build Coastguard Worker        shl             v2.8h,   v2.8h,   #2
        // Padding registers: 8 copies of each row's last valid output.
5401*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   v0.h[7]
5402*c0909341SAndroid Build Coastguard Worker        dup             v3.8h,   v2.h[7]
5403*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
5404*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5405*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5406*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5407*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5408*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5409*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v2.4h
5410*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v2.8h
5411*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v3.4h
5412*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v3.8h
5413*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // Copy the last row (v2,v3) over v0,v1 so v0-v3 hold it twice.
5414*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v2.16b
5415*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
5416*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w16_hpad)
5417*c0909341SAndroid Build Coastguard Worker
        // w_pad == 3: 8 input pixels are read per row -> 4 valid outputs,
        // followed by 12 copies of the last one.
5418*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_422_w16_wpad3):
5419*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5420*c0909341SAndroid Build Coastguard Worker1:      // Copy and subsample input, padding 12
5421*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h}, [x1],  x2
5422*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h}, [x10], x2
5423*c0909341SAndroid Build Coastguard Worker        addp            v0.8h,   v0.8h,   v0.8h
5424*c0909341SAndroid Build Coastguard Worker        addp            v2.8h,   v2.8h,   v2.8h
5425*c0909341SAndroid Build Coastguard Worker        shl             v0.4h,   v0.4h,   #2
5426*c0909341SAndroid Build Coastguard Worker        shl             v2.4h,   v2.4h,   #2
        // v1/v3 become all-padding registers; trn1 then also fills the upper
        // half of v0/v2 from them, giving {4 valid, 4 padding}.
5427*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   v0.h[3]
5428*c0909341SAndroid Build Coastguard Worker        dup             v3.8h,   v2.h[3]
5429*c0909341SAndroid Build Coastguard Worker        trn1            v0.2d,   v0.2d,   v1.2d
5430*c0909341SAndroid Build Coastguard Worker        trn1            v2.2d,   v2.2d,   v3.2d
5431*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
5432*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5433*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5434*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5435*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5436*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5437*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v2.4h
5438*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v2.8h
5439*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v3.4h
5440*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v3.8h
5441*c0909341SAndroid Build Coastguard Worker        b.gt            1b
        // Copy the last row (v2,v3) over v0,v1 so v0-v3 hold it twice.
5442*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v2.16b
5443*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
5444*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w16_hpad)
5445*c0909341SAndroid Build Coastguard Workerendfunc
5446*c0909341SAndroid Build Coastguard Worker
// Width dispatch table for ipred_cfl_ac_422_16bpc_neon. Each .word is the
// distance from the start of this table to a handler label; the function
// loads it with ldrsw and adds the table address back on before br.
// The index is clz(cw) - 27, so slot 0 = width 16, slot 1 = width 8 and
// slot 2 = width 4. The entry order must stay in step with that calculation.
5447*c0909341SAndroid Build Coastguard Workerjumptable ipred_cfl_ac_422_tbl
5448*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_422_w16) - ipred_cfl_ac_422_tbl
5449*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_422_w8)  - ipred_cfl_ac_422_tbl
5450*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_422_w4)  - ipred_cfl_ac_422_tbl
5451*c0909341SAndroid Build Coastguard Workerendjumptable
5452*c0909341SAndroid Build Coastguard Worker
// Second-level dispatch table used by L(ipred_cfl_ac_422_w16). Each .word is
// the distance from the start of this table to a handler label. The index is
// the w_pad argument itself (0..3), selecting the variants that pad 0, 4, 8
// or 12 output columns per row respectively.
5453*c0909341SAndroid Build Coastguard Workerjumptable ipred_cfl_ac_422_w16_tbl
5454*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_422_w16_wpad0) - ipred_cfl_ac_422_w16_tbl
5455*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_422_w16_wpad1) - ipred_cfl_ac_422_w16_tbl
5456*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_422_w16_wpad2) - ipred_cfl_ac_422_w16_tbl
5457*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_422_w16_wpad3) - ipred_cfl_ac_422_w16_tbl
5458*c0909341SAndroid Build Coastguard Workerendjumptable
5459*c0909341SAndroid Build Coastguard Worker
5460*c0909341SAndroid Build Coastguard Worker// void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,
5461*c0909341SAndroid Build Coastguard Worker//                            const ptrdiff_t stride, const int w_pad,
5462*c0909341SAndroid Build Coastguard Worker//                            const int h_pad, const int cw, const int ch);
5463*c0909341SAndroid Build Coastguard Workerfunction ipred_cfl_ac_444_16bpc_neon, export=1
5464*c0909341SAndroid Build Coastguard Worker        clz             w8,  w5
5465*c0909341SAndroid Build Coastguard Worker        lsl             w4,  w4,  #2
5466*c0909341SAndroid Build Coastguard Worker        movrel          x7,  ipred_cfl_ac_444_tbl
5467*c0909341SAndroid Build Coastguard Worker        sub             w8,  w8,  #26
5468*c0909341SAndroid Build Coastguard Worker        ldrsw           x8,  [x7, w8, uxtw #2]
5469*c0909341SAndroid Build Coastguard Worker        movi            v24.4s,  #0
5470*c0909341SAndroid Build Coastguard Worker        movi            v25.4s,  #0
5471*c0909341SAndroid Build Coastguard Worker        movi            v26.4s,  #0
5472*c0909341SAndroid Build Coastguard Worker        movi            v27.4s,  #0
5473*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  x8
5474*c0909341SAndroid Build Coastguard Worker        sub             w8,  w6,  w4         // height - h_pad
5475*c0909341SAndroid Build Coastguard Worker        rbit            w9,  w5              // rbit(width)
5476*c0909341SAndroid Build Coastguard Worker        rbit            w10, w6              // rbit(height)
5477*c0909341SAndroid Build Coastguard Worker        clz             w9,  w9              // ctz(width)
5478*c0909341SAndroid Build Coastguard Worker        clz             w10, w10             // ctz(height)
5479*c0909341SAndroid Build Coastguard Worker        add             w9,  w9,  w10        // log2sz
5480*c0909341SAndroid Build Coastguard Worker        add             x10, x1,  x2
5481*c0909341SAndroid Build Coastguard Worker        dup             v31.4s,  w9
5482*c0909341SAndroid Build Coastguard Worker        lsl             x2,  x2,  #1
5483*c0909341SAndroid Build Coastguard Worker        neg             v31.4s,  v31.4s      // -log2sz
5484*c0909341SAndroid Build Coastguard Worker        br              x7
5485*c0909341SAndroid Build Coastguard Worker
5486*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_444_w4):
5487*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5488*c0909341SAndroid Build Coastguard Worker1:      // Copy and expand input
5489*c0909341SAndroid Build Coastguard Worker        ld1             {v0.4h},   [x1],  x2
5490*c0909341SAndroid Build Coastguard Worker        ld1             {v0.d}[1], [x10], x2
5491*c0909341SAndroid Build Coastguard Worker        ld1             {v1.4h},   [x1],  x2
5492*c0909341SAndroid Build Coastguard Worker        ld1             {v1.d}[1], [x10], x2
5493*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #3
5494*c0909341SAndroid Build Coastguard Worker        shl             v1.8h,   v1.8h,   #3
5495*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #4
5496*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h}, [x0], #32
5497*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5498*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5499*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5500*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5501*c0909341SAndroid Build Coastguard Worker        b.gt            1b
5502*c0909341SAndroid Build Coastguard Worker        trn2            v0.2d,   v1.2d,   v1.2d
5503*c0909341SAndroid Build Coastguard Worker        trn2            v1.2d,   v1.2d,   v1.2d
5504*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w4_hpad)
5505*c0909341SAndroid Build Coastguard Worker
5506*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_444_w8):
5507*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5508*c0909341SAndroid Build Coastguard Worker1:      // Copy and expand input
5509*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h}, [x1],  x2
5510*c0909341SAndroid Build Coastguard Worker        ld1             {v1.8h}, [x10], x2
5511*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h}, [x1],  x2
5512*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #3
5513*c0909341SAndroid Build Coastguard Worker        ld1             {v3.8h}, [x10], x2
5514*c0909341SAndroid Build Coastguard Worker        shl             v1.8h,   v1.8h,   #3
5515*c0909341SAndroid Build Coastguard Worker        shl             v2.8h,   v2.8h,   #3
5516*c0909341SAndroid Build Coastguard Worker        shl             v3.8h,   v3.8h,   #3
5517*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #4
5518*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5519*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5520*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5521*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5522*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5523*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v2.4h
5524*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v2.8h
5525*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v3.4h
5526*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v3.8h
5527*c0909341SAndroid Build Coastguard Worker        b.gt            1b
5528*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v3.16b
5529*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
5530*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w8_hpad)
5531*c0909341SAndroid Build Coastguard Worker
5532*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_444_w16):
5533*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5534*c0909341SAndroid Build Coastguard Worker        cbnz            w3,  L(ipred_cfl_ac_444_w16_wpad)
5535*c0909341SAndroid Build Coastguard Worker1:      // Copy and expand input, without padding
5536*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x1],  x2
5537*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h, v3.8h}, [x10], x2
5538*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #3
5539*c0909341SAndroid Build Coastguard Worker        shl             v1.8h,   v1.8h,   #3
5540*c0909341SAndroid Build Coastguard Worker        shl             v2.8h,   v2.8h,   #3
5541*c0909341SAndroid Build Coastguard Worker        shl             v3.8h,   v3.8h,   #3
5542*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2
5543*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5544*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5545*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5546*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5547*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5548*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v2.4h
5549*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v2.8h
5550*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v3.4h
5551*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v3.8h
5552*c0909341SAndroid Build Coastguard Worker        b.gt            1b
5553*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v2.16b
5554*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
5555*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w16_hpad)
5556*c0909341SAndroid Build Coastguard Worker
// Width-16 path with right-edge padding. Each row contributes 8 loaded
// samples; the other 8 are copies of the last loaded sample (after scaling).
// Two rows are produced per iteration: one read through x1, one through x10.
// NOTE(review): x10, x2, w8 and the v24-v27 accumulators are set up before
// this chunk; presumably x10 = x1 + one row and x2 = two rows - confirm.
5557*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_444_w16_wpad):
5558*c0909341SAndroid Build Coastguard Worker1:      // Copy and expand input, padding 8
5559*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h}, [x1],  x2 // first row: 8 x 16-bit samples, x1 += x2
5560*c0909341SAndroid Build Coastguard Worker        ld1             {v2.8h}, [x10], x2 // second row: 8 x 16-bit samples, x10 += x2
5561*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #3 // scale samples by 8
5562*c0909341SAndroid Build Coastguard Worker        shl             v2.8h,   v2.8h,   #3
5563*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   v0.h[7] // padding: replicate last scaled sample of first row
5564*c0909341SAndroid Build Coastguard Worker        dup             v3.8h,   v2.h[7] // padding: replicate last scaled sample of second row
5565*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #2 // two rows consumed; flags are used by b.gt below
5566*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 // store both 16-sample rows (64 bytes)
        // Widen the stored values to 32 bit and add them to the running sums
        // in v24-v27.
5567*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5568*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5569*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5570*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5571*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v2.4h
5572*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v2.8h
5573*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v3.4h
5574*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v3.8h
5575*c0909341SAndroid Build Coastguard Worker        b.gt            1b // loop while w8 > 0
        // Leave the last stored row (v2/v3) in v0/v1 as well before handing
        // over to the shared vertical-padding code.
        // NOTE(review): L(ipred_cfl_ac_420_w16_hpad) is outside this chunk;
        // presumably it expects the row to replicate in v0/v1 and v2/v3.
5576*c0909341SAndroid Build Coastguard Worker        mov             v0.16b,  v2.16b
5577*c0909341SAndroid Build Coastguard Worker        mov             v1.16b,  v3.16b
5578*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w16_hpad)
5579*c0909341SAndroid Build Coastguard Worker
// Width-32 entry point. Selects one of the w32_wpad* loops through
// ipred_cfl_ac_444_w32_tbl, using w3 >> 1 as the table index.
// NOTE(review): w3 is presumably w_pad (the targets are named wpad0/2/4/6);
// its setup is outside this chunk.
5580*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_444_w32):
5581*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET // macro defined elsewhere; presumably marks an indirect-branch target
5582*c0909341SAndroid Build Coastguard Worker        movrel          x7,  ipred_cfl_ac_444_w32_tbl // x7 = table base
5583*c0909341SAndroid Build Coastguard Worker        lsr             w3,  w3,  #1 // halve w3 to get the table index
5584*c0909341SAndroid Build Coastguard Worker        ldrsw           x3,  [x7, w3, uxtw #2] // load signed 32-bit entry at x7 + index * 4
5585*c0909341SAndroid Build Coastguard Worker        lsr             x2,  x2,  #1 // Restore the stride to one line increments
5586*c0909341SAndroid Build Coastguard Worker        add             x7,  x7,  x3 // entries are offsets relative to the table base
5587*c0909341SAndroid Build Coastguard Worker        br              x7
5588*c0909341SAndroid Build Coastguard Worker
// Width 32, no horizontal padding: one full row of 32 samples is loaded,
// scaled by 8, stored and accumulated per iteration.
// NOTE(review): w8 (row counter) and the v24-v27 sums are initialised before
// this chunk.
5589*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_444_w32_wpad0):
5590*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5591*c0909341SAndroid Build Coastguard Worker1:      // Copy and expand input, without padding
5592*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2 // 32 x 16-bit samples, x1 += x2
5593*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #3 // scale samples by 8
5594*c0909341SAndroid Build Coastguard Worker        shl             v1.8h,   v1.8h,   #3
5595*c0909341SAndroid Build Coastguard Worker        shl             v2.8h,   v2.8h,   #3
5596*c0909341SAndroid Build Coastguard Worker        shl             v3.8h,   v3.8h,   #3
5597*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #1 // one row consumed; flags are used by b.gt below
5598*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 // store the row (64 bytes)
        // Widen to 32 bit and add to the running sums in v24-v27.
5599*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5600*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5601*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5602*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5603*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v2.4h
5604*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v2.8h
5605*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v3.4h
5606*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v3.8h
5607*c0909341SAndroid Build Coastguard Worker        b.gt            1b // loop while w8 > 0
5608*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_444_w32_hpad) // v0-v3 still hold the last row
5609*c0909341SAndroid Build Coastguard Worker
// Width 32 with 8 padded samples: 24 samples are loaded per row and the
// last 8 are copies of the final loaded sample (after scaling).
// NOTE(review): w8 (row counter) and the v24-v27 sums are initialised before
// this chunk.
5610*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_444_w32_wpad2):
5611*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5612*c0909341SAndroid Build Coastguard Worker1:      // Copy and expand input, padding 8
5613*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h, v2.8h}, [x1],  x2 // 24 x 16-bit samples, x1 += x2
5614*c0909341SAndroid Build Coastguard Worker        shl             v2.8h,   v2.8h,   #3 // v2 is scaled first: the dup below reads it
5615*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #3
5616*c0909341SAndroid Build Coastguard Worker        shl             v1.8h,   v1.8h,   #3
5617*c0909341SAndroid Build Coastguard Worker        dup             v3.8h,   v2.h[7] // padding: replicate the last scaled sample
5618*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #1 // one row consumed; flags are used by b.gt below
5619*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 // store the padded row (64 bytes)
        // Widen to 32 bit and add to the running sums in v24-v27.
5620*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5621*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5622*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5623*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5624*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v2.4h
5625*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v2.8h
5626*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v3.4h
5627*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v3.8h
5628*c0909341SAndroid Build Coastguard Worker        b.gt            1b // loop while w8 > 0
5629*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_444_w32_hpad) // v0-v3 still hold the last row
5630*c0909341SAndroid Build Coastguard Worker
// Width 32 with 16 padded samples: 16 samples are loaded per row and the
// last 16 are copies of the final loaded sample (after scaling).
// NOTE(review): w8 (row counter) and the v24-v27 sums are initialised before
// this chunk.
5631*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_444_w32_wpad4):
5632*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5633*c0909341SAndroid Build Coastguard Worker1:      // Copy and expand input, padding 16
5634*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h, v1.8h}, [x1],  x2 // 16 x 16-bit samples, x1 += x2
5635*c0909341SAndroid Build Coastguard Worker        shl             v1.8h,   v1.8h,   #3 // v1 is scaled first: the dups below read it
5636*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #3
5637*c0909341SAndroid Build Coastguard Worker        dup             v2.8h,   v1.h[7] // padding: replicate the last scaled sample
5638*c0909341SAndroid Build Coastguard Worker        dup             v3.8h,   v1.h[7]
5639*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #1 // one row consumed; flags are used by b.gt below
5640*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 // store the padded row (64 bytes)
        // Widen to 32 bit and add to the running sums in v24-v27.
5641*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5642*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5643*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5644*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5645*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v2.4h
5646*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v2.8h
5647*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v3.4h
5648*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v3.8h
5649*c0909341SAndroid Build Coastguard Worker        b.gt            1b // loop while w8 > 0
5650*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_444_w32_hpad) // v0-v3 still hold the last row
5651*c0909341SAndroid Build Coastguard Worker
// Width 32 with 24 padded samples: 8 samples are loaded per row and the
// last 24 are copies of the final loaded sample (after scaling).
// There is no branch after the loop: execution continues with the code that
// follows this block.
// NOTE(review): w8 (row counter) and the v24-v27 sums are initialised before
// this chunk.
5652*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_444_w32_wpad6):
5653*c0909341SAndroid Build Coastguard Worker        AARCH64_VALID_JUMP_TARGET
5654*c0909341SAndroid Build Coastguard Worker1:      // Copy and expand input, padding 24
5655*c0909341SAndroid Build Coastguard Worker        ld1             {v0.8h}, [x1],  x2 // 8 x 16-bit samples, x1 += x2
5656*c0909341SAndroid Build Coastguard Worker        shl             v0.8h,   v0.8h,   #3 // scale samples by 8
5657*c0909341SAndroid Build Coastguard Worker        dup             v1.8h,   v0.h[7] // padding: replicate the last scaled sample
5658*c0909341SAndroid Build Coastguard Worker        dup             v2.8h,   v0.h[7]
5659*c0909341SAndroid Build Coastguard Worker        dup             v3.8h,   v0.h[7]
5660*c0909341SAndroid Build Coastguard Worker        subs            w8,  w8,  #1 // one row consumed; flags are used by b.gt below
5661*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 // store the padded row (64 bytes)
        // Widen to 32 bit and add to the running sums in v24-v27.
5662*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5663*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5664*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5665*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5666*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v2.4h
5667*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v2.8h
5668*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v3.4h
5669*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v3.8h
5670*c0909341SAndroid Build Coastguard Worker        b.gt            1b // loop while w8 > 0, then fall through
5671*c0909341SAndroid Build Coastguard Worker
// Width-32 vertical padding and hand-off to the shared tail.
// On entry v0-v3 hold the last row written by one of the w32_wpad* loops.
// While w4 > 0 that row is stored again, two copies per iteration, and added
// to the running sums in v24-v27 each time.
// NOTE(review): w4 is presumably the number of padding rows and w6 the block
// height; both are set up before this chunk.
5672*c0909341SAndroid Build Coastguard WorkerL(ipred_cfl_ac_444_w32_hpad):
5673*c0909341SAndroid Build Coastguard Worker        cbz             w4,  3f // nothing to pad
5674*c0909341SAndroid Build Coastguard Worker2:      // Vertical padding (h_pad > 0)
5675*c0909341SAndroid Build Coastguard Worker        subs            w4,  w4,  #2 // two rows written per iteration; flags are used by b.gt below
5676*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 // first copy of the row
5677*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5678*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5679*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5680*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5681*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v2.4h
5682*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v2.8h
5683*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v3.4h
5684*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v3.8h
5685*c0909341SAndroid Build Coastguard Worker        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64 // second copy of the row
5686*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v0.4h
5687*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v0.8h
5688*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v1.4h
5689*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v1.8h
5690*c0909341SAndroid Build Coastguard Worker        uaddw           v24.4s,  v24.4s,  v2.4h
5691*c0909341SAndroid Build Coastguard Worker        uaddw2          v25.4s,  v25.4s,  v2.8h
5692*c0909341SAndroid Build Coastguard Worker        uaddw           v26.4s,  v26.4s,  v3.4h
5693*c0909341SAndroid Build Coastguard Worker        uaddw2          v27.4s,  v27.4s,  v3.8h
5694*c0909341SAndroid Build Coastguard Worker        b.gt            2b // loop while w4 > 0
5695*c0909341SAndroid Build Coastguard Worker3:
5696*c0909341SAndroid Build Coastguard Worker
5697*c0909341SAndroid Build Coastguard Worker        // Multiply the height by eight (32 = 8 * 4) and reuse the width-4 dc
        // calculation/subtraction tail.
        // NOTE(review): the target label is outside this chunk; presumably it
        // counts w6 in units of width-4 rows - confirm.
5698*c0909341SAndroid Build Coastguard Worker        lsl             w6,  w6,  #3
5699*c0909341SAndroid Build Coastguard Worker        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
5700*c0909341SAndroid Build Coastguard Workerendfunc
5701*c0909341SAndroid Build Coastguard Worker
// Per-width dispatch table for the 4:4:4 cfl_ac code. Each entry is the
// offset of a local label relative to the table's own address. Entries run
// from the widest block (w32) to the narrowest (w4); the order is part of
// the table's contract.
// NOTE(review): the code that indexes this table is outside this chunk;
// presumably the index is derived from the block width - confirm.
5702*c0909341SAndroid Build Coastguard Workerjumptable ipred_cfl_ac_444_tbl
5703*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_444_w32) - ipred_cfl_ac_444_tbl
5704*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_444_w16) - ipred_cfl_ac_444_tbl
5705*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_444_w8)  - ipred_cfl_ac_444_tbl
5706*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_444_w4)  - ipred_cfl_ac_444_tbl
5707*c0909341SAndroid Build Coastguard Workerendjumptable
5708*c0909341SAndroid Build Coastguard Worker
// Dispatch table for the width-32 horizontal-padding variants. Each entry is
// the offset of a local label relative to the table's own address.
// L(ipred_cfl_ac_444_w32) indexes it with w3 >> 1, loads the entry with
// ldrsw and adds it to the table base, so entry i is the target for w3 == 2*i.
5709*c0909341SAndroid Build Coastguard Workerjumptable ipred_cfl_ac_444_w32_tbl
5710*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_444_w32_wpad0) - ipred_cfl_ac_444_w32_tbl
5711*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_444_w32_wpad2) - ipred_cfl_ac_444_w32_tbl
5712*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_444_w32_wpad4) - ipred_cfl_ac_444_w32_tbl
5713*c0909341SAndroid Build Coastguard Worker        .word L(ipred_cfl_ac_444_w32_wpad6) - ipred_cfl_ac_444_w32_tbl
5714*c0909341SAndroid Build Coastguard Workerendjumptable
5715