xref: /aosp_15_r20/external/libdav1d/src/arm/64/ipred.S (revision c09093415860a1c2373dacd84c4fde00c507cdfd)
1/*
2 * Copyright © 2018, VideoLAN and dav1d authors
3 * Copyright © 2019, Martin Storsjo
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice, this
10 *    list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 *    this list of conditions and the following disclaimer in the documentation
14 *    and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "src/arm/asm.S"
29#include "util.S"
30
31// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
32//                             const pixel *const topleft,
33//                             const int width, const int height, const int a,
34//                             const int max_width, const int max_height);
// DC_128 prediction: no neighbour pixels are used — fill the w x h block
// with the constant 128 (midpoint of the 8bpc range).
// In:  x0 = dst, x1 = stride, w3 = width (power of two, 4..64),
//      w4 = height (multiple of 4). Remaining args are unused here.
// Dispatch: clz(width) - 25 maps width 64..4 onto table index 0..4.
// x6 = dst + stride and x1 is doubled, so the two pointers interleave
// and every loop iteration stores 4 rows.
35function ipred_dc_128_8bpc_neon, export=1
36        clz             w3,  w3
37        movrel          x5,  ipred_dc_128_tbl
38        sub             w3,  w3,  #25
39        ldrsw           x3,  [x5, w3, uxtw #2]
40        movi            v0.16b,  #128
41        add             x5,  x5,  x3
42        add             x6,  x0,  x1
43        lsl             x1,  x1,  #1
44        br              x5
// w == 4: 4-byte store per row.
4540:
46        AARCH64_VALID_JUMP_TARGET
474:
48        st1             {v0.s}[0],  [x0], x1
49        st1             {v0.s}[0],  [x6], x1
50        subs            w4,  w4,  #4
51        st1             {v0.s}[0],  [x0], x1
52        st1             {v0.s}[0],  [x6], x1
53        b.gt            4b
54        ret
// w == 8: 8-byte store per row.
5580:
56        AARCH64_VALID_JUMP_TARGET
578:
58        st1             {v0.8b},  [x0], x1
59        st1             {v0.8b},  [x6], x1
60        subs            w4,  w4,  #4
61        st1             {v0.8b},  [x0], x1
62        st1             {v0.8b},  [x6], x1
63        b.gt            8b
64        ret
// w == 16: one full vector per row.
65160:
66        AARCH64_VALID_JUMP_TARGET
6716:
68        st1             {v0.16b}, [x0], x1
69        st1             {v0.16b}, [x6], x1
70        subs            w4,  w4,  #4
71        st1             {v0.16b}, [x0], x1
72        st1             {v0.16b}, [x6], x1
73        b.gt            16b
74        ret
// w == 32: two vectors per row (v1 duplicated from the same constant).
75320:
76        AARCH64_VALID_JUMP_TARGET
77        movi            v1.16b,  #128
7832:
79        st1             {v0.16b, v1.16b}, [x0], x1
80        st1             {v0.16b, v1.16b}, [x6], x1
81        subs            w4,  w4,  #4
82        st1             {v0.16b, v1.16b}, [x0], x1
83        st1             {v0.16b, v1.16b}, [x6], x1
84        b.gt            32b
85        ret
// w == 64: four vectors per row.
86640:
87        AARCH64_VALID_JUMP_TARGET
88        movi            v1.16b,  #128
89        movi            v2.16b,  #128
90        movi            v3.16b,  #128
9164:
92        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
93        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
94        subs            w4,  w4,  #4
95        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
96        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
97        b.gt            64b
98        ret
99endfunc
100
// 32-bit offsets relative to the table base, indexed by clz(width) - 25;
// entry order is therefore width 64, 32, 16, 8, 4. Must stay in sync with
// the numeric labels (640/320/160/80/40) in ipred_dc_128_8bpc_neon above.
101jumptable ipred_dc_128_tbl
102        .word 640b - ipred_dc_128_tbl
103        .word 320b - ipred_dc_128_tbl
104        .word 160b - ipred_dc_128_tbl
105        .word 80b  - ipred_dc_128_tbl
106        .word 40b  - ipred_dc_128_tbl
107endjumptable
108
109// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
110//                        const pixel *const topleft,
111//                        const int width, const int height, const int a,
112//                        const int max_width, const int max_height);
// Vertical prediction: replicate the row of pixels directly above the
// block (topleft + 1) into every output row.
// In:  x0 = dst, x1 = stride, x2 = topleft pointer, w3 = width,
//      w4 = height (multiple of 4).
// The top row is loaded once per width class, then stored repeatedly;
// x6 = dst + stride and x1 is doubled so each loop writes 4 rows.
113function ipred_v_8bpc_neon, export=1
114        clz             w3,  w3
115        movrel          x5,  ipred_v_tbl
116        sub             w3,  w3,  #25
117        ldrsw           x3,  [x5, w3, uxtw #2]
118        add             x2,  x2,  #1
119        add             x5,  x5,  x3
120        add             x6,  x0,  x1
121        lsl             x1,  x1,  #1
122        br              x5
// w == 4
12340:
124        AARCH64_VALID_JUMP_TARGET
125        ld1             {v0.s}[0],  [x2]
1264:
127        st1             {v0.s}[0],  [x0], x1
128        st1             {v0.s}[0],  [x6], x1
129        subs            w4,  w4,  #4
130        st1             {v0.s}[0],  [x0], x1
131        st1             {v0.s}[0],  [x6], x1
132        b.gt            4b
133        ret
// w == 8
13480:
135        AARCH64_VALID_JUMP_TARGET
136        ld1             {v0.8b},  [x2]
1378:
138        st1             {v0.8b},  [x0], x1
139        st1             {v0.8b},  [x6], x1
140        subs            w4,  w4,  #4
141        st1             {v0.8b},  [x0], x1
142        st1             {v0.8b},  [x6], x1
143        b.gt            8b
144        ret
// w == 16
145160:
146        AARCH64_VALID_JUMP_TARGET
147        ld1             {v0.16b}, [x2]
14816:
149        st1             {v0.16b}, [x0], x1
150        st1             {v0.16b}, [x6], x1
151        subs            w4,  w4,  #4
152        st1             {v0.16b}, [x0], x1
153        st1             {v0.16b}, [x6], x1
154        b.gt            16b
155        ret
// w == 32
156320:
157        AARCH64_VALID_JUMP_TARGET
158        ld1             {v0.16b, v1.16b}, [x2]
15932:
160        st1             {v0.16b, v1.16b}, [x0], x1
161        st1             {v0.16b, v1.16b}, [x6], x1
162        subs            w4,  w4,  #4
163        st1             {v0.16b, v1.16b}, [x0], x1
164        st1             {v0.16b, v1.16b}, [x6], x1
165        b.gt            32b
166        ret
// w == 64
167640:
168        AARCH64_VALID_JUMP_TARGET
169        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
17064:
171        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
172        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
173        subs            w4,  w4,  #4
174        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
175        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
176        b.gt            64b
177        ret
178endfunc
179
// Offsets indexed by clz(width) - 25 (entries ordered w = 64..4);
// must match the numeric labels in ipred_v_8bpc_neon.
180jumptable ipred_v_tbl
181        .word 640b - ipred_v_tbl
182        .word 320b - ipred_v_tbl
183        .word 160b - ipred_v_tbl
184        .word 80b  - ipred_v_tbl
185        .word 40b  - ipred_v_tbl
186endjumptable
187
188// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
189//                        const pixel *const topleft,
190//                        const int width, const int height, const int a,
191//                        const int max_width, const int max_height);
// Horizontal prediction: each output row is filled with the left
// neighbour pixel of that row.
// In:  x0 = dst, x1 = stride, x2 = topleft pointer, w3 = width,
//      w4 = height (multiple of 4).
// x2 is moved to topleft - 4 and stepped by x7 = -4 each iteration, so
// ld4r broadcasts 4 consecutive left pixels at once into v0..v3.
// The bytes at increasing addresses are the left pixels from bottom to
// top of the 4-row group, hence rows are stored in order v3, v2, v1, v0.
// x6 = dst + stride, x1 doubled: 4 rows per loop iteration.
192function ipred_h_8bpc_neon, export=1
193        clz             w3,  w3
194        movrel          x5,  ipred_h_tbl
195        sub             w3,  w3,  #25
196        ldrsw           x3,  [x5, w3, uxtw #2]
197        sub             x2,  x2,  #4
198        add             x5,  x5,  x3
199        mov             x7,  #-4
200        add             x6,  x0,  x1
201        lsl             x1,  x1,  #1
202        br              x5
// w == 4
20340:
204        AARCH64_VALID_JUMP_TARGET
2054:
206        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
207        st1             {v3.s}[0],  [x0], x1
208        st1             {v2.s}[0],  [x6], x1
209        subs            w4,  w4,  #4
210        st1             {v1.s}[0],  [x0], x1
211        st1             {v0.s}[0],  [x6], x1
212        b.gt            4b
213        ret
// w == 8
21480:
215        AARCH64_VALID_JUMP_TARGET
2168:
217        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
218        st1             {v3.8b},  [x0], x1
219        st1             {v2.8b},  [x6], x1
220        subs            w4,  w4,  #4
221        st1             {v1.8b},  [x0], x1
222        st1             {v0.8b},  [x6], x1
223        b.gt            8b
224        ret
// w == 16
225160:
226        AARCH64_VALID_JUMP_TARGET
22716:
228        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
229        st1             {v3.16b}, [x0], x1
230        st1             {v2.16b}, [x6], x1
231        subs            w4,  w4,  #4
232        st1             {v1.16b}, [x0], x1
233        st1             {v0.16b}, [x6], x1
234        b.gt            16b
235        ret
// w == 32: second 16 bytes of each row written with plain str before the
// post-indexed st1 advances the row pointer.
236320:
237        AARCH64_VALID_JUMP_TARGET
23832:
239        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
240        str             q3,  [x0, #16]
241        str             q2,  [x6, #16]
242        st1             {v3.16b}, [x0], x1
243        st1             {v2.16b}, [x6], x1
244        subs            w4,  w4,  #4
245        str             q1,  [x0, #16]
246        str             q0,  [x6, #16]
247        st1             {v1.16b}, [x0], x1
248        st1             {v0.16b}, [x6], x1
249        b.gt            32b
250        ret
// w == 64: bytes 16..63 of each row via str/stp, byte 0..15 via st1.
251640:
252        AARCH64_VALID_JUMP_TARGET
25364:
254        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
255        str             q3,  [x0, #16]
256        str             q2,  [x6, #16]
257        stp             q3,  q3,  [x0, #32]
258        stp             q2,  q2,  [x6, #32]
259        st1             {v3.16b}, [x0], x1
260        st1             {v2.16b}, [x6], x1
261        subs            w4,  w4,  #4
262        str             q1,  [x0, #16]
263        str             q0,  [x6, #16]
264        stp             q1,  q1,  [x0, #32]
265        stp             q0,  q0,  [x6, #32]
266        st1             {v1.16b}, [x0], x1
267        st1             {v0.16b}, [x6], x1
268        b.gt            64b
269        ret
270endfunc
271
// Offsets indexed by clz(width) - 25 (entries ordered w = 64..4);
// must match the numeric labels in ipred_h_8bpc_neon.
272jumptable ipred_h_tbl
273        .word 640b - ipred_h_tbl
274        .word 320b - ipred_h_tbl
275        .word 160b - ipred_h_tbl
276        .word 80b  - ipred_h_tbl
277        .word 40b  - ipred_h_tbl
278endjumptable
279
280// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
281//                             const pixel *const topleft,
282//                             const int width, const int height, const int a,
283//                             const int max_width, const int max_height);
// DC_TOP prediction: fill the block with the rounded average of the
// `width` pixels above it (topleft + 1). Width is a power of two, so the
// division is a rounding narrowing shift (rshrn by log2 of the summed
// element count).
// In:  x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height.
// x6 = dst + stride, x1 doubled: 4 rows per loop iteration.
284function ipred_dc_top_8bpc_neon, export=1
285        clz             w3,  w3
286        movrel          x5,  ipred_dc_top_tbl
287        sub             w3,  w3,  #25
288        ldrsw           x3,  [x5, w3, uxtw #2]
289        add             x2,  x2,  #1
290        add             x5,  x5,  x3
291        add             x6,  x0,  x1
292        lsl             x1,  x1,  #1
293        br              x5
// w == 4: ld1r loads the 4 top pixels into both halves, so uaddlv over
// 8 bytes yields 2*sum and the >>3 produces a rounded sum/4.
29440:
295        AARCH64_VALID_JUMP_TARGET
296        ld1r            {v0.2s},  [x2]
297        uaddlv          h0,      v0.8b
298        rshrn           v0.8b,   v0.8h,   #3
299        dup             v0.8b,   v0.b[0]
3004:
301        st1             {v0.s}[0],  [x0], x1
302        st1             {v0.s}[0],  [x6], x1
303        subs            w4,  w4,  #4
304        st1             {v0.s}[0],  [x0], x1
305        st1             {v0.s}[0],  [x6], x1
306        b.gt            4b
307        ret
// w == 8: sum of 8 pixels, rounded >>3.
30880:
309        AARCH64_VALID_JUMP_TARGET
310        ld1             {v0.8b},  [x2]
311        uaddlv          h0,      v0.8b
312        rshrn           v0.8b,   v0.8h,   #3
313        dup             v0.8b,   v0.b[0]
3148:
315        st1             {v0.8b},  [x0], x1
316        st1             {v0.8b},  [x6], x1
317        subs            w4,  w4,  #4
318        st1             {v0.8b},  [x0], x1
319        st1             {v0.8b},  [x6], x1
320        b.gt            8b
321        ret
// w == 16: sum of 16 pixels, rounded >>4.
322160:
323        AARCH64_VALID_JUMP_TARGET
324        ld1             {v0.16b}, [x2]
325        uaddlv          h0,      v0.16b
326        rshrn           v0.8b,   v0.8h,   #4
327        dup             v0.16b,  v0.b[0]
32816:
329        st1             {v0.16b}, [x0], x1
330        st1             {v0.16b}, [x6], x1
331        subs            w4,  w4,  #4
332        st1             {v0.16b}, [x0], x1
333        st1             {v0.16b}, [x6], x1
334        b.gt            16b
335        ret
// w == 32: two partial sums combined, rounded >>5.
336320:
337        AARCH64_VALID_JUMP_TARGET
338        ld1             {v0.16b, v1.16b}, [x2]
339        uaddlv          h0,      v0.16b
340        uaddlv          h1,      v1.16b
341        add             v2.4h,   v0.4h,   v1.4h
342        rshrn           v2.8b,   v2.8h,   #5
343        dup             v0.16b,  v2.b[0]
344        dup             v1.16b,  v2.b[0]
34532:
346        st1             {v0.16b, v1.16b}, [x0], x1
347        st1             {v0.16b, v1.16b}, [x6], x1
348        subs            w4,  w4,  #4
349        st1             {v0.16b, v1.16b}, [x0], x1
350        st1             {v0.16b, v1.16b}, [x6], x1
351        b.gt            32b
352        ret
// w == 64: four partial sums combined, rounded >>6.
353640:
354        AARCH64_VALID_JUMP_TARGET
355        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
356        uaddlv          h0,      v0.16b
357        uaddlv          h1,      v1.16b
358        uaddlv          h2,      v2.16b
359        uaddlv          h3,      v3.16b
360        add             v4.4h,   v0.4h,   v1.4h
361        add             v5.4h,   v2.4h,   v3.4h
362        add             v4.4h,   v4.4h,   v5.4h
363        rshrn           v4.8b,   v4.8h,   #6
364        dup             v0.16b,  v4.b[0]
365        dup             v1.16b,  v4.b[0]
366        dup             v2.16b,  v4.b[0]
367        dup             v3.16b,  v4.b[0]
36864:
369        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
370        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
371        subs            w4,  w4,  #4
372        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
373        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
374        b.gt            64b
375        ret
376endfunc
377
// Offsets indexed by clz(width) - 25 (entries ordered w = 64..4);
// must match the numeric labels in ipred_dc_top_8bpc_neon.
378jumptable ipred_dc_top_tbl
379        .word 640b - ipred_dc_top_tbl
380        .word 320b - ipred_dc_top_tbl
381        .word 160b - ipred_dc_top_tbl
382        .word 80b  - ipred_dc_top_tbl
383        .word 40b  - ipred_dc_top_tbl
384endjumptable
385
386// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
387//                              const pixel *const topleft,
388//                              const int width, const int height, const int a,
389//                              const int max_width, const int max_height);
// DC_LEFT prediction: fill the block with the rounded average of the
// `height` pixels to its left (topleft - height .. topleft - 1).
// Two-level dispatch from one shared jump table:
//   x5 = hN entry (indexed by clz(height) - 25) computes the DC value,
//   x3 = wN entry (indexed by clz(width) - 20, i.e. 5 entries later)
//        broadcasts it and runs the store loop; hN ends with `br x3`.
// In:  x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height.
// x6 = dst + stride, x1 doubled: 4 rows per loop iteration.
390function ipred_dc_left_8bpc_neon, export=1
391        sub             x2,  x2,  w4, uxtw
392        clz             w3,  w3
393        clz             w7,  w4
394        movrel          x5,  ipred_dc_left_tbl
395        sub             w3,  w3,  #20 // 25 leading bits, minus table offset 5
396        sub             w7,  w7,  #25
397        ldrsw           x3,  [x5, w3, uxtw #2]
398        ldrsw           x7,  [x5, w7, uxtw #2]
399        add             x3,  x5,  x3
400        add             x5,  x5,  x7
401        add             x6,  x0,  x1
402        lsl             x1,  x1,  #1
403        br              x5
404
// h == 4: ld1r duplicates the 4 left pixels, so uaddlv gives 2*sum and
// the rounded >>3 yields sum/4.
405L(ipred_dc_left_h4):
406        AARCH64_VALID_JUMP_TARGET
407        ld1r            {v0.2s},  [x2]
408        uaddlv          h0,      v0.8b
409        rshrn           v0.8b,   v0.8h,   #3
410        dup             v0.16b,  v0.b[0]
411        br              x3
412L(ipred_dc_left_w4):
413        AARCH64_VALID_JUMP_TARGET
4141:
415        st1             {v0.s}[0],  [x0], x1
416        st1             {v0.s}[0],  [x6], x1
417        subs            w4,  w4,  #4
418        st1             {v0.s}[0],  [x0], x1
419        st1             {v0.s}[0],  [x6], x1
420        b.gt            1b
421        ret
422
// h == 8: sum of 8 left pixels, rounded >>3.
423L(ipred_dc_left_h8):
424        AARCH64_VALID_JUMP_TARGET
425        ld1             {v0.8b},  [x2]
426        uaddlv          h0,      v0.8b
427        rshrn           v0.8b,   v0.8h,   #3
428        dup             v0.16b,  v0.b[0]
429        br              x3
430L(ipred_dc_left_w8):
431        AARCH64_VALID_JUMP_TARGET
4321:
433        st1             {v0.8b},  [x0], x1
434        st1             {v0.8b},  [x6], x1
435        subs            w4,  w4,  #4
436        st1             {v0.8b},  [x0], x1
437        st1             {v0.8b},  [x6], x1
438        b.gt            1b
439        ret
440
// h == 16: sum of 16 left pixels, rounded >>4.
441L(ipred_dc_left_h16):
442        AARCH64_VALID_JUMP_TARGET
443        ld1             {v0.16b}, [x2]
444        uaddlv          h0,      v0.16b
445        rshrn           v0.8b,   v0.8h,   #4
446        dup             v0.16b,  v0.b[0]
447        br              x3
448L(ipred_dc_left_w16):
449        AARCH64_VALID_JUMP_TARGET
4501:
451        st1             {v0.16b}, [x0], x1
452        st1             {v0.16b}, [x6], x1
453        subs            w4,  w4,  #4
454        st1             {v0.16b}, [x0], x1
455        st1             {v0.16b}, [x6], x1
456        b.gt            1b
457        ret
458
// h == 32: two partial sums, rounded >>5.
459L(ipred_dc_left_h32):
460        AARCH64_VALID_JUMP_TARGET
461        ld1             {v0.16b, v1.16b}, [x2]
462        uaddlv          h0,      v0.16b
463        uaddlv          h1,      v1.16b
464        add             v0.4h,   v0.4h,   v1.4h
465        rshrn           v0.8b,   v0.8h,   #5
466        dup             v0.16b,  v0.b[0]
467        br              x3
468L(ipred_dc_left_w32):
469        AARCH64_VALID_JUMP_TARGET
470        mov             v1.16b,  v0.16b
4711:
472        st1             {v0.16b, v1.16b}, [x0], x1
473        st1             {v0.16b, v1.16b}, [x6], x1
474        subs            w4,  w4,  #4
475        st1             {v0.16b, v1.16b}, [x0], x1
476        st1             {v0.16b, v1.16b}, [x6], x1
477        b.gt            1b
478        ret
479
// h == 64: four partial sums, rounded >>6.
480L(ipred_dc_left_h64):
481        AARCH64_VALID_JUMP_TARGET
482        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
483        uaddlv          h0,      v0.16b
484        uaddlv          h1,      v1.16b
485        uaddlv          h2,      v2.16b
486        uaddlv          h3,      v3.16b
487        add             v0.4h,   v0.4h,   v1.4h
488        add             v2.4h,   v2.4h,   v3.4h
489        add             v0.4h,   v0.4h,   v2.4h
490        rshrn           v0.8b,   v0.8h,   #6
491        dup             v0.16b,  v0.b[0]
492        br              x3
493L(ipred_dc_left_w64):
494        AARCH64_VALID_JUMP_TARGET
495        mov             v1.16b,  v0.16b
496        mov             v2.16b,  v0.16b
497        mov             v3.16b,  v0.16b
4981:
499        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
500        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
501        subs            w4,  w4,  #4
502        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
503        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
504        b.gt            1b
505        ret
506endfunc
507
// Combined table: first 5 entries are the height (sum) entry points,
// indexed by clz(height) - 25; the next 5 are the width (store) entry
// points, indexed by clz(width) - 20. Both halves ordered 64..4.
508jumptable ipred_dc_left_tbl
509        .word L(ipred_dc_left_h64) - ipred_dc_left_tbl
510        .word L(ipred_dc_left_h32) - ipred_dc_left_tbl
511        .word L(ipred_dc_left_h16) - ipred_dc_left_tbl
512        .word L(ipred_dc_left_h8)  - ipred_dc_left_tbl
513        .word L(ipred_dc_left_h4)  - ipred_dc_left_tbl
514        .word L(ipred_dc_left_w64) - ipred_dc_left_tbl
515        .word L(ipred_dc_left_w32) - ipred_dc_left_tbl
516        .word L(ipred_dc_left_w16) - ipred_dc_left_tbl
517        .word L(ipred_dc_left_w8)  - ipred_dc_left_tbl
518        .word L(ipred_dc_left_w4)  - ipred_dc_left_tbl
519endjumptable
520
521// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride,
522//                         const pixel *const topleft,
523//                         const int width, const int height, const int a,
524//                         const int max_width, const int max_height);
// DC prediction: fill the block with the rounded average of the `height`
// left pixels plus the `width` top pixels.
// Two-level dispatch, like ipred_dc_left: x5 = hN entry (sums the left
// column, indexed by clz(height) - 25), which ends in `br x3` to the wN
// entry (sums the top row and stores, indexed by clz(width) - 20).
// Division by width + height:
//   v16 = (width + height) >> 1  (rounding bias added to the sum),
//   v17 = -ctz(width + height)   (ushl with a negative shift divides by
//                                 the power-of-two part).
// When width != height the total is 3*2^k or 5*2^k, so a final sqdmulh
// by a fixed-point constant (variants of 0x5556 ~= 2/3 and 0x3334 ~= 2/5
// in doubling-multiply form) completes the division.
// In:  x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height.
// x6 = dst + stride, x1 doubled: 4 rows per store-loop iteration.
525function ipred_dc_8bpc_neon, export=1
526        sub             x2,  x2,  w4, uxtw
527        add             w7,  w3,  w4             // width + height
528        clz             w3,  w3
529        clz             w6,  w4
530        dup             v16.8h, w7               // width + height
531        movrel          x5,  ipred_dc_tbl
532        rbit            w7,  w7                  // rbit(width + height)
533        sub             w3,  w3,  #20            // 25 leading bits, minus table offset 5
534        sub             w6,  w6,  #25
535        clz             w7,  w7                  // ctz(width + height)
536        ldrsw           x3,  [x5, w3, uxtw #2]
537        ldrsw           x6,  [x5, w6, uxtw #2]
538        neg             w7,  w7                  // -ctz(width + height)
539        add             x3,  x5,  x3
540        add             x5,  x5,  x6
541        ushr            v16.8h,  v16.8h,  #1     // (width + height) >> 1
542        dup             v17.8h,  w7              // -ctz(width + height)
543        add             x6,  x0,  x1
544        lsl             x1,  x1,  #1
545        br              x5
546
// h == 4: load 4 left pixels, zero the upper lane so uaddlv sums only
// them; advance x2 past the left column and topleft to the top row.
547L(ipred_dc_h4):
548        AARCH64_VALID_JUMP_TARGET
549        ld1             {v0.s}[0],  [x2], #4
550        ins             v0.s[1], wzr
551        uaddlv          h0,      v0.8b
552        add             x2,  x2,  #1
553        br              x3
// w == 4: add the top-row sum and the rounding bias, shift; for
// non-square blocks pick the 2/3 (h=8) or 2/5 (h=16) constant by
// shifting a packed constant pair right by 2*h (mod 32).
554L(ipred_dc_w4):
555        AARCH64_VALID_JUMP_TARGET
556        ld1             {v1.s}[0],  [x2]
557        ins             v1.s[1], wzr
558        add             v0.4h,   v0.4h,   v16.4h
559        uaddlv          h1,      v1.8b
560        cmp             w4,  #4
561        add             v0.4h,   v0.4h,   v1.4h
562        ushl            v0.4h,   v0.4h,   v17.4h
563        b.eq            1f
564        // h = 8/16
565        mov             w16, #(0x3334/2)
566        movk            w16, #(0x5556/2), lsl #16
567        add             w17, w4,  w4  // w17 = 2*h = 16 or 32
568        lsr             w16, w16, w17
569        dup             v16.4h,  w16
570        sqdmulh         v0.4h,   v0.4h,   v16.4h
5711:
572        dup             v0.8b,   v0.b[0]
5732:
574        st1             {v0.s}[0],  [x0], x1
575        st1             {v0.s}[0],  [x6], x1
576        subs            w4,  w4,  #4
577        st1             {v0.s}[0],  [x0], x1
578        st1             {v0.s}[0],  [x6], x1
579        b.gt            2b
580        ret
581
// h == 8: sum the 8 left pixels, step to the top row.
582L(ipred_dc_h8):
583        AARCH64_VALID_JUMP_TARGET
584        ld1             {v0.8b},  [x2], #8
585        uaddlv          h0,      v0.8b
586        add             x2,  x2,  #1
587        br              x3
588L(ipred_dc_w8):
589        AARCH64_VALID_JUMP_TARGET
590        ld1             {v1.8b},  [x2]
591        add             v0.4h,   v0.4h,   v16.4h
592        uaddlv          h1,      v1.8b
593        cmp             w4,  #8
594        add             v0.4h,   v0.4h,   v1.4h
595        ushl            v0.4h,   v0.4h,   v17.4h
596        b.eq            1f
597        // h = 4/16/32
598        cmp             w4,  #32
599        mov             w16, #(0x3334/2)
600        mov             w17, #(0x5556/2)
601        csel            w16, w16, w17, eq
602        dup             v16.4h,  w16
603        sqdmulh         v0.4h,   v0.4h,   v16.4h
6041:
605        dup             v0.8b,   v0.b[0]
6062:
607        st1             {v0.8b},  [x0], x1
608        st1             {v0.8b},  [x6], x1
609        subs            w4,  w4,  #4
610        st1             {v0.8b},  [x0], x1
611        st1             {v0.8b},  [x6], x1
612        b.gt            2b
613        ret
614
// h == 16
615L(ipred_dc_h16):
616        AARCH64_VALID_JUMP_TARGET
617        ld1             {v0.16b}, [x2], #16
618        uaddlv          h0,      v0.16b
619        add             x2,  x2,  #1
620        br              x3
621L(ipred_dc_w16):
622        AARCH64_VALID_JUMP_TARGET
623        ld1             {v1.16b}, [x2]
624        add             v0.4h,   v0.4h,   v16.4h
625        uaddlv          h1,      v1.16b
626        cmp             w4,  #16
627        add             v0.4h,   v0.4h,   v1.4h
628        ushl            v0.4h,   v0.4h,   v17.4h
629        b.eq            1f
630        // h = 4/8/32/64
631        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
632        mov             w16, #(0x3334/2)
633        mov             w17, #(0x5556/2)
634        csel            w16, w16, w17, eq
635        dup             v16.4h,  w16
636        sqdmulh         v0.4h,   v0.4h,   v16.4h
6371:
638        dup             v0.16b,  v0.b[0]
6392:
640        st1             {v0.16b}, [x0], x1
641        st1             {v0.16b}, [x6], x1
642        subs            w4,  w4,  #4
643        st1             {v0.16b}, [x0], x1
644        st1             {v0.16b}, [x6], x1
645        b.gt            2b
646        ret
647
// h == 32: two partial sums combined before jumping to the width path.
648L(ipred_dc_h32):
649        AARCH64_VALID_JUMP_TARGET
650        ld1             {v0.16b, v1.16b}, [x2], #32
651        uaddlv          h0,      v0.16b
652        uaddlv          h1,      v1.16b
653        add             x2,  x2,  #1
654        add             v0.4h,   v0.4h,   v1.4h
655        br              x3
656L(ipred_dc_w32):
657        AARCH64_VALID_JUMP_TARGET
658        ld1             {v1.16b, v2.16b}, [x2]
659        add             v0.4h,   v0.4h,   v16.4h
660        uaddlv          h1,      v1.16b
661        uaddlv          h2,      v2.16b
662        cmp             w4,  #32
663        add             v0.4h,   v0.4h,   v1.4h
664        add             v0.4h,   v0.4h,   v2.4h
665        ushl            v4.4h,   v0.4h,   v17.4h
666        b.eq            1f
667        // h = 8/16/64
668        cmp             w4,  #8
669        mov             w16, #(0x3334/2)
670        mov             w17, #(0x5556/2)
671        csel            w16, w16, w17, eq
672        dup             v16.4h,  w16
673        sqdmulh         v4.4h,   v4.4h,   v16.4h
6741:
675        dup             v0.16b,  v4.b[0]
676        dup             v1.16b,  v4.b[0]
6772:
678        st1             {v0.16b, v1.16b}, [x0], x1
679        st1             {v0.16b, v1.16b}, [x6], x1
680        subs            w4,  w4,  #4
681        st1             {v0.16b, v1.16b}, [x0], x1
682        st1             {v0.16b, v1.16b}, [x6], x1
683        b.gt            2b
684        ret
685
// h == 64: four partial sums combined before jumping to the width path.
686L(ipred_dc_h64):
687        AARCH64_VALID_JUMP_TARGET
688        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64
689        uaddlv          h0,      v0.16b
690        uaddlv          h1,      v1.16b
691        uaddlv          h2,      v2.16b
692        uaddlv          h3,      v3.16b
693        add             v0.4h,   v0.4h,   v1.4h
694        add             v2.4h,   v2.4h,   v3.4h
695        add             x2,  x2,  #1
696        add             v0.4h,   v0.4h,   v2.4h
697        br              x3
698L(ipred_dc_w64):
699        AARCH64_VALID_JUMP_TARGET
700        ld1             {v1.16b, v2.16b, v3.16b, v4.16b}, [x2]
701        add             v0.4h,   v0.4h,   v16.4h
702        uaddlv          h1,      v1.16b
703        uaddlv          h2,      v2.16b
704        uaddlv          h3,      v3.16b
705        uaddlv          h4,      v4.16b
706        add             v1.4h,   v1.4h,   v2.4h
707        add             v3.4h,   v3.4h,   v4.4h
708        cmp             w4,  #64
709        add             v0.4h,   v0.4h,   v1.4h
710        add             v0.4h,   v0.4h,   v3.4h
711        ushl            v4.4h,   v0.4h,   v17.4h
712        b.eq            1f
713        // h = 16/32
714        mov             w16, #(0x5556/2)
715        movk            w16, #(0x3334/2), lsl #16
716        lsr             w16, w16, w4
717        dup             v16.4h,  w16
718        sqdmulh         v4.4h,   v4.4h,   v16.4h
7191:
720        dup             v0.16b,  v4.b[0]
721        dup             v1.16b,  v4.b[0]
722        dup             v2.16b,  v4.b[0]
723        dup             v3.16b,  v4.b[0]
7242:
725        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
726        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
727        subs            w4,  w4,  #4
728        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
729        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
730        b.gt            2b
731        ret
732endfunc
733
// Combined table: first 5 entries are the height (left-sum) entry points,
// indexed by clz(height) - 25; the next 5 are the width (top-sum + store)
// entry points, indexed by clz(width) - 20. Both halves ordered 64..4.
734jumptable ipred_dc_tbl
735        .word L(ipred_dc_h64) - ipred_dc_tbl
736        .word L(ipred_dc_h32) - ipred_dc_tbl
737        .word L(ipred_dc_h16) - ipred_dc_tbl
738        .word L(ipred_dc_h8)  - ipred_dc_tbl
739        .word L(ipred_dc_h4)  - ipred_dc_tbl
740        .word L(ipred_dc_w64) - ipred_dc_tbl
741        .word L(ipred_dc_w32) - ipred_dc_tbl
742        .word L(ipred_dc_w16) - ipred_dc_tbl
743        .word L(ipred_dc_w8)  - ipred_dc_tbl
744        .word L(ipred_dc_w4)  - ipred_dc_tbl
745endjumptable
746
747// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
748//                            const pixel *const topleft,
749//                            const int width, const int height, const int a,
750//                            const int max_width, const int max_height);
751function ipred_paeth_8bpc_neon, export=1
752        clz             w9,  w3
753        movrel          x5,  ipred_paeth_tbl
754        sub             w9,  w9,  #25
755        ldrsw           x9,  [x5, w9, uxtw #2]
756        ld1r            {v4.16b},  [x2]
757        add             x8,  x2,  #1
758        sub             x2,  x2,  #4
759        add             x5,  x5,  x9
760        mov             x7,  #-4
761        add             x6,  x0,  x1
762        lsl             x1,  x1,  #1
763        br              x5
76440:
765        AARCH64_VALID_JUMP_TARGET
766        ld1r            {v5.4s},  [x8]
767        usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft
7684:
769        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
770        zip1            v0.2s,   v0.2s,   v1.2s
771        zip1            v2.2s,   v2.2s,   v3.2s
772        uaddw           v16.8h,  v6.8h,   v0.8b
773        uaddw           v17.8h,  v6.8h,   v2.8b
774        sqxtun          v16.8b,  v16.8h           // base
775        sqxtun2         v16.16b, v17.8h
776        zip1            v0.2d,   v0.2d,   v2.2d
777        uabd            v20.16b, v5.16b,  v16.16b // tdiff
778        uabd            v22.16b, v4.16b,  v16.16b // tldiff
779        uabd            v16.16b, v0.16b,  v16.16b // ldiff
780        umin            v18.16b, v20.16b, v22.16b // min(tdiff, tldiff)
781        cmhs            v20.16b, v22.16b, v20.16b // tldiff >= tdiff
782        cmhs            v16.16b, v18.16b, v16.16b // min(tdiff, tldiff) >= ldiff
783        bsl             v20.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
784        bit             v20.16b, v0.16b,  v16.16b // ldiff <= min ? left : ...
785        st1             {v20.s}[3], [x0], x1
786        st1             {v20.s}[2], [x6], x1
787        subs            w4,  w4,  #4
788        st1             {v20.s}[1], [x0], x1
789        st1             {v20.s}[0], [x6], x1
790        b.gt            4b
791        ret
79280:
793        AARCH64_VALID_JUMP_TARGET
794        ld1r            {v5.2d},  [x8]
795        usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft
7968:
797        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
798        uaddw           v16.8h,  v6.8h,   v0.8b
799        uaddw           v17.8h,  v6.8h,   v1.8b
800        uaddw           v18.8h,  v6.8h,   v2.8b
801        uaddw           v19.8h,  v6.8h,   v3.8b
802        sqxtun          v16.8b,  v16.8h           // base
803        sqxtun2         v16.16b, v17.8h
804        sqxtun          v18.8b,  v18.8h
805        sqxtun2         v18.16b, v19.8h
806        zip1            v2.2d,   v2.2d,   v3.2d
807        zip1            v0.2d,   v0.2d,   v1.2d
808        uabd            v21.16b, v5.16b,  v18.16b // tdiff
809        uabd            v20.16b, v5.16b,  v16.16b
810        uabd            v23.16b, v4.16b,  v18.16b // tldiff
811        uabd            v22.16b, v4.16b,  v16.16b
812        uabd            v17.16b, v2.16b,  v18.16b // ldiff
813        uabd            v16.16b, v0.16b,  v16.16b
814        umin            v19.16b, v21.16b, v23.16b // min(tdiff, tldiff)
815        umin            v18.16b, v20.16b, v22.16b
816        cmhs            v21.16b, v23.16b, v21.16b // tldiff >= tdiff
817        cmhs            v20.16b, v22.16b, v20.16b
818        cmhs            v17.16b, v19.16b, v17.16b // min(tdiff, tldiff) >= ldiff
819        cmhs            v16.16b, v18.16b, v16.16b
820        bsl             v21.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
821        bsl             v20.16b, v5.16b,  v4.16b
822        bit             v21.16b, v2.16b,  v17.16b // ldiff <= min ? left : ...
823        bit             v20.16b, v0.16b,  v16.16b
824        st1             {v21.d}[1], [x0], x1
825        st1             {v21.d}[0], [x6], x1
826        subs            w4,  w4,  #4
827        st1             {v20.d}[1], [x0], x1
828        st1             {v20.d}[0], [x6], x1
829        b.gt            8b
830        ret
831160:
832320:
833640:
834        AARCH64_VALID_JUMP_TARGET
835        ld1             {v5.16b},  [x8], #16
836        mov             w9,  w3
837        // Set up pointers for four rows in parallel; x0, x6, x5, x10
838        add             x5,  x0,  x1
839        add             x10, x6,  x1
840        lsl             x1,  x1,  #1
841        sub             x1,  x1,  w3, uxtw
8421:
843        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
8442:
845        usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft
846        usubl2          v7.8h,   v5.16b,  v4.16b
847        uaddw           v24.8h,  v6.8h,   v0.8b
848        uaddw           v25.8h,  v7.8h,   v0.8b
849        uaddw           v26.8h,  v6.8h,   v1.8b
850        uaddw           v27.8h,  v7.8h,   v1.8b
851        uaddw           v28.8h,  v6.8h,   v2.8b
852        uaddw           v29.8h,  v7.8h,   v2.8b
853        uaddw           v30.8h,  v6.8h,   v3.8b
854        uaddw           v31.8h,  v7.8h,   v3.8b
855        sqxtun          v17.8b,  v26.8h           // base
856        sqxtun2         v17.16b, v27.8h
857        sqxtun          v16.8b,  v24.8h
858        sqxtun2         v16.16b, v25.8h
859        sqxtun          v19.8b,  v30.8h
860        sqxtun2         v19.16b, v31.8h
861        sqxtun          v18.8b,  v28.8h
862        sqxtun2         v18.16b, v29.8h
863        uabd            v23.16b, v5.16b,  v19.16b // tdiff
864        uabd            v22.16b, v5.16b,  v18.16b
865        uabd            v21.16b, v5.16b,  v17.16b
866        uabd            v20.16b, v5.16b,  v16.16b
867        uabd            v27.16b, v4.16b,  v19.16b // tldiff
868        uabd            v26.16b, v4.16b,  v18.16b
869        uabd            v25.16b, v4.16b,  v17.16b
870        uabd            v24.16b, v4.16b,  v16.16b
871        uabd            v19.16b, v3.16b,  v19.16b // ldiff
872        uabd            v18.16b, v2.16b,  v18.16b
873        uabd            v17.16b, v1.16b,  v17.16b
874        uabd            v16.16b, v0.16b,  v16.16b
875        umin            v31.16b, v23.16b, v27.16b // min(tdiff, tldiff)
876        umin            v30.16b, v22.16b, v26.16b
877        umin            v29.16b, v21.16b, v25.16b
878        umin            v28.16b, v20.16b, v24.16b
879        cmhs            v23.16b, v27.16b, v23.16b // tldiff >= tdiff
880        cmhs            v22.16b, v26.16b, v22.16b
881        cmhs            v21.16b, v25.16b, v21.16b
882        cmhs            v20.16b, v24.16b, v20.16b
883        cmhs            v19.16b, v31.16b, v19.16b // min(tdiff, tldiff) >= ldiff
884        cmhs            v18.16b, v30.16b, v18.16b
885        cmhs            v17.16b, v29.16b, v17.16b
886        cmhs            v16.16b, v28.16b, v16.16b
887        bsl             v23.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
888        bsl             v22.16b, v5.16b,  v4.16b
889        bsl             v21.16b, v5.16b,  v4.16b
890        bsl             v20.16b, v5.16b,  v4.16b
891        bit             v23.16b, v3.16b,  v19.16b // ldiff <= min ? left : ...
892        bit             v22.16b, v2.16b,  v18.16b
893        bit             v21.16b, v1.16b,  v17.16b
894        bit             v20.16b, v0.16b,  v16.16b
895        subs            w3,  w3,  #16
896        st1             {v23.16b}, [x0],  #16
897        st1             {v22.16b}, [x6],  #16
898        st1             {v21.16b}, [x5],  #16
899        st1             {v20.16b}, [x10], #16
900        b.le            8f
901        ld1             {v5.16b},  [x8], #16
902        b               2b
9038:
904        subs            w4,  w4,  #4
905        b.le            9f
906        // End of horizontal loop, move pointers to next four rows
907        sub             x8,  x8,  w9, uxtw
908        add             x0,  x0,  x1
909        add             x6,  x6,  x1
910        // Load the top row as early as possible
911        ld1             {v5.16b},  [x8], #16
912        add             x5,  x5,  x1
913        add             x10, x10, x1
914        mov             w3,  w9
915        b               1b
9169:
917        ret
918endfunc
919
// Jump table for the paeth predictor above; table-relative offsets to the
// per-width code paths, largest width first.
jumptable ipred_paeth_tbl
        .word 640b - ipred_paeth_tbl
        .word 320b - ipred_paeth_tbl
        .word 160b - ipred_paeth_tbl
        .word 80b  - ipred_paeth_tbl
        .word 40b  - ipred_paeth_tbl
endjumptable
927
928// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
929//                             const pixel *const topleft,
930//                             const int width, const int height, const int a,
931//                             const int max_width, const int max_height);
// SMOOTH prediction: each output pixel is the average of a horizontal
// blend between its row's left-edge pixel and the top-right pixel, and a
// vertical blend between its column's top-edge pixel and the bottom-left
// pixel, with blend factors taken from the sm_weights[] table.
// On entry: x0=dst, x1=stride, x2=topleft, w3=width, w4=height.
function ipred_smooth_8bpc_neon, export=1
        movrel          x10, X(sm_weights)
        add             x11, x10, w4, uxtw        // x11 = &sm_weights[height], vertical weights
        add             x10, x10, w3, uxtw        // x10 = &sm_weights[width], horizontal weights
        clz             w9,  w3
        movrel          x5,  ipred_smooth_tbl
        sub             x12, x2,  w4, uxtw        // x12 = &topleft[-height], the bottom-left pixel
        sub             w9,  w9,  #25             // w9 = clz(width)-25; 0 for w=64 .. 4 for w=4
        ldrsw           x9,  [x5, w9, uxtw #2]
        ld1r            {v4.16b},  [x12] // bottom
        add             x8,  x2,  #1              // x8 = &topleft[1], the top row
        add             x5,  x5,  x9
        add             x6,  x0,  x1              // x6 = second row pointer
        lsl             x1,  x1,  #1              // double stride; two rows written per pointer
        br              x5
40:     // width == 4
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v6.2s}, [x8]             // top
        ld1r            {v7.2s}, [x10]            // weights_hor
        sub             x2,  x2,  #4              // step backwards through the left edge,
        mov             x7,  #-4                  // four pixels (rows) per iteration
        dup             v5.16b,  v6.b[3]          // right
        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
        uxtl            v7.8h,   v7.8b            // weights_hor
4:      // process 4 rows per iteration
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        zip1            v1.2s,   v1.2s,   v0.2s   // left, flipped
        zip1            v0.2s,   v3.2s,   v2.2s
        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
        zip1            v18.2s,  v18.2s,  v19.2s
        shll            v22.8h,  v4.8b,   #8      // bottom*256
        shll            v23.8h,  v4.8b,   #8
        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
        usubl           v1.8h,   v1.8b,   v5.8b
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v18.8h,  v18.8b
        mla             v20.8h,  v0.8h,   v7.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v1.8h,   v7.8h
        mla             v22.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v23.8h,  v6.8h,   v18.8h
        uhadd           v20.8h,  v20.8h,  v22.8h  // average the two interpolations
        uhadd           v21.8h,  v21.8h,  v23.8h
        rshrn           v20.8b,  v20.8h,  #8      // round and narrow back to pixels
        rshrn           v21.8b,  v21.8h,  #8
        st1             {v20.s}[0], [x0], x1
        st1             {v20.s}[1], [x6], x1
        subs            w4,  w4,  #4
        st1             {v21.s}[0], [x0], x1
        st1             {v21.s}[1], [x6], x1
        b.gt            4b
        ret
80:     // width == 8
        AARCH64_VALID_JUMP_TARGET
        ld1             {v6.8b}, [x8]             // top
        ld1             {v7.8b}, [x10]            // weights_hor
        sub             x2,  x2,  #4              // step backwards through the left edge,
        mov             x7,  #-4                  // four pixels (rows) per iteration
        dup             v5.16b,  v6.b[7]          // right
        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
        uxtl            v7.8h,   v7.8b            // weights_hor
8:      // process 4 rows per iteration
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        shll            v22.8h,  v5.8b,   #8
        shll            v23.8h,  v5.8b,   #8
        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
        usubl           v1.8h,   v1.8b,   v5.8b
        usubl           v2.8h,   v2.8b,   v5.8b
        usubl           v3.8h,   v3.8b,   v5.8b
        shll            v24.8h,  v4.8b,   #8      // bottom*256
        shll            v25.8h,  v4.8b,   #8
        shll            v26.8h,  v4.8b,   #8
        shll            v27.8h,  v4.8b,   #8
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v19.8h,  v19.8b
        mla             v20.8h,  v3.8h,   v7.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v2.8h,   v7.8h   // (left flipped)
        mla             v22.8h,  v1.8h,   v7.8h
        mla             v23.8h,  v0.8h,   v7.8h
        mla             v24.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v25.8h,  v6.8h,   v17.8h
        mla             v26.8h,  v6.8h,   v18.8h
        mla             v27.8h,  v6.8h,   v19.8h
        uhadd           v20.8h,  v20.8h,  v24.8h  // average the two interpolations
        uhadd           v21.8h,  v21.8h,  v25.8h
        uhadd           v22.8h,  v22.8h,  v26.8h
        uhadd           v23.8h,  v23.8h,  v27.8h
        rshrn           v20.8b,  v20.8h,  #8      // round and narrow back to pixels
        rshrn           v21.8b,  v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn           v23.8b,  v23.8h,  #8
        st1             {v20.8b}, [x0], x1
        st1             {v21.8b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.8b}, [x0], x1
        st1             {v23.8b}, [x6], x1
        b.gt            8b
        ret
160:    // width >= 16; loop horizontally in chunks of 16 pixels
320:
640:
        AARCH64_VALID_JUMP_TARGET
        add             x12, x2,  w3, uxtw        // x12 = &topleft[width], the top-right pixel
        sub             x2,  x2,  #2              // step backwards through the left edge,
        mov             x7,  #-2                  // two pixels (rows) per iteration
        ld1r            {v5.16b}, [x12]           // right
        sub             x1,  x1,  w3, uxtw        // doubled stride minus width, to advance
        mov             w9,  w3                   // past what the inner loop stored; w9 = width

1:      // outer loop; two rows per iteration
        ld2r            {v0.8b, v1.8b},   [x2],  x7 // left
        ld2r            {v16.8b, v17.8b}, [x11], #2 // weights_ver
        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
        usubl           v1.8h,   v1.8b,   v5.8b
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
2:      // inner loop; 16 pixels of both rows per iteration
        ld1             {v7.16b}, [x10],  #16     // weights_hor
        ld1             {v3.16b}, [x8],   #16     // top
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        shll            v22.8h,  v5.8b,   #8
        shll            v23.8h,  v5.8b,   #8
        uxtl            v6.8h,   v7.8b            // weights_hor
        uxtl2           v7.8h,   v7.16b
        usubl           v2.8h,   v3.8b,   v4.8b   // top-bottom
        usubl2          v3.8h,   v3.16b,  v4.16b
        mla             v20.8h,  v1.8h,   v6.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v1.8h,   v7.8h   // (left flipped)
        mla             v22.8h,  v0.8h,   v6.8h
        mla             v23.8h,  v0.8h,   v7.8h
        shll            v24.8h,  v4.8b,   #8      // bottom*256
        shll            v25.8h,  v4.8b,   #8
        shll            v26.8h,  v4.8b,   #8
        shll            v27.8h,  v4.8b,   #8
        mla             v24.8h,  v2.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v25.8h,  v3.8h,   v16.8h
        mla             v26.8h,  v2.8h,   v17.8h
        mla             v27.8h,  v3.8h,   v17.8h
        uhadd           v20.8h,  v20.8h,  v24.8h  // average the two interpolations
        uhadd           v21.8h,  v21.8h,  v25.8h
        uhadd           v22.8h,  v22.8h,  v26.8h
        uhadd           v23.8h,  v23.8h,  v27.8h
        rshrn           v20.8b,  v20.8h,  #8      // round and narrow back to pixels
        rshrn2          v20.16b, v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn2          v22.16b, v23.8h,  #8
        subs            w3,  w3,  #16
        st1             {v20.16b}, [x0],  #16
        st1             {v22.16b}, [x6],  #16
        b.gt            2b
        subs            w4,  w4,  #2
        b.le            9f
        sub             x8,  x8,  w9, uxtw        // rewind the top pointer
        sub             x10, x10, w9, uxtw        // rewind the weights_hor pointer
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9                   // restore the width counter
        b               1b
9:
        ret
endfunc
1101
// Jump table for ipred_smooth_8bpc_neon; table-relative offsets to the
// per-width branches, indexed by clz(width)-25 (width 64 first, 4 last).
jumptable ipred_smooth_tbl
        .word 640b - ipred_smooth_tbl
        .word 320b - ipred_smooth_tbl
        .word 160b - ipred_smooth_tbl
        .word 80b  - ipred_smooth_tbl
        .word 40b  - ipred_smooth_tbl
endjumptable
1109
1110// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
1111//                               const pixel *const topleft,
1112//                               const int width, const int height, const int a,
1113//                               const int max_width, const int max_height);
// SMOOTH_V prediction: only the vertical blend; each output pixel
// interpolates between its column's top-edge pixel and the bottom-left
// pixel, with weights from the sm_weights[] table.
// On entry: x0=dst, x1=stride, x2=topleft, w3=width, w4=height.
function ipred_smooth_v_8bpc_neon, export=1
        movrel          x7,  X(sm_weights)
        add             x7,  x7,  w4, uxtw        // x7 = &sm_weights[height], vertical weights
        clz             w9,  w3
        movrel          x5,  ipred_smooth_v_tbl
        sub             x8,  x2,  w4, uxtw        // x8 = &topleft[-height], the bottom-left pixel
        sub             w9,  w9,  #25             // w9 = clz(width)-25; 0 for w=64 .. 4 for w=4
        ldrsw           x9,  [x5, w9, uxtw #2]
        ld1r            {v4.16b},  [x8] // bottom
        add             x2,  x2,  #1              // x2 = &topleft[1], the top row
        add             x5,  x5,  x9
        add             x6,  x0,  x1              // x6 = second row pointer
        lsl             x1,  x1,  #1              // double stride; two rows written per pointer
        br              x5
40:     // width == 4
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v6.2s}, [x2]             // top
        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
4:      // process 4 rows per iteration
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        shll            v22.8h,  v4.8b,   #8      // bottom*256
        shll            v23.8h,  v4.8b,   #8
        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
        zip1            v18.2s,  v18.2s,  v19.2s
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v18.8h,  v18.8b
        mla             v22.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v23.8h,  v6.8h,   v18.8h
        rshrn           v22.8b,  v22.8h,  #8      // round and narrow back to pixels
        rshrn           v23.8b,  v23.8h,  #8
        st1             {v22.s}[0], [x0], x1
        st1             {v22.s}[1], [x6], x1
        subs            w4,  w4,  #4
        st1             {v23.s}[0], [x0], x1
        st1             {v23.s}[1], [x6], x1
        b.gt            4b
        ret
80:     // width == 8
        AARCH64_VALID_JUMP_TARGET
        ld1             {v6.8b}, [x2]             // top
        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
8:      // process 4 rows per iteration
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        shll            v24.8h,  v4.8b,   #8      // bottom*256
        shll            v25.8h,  v4.8b,   #8
        shll            v26.8h,  v4.8b,   #8
        shll            v27.8h,  v4.8b,   #8
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v19.8h,  v19.8b
        mla             v24.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v25.8h,  v6.8h,   v17.8h
        mla             v26.8h,  v6.8h,   v18.8h
        mla             v27.8h,  v6.8h,   v19.8h
        rshrn           v24.8b,  v24.8h,  #8      // round and narrow back to pixels
        rshrn           v25.8b,  v25.8h,  #8
        rshrn           v26.8b,  v26.8h,  #8
        rshrn           v27.8b,  v27.8h,  #8
        st1             {v24.8b}, [x0], x1
        st1             {v25.8b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v26.8b}, [x0], x1
        st1             {v27.8b}, [x6], x1
        b.gt            8b
        ret
160:    // width >= 16; loop horizontally in chunks of 16 pixels
320:
640:
        AARCH64_VALID_JUMP_TARGET
        // Set up pointers for four rows in parallel; x0, x6, x5, x8
        add             x5,  x0,  x1
        add             x8,  x6,  x1
        lsl             x1,  x1,  #1              // x1 = 4*stride (was already doubled above),
        sub             x1,  x1,  w3, uxtw        // minus width, to advance past what
        mov             w9,  w3                   // the inner loop stored; w9 = width

1:      // outer loop; four rows per iteration
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v19.8h,  v19.8b
2:      // inner loop; 16 pixels of all four rows per iteration
        ld1             {v3.16b}, [x2],   #16     // top
        shll            v20.8h,  v4.8b,   #8      // bottom*256
        shll            v21.8h,  v4.8b,   #8
        shll            v22.8h,  v4.8b,   #8
        shll            v23.8h,  v4.8b,   #8
        shll            v24.8h,  v4.8b,   #8
        shll            v25.8h,  v4.8b,   #8
        shll            v26.8h,  v4.8b,   #8
        shll            v27.8h,  v4.8b,   #8
        usubl           v2.8h,   v3.8b,   v4.8b   // top-bottom
        usubl2          v3.8h,   v3.16b,  v4.16b
        mla             v20.8h,  v2.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v21.8h,  v3.8h,   v16.8h
        mla             v22.8h,  v2.8h,   v17.8h
        mla             v23.8h,  v3.8h,   v17.8h
        mla             v24.8h,  v2.8h,   v18.8h
        mla             v25.8h,  v3.8h,   v18.8h
        mla             v26.8h,  v2.8h,   v19.8h
        mla             v27.8h,  v3.8h,   v19.8h
        rshrn           v20.8b,  v20.8h,  #8      // round and narrow back to pixels
        rshrn2          v20.16b, v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn2          v22.16b, v23.8h,  #8
        rshrn           v24.8b,  v24.8h,  #8
        rshrn2          v24.16b, v25.8h,  #8
        rshrn           v26.8b,  v26.8h,  #8
        rshrn2          v26.16b, v27.8h,  #8
        subs            w3,  w3,  #16
        st1             {v20.16b}, [x0],  #16
        st1             {v22.16b}, [x6],  #16
        st1             {v24.16b}, [x5],  #16
        st1             {v26.16b}, [x8],  #16
        b.gt            2b
        subs            w4,  w4,  #4
        b.le            9f
        sub             x2,  x2,  w9, uxtw        // rewind the top pointer
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
        add             x8,  x8,  x1
        mov             w3,  w9                   // restore the width counter
        b               1b
9:
        ret
endfunc
1243
// Jump table for ipred_smooth_v_8bpc_neon; table-relative offsets to the
// per-width branches, indexed by clz(width)-25 (width 64 first, 4 last).
jumptable ipred_smooth_v_tbl
        .word 640b - ipred_smooth_v_tbl
        .word 320b - ipred_smooth_v_tbl
        .word 160b - ipred_smooth_v_tbl
        .word 80b  - ipred_smooth_v_tbl
        .word 40b  - ipred_smooth_v_tbl
endjumptable
1251
1252// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
1253//                               const pixel *const topleft,
1254//                               const int width, const int height, const int a,
1255//                               const int max_width, const int max_height);
// SMOOTH_H prediction: only the horizontal blend; each output pixel
// interpolates between its row's left-edge pixel and the top-right
// pixel, with weights from the sm_weights[] table.
// On entry: x0=dst, x1=stride, x2=topleft, w3=width, w4=height.
function ipred_smooth_h_8bpc_neon, export=1
        movrel          x8,  X(sm_weights)
        add             x8,  x8,  w3, uxtw        // x8 = &sm_weights[width], horizontal weights
        clz             w9,  w3
        movrel          x5,  ipred_smooth_h_tbl
        add             x12, x2,  w3, uxtw        // x12 = &topleft[width], the top-right pixel
        sub             w9,  w9,  #25             // w9 = clz(width)-25; 0 for w=64 .. 4 for w=4
        ldrsw           x9,  [x5, w9, uxtw #2]
        ld1r            {v5.16b},  [x12] // right
        add             x5,  x5,  x9
        add             x6,  x0,  x1              // x6 = second row pointer
        lsl             x1,  x1,  #1              // double stride; two rows written per pointer
        br              x5
40:     // width == 4
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v7.2s}, [x8]             // weights_hor
        sub             x2,  x2,  #4              // step backwards through the left edge,
        mov             x7,  #-4                  // four pixels (rows) per iteration
        uxtl            v7.8h,   v7.8b            // weights_hor
4:      // process 4 rows per iteration
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        zip1            v1.2s,   v1.2s,   v0.2s   // left, flipped
        zip1            v0.2s,   v3.2s,   v2.2s
        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
        usubl           v1.8h,   v1.8b,   v5.8b
        mla             v20.8h,  v0.8h,   v7.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v1.8h,   v7.8h
        rshrn           v20.8b,  v20.8h,  #8      // round and narrow back to pixels
        rshrn           v21.8b,  v21.8h,  #8
        st1             {v20.s}[0], [x0], x1
        st1             {v20.s}[1], [x6], x1
        subs            w4,  w4,  #4
        st1             {v21.s}[0], [x0], x1
        st1             {v21.s}[1], [x6], x1
        b.gt            4b
        ret
80:     // width == 8
        AARCH64_VALID_JUMP_TARGET
        ld1             {v7.8b}, [x8]             // weights_hor
        sub             x2,  x2,  #4              // step backwards through the left edge,
        mov             x7,  #-4                  // four pixels (rows) per iteration
        uxtl            v7.8h,   v7.8b            // weights_hor
8:      // process 4 rows per iteration
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        shll            v22.8h,  v5.8b,   #8
        shll            v23.8h,  v5.8b,   #8
        usubl           v3.8h,   v3.8b,   v5.8b   // left-right
        usubl           v2.8h,   v2.8b,   v5.8b
        usubl           v1.8h,   v1.8b,   v5.8b
        usubl           v0.8h,   v0.8b,   v5.8b
        mla             v20.8h,  v3.8h,   v7.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v2.8h,   v7.8h   // (left flipped)
        mla             v22.8h,  v1.8h,   v7.8h
        mla             v23.8h,  v0.8h,   v7.8h
        rshrn           v20.8b,  v20.8h,  #8      // round and narrow back to pixels
        rshrn           v21.8b,  v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn           v23.8b,  v23.8h,  #8
        st1             {v20.8b}, [x0], x1
        st1             {v21.8b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.8b}, [x0], x1
        st1             {v23.8b}, [x6], x1
        b.gt            8b
        ret
160:    // width >= 16; loop horizontally in chunks of 16 pixels
320:
640:
        AARCH64_VALID_JUMP_TARGET
        sub             x2,  x2,  #4              // step backwards through the left edge,
        mov             x7,  #-4                  // four pixels (rows) per iteration
        // Set up pointers for four rows in parallel; x0, x6, x5, x10
        add             x5,  x0,  x1
        add             x10, x6,  x1
        lsl             x1,  x1,  #1              // x1 = 4*stride (was already doubled above),
        sub             x1,  x1,  w3, uxtw        // minus width, to advance past what
        mov             w9,  w3                   // the inner loop stored; w9 = width

1:      // outer loop; four rows per iteration
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},   [x2],  x7 // left
        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
        usubl           v1.8h,   v1.8b,   v5.8b
        usubl           v2.8h,   v2.8b,   v5.8b
        usubl           v3.8h,   v3.8b,   v5.8b
2:      // inner loop; 16 pixels of all four rows per iteration
        ld1             {v7.16b}, [x8],   #16     // weights_hor
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        shll            v22.8h,  v5.8b,   #8
        shll            v23.8h,  v5.8b,   #8
        shll            v24.8h,  v5.8b,   #8
        shll            v25.8h,  v5.8b,   #8
        shll            v26.8h,  v5.8b,   #8
        shll            v27.8h,  v5.8b,   #8
        uxtl            v6.8h,   v7.8b            // weights_hor
        uxtl2           v7.8h,   v7.16b
        mla             v20.8h,  v3.8h,   v6.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v3.8h,   v7.8h   // (left flipped)
        mla             v22.8h,  v2.8h,   v6.8h
        mla             v23.8h,  v2.8h,   v7.8h
        mla             v24.8h,  v1.8h,   v6.8h
        mla             v25.8h,  v1.8h,   v7.8h
        mla             v26.8h,  v0.8h,   v6.8h
        mla             v27.8h,  v0.8h,   v7.8h
        rshrn           v20.8b,  v20.8h,  #8      // round and narrow back to pixels
        rshrn2          v20.16b, v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn2          v22.16b, v23.8h,  #8
        rshrn           v24.8b,  v24.8h,  #8
        rshrn2          v24.16b, v25.8h,  #8
        rshrn           v26.8b,  v26.8h,  #8
        rshrn2          v26.16b, v27.8h,  #8
        subs            w3,  w3,  #16
        st1             {v20.16b}, [x0],  #16
        st1             {v22.16b}, [x6],  #16
        st1             {v24.16b}, [x5],  #16
        st1             {v26.16b}, [x10], #16
        b.gt            2b
        subs            w4,  w4,  #4
        b.le            9f
        sub             x8,  x8,  w9, uxtw        // rewind the weights_hor pointer
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
        add             x10, x10, x1
        mov             w3,  w9                   // restore the width counter
        b               1b
9:
        ret
endfunc
1390
// Jump table for ipred_smooth_h_8bpc_neon; table-relative offsets to the
// per-width branches, indexed by clz(width)-25 (width 64 first, 4 last).
jumptable ipred_smooth_h_tbl
        .word 640b - ipred_smooth_h_tbl
        .word 320b - ipred_smooth_h_tbl
        .word 160b - ipred_smooth_h_tbl
        .word 80b  - ipred_smooth_h_tbl
        .word 40b  - ipred_smooth_h_tbl
endjumptable
1398
// 32 bytes of 0x00 followed by 32 bytes of 0xff: loading 16 bytes from
// (padding_mask - n), with 0 <= n <= 32, yields a mask whose first n
// bytes are 0x00 and the rest 0xff. Used below (via bit) to replace
// input pixels at or past a given end index with a padding pixel.
const padding_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
padding_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst
1410
1411// void ipred_z1_upsample_edge_8bpc_neon(pixel *out, const int hsz,
1412//                                       const pixel *const in, const int end);
// 2x upsample of the edge for the z1 predictor: out[2*i] = in[i+1] and
// out[2*i+1] = (-in[i+0] + 9*in[i+1] + 9*in[i+2] - in[i+3] + 8) >> 4,
// clamped to the pixel range. Input pixels at or past in[end] are
// replaced with in[end] before filtering.
function ipred_z1_upsample_edge_8bpc_neon, export=1
        movrel          x4,  padding_mask
        ld1             {v0.16b},  [x2]           // in[]
        add             x5,  x2,  w3,  uxtw       // in[end]
        sub             x4,  x4,  w3,  uxtw       // padding_mask - end

        ld1r            {v1.16b},  [x5]           // padding
        ld1             {v3.16b},  [x4]           // padding_mask

        movi            v31.8h,  #9

        bit             v0.16b,  v1.16b,  v3.16b  // padded in[]

        ext             v4.16b,  v0.16b,  v1.16b,  #1 // in[i+1]
        ext             v5.16b,  v0.16b,  v1.16b,  #2 // in[i+2]
        ext             v6.16b,  v0.16b,  v1.16b,  #3 // in[i+3]

        uaddl           v16.8h,  v4.8b,   v5.8b   // in[i+1] + in[i+2]
        uaddl2          v17.8h,  v4.16b,  v5.16b
        uaddl           v18.8h,  v0.8b,   v6.8b   // in[i+0] + in[i+3]
        uaddl2          v19.8h,  v0.16b,  v6.16b
        mul             v16.8h,  v16.8h,  v31.8h  // 9*(in[i+1] + in[i+2])
        mul             v17.8h,  v17.8h,  v31.8h
        sub             v16.8h,  v16.8h,  v18.8h
        sub             v17.8h,  v17.8h,  v19.8h

        sqrshrun        v16.8b,  v16.8h,  #4      // round, shift and clamp to pixel range
        sqrshrun2       v16.16b, v17.8h,  #4

        zip1            v0.16b,  v4.16b,  v16.16b // interleave original and filtered pixels
        zip2            v1.16b,  v4.16b,  v16.16b

        st1             {v0.16b, v1.16b}, [x0]

        ret
endfunc
1449
1450// void ipred_z2_upsample_edge_8bpc_neon(pixel *out, const int sz,
1451//                                       const pixel *const in);
// 2x upsample of the edge for the z2 predictor: out[2*i] = in[i] and
// out[2*i+1] = (-in[i-1] + 9*in[i+0] + 9*in[i+1] - in[i+2] + 8) >> 4,
// clamped to the pixel range, plus out[2*sz] = in[sz]. The in[i-1] tap
// at i == 0 uses in[0]; pixels at or past in[sz] are padded with in[sz].
function ipred_z2_upsample_edge_8bpc_neon, export=1
        // Here, sz is 4 or 8, and we produce 2*sz+1 output elements.
        movrel          x4,  padding_mask
        ld1             {v0.16b},  [x2]           // in[]
        add             x5,  x2,  w1,  uxtw       // in[sz]
        sub             x4,  x4,  w1,  uxtw       // padding_mask - sz

        ld1r            {v2.16b},  [x2]           // in[0] for padding
        ld1r            {v1.16b},  [x5]           // padding
        ld1             {v3.16b},  [x4]           // padding_mask

        movi            v31.8h,  #9

        bit             v0.16b,  v1.16b,  v3.16b  // padded in[]

        ext             v4.16b,  v2.16b,  v0.16b,  #15 // in[i-1]
        ext             v5.16b,  v0.16b,  v1.16b,  #1  // in[i+1]
        ext             v6.16b,  v0.16b,  v1.16b,  #2  // in[i+2]

        uaddl           v16.8h,  v0.8b,   v5.8b   // in[i+0] + in[i+1]
        uaddl           v18.8h,  v4.8b,   v6.8b   // in[i-1] + in[i+2]
        mul             v16.8h,  v16.8h,  v31.8h  // 9*(in[i+0] + in[i+1])
        sub             v16.8h,  v16.8h,  v18.8h

        sqrshrun        v16.8b,  v16.8h,  #4      // round, shift and clamp to pixel range

        add             x5,  x0,  #16

        zip1            v2.16b,  v0.16b,  v16.16b // interleave original and filtered pixels

        // In case sz=8, output one single pixel in out[16].
        st1             {v1.b}[0], [x5]
        st1             {v2.16b}, [x0]

        ret
endfunc
1488
const edge_filter
        // Symmetric 3-tap edge filter kernels, one 4-byte row per filter
        // strength. ipred_z1_filter_edge loads bytes [1],[2] of a row as
        // the (outer, center) taps, giving (4,8,4)/16 for strength 1 and
        // (5,6,5)/16 for strength 2; bytes [0] and [3] are padding.
        .byte 0, 4, 8, 0
        .byte 0, 5, 6, 0
// Leaving out the coeffs for strength=3
//      .byte 2, 4, 4, 0
endconst
1495
1496// void ipred_z1_filter_edge_8bpc_neon(pixel *out, const int sz,
1497//                                     const pixel *const in, const int end,
1498//                                     const int strength);
function ipred_z1_filter_edge_8bpc_neon, export=1
        // x0 = out, w1 = sz, x2 = in, w3 = end (last valid input index),
        // w4 = strength. Strengths 1-2 use a 3-tap kernel from edge_filter;
        // strength 3 uses a hardcoded 5-tap kernel below.
        cmp             w4, #3
        b.eq            L(fivetap)                // if (strength == 3) goto fivetap

        movrel          x5,  edge_filter, -3
        add             x5,  x5,  w4,  uxtw #2    // edge_filter + (strength - 1)*4 + 1

        ld1             {v31.h}[0], [x5]          // kernel[1-2]

        ld1             {v0.16b}, [x2], #16

        dup             v30.16b, v31.b[0]         // outer tap
        dup             v31.16b, v31.b[1]         // center tap
1:
        // in[end], is the last valid pixel. We produce 16 pixels out by
        // using 18 pixels in - the last pixel used is [17] of the ones
        // read/buffered.
        cmp             w3,  #17
        ld1             {v1.16b}, [x2], #16
        b.lt            2f
        ext             v2.16b,  v0.16b,  v1.16b,  #1
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        umull           v4.8h,   v0.8b,   v30.8b  // k0*in[i]
        umlal           v4.8h,   v2.8b,   v31.8b  // + k1*in[i+1]
        umlal           v4.8h,   v3.8b,   v30.8b  // + k0*in[i+2]
        umull2          v5.8h,   v0.16b,  v30.16b
        umlal2          v5.8h,   v2.16b,  v31.16b
        umlal2          v5.8h,   v3.16b,  v30.16b
        subs            w1,  w1,  #16
        mov             v0.16b,  v1.16b
        rshrn           v4.8b,   v4.8h,   #4      // (sum + 8) >> 4; taps sum to 16
        rshrn2          v4.16b,  v5.8h,   #4
        sub             w3,  w3,  #16
        st1             {v4.16b}, [x0], #16
        b.gt            1b
        ret
2:
        // Right padding

        // x2[w3-32] is the padding pixel (x2 points 32 bytes ahead)
        movrel          x5,  padding_mask
        sub             w6,  w3,  #32
        sub             x5,  x5,  w3,  uxtw
        add             x6,  x2,  w6,  sxtw

        ld1             {v2.16b}, [x5]            // padding_mask

        ld1r            {v1.16b}, [x6]
        bit             v0.16b,  v1.16b,  v2.16b  // Pad v0-v1

        // Filter one block
        ext             v2.16b,  v0.16b,  v1.16b,  #1
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        umull           v4.8h,   v0.8b,   v30.8b
        umlal           v4.8h,   v2.8b,   v31.8b
        umlal           v4.8h,   v3.8b,   v30.8b
        umull2          v5.8h,   v0.16b,  v30.16b
        umlal2          v5.8h,   v2.16b,  v31.16b
        umlal2          v5.8h,   v3.16b,  v30.16b
        subs            w1,  w1,  #16
        rshrn           v4.8b,   v4.8h,   #4
        rshrn2          v4.16b,  v5.8h,   #4
        st1             {v4.16b}, [x0], #16
        b.le            9f
5:
        // After one block, any remaining output would only be filtering
        // padding - thus just store the padding.
        subs            w1,  w1,  #16
        st1             {v1.16b}, [x0], #16
        b.gt            5b
9:
        ret

L(fivetap):
        sub             x2,  x2,  #1              // topleft -= 1
        movi            v29.16b, #2               // 5-tap kernel (2,4,4,4,2)/16
        ld1             {v0.16b}, [x2], #16
        movi            v30.16b, #4
        movi            v31.16b, #4
        ins             v0.b[0], v0.b[1]          // duplicate in[0] into the in[-1] slot
1:
        // in[end+1], is the last valid pixel. We produce 16 pixels out by
        // using 20 pixels in - the last pixel used is [19] of the ones
        // read/buffered.
        cmp             w3,  #18
        ld1             {v1.16b}, [x2], #16
        b.lt            2f                        // if (end + 1 < 19)
        ext             v2.16b,  v0.16b,  v1.16b,  #1
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        ext             v4.16b,  v0.16b,  v1.16b,  #3
        ext             v5.16b,  v0.16b,  v1.16b,  #4
        umull           v6.8h,   v0.8b,   v29.8b  // 2*in[i]
        umlal           v6.8h,   v2.8b,   v30.8b  // + 4*in[i+1]
        umlal           v6.8h,   v3.8b,   v31.8b  // + 4*in[i+2]
        umlal           v6.8h,   v4.8b,   v30.8b  // + 4*in[i+3]
        umlal           v6.8h,   v5.8b,   v29.8b  // + 2*in[i+4]
        umull2          v7.8h,   v0.16b,  v29.16b
        umlal2          v7.8h,   v2.16b,  v30.16b
        umlal2          v7.8h,   v3.16b,  v31.16b
        umlal2          v7.8h,   v4.16b,  v30.16b
        umlal2          v7.8h,   v5.16b,  v29.16b
        subs            w1,  w1,  #16
        mov             v0.16b,  v1.16b
        rshrn           v6.8b,   v6.8h,   #4      // (sum + 8) >> 4; taps sum to 16
        rshrn2          v6.16b,  v7.8h,   #4
        sub             w3,  w3,  #16
        st1             {v6.16b}, [x0], #16
        b.gt            1b
        ret
2:
        // Right padding

        // x2[w3+1-32] is the padding pixel (x2 points 32 bytes ahead)
        movrel          x5,  padding_mask, -1
        sub             w6,  w3,  #31
        sub             x5,  x5,  w3,  uxtw
        add             x6,  x2,  w6,  sxtw

        ld1             {v2.16b, v3.16b}, [x5]    // padding_mask

        ld1r            {v28.16b}, [x6]
        bit             v0.16b,  v28.16b, v2.16b  // Pad v0-v1
        bit             v1.16b,  v28.16b, v3.16b
4:
        // Filter one block
        ext             v2.16b,  v0.16b,  v1.16b,  #1
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        ext             v4.16b,  v0.16b,  v1.16b,  #3
        ext             v5.16b,  v0.16b,  v1.16b,  #4
        umull           v6.8h,   v0.8b,   v29.8b
        umlal           v6.8h,   v2.8b,   v30.8b
        umlal           v6.8h,   v3.8b,   v31.8b
        umlal           v6.8h,   v4.8b,   v30.8b
        umlal           v6.8h,   v5.8b,   v29.8b
        umull2          v7.8h,   v0.16b,  v29.16b
        umlal2          v7.8h,   v2.16b,  v30.16b
        umlal2          v7.8h,   v3.16b,  v31.16b
        umlal2          v7.8h,   v4.16b,  v30.16b
        umlal2          v7.8h,   v5.16b,  v29.16b
        subs            w1,  w1,  #16
        mov             v0.16b,  v1.16b
        mov             v1.16b,  v28.16b
        rshrn           v6.8b,   v6.8h,   #4
        rshrn2          v6.16b,  v7.8h,   #4
        sub             w3,  w3,  #16
        st1             {v6.16b}, [x0], #16
        b.le            9f
        // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to
        // filter properly once more - aka (w3 >= 0).
        cmp             w3,  #0
        b.ge            4b
5:
        // When w3 <= 0, all remaining pixels in v0-v1 are equal to the
        // last valid pixel - thus just output that without filtering.
        subs            w1,  w1,  #16
        st1             {v1.16b}, [x0], #16
        b.gt            5b
9:
        ret
endfunc
1659
1660// void ipred_pixel_set_8bpc_neon(pixel *out, const pixel px,
1661//                                const int n);
function ipred_pixel_set_8bpc_neon, export=1
        // Fill out[] with n copies of the pixel value in w1.
        // Stores in whole 16-byte units, i.e. n is rounded up to a
        // multiple of 16 (at least one store is always made).
        dup             v0.16b,  w1               // broadcast px to all lanes
2:
        st1             {v0.16b}, [x0], #16
        subs            w2,  w2,  #16
        b.gt            2b
        ret
endfunc
1670
1671// void ipred_z1_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
1672//                               const pixel *const top,
1673//                               const int width, const int height,
1674//                               const int dx, const int max_base_x);
function ipred_z1_fill1_8bpc_neon, export=1
        // x0 = dst, x1 = stride, x2 = top, w3 = width, w4 = height,
        // w5 = dx, w6 = max_base_x.
        // Each output pixel interpolates between top[base] and top[base+1]
        // where base = xpos >> 6 and frac = xpos & 0x3e; two rows are
        // produced per iteration. Once base reaches max_base_x, the
        // remaining rows are filled with top[max_base_x].
        clz             w9,  w3
        movrel          x8,  ipred_z1_fill1_tbl
        sub             w9,  w9,  #25             // jump table index from clz(width)
        ldrsw           x9,  [x8, w9, uxtw #2]
        add             x10, x2,  w6,  uxtw       // top[max_base_x]
        add             x8,  x8,  x9
        ld1r            {v31.16b}, [x10]          // padding
        mov             w7,  w5                   // xpos = dx
        mov             w15, #64
        br              x8
40:     // w == 4
        AARCH64_VALID_JUMP_TARGET
4:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            49f
        ldr             d0,  [x2, w8, uxtw]       // top[base]
        ldr             d2,  [x2, w10, uxtw]
        dup             v4.4h,   w9               // frac
        dup             v5.4h,   w11
        ext             v1.8b,   v0.8b,   v0.8b,   #1 // top[base+1]
        ext             v3.8b,   v2.8b,   v2.8b,   #1
        usubl           v6.8h,   v1.8b,   v0.8b   // top[base+1]-top[base]
        usubl           v7.8h,   v3.8b,   v2.8b
        ushll           v16.8h,  v0.8b,   #6      // top[base]*64
        ushll           v17.8h,  v2.8b,   #6
        mla             v16.4h,  v6.4h,   v4.4h   // + top[base+1]*frac
        mla             v17.4h,  v7.4h,   v5.4h
        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v17.8b,  v17.8h,  #6
        st1             {v16.s}[0], [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.s}[0], [x0], x1
        b.gt            4b
        ret

49:     // Fill the remaining rows with the padding pixel
        st1             {v31.s}[0], [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.s}[0], [x0], x1
        b.gt            49b
        ret

80:     // w == 8
        AARCH64_VALID_JUMP_TARGET
8:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            89f
        ldr             q0,  [x2, w8, uxtw]       // top[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.8b,   w9               // frac
        dup             v5.8b,   w11
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v6.8b,   w9               // 64 - frac
        dup             v7.8b,   w11
        ext             v1.16b,  v0.16b,  v0.16b,  #1 // top[base+1]
        ext             v3.16b,  v2.16b,  v2.16b,  #1
        umull           v16.8h,  v0.8b,   v6.8b   // top[base]*(64-frac)
        umlal           v16.8h,  v1.8b,   v4.8b   // + top[base+1]*frac
        umull           v17.8h,  v2.8b,   v7.8b
        umlal           v17.8h,  v3.8b,   v5.8b
        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v17.8b,  v17.8h,  #6
        st1             {v16.8b}, [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.8b}, [x0], x1
        b.gt            8b
        ret

89:     // Fill the remaining rows with the padding pixel
        st1             {v31.8b}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.8b}, [x0], x1
        b.gt            89b
        ret

160:    // w == 16, 32 and 64: loop over the row in 16-pixel chunks
320:
640:
        AARCH64_VALID_JUMP_TARGET

        mov             w12, w3                   // remember width

        add             x13, x0,  x1              // second output row
        lsl             x1,  x1,  #1              // 2*stride
        sub             x1,  x1,  w3,  uxtw       // 2*stride - width
1:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            169f
        add             x8,  x2,  w8,  uxtw
        add             x10, x2,  w10, uxtw
        dup             v4.16b,  w9               // frac
        dup             v5.16b,  w11
        ld1             {v0.16b, v1.16b}, [x8],  #32 // top[base]
        ld1             {v2.16b, v3.16b}, [x10], #32
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v6.16b,  w9               // 64 - frac
        dup             v7.16b,  w11
        add             w7,  w7,  w5              // xpos += dx
2:
        ext             v16.16b, v0.16b,  v1.16b,  #1 // top[base+1]
        ext             v17.16b, v2.16b,  v3.16b,  #1
        subs            w3,  w3,  #16
        umull           v18.8h,  v0.8b,   v6.8b   // top[base]*(64-frac)
        umlal           v18.8h,  v16.8b,  v4.8b   // + top[base+1]*frac
        umull2          v19.8h,  v0.16b,  v6.16b
        umlal2          v19.8h,  v16.16b, v4.16b
        umull           v20.8h,  v2.8b,   v7.8b
        umlal           v20.8h,  v17.8b,  v5.8b
        umull2          v21.8h,  v2.16b,  v7.16b
        umlal2          v21.8h,  v17.16b, v5.16b
        rshrn           v16.8b,  v18.8h,  #6
        rshrn2          v16.16b, v19.8h,  #6
        rshrn           v17.8b,  v20.8h,  #6
        rshrn2          v17.16b, v21.8h,  #6
        st1             {v16.16b}, [x0],  #16
        st1             {v17.16b}, [x13], #16
        b.le            3f
        mov             v0.16b,  v1.16b
        ld1             {v1.16b}, [x8],  #16 // top[base]
        mov             v2.16b,  v3.16b
        ld1             {v3.16b}, [x10], #16
        b               2b

3:      // Row done; advance to the next pair of rows
        subs            w4,  w4,  #2
        b.le            9f
        add             x0,  x0,  x1
        add             x13, x13, x1
        mov             w3,  w12
        b               1b
9:
        ret

169:    // Fill the remaining rows with the padding pixel
        st1             {v31.16b}, [x0],  #16
        subs            w3,  w3,  #16
        st1             {v31.16b}, [x13], #16
        b.gt            169b
        subs            w4,  w4,  #2
        b.le            9b
        add             x0,  x0,  x1
        add             x13, x13, x1
        mov             w3,  w12
        b               169b
endfunc
1840
jumptable ipred_z1_fill1_tbl
        // Indexed by clz(width) - 25: widths 64, 32, 16, 8, 4 in order.
        .word 640b - ipred_z1_fill1_tbl
        .word 320b - ipred_z1_fill1_tbl
        .word 160b - ipred_z1_fill1_tbl
        .word 80b  - ipred_z1_fill1_tbl
        .word 40b  - ipred_z1_fill1_tbl
endjumptable
1848
// void ipred_z1_fill2_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const top,
//                               const int width, const int height,
//                               const int dx, const int max_base_x);
// Same interpolation as ipred_z1_fill1, but top[base] and top[base+1]
// are deinterleaved from top[] with uzp1/uzp2 (the upsampled-edge
// layout). Width 8 takes the b.eq path; anything else falls through
// to the w == 4 code, so only widths 4 and 8 are handled.
function ipred_z1_fill2_8bpc_neon, export=1
        cmp             w3,  #8
        add             x10, x2,  w6,  uxtw       // top[max_base_x]
        ld1r            {v31.16b}, [x10]          // padding
        mov             w7,  w5                   // xpos = dx
        mov             w15, #64
        b.eq            8f

4:      // w == 4
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            49f
        ldr             d0,  [x2, w8, uxtw]       // top[base]
        ldr             d2,  [x2, w10, uxtw]
        dup             v4.4h,   w9               // frac
        dup             v5.4h,   w11
        uzp2            v1.8b,   v0.8b,   v0.8b   // top[base+1]
        uzp1            v0.8b,   v0.8b,   v0.8b   // top[base]
        uzp2            v3.8b,   v2.8b,   v2.8b
        uzp1            v2.8b,   v2.8b,   v2.8b
        usubl           v6.8h,   v1.8b,   v0.8b   // top[base+1]-top[base]
        usubl           v7.8h,   v3.8b,   v2.8b
        ushll           v16.8h,  v0.8b,   #6      // top[base]*64
        ushll           v17.8h,  v2.8b,   #6
        mla             v16.4h,  v6.4h,   v4.4h   // + top[base+1]*frac
        mla             v17.4h,  v7.4h,   v5.4h
        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v17.8b,  v17.8h,  #6
        st1             {v16.s}[0], [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.s}[0], [x0], x1
        b.gt            4b
        ret

49:     // Fill the remaining rows with the padding pixel
        st1             {v31.s}[0], [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.s}[0], [x0], x1
        b.gt            49b
        ret

8:      // w == 8
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            89f
        ldr             q0,  [x2, w8, uxtw]       // top[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.8b,   w9               // frac
        dup             v5.8b,   w11
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v6.8b,   w9               // 64 - frac
        dup             v7.8b,   w11
        uzp2            v1.16b,  v0.16b,  v0.16b  // top[base+1]
        uzp1            v0.16b,  v0.16b,  v0.16b  // top[base]
        uzp2            v3.16b,  v2.16b,  v2.16b
        uzp1            v2.16b,  v2.16b,  v2.16b
        umull           v16.8h,  v1.8b,   v4.8b   // top[base+1]*frac
        umlal           v16.8h,  v0.8b,   v6.8b   // + top[base]*(64-frac)
        umull           v17.8h,  v3.8b,   v5.8b
        umlal           v17.8h,  v2.8b,   v7.8b
        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v17.8b,  v17.8h,  #6
        st1             {v16.8b}, [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.8b}, [x0], x1
        b.gt            8b
        ret

89:     // Fill the remaining rows with the padding pixel
        st1             {v31.8b}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.8b}, [x0], x1
        b.gt            89b
        ret
endfunc
1935
1936// void ipred_reverse_8bpc_neon(pixel *dst, const pixel *const src,
1937//                              const int n);
1938function ipred_reverse_8bpc_neon, export=1
1939        sub             x1,  x1,  #16
1940        add             x3,  x0,  #8
1941        mov             x4,  #16
19421:
1943        ld1             {v0.16b}, [x1]
1944        subs            w2,  w2,  #16
1945        rev64           v0.16b,  v0.16b
1946        sub             x1,  x1,  #16
1947        st1             {v0.d}[1], [x0], x4
1948        st1             {v0.d}[0], [x3], x4
1949        b.gt            1b
1950        ret
1951endfunc
1952
const increments
        // Lane indices 0..15 as 16-bit values; used below (ipred_z2_fill1)
        // as per-column multipliers for the dy step.
        .short          0,  1,  2,  3,  4,  5,  6,  7
        .short          8,  9,  10, 11, 12, 13, 14, 15
endconst
1957
1958// void ipred_z2_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
1959//                               const pixel *const top,
1960//                               const pixel *const left,
1961//                               const int width, const int height,
1962//                               const int dx, const int dy);
1963function ipred_z2_fill1_8bpc_neon, export=1
1964        clz             w10, w4
1965        movrel          x9,  ipred_z2_fill1_tbl
1966        sub             w10, w10, #25
1967        ldrsw           x10, [x9, w10, uxtw #2]
1968        mov             w8,  #(1 << 6)            // xpos = 1 << 6
1969        add             x9,  x9,  x10
1970        sub             w8,  w8,  w6              // xpos -= dx
1971
1972        movrel          x11, increments
1973        ld1             {v31.8h},  [x11]          // increments
1974        neg             w7,  w7                   // -dy
1975
1976        br              x9
197740:
1978        AARCH64_VALID_JUMP_TARGET
1979
1980        dup             v30.4h,  w7               // -dy
1981        movi            v17.8b,  #1
1982
1983        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
1984        movi            v25.16b, #0x3e
1985        add             v30.4h,  v16.4h,  v30.4h  // -= dy
1986
1987        xtn             v31.8b,  v31.8h           // {0,1,2,3}
1988
1989        // Worst case height for w=4 is 16, but we need at least h+1 elements
1990        ld1             {v0.16b, v1.16b}, [x3]    // left[]
1991
1992        movi            v26.16b, #64
1993        movi            v19.16b, #2
1994
1995        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
1996        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
1997        and             v27.8b,  v27.8b,  v25.8b  // frac_y
1998
1999        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
2000
2001        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1
2002        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2
2003
2004        tbl             v16.8b, {v0.16b}, v29.8b  // left[base_y]
2005
2006        trn1            v30.2s,  v30.2s,  v28.2s  // base_y + 1, base_y + 2
2007
2008        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y
2009
2010        trn1            v31.2s,  v31.2s,  v31.2s  // {0,1,2,3,0,1,2,3}
2011
2012        trn1            v27.2s,  v27.2s,  v27.2s  // frac_y
2013        trn1            v28.2s,  v28.2s,  v28.2s  // 64 - frac_y
2014
2015        movi            v29.8b,  #2
20164:
2017        asr             w9,  w8,  #6              // base_x
2018        dup             v6.4h,   w8               // xpos
2019        sub             w8,  w8,  w6              // xpos -= dx
2020        cmp             w9,  #-4                  // base_x <= -4
2021        asr             w11, w8,  #6              // base_x
2022        b.le            49f
2023
2024        dup             v7.4h,   w8               // xpos
2025
2026        ldr             d2,  [x2, w9, sxtw]       // top[base_x]
2027        ldr             d4,  [x2, w11, sxtw]
2028
2029        trn1            v6.2d,   v6.2d,   v7.2d   // xpos
2030
2031        // Cut corners here; only doing tbl over v0 here; we only
2032        // seem to need the last pixel, from v1, after skipping to the
2033        // left-only codepath below.
2034        tbl             v17.8b, {v0.16b}, v30.8b  // left[base_y+1], left[base_y+2]
2035
2036        shrn            v20.8b,  v6.8h,   #6      // first base_x for each row
2037        xtn             v6.8b,   v6.8h            // (uint8_t)xpos
2038
2039        ext             v3.8b,   v2.8b,   v2.8b,   #1 // top[base_x+1]
2040        ext             v5.8b,   v4.8b,   v4.8b,   #1
2041
2042        and             v6.8b,   v6.8b,   v25.8b  // frac_x
2043
2044        trn1            v16.2s,  v16.2s,  v17.2s  // left[base_y], left[base_y+1]
2045
2046        trn1            v2.2s,   v2.2s,   v4.2s   // top[base_x]
2047        trn1            v3.2s,   v3.2s,   v5.2s   // top[base_x+1]
2048
2049        sub             v7.8b,   v26.8b,  v6.8b   // 64 - frac_x
2050
2051        add             v20.8b,  v20.8b,  v31.8b  // actual base_x
2052
2053        umull           v16.8h,  v16.8b,  v28.8b  // left[base_y]*(64-frac_y)
2054        umlal           v16.8h,  v17.8b,  v27.8b  // + left[base_y+1]*frac_y
2055
2056        umull           v22.8h,  v2.8b,   v7.8b   // top[base_x]-*(64-frac_x)
2057        umlal           v22.8h,  v3.8b,   v6.8b   // + top[base_x+1]*frac_x
2058
2059        cmge            v20.8b,  v20.8b,  #0
2060
2061        rshrn           v16.8b,  v16.8h,  #6
2062        rshrn           v22.8b,  v22.8h,  #6
2063
2064        bit             v16.8b,  v22.8b,  v20.8b
2065
2066        st1             {v16.s}[0], [x0], x1
2067        sub             w8,  w8,  w6              // xpos -= dx
2068        subs            w5,  w5,  #2
2069        st1             {v16.s}[1], [x0], x1
2070        b.le            9f
2071
2072        ext             v16.8b,  v17.8b,  v17.8b, #4
2073        add             v30.8b,  v30.8b,  v29.8b  // base_y += 2
2074        b               4b
2075
207649:
2077        tbl             v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+2]
2078
2079        trn1            v16.2s,  v16.2s,  v17.2s  // left[base_y], left[base_y+1]
2080
2081        umull           v18.8h,  v16.8b,  v28.8b  // left[base_y]*(64-frac_t)
2082        umlal           v18.8h,  v17.8b,  v27.8b  // + left[base_y+1]*frac_y
2083        rshrn           v18.8b,  v18.8h,  #6
2084
2085        st1             {v18.s}[0], [x0], x1
2086        subs            w5,  w5,  #2
2087        st1             {v18.s}[1], [x0], x1
2088        b.le            9f
2089
2090        ext             v16.8b,  v17.8b,  v17.8b, #4
2091        add             v30.8b,  v30.8b,  v29.8b  // base_y += 2
2092        b               49b
2093
20949:
2095        ret
2096
209780:
2098        AARCH64_VALID_JUMP_TARGET
2099
2100        dup             v30.8h,  w7               // -dy
2101        movi            v17.8b,  #1
2102
2103        mul             v16.8h,  v31.8h,  v30.8h  // {0,1,2,3,4,5,6,7}* -dy
2104        movi            v25.16b, #0x3e
2105        add             v30.8h,  v16.8h,  v30.8h  // -= dy
2106
2107        xtn             v31.8b,  v31.8h           // {0,1,2,3,4,5,6,7}
2108
2109        // Worst case height for w=8 is 32, but we need at least h+1 elements
2110        ld1             {v0.16b, v1.16b, v2.16b}, [x3]    // left[]
2111
2112        movi            v26.16b, #64
2113        movi            v19.16b, #2
2114
2115        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
2116        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
2117        and             v27.8b,  v27.8b,  v25.8b  // frac_y
2118
2119        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
2120
2121        // Cut corners here; for the first row we don't expect to need to
2122        // read outside of v0.
2123        tbl             v18.8b, {v0.16b}, v29.8b  // left[base_y]
2124
2125        add             v30.8b,  v29.8b,  v19.8b  // base_y + 2
2126        add             v29.8b,  v29.8b,  v17.8b  // base_y + 1
2127
2128        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y
2129
2130        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}
2131
2132        movi            v24.8b,  #2               // 2
21338:
2134        asr             w9,  w8,  #6              // base_x
2135        dup             v16.8h,   w8              // xpos
2136        sub             w8,  w8,  w6              // xpos -= dx
2137        cmp             w9,  #-8                  // base_x <= -8
2138        asr             w11, w8,  #6              // base_x
2139        b.le            89f
2140
2141        dup             v17.8h,   w8              // xpos
2142
2143        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
2144        ldr             q6,  [x2, w11, sxtw]
2145
2146        // Cut corners here; only doing tbl over v0-v1 here; we only
2147        // seem to need the last pixel, from v2, after skipping to the
2148        // left-only codepath below.
2149        tbl             v19.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+1]
2150
2151        shrn            v21.8b,  v16.8h,  #6      // first base_x
2152        shrn2           v21.16b, v17.8h,  #6
2153        xtn             v16.8b,  v16.8h           // (uint8_t)xpos
2154        xtn2            v16.16b, v17.8h
2155
2156        tbl             v20.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+2]
2157
2158        ext             v5.16b,  v4.16b,  v4.16b,  #1 // top[base_x+1]
2159        ext             v7.16b,  v6.16b,  v6.16b,  #1
2160
2161        and             v16.16b, v16.16b, v25.16b // frac_x
2162
2163        trn1            v4.2d,   v4.2d,   v6.2d   // top[base_x]
2164        trn1            v5.2d,   v5.2d,   v7.2d   // top[base_x+1]
2165
2166        sub             v7.16b,  v26.16b, v16.16b // 64 - frac_x
2167
2168        add             v21.16b, v21.16b, v31.16b // actual base_x
2169
2170        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
2171        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
2172        umull           v17.8h,  v19.8b,  v28.8b
2173        umlal           v17.8h,  v20.8b,  v27.8b
2174
2175        umull           v22.8h,  v4.8b,   v7.8b   // top[base_x]-*(64-frac_x)
2176        umlal           v22.8h,  v5.8b,   v16.8b  // + top[base_x+1]*frac_x
2177        umull2          v23.8h,  v4.16b,  v7.16b
2178        umlal2          v23.8h,  v5.16b,  v16.16b
2179
2180        cmge            v21.16b, v21.16b, #0
2181
2182        rshrn           v6.8b,   v6.8h,   #6
2183        rshrn2          v6.16b,  v17.8h,  #6
2184        rshrn           v22.8b,  v22.8h,  #6
2185        rshrn2          v22.16b, v23.8h,  #6
2186
2187        bit             v6.16b,  v22.16b, v21.16b
2188
2189        st1             {v6.d}[0], [x0], x1
2190        sub             w8,  w8,  w6              // xpos -= dx
2191        subs            w5,  w5,  #2
2192        st1             {v6.d}[1], [x0], x1
2193        b.le            9f
2194
2195        mov             v18.8b,  v20.8b
2196        add             v29.8b,  v29.8b,  v24.8b  // base_y += 2
2197        add             v30.8b,  v30.8b,  v24.8b  // base_y += 2
2198        b               8b
2199
220089:
2201        tbl             v19.8b, {v0.16b, v1.16b, v2.16b}, v29.8b // left[base_y+1]
2202        tbl             v20.8b, {v0.16b, v1.16b, v2.16b}, v30.8b // left[base_y+2]
2203
2204        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
2205        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
2206        umull           v17.8h,  v19.8b,  v28.8b
2207        umlal           v17.8h,  v20.8b,  v27.8b
2208
2209        rshrn           v6.8b,   v6.8h,   #6
2210        rshrn2          v6.16b,  v17.8h,  #6
2211
2212        st1             {v6.d}[0], [x0], x1
2213        subs            w5,  w5,  #2
2214        st1             {v6.d}[1], [x0], x1
2215        b.le            9f
2216
2217        mov             v18.8b,  v20.8b
2218        add             v29.8b,  v29.8b,  v24.8b  // base_y += 2
2219        add             v30.8b,  v30.8b,  v24.8b  // base_y += 2
2220        b               89b
2221
22229:
2223        ret
2224
2225160:
2226        AARCH64_VALID_JUMP_TARGET
2227
2228        stp             d8,  d9,  [sp, #-0x40]!
2229        stp             d10, d11, [sp, #0x10]
2230        stp             d12, d13, [sp, #0x20]
2231        stp             d14, d15, [sp, #0x30]
2232
2233        add             x11, x11, #16             // increments
2234
2235        dup             v18.8h,  w7               // -dy
2236        movi            v17.16b, #1
2237        add             x3,  x3,  #1              // Skip past left[0]
2238
2239        ld1             {v14.8h}, [x11]           // {8,9,10,11,12,13,14,15}
2240
2241        mul             v16.8h,  v31.8h,  v18.8h  // {0,1,2,3,4,5,6,7}* -dy
2242        mul             v19.8h,  v14.8h,  v18.8h  // {8,9,10,11,12,13,14,15}* -dy
2243        movi            v25.16b, #0x3e
2244        add             v16.8h,  v16.8h,  v18.8h  // -= dy
2245        add             v18.8h,  v19.8h,  v18.8h
2246
2247        xtn             v31.8b,  v31.8h           // {0,1,2,3,4,5,6,7}
2248        xtn2            v31.16b, v14.8h           // {8,9,10,11,12,13,14,15}
2249
2250        // Worst case height is 64.
2251        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[]
2252        ld1r            {v15.16b}, [x2]           // left[0] == top[0]
2253
2254        movi            v26.16b, #64
2255        movi            v19.16b, #2
2256
2257        xtn             v27.8b,  v16.8h           // (uint8_t)ypos
2258        xtn2            v27.16b, v18.8h
2259        shrn            v29.8b,  v16.8h,  #6      // ypos >> 6
2260        shrn2           v29.16b, v18.8h,  #6
2261        mov             v18.16b, v15.16b          // left[0]
2262        and             v27.16b, v27.16b, v25.16b // frac_y
2263
2264        // Cut corners here; for the first row we don't expect to need to
2265        // read outside of v0.
2266        tbx             v18.16b, {v0.16b}, v29.16b // left[base_y]
2267
2268        add             v30.16b, v29.16b, v19.16b // base_y + 2
2269        add             v29.16b, v29.16b, v17.16b // base_y + 1
2270
2271        sub             v28.16b, v26.16b, v27.16b // 64 - frac_y
2272
2273        movi            v24.16b, #2               // 2
227416:
2275        asr             w9,  w8,  #6              // base_x
2276        dup             v16.8h,   w8              // xpos
2277        sub             w8,  w8,  w6              // xpos -= dx
2278        cmp             w9,  #-16                 // base_x <= -16
2279        asr             w11, w8,  #6              // base_x
2280        b.le            169f
2281
2282        dup             v17.8h,   w8              // xpos
2283
2284        add             x9,  x2,  w9,  sxtw
2285        add             x11, x2,  w11, sxtw
2286
2287        ld1             {v4.16b, v5.16b}, [x9]    // top[base_x]
2288        mov             v19.16b, v15.16b          // left[0]
2289        ld1             {v6.16b, v7.16b}, [x11]
2290
2291        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2292
2293        mov             v20.16b, v15.16b          // left[0]
2294
2295        shrn            v21.8b,  v16.8h,  #6      // first base_x
2296        shrn            v22.8b,  v17.8h,  #6
2297        xtn             v16.8b,  v16.8h           // (uint8_t)xpos
2298        xtn             v17.8b,  v17.8h
2299
2300        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
2301
2302        trn1            v21.2d,  v21.2d,  v21.2d  // first base_x
2303        trn1            v22.2d,  v22.2d,  v22.2d
2304        trn1            v16.2d,  v16.2d,  v16.2d  // (uint8_t)xpos
2305        trn1            v17.2d,  v17.2d,  v17.2d
2306
2307        ext             v5.16b,  v4.16b,  v5.16b,  #1 // top[base_x+1]
2308        ext             v7.16b,  v6.16b,  v7.16b,  #1
2309
2310        and             v16.16b, v16.16b, v25.16b // frac_x
2311        and             v17.16b, v17.16b, v25.16b
2312
2313        umull           v10.8h,  v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
2314        umlal           v10.8h,  v19.8b,  v27.8b  // + left[base_y+1]*frac_y
2315
2316        sub             v8.16b,  v26.16b, v16.16b // 64 - frac_x
2317        sub             v9.16b,  v26.16b, v17.16b
2318
2319        umull2          v11.8h,  v18.16b, v28.16b
2320        umlal2          v11.8h,  v19.16b, v27.16b
2321
2322        add             v21.16b, v21.16b, v31.16b // actual base_x
2323        add             v22.16b, v22.16b, v31.16b
2324
2325        umull           v12.8h,  v19.8b,  v28.8b
2326        umlal           v12.8h,  v20.8b,  v27.8b
2327        umull2          v13.8h,  v19.16b, v28.16b
2328        umlal2          v13.8h,  v20.16b, v27.16b
2329
2330        rshrn           v10.8b,  v10.8h,  #6
2331        rshrn2          v10.16b, v11.8h,  #6
2332        rshrn           v11.8b,  v12.8h,  #6
2333        rshrn2          v11.16b, v13.8h,  #6
2334
2335        umull           v12.8h,  v4.8b,   v8.8b   // top[base_x]-*(64-frac_x)
2336        umlal           v12.8h,  v5.8b,   v16.8b  // + top[base_x+1]*frac_x
2337        umull2          v13.8h,  v4.16b,  v8.16b
2338        umlal2          v13.8h,  v5.16b,  v16.16b
2339        umull           v14.8h,  v6.8b,   v9.8b
2340        umlal           v14.8h,  v7.8b,   v17.8b
2341        umull2          v18.8h,  v6.16b,  v9.16b
2342        umlal2          v18.8h,  v7.16b,  v17.16b
2343
2344        cmge            v21.16b, v21.16b, #0
2345        cmge            v22.16b, v22.16b, #0
2346
2347        rshrn           v12.8b,  v12.8h,  #6
2348        rshrn2          v12.16b, v13.8h,  #6
2349        rshrn           v13.8b,  v14.8h,  #6
2350        rshrn2          v13.16b, v18.8h,  #6
2351
2352        bit             v10.16b, v12.16b, v21.16b
2353        bit             v11.16b, v13.16b, v22.16b
2354
2355        st1             {v10.16b}, [x0], x1
2356        subs            w5,  w5,  #2
2357        sub             w8,  w8,  w6              // xpos -= dx
2358        st1             {v11.16b}, [x0], x1
2359        b.le            9f
2360
2361        mov             v18.16b, v20.16b
2362        add             v29.16b, v29.16b, v24.16b // base_y += 2
2363        add             v30.16b, v30.16b, v24.16b // base_y += 2
2364        b               16b
2365
2366169:
2367        mov             v19.16b, v15.16b
2368        mov             v20.16b, v15.16b
2369        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2370        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
2371
2372        umull           v4.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
2373        umlal           v4.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
2374        umull2          v5.8h,   v18.16b, v28.16b
2375        umlal2          v5.8h,   v19.16b, v27.16b
2376        umull           v6.8h,   v19.8b,  v28.8b
2377        umlal           v6.8h,   v20.8b,  v27.8b
2378        umull2          v7.8h,   v19.16b, v28.16b
2379        umlal2          v7.8h,   v20.16b, v27.16b
2380
2381        rshrn           v4.8b,   v4.8h,   #6
2382        rshrn2          v4.16b,  v5.8h,   #6
2383        rshrn           v5.8b,   v6.8h,   #6
2384        rshrn2          v5.16b,  v7.8h,   #6
2385
2386        st1             {v4.16b}, [x0], x1
2387        subs            w5,  w5,  #2
2388        st1             {v5.16b}, [x0], x1
2389        b.le            9f
2390
2391        mov             v18.16b, v20.16b
2392        add             v29.16b, v29.16b, v24.16b // base_y += 2
2393        add             v30.16b, v30.16b, v24.16b // base_y += 2
2394        b               169b
2395
23969:
2397        ldp             d14, d15, [sp, #0x30]
2398        ldp             d12, d13, [sp, #0x20]
2399        ldp             d10, d11, [sp, #0x10]
2400        ldp             d8,  d9,  [sp], 0x40
2401        ret
2402
2403320:
2404640:
2405        AARCH64_VALID_JUMP_TARGET
2406
2407        stp             d8,  d9,  [sp, #-0x40]!
2408        stp             d10, d11, [sp, #0x10]
2409        stp             d12, d13, [sp, #0x20]
2410        stp             d14, d15, [sp, #0x30]
2411
2412        add             x11, x11, #16             // increments
2413
2414        dup             v25.8h,  w7               // -dy
2415        add             x3,  x3,  #1              // Skip past left[0]
2416
2417        ld1             {v14.8h}, [x11]           // {8,9,10,11,12,13,14,15}
2418
2419        add             x13, x0,  x1              // alternating row
2420        lsl             x1,  x1,  #1              // stride *= 2
2421        sub             x1,  x1,  w4,  uxtw       // stride -= width
2422
2423        movi            v11.8h,  #8
2424        mul             v26.8h,  v31.8h,  v25.8h  // {0,1,2,3,4,5,6,7}* -dy
2425        add             v26.8h,  v26.8h,  v25.8h  // -= dy
2426        mul             v25.8h,  v25.8h,  v11.8h  // -8*dy
2427
2428        xtn             v31.8b,  v31.8h           // {0,1,2,3,4,5,6,7}
2429        xtn2            v31.16b, v14.8h           // {8,9,10,11,12,13,14,15}
2430
2431        // Worst case height is 64.
2432        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[]
2433        ld1r            {v15.16b}, [x2]           // left[0] == top[0]
2434
2435        mov             w12, w4                   // orig w
2436        neg             w14, w4                   // -w
2437
24381:
2439        mov             v23.16b, v26.16b          // reset ypos
2440
2441        asr             w9,  w8,  #6              // base_x
2442        dup             v16.8h,   w8              // xpos
2443        sub             w8,  w8,  w6              // xpos -= dx
2444        cmp             w9,  w14                  // base_x <= -w
2445        asr             w11, w8,  #6              // base_x
2446        b.le            329f
2447
2448        dup             v17.8h,   w8              // xpos
2449        sub             w8,  w8,  w6              // xpos -= dx
2450
2451        add             x9,  x2,  w9,  sxtw
2452        add             x11, x2,  w11, sxtw
2453
2454        sqshrn          v21.8b,  v16.8h,  #6      // first base_x
2455        sqshrn          v22.8b,  v17.8h,  #6
2456        xtn             v16.8b,  v16.8h           // (uint8_t)xpos
2457        xtn             v17.8b,  v17.8h
2458
2459        ld1             {v4.16b}, [x9], #16       // top[base_x]
2460        ld1             {v6.16b}, [x11], #16
2461
2462        trn1            v21.2d,  v21.2d,  v21.2d  // first base_x
2463        trn1            v22.2d,  v22.2d,  v22.2d
2464        trn1            v16.2d,  v16.2d,  v16.2d  // (uint8_t)xpos
2465        trn1            v17.2d,  v17.2d,  v17.2d
2466
2467        movi            v10.16b, #0x3e
2468        movi            v11.16b, #64
2469
2470        and             v16.16b, v16.16b, v10.16b // frac_x
2471        and             v17.16b, v17.16b, v10.16b
2472
2473        sub             v8.16b,  v11.16b, v16.16b // 64 - frac_x
2474        sub             v9.16b,  v11.16b, v17.16b
2475
2476        add             v21.16b, v21.16b, v31.16b // actual base_x
2477        add             v22.16b, v22.16b, v31.16b
2478
24792:
2480        add             v13.8h,  v23.8h,  v25.8h  // ypos -= 8*dy
2481        movi            v12.16b, #64
2482        movi            v20.16b, #2
2483        movi            v10.16b, #0x3e
2484
2485        smov            w10,     v22.b[0]
2486
2487        xtn             v27.8b,  v23.8h           // (uint8_t)ypos
2488        xtn2            v27.16b, v13.8h
2489        shrn            v29.8b,  v23.8h,  #6      // ypos >> 6
2490        shrn2           v29.16b, v13.8h,  #6
2491        cmp             w10, #0                   // base_x (bottom left) >= 0
2492        and             v27.16b, v27.16b, v10.16b // frac_y
2493
2494        mov             v18.16b, v15.16b          // left[0]
2495
2496        b.ge            4f
2497
2498        add             v23.8h,  v13.8h,  v25.8h  // ypos -= 8*dy
2499        movi            v13.16b, #1
2500
2501        tbx             v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
2502        add             v29.16b, v29.16b, v13.16b // base_y + 1
2503        mov             v19.16b, v15.16b          // left[0]
2504
2505        sub             v28.16b, v12.16b, v27.16b // 64 - frac_y
2506
2507        ld1             {v5.16b}, [x9], #16       // top[base_x]
2508        ld1             {v7.16b}, [x11], #16
2509
2510        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2511        add             v29.16b, v29.16b, v13.16b // base_y + 2
2512
2513        mov             v20.16b, v15.16b          // left[0]
2514        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
2515
2516        umull           v10.8h,  v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
2517        umlal           v10.8h,  v19.8b,  v27.8b  // + left[base_y+1]*frac_y
2518        umull2          v11.8h,  v18.16b, v28.16b
2519        umlal2          v11.8h,  v19.16b, v27.16b
2520        umull           v12.8h,  v19.8b,  v28.8b
2521        umlal           v12.8h,  v20.8b,  v27.8b
2522        umull2          v13.8h,  v19.16b, v28.16b
2523        umlal2          v13.8h,  v20.16b, v27.16b
2524
2525        ext             v18.16b, v4.16b,  v5.16b,  #1 // top[base_x+1]
2526        ext             v19.16b, v6.16b,  v7.16b,  #1
2527
2528        rshrn           v10.8b,  v10.8h,  #6
2529        rshrn2          v10.16b, v11.8h,  #6
2530        rshrn           v11.8b,  v12.8h,  #6
2531        rshrn2          v11.16b, v13.8h,  #6
2532
2533        umull           v12.8h,  v4.8b,   v8.8b   // top[base_x]-*(64-frac_x)
2534        umlal           v12.8h,  v18.8b,  v16.8b  // + top[base_x+1]*frac_x
2535        umull2          v13.8h,  v4.16b,  v8.16b
2536        umlal2          v13.8h,  v18.16b, v16.16b
2537        umull           v14.8h,  v6.8b,   v9.8b
2538        umlal           v14.8h,  v19.8b,  v17.8b
2539        umull2          v20.8h,  v6.16b,  v9.16b
2540        umlal2          v20.8h,  v19.16b, v17.16b
2541
2542        cmge            v18.16b, v21.16b, #0
2543        cmge            v19.16b, v22.16b, #0
2544
2545        rshrn           v12.8b,  v12.8h,  #6
2546        rshrn2          v12.16b, v13.8h,  #6
2547        rshrn           v13.8b,  v14.8h,  #6
2548        rshrn2          v13.16b, v20.8h,  #6
2549
2550        bit             v10.16b, v12.16b, v18.16b
2551        bit             v11.16b, v13.16b, v19.16b
2552
2553        st1             {v10.16b}, [x0], #16
2554        subs            w4,  w4,  #16
2555        st1             {v11.16b}, [x13], #16
2556        b.le            3f
2557
2558        movi            v10.16b, #16
2559        mov             v4.16b,  v5.16b
2560        mov             v6.16b,  v7.16b
2561        add             v21.16b, v21.16b, v10.16b // base_x += 16
2562        add             v22.16b, v22.16b, v10.16b
2563        b               2b
2564
25653:
2566        subs            w5,  w5,  #2
2567        b.le            9f
2568        movi            v10.8h, #128
2569        add             x0,  x0,  x1
2570        add             x13, x13, x1
2571        mov             w4,  w12                  // reset w
2572        add             v26.8h,  v26.8h,  v10.8h  // ypos += 2*(1<<6)
2573        b               1b
2574
25754:      // The rest of the row only predicted from top[]
2576        ld1             {v5.16b}, [x9], #16       // top[base_x]
2577        ld1             {v7.16b}, [x11], #16
2578
2579        ext             v18.16b, v4.16b,  v5.16b,  #1 // top[base_x+1]
2580        ext             v19.16b, v6.16b,  v7.16b,  #1
2581
2582        umull           v12.8h,  v4.8b,   v8.8b   // top[base_x]-*(64-frac_x)
2583        umlal           v12.8h,  v18.8b,  v16.8b  // + top[base_x+1]*frac_x
2584        umull2          v13.8h,  v4.16b,  v8.16b
2585        umlal2          v13.8h,  v18.16b, v16.16b
2586        umull           v14.8h,  v6.8b,   v9.8b
2587        umlal           v14.8h,  v19.8b,  v17.8b
2588        umull2          v20.8h,  v6.16b,  v9.16b
2589        umlal2          v20.8h,  v19.16b, v17.16b
2590
2591        rshrn           v12.8b,  v12.8h,  #6
2592        rshrn2          v12.16b, v13.8h,  #6
2593        rshrn           v13.8b,  v14.8h,  #6
2594        rshrn2          v13.16b, v20.8h,  #6
2595
2596        st1             {v12.16b}, [x0], #16
2597        subs            w4,  w4,  #16
2598        st1             {v13.16b}, [x13], #16
2599        b.le            3b
2600
2601        mov             v4.16b,  v5.16b
2602        mov             v6.16b,  v7.16b
2603        b               4b
2604
2605329:    // The rest of the block only predicted from left[]
2606        add             x1,  x1,  w4,  uxtw       // restore stride
2607        mov             w12, w5                   // orig remaining h
26081:
2609        add             v13.8h,  v23.8h,  v25.8h  // ypos -= 8*dy
2610        movi            v12.16b, #64
2611        movi            v10.16b, #0x3e
2612
2613        xtn             v27.8b,  v23.8h           // (uint8_t)ypos
2614        xtn2            v27.16b, v13.8h
2615        shrn            v29.8b,  v23.8h,  #6      // ypos >> 6
2616        shrn2           v29.16b, v13.8h,  #6
2617        and             v27.16b, v27.16b, v10.16b // frac_y
2618
2619        mov             v18.16b, v15.16b          // left[0]
2620        add             v23.8h,  v13.8h,  v25.8h  // ypos -= 8*dy
2621        movi            v21.16b, #1
2622
2623        tbx             v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
2624        add             v29.16b, v29.16b, v21.16b // base_y + 1
2625
2626        sub             v28.16b, v12.16b, v27.16b // 64 - frac_y
26272:
2628        mov             v19.16b, v15.16b          // left[0]
2629        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2630        add             v29.16b, v29.16b, v21.16b // base_y + 2
2631        mov             v20.16b, v15.16b          // left[0]
2632        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
2633        add             v29.16b, v29.16b, v21.16b // next base_y
2634
2635        umull           v10.8h,  v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
2636        umlal           v10.8h,  v19.8b,  v27.8b  // + left[base_y+1]*frac_y
2637        umull2          v11.8h,  v18.16b, v28.16b
2638        umlal2          v11.8h,  v19.16b, v27.16b
2639        umull           v12.8h,  v19.8b,  v28.8b
2640        umlal           v12.8h,  v20.8b,  v27.8b
2641        umull2          v13.8h,  v19.16b, v28.16b
2642        umlal2          v13.8h,  v20.16b, v27.16b
2643
2644        rshrn           v10.8b,  v10.8h,  #6
2645        rshrn2          v10.16b, v11.8h,  #6
2646        rshrn           v11.8b,  v12.8h,  #6
2647        rshrn2          v11.16b, v13.8h,  #6
2648
2649        st1             {v10.16b}, [x0], x1
2650        subs            w5,  w5,  #2
2651        st1             {v11.16b}, [x13], x1
2652        b.le            3f
2653        mov             v18.16b, v20.16b
2654        b               2b
2655
26563:
2657        subs            w4,  w4,  #16
2658        b.le            9f
2659
2660        lsr             x1,  x1,  #1
2661        msub            x0,  x1,  x12, x0         // ptr -= h * stride
2662        msub            x13, x1,  x12, x13
2663        lsl             x1,  x1,  #1
2664        add             x0,  x0,  #16
2665        add             x13, x13, #16
2666        mov             w5,  w12                  // reset h
2667        b               1b
2668
26699:
2670        ldp             d14, d15, [sp, #0x30]
2671        ldp             d12, d13, [sp, #0x20]
2672        ldp             d10, d11, [sp, #0x10]
2673        ldp             d8,  d9,  [sp], 0x40
2674        ret
2675endfunc
2676
2677jumptable ipred_z2_fill1_tbl
        // Width-indexed dispatch table for ipred_z2_fill1, largest width
        // first.  Each entry is the offset from the table base to the
        // backward-referenced entry point for that block width
        // (640b/320b = w64/w32 shared path, 160b = w16, 80b = w8, 40b = w4).
2678        .word 640b - ipred_z2_fill1_tbl
2679        .word 320b - ipred_z2_fill1_tbl
2680        .word 160b - ipred_z2_fill1_tbl
2681        .word 80b  - ipred_z2_fill1_tbl
2682        .word 40b  - ipred_z2_fill1_tbl
2683endjumptable
2684
// void ipred_z2_fill2_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const top,
//                               const pixel *const left,
//                               const int width, const int height,
//                               const int dx, const int dy);
// (prototype inferred from register usage below — x0=dst, x1=stride,
//  x2=top, x3=left, w4=width, w5=height, w6=dx, w7=dy; confirm against
//  the C-side caller.)
//
// Z2 directional prediction, variant for an upsampled top edge: top[] is
// read at double rate (uzp1/uzp2 split the loaded vector into even/odd
// samples = top[base_x]/top[base_x+1]) and the per-column base_x offsets
// step by 2 instead of 1.  Two output rows are produced per iteration;
// each pixel blends a top-edge interpolation with a left-edge
// interpolation, selected by whether its base_x is still >= 0.
2685function ipred_z2_fill2_8bpc_neon, export=1
2686        cmp             w4,  #8                   // dispatch: w == 8 vs w == 4
2687        mov             w8,  #(2 << 6)            // xpos = 2 << 6
2688        sub             w8,  w8,  w6              // xpos -= dx
2689
2690        movrel          x11, increments
2691        ld1             {v31.8h},  [x11]          // increments
2692        neg             w7,  w7                   // -dy
2693        b.eq            80f
2694
        // w == 4 path: two rows of 4 pixels per iteration.
269540:
2696        dup             v30.4h,  w7               // -dy
2697        movi            v17.8b,  #1
2698
2699        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
2700        movi            v25.16b, #0x3e
2701        add             v30.4h,  v16.4h,  v30.4h  // -= dy
2702
2703        xtn             v31.8b,  v31.8h           // {0,1,2,3}
2704
2705        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
2706        // from left.
2707        ld1             {v0.16b}, [x3]            // left[]
2708
2709        movi            v26.16b, #64
2710        movi            v19.16b, #2
2711
2712        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
2713        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
2714        and             v27.8b,  v27.8b,  v25.8b  // frac_y = ypos & 0x3e
2715
2716        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
2717
2718        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1
2719        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2
2720
2721        tbl             v16.8b, {v0.16b}, v29.8b  // left[base_y]
2722
2723        trn1            v30.2s,  v30.2s,  v28.2s  // base_y + 1, base_y + 2
2724
2725        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y
2726
2727        trn1            v31.2s,  v31.2s,  v31.2s  // {0,1,2,3,0,1,2,3}
2728
2729        trn1            v27.2s,  v27.2s,  v27.2s  // frac_y
2730        trn1            v28.2s,  v28.2s,  v28.2s  // 64 - frac_y
2731
2732        movi            v29.8b,  #2
2733        add             v31.8b,  v31.8b,  v31.8b  // {0,2,4,6,0,2,4,6}
27344:
2735        asr             w9,  w8,  #6              // base_x
2736        dup             v6.4h,   w8               // xpos
2737        sub             w8,  w8,  w6              // xpos -= dx
2738        cmp             w9,  #-8                  // base_x <= -8
2739        asr             w11, w8,  #6              // base_x
2740        b.le            49f                       // whole rows left-only from here on
2741
2742        dup             v7.4h,   w8               // xpos
2743
2744        ldr             d2,  [x2, w9, sxtw]       // top[base_x]
2745        ldr             d4,  [x2, w11, sxtw]
2746
2747        trn1            v6.2d,   v6.2d,   v7.2d   // xpos
2748
2749        tbl             v17.8b, {v0.16b}, v30.8b  // left[base_y+1], left[base_y+2]
2750
2751        shrn            v20.8b,  v6.8h,   #6      // first base_x for each row
2752        xtn             v6.8b,   v6.8h            // (uint8_t)xpos
2753
2754        uzp2            v3.8b,   v2.8b,   v4.8b   // top[base_x+1] (odd samples)
2755        uzp1            v2.8b,   v2.8b,   v4.8b   // top[base_x]   (even samples)
2756
2757        and             v6.8b,   v6.8b,   v25.8b  // frac_x
2758
2759        trn1            v16.2s,  v16.2s,  v17.2s  // left[base_y], left[base_y+1]
2760
2761        sub             v7.8b,   v26.8b,  v6.8b   // 64 - frac_x
2762
2763        add             v20.8b,  v20.8b,  v31.8b  // actual base_x
2764
2765        umull           v16.8h,  v16.8b,  v28.8b  // left[base_y]*(64-frac_y)
2766        umlal           v16.8h,  v17.8b,  v27.8b  // + left[base_y+1]*frac_y
2767
2768        umull           v22.8h,  v2.8b,   v7.8b   // top[base_x]*(64-frac_x)
2769        umlal           v22.8h,  v3.8b,   v6.8b   // + top[base_x+1]*frac_x
2770
2771        cmge            v20.8b,  v20.8b,  #0      // mask: base_x >= 0
2772
2773        rshrn           v16.8b,  v16.8h,  #6
2774        rshrn           v22.8b,  v22.8h,  #6
2775
2776        bit             v16.8b,  v22.8b,  v20.8b  // take top result where base_x >= 0
2777
2778        st1             {v16.s}[0], [x0], x1
2779        sub             w8,  w8,  w6              // xpos -= dx
2780        subs            w5,  w5,  #2
2781        st1             {v16.s}[1], [x0], x1
2782        b.le            9f
2783
        // Upper half of v17 is left[base_y+2], which becomes the next
        // iteration's left[base_y].
2784        ext             v16.8b,  v17.8b,  v17.8b, #4
2785        add             v30.8b,  v30.8b,  v29.8b  // base_y += 2
2786        b               4b
2787
        // w == 4: rest of the block predicted only from left[].
278849:
2789        tbl             v17.8b, {v0.16b}, v30.8b  // left[base_y+1], left[base_y+2]
2790
2791        trn1            v16.2s,  v16.2s,  v17.2s  // left[base_y], left[base_y+1]
2792
2793        umull           v18.8h,  v16.8b,  v28.8b  // left[base_y]*(64-frac_y)
2794        umlal           v18.8h,  v17.8b,  v27.8b  // + left[base_y+1]*frac_y
2795        rshrn           v18.8b,  v18.8h,  #6
2796
2797        st1             {v18.s}[0], [x0], x1
2798        subs            w5,  w5,  #2
2799        st1             {v18.s}[1], [x0], x1
2800        b.le            9f
2801
2802        ext             v16.8b,  v17.8b,  v17.8b, #4
2803        add             v30.8b,  v30.8b,  v29.8b  // base_y += 2
2804        b               49b
2805
28069:
2807        ret
2808
        // w == 8 path: two rows of 8 pixels per iteration.
280980:
2810        dup             v30.8h,  w7               // -dy
2811        movi            v17.8b,  #1
2812
2813        mul             v16.8h,  v31.8h,  v30.8h  // {0,1,2,3,4,5,6,7}* -dy
2814        movi            v25.16b, #0x3e
2815        add             v30.8h,  v16.8h,  v30.8h  // -= dy
2816
2817        xtn             v31.8b,  v31.8h           // {0,1,2,3,4,5,6,7}
2818
2819        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
2820        // from left.
2821        ld1             {v0.16b}, [x3]    // left[]
2822
2823        movi            v26.16b, #64
2824        movi            v19.16b, #2
2825
2826        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
2827        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
2828        and             v27.8b,  v27.8b,  v25.8b  // frac_y = ypos & 0x3e
2829
2830        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
2831
2832        tbl             v18.8b, {v0.16b}, v29.8b  // left[base_y]
2833
2834        add             v30.8b,  v29.8b,  v19.8b  // base_y + 2
2835        add             v29.8b,  v29.8b,  v17.8b  // base_y + 1
2836
2837        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y
2838
2839        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}
2840
2841        movi            v24.8b,  #2               // base_y increment per 2 rows
2842        add             v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14}
28438:
2844        asr             w9,  w8,  #6              // base_x
2845        dup             v16.8h,   w8              // xpos
2846        sub             w8,  w8,  w6              // xpos -= dx
2847        cmp             w9,  #-16                 // base_x <= -16
2848        asr             w11, w8,  #6              // base_x
2849        b.le            89f                       // whole rows left-only from here on
2850
2851        dup             v17.8h,   w8              // xpos
2852
2853        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
2854        ldr             q6,  [x2, w11, sxtw]
2855
2856        tbl             v19.8b, {v0.16b}, v29.8b  // left[base_y+1]
2857
2858        shrn            v21.8b,  v16.8h,  #6      // first base_x
2859        shrn2           v21.16b, v17.8h,  #6
2860        xtn             v16.8b,  v16.8h           // (uint8_t)xpos
2861        xtn2            v16.16b, v17.8h
2862
2863        tbl             v20.8b, {v0.16b}, v30.8b  // left[base_y+2]
2864
2865        uzp2            v5.16b,  v4.16b,  v6.16b  // top[base_x+1] (odd samples)
2866        uzp1            v4.16b,  v4.16b,  v6.16b  // top[base_x]   (even samples)
2867
2868        and             v16.16b, v16.16b, v25.16b // frac_x
2869
2870        sub             v7.16b,  v26.16b, v16.16b // 64 - frac_x
2871
2872        add             v21.16b, v21.16b, v31.16b // actual base_x
2873
2874        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
2875        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
2876        umull           v17.8h,  v19.8b,  v28.8b
2877        umlal           v17.8h,  v20.8b,  v27.8b
2878
2879        umull           v22.8h,  v4.8b,   v7.8b   // top[base_x]*(64-frac_x)
2880        umlal           v22.8h,  v5.8b,   v16.8b  // + top[base_x+1]*frac_x
2881        umull2          v23.8h,  v4.16b,  v7.16b
2882        umlal2          v23.8h,  v5.16b,  v16.16b
2883
2884        cmge            v21.16b, v21.16b, #0      // mask: base_x >= 0
2885
2886        rshrn           v6.8b,   v6.8h,   #6
2887        rshrn2          v6.16b,  v17.8h,  #6
2888        rshrn           v22.8b,  v22.8h,  #6
2889        rshrn2          v22.16b, v23.8h,  #6
2890
2891        bit             v6.16b,  v22.16b, v21.16b // take top result where base_x >= 0
2892
2893        st1             {v6.d}[0], [x0], x1
2894        sub             w8,  w8,  w6              // xpos -= dx
2895        subs            w5,  w5,  #2
2896        st1             {v6.d}[1], [x0], x1
2897        b.le            9f
2898
2899        mov             v18.8b,  v20.8b           // left[base_y+2] -> next left[base_y]
2900        add             v29.8b,  v29.8b,  v24.8b  // base_y += 2
2901        add             v30.8b,  v30.8b,  v24.8b  // base_y += 2
2902        b               8b
2903
        // w == 8: rest of the block predicted only from left[].
290489:
2905        tbl             v19.8b, {v0.16b}, v29.8b  // left[base_y+1]
2906        tbl             v20.8b, {v0.16b}, v30.8b  // left[base_y+2]
2907
2908        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
2909        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
2910        umull           v17.8h,  v19.8b,  v28.8b
2911        umlal           v17.8h,  v20.8b,  v27.8b
2912
2913        rshrn           v6.8b,   v6.8h,   #6
2914        rshrn2          v6.16b,  v17.8h,  #6
2915
2916        st1             {v6.d}[0], [x0], x1
2917        subs            w5,  w5,  #2
2918        st1             {v6.d}[1], [x0], x1
2919        b.le            9f
2920
2921        mov             v18.8b,  v20.8b           // left[base_y+2] -> next left[base_y]
2922        add             v29.8b,  v29.8b,  v24.8b  // base_y += 2
2923        add             v30.8b,  v30.8b,  v24.8b  // base_y += 2
2924        b               89b
2925
29269:
2927        ret
2928endfunc
2929
// void ipred_z2_fill3_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const top,
//                               const pixel *const left,
//                               const int width, const int height,
//                               const int dx, const int dy);
// NOTE(review): parameter names inferred from register usage below
// (x0=dst, x1=stride, x2=top, x3=left, w4=width, w5=height, w6=dx,
// w7=dy) — confirm against the C prototype.
//
// Z2 variant used when the left edge is upsampled: base_y starts at
// (ypos >> 6) + 2 and advances by 2 per output row (+4 per loop
// iteration, which emits two rows). Only w == 4 and w == 8 are
// handled here; x0 pixels per row come either from top[] (blended
// with frac_x) or, where base_x has gone negative, from left[]
// (blended with frac_y), selected per pixel with cmge/bit.
function ipred_z2_fill3_8bpc_neon, export=1
        cmp             w4,  #8
        mov             w8,  #(1 << 6)            // xpos = 1 << 6
        sub             w8,  w8,  w6              // xpos -= dx

        movrel          x11, increments
        ld1             {v31.8h},  [x11]          // increments
        neg             w7,  w7                   // -dy
        b.eq            80f

// w == 4: two 4-pixel rows per iteration.
40:
        dup             v30.4h,  w7               // -dy
        movi            v17.8b,  #1

        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
        movi            v25.16b, #0x3e
        add             v30.4h,  v16.4h,  v30.4h  // -= dy

        xtn             v31.8b,  v31.8h           // {0,1,2,3}

        // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
        ld1             {v0.16b, v1.16b}, [x3]    // left[]

        movi            v26.16b, #64
        movi            v19.16b, #2

        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
        and             v27.8b,  v27.8b,  v25.8b  // frac_y

        add             v29.8b,  v29.8b,  v19.8b  // base_y = (ypos >> 6) + 2

        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1
        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2

        trn1            v31.2s,  v31.2s,  v31.2s  // {0,1,2,3,0,1,2,3}

        add             v24.8b,  v30.8b,  v19.8b  // base_y + 3

        // Interleave indices for the two rows handled per iteration:
        // row 0 uses base_y+0/+1, row 1 uses base_y+2/+3.
        trn1            v29.2s,  v29.2s,  v28.2s  // base_y + 0, base_y + 2
        trn1            v30.2s,  v30.2s,  v24.2s  // base_y + 1, base_y + 3

        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y

        trn1            v27.2s,  v27.2s,  v27.2s  // frac_y
        trn1            v28.2s,  v28.2s,  v28.2s  // 64 - frac_y

        movi            v24.8b,  #4               // per-iteration base_y step
4:
        asr             w9,  w8,  #6              // base_x
        dup             v6.4h,   w8               // xpos
        sub             w8,  w8,  w6              // xpos -= dx
        cmp             w9,  #-4                  // base_x <= -4
        asr             w11, w8,  #6              // base_x
        b.le            49f                       // both rows come purely from left[]

        dup             v7.4h,   w8               // xpos

        ldr             d2,  [x2, w9, sxtw]       // top[base_x]
        ldr             d4,  [x2, w11, sxtw]

        trn1            v6.2d,   v6.2d,   v7.2d   // xpos

        tbl             v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2]
        tbl             v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3]

        shrn            v20.8b,  v6.8h,   #6      // first base_x for each row
        xtn             v6.8b,   v6.8h            // (uint8_t)xpos

        ext             v3.8b,   v2.8b,   v2.8b,   #1 // top[base_x+1]
        ext             v5.8b,   v4.8b,   v4.8b,   #1

        and             v6.8b,   v6.8b,   v25.8b  // frac_x

        trn1            v2.2s,   v2.2s,   v4.2s   // top[base_x]
        trn1            v3.2s,   v3.2s,   v5.2s   // top[base_x+1]

        sub             v7.8b,   v26.8b,  v6.8b   // 64 - frac_x

        add             v20.8b,  v20.8b,  v31.8b  // actual base_x

        umull           v16.8h,  v16.8b,  v28.8b  // left[base_y]*(64-frac_y)
        umlal           v16.8h,  v17.8b,  v27.8b  // + left[base_y+1]*frac_y

        umull           v22.8h,  v2.8b,   v7.8b   // top[base_x]*(64-frac_x)
        umlal           v22.8h,  v3.8b,   v6.8b   // + top[base_x+1]*frac_x

        cmge            v20.8b,  v20.8b,  #0      // mask: base_x >= 0

        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v22.8b,  v22.8h,  #6

        bit             v16.8b,  v22.8b,  v20.8b  // take top-blend where base_x >= 0

        st1             {v16.s}[0], [x0], x1
        sub             w8,  w8,  w6              // xpos -= dx
        subs            w5,  w5,  #2
        st1             {v16.s}[1], [x0], x1
        b.le            9f

        add             v29.8b,  v29.8b,  v24.8b  // base_y += 4
        add             v30.8b,  v30.8b,  v24.8b  // base_y += 4
        b               4b

// All remaining pixels come from left[]; no top blend or mask needed.
49:
        tbl             v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2]
        tbl             v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3]

        umull           v18.8h,  v16.8b,  v28.8b  // left[base_y]*(64-frac_y)
        umlal           v18.8h,  v17.8b,  v27.8b  // + left[base_y+1]*frac_y
        rshrn           v18.8b,  v18.8h,  #6

        st1             {v18.s}[0], [x0], x1
        subs            w5,  w5,  #2
        st1             {v18.s}[1], [x0], x1
        b.le            9f

        add             v29.8b,  v29.8b,  v24.8b  // base_y += 4
        add             v30.8b,  v30.8b,  v24.8b  // base_y += 4
        b               49b

9:
        ret

// w == 8: two 8-pixel rows per iteration; same scheme as above but with
// full 16-byte vectors (rows packed into the two d-halves of v6).
80:
        dup             v30.8h,  w7               // -dy
        movi            v17.8b,  #1

        mul             v16.8h,  v31.8h,  v30.8h  // {0,1,2,3,4,5,6,7}* -dy
        movi            v25.16b, #0x3e
        add             v30.8h,  v16.8h,  v30.8h  // -= dy

        xtn             v31.8b,  v31.8h           // {0,1,2,3,4,5,6,7}

        // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
        ld1             {v0.16b, v1.16b, v2.16b}, [x3]    // left[]

        movi            v26.16b, #64
        movi            v19.16b, #2

        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
        and             v27.8b,  v27.8b,  v25.8b  // frac_y

        add             v29.8b,  v29.8b,  v19.8b  // base_y = (ypos >> 6) + 2

        add             v28.8b,  v29.8b,  v17.8b  // base_y + 1
        add             v30.8b,  v29.8b,  v19.8b  // base_y + 2

        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}
        add             v24.8b,  v28.8b,  v19.8b  // base_y + 3

        trn1            v29.2d,  v29.2d,  v30.2d  // base_y + 0, base_y + 2
        trn1            v30.2d,  v28.2d,  v24.2d  // base_y + 1, base_y + 3

        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y

        movi            v24.16b, #4               // per-iteration base_y step

        trn1            v27.2d,  v27.2d,  v27.2d  // frac_y
        trn1            v28.2d,  v28.2d,  v28.2d  // 64 - frac_y
8:
        asr             w9,  w8,  #6              // base_x
        dup             v16.8h,   w8              // xpos
        sub             w8,  w8,  w6              // xpos -= dx
        cmp             w9,  #-8                  // base_x <= -8
        asr             w11, w8,  #6              // base_x
        b.le            89f                       // both rows come purely from left[]

        dup             v17.8h,   w8              // xpos

        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
        ldr             q6,  [x2, w11, sxtw]

        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]

        shrn            v21.8b,  v16.8h,  #6      // first base_x
        shrn2           v21.16b, v17.8h,  #6
        xtn             v16.8b,  v16.8h           // (uint8_t)xpos
        xtn2            v16.16b, v17.8h

        ext             v5.16b,  v4.16b,  v4.16b,  #1 // top[base_x+1]
        ext             v7.16b,  v6.16b,  v6.16b,  #1

        and             v16.16b, v16.16b, v25.16b // frac_x

        trn1            v4.2d,   v4.2d,   v6.2d   // top[base_x]
        trn1            v5.2d,   v5.2d,   v7.2d   // top[base_x+1]

        sub             v7.16b,  v26.16b, v16.16b // 64 - frac_x

        add             v21.16b, v21.16b, v31.16b // actual base_x

        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
        umull2          v17.8h,  v18.16b, v28.16b
        umlal2          v17.8h,  v19.16b, v27.16b

        umull           v22.8h,  v4.8b,   v7.8b   // top[base_x]*(64-frac_x)
        umlal           v22.8h,  v5.8b,   v16.8b  // + top[base_x+1]*frac_x
        umull2          v23.8h,  v4.16b,  v7.16b
        umlal2          v23.8h,  v5.16b,  v16.16b

        cmge            v21.16b, v21.16b, #0      // mask: base_x >= 0

        rshrn           v6.8b,   v6.8h,   #6
        rshrn2          v6.16b,  v17.8h,  #6
        rshrn           v22.8b,  v22.8h,  #6
        rshrn2          v22.16b, v23.8h,  #6

        bit             v6.16b,  v22.16b, v21.16b // take top-blend where base_x >= 0

        st1             {v6.d}[0], [x0], x1
        sub             w8,  w8,  w6              // xpos -= dx
        subs            w5,  w5,  #2
        st1             {v6.d}[1], [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 4
        add             v30.16b, v30.16b, v24.16b // base_y += 4
        b               8b

// All remaining pixels come from left[]; no top blend or mask needed.
89:
        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]

        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
        umull2          v17.8h,  v18.16b, v28.16b
        umlal2          v17.8h,  v19.16b, v27.16b

        rshrn           v6.8b,   v6.8h,   #6
        rshrn2          v6.16b,  v17.8h,  #6

        st1             {v6.d}[0], [x0], x1
        subs            w5,  w5,  #2
        st1             {v6.d}[1], [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 4
        add             v30.16b, v30.16b, v24.16b // base_y += 4
        b               89b

9:
        ret
endfunc
3177
3178
3179// void ipred_z3_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
3180//                               const pixel *const left,
3181//                               const int width, const int height,
3182//                               const int dy, const int max_base_y);
// Registers on entry (per the prototype above): x0=dst, x1=stride,
// x2=left, w3=width, w4=height, w5=dy, w6=max_base_y.
// Dispatches on width via ipred_z3_fill1_tbl; indices past
// max_base_y are saturated (uqadd/uqshrn/tbx) into the padding byte
// replicated in v31. max_base_y > 64 takes a separate column-wise
// fallback path.
function ipred_z3_fill1_8bpc_neon, export=1
        cmp             w6,  #64
        clz             w9,  w3
        movrel          x8,  ipred_z3_fill1_tbl
        sub             w9,  w9,  #25             // jump table index from clz(width)
        ldrsw           x9,  [x8, w9, uxtw #2]
        add             x10, x2,  w6,  uxtw       // left[max_base_y]
        add             x8,  x8,  x9
        movrel          x11, increments
        ld1r            {v31.16b}, [x10]          // padding
        ld1             {v30.8h},  [x11]          // increments
        mov             w7,  w5                   // ypos = dy, for the large-h16 path
        b.gt            L(ipred_z3_fill1_large_h16) // max_base_y > 64
        br              x8

// w == 4: two 4-pixel rows per iteration.
40:
        AARCH64_VALID_JUMP_TARGET
        dup             v29.4h,  w5               // dy

        mul             v30.4h,  v30.4h,  v29.4h  // {0,1,2,3}*dy
        movi            v23.16b, #0x3e

        // Worst case max_base_y is width+height-1, for w=4, h=16, <= 32
        ld1             {v0.16b, v1.16b}, [x2] // left[]
        add             v30.4h,  v29.4h,  v30.4h  // ypos

        movi            v22.16b, #64
        movi            v20.16b, #1
        movi            v21.16b, #2

        xtn             v24.8b,  v30.8h           // (uint8_t)ypos
        uqshrn          v26.8b,  v30.8h,  #6      // base
        and             v24.8b,  v24.8b,  v23.8b  // frac

        mov             v4.8b,   v31.8b           // preload padding; tbx keeps it for OOR indices
        uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1
        uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2
        sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac

        tbx             v4.8b, {v0.16b, v1.16b}, v26.8b // left[base]

        trn1            v27.2s,  v27.2s,  v28.2s  // base + 1, base + 2
        trn1            v24.2s,  v24.2s,  v24.2s  // frac
        trn1            v25.2s,  v25.2s,  v25.2s  // 64 - frac
1:
        mov             v5.8b,   v31.8b
        tbx             v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+2]

        trn1            v4.2s,   v4.2s,   v5.2s   // left[base], left[base+1]

        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
        rshrn           v16.8b,  v16.8h,  #6
        st1             {v16.s}[0], [x0], x1
        subs            w4,  w4,  #2
        st1             {v16.s}[1], [x0], x1
        b.le            9f

        ext             v4.8b,   v5.8b,   v5.8b,  #4 // left[base+2] becomes next left[base]
        uqadd           v27.8b,  v27.8b,  v21.8b  // base += 2
        b               1b

9:
        ret

// w == 8: two 8-pixel rows per iteration.
80:
        AARCH64_VALID_JUMP_TARGET
        dup             v29.8h,  w5               // dy

        mul             v30.8h,  v30.8h,  v29.8h  // {0,1,2,3,4,5,6,7}*dy
        movi            v23.16b, #0x3e

        // Worst case max_base_y is width+height-1, for w=8, h=32, <= 48
        ld1             {v0.16b, v1.16b, v2.16b}, [x2] // left[]
        add             v30.8h,  v29.8h,  v30.8h  // ypos

        movi            v22.16b, #64
        movi            v20.16b, #1
        movi            v21.16b, #2

        xtn             v24.8b,  v30.8h           // (uint8_t)ypos
        uqshrn          v26.8b,  v30.8h,  #6      // base
        and             v24.8b,  v24.8b,  v23.8b  // frac

        mov             v4.8b,   v31.8b           // preload padding; tbx keeps it for OOR indices
        uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1
        uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2
        sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac

        tbx             v4.8b, {v0.16b, v1.16b, v2.16b}, v26.8b // left[base]
1:
        mov             v5.8b,   v31.8b
        mov             v6.8b,   v31.8b
        tbx             v5.8b, {v0.16b, v1.16b, v2.16b}, v27.8b // left[base+1]
        tbx             v6.8b, {v0.16b, v1.16b, v2.16b}, v28.8b // left[base+2]

        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
        umull           v17.8h,  v5.8b,   v25.8b
        umlal           v17.8h,  v6.8b,   v24.8b
        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v17.8b,  v17.8h,  #6
        st1             {v16.8b}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v17.8b}, [x0], x1
        b.le            9f

        mov             v4.8b,   v6.8b            // left[base+2] becomes next left[base]
        uqadd           v27.8b,  v27.8b,  v21.8b  // base += 2
        uqadd           v28.8b,  v28.8b,  v21.8b  // base += 2
        b               1b

9:
        ret

// w == 16: two 16-pixel rows per iteration.
160:
        AARCH64_VALID_JUMP_TARGET
        dup             v28.8h,  w5               // dy

        shl             v29.8h,  v28.8h,  #3      // 8*dy
        mul             v30.8h,  v30.8h,  v28.8h  // {0,1,2,3,4,5,6,7}*dy
        movi            v23.16b, #0x3e

        // This is only executed if we've checked that max_base_y <= 64.
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[]
        add             v28.8h,  v28.8h,  v30.8h  // ypos

        movi            v22.16b, #64
        movi            v20.16b, #1
        movi            v21.16b, #2

        add             v29.8h,  v28.8h,  v29.8h  // ypos + 8*dy

        xtn             v24.8b,  v28.8h           // (uint8_t)ypos
        xtn2            v24.16b, v29.8h
        uqshrn          v26.8b,  v28.8h,  #6      // base
        uqshrn2         v26.16b, v29.8h,  #6
        and             v24.16b, v24.16b, v23.16b // frac

        mov             v4.16b,  v31.16b          // preload padding; tbx keeps it for OOR indices
        uqadd           v27.16b, v26.16b, v20.16b // base + 1
        uqadd           v28.16b, v26.16b, v21.16b // base + 2
        sub             v25.16b, v22.16b, v24.16b // 64 - frac

        tbx             v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.16b // left[base]
1:
        mov             v5.16b,  v31.16b
        mov             v6.16b,  v31.16b
        tbx             v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.16b // left[base+1]
        tbx             v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v28.16b // left[base+2]

        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
        umull2          v17.8h,  v4.16b,  v25.16b
        umlal2          v17.8h,  v5.16b,  v24.16b
        umull           v18.8h,  v5.8b,   v25.8b
        umlal           v18.8h,  v6.8b,   v24.8b
        umull2          v19.8h,  v5.16b,  v25.16b
        umlal2          v19.8h,  v6.16b,  v24.16b
        rshrn           v16.8b,  v16.8h,  #6
        rshrn2          v16.16b, v17.8h,  #6
        rshrn           v17.8b,  v18.8h,  #6
        rshrn2          v17.16b, v19.8h,  #6
        st1             {v16.16b}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v17.16b}, [x0], x1
        b.le            9f

        mov             v4.16b,  v6.16b           // left[base+2] becomes next left[base]
        uqadd           v27.16b, v27.16b, v21.16b // base += 2
        uqadd           v28.16b, v28.16b, v21.16b // base += 2
        b               1b

9:
        ret
// w == 32 / w == 64: iterate over the row in 16-pixel chunks,
// recomputing ypos per chunk; two rows at a time (x0 and x13).
320:
640:
        AARCH64_VALID_JUMP_TARGET
        dup             v28.8h,  w5               // dy
        mov             w12, w3                   // save width for per-row-pair reset

        add             x13, x0,  x1              // second row pointer

        shl             v29.8h,  v28.8h,  #3      // 8*dy
        mul             v30.8h,  v30.8h,  v28.8h  // {0,1,2,3,4,5,6,7}*dy
        movi            v23.16b, #0x3e

        lsl             x1,  x1,  #1              // advance by 2 rows...
        sub             x1,  x1,  w3,  uxtw       // ...minus the width already written
        add             v30.8h,  v28.8h,  v30.8h  // ypos

        // This is only executed if we've checked that max_base_y <= 64.
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[]

        movi            v22.16b, #64
        movi            v20.16b, #1
        movi            v21.16b, #2

1:
        mov             v26.16b,  v30.16b         // reset ypos

2:
        add             v27.8h,  v26.8h,  v29.8h  // ypos + 8*dy
        uqshrn          v16.8b,  v26.8h,  #6      // base
        uqshrn2         v16.16b, v27.8h,  #6
        xtn             v24.8b,  v26.8h           // (uint8_t)ypos
        xtn2            v24.16b, v27.8h
        umov            w14,     v16.b[0]         // smallest base in this chunk
        and             v24.16b, v24.16b, v23.16b // frac

        uqadd           v17.16b, v16.16b, v20.16b // base + 1
        cmp             w14, w6                   // base >= max_base_y
        uqadd           v18.16b, v16.16b, v21.16b // base + 2
        sub             v25.16b, v22.16b, v24.16b // 64 - frac

        b.ge            4f                        // rest of the line is all padding

        mov             v4.16b,  v31.16b
        mov             v5.16b,  v31.16b
        mov             v6.16b,  v31.16b
        tbx             v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v16.16b // left[base]
        tbx             v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v17.16b // left[base+1]
        tbx             v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v18.16b // left[base+2]

        subs            w3,  w3,  #16
        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
        umull2          v17.8h,  v4.16b,  v25.16b
        umlal2          v17.8h,  v5.16b,  v24.16b
        umull           v18.8h,  v5.8b,   v25.8b
        umlal           v18.8h,  v6.8b,   v24.8b
        umull2          v19.8h,  v5.16b,  v25.16b
        umlal2          v19.8h,  v6.16b,  v24.16b
        rshrn           v16.8b,  v16.8h,  #6
        rshrn2          v16.16b, v17.8h,  #6
        rshrn           v17.8b,  v18.8h,  #6
        rshrn2          v17.16b, v19.8h,  #6
        st1             {v16.16b}, [x0],  #16
        st1             {v17.16b}, [x13], #16
        b.le            3f
        add             v26.8h,  v27.8h,  v29.8h  // ypos += 16*dy
        b               2b

3:      // Row pair done; advance to the next pair of rows.
        subs            w4,  w4,  #2
        b.le            9f
        movi            v16.8h,  #128
        add             x0,  x0,  x1
        add             x13, x13, x1
        add             v30.8h,  v30.8h,  v16.8h  // ypos = dy + y*(1<<6)*2
        mov             w3,  w12
        b               1b

4:      // base >= max_base_y: fill the rest of the line with padding.
        subs            w3,  w3,  #16
        st1             {v31.16b}, [x0],  #16
        st1             {v31.16b}, [x13], #16
        b.gt            4b
        b               3b

9:
        ret

L(ipred_z3_fill1_large_h16):
        // Fallback case for max_base_y > 64; similar to the z1
        // implementation. This does the filtering vertically, filling out
        // a 2x pixel column at a time.
        mov             w15, #64
        add             x13, x0,  x1
        lsl             x1,  x1,  #1

        mov             w12, w4                   // save height for per-column reset
1:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // ypos += dy
        cmp             w8,  w6                   // base >= max_base_y
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            ipred_z3_fill_padding_neon // remaining columns are all padding
        add             x8,  x2,  w8,  uxtw
        add             x10, x2,  w10, uxtw
        dup             v4.16b,  w9               // frac
        dup             v5.16b,  w11
        ld1             {v0.16b, v1.16b}, [x8],  #32 // left[base]
        ld1             {v2.16b, v3.16b}, [x10], #32
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v6.16b,  w9               // 64 - frac
        dup             v7.16b,  w11
        add             w7,  w7,  w5              // ypos += dy
2:
        ext             v16.16b, v0.16b,  v1.16b,  #1 // left[base+1]
        ext             v17.16b, v2.16b,  v3.16b,  #1
        subs            w4,  w4,  #16
        umull           v18.8h,  v16.8b,  v4.8b   // left[base+1]*frac
        umlal           v18.8h,  v0.8b,   v6.8b   // + left[base]*(64-frac)
        umull2          v19.8h,  v16.16b, v4.16b
        umlal2          v19.8h,  v0.16b,  v6.16b
        umull           v20.8h,  v17.8b,  v5.8b
        umlal           v20.8h,  v2.8b,   v7.8b
        umull2          v21.8h,  v17.16b, v5.16b
        umlal2          v21.8h,  v2.16b,  v7.16b
        rshrn           v16.8b,  v18.8h,  #6
        rshrn2          v16.16b, v19.8h,  #6
        rshrn           v17.8b,  v20.8h,  #6
        rshrn2          v17.16b, v21.8h,  #6
        // Interleave the two columns and scatter them as 2-pixel stores.
        zip1            v18.16b, v16.16b, v17.16b
        zip2            v19.16b, v16.16b, v17.16b
        st1             {v18.h}[0], [x0],  x1
        st1             {v18.h}[1], [x13], x1
        st1             {v18.h}[2], [x0],  x1
        st1             {v18.h}[3], [x13], x1
        st1             {v18.h}[4], [x0],  x1
        st1             {v18.h}[5], [x13], x1
        st1             {v18.h}[6], [x0],  x1
        st1             {v18.h}[7], [x13], x1
        st1             {v19.h}[0], [x0],  x1
        st1             {v19.h}[1], [x13], x1
        st1             {v19.h}[2], [x0],  x1
        st1             {v19.h}[3], [x13], x1
        st1             {v19.h}[4], [x0],  x1
        st1             {v19.h}[5], [x13], x1
        st1             {v19.h}[6], [x0],  x1
        st1             {v19.h}[7], [x13], x1
        b.le            3f
        mov             v0.16b,  v1.16b
        ld1             {v1.16b}, [x8],  #16      // left[base]
        mov             v2.16b,  v3.16b
        ld1             {v3.16b}, [x10], #16
        b               2b

3:      // Column pair done; step 2 pixels right and restart at the top.
        subs            w3,  w3,  #2
        b.le            9f
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        lsl             x1,  x1,  #1
        add             x0,  x0,  #2
        add             x13, x13, #2
        mov             w4,  w12
        b               1b
9:
        ret
endfunc
3529
// Indexed by clz(width) - 25: entries for w = 64, 32, 16, 8, 4.
jumptable ipred_z3_fill1_tbl
        .word 640b - ipred_z3_fill1_tbl
        .word 320b - ipred_z3_fill1_tbl
        .word 160b - ipred_z3_fill1_tbl
        .word 80b  - ipred_z3_fill1_tbl
        .word 40b  - ipred_z3_fill1_tbl
endjumptable
3537
// Fill the remaining w3 x w4 rectangle with the padding byte replicated
// in v31. Dispatches on the remaining width through
// ipred_z3_fill_padding_tbl (clz(w3) - 25 indexing, down to w == 2);
// widths > 16 are handled by ipred_z3_fill_padding_wide.
// NOTE(review): expects x0/x13 to point at two consecutive rows,
// x1 = 2*stride and w12 usable as scratch, as set up by the z3 fill
// callers — verify at each call/branch site.
function ipred_z3_fill_padding_neon, export=0
        cmp             w3,  #16
        movrel          x8,  ipred_z3_fill_padding_tbl
        b.gt            ipred_z3_fill_padding_wide
        // w3 = remaining width, w4 = constant height
        mov             w12, w4

1:
        // Fill a WxH rectangle with padding. W can be any number;
        // this fills the exact width by filling in the largest
        // power of two in the remaining width, and repeating.
        clz             w9,  w3
        sub             w9,  w9,  #25
        ldrsw           x9,  [x8, w9, uxtw #2]
        add             x9,  x8,  x9
        br              x9

// w == 2 columns: 4 rows per iteration across the two row pointers.
20:
        AARCH64_VALID_JUMP_TARGET
2:
        st1             {v31.h}[0], [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.h}[0], [x13], x1
        st1             {v31.h}[0], [x0],  x1
        st1             {v31.h}[0], [x13], x1
        b.gt            2b
        subs            w3,  w3,  #2
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #2
        add             x13, x13, #2
        mov             w4,  w12
        b               1b

// w == 4 columns.
40:
        AARCH64_VALID_JUMP_TARGET
4:
        st1             {v31.s}[0], [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.s}[0], [x13], x1
        st1             {v31.s}[0], [x0],  x1
        st1             {v31.s}[0], [x13], x1
        b.gt            4b
        subs            w3,  w3,  #4
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #4
        add             x13, x13, #4
        mov             w4,  w12
        b               1b

// w == 8 columns.
80:
        AARCH64_VALID_JUMP_TARGET
8:
        st1             {v31.8b}, [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.8b}, [x13], x1
        st1             {v31.8b}, [x0],  x1
        st1             {v31.8b}, [x13], x1
        b.gt            8b
        subs            w3,  w3,  #8
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #8
        add             x13, x13, #8
        mov             w4,  w12
        b               1b

// w == 16 columns (also the entry for larger table indices <= 16).
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
16:
        st1             {v31.16b}, [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.16b}, [x13], x1
        st1             {v31.16b}, [x0],  x1
        st1             {v31.16b}, [x13], x1
        b.gt            16b
        subs            w3,  w3,  #16
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #16
        add             x13, x13, #16
        mov             w4,  w12
        b               1b

9:
        ret
endfunc
3640
// Indexed by clz(width) - 25: entries for w = 64, 32, 16, 8, 4, 2.
jumptable ipred_z3_fill_padding_tbl
        .word 640b - ipred_z3_fill_padding_tbl
        .word 320b - ipred_z3_fill_padding_tbl
        .word 160b - ipred_z3_fill_padding_tbl
        .word 80b  - ipred_z3_fill_padding_tbl
        .word 40b  - ipred_z3_fill_padding_tbl
        .word 20b  - ipred_z3_fill_padding_tbl
endjumptable
3649
// Fills row by row with the padding byte in v31: an unaligned head
// store when the width isn't a multiple of 16, then aligned 16-byte
// stores for the rest. NOTE(review): x1 arrives doubled (2*stride)
// from ipred_z3_fill_padding_neon; it is halved here and turned into
// a per-row advance (stride - width) — confirm at the branch site.
function ipred_z3_fill_padding_wide
        // Fill a WxH rectangle with padding, with W > 16.
        lsr             x1,  x1,  #1              // back to single stride
        mov             w12, w3                   // save width for per-row reset
        sub             x1,  x1,  w3,  uxtw       // x1 = stride - width (row advance)
1:
        ands            w5,  w3,  #15
        b.eq            2f
        // If the width isn't aligned to 16, first do one 16 byte write
        // and align the start pointer.
        sub             w3,  w3,  w5
        st1             {v31.16b}, [x0]
        add             x0,  x0,  w5,  uxtw
2:
        // Fill the rest of the line with aligned 16 byte writes.
        subs            w3,  w3,  #16
        st1             {v31.16b}, [x0], #16
        b.gt            2b
        subs            w4,  w4,  #1
        add             x0,  x0,  x1
        b.le            9f
        mov             w3,  w12
        b               1b
9:
        ret
endfunc
3676
// Z3 (bottom-left) prediction, filling from the upsampled left edge, two
// source samples per output pixel step (w == 4 or w == 8 only).
// x0 = dst, x1 = stride, x2 = left[], w3 = width, w4 = height,
// w5 = dy, w6 = max_base_y.
// Out-of-range base indices must produce left[max_base_y] (the padding
// value broadcast into v31); this is achieved by pre-filling the tbx
// destinations with v31 and using a table exactly as large as the loaded
// left[] data, so indices >= 32 leave the padding untouched.
function ipred_z3_fill2_8bpc_neon, export=1
        cmp             w3,  #8
        add             x10, x2,  w6,  uxtw       // left[max_base_y]
        movrel          x11, increments
        ld1r            {v31.16b}, [x10]          // padding
        ld1             {v30.8h},  [x11]          // increments
        b.eq            80f

40:     // w == 4
        dup             v29.4h,  w5               // dy

        mul             v30.4h,  v30.4h,  v29.4h  // {0,1,2,3,4,5,6,7}*dy
        movi            v23.16b, #0x3e

        // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16,
        // so max_base_y <= 32.
        ld1             {v0.16b, v1.16b}, [x2] // left[]
        add             v30.4h,  v29.4h,  v30.4h  // ypos

        movi            v22.16b, #64
        movi            v20.16b, #1
        movi            v21.16b, #2

        xtn             v24.8b,  v30.8h           // (uint8_t)ypos
        uqshrn          v26.8b,  v30.8h,  #6      // base
        and             v24.8b,  v24.8b,  v23.8b  // frac

        uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1
        uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2
        sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac
        uqadd           v29.8b,  v27.8b,  v21.8b  // base + 3

        trn1            v24.2s,  v24.2s,  v24.2s  // frac
        trn1            v26.2s,  v26.2s,  v28.2s  // base + 0, base + 2
        trn1            v27.2s,  v27.2s,  v29.2s  // base + 1, base + 3
        trn1            v25.2s,  v25.2s,  v25.2s  // 64 - frac

        movi            v21.16b, #4
1:
        mov             v4.8b,   v31.8b
        mov             v5.8b,   v31.8b
        tbx             v4.8b, {v0.16b, v1.16b}, v26.8b // left[base], left[base+2]
        tbx             v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+3]

        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
        rshrn           v16.8b,  v16.8h,  #6
        st1             {v16.s}[0], [x0], x1
        subs            w4,  w4,  #2
        st1             {v16.s}[1], [x0], x1
        b.le            9f

        uqadd           v26.8b,  v26.8b,  v21.8b  // base += 4
        uqadd           v27.8b,  v27.8b,  v21.8b  // base += 4
        b               1b

9:
        ret

80:     // w == 8
        dup             v29.8h,  w5               // dy

        mul             v30.8h,  v30.8h,  v29.8h  // {0,1,2,3,4,5,6,7}*dy
        movi            v23.16b, #0x3e

        // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16,
        // so max_base_y <= 32.
        ld1             {v0.16b, v1.16b}, [x2] // left[]
        add             v30.8h,  v29.8h,  v30.8h  // ypos

        movi            v22.16b, #64
        movi            v20.16b, #1
        movi            v21.16b, #2

        xtn             v24.8b,  v30.8h           // (uint8_t)ypos
        uqshrn          v26.8b,  v30.8h,  #6      // base
        and             v24.8b,  v24.8b,  v23.8b  // frac

        uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1
        uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2
        sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac
        uqadd           v29.8b,  v27.8b,  v21.8b  // base + 3

        trn1            v24.2d,  v24.2d,  v24.2d  // frac
        trn1            v26.2d,  v26.2d,  v28.2d  // base + 0, base + 2
        trn1            v27.2d,  v27.2d,  v29.2d  // base + 1, base + 3
        trn1            v25.2d,  v25.2d,  v25.2d  // 64 - frac

        movi            v21.16b, #4
1:
        mov             v4.16b,  v31.16b
        mov             v5.16b,  v31.16b
        // Only v0/v1 (32 bytes of left[]) were loaded above, so the lookup
        // table must be exactly {v0, v1}: indices >= 32 then leave the
        // pre-filled padding in v4/v5 intact. A wider table would read
        // uninitialized registers for indices in [32, 63].
        tbx             v4.16b, {v0.16b, v1.16b}, v26.16b // left[base], left[base+2]
        tbx             v5.16b, {v0.16b, v1.16b}, v27.16b // left[base+1], left[base+3]

        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
        umull2          v17.8h,  v4.16b,  v25.16b
        umlal2          v17.8h,  v5.16b,  v24.16b
        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v17.8b,  v17.8h,  #6
        st1             {v16.8b}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v17.8b}, [x0], x1
        b.le            9f

        uqadd           v26.16b, v26.16b, v21.16b // base += 4
        uqadd           v27.16b, v27.16b, v21.16b // base += 4
        b               1b

9:
        ret
endfunc
3790
3791
3792// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride,
3793//                             const pixel *const topleft,
3794//                             const int width, const int height, const int filt_idx,
3795//                             const int max_width, const int max_height);
// Filter intra prediction (FILTER_PRED): each 4x2 output sub-block is a
// 7-tap weighted combination of the topleft pixel, 4 top pixels and 2 left
// pixels, with the freshly predicted pixels feeding the next sub-block.
// x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height,
// w5 = filt_idx. v16-v22 hold the 7 filter tap vectors, widened to 16 bit.
function ipred_filter_8bpc_neon, export=1
        and             w5,  w5,  #511            // filt_idx &= 511
        movrel          x6,  X(filter_intra_taps)
        lsl             w5,  w5,  #6              // 64 bytes of taps per filter
        add             x6,  x6,  w5, uxtw
        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
        clz             w9,  w3
        movrel          x5,  ipred_filter_tbl
        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
        sub             w9,  w9,  #26             // jumptable index from clz(width)
        ldrsw           x9,  [x5, w9, uxtw #2]
        sxtl            v16.8h,  v16.8b           // widen signed taps to 16 bit
        sxtl            v17.8h,  v17.8b
        add             x5,  x5,  x9
        sxtl            v18.8h,  v18.8b
        sxtl            v19.8h,  v19.8b
        add             x6,  x0,  x1              // x6 = second output row
        lsl             x1,  x1,  #1              // stride *= 2 (two rows/iteration)
        sxtl            v20.8h,  v20.8b
        sxtl            v21.8h,  v21.8b
        sxtl            v22.8h,  v22.8b
        br              x5
40:     // w == 4
        AARCH64_VALID_JUMP_TARGET
        ldur            s0,  [x2, #1]             // top (0-3)
        sub             x2,  x2,  #2
        mov             x7,  #-2                  // step left pointer up 2 rows/iter
        uxtl            v0.8h,   v0.8b            // top (0-3)
4:
        ld1             {v1.s}[0], [x2], x7       // left (0-1) + topleft (2)
        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        uxtl            v1.8h,   v1.8b            // left (0-1) + topleft (2)
        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        sqrshrun        v2.8b,   v2.8h,   #4
        subs            w4,  w4,  #2
        st1             {v2.s}[0], [x0], x1
        uxtl            v0.8h,   v2.8b            // predicted rows feed next iteration
        st1             {v2.s}[1], [x6], x1
        ext             v0.16b,  v0.16b,  v0.16b, #8 // move top from [4-7] to [0-3]
        b.gt            4b
        ret
80:     // w == 8
        AARCH64_VALID_JUMP_TARGET
        ldur            d0,  [x2, #1]             // top (0-7)
        sub             x2,  x2,  #2
        mov             x7,  #-2
        uxtl            v0.8h,   v0.8b            // top (0-7)
8:
        ld1             {v1.s}[0], [x2], x7       // left (0-1) + topleft (2)
        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        uxtl            v1.8h,   v1.8b            // left (0-1) + topleft (2)
        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        // Second 4x2 sub-block: left pixels come from the first sub-block.
        mul             v3.8h,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
        mla             v3.8h,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
        mla             v3.8h,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v2.8b,   v2.8h,   #4
        uxtl            v1.8h,   v2.8b            // first block, in 16 bit
        mla             v3.8h,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
        mla             v3.8h,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
        mla             v3.8h,   v21.8h,  v1.h[3] // p5(left[0]) * filter(5)
        mla             v3.8h,   v22.8h,  v1.h[7] // p6(left[1]) * filter(6)
        sqrshrun        v3.8b,   v3.8h,   #4
        subs            w4,  w4,  #2
        st2             {v2.s, v3.s}[0], [x0], x1
        zip2            v0.2s,   v2.2s,   v3.2s   // bottom row becomes next top
        st2             {v2.s, v3.s}[1], [x6], x1
        uxtl            v0.8h,   v0.8b
        b.gt            8b
        ret
160:    // w == 16 and w == 32 share one column-looping path
320:
        AARCH64_VALID_JUMP_TARGET
        add             x8,  x2,  #1              // x8 = top row pointer
        sub             x2,  x2,  #2
        mov             x7,  #-2
        sub             x1,  x1,  w3, uxtw        // stride minus width (row advance)
        mov             w9,  w3                   // remember width for next row pair
1:
        ld1             {v0.s}[0], [x2], x7       // left (0-1) + topleft (2)
        uxtl            v0.8h,   v0.8b            // left (0-1) + topleft (2)
2:
        ld1             {v2.16b}, [x8],   #16     // top(0-15)
        mul             v3.8h,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
        mla             v3.8h,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
        uxtl            v1.8h,   v2.8b            // top(0-7)
        uxtl2           v2.8h,   v2.16b           // top(8-15)
        mla             v3.8h,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
        mla             v3.8h,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
        mla             v3.8h,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
        mla             v3.8h,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
        mla             v3.8h,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)

        mul             v4.8h,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
        mla             v4.8h,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
        mla             v4.8h,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v3.8b,   v3.8h,   #4
        uxtl            v0.8h,   v3.8b            // first block, in 16 bit
        mla             v4.8h,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
        mla             v4.8h,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
        mla             v4.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5)
        mla             v4.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6)

        mul             v5.8h,   v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
        mla             v5.8h,   v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
        mla             v5.8h,   v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
        sqrshrun        v4.8b,   v4.8h,   #4
        uxtl            v0.8h,   v4.8b            // second block, in 16 bit
        mla             v5.8h,   v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
        mla             v5.8h,   v16.8h,  v1.h[7] // p0(topleft) * filter(0)
        mla             v5.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5)
        mla             v5.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6)

        mul             v6.8h,   v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
        mla             v6.8h,   v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
        mla             v6.8h,   v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v5.8b,   v5.8h,   #4
        uxtl            v0.8h,   v5.8b            // third block, in 16 bit
        mla             v6.8h,   v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
        mla             v6.8h,   v16.8h,  v2.h[3] // p0(topleft) * filter(0)
        mla             v6.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5)
        mla             v6.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6)

        subs            w3,  w3,  #16
        sqrshrun        v6.8b,   v6.8h,   #4

        st4             {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16
        st4             {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16
        b.le            8f
        // Carry topleft/left context into the next 16 columns of this row pair.
        ins             v0.h[2], v2.h[7]
        ins             v0.b[0], v6.b[7]
        ins             v0.b[2], v6.b[3]
        b               2b
8:      // Advance to the next pair of rows.
        subs            w4,  w4,  #2
        b.le            9f
        sub             x8,  x6,  w9, uxtw        // new top row = previous bottom row
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9
        b               1b
9:
        ret
endfunc
3950
// Width-indexed dispatch for ipred_filter_8bpc_neon; the index is
// clz(width) - 26, so entries are ordered from the widest case down.
jumptable ipred_filter_tbl
        .word 320b - ipred_filter_tbl
        .word 160b - ipred_filter_tbl
        .word 80b  - ipred_filter_tbl
        .word 40b  - ipred_filter_tbl
endjumptable
3957
3958// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
3959//                         const pixel *const pal, const uint8_t *idx,
3960//                         const int w, const int h);
// Palette prediction: expand packed palette indices (two per byte, low
// nibble first) into pixels via a tbl lookup of the 8-entry palette in v0.
// x0 = dst, x1 = stride, x2 = pal, x3 = idx, w4 = width, w5 = height.
// v31 = #7 masks the 3 significant bits of each low nibble.
function pal_pred_8bpc_neon, export=1
        ld1             {v0.8b}, [x2]             // palette (8 entries)
        clz             w9,  w4
        movrel          x6,  pal_pred_tbl
        sub             w9,  w9,  #25             // jumptable index from clz(width)
        movi            v31.16b, #7
        ldrsw           x9,  [x6, w9, uxtw #2]
        add             x6,  x6,  x9
        add             x2,  x0,  x1              // x2 = second output row
        lsl             x1,  x1,  #1              // stride *= 2 (two rows/store pair)
        br              x6
40:     // w == 4
        AARCH64_VALID_JUMP_TARGET
4:
        ld1             {v1.8b}, [x3], #8
        subs            w5,  w5,  #4
        ushr            v3.8b,   v1.8b,   #4      // high nibbles (odd indices)
        and             v2.8b,   v1.8b,   v31.8b  // low nibbles (even indices)
        zip1            v1.16b,  v2.16b,  v3.16b  // re-interleave to pixel order
        tbl             v1.16b, {v0.16b}, v1.16b  // index -> palette pixel
        st1             {v1.s}[0], [x0], x1
        st1             {v1.s}[1], [x2], x1
        st1             {v1.s}[2], [x0], x1
        st1             {v1.s}[3], [x2], x1
        b.gt            4b
        ret
80:     // w == 8
        AARCH64_VALID_JUMP_TARGET
8:
        ld1             {v1.16b}, [x3], #16
        subs            w5,  w5,  #4
        ushr            v4.16b,  v1.16b,  #4
        and             v3.16b,  v1.16b,  v31.16b
        zip1            v1.16b,  v3.16b,  v4.16b
        zip2            v2.16b,  v3.16b,  v4.16b
        tbl             v1.16b, {v0.16b}, v1.16b
        st1             {v1.d}[0], [x0], x1
        tbl             v2.16b, {v0.16b}, v2.16b
        st1             {v1.d}[1], [x2], x1
        st1             {v2.d}[0], [x0], x1
        st1             {v2.d}[1], [x2], x1
        b.gt            8b
        ret
160:    // w == 16
        AARCH64_VALID_JUMP_TARGET
16:
        ld1             {v1.16b, v2.16b}, [x3], #32
        subs            w5,  w5,  #4
        ushr            v5.16b,  v1.16b,  #4
        and             v4.16b,  v1.16b,  v31.16b
        ushr            v7.16b,  v2.16b,  #4
        and             v6.16b,  v2.16b,  v31.16b
        zip1            v1.16b,  v4.16b,  v5.16b
        zip2            v2.16b,  v4.16b,  v5.16b
        zip1            v3.16b,  v6.16b,  v7.16b
        tbl             v1.16b, {v0.16b}, v1.16b
        zip2            v4.16b,  v6.16b,  v7.16b
        tbl             v2.16b, {v0.16b}, v2.16b
        st1             {v1.16b}, [x0], x1
        tbl             v3.16b, {v0.16b}, v3.16b
        st1             {v2.16b}, [x2], x1
        tbl             v4.16b, {v0.16b}, v4.16b
        st1             {v3.16b}, [x0], x1
        st1             {v4.16b}, [x2], x1
        b.gt            16b
        ret
320:    // w == 32
        AARCH64_VALID_JUMP_TARGET
32:
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
        subs            w5,  w5,  #4
        ushr            v21.16b, v16.16b, #4
        and             v20.16b, v16.16b, v31.16b
        ushr            v23.16b, v17.16b, #4
        and             v22.16b, v17.16b, v31.16b
        ushr            v25.16b, v18.16b, #4
        and             v24.16b, v18.16b, v31.16b
        ushr            v27.16b, v19.16b, #4
        and             v26.16b, v19.16b, v31.16b
        zip1            v16.16b, v20.16b, v21.16b
        zip2            v17.16b, v20.16b, v21.16b
        zip1            v18.16b, v22.16b, v23.16b
        zip2            v19.16b, v22.16b, v23.16b
        zip1            v20.16b, v24.16b, v25.16b
        zip2            v21.16b, v24.16b, v25.16b
        tbl             v16.16b, {v0.16b}, v16.16b
        zip1            v22.16b, v26.16b, v27.16b
        tbl             v17.16b, {v0.16b}, v17.16b
        zip2            v23.16b, v26.16b, v27.16b
        tbl             v18.16b, {v0.16b}, v18.16b
        tbl             v19.16b, {v0.16b}, v19.16b
        tbl             v20.16b, {v0.16b}, v20.16b
        st1             {v16.16b, v17.16b}, [x0], x1
        tbl             v21.16b, {v0.16b}, v21.16b
        st1             {v18.16b, v19.16b}, [x2], x1
        tbl             v22.16b, {v0.16b}, v22.16b
        st1             {v20.16b, v21.16b}, [x0], x1
        tbl             v23.16b, {v0.16b}, v23.16b
        st1             {v22.16b, v23.16b}, [x2], x1
        b.gt            32b
        ret
640:    // w == 64 (two rows per iteration)
        AARCH64_VALID_JUMP_TARGET
64:
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
        subs            w5,  w5,  #2
        ushr            v21.16b, v16.16b, #4
        and             v20.16b, v16.16b, v31.16b
        ushr            v23.16b, v17.16b, #4
        and             v22.16b, v17.16b, v31.16b
        ushr            v25.16b, v18.16b, #4
        and             v24.16b, v18.16b, v31.16b
        ushr            v27.16b, v19.16b, #4
        and             v26.16b, v19.16b, v31.16b
        zip1            v16.16b, v20.16b, v21.16b
        zip2            v17.16b, v20.16b, v21.16b
        zip1            v18.16b, v22.16b, v23.16b
        zip2            v19.16b, v22.16b, v23.16b
        zip1            v20.16b, v24.16b, v25.16b
        zip2            v21.16b, v24.16b, v25.16b
        tbl             v16.16b, {v0.16b}, v16.16b
        zip1            v22.16b, v26.16b, v27.16b
        tbl             v17.16b, {v0.16b}, v17.16b
        zip2            v23.16b, v26.16b, v27.16b
        tbl             v18.16b, {v0.16b}, v18.16b
        tbl             v19.16b, {v0.16b}, v19.16b
        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
        tbl             v20.16b, {v0.16b}, v20.16b
        tbl             v21.16b, {v0.16b}, v21.16b
        tbl             v22.16b, {v0.16b}, v22.16b
        tbl             v23.16b, {v0.16b}, v23.16b
        st1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1
        b.gt            64b
        ret
endfunc
4096
// Width-indexed dispatch for pal_pred_8bpc_neon; index is clz(width) - 25,
// so entries run from w == 64 down to w == 4.
jumptable pal_pred_tbl
        .word 640b - pal_pred_tbl
        .word 320b - pal_pred_tbl
        .word 160b - pal_pred_tbl
        .word 80b  - pal_pred_tbl
        .word 40b  - pal_pred_tbl
endjumptable
4104
4105// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
4106//                              const pixel *const topleft,
4107//                              const int width, const int height,
4108//                              const int16_t *ac, const int alpha);
// CfL prediction with a fixed DC of 128 (the 8bpc midpoint).
// x0 = dst, x1 = stride, w3 = width, w4 = height, x5 = ac buffer,
// w6 = alpha. The L(ipred_cfl_splat_w*) entry points below are shared by
// the other cfl variants; they expect v0 = dc (8h) and v1 = alpha (8h),
// and compute iclip_pixel(dc + ((ac * alpha + round) >> 6)) per pixel.
function ipred_cfl_128_8bpc_neon, export=1
        clz             w9,  w3
        movrel          x7,  ipred_cfl_128_tbl
        sub             w9,  w9,  #26             // jumptable index from clz(width)
        ldrsw           x9,  [x7, w9, uxtw #2]
        movi            v0.8h,   #128 // dc
        dup             v1.8h,   w6   // alpha
        add             x7,  x7,  x9
        add             x6,  x0,  x1              // x6 = second output row
        lsl             x1,  x1,  #1              // stride *= 2
        br              x7
L(ipred_cfl_splat_w4):
        AARCH64_VALID_JUMP_TARGET
1:
        ld1             {v2.8h, v3.8h}, [x5], #32
        mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
        mul             v3.8h,   v3.8h,   v1.8h
        cmlt            v4.8h,   v2.8h,   #0     // sign
        cmlt            v5.8h,   v3.8h,   #0
        add             v2.8h,   v2.8h,   v4.8h  // diff + sign
        add             v3.8h,   v3.8h,   v5.8h
        srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h,   v3.8h,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b,   v3.8h
        st1             {v2.s}[0],  [x0], x1
        st1             {v2.s}[1],  [x6], x1
        subs            w4,  w4,  #4
        st1             {v3.s}[0],  [x0], x1
        st1             {v3.s}[1],  [x6], x1
        b.gt            1b
        ret
L(ipred_cfl_splat_w8):
        AARCH64_VALID_JUMP_TARGET
1:
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64
        mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
        mul             v3.8h,   v3.8h,   v1.8h
        mul             v4.8h,   v4.8h,   v1.8h
        mul             v5.8h,   v5.8h,   v1.8h
        cmlt            v16.8h,  v2.8h,   #0     // sign
        cmlt            v17.8h,  v3.8h,   #0
        cmlt            v18.8h,  v4.8h,   #0
        cmlt            v19.8h,  v5.8h,   #0
        add             v2.8h,   v2.8h,   v16.8h // diff + sign
        add             v3.8h,   v3.8h,   v17.8h
        add             v4.8h,   v4.8h,   v18.8h
        add             v5.8h,   v5.8h,   v19.8h
        srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h,   v3.8h,   #6
        srshr           v4.8h,   v4.8h,   #6
        srshr           v5.8h,   v5.8h,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        add             v4.8h,   v4.8h,   v0.8h
        add             v5.8h,   v5.8h,   v0.8h
        sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b,   v3.8h
        sqxtun          v4.8b,   v4.8h
        sqxtun          v5.8b,   v5.8h
        st1             {v2.8b},  [x0], x1
        st1             {v3.8b},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v4.8b},  [x0], x1
        st1             {v5.8b},  [x6], x1
        b.gt            1b
        ret
L(ipred_cfl_splat_w16):
        // Handles w == 16 and w == 32 by looping over 16-column chunks;
        // x5/x7 walk the ac rows for the two output rows in parallel.
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x5,  w3, uxtw #1     // ac pointer for second row
        sub             x1,  x1,  w3, uxtw        // stride minus width
        mov             w9,  w3                   // remember width
1:
        ld1             {v2.8h, v3.8h}, [x5], #32
        ld1             {v4.8h, v5.8h}, [x7], #32
        mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
        mul             v3.8h,   v3.8h,   v1.8h
        mul             v4.8h,   v4.8h,   v1.8h
        mul             v5.8h,   v5.8h,   v1.8h
        cmlt            v16.8h,  v2.8h,   #0     // sign
        cmlt            v17.8h,  v3.8h,   #0
        cmlt            v18.8h,  v4.8h,   #0
        cmlt            v19.8h,  v5.8h,   #0
        add             v2.8h,   v2.8h,   v16.8h // diff + sign
        add             v3.8h,   v3.8h,   v17.8h
        add             v4.8h,   v4.8h,   v18.8h
        add             v5.8h,   v5.8h,   v19.8h
        srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h,   v3.8h,   #6
        srshr           v4.8h,   v4.8h,   #6
        srshr           v5.8h,   v5.8h,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        add             v4.8h,   v4.8h,   v0.8h
        add             v5.8h,   v5.8h,   v0.8h
        sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b,   v3.8h
        sqxtun          v4.8b,   v4.8h
        sqxtun          v5.8b,   v5.8h
        subs            w3,  w3,  #16
        st1             {v2.8b, v3.8b},  [x0], #16
        st1             {v4.8b, v5.8b},  [x6], #16
        b.gt            1b
        // End of a row pair: skip the second row's ac data and advance dst.
        subs            w4,  w4,  #2
        add             x5,  x5,  w9, uxtw #1
        add             x7,  x7,  w9, uxtw #1
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9
        b.gt            1b
        ret
endfunc
4223
// Width-indexed dispatch (clz(width) - 26) into the shared splat loops;
// also reused as ipred_cfl_splat_tbl by the cfl_left variant.
jumptable ipred_cfl_128_tbl
ipred_cfl_splat_tbl:
        .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl
        .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl
        .word L(ipred_cfl_splat_w8)  - ipred_cfl_128_tbl
        .word L(ipred_cfl_splat_w4)  - ipred_cfl_128_tbl
endjumptable
4231
4232// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
4233//                              const pixel *const topleft,
4234//                              const int width, const int height,
4235//                              const int16_t *ac, const int alpha);
// CfL prediction with DC computed from the top edge only: dc is the
// rounded average of the `width` pixels above the block, then the shared
// splat loops apply ac * alpha.
function ipred_cfl_top_8bpc_neon, export=1
        clz             w9,  w3
        movrel          x7,  ipred_cfl_top_tbl
        sub             w9,  w9,  #26             // jumptable index from clz(width)
        ldrsw           x9,  [x7, w9, uxtw #2]
        dup             v1.8h,   w6   // alpha
        add             x2,  x2,  #1              // skip topleft -> first top pixel
        add             x7,  x7,  x9
        add             x6,  x0,  x1              // x6 = second output row
        lsl             x1,  x1,  #1              // stride *= 2
        br              x7
4:      // w == 4
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v0.2s},  [x2]            // load 4 pixels twice, sum = 2*sum(top)
        uaddlv          h0,      v0.8b
        urshr           v0.4h,   v0.4h,   #3      // (2*sum + 4) >> 3 = rounded avg
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w4)
8:      // w == 8
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [x2]
        uaddlv          h0,      v0.8b
        urshr           v0.4h,   v0.4h,   #3
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w8)
16:     // w == 16
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2]
        uaddlv          h0,      v0.16b
        urshr           v0.4h,   v0.4h,   #4
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)
32:     // w == 32
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b, v3.16b}, [x2]
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        add             v2.4h,   v2.4h,   v3.4h
        urshr           v2.4h,   v2.4h,   #5
        dup             v0.8h,   v2.h[0]
        b               L(ipred_cfl_splat_w16)
endfunc
4278
// Width-indexed dispatch (clz(width) - 26) for ipred_cfl_top_8bpc_neon.
jumptable ipred_cfl_top_tbl
        .word 32b - ipred_cfl_top_tbl
        .word 16b - ipred_cfl_top_tbl
        .word 8b  - ipred_cfl_top_tbl
        .word 4b  - ipred_cfl_top_tbl
endjumptable
4285
4286// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
4287//                               const pixel *const topleft,
4288//                               const int width, const int height,
4289//                               const int16_t *ac, const int alpha);
// CfL prediction with DC computed from the left edge only: dc is the
// rounded average of the `height` pixels left of the block. The dc path
// is selected by height, then jumps through x9 into the width-selected
// splat loop (shared with ipred_cfl_128).
function ipred_cfl_left_8bpc_neon, export=1
        sub             x2,  x2,  w4, uxtw        // x2 = topleft - height = left edge
        clz             w9,  w3
        clz             w8,  w4
        movrel          x10, ipred_cfl_splat_tbl
        movrel          x7,  ipred_cfl_left_tbl
        sub             w9,  w9,  #26             // width index
        sub             w8,  w8,  #26             // height index
        ldrsw           x9,  [x10, w9, uxtw #2]
        ldrsw           x8,  [x7,  w8, uxtw #2]
        dup             v1.8h,   w6   // alpha
        add             x9,  x10, x9              // x9 = splat loop for this width
        add             x7,  x7,  x8
        add             x6,  x0,  x1              // x6 = second output row
        lsl             x1,  x1,  #1              // stride *= 2
        br              x7

L(ipred_cfl_left_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v0.2s},  [x2]            // load 4 pixels twice, sum = 2*sum(left)
        uaddlv          h0,      v0.8b
        urshr           v0.4h,   v0.4h,   #3      // (2*sum + 4) >> 3 = rounded avg
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [x2]
        uaddlv          h0,      v0.8b
        urshr           v0.4h,   v0.4h,   #3
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2]
        uaddlv          h0,      v0.16b
        urshr           v0.4h,   v0.4h,   #4
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b, v3.16b}, [x2]
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        add             v2.4h,   v2.4h,   v3.4h
        urshr           v2.4h,   v2.4h,   #5
        dup             v0.8h,   v2.h[0]
        br              x9
endfunc
4341
// Height-indexed dispatch (clz(height) - 26) for ipred_cfl_left_8bpc_neon.
jumptable ipred_cfl_left_tbl
        .word L(ipred_cfl_left_h32) - ipred_cfl_left_tbl
        .word L(ipred_cfl_left_h16) - ipred_cfl_left_tbl
        .word L(ipred_cfl_left_h8)  - ipred_cfl_left_tbl
        .word L(ipred_cfl_left_h4)  - ipred_cfl_left_tbl
endjumptable
4348
4349// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
4350//                          const pixel *const topleft,
4351//                          const int width, const int height,
4352//                          const int16_t *ac, const int alpha);
// Chroma-from-luma (CFL) DC prediction, 8 bpc.
// Computes dc = (sum(left[0..height-1]) + sum(top[0..width-1]) +
// (width+height)/2) / (width+height), splats it into v0.8h and jumps to
// the shared L(ipred_cfl_splat_wN) code (defined elsewhere in this file),
// which applies the alpha-scaled AC coefficients.
// In:  x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height,
//      w6 = alpha (duplicated into v1.8h for the splat code); x5 is
//      presumably the ac buffer, consumed by the splat code -- not
//      visible in this section.
// Dispatches twice through ipred_cfl_tbl: first (br x7) to an "h" entry
// that sums the left edge, which then jumps (br x9) to a "w" entry that
// sums the top edge and finishes the division. Since width and height
// are powers of two, width+height is 2^k, 3*2^k or 5*2^k: the division
// is a right shift by ctz(width+height) plus, when needed, a fixed-point
// multiply by 1/3 (0x5556 ~= 2^16/3) or 1/5 (0x3334 ~= 2^16/5).
// sqdmulh computes (2*a*b)>>16, so the constants are stored halved.
function ipred_cfl_8bpc_neon, export=1
        sub             x2,  x2,  w4, uxtw       // x2 = topleft - height: first left-edge pixel
        add             w8,  w3,  w4             // width + height
        dup             v1.8h,   w6              // alpha
        clz             w9,  w3
        clz             w6,  w4
        dup             v16.8h, w8               // width + height
        movrel          x7,  ipred_cfl_tbl
        rbit            w8,  w8                  // rbit(width + height)
        sub             w9,  w9,  #22            // 26 leading bits, minus table offset 4
        sub             w6,  w6,  #26
        clz             w8,  w8                  // ctz(width + height)
        ldrsw           x9,  [x7, w9, uxtw #2]   // table offset of L(ipred_cfl_wN)
        ldrsw           x6,  [x7, w6, uxtw #2]   // table offset of L(ipred_cfl_hN)
        neg             w8,  w8                  // -ctz(width + height)
        add             x9,  x7,  x9
        add             x7,  x7,  x6
        ushr            v16.8h,  v16.8h,  #1     // (width + height) >> 1, rounding bias
        dup             v17.8h,  w8              // -ctz(width + height)
        add             x6,  x0,  x1             // x6 = dst + stride (second output row)
        lsl             x1,  x1,  #1             // doubled stride for the splat code
        br              x7

L(ipred_cfl_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.s}[0],  [x2], #4     // 4 left-edge pixels
        ins             v0.s[1], wzr             // clear lanes 4-7 before the 8-byte reduction
        add             x2,  x2,  #1             // skip the top-left pixel; x2 = top edge
        uaddlv          h0,      v0.8b           // sum of the left edge
        br              x9
L(ipred_cfl_w4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.s}[0],  [x2]         // 4 top-edge pixels
        ins             v2.s[1], wzr             // clear lanes 4-7 before the 8-byte reduction
        add             v0.4h,   v0.4h,   v16.4h // left sum + rounding bias
        uaddlv          h2,      v2.8b           // sum of the top edge
        cmp             w4,  #4
        add             v0.4h,   v0.4h,   v2.4h  // total sum + bias
        ushl            v0.4h,   v0.4h,   v17.4h // >> ctz(width + height)
        b.eq            1f
        // h = 8/16: width+height is 12 (3*4) or 20 (5*4), so a further
        // multiply by 1/3 or 1/5 is needed. Both halved multipliers are
        // packed into w16 and selected by shifting right by 2*h:
        // h=8 shifts by 16 (high half, 0x5556/2 = 1/3), h=16 shifts by
        // 32 = 0 mod 32, leaving 0x3334/2 = 1/5 in the low 16 bits.
        mov             w16, #(0x3334/2)
        movk            w16, #(0x5556/2), lsl #16
        add             w17, w4,  w4  // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h // dc = partial >> * (1/3 or 1/5)
1:
        dup             v0.8h,   v0.h[0]         // splat dc
        b               L(ipred_cfl_splat_w4)

L(ipred_cfl_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [x2], #8       // 8 left-edge pixels
        uaddlv          h0,      v0.8b           // sum of the left edge
        add             x2,  x2,  #1             // skip the top-left pixel; x2 = top edge
        br              x9
L(ipred_cfl_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8b},  [x2]           // 8 top-edge pixels
        add             v0.4h,   v0.4h,   v16.4h // left sum + rounding bias
        uaddlv          h2,      v2.8b           // sum of the top edge
        cmp             w4,  #8
        add             v0.4h,   v0.4h,   v2.4h  // total sum + bias
        ushl            v0.4h,   v0.4h,   v17.4h // >> ctz(width + height)
        b.eq            1f
        // h = 4/16/32: width+height is 12/24 (3*2^k) or 40 (5*8);
        // select 1/5 (0x3334) for h=32, 1/3 (0x5556) otherwise.
        cmp             w4,  #32
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8h,   v0.h[0]         // splat dc
        b               L(ipred_cfl_splat_w8)

L(ipred_cfl_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2], #16      // 16 left-edge pixels
        uaddlv          h0,      v0.16b          // sum of the left edge
        add             x2,  x2,  #1             // skip the top-left pixel; x2 = top edge
        br              x9
L(ipred_cfl_w16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b}, [x2]           // 16 top-edge pixels
        add             v0.4h,   v0.4h,   v16.4h // left sum + rounding bias
        uaddlv          h2,      v2.16b          // sum of the top edge
        cmp             w4,  #16
        add             v0.4h,   v0.4h,   v2.4h  // total sum + bias
        ushl            v0.4h,   v0.4h,   v17.4h // >> ctz(width + height)
        b.eq            1f
        // h = 4/8/32: width+height is 20 (5*4) for h=4, else 24/48
        // (3*2^k); select 1/5 (0x3334) for h=4, 1/3 (0x5556) otherwise.
        cmp             w4,  #4
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8h,   v0.h[0]         // splat dc
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b, v3.16b}, [x2], #32 // 32 left-edge pixels
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        add             x2,  x2,  #1             // skip the top-left pixel; x2 = top edge
        add             v0.4h,   v2.4h,   v3.4h  // sum of the left edge
        br              x9
L(ipred_cfl_w32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b, v3.16b}, [x2]   // 32 top-edge pixels
        add             v0.4h,   v0.4h,   v16.4h // left sum + rounding bias
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        cmp             w4,  #32
        add             v0.4h,   v0.4h,   v2.4h
        add             v0.4h,   v0.4h,   v3.4h  // total sum + bias
        ushl            v0.4h,   v0.4h,   v17.4h // >> ctz(width + height)
        b.eq            1f
        // h = 8/16: width+height is 40 (5*8) or 48 (3*16). Both halved
        // multipliers are packed into w16 and selected by shifting right
        // by 2*h: h=8 shifts by 16 (0x3334/2 = 1/5), h=16 shifts by
        // 32 = 0 mod 32, leaving 0x5556/2 = 1/3 in the low 16 bits.
        mov             w16, #(0x5556/2)
        movk            w16, #(0x3334/2), lsl #16
        add             w17, w4,  w4  // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8h,   v0.h[0]         // splat dc
        b               L(ipred_cfl_splat_w16)
endfunc
4486
// Dispatch table for ipred_cfl_8bpc_neon.
// Entries 0-3: left-edge ("h") handlers, indexed by clz(height)-26
// (height = 32, 16, 8, 4). Entries 4-7: top-edge ("w") handlers,
// indexed by clz(width)-22 (width = 32, 16, 8, 4).
jumptable ipred_cfl_tbl
        .word L(ipred_cfl_h32) - ipred_cfl_tbl
        .word L(ipred_cfl_h16) - ipred_cfl_tbl
        .word L(ipred_cfl_h8)  - ipred_cfl_tbl
        .word L(ipred_cfl_h4)  - ipred_cfl_tbl
        .word L(ipred_cfl_w32) - ipred_cfl_tbl
        .word L(ipred_cfl_w16) - ipred_cfl_tbl
        .word L(ipred_cfl_w8)  - ipred_cfl_tbl
        .word L(ipred_cfl_w4)  - ipred_cfl_tbl
endjumptable
4497
4498// void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx,
4499//                           const ptrdiff_t stride, const int w_pad,
4500//                           const int h_pad, const int cw, const int ch);
// Generates CFL AC coefficients from 4:2:0 luma: each int16_t output is
// the sum of a 2x2 luma block scaled by << 1 (the 2x2 average with 3
// fractional bits). Right/bottom padding replicates the last real
// column/row, and the rounded mean (DC) of the whole block is subtracted
// from every coefficient at the end.
// In: x0 = ac (int16_t *), x1 = ypx, x2 = stride, w3 = w_pad, w4 = h_pad,
//     w5 = cw (output width), w6 = ch (output height). h_pad is scaled
//     by 4 below, so the pad arguments appear to be in units of 4 output
//     rows/columns -- verify against the C caller.
// v16-v19 accumulate coefficient sums for the DC; v31 = -log2(cw*ch),
// used as the rounded averaging shift (urshl by a negative amount).
function ipred_cfl_ac_420_8bpc_neon, export=1
        clz             w8,  w5
        lsl             w4,  w4,  #2         // h_pad in output rows
        movrel          x7,  ipred_cfl_ac_420_tbl
        sub             w8,  w8,  #27        // index: cw=16 -> 0, 8 -> 1, 4 -> 2
        ldrsw           x8,  [x7, w8, uxtw #2]
        movi            v16.8h,  #0          // clear the DC sum accumulators
        movi            v17.8h,  #0
        movi            v18.8h,  #0
        movi            v19.8h,  #0
        add             x7,  x7,  x8
        sub             w8,  w6,  w4         // height - h_pad
        rbit            w9,  w5              // rbit(width)
        rbit            w10, w6              // rbit(height)
        clz             w9,  w9              // ctz(width)
        clz             w10, w10             // ctz(height)
        add             w9,  w9,  w10        // log2sz
        add             x10, x1,  x2         // x10 = second luma row
        dup             v31.4s,  w9
        lsl             x2,  x2,  #1         // luma stride *2: x1/x10 step two rows
        neg             v31.4s,  v31.4s      // -log2sz
        br              x7

L(ipred_cfl_ac_420_w4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input
        ld1             {v0.8b},   [x1],  x2     // luma rows 0 and 2
        ld1             {v1.8b},   [x10], x2     // luma rows 1 and 3
        ld1             {v0.d}[1], [x1],  x2
        ld1             {v1.d}[1], [x10], x2
        uaddlp          v0.8h,   v0.16b          // horizontal pair sums
        uaddlp          v1.8h,   v1.16b
        add             v0.8h,   v0.8h,   v1.8h  // + vertical neighbour: 2x2 sums
        shl             v0.8h,   v0.8h,   #1     // scale to 3 fractional bits
        subs            w8,  w8,  #2             // two output rows per iteration
        st1             {v0.8h}, [x0], #16
        add             v16.8h,  v16.8h,  v0.8h  // accumulate for the DC
        b.gt            1b
        trn2            v1.2d,   v0.2d,   v0.2d  // duplicate the last output row ...
        trn2            v0.2d,   v0.2d,   v0.2d  // ... into v0/v1 for vertical padding
L(ipred_cfl_ac_420_w4_hpad):
        cbz             w4,  3f                  // no bottom padding?
2:      // Vertical padding (h_pad > 0)
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], #32
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        b.gt            2b
3:
        // Aggregate the sums
        add             v0.8h,   v16.8h,  v17.8h
        uaddlv          s0,  v0.8h                // sum
        sub             x0,  x0,  w6, uxtw #3     // rewind ac: height * 8 bytes (4 coeffs/row)
        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
        dup             v4.8h,   v4.h[0]          // splat the DC
6:      // Subtract dc from ac
        ld1             {v0.8h, v1.8h}, [x0]
        subs            w6,  w6,  #4              // four rows per iteration
        sub             v0.8h,   v0.8h,   v4.8h
        sub             v1.8h,   v1.8h,   v4.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            6b
        ret

L(ipred_cfl_ac_420_w8):
        AARCH64_VALID_JUMP_TARGET
        cbnz            w3,  L(ipred_cfl_ac_420_w8_wpad)
1:      // Copy and subsample input, without padding
        ld1             {v0.16b}, [x1],  x2      // luma row 0
        ld1             {v1.16b}, [x10], x2      // luma row 1
        ld1             {v2.16b}, [x1],  x2      // luma row 2
        uaddlp          v0.8h,   v0.16b          // horizontal pair sums
        ld1             {v3.16b}, [x10], x2      // luma row 3
        uaddlp          v1.8h,   v1.16b
        uaddlp          v2.8h,   v2.16b
        uaddlp          v3.8h,   v3.16b
        add             v0.8h,   v0.8h,   v1.8h  // 2x2 sums, output row 0
        add             v2.8h,   v2.8h,   v3.8h  // 2x2 sums, output row 1
        shl             v0.8h,   v0.8h,   #1     // scale to 3 fractional bits
        shl             v1.8h,   v2.8h,   #1
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h}, [x0], #32
        add             v16.8h,  v16.8h,  v0.8h  // accumulate for the DC
        add             v17.8h,  v17.8h,  v1.8h
        b.gt            1b
        mov             v0.16b,  v1.16b          // duplicate the last row for vertical padding
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_420_w8_wpad):
1:      // Copy and subsample input, padding 4
        ld1             {v0.8b},   [x1],  x2     // left half of luma rows 0 and 2
        ld1             {v1.8b},   [x10], x2     // left half of luma rows 1 and 3
        ld1             {v0.d}[1], [x1],  x2
        ld1             {v1.d}[1], [x10], x2
        uaddlp          v0.8h,   v0.16b          // horizontal pair sums
        uaddlp          v1.8h,   v1.16b
        add             v0.8h,   v0.8h,   v1.8h  // 2x2 sums: rows 0 and 1, 4 coeffs each
        shl             v0.8h,   v0.8h,   #1
        dup             v1.4h,   v0.h[3]         // replicate last coeff of row 0 (right pad)
        dup             v3.4h,   v0.h[7]         // replicate last coeff of row 1 (right pad)
        trn2            v2.2d,   v0.2d,   v0.2d  // v2 = row 1 coefficients
        subs            w8,  w8,  #2
        st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
        add             v16.4h,  v16.4h,  v0.4h  // accumulate for the DC
        add             v17.4h,  v17.4h,  v1.4h
        add             v18.4h,  v18.4h,  v2.4h
        add             v19.4h,  v19.4h,  v3.4h
        b.gt            1b
        trn1            v0.2d,   v2.2d,   v3.2d  // last padded row, duplicated ...
        trn1            v1.2d,   v2.2d,   v3.2d  // ... into v0/v1 for vertical padding

L(ipred_cfl_ac_420_w8_hpad):
        cbz             w4,  3f                  // no bottom padding?
2:      // Vertical padding (h_pad > 0)
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], #32
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        add             v18.8h,  v18.8h,  v0.8h
        add             v19.8h,  v19.8h,  v1.8h
        b.gt            2b
3:

L(ipred_cfl_ac_420_w8_calc_subtract_dc):
        // Aggregate the sums
        add             v0.8h,   v16.8h,  v17.8h
        add             v2.8h,   v18.8h,  v19.8h
        uaddlp          v0.4s,   v0.8h            // widen to 32 bit to avoid overflow
        uaddlp          v2.4s,   v2.8h
        add             v0.4s,   v0.4s,   v2.4s
        addv            s0,  v0.4s                // sum
        sub             x0,  x0,  w6, uxtw #4     // rewind ac: height * 16 bytes (8 coeffs/row)
        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
        dup             v4.8h,   v4.h[0]          // splat the DC
L(ipred_cfl_ac_420_w8_subtract_dc):
6:      // Subtract dc from ac
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        subs            w6,  w6,  #4              // four 8-wide rows per iteration
        sub             v0.8h,   v0.8h,   v4.8h
        sub             v1.8h,   v1.8h,   v4.8h
        sub             v2.8h,   v2.8h,   v4.8h
        sub             v3.8h,   v3.8h,   v4.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            6b
        ret

L(ipred_cfl_ac_420_w16):
        AARCH64_VALID_JUMP_TARGET
        movrel          x7,  ipred_cfl_ac_420_w16_tbl
        ldrsw           x3,  [x7, w3, uxtw #2]   // second dispatch, on w_pad (0-3)
        add             x7,  x7,  x3
        br              x7

L(ipred_cfl_ac_420_w16_wpad0):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, without padding
        ld1             {v0.16b, v1.16b}, [x1],  x2  // luma row 0
        ld1             {v2.16b, v3.16b}, [x10], x2  // luma row 1
        uaddlp          v0.8h,   v0.16b              // horizontal pair sums
        ld1             {v4.16b, v5.16b}, [x1],  x2  // luma row 2
        uaddlp          v1.8h,   v1.16b
        ld1             {v6.16b, v7.16b}, [x10], x2  // luma row 3
        uaddlp          v2.8h,   v2.16b
        uaddlp          v3.8h,   v3.16b
        uaddlp          v4.8h,   v4.16b
        uaddlp          v5.8h,   v5.16b
        uaddlp          v6.8h,   v6.16b
        uaddlp          v7.8h,   v7.16b
        add             v0.8h,   v0.8h,   v2.8h      // 2x2 sums, output row 0
        add             v1.8h,   v1.8h,   v3.8h
        add             v4.8h,   v4.8h,   v6.8h      // 2x2 sums, output row 1
        add             v5.8h,   v5.8h,   v7.8h
        shl             v0.8h,   v0.8h,   #1         // scale to 3 fractional bits
        shl             v1.8h,   v1.8h,   #1
        shl             v2.8h,   v4.8h,   #1
        shl             v3.8h,   v5.8h,   #1
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h      // accumulate for the DC
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b              // duplicate the last row ...
        mov             v1.16b,  v3.16b              // ... for vertical padding
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad1):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 4
        ldr             d1,  [x1,  #16]          // luma row 0, pixels 16-23
        ld1             {v0.16b}, [x1],  x2      // luma row 0, pixels 0-15
        ldr             d3,  [x10, #16]          // luma row 1, pixels 16-23
        ld1             {v2.16b}, [x10], x2
        uaddlp          v1.4h,   v1.8b           // horizontal pair sums
        ldr             d5,  [x1,  #16]          // luma row 2, pixels 16-23
        uaddlp          v0.8h,   v0.16b
        ld1             {v4.16b}, [x1],  x2
        uaddlp          v3.4h,   v3.8b
        ldr             d7,  [x10, #16]          // luma row 3, pixels 16-23
        uaddlp          v2.8h,   v2.16b
        ld1             {v6.16b}, [x10], x2
        uaddlp          v5.4h,   v5.8b
        uaddlp          v4.8h,   v4.16b
        uaddlp          v7.4h,   v7.8b
        uaddlp          v6.8h,   v6.16b
        add             v1.4h,   v1.4h,   v3.4h  // 2x2 sums
        add             v0.8h,   v0.8h,   v2.8h
        add             v5.4h,   v5.4h,   v7.4h
        add             v4.8h,   v4.8h,   v6.8h
        shl             v1.4h,   v1.4h,   #1     // scale to 3 fractional bits
        shl             v0.8h,   v0.8h,   #1
        shl             v3.4h,   v5.4h,   #1
        shl             v2.8h,   v4.8h,   #1
        dup             v4.4h,   v1.h[3]         // replicate coeff 11 (right pad)
        dup             v5.4h,   v3.h[3]
        trn1            v1.2d,   v1.2d,   v4.2d  // row 0: coeffs 8-11 + 4 padded
        trn1            v3.2d,   v3.2d,   v5.2d  // row 1: coeffs 8-11 + 4 padded
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h  // accumulate for the DC
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b          // duplicate the last row ...
        mov             v1.16b,  v3.16b          // ... for vertical padding
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad2):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 8
        ld1             {v0.16b}, [x1],  x2      // luma row 0, pixels 0-15
        ld1             {v2.16b}, [x10], x2      // luma row 1
        ld1             {v4.16b}, [x1],  x2      // luma row 2
        uaddlp          v0.8h,   v0.16b          // horizontal pair sums
        ld1             {v6.16b}, [x10], x2      // luma row 3
        uaddlp          v2.8h,   v2.16b
        uaddlp          v4.8h,   v4.16b
        uaddlp          v6.8h,   v6.16b
        add             v0.8h,   v0.8h,   v2.8h  // 2x2 sums, output row 0
        add             v4.8h,   v4.8h,   v6.8h  // 2x2 sums, output row 1
        shl             v0.8h,   v0.8h,   #1     // scale to 3 fractional bits
        shl             v2.8h,   v4.8h,   #1
        dup             v1.8h,   v0.h[7]         // replicate coeff 7 (right pad)
        dup             v3.8h,   v2.h[7]
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h  // accumulate for the DC
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b          // duplicate the last row ...
        mov             v1.16b,  v3.16b          // ... for vertical padding
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad3):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 12
        ld1             {v0.8b}, [x1],  x2       // luma row 0, pixels 0-7
        ld1             {v2.8b}, [x10], x2       // luma row 1
        ld1             {v4.8b}, [x1],  x2       // luma row 2
        uaddlp          v0.4h,   v0.8b           // horizontal pair sums
        ld1             {v6.8b}, [x10], x2       // luma row 3
        uaddlp          v2.4h,   v2.8b
        uaddlp          v4.4h,   v4.8b
        uaddlp          v6.4h,   v6.8b
        add             v0.4h,   v0.4h,   v2.4h  // 2x2 sums, output row 0 (4 coeffs)
        add             v4.4h,   v4.4h,   v6.4h  // 2x2 sums, output row 1
        shl             v0.4h,   v0.4h,   #1     // scale to 3 fractional bits
        shl             v2.4h,   v4.4h,   #1
        dup             v1.8h,   v0.h[3]         // replicate coeff 3 (right pad)
        dup             v3.8h,   v2.h[3]
        trn1            v0.2d,   v0.2d,   v1.2d  // 4 real coeffs + first 4 padded
        trn1            v2.2d,   v2.2d,   v3.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h  // accumulate for the DC
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b          // duplicate the last row ...
        mov             v1.16b,  v3.16b          // ... for vertical padding

L(ipred_cfl_ac_420_w16_hpad):
        cbz             w4,  3f                  // no bottom padding?
2:      // Vertical padding (h_pad > 0)
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            2b
3:

        // Double the height and reuse the w8 summing/subtracting
        lsl             w6,  w6,  #1             // each 16-wide row = two 8-wide rows
        b               L(ipred_cfl_ac_420_w8_calc_subtract_dc)
endfunc
4809
// Dispatch table for ipred_cfl_ac_420_8bpc_neon,
// indexed by clz(cw)-27: cw=16 -> 0, cw=8 -> 1, cw=4 -> 2.
jumptable ipred_cfl_ac_420_tbl
        .word L(ipred_cfl_ac_420_w16) - ipred_cfl_ac_420_tbl
        .word L(ipred_cfl_ac_420_w8)  - ipred_cfl_ac_420_tbl
        .word L(ipred_cfl_ac_420_w4)  - ipred_cfl_ac_420_tbl
endjumptable
4815
// Second-level dispatch for the cw=16 case, indexed directly by w_pad
// (0-3): 0 = no right padding, 3 = only the leftmost 4 columns present.
jumptable ipred_cfl_ac_420_w16_tbl
        .word L(ipred_cfl_ac_420_w16_wpad0) - ipred_cfl_ac_420_w16_tbl
        .word L(ipred_cfl_ac_420_w16_wpad1) - ipred_cfl_ac_420_w16_tbl
        .word L(ipred_cfl_ac_420_w16_wpad2) - ipred_cfl_ac_420_w16_tbl
        .word L(ipred_cfl_ac_420_w16_wpad3) - ipred_cfl_ac_420_w16_tbl
endjumptable
4822
4823// void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx,
4824//                           const ptrdiff_t stride, const int w_pad,
4825//                           const int h_pad, const int cw, const int ch);
// Generates CFL AC coefficients from 4:2:2 luma: each int16_t output is
// the sum of a horizontal luma pair scaled by << 2 (the pair average with
// 3 fractional bits) -- no vertical subsampling. Right padding replicates
// the last real column; vertical padding, DC computation and subtraction
// branch into the matching ipred_cfl_ac_420 code paths, since the
// coefficient layout is identical after scaling.
// In: x0 = ac (int16_t *), x1 = ypx, x2 = stride, w3 = w_pad, w4 = h_pad,
//     w5 = cw (output width), w6 = ch (output height); h_pad is scaled by
//     4 below, so the pads appear to be in units of 4 rows/columns.
// v16-v19 accumulate coefficient sums for the DC; v31 = -log2(cw*ch).
function ipred_cfl_ac_422_8bpc_neon, export=1
        clz             w8,  w5
        lsl             w4,  w4,  #2         // h_pad in output rows
        movrel          x7,  ipred_cfl_ac_422_tbl
        sub             w8,  w8,  #27        // index: cw=16 -> 0, 8 -> 1, 4 -> 2
        ldrsw           x8,  [x7, w8, uxtw #2]
        movi            v16.8h,  #0          // clear the DC sum accumulators
        movi            v17.8h,  #0
        movi            v18.8h,  #0
        movi            v19.8h,  #0
        add             x7,  x7,  x8
        sub             w8,  w6,  w4         // height - h_pad
        rbit            w9,  w5              // rbit(width)
        rbit            w10, w6              // rbit(height)
        clz             w9,  w9              // ctz(width)
        clz             w10, w10             // ctz(height)
        add             w9,  w9,  w10        // log2sz
        add             x10, x1,  x2         // x10 = second luma row
        dup             v31.4s,  w9
        lsl             x2,  x2,  #1         // luma stride *2: x1/x10 step two rows
        neg             v31.4s,  v31.4s      // -log2sz
        br              x7

L(ipred_cfl_ac_422_w4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input
        ld1             {v0.8b},   [x1],  x2     // luma rows 0 and 1
        ld1             {v0.d}[1], [x10], x2
        ld1             {v1.8b},   [x1],  x2     // luma rows 2 and 3
        ld1             {v1.d}[1], [x10], x2
        uaddlp          v0.8h,   v0.16b          // horizontal pair sums
        uaddlp          v1.8h,   v1.16b
        shl             v0.8h,   v0.8h,   #2     // scale to 3 fractional bits
        shl             v1.8h,   v1.8h,   #2
        subs            w8,  w8,  #4             // four output rows per iteration
        add             v16.8h,  v16.8h,  v0.8h  // accumulate for the DC
        add             v17.8h,  v17.8h,  v1.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            1b
        trn2            v0.2d,   v1.2d,   v1.2d  // duplicate the last output row for the
        trn2            v1.2d,   v1.2d,   v1.2d  // shared 420 vertical-padding code
        b               L(ipred_cfl_ac_420_w4_hpad)

L(ipred_cfl_ac_422_w8):
        AARCH64_VALID_JUMP_TARGET
        cbnz            w3,  L(ipred_cfl_ac_422_w8_wpad)
1:      // Copy and subsample input, without padding
        ld1             {v0.16b}, [x1],  x2      // luma row 0
        ld1             {v1.16b}, [x10], x2      // luma row 1
        ld1             {v2.16b}, [x1],  x2      // luma row 2
        uaddlp          v0.8h,   v0.16b          // horizontal pair sums
        ld1             {v3.16b}, [x10], x2      // luma row 3
        uaddlp          v1.8h,   v1.16b
        uaddlp          v2.8h,   v2.16b
        uaddlp          v3.8h,   v3.16b
        shl             v0.8h,   v0.8h,   #2     // scale to 3 fractional bits
        shl             v1.8h,   v1.8h,   #2
        shl             v2.8h,   v2.8h,   #2
        shl             v3.8h,   v3.8h,   #2
        subs            w8,  w8,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h  // accumulate for the DC
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v3.16b          // duplicate the last row ...
        mov             v1.16b,  v3.16b          // ... for vertical padding
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_422_w8_wpad):
1:      // Copy and subsample input, padding 4
        ld1             {v0.8b},   [x1],  x2     // left half of luma rows 0 and 1
        ld1             {v0.d}[1], [x10], x2
        ld1             {v2.8b},   [x1],  x2     // left half of luma rows 2 and 3
        ld1             {v2.d}[1], [x10], x2
        uaddlp          v0.8h,   v0.16b          // horizontal pair sums
        uaddlp          v2.8h,   v2.16b
        shl             v0.8h,   v0.8h,   #2     // scale to 3 fractional bits
        shl             v2.8h,   v2.8h,   #2
        dup             v4.4h,   v0.h[3]         // right-pad replicas for rows 0-3
        dup             v5.8h,   v0.h[7]
        dup             v6.4h,   v2.h[3]
        dup             v7.8h,   v2.h[7]
        trn2            v1.2d,   v0.2d,   v5.2d  // row 1: 4 coeffs + 4 padded
        trn1            v0.2d,   v0.2d,   v4.2d  // row 0: 4 coeffs + 4 padded
        trn2            v3.2d,   v2.2d,   v7.2d  // row 3
        trn1            v2.2d,   v2.2d,   v6.2d  // row 2
        subs            w8,  w8,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h  // accumulate for the DC
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v3.16b          // duplicate the last row ...
        mov             v1.16b,  v3.16b          // ... for vertical padding
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_422_w16):
        AARCH64_VALID_JUMP_TARGET
        movrel          x7,  ipred_cfl_ac_422_w16_tbl
        ldrsw           x3,  [x7, w3, uxtw #2]   // second dispatch, on w_pad (0-3)
        add             x7,  x7,  x3
        br              x7

L(ipred_cfl_ac_422_w16_wpad0):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, without padding
        ld1             {v0.16b, v1.16b}, [x1],  x2  // luma row 0
        ld1             {v2.16b, v3.16b}, [x10], x2  // luma row 1
        uaddlp          v0.8h,   v0.16b              // horizontal pair sums
        uaddlp          v1.8h,   v1.16b
        uaddlp          v2.8h,   v2.16b
        uaddlp          v3.8h,   v3.16b
        shl             v0.8h,   v0.8h,   #2         // scale to 3 fractional bits
        shl             v1.8h,   v1.8h,   #2
        shl             v2.8h,   v2.8h,   #2
        shl             v3.8h,   v3.8h,   #2
        subs            w8,  w8,  #2                 // two output rows per iteration
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h      // accumulate for the DC
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b              // duplicate the last row ...
        mov             v1.16b,  v3.16b              // ... for vertical padding
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad1):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 4
        ldr             d1,  [x1,  #16]          // luma row 0, pixels 16-23
        ld1             {v0.16b}, [x1],  x2      // luma row 0, pixels 0-15
        ldr             d3,  [x10, #16]          // luma row 1, pixels 16-23
        ld1             {v2.16b}, [x10], x2
        uaddlp          v1.4h,   v1.8b           // horizontal pair sums
        uaddlp          v0.8h,   v0.16b
        uaddlp          v3.4h,   v3.8b
        uaddlp          v2.8h,   v2.16b
        shl             v1.4h,   v1.4h,   #2     // scale to 3 fractional bits
        shl             v0.8h,   v0.8h,   #2
        shl             v3.4h,   v3.4h,   #2
        shl             v2.8h,   v2.8h,   #2
        dup             v4.4h,   v1.h[3]         // replicate coeff 11 (right pad)
        dup             v5.4h,   v3.h[3]
        trn1            v1.2d,   v1.2d,   v4.2d  // row 0: coeffs 8-11 + 4 padded
        trn1            v3.2d,   v3.2d,   v5.2d  // row 1: coeffs 8-11 + 4 padded
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h  // accumulate for the DC
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b          // duplicate the last row ...
        mov             v1.16b,  v3.16b          // ... for vertical padding
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad2):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 8
        ld1             {v0.16b}, [x1],  x2      // luma row 0, pixels 0-15
        ld1             {v2.16b}, [x10], x2      // luma row 1
        uaddlp          v0.8h,   v0.16b          // horizontal pair sums
        uaddlp          v2.8h,   v2.16b
        shl             v0.8h,   v0.8h,   #2     // scale to 3 fractional bits
        shl             v2.8h,   v2.8h,   #2
        dup             v1.8h,   v0.h[7]         // replicate coeff 7 (right pad)
        dup             v3.8h,   v2.h[7]
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h  // accumulate for the DC
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b          // duplicate the last row ...
        mov             v1.16b,  v3.16b          // ... for vertical padding
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad3):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 12
        ld1             {v0.8b}, [x1],  x2       // luma row 0, pixels 0-7
        ld1             {v2.8b}, [x10], x2       // luma row 1
        uaddlp          v0.4h,   v0.8b           // horizontal pair sums
        uaddlp          v2.4h,   v2.8b
        shl             v0.4h,   v0.4h,   #2     // scale to 3 fractional bits
        shl             v2.4h,   v2.4h,   #2
        dup             v1.8h,   v0.h[3]         // replicate coeff 3 (right pad)
        dup             v3.8h,   v2.h[3]
        trn1            v0.2d,   v0.2d,   v1.2d  // 4 real coeffs + first 4 padded
        trn1            v2.2d,   v2.2d,   v3.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h  // accumulate for the DC
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b          // duplicate the last row ...
        mov             v1.16b,  v3.16b          // ... for vertical padding
        b               L(ipred_cfl_ac_420_w16_hpad)
endfunc
5032
// Dispatch table for ipred_cfl_ac_422_8bpc_neon,
// indexed by clz(cw)-27: cw=16 -> 0, cw=8 -> 1, cw=4 -> 2.
jumptable ipred_cfl_ac_422_tbl
        .word L(ipred_cfl_ac_422_w16) - ipred_cfl_ac_422_tbl
        .word L(ipred_cfl_ac_422_w8) - ipred_cfl_ac_422_tbl
        .word L(ipred_cfl_ac_422_w4) - ipred_cfl_ac_422_tbl
endjumptable
5038
// Second-level dispatch for the cw=16 case, indexed directly by w_pad
// (0-3): 0 = no right padding, 3 = only the leftmost 4 columns present.
jumptable ipred_cfl_ac_422_w16_tbl
        .word L(ipred_cfl_ac_422_w16_wpad0) - ipred_cfl_ac_422_w16_tbl
        .word L(ipred_cfl_ac_422_w16_wpad1) - ipred_cfl_ac_422_w16_tbl
        .word L(ipred_cfl_ac_422_w16_wpad2) - ipred_cfl_ac_422_w16_tbl
        .word L(ipred_cfl_ac_422_w16_wpad3) - ipred_cfl_ac_422_w16_tbl
endjumptable
5045
5046// void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx,
5047//                           const ptrdiff_t stride, const int w_pad,
5048//                           const int h_pad, const int cw, const int ch);
5049function ipred_cfl_ac_444_8bpc_neon, export=1
5050        clz             w8,  w5
5051        lsl             w4,  w4,  #2
5052        movrel          x7,  ipred_cfl_ac_444_tbl
5053        sub             w8,  w8,  #26
5054        ldrsw           x8,  [x7, w8, uxtw #2]
5055        movi            v16.8h,  #0
5056        movi            v17.8h,  #0
5057        movi            v18.8h,  #0
5058        movi            v19.8h,  #0
5059        add             x7,  x7,  x8
5060        sub             w8,  w6,  w4         // height - h_pad
5061        rbit            w9,  w5              // rbit(width)
5062        rbit            w10, w6              // rbit(height)
5063        clz             w9,  w9              // ctz(width)
5064        clz             w10, w10             // ctz(height)
5065        add             w9,  w9,  w10        // log2sz
5066        add             x10, x1,  x2
5067        dup             v31.4s,  w9
5068        lsl             x2,  x2,  #1
5069        neg             v31.4s,  v31.4s      // -log2sz
5070        br              x7
5071
5072L(ipred_cfl_ac_444_w4):
5073        AARCH64_VALID_JUMP_TARGET
50741:      // Copy and expand input
5075        ld1             {v0.s}[0], [x1],  x2
5076        ld1             {v0.s}[1], [x10], x2
5077        ld1             {v1.s}[0], [x1],  x2
5078        ld1             {v1.s}[1], [x10], x2
5079        ushll           v0.8h,   v0.8b,   #3
5080        ushll           v1.8h,   v1.8b,   #3
5081        subs            w8,  w8,  #4
5082        add             v16.8h,  v16.8h,  v0.8h
5083        add             v17.8h,  v17.8h,  v1.8h
5084        st1             {v0.8h, v1.8h}, [x0], #32
5085        b.gt            1b
5086        trn2            v0.2d,   v1.2d,   v1.2d
5087        trn2            v1.2d,   v1.2d,   v1.2d
5088        b               L(ipred_cfl_ac_420_w4_hpad)
5089
// CfL AC, 4:4:4, width 8. One 8-pixel row per 8h register; four rows per
// loop iteration, widened/scaled (<<3), stored and summed into v16..v19.
5090L(ipred_cfl_ac_444_w8):
5091        AARCH64_VALID_JUMP_TARGET
50921:      // Copy and expand input
5093        ld1             {v0.8b}, [x1],  x2            // row 0
5094        ld1             {v1.8b}, [x10], x2            // row 1
5095        ld1             {v2.8b}, [x1],  x2            // row 2
5096        ushll           v0.8h,   v0.8b,   #3          // u8 -> u16, <<3
5097        ld1             {v3.8b}, [x10], x2            // row 3
5098        ushll           v1.8h,   v1.8b,   #3
5099        ushll           v2.8h,   v2.8b,   #3
5100        ushll           v3.8h,   v3.8b,   #3
5101        subs            w8,  w8,  #4                  // 4 rows per iteration
5102        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5103        add             v16.8h,  v16.8h,  v0.8h       // running sums for DC
5104        add             v17.8h,  v17.8h,  v1.8h
5105        add             v18.8h,  v18.8h,  v2.8h
5106        add             v19.8h,  v19.8h,  v3.8h
5107        b.gt            1b
        // Replicate the last row (v3) into v0/v1 for the shared
        // height-padding tail.
5108        mov             v0.16b,  v3.16b
5109        mov             v1.16b,  v3.16b
5110        b               L(ipred_cfl_ac_420_w8_hpad)
5111
// CfL AC, 4:4:4, width 16. A 16-pixel row expands into a pair of 8h
// registers (ushll/ushll2). If horizontal padding is needed (w3 != 0),
// branch to the wpad variant; otherwise process four full rows per
// iteration, storing and accumulating into v16..v19.
5112L(ipred_cfl_ac_444_w16):
5113        AARCH64_VALID_JUMP_TARGET
5114        cbnz            w3,  L(ipred_cfl_ac_444_w16_wpad)
51151:      // Copy and expand input, without padding
5116        ld1             {v0.16b}, [x1],  x2           // row 0
5117        ld1             {v2.16b}, [x10], x2           // row 1
5118        ld1             {v4.16b}, [x1],  x2           // row 2
5119        ushll2          v1.8h,   v0.16b,  #3          // high 8 px, u8 -> u16, <<3
5120        ushll           v0.8h,   v0.8b,   #3          // low 8 px
5121        ld1             {v6.16b}, [x10], x2           // row 3
5122        ushll2          v3.8h,   v2.16b,  #3
5123        ushll           v2.8h,   v2.8b,   #3
5124        ushll2          v5.8h,   v4.16b,  #3
5125        ushll           v4.8h,   v4.8b,   #3
5126        ushll2          v7.8h,   v6.16b,  #3
5127        ushll           v6.8h,   v6.8b,   #3
5128        subs            w8,  w8,  #4                  // 4 rows per iteration
5129        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5130        add             v16.8h,  v16.8h,  v0.8h       // running sums for DC
5131        add             v17.8h,  v17.8h,  v1.8h
5132        add             v18.8h,  v18.8h,  v2.8h
5133        add             v19.8h,  v19.8h,  v3.8h
5134        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5135        add             v16.8h,  v16.8h,  v4.8h
5136        add             v17.8h,  v17.8h,  v5.8h
5137        add             v18.8h,  v18.8h,  v6.8h
5138        add             v19.8h,  v19.8h,  v7.8h
5139        b.gt            1b
        // Replicate the last row (v6:v7) into v0..v3 for the shared
        // height-padding tail.
5140        mov             v0.16b,  v6.16b
5141        mov             v1.16b,  v7.16b
5142        mov             v2.16b,  v6.16b
5143        mov             v3.16b,  v7.16b
5144        b               L(ipred_cfl_ac_420_w16_hpad)
5145
// Width 16 with 8 pixels of horizontal padding: load only the valid
// 8 pixels per row, expand them, and fill the padded half by
// broadcasting the last valid (expanded) pixel.
5146L(ipred_cfl_ac_444_w16_wpad):
51471:      // Copy and expand input, padding 8
5148        ld1             {v0.8b}, [x1],  x2            // row 0, valid half
5149        ld1             {v2.8b}, [x10], x2            // row 1
5150        ld1             {v4.8b}, [x1],  x2            // row 2
5151        ld1             {v6.8b}, [x10], x2            // row 3
5152        ushll           v0.8h,   v0.8b,   #3          // u8 -> u16, <<3
5153        ushll           v2.8h,   v2.8b,   #3
5154        ushll           v4.8h,   v4.8b,   #3
5155        ushll           v6.8h,   v6.8b,   #3
5156        dup             v1.8h,   v0.h[7]              // pad with last valid pixel
5157        dup             v3.8h,   v2.h[7]
5158        dup             v5.8h,   v4.h[7]
5159        dup             v7.8h,   v6.h[7]
5160        subs            w8,  w8,  #4                  // 4 rows per iteration
5161        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5162        add             v16.8h,  v16.8h,  v0.8h       // running sums for DC
5163        add             v17.8h,  v17.8h,  v1.8h
5164        add             v18.8h,  v18.8h,  v2.8h
5165        add             v19.8h,  v19.8h,  v3.8h
5166        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5167        add             v16.8h,  v16.8h,  v4.8h
5168        add             v17.8h,  v17.8h,  v5.8h
5169        add             v18.8h,  v18.8h,  v6.8h
5170        add             v19.8h,  v19.8h,  v7.8h
5171        b.gt            1b
        // Replicate the last row (v6:v7) into v0..v3 for the shared
        // height-padding tail.
5172        mov             v0.16b,  v6.16b
5173        mov             v1.16b,  v7.16b
5174        mov             v2.16b,  v6.16b
5175        mov             v3.16b,  v7.16b
5176        b               L(ipred_cfl_ac_420_w16_hpad)
5177
// Width 32 dispatcher: select the wpad0/2/4/6 loop via a secondary
// jump table indexed by wpad/2 (w3 halved, then scaled by 4 for the
// .word entries; the entries are offsets relative to the table base).
5178L(ipred_cfl_ac_444_w32):
5179        AARCH64_VALID_JUMP_TARGET
5180        movrel          x7,  ipred_cfl_ac_444_w32_tbl
5181        lsr             w3,  w3,  #1                  // wpad / 2
5182        ldrsw           x3,  [x7, w3, uxtw #2]        // load PC-relative offset
5183        add             x7,  x7,  x3
5184        br              x7
5185
// Width 32, no horizontal padding: a full 32-pixel row expands into
// four 8h registers; two rows per iteration, stored and accumulated
// into v16..v19.
5186L(ipred_cfl_ac_444_w32_wpad0):
5187        AARCH64_VALID_JUMP_TARGET
51881:      // Copy and expand input, without padding
5189        ld1             {v2.16b, v3.16b}, [x1],  x2   // row 0 (32 px)
5190        ld1             {v6.16b, v7.16b}, [x10], x2   // row 1
5191        ushll           v0.8h,   v2.8b,   #3          // u8 -> u16, <<3
5192        ushll2          v1.8h,   v2.16b,  #3
5193        ushll           v2.8h,   v3.8b,   #3
5194        ushll2          v3.8h,   v3.16b,  #3
5195        ushll           v4.8h,   v6.8b,   #3
5196        ushll2          v5.8h,   v6.16b,  #3
5197        ushll           v6.8h,   v7.8b,   #3
5198        ushll2          v7.8h,   v7.16b,  #3
5199        subs            w8,  w8,  #2                  // 2 rows per iteration
5200        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5201        add             v16.8h,  v16.8h,  v0.8h       // running sums for DC
5202        add             v17.8h,  v17.8h,  v1.8h
5203        add             v18.8h,  v18.8h,  v2.8h
5204        add             v19.8h,  v19.8h,  v3.8h
5205        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5206        add             v16.8h,  v16.8h,  v4.8h
5207        add             v17.8h,  v17.8h,  v5.8h
5208        add             v18.8h,  v18.8h,  v6.8h
5209        add             v19.8h,  v19.8h,  v7.8h
5210        b.gt            1b
5211        b               L(ipred_cfl_ac_444_w32_hpad)
5212
// Width 32 with 8 pixels of horizontal padding: 24 valid pixels per row
// (16 via ld1 + 8 via ldr d), expanded; the last 8 output pixels are the
// broadcast of the last valid expanded pixel.
5213L(ipred_cfl_ac_444_w32_wpad2):
5214        AARCH64_VALID_JUMP_TARGET
52151:      // Copy and expand input, padding 8
5216        ldr             d2,  [x1,  #16]               // row 0, px 16..23
5217        ld1             {v1.16b}, [x1],  x2           // row 0, px 0..15
5218        ldr             d6,  [x10, #16]               // row 1, px 16..23
5219        ld1             {v5.16b}, [x10], x2           // row 1, px 0..15
5220        ushll           v2.8h,   v2.8b,   #3          // u8 -> u16, <<3
5221        ushll           v0.8h,   v1.8b,   #3
5222        ushll2          v1.8h,   v1.16b,  #3
5223        ushll           v6.8h,   v6.8b,   #3
5224        ushll           v4.8h,   v5.8b,   #3
5225        ushll2          v5.8h,   v5.16b,  #3
5226        dup             v3.8h,   v2.h[7]              // pad with last valid pixel
5227        dup             v7.8h,   v6.h[7]
5228        subs            w8,  w8,  #2                  // 2 rows per iteration
5229        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5230        add             v16.8h,  v16.8h,  v0.8h       // running sums for DC
5231        add             v17.8h,  v17.8h,  v1.8h
5232        add             v18.8h,  v18.8h,  v2.8h
5233        add             v19.8h,  v19.8h,  v3.8h
5234        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5235        add             v16.8h,  v16.8h,  v4.8h
5236        add             v17.8h,  v17.8h,  v5.8h
5237        add             v18.8h,  v18.8h,  v6.8h
5238        add             v19.8h,  v19.8h,  v7.8h
5239        b.gt            1b
5240        b               L(ipred_cfl_ac_444_w32_hpad)
5241
// Width 32 with 16 pixels of horizontal padding: 16 valid pixels per row,
// expanded; the upper 16 output pixels are the broadcast of the last
// valid expanded pixel.
5242L(ipred_cfl_ac_444_w32_wpad4):
5243        AARCH64_VALID_JUMP_TARGET
52441:      // Copy and expand input, padding 16
5245        ld1             {v1.16b}, [x1],  x2           // row 0, px 0..15
5246        ld1             {v5.16b}, [x10], x2           // row 1
5247        ushll           v0.8h,   v1.8b,   #3          // u8 -> u16, <<3
5248        ushll2          v1.8h,   v1.16b,  #3
5249        ushll           v4.8h,   v5.8b,   #3
5250        ushll2          v5.8h,   v5.16b,  #3
5251        dup             v2.8h,   v1.h[7]              // pad with last valid pixel
5252        dup             v3.8h,   v1.h[7]
5253        dup             v6.8h,   v5.h[7]
5254        dup             v7.8h,   v5.h[7]
5255        subs            w8,  w8,  #2                  // 2 rows per iteration
5256        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5257        add             v16.8h,  v16.8h,  v0.8h       // running sums for DC
5258        add             v17.8h,  v17.8h,  v1.8h
5259        add             v18.8h,  v18.8h,  v2.8h
5260        add             v19.8h,  v19.8h,  v3.8h
5261        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5262        add             v16.8h,  v16.8h,  v4.8h
5263        add             v17.8h,  v17.8h,  v5.8h
5264        add             v18.8h,  v18.8h,  v6.8h
5265        add             v19.8h,  v19.8h,  v7.8h
5266        b.gt            1b
5267        b               L(ipred_cfl_ac_444_w32_hpad)
5268
// Width 32 with 24 pixels of horizontal padding: only 8 valid pixels per
// row, expanded; the remaining 24 output pixels are the broadcast of the
// last valid expanded pixel. Falls through into the hpad tail below.
5269L(ipred_cfl_ac_444_w32_wpad6):
5270        AARCH64_VALID_JUMP_TARGET
52711:      // Copy and expand input, padding 24
5272        ld1             {v0.8b}, [x1],  x2            // row 0, px 0..7
5273        ld1             {v4.8b}, [x10], x2            // row 1
5274        ushll           v0.8h,   v0.8b,   #3          // u8 -> u16, <<3
5275        ushll           v4.8h,   v4.8b,   #3
5276        dup             v1.8h,   v0.h[7]              // pad with last valid pixel
5277        dup             v2.8h,   v0.h[7]
5278        dup             v3.8h,   v0.h[7]
5279        dup             v5.8h,   v4.h[7]
5280        dup             v6.8h,   v4.h[7]
5281        dup             v7.8h,   v4.h[7]
5282        subs            w8,  w8,  #2                  // 2 rows per iteration
5283        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5284        add             v16.8h,  v16.8h,  v0.8h       // running sums for DC
5285        add             v17.8h,  v17.8h,  v1.8h
5286        add             v18.8h,  v18.8h,  v2.8h
5287        add             v19.8h,  v19.8h,  v3.8h
5288        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5289        add             v16.8h,  v16.8h,  v4.8h
5290        add             v17.8h,  v17.8h,  v5.8h
5291        add             v18.8h,  v18.8h,  v6.8h
5292        add             v19.8h,  v19.8h,  v7.8h
5293        b.gt            1b
5294
// Width-32 vertical padding and DC finish: repeat the last expanded row
// (held in v4..v7) for the remaining padded rows (2 per iteration, also
// folded into the sums), then compute the rounded average and branch to
// the shared subtract-DC loop. v31 holds -log2sz (set up at entry), so
// urshl performs a rounding right shift by log2sz.
5295L(ipred_cfl_ac_444_w32_hpad):
5296        cbz             w4,  3f                       // no vertical padding?
52972:      // Vertical padding (h_pad > 0)
5298        subs            w4,  w4,  #2
5299        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5300        add             v16.8h,  v16.8h,  v4.8h
5301        add             v17.8h,  v17.8h,  v5.8h
5302        add             v18.8h,  v18.8h,  v6.8h
5303        add             v19.8h,  v19.8h,  v7.8h
5304        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
5305        add             v16.8h,  v16.8h,  v4.8h
5306        add             v17.8h,  v17.8h,  v5.8h
5307        add             v18.8h,  v18.8h,  v6.8h
5308        add             v19.8h,  v19.8h,  v7.8h
5309        b.gt            2b
53103:
5311
5312        // Quadruple the height and reuse the w8 subtracting
5313        lsl             w6,  w6,  #2
5314        // Aggregate the sums, with wider intermediates earlier than in
5315        // ipred_cfl_ac_420_w8_calc_subtract_dc.
5316        uaddlp          v0.4s,   v16.8h               // pairwise widen to 32 bit
5317        uaddlp          v1.4s,   v17.8h
5318        uaddlp          v2.4s,   v18.8h
5319        uaddlp          v3.4s,   v19.8h
5320        add             v0.4s,   v0.4s,   v1.4s
5321        add             v2.4s,   v2.4s,   v3.4s
5322        add             v0.4s,   v0.4s,   v2.4s
5323        addv            s0,  v0.4s                // sum
5324        sub             x0,  x0,  w6, uxtw #4         // rewind x0 to buffer start
5325        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
5326        dup             v4.8h,   v4.h[0]              // broadcast the DC value
5327        b               L(ipred_cfl_ac_420_w8_subtract_dc)
5328endfunc
5329
// Width dispatch table for ipred_cfl_ac_444; entries are PC-relative
// offsets from the table base, ordered w32, w16, w8, w4 (entry order and
// count are load-bearing — presumably indexed by a clz-derived width
// value at the function entry, which is outside this view; verify there
// before reordering).
5330jumptable ipred_cfl_ac_444_tbl
5331        .word L(ipred_cfl_ac_444_w32) - ipred_cfl_ac_444_tbl
5332        .word L(ipred_cfl_ac_444_w16) - ipred_cfl_ac_444_tbl
5333        .word L(ipred_cfl_ac_444_w8)  - ipred_cfl_ac_444_tbl
5334        .word L(ipred_cfl_ac_444_w4)  - ipred_cfl_ac_444_tbl
5335endjumptable
5336
// Horizontal-padding dispatch table for the width-32 case; indexed by
// wpad/2 (see L(ipred_cfl_ac_444_w32): lsr w3, #1 then ldrsw scaled by 4).
// Entries are PC-relative offsets from the table base; order matches
// wpad = 0, 2, 4, 6.
5337jumptable ipred_cfl_ac_444_w32_tbl
5338        .word L(ipred_cfl_ac_444_w32_wpad0) - ipred_cfl_ac_444_w32_tbl
5339        .word L(ipred_cfl_ac_444_w32_wpad2) - ipred_cfl_ac_444_w32_tbl
5340        .word L(ipred_cfl_ac_444_w32_wpad4) - ipred_cfl_ac_444_w32_tbl
5341        .word L(ipred_cfl_ac_444_w32_wpad6) - ipred_cfl_ac_444_w32_tbl
5342endjumptable
5343